From 74104d8ea54de1fd1ede5498820519acdb9aa0af Mon Sep 17 00:00:00 2001 From: adinhodovic Date: Fri, 28 Nov 2025 13:23:24 +0100 Subject: [PATCH] feat: Add new mixins --- assets/envoy-2/alerts.yaml | 198 ++ .../envoy-2/dashboards/envoy-downstream.json | 982 +++++++++ .../dashboards/envoy-gateway-overview.json | 538 +++++ assets/envoy-2/dashboards/envoy-overview.json | 1653 +++++++++++++++ assets/envoy-2/dashboards/envoy-upstream.json | 1122 ++++++++++ assets/envoy-2/rules.yaml | 1 + assets/syncthing/alerts.yaml | 40 + .../dashboards/syncthing-overview.json | 1829 +++++++++++++++++ assets/syncthing/rules.yaml | 1 + assets/tailscale/alerts.yaml | 109 + .../dashboards/tailscale-machine.json | 1492 ++++++++++++++ .../dashboards/tailscale-overview.json | 1613 +++++++++++++++ assets/tailscale/rules.yaml | 1 + hack/go.mod | 1 + hack/go.sum | 10 + mixins.json | 15 + site/content/ceph/_index.md | 4 +- site/content/cilium-enterprise/_index.md | 6 +- site/content/cortex/_index.md | 2 +- site/content/envoy-2/_index.md | 248 +++ site/content/loki/_index.md | 4 +- site/content/prometheus/_index.md | 2 +- site/content/syncthing/_index.md | 72 + site/content/tailscale/_index.md | 156 ++ site/static/mixins.json | 21 +- 25 files changed, 10108 insertions(+), 12 deletions(-) create mode 100644 assets/envoy-2/alerts.yaml create mode 100644 assets/envoy-2/dashboards/envoy-downstream.json create mode 100644 assets/envoy-2/dashboards/envoy-gateway-overview.json create mode 100644 assets/envoy-2/dashboards/envoy-overview.json create mode 100644 assets/envoy-2/dashboards/envoy-upstream.json create mode 100644 assets/envoy-2/rules.yaml create mode 100644 assets/syncthing/alerts.yaml create mode 100644 assets/syncthing/dashboards/syncthing-overview.json create mode 100644 assets/syncthing/rules.yaml create mode 100644 assets/tailscale/alerts.yaml create mode 100644 assets/tailscale/dashboards/tailscale-machine.json create mode 100644 assets/tailscale/dashboards/tailscale-overview.json create 
mode 100644 assets/tailscale/rules.yaml create mode 100644 site/content/envoy-2/_index.md create mode 100644 site/content/syncthing/_index.md create mode 100644 site/content/tailscale/_index.md diff --git a/assets/envoy-2/alerts.yaml b/assets/envoy-2/alerts.yaml new file mode 100644 index 00000000..b87d9442 --- /dev/null +++ b/assets/envoy-2/alerts.yaml @@ -0,0 +1,198 @@ +groups: +- name: envoy + rules: + - alert: EnvoyUpstreamHighHttp4xxErrorRate + annotations: + dashboard_url: https://grafana.com/d/envoy-upstream-skj2/envoy-upstream?var-namespace={{ + $labels.namespace }}&var-envoy_cluster_name={{ $labels.envoy_cluster_name + }} + description: More than 5% HTTP requests with status 4xx for cluster {{ $labels.envoy_cluster_name + }} in {{ $labels.namespace }} the past 5m. + summary: Envoy upstream high HTTP 4xx error rate. + expr: | + ( + sum( + rate( + envoy_cluster_upstream_rq_xx{ + job=~".*", + envoy_response_code_class="4", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + / + sum( + rate( + envoy_cluster_upstream_rq_total{ + job=~".*", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + * 100 + ) > 5 + and + sum( + rate( + envoy_cluster_upstream_rq_xx{ + job=~".*", + envoy_response_code_class="4", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + > 5 + for: 1m + labels: + severity: info + - alert: EnvoyUpstreamHighHttp5xxErrorRate + annotations: + dashboard_url: https://grafana.com/d/envoy-upstream-skj2/envoy-upstream?var-namespace={{ + $labels.namespace }}&var-envoy_cluster_name={{ $labels.envoy_cluster_name + }} + description: More than 5% HTTP requests with status 5xx for cluster {{ $labels.envoy_cluster_name + }} in {{ $labels.namespace }} the past 5m. + summary: Envoy upstream high HTTP 5xx error rate. 
+ expr: | + ( + sum( + rate( + envoy_cluster_upstream_rq_xx{ + job=~".*", + envoy_response_code_class="5", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + / + sum( + rate( + envoy_cluster_upstream_rq_total{ + job=~".*", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + * 100 + ) > 5 + and + sum( + rate( + envoy_cluster_upstream_rq_xx{ + job=~".*", + envoy_response_code_class="5", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + > 5 + for: 1m + labels: + severity: critical + - alert: EnvoyCircuitBreakerOpen + annotations: + dashboard_url: https://grafana.com/d/envoy-upstream-skj2/envoy-upstream?var-namespace={{ + $labels.namespace }}&var-envoy_cluster_name={{ $labels.envoy_cluster_name + }} + description: Circuit breaker is open for cluster {{ $labels.envoy_cluster_name + }} in {{ $labels.namespace }} for the past 5m. + summary: Envoy circuit breaker is open. + expr: | + sum( + ( + envoy_cluster_circuit_breakers_default_rq_open{ + job=~".*", + envoy_cluster_name!~"" + } + or + envoy_cluster_circuit_breakers_default_cx_open{ + job=~".*", + envoy_cluster_name!~"" + } + or + envoy_cluster_circuit_breakers_default_cx_pool_open{ + job=~".*", + envoy_cluster_name!~"" + } + ) + ) by (cluster, namespace, envoy_cluster_name) > 0 + for: 5m + labels: + severity: warning + - alert: EnvoyUpstreamConnectionFailures + annotations: + dashboard_url: https://grafana.com/d/envoy-upstream-skj2/envoy-upstream?var-namespace={{ + $labels.namespace }}&var-envoy_cluster_name={{ $labels.envoy_cluster_name + }} + description: More than 100 connection failures for cluster {{ $labels.envoy_cluster_name + }} in {{ $labels.namespace }} the past 5m. + summary: Envoy upstream connection failures detected. 
+ expr: | + sum( + increase( + envoy_cluster_upstream_cx_connect_fail{ + job=~".*", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + > 100 + for: 10m + labels: + severity: warning + - alert: EnvoyUpstreamUnhealthyHosts + annotations: + dashboard_url: https://grafana.com/d/envoy-upstream-skj2/envoy-upstream?var-namespace={{ + $labels.namespace }}&var-envoy_cluster_name={{ $labels.envoy_cluster_name + }} + description: More than 33% of hosts are unhealthy for cluster {{ $labels.envoy_cluster_name + }} in {{ $labels.namespace }} for the past 5m. + summary: Envoy upstream has unhealthy hosts. + expr: | + ( + sum( + envoy_cluster_membership_total{ + job=~".*", + envoy_cluster_name!~"" + } + ) by (cluster, namespace, envoy_cluster_name) + - + sum( + envoy_cluster_membership_healthy{ + job=~".*", + envoy_cluster_name!~"" + } + ) by (cluster, namespace, envoy_cluster_name) + ) + / + sum( + envoy_cluster_membership_total{ + job=~".*", + envoy_cluster_name!~"" + } + ) by (cluster, namespace, envoy_cluster_name) + * 100 + > 33 + for: 5m + labels: + severity: warning + - alert: EnvoyXDSUpdateFailed + annotations: + dashboard_url: https://grafana.com/d/envoy-gateway-overview-skj2/envoy-gateway-overview?var-namespace={{ + $labels.namespace }} + description: XDS snapshot update failed for node {{ $labels.nodeID }} in {{ + $labels.namespace }} with status {{ $labels.status }} the past 5m. + summary: Envoy Gateway XDS snapshot update failed. 
+ expr: | + sum( + increase( + xds_snapshot_update_total{ + job=~".*", + status!="success" + }[5m] + ) + ) by (cluster, namespace, status, nodeID) + > 0 + for: 1m + labels: + severity: warning diff --git a/assets/envoy-2/dashboards/envoy-downstream.json b/assets/envoy-2/dashboards/envoy-downstream.json new file mode 100644 index 00000000..a135066b --- /dev/null +++ b/assets/envoy-2/dashboards/envoy-downstream.json @@ -0,0 +1,982 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ ] + }, + "description": "A dashboard that monitors Envoy with a focus on giving an overview of downstreams. The dashboards were generated using [envoy-mixin](https://github.com/adinhodovic/envoy-mixin). Open issues and create feature requests in the repository.", + "editable": false, + "links": [ + { + "asDropdown": true, + "includeVars": false, + "keepTime": true, + "tags": [ + "envoy", + "envoy-mixin", + "gateway" + ], + "targetBlank": true, + "title": "Envoy", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of downstreams by job.", + "fieldConfig": { + "defaults": { + "unit": "downstreams" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 2, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n) by (job)\n", + "instant": true, + 
"legendFormat": "{{ job }}" + } + ], + "title": "Downstreams Count by Job", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of downstream request rates by Envoy HTTP connection manager prefix.", + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 5 + }, + "id": 3, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(20,\n sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }[1h]\n )\n ) by (envoy_http_conn_manager_prefix)\n)\n", + "instant": true, + "legendFormat": "{{ envoy_http_conn_manager_prefix }}" + } + ], + "title": "Downstream Rate by Envoy HTTP Conn Manager Prefix [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of downstream request rates by response code class.", + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 5 + }, + "id": 4, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n 
}[1h]\n )\n) by (envoy_response_code_class)\n", + "instant": true, + "legendFormat": "{{ envoy_response_code_class }}xx" + } + ], + "title": "Downstream Rate by Code Class [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of active downstream connections by Envoy HTTP connection manager prefix.", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 5 + }, + "id": 5, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(20,\n sum(\n envoy_http_downstream_cx_active{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n ) by (envoy_http_conn_manager_prefix)\n)\n", + "instant": true, + "legendFormat": "{{ envoy_http_conn_manager_prefix }}" + } + ], + "title": "Downstream Active Connections by Envoy HTTP Conn Manager Prefix", + "type": "piechart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 6, + "repeat": "envoy_http_conn_manager_prefix", + "title": "$envoy_http_conn_manager_prefix", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream request rate by Envoy HTTP connection manager prefix over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + 
"displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n) by (envoy_http_conn_manager_prefix)\n", + "legendFormat": "{{ envoy_http_conn_manager_prefix }}" + } + ], + "title": "Downstream Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream latency percentiles over time.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.5,\n sum(\n rate(\n envoy_http_downstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.95,\n sum(\n rate(\n envoy_http_downstream_rq_time_bucket{\n 
cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": true, + "expr": "histogram_quantile(\n 0.99,\n sum(\n rate(\n envoy_http_downstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P99" + } + ], + "title": "Downstream Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream success rate over time, counting 5xx response codes as errors.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMax": 100, + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n,\n envoy_response_code_class!=\"5\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n envoy_http_downstream_rq_xx{\n 
cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n)\n* 100\n", + "legendFormat": "Success Rate" + } + ], + "title": "Downstream Success Rate (Excluding 4xx errors)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream success rate over time, counting 4xx and 5xx response codes as errors.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMax": 100, + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n,\n envoy_response_code_class!~\"4|5\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n)\n* 100\n", + "legendFormat": "Success Rate" + } + ], + "title": "Downstream Success Rate (Including 4xx errors)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream request rate by response code class over time.", + "fieldConfig": { + 
"defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }[$__rate_interval]\n )\n) by (envoy_response_code_class)\n", + "legendFormat": "{{ envoy_response_code_class }}xx" + } + ], + "title": "Downstream Rate by Code Class", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream connections over time.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n envoy_http_downstream_cx_active{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }\n) by (envoy_http_conn_manager_prefix)\n", + "legendFormat": "Active Connections" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n envoy_http_downstream_cx_destroy{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_http_conn_manager_prefix)\n", + "legendFormat": "Destroyed Connections" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n envoy_http_downstream_cx_idle_timeout{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_http_conn_manager_prefix)\n", + "legendFormat": "Idle Timeout" + } + ], + "title": "Downstream Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream connection bytes received and transmitted over time.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_cx_rx_bytes_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_http_conn_manager_prefix)\n", + "legendFormat": "Received" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_cx_tx_bytes_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_http_conn_manager_prefix)\n", + "legendFormat": "Transmitted" + } + ], + "title": "Downstream Connection Bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream request resets and timeouts over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_rx_reset{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_http_conn_manager_prefix)\n", + "legendFormat": "RX Reset" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_tx_reset{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_http_conn_manager_prefix)\n", + "legendFormat": "TX Reset" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_timeout{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_http_conn_manager_prefix)\n", + "legendFormat": "Timeout" + } + ], + "title": "Downstream Request Resets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream request rate by pod over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n) by (pod, envoy_http_conn_manager_prefix)\n", + "legendFormat": "{{ pod }}" + } + ], + "title": "Downstream Rate by Pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream active connections by pod over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + 
"h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n envoy_http_downstream_cx_active{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=\"$envoy_http_conn_manager_prefix\"\n\n }\n) by (pod, envoy_http_conn_manager_prefix)\n", + "legendFormat": "{{ pod }}" + } + ], + "title": "Downstream Active Connections by Pod", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "envoy", + "envoy-mixin", + "gateway" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "hide": 2, + "label": "Cluster", + "name": "cluster", + "query": "label_values(envoy_cluster_upstream_rq_xx{}, cluster)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "query": "label_values(envoy_cluster_upstream_rq_xx{cluster=\"$cluster\"}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "query": "label_values(envoy_cluster_upstream_rq_xx{cluster=\"$cluster\", namespace=~\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Envoy HTTP Conn Manager Prefix", + "multi": true, + "name": "envoy_http_conn_manager_prefix", + "query": "label_values(envoy_http_downstream_rq_xx{cluster=\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}, envoy_http_conn_manager_prefix)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Pod", + "multi": true, + "name": "pod", + "query": "label_values(envoy_listener_http_downstream_rq_xx{cluster=\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", envoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"}, pod)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Envoy / Downstream", + "uid": "envoy-downstream-skj2" +} diff --git a/assets/envoy-2/dashboards/envoy-gateway-overview.json b/assets/envoy-2/dashboards/envoy-gateway-overview.json new file mode 100644 index 00000000..ed3bd59e --- /dev/null +++ b/assets/envoy-2/dashboards/envoy-gateway-overview.json @@ -0,0 +1,538 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ ] + }, + "description": "A dashboard that monitors Envoy Gateway with a focus on Kubernetes objects and XDS updates. The dashboards were generated using [envoy-mixin](https://github.com/adinhodovic/envoy-mixin). 
Open issues and create feature requests in the repository.", + "editable": false, + "links": [ + { + "asDropdown": true, + "includeVars": false, + "keepTime": true, + "tags": [ + "envoy", + "envoy-mixin", + "gateway" + ], + "targetBlank": true, + "title": "Envoy", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Envoy XDS", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of XDS snapshot updates by status and node ID.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n xds_snapshot_update_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n) by (status, nodeID)\n", + "legendFormat": "{{ status }}/{{ nodeID }}" + } + ], + "title": "XDS Snapshot Update Rate by Status/NodeID", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 9 + }, + "id": 3, + "title": "Kubernetes", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of resource apply operations by status and kind.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + 
"unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 10 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n resource_apply_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n) by (status, kind)\n", + "legendFormat": "{{ status }}/{{ kind }}" + } + ], + "title": "Resource Apply Rate by Status/Kind", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The duration of resource apply operations (P50 and P95).", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 10 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.5,\n sum(\n rate(\n resource_apply_duration_seconds_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.95,\n sum(\n rate(\n resource_apply_duration_seconds_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\"\n\n }[$__rate_interval]\n 
)\n ) by (le)\n)\n", + "legendFormat": "P95" + } + ], + "title": "Resource Apply Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of resource delete operations by status and kind.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n resource_delete_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n) by (status, kind)\n", + "legendFormat": "{{ status }}/{{ kind }}" + } + ], + "title": "Resource Delete Rate by Status/Kind", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The duration of resource delete operations (P50 and P95).", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.5,\n sum(\n rate(\n 
resource_delete_duration_seconds_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.95,\n sum(\n rate(\n resource_delete_duration_seconds_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P95" + } + ], + "title": "Resource Delete Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of status update operations by kind and status.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n status_update_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n) by (kind, status)\n", + "legendFormat": "{{ kind }}/{{ status }}" + } + ], + "title": "Status Update Rate by Kind/Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The duration of status update operations (P50 and P95).", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 9, + 
"options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.5,\n sum(\n rate(\n status_update_duration_seconds_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.95,\n sum(\n rate(\n status_update_duration_seconds_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P95" + } + ], + "title": "Status Update Duration", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "envoy", + "envoy-mixin", + "gateway" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "query": "label_values(xds_snapshot_update_total{}, cluster)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "query": "label_values(xds_snapshot_update_total{cluster=\"$cluster\"}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + 
"label": "Job", + "multi": true, + "name": "job", + "query": "label_values(xds_snapshot_update_total{cluster=\"$cluster\", namespace=~\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Envoy Gateway / Overview", + "uid": "envoy-gateway-overview-skj2" +} diff --git a/assets/envoy-2/dashboards/envoy-overview.json b/assets/envoy-2/dashboards/envoy-overview.json new file mode 100644 index 00000000..d4fef9ac --- /dev/null +++ b/assets/envoy-2/dashboards/envoy-overview.json @@ -0,0 +1,1653 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ ] + }, + "description": "A dashboard that monitors Envoy with a focus on giving a generic overview. The dashboards were generated using [envoy-mixin](https://github.com/adinhodovic/envoy-mixin). Open issues and create feature requests in the repository.", + "editable": false, + "links": [ + { + "asDropdown": true, + "includeVars": false, + "keepTime": true, + "tags": [ + "envoy", + "envoy-mixin", + "gateway" + ], + "targetBlank": true, + "title": "Envoy", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of Envoy pods being monitored.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "graphMode": "area", + "percentChangeColorMode": "standard", + "showPercentChange": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count(\n count (\n 
envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n ) by (pod)\n)\n" + } + ], + "title": "Envoy Pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of upstreams being monitored.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 3, + "options": { + "graphMode": "area", + "percentChangeColorMode": "standard", + "showPercentChange": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count(\n count(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n ) by (envoy_cluster_name)\n)\n" + } + ], + "title": "Upstreams", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of downstreams being monitored.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 4, + "options": { + "graphMode": "area", + "percentChangeColorMode": "standard", + "showPercentChange": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count(\n count(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n ) by (envoy_http_conn_manager_prefix)\n)\n" + } + ], + "title": "Downstreams", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + 
"description": "The total number of active upstream connections across all Envoy clusters being monitored.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "graphMode": "area", + "percentChangeColorMode": "standard", + "showPercentChange": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n envoy_cluster_upstream_cx_active{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n)\n" + } + ], + "title": "Upstream Active Connections", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of active downstream connections across all Envoy clusters being monitored.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 6, + "options": { + "graphMode": "area", + "percentChangeColorMode": "standard", + "showPercentChange": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n envoy_http_downstream_cx_active{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n)\n" + } + ], + "title": "Downstream Active Connections", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The percentage of healthy members in the Envoy clusters being monitored.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "percent" + } 
+ }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 7, + "options": { + "graphMode": "area", + "percentChangeColorMode": "standard", + "showPercentChange": false + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n envoy_cluster_membership_healthy{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n)\n/\nsum(\n envoy_cluster_membership_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n)\n* 100\n" + } + ], + "title": "Membership Healthy Percent", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of upstream request rates by Envoy cluster name.", + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 8, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(20,\n sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n ) by (envoy_cluster_name)\n)\n", + "instant": true, + "legendFormat": "{{ envoy_cluster_name }}" + } + ], + "title": "Upstream Rate by Envoy Cluster Name [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of upstream request rates by response code class.", + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [ ] + }, + 
"gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 5 + }, + "id": 9, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n) by (envoy_response_code_class)\n", + "instant": true, + "legendFormat": "{{ envoy_response_code_class }}xx" + } + ], + "title": "Upstream Rate by Code Class [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of downstream request rates by Envoy HTTP connection manager prefix.", + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 5 + }, + "id": 10, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(20,\n sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n ) by (envoy_http_conn_manager_prefix)\n)\n", + "instant": true, + "legendFormat": "{{ envoy_http_conn_manager_prefix }}" + } + ], + "title": "Downstream Rate by Envoy HTTP Conn Manager Prefix [1h]", + "type": "piechart" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of upstream request rates by pod.", + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 5 + }, + "id": 11, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n) by (pod)\n", + "instant": true, + "legendFormat": "{{ pod }}" + } + ], + "title": "Upstream Rate by Pod [1h]", + "type": "piechart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 12, + "title": "Upstream", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream request rate over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 12 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_total{\n 
cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n)\n", + "legendFormat": "Upstream" + } + ], + "title": "Upstream Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream latency percentiles over time.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.5,\n sum(\n rate(\n envoy_cluster_upstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.95,\n sum(\n rate(\n envoy_cluster_upstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": true, + "expr": "histogram_quantile(\n 0.99,\n sum(\n rate(\n envoy_cluster_upstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n 
}[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P99" + } + ], + "title": "Upstream Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream success rate over time, counting 5xx response codes as errors.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMax": 100, + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n,\n envoy_response_code_class!=\"5\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n)\n* 100\n", + "legendFormat": "Success Rate" + } + ], + "title": "Upstream Success Rate (Excluding 4xx errors)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream success rate over time, counting 4xx and 5xx response codes as errors.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMax": 100, + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 
12, + "y": 20 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n,\n envoy_response_code_class!~\"4|5\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n)\n* 100\n", + "legendFormat": "Success Rate" + } + ], + "title": "Upstream Success Rate (Including 4xx errors)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "An overview table showing various upstream metrics by Envoy cluster name [1h].", + "fieldConfig": { + "defaults": { + "links": [ + { + "targetBlank": true, + "title": "Go To Upstream", + "type": "dashboard", + "url": "/d/envoy-upstream-skj2/envoy-upstream?&var-envoy_cluster_name=${__data.fields.Envoy Cluster Name}" + } + ], + "thresholds": { + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "SSL Expirations" + }, + "properties": [ + { + "id": "unit", + "value": "dateTimeFromNow" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "P50 Latency" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "P95 Latency" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": 
"Request Rate" + }, + "properties": [ + { + "id": "unit", + "value": "reqps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Success Rate (5xx)" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Success Rate (4xx & 5xx)" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Healthy Cluster Percent" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 28 + }, + "id": 17, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Request Rate" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(40,\n sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n ) by (envoy_cluster_name)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n,\n envoy_response_code_class!=\"5\"\n }[1h]\n )\n) by (job, envoy_cluster_name)\n/\nsum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n) by (job, envoy_cluster_name)\n* 100\nand on (envoy_cluster_name) (\n topk(40,\n sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n 
)\n ) by (envoy_cluster_name)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n,\n envoy_response_code_class!~\"4|5\"\n }[1h]\n )\n) by (job, envoy_cluster_name)\n/\nsum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n) by (job, envoy_cluster_name)\n* 100\nand on (envoy_cluster_name) (\n topk(40,\n sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n ) by (envoy_cluster_name)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.5,\n sum(\n rate(\n envoy_cluster_upstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n ) by (le, job, envoy_cluster_name)\n)\nand on (envoy_cluster_name) (\n topk(40,\n sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n ) by (envoy_cluster_name)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.95,\n sum(\n rate(\n envoy_cluster_upstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n ) by 
(le, job, envoy_cluster_name)\n)\nand on (envoy_cluster_name) (\n topk(40,\n sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n ) by (envoy_cluster_name)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n avg_over_time(\n envoy_cluster_upstream_cx_active{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n) by (job, envoy_cluster_name)\nand on (envoy_cluster_name) (\n topk(40,\n sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n ) by (envoy_cluster_name)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n envoy_cluster_upstream_cx_destroy{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n) by (job, envoy_cluster_name)\nand on (envoy_cluster_name) (\n topk(40,\n sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n ) by (envoy_cluster_name)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n envoy_cluster_membership_healthy{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }\n) by (job, envoy_cluster_name)\n/\nsum(\n envoy_cluster_membership_total{\n 
cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }\n) by (job, envoy_cluster_name)\n* 100\nand on (envoy_cluster_name) (\n topk(40,\n sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=~\"$envoy_cluster_name\"\n\n }[1h]\n )\n ) by (envoy_cluster_name)\n)\n\n)\n\n", + "format": "table", + "instant": true + } + ], + "title": "Upstream Overview [1h]", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value #A": 1, + "Value #B": 2, + "Value #C": 3, + "Value #D": 4, + "Value #E": 5, + "Value #F": 6, + "Value #G": 7, + "Value #H": 8, + "Value #I": 9, + "envoy_cluster_name": 0 + }, + "renameByName": { + "Value #A": "Request Rate", + "Value #B": "Success Rate (5xx)", + "Value #C": "Success Rate (4xx & 5xx)", + "Value #D": "P50 Latency", + "Value #E": "P95 Latency", + "Value #F": "Active Connections", + "Value #G": "Destroyed Connections", + "Value #H": "Healthy Cluster Percent", + "envoy_cluster_name": "Envoy Cluster Name", + "job": "Job" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 18, + "title": "Downstream", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream request rate by Envoy HTTP connection manager prefix over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, 
+ "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "topk(20,\n sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n ) by (envoy_http_conn_manager_prefix)\n)\n", + "legendFormat": "{{ envoy_http_conn_manager_prefix }}" + } + ], + "title": "Downstream Rate by Envoy HTTP Conn Manager Prefix", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream latency percentiles over time.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "ms" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.5,\n sum(\n rate(\n envoy_http_downstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.95,\n sum(\n rate(\n envoy_http_downstream_rq_time_bucket{\n 
cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": true, + "expr": "histogram_quantile(\n 0.99,\n sum(\n rate(\n envoy_http_downstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P99" + } + ], + "title": "Downstream Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream success rate over time, counting 5xx response codes as errors.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMax": 100, + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 49 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n,\n envoy_response_code_class!=\"5\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n envoy_http_downstream_rq_xx{\n 
cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n)\n* 100\n", + "legendFormat": "Success Rate" + } + ], + "title": "Downstream Success Rate (Excluding 4xx errors)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The downstream success rate over time, counting 4xx and 5xx response codes as errors.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMax": 100, + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 49 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n,\n envoy_response_code_class!~\"4|5\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[$__rate_interval]\n )\n)\n* 100\n", + "legendFormat": "Success Rate" + } + ], + "title": "Downstream Success Rate (Including 4xx errors)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "An overview table showing various downstream metrics by Envoy HTTP connection 
manager prefix [1h].", + "fieldConfig": { + "defaults": { + "links": [ + { + "targetBlank": true, + "title": "Go To Downstream", + "type": "dashboard", + "url": "/d/envoy-downstream-skj2/envoy-downstream?var-envoy_http_conn_manager_prefix=${__data.fields.Envoy HTTP Conn Manager Prefix}" + } + ], + "thresholds": { + "steps": [ ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "P50 Latency" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "P95 Latency" + }, + "properties": [ + { + "id": "unit", + "value": "ms" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Request Rate" + }, + "properties": [ + { + "id": "unit", + "value": "reqps" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Success Rate (5xx)" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Success Rate (4xx & 5xx)" + }, + "properties": [ + { + "id": "unit", + "value": "percent" + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 57 + }, + "id": 23, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Request Rate" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(40,\n sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n ) by (envoy_http_conn_manager_prefix)\n)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_xx{\n 
cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n,\n envoy_response_code_class!=\"5\"\n }[1h]\n )\n) by (job, envoy_http_conn_manager_prefix)\n/\nsum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n) by (job, envoy_http_conn_manager_prefix)\n* 100\nand on (envoy_http_conn_manager_prefix) (\n topk(40,\n sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n ) by (envoy_http_conn_manager_prefix)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n,\n envoy_response_code_class!~\"4|5\"\n }[1h]\n )\n) by (job, envoy_http_conn_manager_prefix)\n/\nsum(\n rate(\n envoy_http_downstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n) by (job, envoy_http_conn_manager_prefix)\n* 100\nand on (envoy_http_conn_manager_prefix) (\n topk(40,\n sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n ) by (envoy_http_conn_manager_prefix)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": 
"histogram_quantile(\n 0.5,\n sum(\n rate(\n envoy_http_downstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n ) by (le, job, envoy_http_conn_manager_prefix)\n)\nand on (envoy_http_conn_manager_prefix) (\n topk(40,\n sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n ) by (envoy_http_conn_manager_prefix)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "histogram_quantile(\n 0.95,\n sum(\n rate(\n envoy_http_downstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n ) by (le, job, envoy_http_conn_manager_prefix)\n)\nand on (envoy_http_conn_manager_prefix) (\n topk(40,\n sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n ) by (envoy_http_conn_manager_prefix)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n avg_over_time(\n envoy_http_downstream_cx_active{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n) by (job, envoy_http_conn_manager_prefix)\nand on (envoy_http_conn_manager_prefix) (\n topk(40,\n sum(\n rate(\n envoy_http_downstream_rq_total{\n 
cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n ) by (envoy_http_conn_manager_prefix)\n)\n\n)\n\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n envoy_http_downstream_cx_destroy{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n) by (job, envoy_http_conn_manager_prefix)\nand on (envoy_http_conn_manager_prefix) (\n topk(40,\n sum(\n rate(\n envoy_http_downstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_http_conn_manager_prefix=~\"$envoy_http_conn_manager_prefix\"\n\n }[1h]\n )\n ) by (envoy_http_conn_manager_prefix)\n)\n\n)\n\n", + "format": "table", + "instant": true + } + ], + "title": "Downstream Overview [1h]", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value #A": 1, + "Value #B": 2, + "Value #C": 3, + "Value #D": 4, + "Value #E": 5, + "Value #F": 6, + "Value #G": 7, + "envoy_http_conn_manager_prefix": 0 + }, + "renameByName": { + "Value #A": "Request Rate", + "Value #B": "Success Rate (5xx)", + "Value #C": "Success Rate (4xx & 5xx)", + "Value #D": "P50 Latency", + "Value #E": "P95 Latency", + "Value #F": "Active Connections", + "Value #G": "Destroyed Connections", + "envoy_http_conn_manager_prefix": "Envoy HTTP Conn Manager Prefix", + "job": "Job" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 69 + }, + "id": 24, + "title": "SSL", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The SSL certificate 
expiration times by Envoy TLS certificate over time.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "dateTimeAsIso" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 70 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "min" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "min", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "min(\n envoy_listener_ssl_certificate_expiration_unix_time_seconds{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n) by (job, envoy_tls_certificate)\n* 1000\n", + "legendFormat": "{{ envoy_tls_certificate }}" + } + ], + "title": "SSL Expirations by Envoy TLS Certificate", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "envoy", + "envoy-mixin", + "gateway" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "hide": 2, + "label": "Cluster", + "name": "cluster", + "query": "label_values(envoy_cluster_upstream_rq_xx{}, cluster)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "query": "label_values(envoy_cluster_upstream_rq_xx{cluster=\"$cluster\"}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "query": 
"label_values(envoy_cluster_upstream_rq_xx{cluster=\"$cluster\", namespace=~\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Envoy Cluster Name", + "multi": true, + "name": "envoy_cluster_name", + "query": "label_values(envoy_cluster_upstream_rq_xx{cluster=\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}, envoy_cluster_name)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Envoy HTTP Conn Manager Prefix", + "multi": true, + "name": "envoy_http_conn_manager_prefix", + "query": "label_values(envoy_http_downstream_rq_xx{cluster=\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}, envoy_http_conn_manager_prefix)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Pod", + "multi": true, + "name": "pod", + "query": "label_values(envoy_listener_http_downstream_rq_xx{cluster=\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}, pod)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Envoy / Overview", + "uid": "envoy-overview-skj2" +} diff --git a/assets/envoy-2/dashboards/envoy-upstream.json b/assets/envoy-2/dashboards/envoy-upstream.json new file mode 100644 index 00000000..c3e4ad18 --- /dev/null +++ b/assets/envoy-2/dashboards/envoy-upstream.json @@ -0,0 +1,1122 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ ] + }, + "description": "A dashboard that monitors Envoy with a focus on giving an overview of upstreams. The dashboards were generated using [envoy-mixin](https://github.com/adinhodovic/envoy-mixin). 
Open issues and create feature requests in the repository.", + "editable": false, + "links": [ + { + "asDropdown": true, + "includeVars": false, + "keepTime": true, + "tags": [ + "envoy", + "envoy-mixin", + "gateway" + ], + "targetBlank": true, + "title": "Envoy", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of upstreams by job.", + "fieldConfig": { + "defaults": { + "unit": "upstreams" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 2, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n) by (job)\n", + "instant": true, + "legendFormat": "{{ job }}" + } + ], + "title": "Upstreams Count by Job", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of upstream request rates by Envoy cluster name.", + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 5 + }, + "id": 3, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "$datasource" + }, + "expr": "topk(20,\n sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }[1h]\n )\n ) by (envoy_cluster_name)\n)\n", + "instant": true, + "legendFormat": "{{ envoy_cluster_name }}" + } + ], + "title": "Upstream Rate by Envoy Cluster Name [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of upstream request rates by response code class.", + "fieldConfig": { + "defaults": { + "unit": "reqps" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 5 + }, + "id": 4, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }[1h]\n )\n) by (envoy_response_code_class)\n", + "instant": true, + "legendFormat": "{{ envoy_response_code_class }}xx" + } + ], + "title": "Upstream Rate by Code Class [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of active upstream connections by Envoy cluster name.", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 5 + }, + "id": 5, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(20,\n sum(\n envoy_cluster_upstream_cx_active{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }\n ) by (envoy_cluster_name)\n)\n", + "instant": true, + "legendFormat": "{{ envoy_cluster_name }}" + } + ], + "title": "Upstream Active Connections by Envoy Cluster Name", + "type": "piechart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 6, + "repeat": "envoy_cluster_name", + "title": "$envoy_cluster_name", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream request rate by Envoy cluster name over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n) by (envoy_cluster_name)\n", + "legendFormat": "{{ envoy_cluster_name }}" + } + ], + "title": "Upstream Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream latency percentiles over time.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "ms" + 
} + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.5,\n sum(\n rate(\n envoy_cluster_upstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P50" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "histogram_quantile(\n 0.95,\n sum(\n rate(\n envoy_cluster_upstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P95" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": true, + "expr": "histogram_quantile(\n 0.99,\n sum(\n rate(\n envoy_cluster_upstream_rq_time_bucket{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n ) by (le)\n)\n", + "legendFormat": "P99" + } + ], + "title": "Upstream Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream success rate over time, counting 5xx response codes as errors.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMax": 100, + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "percent" + } + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n,\n envoy_response_code_class!=\"5\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n)\n* 100\n", + "legendFormat": "Success Rate" + } + ], + "title": "Upstream Success Rate (Excluding 4xx errors)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream success rate over time, counting 4xx and 5xx response codes as errors.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMax": 100, + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_xx{\n 
cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n,\n envoy_response_code_class!~\"4|5\"\n }[$__rate_interval]\n )\n)\n/\nsum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n)\n* 100\n", + "legendFormat": "Success Rate" + } + ], + "title": "Upstream Success Rate (Including 4xx errors)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream request rate by response code class over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_xx{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n\n }[$__rate_interval]\n )\n) by (envoy_response_code_class)\n", + "legendFormat": "{{ envoy_response_code_class }}xx" + } + ], + "title": "Upstream Rate by Code Class", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream request rate by response code over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + 
} + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n) by (envoy_response_code)\n", + "legendFormat": "{{ envoy_response_code }}" + } + ], + "title": "Upstream Rate by Code", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The percentage of healthy upstream members by Envoy cluster name over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMax": 100, + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n envoy_cluster_membership_healthy{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }\n) by (job, envoy_cluster_name)\n/\nsum(\n envoy_cluster_membership_total{\n 
cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }\n) by (job, envoy_cluster_name)\n* 100\n", + "legendFormat": "{{ envoy_cluster_name }}" + } + ], + "title": "Upstream Healthy Percent", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream connections over time.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n envoy_cluster_upstream_cx_active{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }\n) by (envoy_cluster_name)\n", + "legendFormat": "Active Connections" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n envoy_cluster_upstream_cx_overflow{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_cluster_name)\n", + "legendFormat": "Overflow Connections" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n envoy_cluster_upstream_cx_destroy{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n) by (job, 
envoy_cluster_name)\n", + "legendFormat": "Destroyed Connections" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n envoy_cluster_upstream_cx_connect_fail{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_cluster_name)\n", + "legendFormat": "Connect Failures" + } + ], + "title": "Upstream Connections", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream circuit breakers over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n envoy_cluster_circuit_breakers_default_cx_open{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }\n) by (job, envoy_cluster_name)\n", + "legendFormat": "Open Connections" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n envoy_cluster_circuit_breakers_default_cx_pool_open{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }\n) by (job, envoy_cluster_name)\n", + "legendFormat": "Open Pool Connections" + }, + { + "datasource": 
{ + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n envoy_cluster_circuit_breakers_default_rq_open{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }\n) by (job, envoy_cluster_name)\n", + "legendFormat": "Open Requests" + } + ], + "title": "Upstream Circuit Breakers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream retry rates over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_retry{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_cluster_name)\n", + "legendFormat": "Retry Rate" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_retry_overflow{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_cluster_name)\n", + "legendFormat": "Retry Overflow Rate" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n 
rate(\n envoy_cluster_upstream_rq_timeout{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n) by (job, envoy_cluster_name)\n", + "legendFormat": "Timeout Rate" + } + ], + "title": "Upstream Retry Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream request rate by pod over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n envoy_cluster_upstream_rq_total{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }[$__rate_interval]\n )\n) by (pod, envoy_cluster_name)\n", + "legendFormat": "{{ pod }}" + } + ], + "title": "Upstream Rate by Pod", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The upstream active connections by pod over time.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + 
"showLegend": true, + "sortBy": "mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n envoy_cluster_upstream_cx_active{\n cluster=\"$cluster\",\nnamespace=~\"$namespace\",\njob=~\"$job\",\npod=~\"$pod\"\n\n,\nenvoy_cluster_name=\"$envoy_cluster_name\"\n\n }\n) by (pod, envoy_cluster_name)\n", + "legendFormat": "{{ pod }}" + } + ], + "title": "Upstream Active Connections by Pod", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "envoy", + "envoy-mixin", + "gateway" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "hide": 2, + "label": "Cluster", + "name": "cluster", + "query": "label_values(envoy_cluster_upstream_rq_xx{}, cluster)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "query": "label_values(envoy_cluster_upstream_rq_xx{cluster=\"$cluster\"}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Job", + "multi": true, + "name": "job", + "query": "label_values(envoy_cluster_upstream_rq_xx{cluster=\"$cluster\", namespace=~\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Envoy Cluster Name", + "multi": true, + "name": "envoy_cluster_name", + "query": 
"label_values(envoy_cluster_upstream_rq_xx{cluster=\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}, envoy_cluster_name)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Pod", + "multi": true, + "name": "pod", + "query": "label_values(envoy_cluster_upstream_rq_xx{cluster=\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", envoy_cluster_name=~\"$envoy_cluster_name\"}, pod)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Envoy / Upstream", + "uid": "envoy-upstream-skj2" +} diff --git a/assets/envoy-2/rules.yaml b/assets/envoy-2/rules.yaml new file mode 100644 index 00000000..2ae22208 --- /dev/null +++ b/assets/envoy-2/rules.yaml @@ -0,0 +1 @@ +groups: [] diff --git a/assets/syncthing/alerts.yaml b/assets/syncthing/alerts.yaml new file mode 100644 index 00000000..f1d479bc --- /dev/null +++ b/assets/syncthing/alerts.yaml @@ -0,0 +1,40 @@ +groups: +- name: syncthing + rules: + - alert: SyncthingEventsDropped + annotations: + dashboard_url: https://grafana.com/d/syncthing-overview-jkwq/syncthing-overview?var-job={{ + $labels.job }} + description: The job {{ $labels.job }} has dropped events of type {{ $labels.event + }} in the last minute. + summary: Syncthing events dropped. + expr: | + sum( + increase( + syncthing_events_total{ + state="dropped" + }[5m] + ) + ) by (cluster, job, event) + > 0 + for: 1m + labels: + severity: warning + - alert: SyncthingFolderOutOfSync + annotations: + dashboard_url: https://grafana.com/d/syncthing-overview-jkwq/syncthing-overview?var-job={{ + $labels.job }}&var-folder={{ $labels.folder }} + description: The folder {{ $labels.folder }} in job {{ $labels.job }} is out + of sync for more than 1h. + summary: Syncthing folder out of sync. 
+ expr: | + sum( + syncthing_model_folder_summary{ + scope="need", + type="bytes" + } + ) by (cluster, job, folder) + > 0 + for: 1h + labels: + severity: info diff --git a/assets/syncthing/dashboards/syncthing-overview.json b/assets/syncthing/dashboards/syncthing-overview.json new file mode 100644 index 00000000..b7eaab25 --- /dev/null +++ b/assets/syncthing/dashboards/syncthing-overview.json @@ -0,0 +1,1829 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ ] + }, + "description": "A dashboard that monitors Syncthing. The dashboards were generated using syncthing-mixin. https://github.com/adinhodovic/syncthing-mixin. Open issues and create feature requests in the repository.", + "editable": false, + "links": [ + { + "asDropdown": false, + "includeVars": false, + "keepTime": true, + "tags": [ + "syncthing", + "syncthing-mixin" + ], + "targetBlank": true, + "title": "Syncthing", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of folders being synchronized by Syncthing.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count(\n syncthing_model_folder_state{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }\n) by (folder)\n" + } + ], + "title": "Folders", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of devices connected to Syncthing.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + 
"steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 3, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n count(\n syncthing_protocol_recv_bytes_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }\n ) by (device)\n)\n" + } + ], + "title": "Devices", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of events occurring in Syncthing, measured in events per second.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 4, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n increase(\n syncthing_events_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n)\n" + } + ], + "title": "Events", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of filesystem operations performed by Syncthing, measured in operations per second.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 5, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n increase(\n syncthing_fs_operations_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n)\n" + } + ], + "title": "Filesystem Operations", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + 
"defaults": { + "unit": "bool" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 6, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n syncthing_model_folder_state{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n } == 0\n)\n", + "instant": true, + "legendFormat": "Synced" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n syncthing_model_folder_state{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n } > 0\n)\n", + "instant": true, + "legendFormat": "Unsynced" + } + ], + "title": "Folders by State", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The distribution of events in Syncthing, categorized by event type over the last hour.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 7, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n syncthing_events_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }[1h]\n )\n) by (event)\n> 0\n", + "instant": true, + "legendFormat": "{{ event }}" + } + ], + "title": "Events by Type [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of bytes in each folder being synchronized by 
Syncthing.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n syncthing_model_folder_summary{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n scope=\"global\",\n type=\"bytes\"\n }\n)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Folder Total Bytes", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of directories in each folder being synchronized by Syncthing.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n syncthing_model_folder_summary{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n scope=\"global\",\n type=\"directories\"\n }\n)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Folder Total Directories", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "$datasource" + }, + "description": "The total number of files in each folder being synchronized by Syncthing.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n syncthing_model_folder_summary{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n scope=\"global\",\n type=\"files\"\n }\n)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Folder Total Files", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of deleted items in each folder being synchronized by Syncthing.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n syncthing_model_folder_summary{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n scope=\"global\",\n type=\"deleted\"\n }\n)\n", + "legendFormat": "{{ folder }}" + } + ], + 
"title": "Folder Total Deleted Items", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of created events in Syncthing, categorized by event type and measured in events per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 14 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n syncthing_events_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n state=\"created\"\n }[$__rate_interval]\n )\n) by (event)\n", + "legendFormat": "{{ event }}" + } + ], + "title": "Events Created Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of delivered events in Syncthing, categorized by event type and measured in events per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 14 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": 
"sum(\n increase(\n syncthing_events_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n state=\"delivered\"\n }[$__rate_interval]\n )\n) by (event)\n", + "legendFormat": "{{ event }}" + } + ], + "title": "Events Delivered Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of Dropped events in Syncthing, categorized by event type and measured in events per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 14 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n syncthing_events_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n state=\"dropped\"\n }[$__rate_interval]\n )\n) by (event)\n", + "legendFormat": "{{ event }}" + } + ], + "title": "Events Dropped Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 15, + "title": "Folders", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The current state of each folder being synchronized by Syncthing, categorized by state type.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 22 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + 
"displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n syncthing_model_folder_state{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nfolder=~\"$folder\"\n\n }\n) by (folder, state)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Folder State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The number of bytes needed to be synchronized in each folder by Syncthing, categorized by folder.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 22 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n syncthing_model_folder_summary{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nfolder=~\"$folder\"\n,\n scope=\"need\",\n type=\"bytes\"\n }\n) by (folder)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Needed Bytes by Folder", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of bytes in each folder being synchronized by Syncthing, categorized by folder.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + 
} + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n syncthing_model_folder_summary{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nfolder=~\"$folder\"\n,\n scope=\"global\",\n type=\"bytes\"\n }\n) by (folder)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Total Bytes by Folder", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of directories in each folder being synchronized by Syncthing, categorized by folder.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n syncthing_model_folder_summary{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nfolder=~\"$folder\"\n,\n scope=\"global\",\n type=\"directories\"\n }\n) by (folder)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Total Directories by Folder", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total 
number of files in each folder being synchronized by Syncthing, categorized by folder.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n syncthing_model_folder_summary{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nfolder=~\"$folder\"\n,\n scope=\"global\",\n type=\"files\"\n }\n) by (folder)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Total Files by Folder", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of deleted items in each folder being synchronized by Syncthing, categorized by folder.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n syncthing_model_folder_summary{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nfolder=~\"$folder\"\n,\n scope=\"global\",\n type=\"deleted\"\n 
}\n) by (folder)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Total Deleted Items by Folder", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of pull operations for each folder, measured in pulls per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n syncthing_model_folder_pulls_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nfolder=~\"$folder\"\n\n }[$__rate_interval]\n )\n) by (folder)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Folder Pulls Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of scan operations for each folder, measured in scans per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, 
+ "exemplar": false, + "expr": "sum(\n increase(\n syncthing_model_folder_scans_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nfolder=~\"$folder\"\n\n }[$__rate_interval]\n )\n) by (folder)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Folder Scans Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of bytes hashed during scans by Syncthing, measured in bytes per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 46 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n rate(\n syncthing_scanner_hashed_bytes_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nfolder=~\"$folder\"\n\n }[$__rate_interval]\n )\n) by (folder)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Scanner Hashed Bytes Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of items scanned by Syncthing, measured in items per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 46 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + 
"tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n syncthing_scanner_scanned_items_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nfolder=~\"$folder\"\n\n }[$__rate_interval]\n )\n) by (folder)\n", + "legendFormat": "{{ folder }}" + } + ], + "title": "Scanner Scanned Items Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 51 + }, + "id": 26, + "title": "Filesystem", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of filesystem operations performed by Syncthing, categorized by operation type and root directory, measured in operations per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 52 + }, + "id": 27, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n syncthing_fs_operations_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nroot=~\"$file_system\"\n\n }[$__rate_interval]\n )\n) by (root, operation)\n> 0\n", + "legendFormat": "{{ root }} - {{ operation }}" + } + ], + "title": "Filesystem Operations Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of bytes processed during filesystem operations by Syncthing, categorized by 
operation type and root directory, measured in bytes per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 28, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n syncthing_fs_operation_bytes_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\nroot=~\"$file_system\"\n\n }[$__rate_interval]\n )\n) by (root, operation)\n> 0\n", + "legendFormat": "{{ root }} - {{ operation }}" + } + ], + "title": "Filesystem Operations Bytes Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of time spent on filesystem operations by Syncthing, categorized by operation type and root directory, measured in seconds per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "s" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 29, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n syncthing_fs_operation_seconds_total{\n 
cluster=\"$cluster\",\njob=~\"$job\"\n,\nroot=~\"$file_system\"\n\n }[$__rate_interval]\n )\n) by (root, operation)\n> 0\n", + "legendFormat": "{{ root }} - {{ operation }}" + } + ], + "title": "Filesystem Operations Seconds Rate", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 64 + }, + "id": 30, + "title": "Network", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of bytes sent over the network by Syncthing, measured in bytes per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 31, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n label_replace(\n increase(\n syncthing_protocol_sent_bytes_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\ndevice=~\"$device\"\n\n }[$__rate_interval]\n ),\n \"device\", \"$1\", \"device\", \"^(.{1,15}).*\"\n )\n) by (device)\n", + "legendFormat": "{{ device }}" + } + ], + "title": "Network Sent Bytes Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of messages sent over the network by Syncthing, measured in messages per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 65 + }, + 
"id": 32, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n label_replace(\n increase(\n syncthing_protocol_sent_messages_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\ndevice=~\"$device\"\n\n }[$__rate_interval]\n ),\n \"device\", \"$1\", \"device\", \"^(.{1,15}).*\"\n )\n) by (device)\n", + "legendFormat": "{{ device }}" + } + ], + "title": "Network Sent Messages Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The rate of bytes received over the network by Syncthing, measured in bytes per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "bytes" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 71 + }, + "id": 33, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n label_replace(\n increase(\n syncthing_protocol_recv_bytes_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\ndevice=~\"$device\"\n\n }[$__rate_interval]\n ),\n \"device\", \"$1\", \"device\", \"^(.{1,15}).*\"\n )\n) by (device)\n", + "legendFormat": "{{ device }}" + } + ], + "title": "Network Received Bytes Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"$datasource" + }, + "description": "The rate of messages received over the network by Syncthing, measured in messages per second.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 71 + }, + "id": 34, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n label_replace(\n increase(\n syncthing_protocol_recv_messages_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\ndevice=~\"$device\"\n\n }[$__rate_interval]\n ),\n \"device\", \"$1\", \"device\", \"^(.{1,15}).*\"\n )\n) by (device)\n", + "legendFormat": "{{ device }}" + } + ], + "title": "Network Received Messages Rate", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "syncthing", + "syncthing-mixin" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "hide": 2, + "label": "Cluster", + "name": "cluster", + "query": "label_values(syncthing_events_total{}, cluster)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "query": "label_values(syncthing_events_total{cluster=\"$cluster\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Folder", + "multi": true, + "name": "folder", + "query": "label_values(syncthing_model_folder_summary{cluster=\"$cluster\", job=~\"$job\"}, folder)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "Device", + "multi": true, + "name": "device", + "query": "label_values(syncthing_protocol_sent_bytes_total{cluster=\"$cluster\", job=~\"$job\"}, device)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "File System", + "multi": true, + "name": "file_system", + "query": "label_values(syncthing_fs_operations_total{cluster=\"$cluster\", job=~\"$job\"}, root)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timezone": "utc", + "title": "Syncthing / Overview", + "uid": "syncthing-overview-jkwq" +} diff --git a/assets/syncthing/rules.yaml b/assets/syncthing/rules.yaml new file mode 100644 index 00000000..2ae22208 --- /dev/null +++ b/assets/syncthing/rules.yaml @@ -0,0 +1 @@ +groups: [] diff --git a/assets/tailscale/alerts.yaml b/assets/tailscale/alerts.yaml new file mode 100644 index 00000000..f6525451 --- /dev/null +++ b/assets/tailscale/alerts.yaml @@ -0,0 +1,109 @@ +groups: +- name: tailscale-tailnet-alerts + rules: + - alert: TailscaleDeviceUnauthorized + annotations: + dashboard_url: https://grafana.com/d/tailscale-mixin-over-k12e/tailscale-overview + description: 'Tailscale Device {{ $labels.name }} (ID: {{ $labels.id }}) in + Tailnet {{ $labels.tailnet }} is unauthorized. Please authorize it in the + Tailscale admin console.' 
+ summary: Tailscale Device is Unauthorized + expr: | + sum( + tailscale_devices_authorized + ) by (tailnet, name, id) + == 0 + for: 15m + labels: + mixin: tailscale + severity: warning + - alert: TailscaleUserUnapproved + annotations: + dashboard_url: https://grafana.com/d/tailscale-mixin-over-k12e/tailscale-overview + description: 'Tailscale User {{ $labels.login_name }} (ID: {{ $labels.id }}) + in Tailnet {{ $labels.tailnet }} is unapproved. Please approve it in the Tailscale + admin console.' + summary: Tailscale User is Unapproved + expr: | + sum( + tailscale_users_info{ + status="needs-approval" + } + ) by (tailnet, login_name, id) + == 1 + for: 15m + labels: + mixin: tailscale + severity: warning + - alert: TailscaleUserRecentlyCreated + annotations: + dashboard_url: https://grafana.com/d/tailscale-mixin-over-k12e/tailscale-overview + description: 'Tailscale User {{ $labels.login_name }} (ID: {{ $labels.id }}) + in Tailnet {{ $labels.tailnet }} was created within the last 300 seconds.' + summary: Tailscale User Recently Created + expr: | + time() - + ( + max( + tailscale_users_created_timestamp{} + ) by (tailnet, id, login_name) + ) + < 300 + labels: + mixin: tailscale + severity: info + - alert: TailscaleDeviceUnapprovedRoutes + annotations: + dashboard_url: https://grafana.com/d/tailscale-mixin-over-k12e/tailscale-overview + description: 'Tailscale Device {{ $labels.name }} (ID: {{ $labels.id }}) in + Tailnet {{ $labels.tailnet }} has more than 10% unapproved routes for longer + than 15m.' 
+ summary: Tailscale Device has Unapproved Routes + expr: | + 100 - + ( + ( + sum( + tailscale_devices_routes_enabled + ) by (tailnet, name, id) + / + sum( + tailscale_devices_routes_advertised + ) by (tailnet, name, id) + ) + * 100 + ) + > 10 + for: 15m + labels: + mixin: tailscale + severity: warning +- name: tailscaled-machine-alerts + rules: + - alert: TailscaledMachineHighOutboundDroppedPackets + annotations: + dashboard_url: https://grafana.com/d/tailscaled-mixin-over-k12e/tailscale-machine?var-tailscale_machine={{ + $labels.tailscale_machine }} + description: Tailscaled Machine {{ $labels.tailscale_machine }} has a high rate + of outbound dropped packets (>{{ 50 }}%) for longer than 15m. + summary: Tailscaled Machine has High Outbound Dropped Packets + expr: | + sum( + increase( + tailscaled_outbound_dropped_packets_total{} + [5m] + ) + ) by (tailscale_machine) + / + sum ( + increase( + tailscaled_outbound_packets_total{} + [5m] + ) + ) by (tailscale_machine) + * 100 + > 50 + for: 15m + labels: + mixin: tailscale + severity: warning diff --git a/assets/tailscale/dashboards/tailscale-machine.json b/assets/tailscale/dashboards/tailscale-machine.json new file mode 100644 index 00000000..d8c7760b --- /dev/null +++ b/assets/tailscale/dashboards/tailscale-machine.json @@ -0,0 +1,1492 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ ] + }, + "description": "A dashboard that gives an overview of Tailscale Machine daemon metrics. The dashboards were generated using [tailscale-mixin](https://github.com/adinhodovic/tailscale-exporter/tree/main/tailscale-mixin). 
Open issues and create feature requests in the repository.", + "editable": false, + "links": [ + { + "asDropdown": false, + "includeVars": false, + "keepTime": true, + "tags": [ + "tailscale", + "tailscale-mixin" + ], + "targetBlank": true, + "title": "Tailscale", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A stat panel showing the number of Tailscale machines reporting to the selected Tailscale control plane.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "count(\n tailscaled_health_messages{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }\n)\n" + } + ], + "title": "Tailscale Machines", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart panel showing the number of advertised and approved routes for the selected Tailscale machines.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 3, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscaled_advertised_routes{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }\n)\n", + "instant": true, + "legendFormat": 
"Advertised" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscaled_approved_routes{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }\n)\n", + "instant": true, + "legendFormat": "Approved" + } + ], + "title": "Advertised / Approved Routes", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart panel showing the distribution of inbound paths for the selected Tailscale machines.", + "fieldConfig": { + "defaults": { + "unit": "bps" + } + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 4, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n tailscaled_inbound_bytes_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[1h]\n )\n) by (path)\n", + "instant": true, + "legendFormat": "{{ path }}" + } + ], + "title": "Paths Distribution Inbound [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart panel showing the distribution of outbound paths for the selected Tailscale machines.", + "fieldConfig": { + "defaults": { + "unit": "bps" + } + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n tailscaled_outbound_bytes_total{\n
cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[1h]\n )\n) by (path)\n", + "instant": true, + "legendFormat": "{{ path }}" + } + ], + "title": "Paths Distribution Outbound [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart panel showing the split between inbound and outbound traffic for the selected Tailscale machines.", + "fieldConfig": { + "defaults": { + "unit": "bps" + } + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 6, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n tailscaled_inbound_bytes_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[1h]\n )\n)\n", + "instant": true, + "legendFormat": "Inbound" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n tailscaled_outbound_bytes_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[1h]\n )\n)\n", + "instant": true, + "legendFormat": "Outbound" + } + ], + "title": "Inbound vs Outbound Traffic [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart panel showing the distribution of dropped packets by reason for the selected Tailscale machines.", + "fieldConfig": { + "defaults": { + "unit": "pps" + } + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 7, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { +
"datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n tailscaled_outbound_dropped_packets_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[1h]\n )\n) by (reason)\n", + "instant": true, + "legendFormat": "{{ reason }}" + } + ], + "title": "Dropped Packets by Reason [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A table panel showing the top 20 Tailscale machines by inbound traffic over the last hour.", + "fieldConfig": { + "defaults": { + "unit": "Bps" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 7 + }, + "id": 8, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Inbound Traffic (Bps)" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "topk(\n 20,\n sum(\n rate(\n tailscaled_inbound_bytes_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[1h]\n )\n ) by (tailscale_machine)\n)\n", + "format": "table", + "instant": true + } + ], + "title": "Top 20 Machines by Inbound Traffic (1h)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value": 1, + "tailscale_machine": 0 + }, + "renameByName": { + "Value": "Inbound Traffic (Bps)", + "tailscale_machine": "Tailscale Machine" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A table panel showing the Tailscale machines with advertised routes that have not been approved.", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 7 + }, + "id": 9, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Unapproved Routes" + } + ] + }, +
"pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscaled_advertised_routes{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }\n) by (tailscale_machine)\n-\nsum(\n tailscaled_approved_routes{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }\n) by (tailscale_machine)\n> 0\n", + "format": "table", + "instant": true + } + ], + "title": "Machines with Unapproved Routes", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value": 1, + "tailscale_machine": 0 + }, + "renameByName": { + "Value": "Unapproved Routes", + "tailscale_machine": "Tailscale Machine" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A table panel showing the Tailscale machines with dropped packets in the last hour.", + "fieldConfig": { + "defaults": { + "unit": "percent" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 7 + }, + "id": 10, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Dropped Packets" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n tailscaled_outbound_dropped_packets_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[1h]\n )\n) by (tailscale_machine)\n/\nsum(\n increase(\n tailscaled_outbound_packets_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[1h]\n )\n) by (tailscale_machine)\n* 100\n> 0\n", + "format": "table", + "instant": true + } + ], + "title": "Machines with Dropped Packets (1h)", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "job": true + }, + "indexByName": { + "Value": 1, + "tailscale_machine": 0 + }, + "renameByName": { + "Value": "Dropped 
Packets", + "tailscale_machine": "Tailscale Machine" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 11, + "title": "Network Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A bar gauge panel showing the number of health messages by type.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n tailscaled_health_messages{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }\n) by (type)\n", + "legendFormat": "{{ type }}" + } + ], + "title": "Health Messages by Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the outbound dropped packets by reason.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "pps" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + 
"uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n tailscaled_outbound_dropped_packets_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n) by (reason)\n", + "legendFormat": "{{ reason }}" + } + ], + "title": "Outbound Dropped Packets by Reason", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the inbound bytes by path.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "bps" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n tailscaled_inbound_bytes_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n) by (path)\n", + "legendFormat": "{{ path }}" + } + ], + "title": "Inbound Bytes by Path", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the inbound packets by path.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "pps" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + 
}, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n tailscaled_inbound_packets_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n) by (path)\n", + "legendFormat": "{{ path }}" + } + ], + "title": "Inbound Packets by Path", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the outbound bytes by path.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "bps" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n tailscaled_outbound_bytes_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n) by (path)\n", + "legendFormat": "{{ path }}" + } + ], + "title": "Outbound Bytes by Path", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the outbound packets by path.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "pps" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + 
"sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n tailscaled_outbound_packets_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n) by (path)\n", + "legendFormat": "{{ path }}" + } + ], + "title": "Outbound Packets by Path", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 18, + "repeat": "tailscale_machine", + "title": "Tailscale Machine $tailscale_machine", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart panel showing the number of advertised and approved routes for the selected Tailscale machines.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 0, + "y": 32 + }, + "id": 19, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscaled_advertised_routes{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n\n }\n)\n", + "instant": true, + "legendFormat": "Advertised" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscaled_approved_routes{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n\n }\n)\n", + "instant": true, + "legendFormat": "Approved" + } + ], + "title": "Advertised / Approved Routes", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + 
"description": "A pie chart panel showing the DERP vs Non-DERP outbound traffic for the selected Tailscale machine.", + "fieldConfig": { + "defaults": { + "unit": "bps" + } + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 8, + "y": 32 + }, + "id": 20, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n tailscaled_outbound_bytes_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n,\n path=\"derp\"\n }[1h]\n )\n)\n", + "instant": true, + "legendFormat": "DERP" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n rate(\n tailscaled_outbound_bytes_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n,\n path!=\"derp\"\n }[1h]\n )\n)\n", + "instant": true, + "legendFormat": "Non-DERP" + } + ], + "title": "DERP vs Non-DERP Outbound Traffic [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart panel showing the distribution of dropped packets by reason for the selected Tailscale machines.", + "fieldConfig": { + "defaults": { + "unit": "pps" + } + }, + "gridPos": { + "h": 4, + "w": 8, + "x": 16, + "y": 32 + }, + "id": 21, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n increase(\n tailscaled_outbound_dropped_packets_total{\n 
cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n\n }[1h]\n )\n) by (reason)\n", + "instant": true, + "legendFormat": "{{ reason }}" + } + ], + "title": "Dropped Packets by Reason [1h]", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the number of health messages by type for the selected Tailscale machine.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n tailscaled_health_messages{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n\n }\n) by (type)\n", + "legendFormat": "{{ type }}" + } + ], + "title": "Health Messages by Type", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the outbound dropped packets by reason for the selected Tailscale machine.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode":
"multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n tailscaled_outbound_dropped_packets_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n\n }[$__rate_interval]\n )\n) by (reason)\n", + "legendFormat": "{{ reason }}" + } + ], + "title": "Outbound Dropped Packets by Reason", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the inbound bytes by path for the selected Tailscale machine.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n tailscaled_inbound_bytes_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n\n }[$__rate_interval]\n )\n) by (path)\n", + "legendFormat": "{{ path }}" + } + ], + "title": "Inbound Bytes by Path", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the inbound packets by path for the selected Tailscale machine.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "pps" + } + }, + "gridPos": { + "h": 5, 
+ "w": 12, + "x": 12, + "y": 41 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n tailscaled_inbound_packets_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n\n }[$__rate_interval]\n )\n) by (path)\n", + "legendFormat": "{{ path }}" + } + ], + "title": "Inbound Packets by Path", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the outbound bytes by path for the selected Tailscale machine.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 0, + "y": 46 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n tailscaled_outbound_bytes_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n\n }[$__rate_interval]\n )\n) by (path)\n", + "legendFormat": "{{ path }}" + } + ], + "title": "Outbound Bytes by Path", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the outbound packets by path for 
the selected Tailscale machine.", + "fieldConfig": { + "defaults": { + "custom": { + "axisSoftMin": 0, + "fillOpacity": 100, + "lineWidth": 1, + "stacking": { + "mode": "normal" + } + }, + "unit": "pps" + } + }, + "gridPos": { + "h": 5, + "w": 12, + "x": 12, + "y": 46 + }, + "id": 27, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n increase(\n tailscaled_outbound_packets_total{\n cluster=\"$cluster\"\n,\njob=~\"$job\"\n,\ntailscale_machine=\"$tailscale_machine\"\n\n }[$__rate_interval]\n )\n) by (path)\n", + "legendFormat": "{{ path }}" + } + ], + "title": "Outbound Packets by Path", + "type": "timeseries" + } + ], + "schemaVersion": 39, + "tags": [ + "tailscale", + "tailscale-mixin" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "hide": 2, + "label": "Cluster", + "name": "cluster", + "query": "label_values(tailscaled_health_messages, cluster)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "query": "label_values(tailscaled_health_messages{cluster=\"$cluster\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Tailscale Machine", + "name": "tailscale_machine", + "query": 
"label_values(tailscaled_health_messages{cluster=\"$cluster\", job=~\"$job\"}, tailscale_machine)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timezone": "utc", + "title": "Tailscale / Machine", + "uid": "tailscaled-mixin-over-k12e" +} diff --git a/assets/tailscale/dashboards/tailscale-overview.json b/assets/tailscale/dashboards/tailscale-overview.json new file mode 100644 index 00000000..20566481 --- /dev/null +++ b/assets/tailscale/dashboards/tailscale-overview.json @@ -0,0 +1,1613 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ ] + }, + "description": "A dashboard that gives an overview of Tailscale API metrics. The dashboards were generated using [tailscale-mixin](https://github.com/adinhodovic/tailscale-exporter/tree/main/tailscale-mixin). Open issues and create feature requests in the repository.", + "editable": false, + "links": [ + { + "asDropdown": false, + "includeVars": false, + "keepTime": true, + "tags": [ + "tailscale", + "tailscale-mixin" + ], + "targetBlank": true, + "title": "Tailscale", + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "title": "Summary", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of users in the selected tailnet.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 2, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n tailscale_users_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n)\n" + } + ], + "title": "Total Users", + "type": "stat" 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of devices in the selected tailnet.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 1 + }, + "id": 3, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n tailscale_devices_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n)\n" + } + ], + "title": "Total Devices", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of devices that are currently online in the selected tailnet.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 1 + }, + "id": 4, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n tailscale_devices_online{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n)\n" + } + ], + "title": "Devices Logged In", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of keys in the selected tailnet.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 12, + "y": 1 + }, + "id": 5, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n tailscale_keys_info{\n 
cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n)\n" + } + ], + "title": "Total Keys", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A table showing the nameservers configured for the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "string" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 16, + "y": 1 + }, + "id": 6, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": false, + "displayName": "Nameserver" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscale_dns_nameservers_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (nameserver)\n", + "format": "table", + "instant": true + } + ], + "title": "Nameservers", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true + }, + "indexByName": { + "Value": 1, + "nameserver": 0 + }, + "renameByName": { + "nameserver": "Nameserver" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "Whether Magic DNS is enabled for the selected tailnet.", + "fieldConfig": { + "defaults": { + "mappings": [ ], + "thresholds": { + "steps": [ + { + "color": "green", + "value": 0 + } + ] + }, + "unit": "bool" + } + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 7, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(\n tailscale_dns_magic_dns{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n)\n" + } + ], + "title": "Magic DNS Enabled", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "$datasource" + }, + "description": "A table showing the current settings for the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "bool" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 8, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": true, + "displayName": "Tailnet" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "tailscale_tailnet_settings_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n}\n", + "format": "table", + "instant": true + } + ], + "title": "Tailnet Settings", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "cluster": true, + "container": true, + "endpoint": true, + "environment": true, + "instance": true, + "job": true, + "namespace": true, + "pod": true, + "prometheus": true, + "region": true, + "service": true + }, + "indexByName": { + "Value": 8, + "acls_externally_managed_on": 1, + "devices_approval_on": 2, + "devices_auto_updates_on": 3, + "network_flow_logging_on": 4, + "posture_identity_collection_on": 5, + "regional_routing_on": 6, + "tailnet": 0, + "users_approval_on": 7 + }, + "renameByName": { + "Value": "Up", + "acls_externally_managed_on": "ACLs Externally Managed", + "devices_approval_on": "Devices Approval", + "devices_auto_updates_on": "Devices Auto Updates", + "network_flow_logging_on": "Network Flow Logging", + "posture_identity_collection_on": "Posture Identity Collection", + "regional_routing_on": "Regional Routing", + "tailnet": "Tailnet", + "users_approval_on": "Users Approval" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 9, + "title": "Devices", + "type": "row" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart showing the distribution of devices by operating system in the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 11 + }, + "id": 10, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n tailscale_devices_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (os)\n", + "instant": true, + "legendFormat": "{{ os }}" + } + ], + "title": "Devices by OS", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart showing the distribution of devices by Tailscale client version in the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 11 + }, + "id": 11, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n tailscale_devices_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (client_version)\n", + "instant": true, + "legendFormat": "{{ client_version }}" + } + ], + "title": "Devices by Version", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total 
number of devices that have an update available in the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 11 + }, + "id": 12, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n tailscale_devices_update_available{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } == 1\n)\n", + "instant": true, + "legendFormat": "Update Available" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n tailscale_devices_update_available{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } == 0\n)\n", + "instant": true, + "legendFormat": "No Update Available" + } + ], + "title": "Devices with Update Available", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of devices that are authorized to access the tailnet.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 11 + }, + "id": 13, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n tailscale_devices_authorized{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } == 1\n)\n", + "instant": true, + 
"legendFormat": "Authorized" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n tailscale_devices_authorized{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } == 0\n)\n", + "instant": true, + "legendFormat": "Not Authorized" + } + ], + "title": "Authorized Devices", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A table showing all devices in the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "string" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 14, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": false, + "displayName": "Name" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "tailscale_devices_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n}\n", + "format": "table", + "instant": true + } + ], + "title": "Devices", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value": true, + "__name__": true, + "cluster": true, + "container": true, + "endpoint": true, + "environment": true, + "instance": true, + "job": true, + "namespace": true, + "pod": true, + "prometheus": true, + "region": true, + "service": true, + "tailnet": true + }, + "indexByName": { + "client_version": 3, + "hostname": 4, + "id": 6, + "machine_key": 7, + "name": 0, + "node_key": 8, + "os": 2, + "tailscale_ip": 5, + "user": 1 + }, + "renameByName": { + "client_version": "Client Version", + "hostname": "Host Name", + "id": "ID", + "machine_key": "Machine Key", + "name": "Name", + "node_key": "Node Key", + "os": "OS", + "tailscale_ip": "Tailscale IP", + "user": "User" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "$datasource" + }, + "description": "A table showing all devices in the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "bool" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Created" + }, + "properties": [ + { + "id": "unit", + "value": "dateTimeAsIso" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Expires" + }, + "properties": [ + { + "id": "unit", + "value": "dateTimeAsIso" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "ID" + }, + "properties": [ + { + "id": "unit", + "value": "string" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Routes Enabled" + }, + "properties": [ + { + "id": "unit", + "value": "short" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Routes Advertised" + }, + "properties": [ + { + "id": "unit", + "value": "short" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 15, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": false, + "displayName": "Name" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "min(\n tailscale_devices_created_timestamp{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } * 1000\n) by (name, id)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "min(\n tailscale_devices_expires_timestamp{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } * 1000\n) by (name, id)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscale_devices_authorized{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (name, id)\n", + 
"format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscale_devices_blocks_incoming{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (name, id)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscale_devices_routes_enabled{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (name, id)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscale_devices_routes_advertised{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (name, id)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscale_devices_key_expiry_disabled{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (name, id)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "sum(\n tailscale_devices_external{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (name, id)\n", + "format": "table", + "instant": true + } + ], + "title": "Devices Settings", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "cluster": true, + "container": true, + "endpoint": true, + "environment": true, + "instance": true, + "job": true, + "namespace": true, + "pod": true, + "prometheus": true, + "region": true, + "service": true, + "tailnet": true + }, + "indexByName": { + "Value #A": 2, + "Value #B": 3, + "Value #C": 4, + "Value #D": 5, + "Value #E": 6, 
+ "Value #F": 7, + "Value #G": 8, + "Value #H": 9, + "id": 1, + "name": 0 + }, + "renameByName": { + "Value #A": "Created", + "Value #B": "Expires", + "Value #C": "Authorized", + "Value #D": "Blocks Incoming", + "Value #E": "Routes Enabled", + "Value #F": "Routes Advertised", + "Value #G": "Key Expiry Disabled", + "Value #H": "External", + "id": "ID", + "name": "Name" + } + } + } + ], + "type": "table" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing devices that have an update available.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "bool" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "sum(\n tailscale_devices_update_available{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (name, id)\n== 1\n", + "legendFormat": "{{name}}" + } + ], + "title": "Update Available", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A timeseries panel showing the last time a device was seen.", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 10 + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true, + "sortBy": "Mean", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "exemplar": false, + "expr": "time() -\nmax(\n tailscale_devices_last_seen_timestamp{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (name, id, client_version)\n", + "legendFormat": "{{name}}" + } + ], + "title": "Time Since Last Seen", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 18, + "title": "Users", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart showing the distribution of users by role in the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 45 + }, + "id": 19, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n tailscale_users_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (role)\n", + "instant": true, + "legendFormat": "{{ role }}" + } + ], + "title": "Users by Role", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart showing the distribution of users by status in the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 45 + }, + "id": 20, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + 
"sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n tailscale_users_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (status)\n", + "instant": true, + "legendFormat": "{{ status }}" + } + ], + "title": "Users by Status", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A pie chart showing the distribution of users by type in the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 45 + }, + "id": 21, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n tailscale_users_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n }\n) by (type)\n", + "instant": true, + "legendFormat": "{{ type }}" + } + ], + "title": "Users by Type", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "The total number of users that are currently logged in to the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 45 + }, + "id": 22, + "options": { + "displayLabels": [ + "percent" + ], + "legend": { + "displayMode": "table", + "placement": "right", + "showLegend": true, + "values": [ + "percent" + ] + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"$datasource" + }, + "expr": "count(\n tailscale_users_currently_logged_in{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } == 1\n)\n", + "instant": true, + "legendFormat": "Logged In" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "count(\n tailscale_users_currently_logged_in{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } == 0\n)\n", + "instant": true, + "legendFormat": "Logged Out" + } + ], + "title": "Users Logged In", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A table showing all users in the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Created" + }, + "properties": [ + { + "id": "unit", + "value": "dateTimeAsIso" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Last Seen" + }, + "properties": [ + { + "id": "unit", + "value": "dateTimeAsIso" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 50 + }, + "id": 23, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": false, + "displayName": "Login Name" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "tailscale_users_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n}\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "min(\n tailscale_users_created_timestamp{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } * 1000\n) by (name, id)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": 
"max(\n tailscale_users_last_seen_timestamp{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } * 1000\n) by (name, id)\n", + "format": "table", + "instant": true + } + ], + "title": "Users", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value #A": true, + "__name__": true, + "cluster": true, + "container": true, + "endpoint": true, + "environment": true, + "instance": true, + "job": true, + "namespace": true, + "pod": true, + "prometheus": true, + "region": true, + "service": true, + "tailnet": true + }, + "indexByName": { + "Value #B": 3, + "Value #C": 4, + "display_name": 1, + "id": 2, + "login_name": 0, + "role": 5, + "status": 6, + "type": 7 + }, + "renameByName": { + "Value #B": "Created", + "Value #C": "Last Seen", + "display_name": "Display Name", + "id": "ID", + "login_name": "Login Name", + "role": "Role", + "status": "Status", + "type": "Type" + } + } + } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 60 + }, + "id": 24, + "title": "Keys", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "description": "A table showing all keys in the selected tailnet.", + "fieldConfig": { + "defaults": { + "unit": "string" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Created" + }, + "properties": [ + { + "id": "unit", + "value": "dateTimeAsIso" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Expires" + }, + "properties": [ + { + "id": "unit", + "value": "dateTimeAsIso" + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 24, + "x": 0, + "y": 61 + }, + "id": 25, + "options": { + "footer": { + "enablePagination": true + }, + "sortBy": [ + { + "desc": false, + "displayName": "Name" + } + ] + }, + "pluginVersion": "v11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": 
"$datasource" + }, + "expr": "tailscale_keys_info{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n}\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "min(\n tailscale_keys_created_timestamp{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } * 1000\n) by (name, id, key_type, user_id)\n", + "format": "table", + "instant": true + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "expr": "min(\n tailscale_keys_expires_timestamp{\n cluster=\"$cluster\"\n,\nnamespace=\"$namespace\",\njob=\"$job\",\ntailnet=\"$tailnet\"\n\n } * 1000\n) by (name, id, key_type, user_id)\n", + "format": "table", + "instant": true + } + ], + "title": "Keys", + "transformations": [ + { + "id": "merge" + }, + { + "id": "organize", + "options": { + "excludeByName": { + "#Value #A": true, + "Time": true, + "Value": true, + "__name__": true, + "cluster": true, + "container": true, + "endpoint": true, + "environment": true, + "instance": true, + "job": true, + "namespace": true, + "pod": true, + "prometheus": true, + "region": true, + "service": true, + "tailnet": true + }, + "indexByName": { + "Value #B": 4, + "Value #C": 5, + "id": 1, + "key_type": 3, + "name": 0, + "user_id": 2 + }, + "renameByName": { + "Value #B": "Created", + "Value #C": "Expires", + "id": "ID", + "key_type": "Key Type", + "user_id": "User ID" + } + } + } + ], + "type": "table" + } + ], + "schemaVersion": 39, + "tags": [ + "tailscale", + "tailscale-mixin" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "default", + "value": "default" + }, + "label": "Data source", + "name": "datasource", + "query": "prometheus", + "type": "datasource" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "hide": 2, + "label": "Cluster", + "name": "cluster", + "query": 
"label_values(tailscale_up{}, cluster)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Namespace", + "name": "namespace", + "query": "label_values(tailscale_up{cluster=\"$cluster\"}, namespace)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Job", + "name": "job", + "query": "label_values(tailscale_up{cluster=\"$cluster\", namespace=\"$namespace\"}, job)", + "refresh": 2, + "sort": 1, + "type": "query" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "label": "Tailnet", + "name": "tailnet", + "query": "label_values(tailscale_up{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$job\"}, tailnet)", + "refresh": 2, + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timezone": "utc", + "title": "Tailscale / Overview", + "uid": "tailscale-mixin-over-k12e" +} diff --git a/assets/tailscale/rules.yaml b/assets/tailscale/rules.yaml new file mode 100644 index 00000000..2ae22208 --- /dev/null +++ b/assets/tailscale/rules.yaml @@ -0,0 +1 @@ +groups: [] diff --git a/hack/go.mod b/hack/go.mod index 44ca5ef8..9b2f64d5 100644 --- a/hack/go.mod +++ b/hack/go.mod @@ -11,6 +11,7 @@ require ( require ( github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 // indirect + github.com/elliotchance/orderedmap/v2 v2.2.0 // indirect github.com/fatih/color v1.13.0 // indirect github.com/ghodss/yaml v1.0.0 // indirect github.com/mattn/go-colorable v0.1.12 // indirect diff --git a/hack/go.sum b/hack/go.sum index c2cb61c1..3a77a437 100644 --- a/hack/go.sum +++ b/hack/go.sum @@ -1,8 +1,10 @@ github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc h1:cAKDfWh5VpdgMhJosfJnn5/FoN2SRZ4p7fJNX58YPaU= github.com/alecthomas/template 
v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= +github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafoB+tBA3gMyHYHrpOtNuDiK/uB5uXxq5wM= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf h1:qet1QNfXsQxTZqLG4oE62mJzwPIB8+Tee4RNCL9ulrY= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137 h1:s6gZFSlWYmbqAuRjVTiNNhvNRfY2Wxp9nhfyel4rklc= github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= github.com/brancz/gojsontoyaml v0.0.0-20191212081931-bf2969bbd742 h1:PdvQdwUXiFnSmWsOJcBXLpyH3mJfP2FMPTT3J0i7+8o= github.com/brancz/gojsontoyaml v0.0.0-20191212081931-bf2969bbd742/go.mod h1:IyUJYN1gvWjtLF5ZuygmxbnsAyP3aJS6cHzIuZY50B0= @@ -10,17 +12,22 @@ github.com/campoy/embedmd v1.0.0/go.mod h1:oxyr9RCiSXg0M3VJ3ks0UGfp98BpSSGr0kpiX github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/elliotchance/orderedmap/v2 v2.2.0 h1:7/2iwO98kYT4XkOjA9mBEIwvi4KpGB4cyHeOFOnj4Vk= +github.com/elliotchance/orderedmap/v2 v2.2.0/go.mod h1:85lZyVbpGaGvHvnKa7Qhx7zncAdBIBq6u56Hb1PRU5Q= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/fatih/color v1.10.0 h1:s36xzo75JdqLaaWoiEHk767eHiwo0598uUxyfiPkDsg= github.com/fatih/color v1.10.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= +github.com/fatih/color v1.13.0 h1:8LOYc1KYPPmyKMuN8QV2DNRWNbLo6LZ0iLs8+mlH53w= github.com/fatih/color v1.13.0/go.mod h1:kLAiJbzzSOZDVNGyDpeOxJ47H46qBXwg5ILebYFFOfk= 
github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/google/go-jsonnet v0.18.0 h1:/6pTy6g+Jh1a1I2UMoAODkqELFiVIdOxbNwv0DDzoOg= github.com/google/go-jsonnet v0.18.0/go.mod h1:C3fTzyVJDslXdiTqw/bTFk7vSGyCtH3MGRbDfvEwGd0= +github.com/google/go-jsonnet v0.20.0 h1:WG4TTSARuV7bSm4PMB4ohjxe33IHT5WVTrJSU33uT4g= github.com/google/go-jsonnet v0.20.0/go.mod h1:VbgWF9JX7ztlv770x/TolZNGGFfiHEVx9G6ca2eUmeA= github.com/jsonnet-bundler/jsonnet-bundler v0.4.0 h1:4BKZ6LDqPc2wJDmaKnmYD/vDjUptJtnUpai802MibFc= github.com/jsonnet-bundler/jsonnet-bundler v0.4.0/go.mod h1:/by7P/OoohkI3q4CgSFqcoFsVY+IaNbzOVDknEsKDeU= +github.com/jsonnet-bundler/jsonnet-bundler v0.6.0 h1:DBnynmjyWBVQ9gUBmTh49x3Dw5/u4CvGO3k2k1CsYNo= github.com/jsonnet-bundler/jsonnet-bundler v0.6.0/go.mod h1:5esRxD59TyScj6qxT3o7GH0sryBKvVmx2zaEYDXtQkg= github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= @@ -31,13 +38,16 @@ github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaO github.com/mattn/go-colorable v0.1.8 h1:c1ghPdyEDarC70ftn0y+A/Ee++9zz8ljHG1b13eJ0s8= github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-colorable v0.1.9/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= +github.com/mattn/go-colorable v0.1.12 h1:jF+Du6AlPIjs2BiUiQlKOX0rt3SujHxPnksPKZbaA40= github.com/mattn/go-colorable v0.1.12/go.mod h1:u5H1YNBxpqRaxsYJYSkiCWKzEfiAb1Gb520KVy5xxl4= github.com/mattn/go-isatty v0.0.6/go.mod h1:Iq45c/XA43vh69/j3iqttzPXn0bhXyGjM0Hdxcsrc5s= github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY= github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= +github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= 
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= github.com/pkg/errors v0.8.0 h1:WdK/asTD0HN+q6hsWO3/vpuAkAr+tw6aNJNDFFf0+qw= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= diff --git a/mixins.json b/mixins.json index 3e313bd4..14e584cc 100644 --- a/mixins.json +++ b/mixins.json @@ -266,6 +266,11 @@ "source": "https://github.com/grafana/jsonnet-libs", "subdir": "envoy-mixin" }, + { + "name": "envoy-2", + "source": "https://github.com/adinhodovic/envoy-mixin", + "subdir": "" + }, { "name": "f5-bigip", "source": "https://github.com/grafana/jsonnet-libs", @@ -451,6 +456,11 @@ "source": "https://github.com/grafana/jsonnet-libs", "subdir": "squid-mixin" }, + { + "name": "syncthing", + "source": "https://github.com/adinhodovic/syncthing-mixin", + "subdir": "" + }, { "name": "supabase", "source": "https://github.com/grafana/jsonnet-libs", @@ -541,6 +551,11 @@ "name": "kubernetes-autoscaling", "source": "https://github.com/adinhodovic/kubernetes-autoscaling-mixin", "subdir": "" + }, + { + "name": "tailscale", + "source": "https://github.com/adinhodovic/tailscale-exporter", + "subdir": "tailscale-mixin" } ] } diff --git a/site/content/ceph/_index.md b/site/content/ceph/_index.md index 683e6b6e..257d9f27 100644 --- a/site/content/ceph/_index.md +++ b/site/content/ceph/_index.md @@ -1632,9 +1632,9 @@ Following dashboards are generated from mixins and hosted on github: - [ceph-cluster-advanced](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/ceph-cluster-advanced.json) -- 
[ceph-nvmeof-performance](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/ceph-nvmeof-performance.json) -- [ceph-nvmeof](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/ceph-nvmeof.json) - [cephfsdashboard](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/cephfsdashboard.json) +- [ceph-nvmeof](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/ceph-nvmeof.json) +- [ceph-nvmeof-performance](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/ceph-nvmeof-performance.json) - [host-details](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/host-details.json) - [hosts-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/hosts-overview.json) - [multi-cluster-overview](https://github.com/monitoring-mixins/website/blob/master/assets/ceph/dashboards/multi-cluster-overview.json) diff --git a/site/content/cilium-enterprise/_index.md b/site/content/cilium-enterprise/_index.md index 452f24cc..5fb0be44 100644 --- a/site/content/cilium-enterprise/_index.md +++ b/site/content/cilium-enterprise/_index.md @@ -68,10 +68,8 @@ Complete list of pregenerated alerts is available [here](https://github.com/moni Following dashboards are generated from mixins and hosted on github: -- [cilium-L3-policy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L3-policy.json) -- [cilium-L7-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L7-proxy.json) -- [cilium-agent-overview](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent-overview.json) - [cilium-agent](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent.json) +- 
[cilium-agent-overview](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-agent-overview.json) - [cilium-api](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-api.json) - [cilium-bpf](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-bpf.json) - [cilium-conntrack](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-conntrack.json) @@ -80,6 +78,8 @@ Following dashboards are generated from mixins and hosted on github: - [cilium-fqdn-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-fqdn-proxy.json) - [cilium-identities](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-identities.json) - [cilium-kubernetes](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-kubernetes.json) +- [cilium-L3-policy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L3-policy.json) +- [cilium-L7-proxy](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-L7-proxy.json) - [cilium-network](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-network.json) - [cilium-nodes](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-nodes.json) - [cilium-operator](https://github.com/monitoring-mixins/website/blob/master/assets/cilium-enterprise/dashboards/cilium-operator.json) diff --git a/site/content/cortex/_index.md b/site/content/cortex/_index.md index bf69a7dc..cd90f8a8 100644 --- a/site/content/cortex/_index.md +++ b/site/content/cortex/_index.md @@ -2491,8 +2491,8 @@ Following dashboards are generated from mixins and hosted on github: - 
[alertmanager](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/alertmanager.json) -- [cortex-compactor-resources](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor-resources.json) - [cortex-compactor](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor.json) +- [cortex-compactor-resources](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-compactor-resources.json) - [cortex-config](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-config.json) - [cortex-object-store](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-object-store.json) - [cortex-queries](https://github.com/monitoring-mixins/website/blob/master/assets/cortex/dashboards/cortex-queries.json) diff --git a/site/content/envoy-2/_index.md b/site/content/envoy-2/_index.md new file mode 100644 index 00000000..6d400562 --- /dev/null +++ b/site/content/envoy-2/_index.md @@ -0,0 +1,248 @@ +--- +title: envoy-2 +--- + +## Overview + + + +{{< panel style="danger" >}} +Jsonnet source code is available at [github.com/adinhodovic/envoy-mixin](https://github.com/adinhodovic/envoy-mixin) +{{< /panel >}} + +## Alerts + +{{< panel style="warning" >}} +Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/envoy-2/alerts.yaml). +{{< /panel >}} + +### envoy + +##### EnvoyUpstreamHighHttp4xxErrorRate + +{{< code lang="yaml" >}} +alert: EnvoyUpstreamHighHttp4xxErrorRate +annotations: + dashboard_url: https://grafana.com/d/envoy-upstream-skj2/envoy-upstream?var-namespace={{ + $labels.namespace }}&var-envoy_cluster_name={{ $labels.envoy_cluster_name }} + description: More than 5% HTTP requests with status 4xx for cluster {{ $labels.envoy_cluster_name + }} in {{ $labels.namespace }} the past 5m. 
+ summary: Envoy upstream high HTTP 4xx error rate. +expr: | + ( + sum( + rate( + envoy_cluster_upstream_rq_xx{ + job=~".*", + envoy_response_code_class="4", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + / + sum( + rate( + envoy_cluster_upstream_rq_total{ + job=~".*", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + * 100 + ) > 5 + and + sum( + rate( + envoy_cluster_upstream_rq_xx{ + job=~".*", + envoy_response_code_class="4", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + > 5 +for: 1m +labels: + severity: info +{{< /code >}} + +##### EnvoyUpstreamHighHttp5xxErrorRate + +{{< code lang="yaml" >}} +alert: EnvoyUpstreamHighHttp5xxErrorRate +annotations: + dashboard_url: https://grafana.com/d/envoy-upstream-skj2/envoy-upstream?var-namespace={{ + $labels.namespace }}&var-envoy_cluster_name={{ $labels.envoy_cluster_name }} + description: More than 5% HTTP requests with status 5xx for cluster {{ $labels.envoy_cluster_name + }} in {{ $labels.namespace }} the past 5m. + summary: Envoy upstream high HTTP 5xx error rate. 
+expr: | + ( + sum( + rate( + envoy_cluster_upstream_rq_xx{ + job=~".*", + envoy_response_code_class="5", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + / + sum( + rate( + envoy_cluster_upstream_rq_total{ + job=~".*", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + * 100 + ) > 5 + and + sum( + rate( + envoy_cluster_upstream_rq_xx{ + job=~".*", + envoy_response_code_class="5", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + > 5 +for: 1m +labels: + severity: critical +{{< /code >}} + +##### EnvoyCircuitBreakerOpen + +{{< code lang="yaml" >}} +alert: EnvoyCircuitBreakerOpen +annotations: + dashboard_url: https://grafana.com/d/envoy-upstream-skj2/envoy-upstream?var-namespace={{ + $labels.namespace }}&var-envoy_cluster_name={{ $labels.envoy_cluster_name }} + description: Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} + in {{ $labels.namespace }} for the past 5m. + summary: Envoy circuit breaker is open. +expr: | + sum( + ( + envoy_cluster_circuit_breakers_default_rq_open{ + job=~".*", + envoy_cluster_name!~"" + } + or + envoy_cluster_circuit_breakers_default_cx_open{ + job=~".*", + envoy_cluster_name!~"" + } + or + envoy_cluster_circuit_breakers_default_cx_pool_open{ + job=~".*", + envoy_cluster_name!~"" + } + ) + ) by (cluster, namespace, envoy_cluster_name) > 0 +for: 5m +labels: + severity: warning +{{< /code >}} + +##### EnvoyUpstreamConnectionFailures + +{{< code lang="yaml" >}} +alert: EnvoyUpstreamConnectionFailures +annotations: + dashboard_url: https://grafana.com/d/envoy-upstream-skj2/envoy-upstream?var-namespace={{ + $labels.namespace }}&var-envoy_cluster_name={{ $labels.envoy_cluster_name }} + description: More than 100 connection failures for cluster {{ $labels.envoy_cluster_name + }} in {{ $labels.namespace }} the past 5m. + summary: Envoy upstream connection failures detected. 
+expr: | + sum( + increase( + envoy_cluster_upstream_cx_connect_fail{ + job=~".*", + envoy_cluster_name!~"" + }[5m] + ) + ) by (cluster, namespace, envoy_cluster_name) + > 100 +for: 10m +labels: + severity: warning +{{< /code >}} + +##### EnvoyUpstreamUnhealthyHosts + +{{< code lang="yaml" >}} +alert: EnvoyUpstreamUnhealthyHosts +annotations: + dashboard_url: https://grafana.com/d/envoy-upstream-skj2/envoy-upstream?var-namespace={{ + $labels.namespace }}&var-envoy_cluster_name={{ $labels.envoy_cluster_name }} + description: More than 33% of hosts are unhealthy for cluster {{ $labels.envoy_cluster_name + }} in {{ $labels.namespace }} for the past 5m. + summary: Envoy upstream has unhealthy hosts. +expr: | + ( + sum( + envoy_cluster_membership_total{ + job=~".*", + envoy_cluster_name!~"" + } + ) by (cluster, namespace, envoy_cluster_name) + - + sum( + envoy_cluster_membership_healthy{ + job=~".*", + envoy_cluster_name!~"" + } + ) by (cluster, namespace, envoy_cluster_name) + ) + / + sum( + envoy_cluster_membership_total{ + job=~".*", + envoy_cluster_name!~"" + } + ) by (cluster, namespace, envoy_cluster_name) + * 100 + > 33 +for: 5m +labels: + severity: warning +{{< /code >}} + +##### EnvoyXDSUpdateFailed + +{{< code lang="yaml" >}} +alert: EnvoyXDSUpdateFailed +annotations: + dashboard_url: https://grafana.com/d/envoy-gateway-overview-skj2/envoy-gateway-overview?var-namespace={{ + $labels.namespace }} + description: XDS snapshot update failed for node {{ $labels.nodeID }} in {{ $labels.namespace + }} with status {{ $labels.status }} the past 5m. + summary: Envoy Gateway XDS snapshot update failed. 
+expr: | + sum( + increase( + xds_snapshot_update_total{ + job=~".*", + status!="success" + }[5m] + ) + ) by (cluster, namespace, status, nodeID) + > 0 +for: 1m +labels: + severity: warning +{{< /code >}} + +## Dashboards +Following dashboards are generated from mixins and hosted on github: + + +- [envoy-downstream](https://github.com/monitoring-mixins/website/blob/master/assets/envoy-2/dashboards/envoy-downstream.json) +- [envoy-gateway-overview](https://github.com/monitoring-mixins/website/blob/master/assets/envoy-2/dashboards/envoy-gateway-overview.json) +- [envoy-overview](https://github.com/monitoring-mixins/website/blob/master/assets/envoy-2/dashboards/envoy-overview.json) +- [envoy-upstream](https://github.com/monitoring-mixins/website/blob/master/assets/envoy-2/dashboards/envoy-upstream.json) diff --git a/site/content/loki/_index.md b/site/content/loki/_index.md index 5d3b24f0..34595dd7 100644 --- a/site/content/loki/_index.md +++ b/site/content/loki/_index.md @@ -281,9 +281,9 @@ Following dashboards are generated from mixins and hosted on github: - [loki-logs](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-logs.json) - [loki-mixin-recording-rules](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-mixin-recording-rules.json) - [loki-operational](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-operational.json) -- [loki-reads-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads-resources.json) - [loki-reads](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads.json) +- [loki-reads-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-reads-resources.json) - [loki-retention](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-retention.json) - 
[loki-thanos-object-storage](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-thanos-object-storage.json) -- [loki-writes-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes-resources.json) - [loki-writes](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes.json) +- [loki-writes-resources](https://github.com/monitoring-mixins/website/blob/master/assets/loki/dashboards/loki-writes-resources.json) diff --git a/site/content/prometheus/_index.md b/site/content/prometheus/_index.md index 414f277c..0faea10a 100644 --- a/site/content/prometheus/_index.md +++ b/site/content/prometheus/_index.md @@ -433,5 +433,5 @@ labels: Following dashboards are generated from mixins and hosted on github: -- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json) - [prometheus](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus.json) +- [prometheus-remote-write](https://github.com/monitoring-mixins/website/blob/master/assets/prometheus/dashboards/prometheus-remote-write.json) diff --git a/site/content/syncthing/_index.md b/site/content/syncthing/_index.md new file mode 100644 index 00000000..36f1a93d --- /dev/null +++ b/site/content/syncthing/_index.md @@ -0,0 +1,72 @@ +--- +title: syncthing +--- + +## Overview + + + +{{< panel style="danger" >}} +Jsonnet source code is available at [github.com/adinhodovic/syncthing-mixin](https://github.com/adinhodovic/syncthing-mixin) +{{< /panel >}} + +## Alerts + +{{< panel style="warning" >}} +Complete list of pregenerated alerts is available [here](https://github.com/monitoring-mixins/website/blob/master/assets/syncthing/alerts.yaml). 
+{{< /panel >}} + +### syncthing + +##### SyncthingEventsDropped + +{{< code lang="yaml" >}} +alert: SyncthingEventsDropped +annotations: + dashboard_url: https://grafana.com/d/syncthing-overview-jkwq/syncthing-overview?var-job={{ + $labels.job }} + description: The job {{ $labels.job }} has dropped events of type {{ $labels.event + }} in the last minute. + summary: Syncthing events dropped. +expr: | + sum( + increase( + syncthing_events_total{ + state="dropped" + }[5m] + ) + ) by (cluster, job, event) + > 0 +for: 1m +labels: + severity: warning +{{< /code >}} + +##### SyncthingFolderOutOfSync + +{{< code lang="yaml" >}} +alert: SyncthingFolderOutOfSync +annotations: + dashboard_url: https://grafana.com/d/syncthing-overview-jkwq/syncthing-overview?var-job={{ + $labels.job }}&var-folder={{ $labels.folder }} + description: The folder {{ $labels.folder }} in job {{ $labels.job }} is out of + sync for more than 1h. + summary: Syncthing folder out of sync. +expr: | + sum( + syncthing_model_folder_summary{ + scope="need", + type="bytes" + } + ) by (cluster, job, folder) + > 0 +for: 1h +labels: + severity: info +{{< /code >}} + +## Dashboards +Following dashboards are generated from mixins and hosted on github: + + +- [syncthing-overview](https://github.com/monitoring-mixins/website/blob/master/assets/syncthing/dashboards/syncthing-overview.json) diff --git a/site/content/tailscale/_index.md b/site/content/tailscale/_index.md new file mode 100644 index 00000000..9b913823 --- /dev/null +++ b/site/content/tailscale/_index.md @@ -0,0 +1,156 @@ +--- +title: tailscale +--- + +## Overview + + + +{{< panel style="danger" >}} +Jsonnet source code is available at [github.com/adinhodovic/tailscale-exporter](https://github.com/adinhodovic/tailscale-exporter/tree/master/tailscale-mixin) +{{< /panel >}} + +## Alerts + +{{< panel style="warning" >}} +Complete list of pregenerated alerts is available 
[here](https://github.com/monitoring-mixins/website/blob/master/assets/tailscale/alerts.yaml). +{{< /panel >}} + +### tailscale-tailnet-alerts + +##### TailscaleDeviceUnauthorized + +{{< code lang="yaml" >}} +alert: TailscaleDeviceUnauthorized +annotations: + dashboard_url: https://grafana.com/d/tailscale-mixin-over-k12e/tailscale-overview + description: 'Tailscale Device {{ $labels.name }} (ID: {{ $labels.id }}) in Tailnet + {{ $labels.tailnet }} is unauthorized. Please authorize it in the Tailscale admin + console.' + summary: Tailscale Device is Unauthorized +expr: | + sum( + tailscale_devices_authorized + ) by (tailnet, name, id) + == 0 +for: 15m +labels: + mixin: tailscale + severity: warning +{{< /code >}} + +##### TailscaleUserUnapproved + +{{< code lang="yaml" >}} +alert: TailscaleUserUnapproved +annotations: + dashboard_url: https://grafana.com/d/tailscale-mixin-over-k12e/tailscale-overview + description: 'Tailscale User {{ $labels.login_name }} (ID: {{ $labels.id }}) in + Tailnet {{ $labels.tailnet }} is unapproved. Please approve it in the Tailscale + admin console.' + summary: Tailscale User is Unapproved +expr: | + sum( + tailscale_users_info{ + status="needs-approval" + } + ) by (tailnet, login_name, id) + == 1 +for: 15m +labels: + mixin: tailscale + severity: warning +{{< /code >}} + +##### TailscaleUserRecentlyCreated + +{{< code lang="yaml" >}} +alert: TailscaleUserRecentlyCreated +annotations: + dashboard_url: https://grafana.com/d/tailscale-mixin-over-k12e/tailscale-overview + description: 'Tailscale User {{ $labels.login_name }} (ID: {{ $labels.id }}) in + Tailnet {{ $labels.tailnet }} was created within the last 300 seconds.' 
+ summary: Tailscale User Recently Created +expr: | + time() - + ( + max( + tailscale_users_created_timestamp{} + ) by (tailnet, id, login_name) + ) + < 300 +labels: + mixin: tailscale + severity: info +{{< /code >}} + +##### TailscaleDeviceUnapprovedRoutes + +{{< code lang="yaml" >}} +alert: TailscaleDeviceUnapprovedRoutes +annotations: + dashboard_url: https://grafana.com/d/tailscale-mixin-over-k12e/tailscale-overview + description: 'Tailscale Device {{ $labels.name }} (ID: {{ $labels.id }}) in Tailnet + {{ $labels.tailnet }} has more than 10% unapproved routes for longer than 15m.' + summary: Tailscale Device has Unapproved Routes +expr: | + 100 - + ( + ( + sum( + tailscale_devices_routes_enabled + ) by (tailnet, name, id) + / + sum( + tailscale_devices_routes_advertised + ) by (tailnet, name, id) + ) + * 100 + ) + > 10 +for: 15m +labels: + mixin: tailscale + severity: warning +{{< /code >}} + +### tailscaled-machine-alerts + +##### TailscaledMachineHighOutboundDroppedPackets + +{{< code lang="yaml" >}} +alert: TailscaledMachineHighOutboundDroppedPackets +annotations: + dashboard_url: https://grafana.com/d/tailscaled-mixin-over-k12e/tailscale-machine?var-tailscale_machine={{ + $labels.tailscale_machine }} + description: Tailscaled Machine {{ $labels.tailscale_machine }} has a high rate + of outbound dropped packets (>{{ 50 }}%) for longer than 15m. 
+ summary: Tailscaled Machine has High Outbound Dropped Packets +expr: | + sum( + increase( + tailscaled_outbound_dropped_packets_total{} + [5m] + ) + ) by (tailscale_machine) + / + sum ( + increase( + tailscaled_outbound_packets_total{} + [5m] + ) + ) by (tailscale_machine) + * 100 + > 50 +for: 15m +labels: + mixin: tailscale + severity: warning +{{< /code >}} + +## Dashboards +Following dashboards are generated from mixins and hosted on github: + + +- [tailscale-machine](https://github.com/monitoring-mixins/website/blob/master/assets/tailscale/dashboards/tailscale-machine.json) +- [tailscale-overview](https://github.com/monitoring-mixins/website/blob/master/assets/tailscale/dashboards/tailscale-overview.json) diff --git a/site/static/mixins.json b/site/static/mixins.json index fcded1e2..14e584cc 100644 --- a/site/static/mixins.json +++ b/site/static/mixins.json @@ -37,8 +37,8 @@ }, { "name": "ceph", - "source": "https://github.com/ceph/ceph-mixins", - "subdir": "", + "source": "https://github.com/ceph/ceph", + "subdir": "monitoring/ceph-mixin", "description": "A set of Prometheus alerts for Ceph.\n\nThe scope of this project is to provide Ceph specific Prometheus rule files using Prometheus Mixins.\n" }, { @@ -266,6 +266,11 @@ "source": "https://github.com/grafana/jsonnet-libs", "subdir": "envoy-mixin" }, + { + "name": "envoy-2", + "source": "https://github.com/adinhodovic/envoy-mixin", + "subdir": "" + }, { "name": "f5-bigip", "source": "https://github.com/grafana/jsonnet-libs", @@ -434,7 +439,7 @@ { "name": "snmp", "source": "https://github.com/grafana/jsonnet-libs", - "subdir": "snmp-mixin" + "subdir": "snmp-observ-lib" }, { "name": "spark", @@ -451,6 +456,11 @@ "source": "https://github.com/grafana/jsonnet-libs", "subdir": "squid-mixin" }, + { + "name": "syncthing", + "source": "https://github.com/adinhodovic/syncthing-mixin", + "subdir": "" + }, { "name": "supabase", "source": "https://github.com/grafana/jsonnet-libs", @@ -541,6 +551,11 @@ "name": 
"kubernetes-autoscaling", "source": "https://github.com/adinhodovic/kubernetes-autoscaling-mixin", "subdir": "" + }, + { + "name": "tailscale", + "source": "https://github.com/adinhodovic/tailscale-exporter", + "subdir": "tailscale-mixin" } ] }