diff --git a/.direnv/bin/nix-direnv-reload b/.direnv/bin/nix-direnv-reload deleted file mode 100755 index f8d9cfc..0000000 --- a/.direnv/bin/nix-direnv-reload +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env bash -set -e -if [[ ! -d "/home/rupansh/overclock/overcast" ]]; then - echo "Cannot find source directory; Did you move it?" - echo "(Looking for "/home/rupansh/overclock/overcast")" - echo 'Cannot force reload with this script - use "direnv reload" manually and then try again' - exit 1 -fi - -# rebuild the cache forcefully -_nix_direnv_force_reload=1 direnv exec "/home/rupansh/overclock/overcast" true - -# Update the mtime for .envrc. -# This will cause direnv to reload again - but without re-building. -touch "/home/rupansh/overclock/overcast/.envrc" - -# Also update the timestamp of whatever profile_rc we have. -# This makes sure that we know we are up to date. -touch -r "/home/rupansh/overclock/overcast/.envrc" "/home/rupansh/overclock/overcast/.direnv"/*.rc diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..4ade41b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,13 @@ +.git +.claude +.direnv +.vscode +.DS_Store +.env +.env.local +.env.*.local +.envrc +Lightbringer.toml +target +secrets +shred-store diff --git a/.gitignore b/.gitignore index e1b1166..5966eac 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,12 @@ /target .direnv -Lightbringer.toml \ No newline at end of file +Lightbringer.toml +.env +.env.local +.env.*.local + +/secrets/* +!/secrets/.gitignore +!/secrets/*.example +!/secrets/*.example.* diff --git a/Cargo.lock b/Cargo.lock index 2ca69c3..f5e67fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4413,6 +4413,7 @@ dependencies = [ "solana-ledger", "solana-net-utils", "solana-packet", + "solana-quic-definitions", "solana-rpc-client-types 3.0.14 (registry+https://github.com/rust-lang/crates.io-index)", "solana-sdk", "solana-streamer", diff --git a/Cargo.toml b/Cargo.toml index 661755e..c8b524d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,12 +15,14 @@ solana-streamer = { git = "https://github.com/Overclock-Validator/agave.git", re solana-ledger = { git = "https://github.com/Overclock-Validator/agave.git", rev = "bb110d1fad7f1523c13709dcd1a0336f9f9975b4" } solana-entry = "3.0.14" solana-net-utils = { git = "https://github.com/Overclock-Validator/agave.git", rev = "bb110d1fad7f1523c13709dcd1a0336f9f9975b4", features = ["agave-unstable-api"] } +solana-quic-definitions = "=3.0.0" solana-rpc-client-types = "3.0.14" solana-epoch-info = "3.0.0" solana-core = { git = "https://github.com/Overclock-Validator/agave.git", rev = "bb110d1fad7f1523c13709dcd1a0336f9f9975b4" } solana-packet = "3.0.0" solana-commitment-config = "3.1.0" -# pins because obviously someone doesn't want to follow semver +# Keep these transitive Solana crates pinned to the exact versions required by +# the Agave revision above. solana-sysvar = { version = "=3.0.0", features = ["serde"] } solana-hash = { version = "=3.0.0", features = ["serde"] } solana-blake3-hasher = "=3.0.0" @@ -65,7 +67,7 @@ uluru = "3.1.0" circular-buffer = "0.1.9" lrumap = "0.1.0" -# grpc streaming related +# gRPC streaming dependencies. tokio = { version = "1.0", features = ["rt", "sync", "time", "net", "fs"] } tokio-stream = { version = "0.1", features = ["sync"] } tonic = "0.14" @@ -78,4 +80,4 @@ tokio-util = { version = "0.7.16", features = ["time"] } tonic-prost-build = "0.14" [features] -debug = [] \ No newline at end of file +debug = [] diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..9955550 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,58 @@ +# syntax=docker/dockerfile:1.7 + +ARG RUST_VERSION=1.92.0 + +FROM rust:${RUST_VERSION}-bookworm AS builder + +WORKDIR /usr/src/lightbringer + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + clang \ + cmake \ + git \ + libclang-dev \ + libssl-dev \ + libudev-dev \ + pkg-config \ + protobuf-compiler \ + zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +COPY Cargo.toml Cargo.lock rust-toolchain.toml build.rs ./ +COPY pb ./pb +COPY src ./src + +RUN cargo build --release --bin lightbringer + +FROM builder AS tester + +COPY decoded_shreds.json stored_shreds.json ./ + +RUN cargo test --all-targets + +FROM debian:bookworm-slim AS runtime + +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + ca-certificates \ + libgcc-s1 \ + libnghttp2-14 \ + libssl3 \ + libstdc++6 \ + libudev1 \ + zlib1g \ + && rm -rf /var/lib/apt/lists/* + +RUN useradd --system --create-home --home-dir /var/lib/lightbringer --shell /usr/sbin/nologin lightbringer \ + && mkdir -p /var/lib/lightbringer/shred-store \ + && chown -R lightbringer:lightbringer /var/lib/lightbringer + +COPY --from=builder /usr/src/lightbringer/target/release/lightbringer /usr/local/bin/lightbringer + +USER lightbringer +WORKDIR /var/lib/lightbringer + +ENTRYPOINT ["lightbringer"] diff --git a/Lightbringer.example.toml b/Lightbringer.example.toml index 322bf22..45d184e 100644 --- a/Lightbringer.example.toml +++ b/Lightbringer.example.toml @@ -1,26 +1,32 @@ -# entrypoint to solana gossip (must be a remote addr) -gossip_entrypoint = "177.177.177.177:8000" +# Replace this TEST-NET address with a real Solana gossip entrypoint. +gossip_entrypoint = "203.0.113.10:8000" -# folder for storing shreds (optional, default = "./shred-store") +# Shred storage path. storage = "./shred-store" -# Debugging RPC address (optional, default = "127.0.0.1:3000") +# Local debug RPC address. rpc_addr = "127.0.0.1:3000" -# gRPC slot stream (optional, default = "127.0.0.1:3001") +# gRPC slot stream address. grpc_addr = "127.0.0.1:3001" -# optional, fallback to mock metrics if not set +[gossip] +gossip_port = 65400 +port_range_start = 65401 +port_range_end = 65500 + +# Compose mounts the InfluxDB token here. +# Remove this section when running without InfluxDB. [influxdb] -host = "http://localhost:8181" -database = "test" -token = "xyz" +host = "http://127.0.0.1:18181" +database = "lightbringer" +token_file = "/run/secrets/influxdb-admin-token" -# optional, grpc streams confirmed blocks if set -[block_confirmation] -rpc_http = "http://localhost:8900" -rpc_websocket = "ws://localhost:8900" +# Enable only when a local RPC node is available. +# [block_confirmation] +# rpc_http = "http://localhost:8900" +# rpc_websocket = "ws://localhost:8900" -# optional, suppress info/debug log lines (only warnings and errors are emitted) +# Suppress info/debug logs. [log] -quiet = false \ No newline at end of file +quiet = false diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..c947176 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,193 @@ +name: lightbringer + +services: + secrets-preflight: + image: influxdb:3.9.2-core + restart: "no" + user: "0:0" + network_mode: none + cap_drop: + - ALL + read_only: true + security_opt: + - no-new-privileges:true + entrypoint: + - /bin/sh + - /docker-secrets/check-readable.sh + secrets: + - source: influxdb_admin_token + target: influxdb-admin-token + - source: grafana_admin_password + target: grafana-admin-password + volumes: + - ./docker/secrets/check-readable.sh:/docker-secrets/check-readable.sh:ro + + lightbringer-config-preflight: + profiles: + - lightbringer + image: influxdb:3.9.2-core + restart: "no" + user: "0:0" + network_mode: none + cap_drop: + - ALL + read_only: true + security_opt: + - no-new-privileges:true + entrypoint: + - /bin/sh + - /docker-secrets/check-readable.sh + environment: + LIGHTBRINGER_CONFIG_FILE: /docker-config/Lightbringer.toml + secrets: + - source: influxdb_admin_token + target: influxdb-admin-token + - source: grafana_admin_password + target: grafana-admin-password + volumes: + - ./docker/secrets/check-readable.sh:/docker-secrets/check-readable.sh:ro + - ./Lightbringer.toml:/docker-config/Lightbringer.toml:ro + + influxdb3: + image: influxdb:3.9.2-core + restart: unless-stopped + cap_drop: + - ALL + security_opt: + - no-new-privileges:true + depends_on: + secrets-preflight: + condition: service_completed_successfully + command: + - influxdb3 + - serve + - --node-id=lightbringer-node + - --object-store=file + - --data-dir=/var/lib/influxdb3/data + - --admin-token-file=/run/secrets/influxdb-admin-token + - --disable-authz=health,ping + ports: + - "127.0.0.1:18181:8181" + healthcheck: + test: + [ + "CMD-SHELL", + "token=$$(sed -n 's/.*\"token\"[[:space:]]*:[[:space:]]*\"\\([^\"]*\\)\".*/\\1/p' /run/secrets/influxdb-admin-token | head -n 1); [ -n \"$$token\" ] && INFLUXDB3_AUTH_TOKEN=\"$$token\" influxdb3 show databases --host http://127.0.0.1:8181 >/dev/null", + ] + interval: 10s + timeout: 5s + retries: 12 + start_period: 20s + secrets: + - source: influxdb_admin_token + target: influxdb-admin-token + volumes: + - influxdb3-data:/var/lib/influxdb3 + + influxdb3-init: + image: influxdb:3.9.2-core + restart: "no" + cap_drop: + - ALL + security_opt: + - no-new-privileges:true + depends_on: + influxdb3: + condition: service_healthy + entrypoint: + - /bin/sh + - /docker-entrypoint-initdb.d/init-database.sh + environment: + INFLUXDB_ADMIN_TOKEN_FILE: /run/secrets/influxdb-admin-token + INFLUXDB_DATABASE: lightbringer + INFLUXDB_URL: http://influxdb3:8181 + secrets: + - source: influxdb_admin_token + target: influxdb-admin-token + volumes: + - ./docker/influxdb/init-database.sh:/docker-entrypoint-initdb.d/init-database.sh:ro + + grafana: + build: + context: . + dockerfile: docker/grafana/Dockerfile + args: + GRAFANA_VERSION: 13.0.1-security-01 + restart: unless-stopped + cap_drop: + - ALL + security_opt: + - no-new-privileges:true + depends_on: + influxdb3-init: + condition: service_completed_successfully + environment: + GF_AUTH_ANONYMOUS_ENABLED: "false" + GF_SECURITY_ADMIN_USER: admin + GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/grafana-admin-password + INFLUXDB_ADMIN_TOKEN_FILE: /run/secrets/influxdb-admin-token + INFLUXDB_DATABASE: lightbringer + INFLUXDB_URL: http://influxdb3:8181 + ports: + - "127.0.0.1:3300:3000" + healthcheck: + test: ["CMD", "curl", "-fsS", "http://127.0.0.1:3000/api/health"] + interval: 10s + timeout: 5s + retries: 12 + start_period: 20s + secrets: + - source: influxdb_admin_token + target: influxdb-admin-token + - source: grafana_admin_password + target: grafana-admin-password + volumes: + - grafana-data:/var/lib/grafana + - ./docker/grafana/provisioning:/etc/grafana/provisioning:ro + - ./docker/grafana/dashboards:/var/lib/grafana/dashboards:ro + + lightbringer: + profiles: + - lightbringer + build: + context: . + dockerfile: Dockerfile + restart: unless-stopped + depends_on: + influxdb3-init: + condition: service_completed_successfully + lightbringer-config-preflight: + condition: service_completed_successfully + network_mode: host + cap_drop: + - ALL + security_opt: + - no-new-privileges:true + - seccomp=unconfined + ulimits: + memlock: + soft: -1 + hard: -1 + secrets: + - source: influxdb_admin_token + target: influxdb-admin-token + configs: + - source: lightbringer_config + target: /var/lib/lightbringer/Lightbringer.toml + volumes: + - lightbringer-shred-store:/var/lib/lightbringer/shred-store + +configs: + lightbringer_config: + file: ./Lightbringer.toml + +secrets: + influxdb_admin_token: + file: ./secrets/influxdb-admin-token.json + grafana_admin_password: + file: ./secrets/grafana-admin-password + +volumes: + grafana-data: + influxdb3-data: + lightbringer-shred-store: diff --git a/docker/grafana/Dockerfile b/docker/grafana/Dockerfile new file mode 100644 index 0000000..2e64039 --- /dev/null +++ b/docker/grafana/Dockerfile @@ -0,0 +1,20 @@ +ARG GRAFANA_VERSION=13.0.1-security-01 + +FROM grafana/grafana:${GRAFANA_VERSION} + +USER root + +RUN if command -v apk >/dev/null 2>&1; then \ + apk add --no-cache curl jq; \ + else \ + apt-get update \ + && apt-get install -y --no-install-recommends ca-certificates curl jq \ + && rm -rf /var/lib/apt/lists/*; \ + fi + +COPY docker/grafana/entrypoint.sh /usr/local/bin/lightbringer-grafana-entrypoint +RUN chmod +x /usr/local/bin/lightbringer-grafana-entrypoint + +USER grafana + +ENTRYPOINT ["lightbringer-grafana-entrypoint"] diff --git a/docker/grafana/dashboards/absolute-repair-time.json b/docker/grafana/dashboards/absolute-repair-time.json new file mode 100644 index 0000000..f487fc9 --- /dev/null +++ b/docker/grafana/dashboards/absolute-repair-time.json @@ -0,0 +1,580 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [ + + ], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 500 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH repair_initiates AS (\n SELECT\n slot,\n MIN(time) AS repair_start\n FROM slot\n WHERE kind = 'repair_initiate'\n GROUP BY slot\n),\ncompletions AS (\n SELECT\n slot,\n MIN(time) AS repair_end\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n GROUP BY slot\n),\nrepair_durations AS (\n SELECT\n completions.slot,\n CAST((completions.repair_end - repair_initiates.repair_start) AS BIGINT) / 1000000.0 AS repair_duration_ms,\n repair_initiates.repair_start,\n completions.repair_end\n FROM repair_initiates\n INNER JOIN completions ON repair_initiates.slot = completions.slot\n WHERE completions.repair_end >= repair_initiates.repair_start\n)\nSELECT MEDIAN(repair_duration_ms) AS median_repair_ms FROM repair_durations", + "sql": { + "columns": [ + { + "parameters": [ + + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Median Repair Duration", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 500 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH repair_initiates AS (\n SELECT\n slot,\n MIN(time) AS repair_start\n FROM slot\n WHERE kind = 'repair_initiate'\n GROUP BY slot\n),\ncompletions AS (\n SELECT\n slot,\n MIN(time) AS repair_end\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n GROUP BY slot\n),\nrepair_durations AS (\n SELECT\n completions.slot,\n CAST((completions.repair_end - repair_initiates.repair_start) AS BIGINT) / 1000000.0 AS repair_duration_ms,\n repair_initiates.repair_start,\n completions.repair_end\n FROM repair_initiates\n INNER JOIN completions ON repair_initiates.slot = completions.slot\n WHERE completions.repair_end >= repair_initiates.repair_start\n)\nSELECT APPROX_PERCENTILE_CONT(repair_duration_ms, 0.95) AS p95_repair_ms FROM repair_durations", + "sql": { + "columns": [ + { + "parameters": [ + + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "P95 Repair Duration", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 500 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH repair_initiates AS (\n SELECT\n slot,\n MIN(time) AS repair_start\n FROM slot\n WHERE kind = 'repair_initiate'\n GROUP BY slot\n),\ncompletions AS (\n SELECT\n slot,\n MIN(time) AS repair_end\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n GROUP BY slot\n),\nrepair_durations AS (\n SELECT\n completions.slot,\n CAST((completions.repair_end - repair_initiates.repair_start) AS BIGINT) / 1000000.0 AS repair_duration_ms,\n repair_initiates.repair_start,\n completions.repair_end\n FROM repair_initiates\n INNER JOIN completions ON repair_initiates.slot = completions.slot\n WHERE completions.repair_end >= repair_initiates.repair_start\n)\nSELECT MAX(repair_duration_ms) AS max_repair_ms FROM repair_durations", + "sql": { + "columns": [ + { + "parameters": [ + + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Max Repair Duration", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "links": [ + + ], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH repair_initiates AS (\n SELECT\n slot,\n MIN(time) AS repair_start\n FROM slot\n WHERE kind = 'repair_initiate'\n GROUP BY slot\n),\ncompletions AS (\n SELECT\n slot,\n MIN(time) AS repair_end\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n GROUP BY slot\n),\nrepair_durations AS (\n SELECT\n completions.slot,\n CAST((completions.repair_end - repair_initiates.repair_start) AS BIGINT) / 1000000.0 AS repair_duration_ms,\n repair_initiates.repair_start,\n completions.repair_end\n FROM repair_initiates\n INNER JOIN completions ON repair_initiates.slot = completions.slot\n WHERE completions.repair_end >= repair_initiates.repair_start\n)\nSELECT COUNT(*) AS repaired_slots FROM repair_durations", + "sql": { + "columns": [ + { + "parameters": [ + + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Repaired Slots", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Duration from first repair initiation to first completion for repaired slots.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 2, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 500 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [ + + ] + }, + "gridPos": { + "h": 11, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 5, + "links": [ + + ], + "options": { + "barRadius": 0, + "barWidth": 0.9, + "fullHighlight": true, + "groupWidth": 0.72, + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "never", + "stacking": "none", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + }, + "xField": "slot", + "xTickLabelRotation": 45, + "xTickLabelSpacing": 100 + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH repair_initiates AS (\n SELECT\n slot,\n MIN(time) AS repair_start\n FROM slot\n WHERE kind = 'repair_initiate'\n GROUP BY slot\n),\ncompletions AS (\n SELECT\n slot,\n MIN(time) AS repair_end\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n GROUP BY slot\n),\nrepair_durations AS (\n SELECT\n completions.slot,\n CAST((completions.repair_end - repair_initiates.repair_start) AS BIGINT) / 1000000.0 AS repair_duration_ms,\n repair_initiates.repair_start,\n completions.repair_end\n FROM repair_initiates\n INNER JOIN completions ON repair_initiates.slot = completions.slot\n WHERE completions.repair_end >= repair_initiates.repair_start\n)\nSELECT\n CAST(slot AS VARCHAR) AS slot,\n repair_duration_ms,\n repair_start,\n repair_end\nFROM repair_durations\nORDER BY repair_end", + "sql": { + "columns": [ + { + "parameters": [ + + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Repair Duration by Slot", + "transformations": [ + + ], + "type": "barchart" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 41, + "tags": [ + "lightbringer" + ], + "templating": { + "list": [ + + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "hidden": false, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "browser", + "title": "Absolute Repair Time", + "uid": "absolute-repair-time", + "version": 0, + "weekStart": "" +} diff --git a/docker/grafana/dashboards/absolute-slot-completion-time.json b/docker/grafana/dashboards/absolute-slot-completion-time.json new file mode 100644 index 0000000..43f926c --- /dev/null +++ b/docker/grafana/dashboards/absolute-slot-completion-time.json @@ -0,0 +1,543 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": {}, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Most recent time between consecutive completed slots.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 600 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH slot_completions AS (\n SELECT\n slot,\n time AS completion_time,\n LAG(time) OVER (ORDER BY time) AS prev_completion_time\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n),\ncompletion_deltas AS (\n SELECT\n slot,\n completion_time,\n CAST((completion_time - prev_completion_time) AS BIGINT) / 1000000.0 AS completion_delta_ms\n FROM slot_completions\n WHERE prev_completion_time IS NOT NULL\n)\nSELECT completion_delta_ms AS latest_completion_ms\nFROM completion_deltas\nORDER BY completion_time DESC\nLIMIT 1", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Latest Completion Delta", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 600 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH slot_completions AS (\n SELECT\n slot,\n time AS completion_time,\n LAG(time) OVER (ORDER BY time) AS prev_completion_time\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n),\ncompletion_deltas AS (\n SELECT\n slot,\n completion_time,\n CAST((completion_time - prev_completion_time) AS BIGINT) / 1000000.0 AS completion_delta_ms\n FROM slot_completions\n WHERE prev_completion_time IS NOT NULL\n)\nSELECT MEDIAN(completion_delta_ms) AS median_completion_ms FROM completion_deltas", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Median Completion Delta", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 600 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH slot_completions AS (\n SELECT\n slot,\n time AS completion_time,\n LAG(time) OVER (ORDER BY time) AS prev_completion_time\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n),\ncompletion_deltas AS (\n SELECT\n slot,\n completion_time,\n CAST((completion_time - prev_completion_time) AS BIGINT) / 1000000.0 AS completion_delta_ms\n FROM slot_completions\n WHERE prev_completion_time IS NOT NULL\n)\nSELECT APPROX_PERCENTILE_CONT(completion_delta_ms, 0.95) AS p95_completion_ms FROM completion_deltas", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "P95 Completion Delta", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT COUNT(*) AS completed_slots FROM slot WHERE kind = 'completion' AND $__timeFilter(time)", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Completed Slots", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Per-slot completion deltas. Large gaps are intentionally kept visible because they are operationally important.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 2, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 600 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 5, + "links": [], + "options": { + "barRadius": 0, + "barWidth": 0.9, + "fullHighlight": true, + "groupWidth": 0.72, + "legend": { + "calcs": [ + "mean", + "max" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "never", + "stacking": "none", + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + }, + "xField": "slot", + "xTickLabelRotation": 45, + "xTickLabelSpacing": 100 + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH slot_completions AS (\n SELECT\n slot,\n time AS completion_time,\n LAG(time) OVER (ORDER BY time) AS prev_completion_time\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n),\ncompletion_deltas AS (\n SELECT\n slot,\n completion_time,\n CAST((completion_time - prev_completion_time) AS BIGINT) / 1000000.0 AS completion_delta_ms\n FROM slot_completions\n WHERE prev_completion_time IS NOT NULL\n)\nSELECT\n CAST(slot AS VARCHAR) AS slot,\n completion_delta_ms\nFROM completion_deltas\nORDER BY completion_time", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Completion Delta by Slot", + "transformations": [], + "type": "barchart" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 41, + "tags": [ + "lightbringer" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "hidden": false, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "browser", + "title": "Absolute Slot Completion Time", + "uid": "absolute-slot-completion-time", + "version": 0, + "weekStart": "" +} diff --git a/docker/grafana/dashboards/lightbringer-aggregate-stats.json b/docker/grafana/dashboards/lightbringer-aggregate-stats.json new file mode 100644 index 0000000..ef172ce --- /dev/null +++ b/docker/grafana/dashboards/lightbringer-aggregate-stats.json @@ -0,0 +1,1190 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": {}, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT COUNT(*) AS completed_slots FROM slot WHERE kind = 'completion' AND $__timeFilter(time)", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Completed Slots", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Raw count of repair initiations in the selected time range. Use Repair Rate for health thresholds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT COUNT(*) AS repair_starts FROM slot WHERE kind = 'repair_initiate' AND $__timeFilter(time)", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Repair Starts", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 25 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH completions AS (\n SELECT slot, MIN(time) AS completion_time\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n GROUP BY slot\n),\nrepairs AS (\n SELECT slot, MIN(time) AS repair_time\n FROM slot\n WHERE kind = 'repair_initiate'\n GROUP BY slot\n)\nSELECT\n CASE\n WHEN COUNT(*) = 0 THEN 0.0\n ELSE (CAST(SUM(CASE WHEN repairs.slot IS NULL THEN 0 ELSE 1 END) AS DOUBLE) / CAST(COUNT(*) AS DOUBLE)) * 100.0\n END AS repair_rate_percent\nFROM completions\nLEFT JOIN repairs ON repairs.slot = completions.slot", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Repair Rate", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT MAX(slot) AS latest_completed_slot FROM slot WHERE kind = 'completion' AND $__timeFilter(time)", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Latest Completed Slot", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Age of the newest slot event written by Lightbringer. This turns stale if Lightbringer or metrics writes stop.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 30 + }, + { + "color": "red", + "value": 120 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 10, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST((now() - MAX(time)) AS BIGINT) / 1000000000.0 AS last_slot_event_age_seconds FROM slot", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Last Slot Event Age", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Age of the newest memory sample. Memory is sampled every 10 seconds by the Lightbringer metrics thread.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 30 + }, + { + "color": "red", + "value": 120 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 11, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST((now() - MAX(time)) AS BIGINT) / 1000000000.0 AS last_memory_sample_age_seconds FROM memory", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Last Memory Sample Age", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Average and p95 completion delta per 5 second bucket.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 600 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms", + "decimals": 1 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 5, + "links": [], + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "time_series", + "rawQuery": true, + "rawSql": "WITH slot_completions AS (\n SELECT\n slot,\n time AS completion_time,\n LAG(time) OVER (ORDER BY time) AS prev_completion_time\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n),\ncompletion_deltas AS (\n SELECT\n slot,\n completion_time,\n CAST((completion_time - prev_completion_time) AS BIGINT) / 1000000.0 AS completion_delta_ms\n FROM slot_completions\n WHERE prev_completion_time IS NOT NULL\n)\nSELECT\n window_time AS time,\n AVG(completion_delta_ms) AS avg_completion_ms,\n APPROX_PERCENTILE_CONT(completion_delta_ms, 0.95) AS p95_completion_ms\nFROM (\n SELECT\n DATE_BIN(INTERVAL '5 seconds', completion_time) AS window_time,\n completion_delta_ms\n FROM completion_deltas\n)\nGROUP BY window_time\nORDER BY window_time", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Completion Latency (5s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Completed slots per second, bucketed over 10 seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "ops", + "decimals": 2 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 6, + "links": [], + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "time_series", + "rawQuery": true, + "rawSql": "SELECT\n window_time AS time,\n COUNT(*) / 10.0 AS slots_per_second\nFROM (\n SELECT DATE_BIN(INTERVAL '10 seconds', time) AS window_time\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n)\nGROUP BY window_time\nORDER BY window_time", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Completion Throughput (10s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Percentage of completed slots that had a repair initiation.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 25 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "percent", + "decimals": 1 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 7, + "links": [], + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "time_series", + "rawQuery": true, + "rawSql": "WITH completions AS (\n SELECT\n slot,\n MIN(time) AS completion_time\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n GROUP BY slot\n),\nrepairs AS (\n SELECT\n slot,\n MIN(time) AS repair_time\n FROM slot\n WHERE kind = 'repair_initiate'\n GROUP BY slot\n),\nslot_status AS (\n SELECT\n completions.slot,\n DATE_BIN(INTERVAL '10 seconds', completions.completion_time) AS window_time,\n CASE WHEN repairs.slot IS NULL THEN 0 ELSE 1 END AS has_repair\n FROM completions\n LEFT JOIN repairs ON repairs.slot = completions.slot\n)\nSELECT\n window_time AS time,\n (CAST(SUM(has_repair) AS DOUBLE) / CAST(COUNT(*) AS DOUBLE)) * 100.0 AS repair_rate_percent\nFROM slot_status\nGROUP BY window_time\nORDER BY window_time", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Repair Rate (10s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Standard deviation of completion deltas per 5 second bucket.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 600 + }, + { + "color": "red", + "value": 1000 + } + ] + }, + "unit": "ms", + "decimals": 1 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 8, + "links": [], + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "time_series", + "rawQuery": true, + "rawSql": "WITH slot_completions AS (\n SELECT\n slot,\n time AS completion_time,\n LAG(time) OVER (ORDER BY time) AS prev_completion_time\n FROM slot\n WHERE kind = 'completion'\n AND $__timeFilter(time)\n),\ncompletion_deltas AS (\n SELECT\n slot,\n completion_time,\n CAST((completion_time - prev_completion_time) AS BIGINT) / 1000000.0 AS completion_delta_ms\n FROM slot_completions\n WHERE prev_completion_time IS NOT NULL\n)\nSELECT\n window_time AS time,\n STDDEV(completion_delta_ms) AS jitter_ms\nFROM (\n SELECT\n DATE_BIN(INTERVAL '5 seconds', completion_time) AS window_time,\n completion_delta_ms\n FROM completion_deltas\n)\nGROUP BY window_time\nORDER BY window_time", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Completion Jitter (5s)", + "type": "timeseries" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Count of repair initiations per 10 second bucket.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 25 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "none", + "decimals": 0 + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 9, + "links": [], + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "time_series", + "rawQuery": true, + "rawSql": "SELECT\n window_time AS time,\n COUNT(*) AS repair_starts\nFROM (\n SELECT DATE_BIN(INTERVAL '10 seconds', time) AS window_time\n FROM slot\n WHERE kind = 'repair_initiate'\n AND $__timeFilter(time)\n)\nGROUP BY window_time\nORDER BY window_time", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Repair Starts (10s)", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 41, + "tags": [ + "lightbringer" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "hidden": false, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "browser", + "title": "Lightbringer Aggregate Stats", + "uid": "lightbringer-aggregate-stats", + "version": 0, + "weekStart": "" +} diff --git a/docker/grafana/dashboards/lightbringer-memory.json b/docker/grafana/dashboards/lightbringer-memory.json new file mode 100644 index 0000000..232c823 --- /dev/null +++ b/docker/grafana/dashboards/lightbringer-memory.json @@ -0,0 +1,856 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": {}, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 2147483648 + }, + { + "color": "red", + "value": 4294967296 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 0 + }, + "id": 1, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT rss_bytes AS current_rss_bytes FROM memory WHERE $__timeFilter(time) ORDER BY time DESC LIMIT 1", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Current RSS", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 2147483648 + }, + { + "color": "red", + "value": 4294967296 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 2, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT MAX(rss_bytes) AS peak_rss_bytes FROM memory WHERE $__timeFilter(time)", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Peak RSS", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 2147483648 + }, + { + "color": "red", + "value": 4294967296 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 3, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT virtual_bytes AS current_virtual_bytes FROM memory WHERE $__timeFilter(time) ORDER BY time DESC LIMIT 1", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Current Virtual", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 2147483648 + }, + { + "color": "red", + "value": 4294967296 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT MAX(virtual_bytes) AS peak_virtual_bytes FROM memory WHERE $__timeFilter(time)", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Peak Virtual", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Most recent rate of RSS change. Sustained positive growth is the signal to investigate.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 524288 + }, + { + "color": "red", + "value": 2097152 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 0, + "y": 4 + }, + "id": 7, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "WITH samples AS (\n SELECT\n time,\n rss_bytes,\n LAG(time) OVER (ORDER BY time) AS prev_time,\n LAG(rss_bytes) OVER (ORDER BY time) AS prev_rss_bytes\n FROM memory\n WHERE $__timeFilter(time)\n),\nrates AS (\n SELECT\n time,\n (CAST(rss_bytes AS DOUBLE) - CAST(prev_rss_bytes AS DOUBLE)) /\n (CAST((time - prev_time) AS BIGINT) / 1000000000.0) AS rss_bytes_per_second\n FROM samples\n WHERE prev_time IS NOT NULL\n AND CAST((time - prev_time) AS BIGINT) > 0\n)\nSELECT rss_bytes_per_second\nFROM rates\nORDER BY time DESC\nLIMIT 1", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Current RSS Growth", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Age of the newest memory sample. Lightbringer should emit this about every 10 seconds.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 30 + }, + { + "color": "red", + "value": 120 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 4 + }, + "id": 8, + "links": [], + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST((now() - MAX(time)) AS BIGINT) / 1000000000.0 AS last_memory_sample_age_seconds FROM memory", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Last Sample Age", + "type": "stat" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "RSS and virtual memory sampled from /proc/self/statm.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 2147483648 + }, + { + "color": "red", + "value": 4294967296 + } + ] + }, + "unit": "bytes", + "decimals": 1 + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 5, + "links": [], + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "time_series", + "rawQuery": true, + "rawSql": "SELECT\n time,\n rss_bytes,\n virtual_bytes\nFROM memory\nWHERE $__timeFilter(time)\nORDER BY time", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "Process Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "description": "Rate of RSS change between memory samples. Sustained positive growth can indicate a leak or backlog.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 15, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 4, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "yellow", + "value": 524288 + }, + { + "color": "red", + "value": 2097152 + } + ] + }, + "unit": "Bps", + "decimals": 2 + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 6, + "links": [], + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "13.0.1-security-01", + "targets": [ + { + "dataset": "iox", + "editorMode": "code", + "format": "time_series", + "rawQuery": true, + "rawSql": "WITH samples AS (\n SELECT\n time,\n rss_bytes,\n LAG(time) OVER (ORDER BY time) AS prev_time,\n LAG(rss_bytes) OVER (ORDER BY time) AS prev_rss_bytes\n FROM memory\n WHERE $__timeFilter(time)\n)\nSELECT\n time,\n (CAST(rss_bytes AS DOUBLE) - CAST(prev_rss_bytes AS DOUBLE)) /\n (CAST((time - prev_time) AS BIGINT) / 1000000000.0) AS rss_bytes_per_second\nFROM samples\nWHERE prev_time IS NOT NULL\n AND CAST((time - prev_time) AS BIGINT) > 0\nORDER BY time", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ] + }, + "datasource": { + "type": "influxdb", + "uid": "lightbringer-influxdb" + }, + "refId": "A", + "hide": false + } + ], + "title": "RSS Growth Rate", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "10s", + "schemaVersion": 41, + "tags": [ + "lightbringer" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "hidden": false, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "browser", + "title": "Lightbringer Memory", + "uid": "lightbringer-memory", + "version": 0, + "weekStart": "" +} diff --git a/docker/grafana/entrypoint.sh b/docker/grafana/entrypoint.sh new file mode 100755 index 0000000..da64dff --- /dev/null +++ b/docker/grafana/entrypoint.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env sh +set -eu + +token_file="${INFLUXDB_ADMIN_TOKEN_FILE:-/run/secrets/influxdb-admin-token}" +if [ ! -r "$token_file" ]; then + printf 'InfluxDB token file is missing or unreadable: %s\n' "$token_file" >&2 + exit 1 +fi + +INFLUXDB_ADMIN_TOKEN="$(jq -r '.token // empty' "$token_file")" +if [ -z "$INFLUXDB_ADMIN_TOKEN" ]; then + printf 'InfluxDB token file does not contain a token field\n' >&2 + exit 1 +fi +case "$INFLUXDB_ADMIN_TOKEN" in + apiv3_*) ;; + *) + printf 'InfluxDB token must start with apiv3_\n' >&2 + exit 1 + ;; +esac +export INFLUXDB_ADMIN_TOKEN + +exec /run.sh "$@" diff --git a/docker/grafana/provisioning/dashboards/lightbringer.yml b/docker/grafana/provisioning/dashboards/lightbringer.yml new file mode 100644 index 0000000..b41db7e --- /dev/null +++ b/docker/grafana/provisioning/dashboards/lightbringer.yml @@ -0,0 +1,14 @@ +apiVersion: 1 + +providers: + - name: lightbringer + orgId: 1 + folder: Lightbringer + folderUid: lightbringer + type: file + disableDeletion: false + updateIntervalSeconds: 30 + allowUiUpdates: false + options: + path: /var/lib/grafana/dashboards + foldersFromFilesStructure: false diff --git a/docker/grafana/provisioning/datasources/lightbringer-influxdb.yml b/docker/grafana/provisioning/datasources/lightbringer-influxdb.yml new file mode 100644 index 0000000..bbce0c4 --- /dev/null +++ b/docker/grafana/provisioning/datasources/lightbringer-influxdb.yml @@ -0,0 +1,21 @@ +apiVersion: 1 + +deleteDatasources: + - name: lightbringer-influxdb + orgId: 1 + +datasources: + - name: lightbringer-influxdb + uid: lightbringer-influxdb + type: influxdb + access: proxy + url: ${INFLUXDB_URL} + isDefault: true + editable: false + jsonData: + version: SQL + dbName: ${INFLUXDB_DATABASE} + httpMode: POST + insecureGrpc: true + secureJsonData: + token: $INFLUXDB_ADMIN_TOKEN diff --git a/docker/influxdb/init-database.sh b/docker/influxdb/init-database.sh new file mode 100755 index 0000000..5642b92 --- /dev/null +++ b/docker/influxdb/init-database.sh @@ -0,0 +1,122 @@ +#!/usr/bin/env sh +set -eu + +error() { + printf '%s\n' "$*" >&2 + exit 1 +} + +validate_retention_period() { + value="$1" + case "$value" in + ''|none) return 0 ;; + esac + + rest="$value" + while [ -n "$rest" ]; do + number="$(printf '%s' "$rest" | sed -n 's/^\([0-9][0-9]*\).*/\1/p')" + [ -n "$number" ] || error "INFLUXDB_RETENTION_PERIOD must be empty, none, or a duration using h, d, w, mo, or y" + rest="${rest#"$number"}" + case "$rest" in + mo*) rest="${rest#mo}" ;; + h*|d*|w*|y*) rest="${rest#?}" ;; + *) error "INFLUXDB_RETENTION_PERIOD must be empty, none, or a duration using h, d, w, mo, or y" ;; + esac + done +} + +token_file="${INFLUXDB_ADMIN_TOKEN_FILE:-/run/secrets/influxdb-admin-token}" +[ -r "$token_file" ] || error "InfluxDB token file is missing or unreadable: $token_file" + +token="$(sed -n 's/.*"token"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "$token_file" | head -n 1)" +[ -n "$token" ] || error "InfluxDB token file does not contain a token field" +case "$token" in + apiv3_*) ;; + *) error "InfluxDB token must start with apiv3_" ;; +esac +export INFLUXDB3_AUTH_TOKEN="$token" + +database="${INFLUXDB_DATABASE:-lightbringer}" +case "$database" in + [A-Za-z0-9]*) ;; + *) + error "INFLUXDB_DATABASE must start with a letter or number and contain only letters, numbers, underscore, dash, or slash" + ;; +esac +case "$database" in + *[!A-Za-z0-9_/-]*) + error "INFLUXDB_DATABASE must start with a letter or number and contain only letters, numbers, underscore, dash, or slash" + ;; +esac +[ "${#database}" -le 64 ] || error "INFLUXDB_DATABASE must be 64 characters or fewer" + +influxdb_url="${INFLUXDB_URL:-http://influxdb3:8181}" +retention_period="${INFLUXDB_RETENTION_PERIOD:-}" +validate_retention_period "$retention_period" + +output_file="$(mktemp)" +trap 'rm -f "$output_file"' EXIT + +run_create() { + ready_message="$1" + exists_message="$2" + failure_message="$3" + shift 3 + + : >"$output_file" + set +e + "$@" >"$output_file" 2>&1 + status="$?" + set -e + + if [ "$status" -eq 0 ]; then + printf '%s\n' "$ready_message" + elif grep -Eiq 'already.*exists|exists.*already|resource that already exists|409 Conflict' "$output_file"; then + printf '%s\n' "$exists_message" + else + printf '%s\n' "$failure_message" >&2 + cat "$output_file" >&2 + exit "$status" + fi +} + +if [ -n "$retention_period" ]; then + run_create \ + "InfluxDB database is ready: $database" \ + "InfluxDB database already exists: $database" \ + "Failed to create InfluxDB database $database" \ + influxdb3 create database \ + --host "$influxdb_url" \ + --retention-period "$retention_period" \ + "$database" +else + run_create \ + "InfluxDB database is ready: $database" \ + "InfluxDB database already exists: $database" \ + "Failed to create InfluxDB database $database" \ + influxdb3 create database \ + --host "$influxdb_url" \ + "$database" +fi + +run_create \ + 'InfluxDB table is ready: slot' \ + 'InfluxDB table already exists: slot' \ + 'Failed to create InfluxDB table slot' \ + influxdb3 create table \ + --host "$influxdb_url" \ + --database "$database" \ + --tags kind \ + --fields slot:int64 \ + slot + +run_create \ + 'InfluxDB table is ready: memory' \ + 'InfluxDB table already exists: memory' \ + 'Failed to create InfluxDB table memory' \ + influxdb3 create table \ + --host "$influxdb_url" \ + --database "$database" \ + --tags kind \ + --fields rss_bytes:int64,virtual_bytes:int64 \ + memory diff --git a/docker/secrets/check-readable.sh b/docker/secrets/check-readable.sh new file mode 100755 index 0000000..6ddf4cc --- /dev/null +++ b/docker/secrets/check-readable.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env sh +set -eu + +error() { + printf '%s\n' "$*" >&2 + exit 1 +} + +require_other_read() { + path="$1" + label="$2" + + [ -f "$path" ] || error "$label is missing at $path" + + mode="$(stat -c '%a' "$path")" || error "failed to inspect $label permissions" + case "$mode" in + *[4567]) ;; + *) + error "$label must be readable by non-root containers. Use chmod 644 for the file and keep its directory private with chmod 700." + ;; + esac +} + +influxdb_token_file="${INFLUXDB_ADMIN_TOKEN_FILE:-/run/secrets/influxdb-admin-token}" +grafana_password_file="${GRAFANA_ADMIN_PASSWORD_FILE:-/run/secrets/grafana-admin-password}" + +require_other_read "$influxdb_token_file" "InfluxDB token file" +require_other_read "$grafana_password_file" "Grafana password file" + +if [ -n "${LIGHTBRINGER_CONFIG_FILE:-}" ]; then + require_other_read "$LIGHTBRINGER_CONFIG_FILE" "Lightbringer config file" +fi + +token="$(sed -n 's/.*"token"[[:space:]]*:[[:space:]]*"\([^"]*\)".*/\1/p' "$influxdb_token_file" | head -n 1)" +[ -n "$token" ] || error "InfluxDB token file does not contain a token field" +case "$token" in + apiv3_*) ;; + *) error "InfluxDB token must start with apiv3_" ;; +esac + +[ -s "$grafana_password_file" ] || error "Grafana password file must not be empty" + +printf '%s\n' "Compose inputs are readable by non-root containers" diff --git a/secrets/.gitignore b/secrets/.gitignore new file mode 100644 index 0000000..323b721 --- /dev/null +++ b/secrets/.gitignore @@ -0,0 +1,4 @@ +* +!.gitignore +!*.example +!*.example.* diff --git a/secrets/grafana-admin-password.example b/secrets/grafana-admin-password.example new file mode 100644 index 0000000..61339e0 --- /dev/null +++ b/secrets/grafana-admin-password.example @@ -0,0 +1 @@ +replace-with-a-long-random-password diff --git a/secrets/influxdb-admin-token.example.json b/secrets/influxdb-admin-token.example.json new file mode 100644 index 0000000..7ee9213 --- /dev/null +++ b/secrets/influxdb-admin-token.example.json @@ -0,0 +1,5 @@ +{ + "token": "replace-with-output-from-influxdb3-create-token", + "name": "admin", + "description": "Admin token for local Lightbringer observability" +} diff --git a/src/coding.rs b/src/coding.rs index 239e662..0fc975f 100644 --- a/src/coding.rs +++ b/src/coding.rs @@ -6,7 +6,7 @@ fn try_deshred() { use solana_ledger::shred::Shred; - simple_logger::init_with_level(log::Level::Info).unwrap(); + let _ = simple_logger::init_with_level(log::Level::Info); let f = fs::read("./decoded_shreds.json").unwrap(); let raw_shreds: Vec = serde_json::from_slice(&f).unwrap(); let shreds = raw_shreds.into_iter().map(|s| { @@ -38,7 +38,7 @@ fn try_deshred() { fn try_decode_and_deshred() { use std::fs; - simple_logger::init_with_level(log::Level::Info).unwrap(); + let _ = simple_logger::init_with_level(log::Level::Info); let f = fs::read("./stored_shreds.json").unwrap(); let raw_shreds: Vec = serde_json::from_slice(&f).unwrap(); let shreds = raw_shreds.into_iter().map(|s| { diff --git a/src/config.rs b/src/config.rs index e027db8..ef398a5 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,4 +1,8 @@ -use std::{net::SocketAddr, path::PathBuf}; +use std::{ + fs, + net::{SocketAddr, ToSocketAddrs}, + path::{Path, PathBuf}, +}; use anyhow::anyhow; use figment::{ @@ -8,14 +12,89 @@ use figment::{ use http::Uri; use serde::{Deserialize, Serialize}; use serde_with::{DisplayFromStr, serde_as}; +use solana_net_utils::MINIMUM_VALIDATOR_PORT_RANGE_WIDTH; +use solana_quic_definitions::QUIC_PORT_OFFSET; #[derive(Serialize, Deserialize)] +struct InfluxDbConfigRaw { + pub host: String, + pub database: String, + pub token: Option, + pub token_file: Option, +} + pub struct InfluxDbConfig { pub host: String, pub database: String, pub token: String, } +#[derive(Deserialize)] +struct InfluxDbTokenFile { + token: String, +} + +fn read_influxdb_token_file(path: &Path) -> anyhow::Result { + let contents = fs::read_to_string(path).map_err(|e| { + anyhow!( + "failed to read `influxdb.token_file` {}: {e}", + path.display() + ) + })?; + let trimmed = contents.trim(); + if trimmed.is_empty() { + return Err(anyhow!("`influxdb.token_file` {} is empty", path.display())); + } + if trimmed.starts_with('{') { + let token_file: InfluxDbTokenFile = serde_json::from_str(trimmed).map_err(|e| { + anyhow!( + "failed to parse `influxdb.token_file` {} as JSON: {e}", + path.display() + ) + })?; + let token = token_file.token.trim(); + if token.is_empty() { + return Err(anyhow!( + "`influxdb.token_file` {} does not contain a token", + path.display() + )); + } + Ok(token.to_string()) + } else { + Ok(trimmed.to_string()) + } +} + +impl TryFrom for InfluxDbConfig { + type Error = anyhow::Error; + + fn try_from(value: InfluxDbConfigRaw) -> Result { + let token = match (value.token, value.token_file) { + (Some(_), Some(_)) => { + return Err(anyhow!( + "set only one of `influxdb.token` or `influxdb.token_file`" + )); + } + (Some(token), None) if token.trim().is_empty() => { + return Err(anyhow!("`influxdb.token` must not be empty")); + } + (Some(token), None) => token.trim().to_string(), + (None, Some(token_file)) => read_influxdb_token_file(&token_file)?, + (None, None) => { + return Err(anyhow!( + "set one of `influxdb.token` or `influxdb.token_file`" + )); + } + }; + + Ok(Self { + host: value.host, + database: value.database, + token, + }) + } +} + #[serde_as] #[derive(Serialize, Deserialize)] pub struct BlockConfirmationConfig { @@ -25,6 +104,44 @@ pub struct BlockConfirmationConfig { pub rpc_http: Uri, } +fn default_gossip_port() -> u16 { + 65400 +} + +fn default_port_range_start() -> u16 { + 65401 +} + +fn default_port_range_end() -> u16 { + 65500 +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct GossipConfig { + #[serde(default = "default_gossip_port")] + pub gossip_port: u16, + #[serde(default = "default_port_range_start")] + pub port_range_start: u16, + #[serde(default = "default_port_range_end")] + pub port_range_end: u16, +} + +impl Default for GossipConfig { + fn default() -> Self { + Self { + gossip_port: default_gossip_port(), + port_range_start: default_port_range_start(), + port_range_end: default_port_range_end(), + } + } +} + +impl GossipConfig { + pub fn port_range(self) -> (u16, u16) { + (self.port_range_start, self.port_range_end) + } +} + /// Logger configuration. /// /// When `quiet` is true, only Warn/Error level logs are emitted. Runtime @@ -41,7 +158,9 @@ struct ConfigRaw { storage: String, rpc_addr: String, grpc_addr: String, - influxdb: Option, + #[serde(default)] + gossip: GossipConfig, + influxdb: Option, block_confirmation: Option, log: Option, } @@ -53,6 +172,7 @@ impl Default for ConfigRaw { storage: "./shred-store".to_string(), rpc_addr: "127.0.0.1:3000".to_string(), grpc_addr: "127.0.0.1:3001".to_string(), + gossip: GossipConfig::default(), influxdb: None, block_confirmation: None, log: None, @@ -65,6 +185,7 @@ pub struct Config { pub storage: PathBuf, pub rpc_addr: SocketAddr, pub grpc_addr: SocketAddr, + pub gossip: GossipConfig, pub influxdb: Option, pub block_confirmation: Option, pub log: Option, @@ -74,11 +195,14 @@ impl TryFrom for Config { type Error = anyhow::Error; fn try_from(value: ConfigRaw) -> Result { - let gossip_entrypoint: SocketAddr = value + let gossip_entrypoint_raw = value .gossip_entrypoint - .ok_or_else(|| anyhow!("`gossip_entrypoint` must be specified in config"))? - .parse() - .map_err(|e| anyhow!("invalid `gossip_entrypoint`: {e}"))?; + .ok_or_else(|| anyhow!("`gossip_entrypoint` must be specified in config"))?; + let gossip_entrypoint: SocketAddr = gossip_entrypoint_raw + .to_socket_addrs() + .map_err(|e| anyhow!("invalid `gossip_entrypoint`: {e}"))? + .find(SocketAddr::is_ipv4) + .ok_or_else(|| anyhow!("`gossip_entrypoint` did not resolve to an IPv4 address"))?; let storage = PathBuf::from(value.storage); @@ -92,12 +216,54 @@ impl TryFrom for Config { .parse() .map_err(|e| anyhow!("invalid `grpc_addr`: {e}"))?; + if value.gossip.gossip_port == 0 { + return Err(anyhow!("`gossip.gossip_port` must be non-zero")); + } + if value.gossip.port_range_start == 0 || value.gossip.port_range_end == 0 { + return Err(anyhow!("`gossip.port_range_*` values must be non-zero")); + } + if value.gossip.port_range_start > value.gossip.port_range_end { + return Err(anyhow!( + "`gossip.port_range_start` must be <= `gossip.port_range_end`" + )); + } + if value + .gossip + .port_range_end + .saturating_sub(value.gossip.port_range_start) + < MINIMUM_VALIDATOR_PORT_RANGE_WIDTH + { + return Err(anyhow!( + "`gossip.port_range_end - gossip.port_range_start` must be at least {MINIMUM_VALIDATOR_PORT_RANGE_WIDTH}" + )); + } + if value + .gossip + .port_range_end + .checked_add(QUIC_PORT_OFFSET) + .is_none() + { + return Err(anyhow!( + "`gossip.port_range_end + {QUIC_PORT_OFFSET}` must fit in u16" + )); + } + if (value.gossip.port_range_start..=value.gossip.port_range_end) + .contains(&value.gossip.gossip_port) + { + return Err(anyhow!( + "`gossip.gossip_port` must not overlap `gossip.port_range_start..=gossip.port_range_end`" + )); + } + + let influxdb = value.influxdb.map(InfluxDbConfig::try_from).transpose()?; + Ok(Self { gossip_entrypoint, storage, rpc_addr, grpc_addr, - influxdb: value.influxdb, + gossip: value.gossip, + influxdb, block_confirmation: value.block_confirmation, log: value.log, }) @@ -121,6 +287,10 @@ impl Config { mod tests { use super::*; use figment::providers::{Format, Serialized, Toml}; + use std::{ + sync::atomic::{AtomicUsize, Ordering}, + time::{SystemTime, UNIX_EPOCH}, + }; fn parse_toml(toml: &str) -> ConfigRaw { Figment::new() @@ -137,6 +307,21 @@ rpc_addr = "127.0.0.1:3000" grpc_addr = "127.0.0.1:3001" "#; + fn temp_token_file(contents: &str) -> PathBuf { + static COUNTER: AtomicUsize = AtomicUsize::new(0); + let suffix = COUNTER.fetch_add(1, Ordering::Relaxed); + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time before unix epoch") + .as_nanos(); + let path = std::env::temp_dir().join(format!( + "lightbringer-influxdb-token-{}-{nanos}-{suffix}.json", + std::process::id() + )); + std::fs::write(&path, contents).expect("write token file"); + path + } + #[test] fn log_section_absent_yields_none() { let raw = parse_toml(REQUIRED); @@ -184,4 +369,110 @@ grpc_addr = "127.0.0.1:3001" let cfg: Config = raw.try_into().expect("validate"); assert!(cfg.log.expect("log").quiet); } + + #[test] + fn gossip_config_defaults_to_existing_ports() { + let raw = parse_toml(REQUIRED); + let cfg: Config = raw.try_into().expect("validate"); + assert_eq!( + cfg.gossip, + GossipConfig { + gossip_port: 65400, + port_range_start: 65401, + port_range_end: 65500, + } + ); + } + + #[test] + fn gossip_config_can_override_ports() { + let toml = format!( + "{REQUIRED}\n[gossip]\ngossip_port = 55000\nport_range_start = 55001\nport_range_end = 55100\n" + ); + let raw = parse_toml(&toml); + let cfg: Config = raw.try_into().expect("validate"); + assert_eq!(cfg.gossip.gossip_port, 55000); + assert_eq!(cfg.gossip.port_range(), (55001, 55100)); + } + + #[test] + fn gossip_config_rejects_invalid_port_range() { + let toml = format!( + "{REQUIRED}\n[gossip]\ngossip_port = 55000\nport_range_start = 55100\nport_range_end = 55001\n" + ); + let raw = parse_toml(&toml); + let result: Result = raw.try_into(); + assert!(result.is_err(), "expected invalid port range to fail"); + } + + #[test] + fn gossip_config_rejects_too_narrow_port_range() { + let toml = format!( + "{REQUIRED}\n[gossip]\ngossip_port = 55000\nport_range_start = 55001\nport_range_end = 55010\n" + ); + let raw = parse_toml(&toml); + let result: Result = raw.try_into(); + assert!(result.is_err(), "expected narrow port range to fail"); + } + + #[test] + fn gossip_config_rejects_port_range_that_overflows_quic_offset() { + let toml = format!( + "{REQUIRED}\n[gossip]\ngossip_port = 55000\nport_range_start = 65500\nport_range_end = 65535\n" + ); + let raw = parse_toml(&toml); + let result: Result = raw.try_into(); + assert!( + result.is_err(), + "expected high port range to fail QUIC offset validation" + ); + } + + #[test] + fn gossip_config_rejects_overlapping_gossip_port() { + let toml = format!( + "{REQUIRED}\n[gossip]\ngossip_port = 55010\nport_range_start = 55001\nport_range_end = 55100\n" + ); + let raw = parse_toml(&toml); + let result: Result = raw.try_into(); + assert!(result.is_err(), "expected overlapping gossip port to fail"); + } + + #[test] + fn gossip_entrypoint_hostname_resolves_to_ipv4() { + let toml = REQUIRED.replace("127.0.0.1:8000", "localhost:8000"); + let raw = parse_toml(&toml); + let cfg: Config = raw.try_into().expect("validate"); + assert!(cfg.gossip_entrypoint.is_ipv4()); + assert_eq!(cfg.gossip_entrypoint.port(), 8000); + } + + #[test] + fn influxdb_token_file_json_parses() { + let token_file = temp_token_file(r#"{"token":"apiv3_test-token"}"#); + let toml = format!( + "{REQUIRED}\n[influxdb]\nhost = \"http://127.0.0.1:18181\"\ndatabase = \"lightbringer\"\ntoken_file = \"{}\"\n", + token_file.display() + ); + let raw = parse_toml(&toml); + let cfg: Config = raw.try_into().expect("validate"); + std::fs::remove_file(token_file).ok(); + assert_eq!(cfg.influxdb.expect("influxdb").token, "apiv3_test-token"); + } + + #[test] + fn influxdb_rejects_token_and_token_file_together() { + let token_file = temp_token_file(r#"{"token":"apiv3_test-token"}"#); + let toml = format!( + "{REQUIRED}\n[influxdb]\nhost = \"http://127.0.0.1:18181\"\ndatabase = \"lightbringer\"\ntoken = \"apiv3_inline\"\ntoken_file = \"{}\"\n", + token_file.display() + ); + let raw = parse_toml(&toml); + let result: Result = raw.try_into(); + std::fs::remove_file(token_file).ok(); + assert!( + result.is_err(), + "expected inline token and token_file to fail" + ); + } } diff --git a/src/gossip_manager.rs b/src/gossip_manager.rs index c942d7b..44d10d5 100644 --- a/src/gossip_manager.rs +++ b/src/gossip_manager.rs @@ -18,6 +18,8 @@ use solana_net_utils::{get_public_ip_addr_with_binding, multihomed_sockets::Bind use solana_sdk::{signature::Keypair, signer::Signer}; use solana_streamer::socket::SocketAddrSpace; +use crate::config::GossipConfig; + pub struct GossipManager { exit: Arc, gossip_service: GossipService, @@ -30,15 +32,19 @@ pub struct Sockets { } impl GossipManager { - pub fn new(gossip_entry: SocketAddr, keypair: Arc) -> anyhow::Result<(Self, Sockets)> { + pub fn new( + gossip_entry: SocketAddr, + keypair: Arc, + gossip_config: GossipConfig, + ) -> anyhow::Result<(Self, Sockets)> { let cluster_entrypoints = vec![ContactInfo::new_gossip_entry_point(&gossip_entry)]; let bind_ip_addr = IpAddr::V4(Ipv4Addr::UNSPECIFIED); let my_ip = get_public_ip_addr_with_binding(&gossip_entry, bind_ip_addr) .map_err(|e| anyhow!("Failed to get public IP: {}", e))?; let node_config = NodeConfig { - gossip_port: 65400, - port_range: (65401, 65500), + gossip_port: gossip_config.gossip_port, + port_range: gossip_config.port_range(), advertised_ip: my_ip, public_tpu_addr: None, public_tpu_forwards_addr: None, @@ -88,18 +94,10 @@ impl GossipManager { self.cluster_info.my_contact_info() } - // pub fn lookup_info(&self, pubkey: &Pubkey) -> Option { - // self.cluster_info.lookup_contact_info(pubkey, |x| x.clone()) - // } - pub fn get_cluster_info(&self) -> Arc { self.cluster_info.clone() } - pub fn get_all_peers(&self) -> Vec<(ContactInfo, u64)> { - self.cluster_info.all_peers() - } - pub fn stop(self) -> anyhow::Result<()> { self.exit.store(true, Ordering::Relaxed); diff --git a/src/grpc_slot_stream/shred_source.rs b/src/grpc_slot_stream/shred_source.rs index b5e58cb..b36ceaa 100644 --- a/src/grpc_slot_stream/shred_source.rs +++ b/src/grpc_slot_stream/shred_source.rs @@ -16,6 +16,10 @@ use crate::{ types::{PacketInfo, ShredInfoView}, }; +type FinishedSlots = Rc>>; +type SlotCache = Rc), 300>>>; +type BlockNotification = Rc>)>>>; + pub struct SlotForGrpc { pub slot: u64, pub shreds: Vec, @@ -58,13 +62,13 @@ impl ShredSource for SlotMetaShreds { #[derive(Clone, Default)] struct SlotShredsWaiter { - finished_slots: Rc>>, - slot_cache: Rc), 300>>>, - block_notif: Rc>)>>>, + finished_slots: FinishedSlots, + slot_cache: SlotCache, + block_notif: BlockNotification, } impl SlotShredsWaiter { - /// Insert a slot, notifying the receiver if they are waiting for it + /// Inserts a slot and notifies any waiter for it. fn insert(&self, slot: SlotRaw) { let mut block_notif = self.block_notif.borrow_mut(); let mut finished_slots = self.finished_slots.borrow_mut(); @@ -153,8 +157,7 @@ async fn confirmed_slot_shreds_glommio_runner_with_backqueue( Some(()) })); - // This wrapper is required because the confirmation stream must be driven constantly - // else the websocket connection will be dropped + // The confirmation stream must be polled continuously to keep the websocket open. let (conf_stream_tx, conf_stream_rx) = local_channel::new_bounded(1000); let conf_stream_handle = spawn_local(async move { while let Ok(notif) = conf_stream.next().await { diff --git a/src/main.rs b/src/main.rs index 11d6936..1abba29 100644 --- a/src/main.rs +++ b/src/main.rs @@ -95,7 +95,8 @@ fn main() { let keypair = Arc::new(Keypair::new()); - let (gossip, sockets) = GossipManager::new(conf.gossip_entrypoint, keypair.clone()).unwrap(); + let (gossip, sockets) = + GossipManager::new(conf.gossip_entrypoint, keypair.clone(), conf.gossip).unwrap(); let version = gossip.version; let mut threadpool = ThreadManager::<7>::new(); @@ -128,12 +129,12 @@ fn main() { } })); - // Shred Filter + // Shred filter threadpool.spawn(move |exit| packet_filter_loop(exit, filter_rx, slot_meta_tx.to_sync())); - // Slot Repair + // Slot repair let (repair_tx, repair_rx) = kanal::bounded_async(10000); - // allow upto 20 slots to be queued for repairing at a time + // Allow up to 20 slots to queue for repair. let (repair_socket_tx, repair_socket_rx) = kanal::bounded_async(20); let (repair_manager_tx, repair_manager_rx) = kanal::unbounded_async(); @@ -163,7 +164,7 @@ fn main() { .await }); - // Shred Storage + // Shred storage let shred_store = ShredStore::new(lsm_ks, shred_cutoff_slot).unwrap(); threadpool.spawn( enclose!((shred_store) move |exit| shred_store.slot_listener_loop(exit, slot_store_rx)), diff --git a/src/metrics/points.rs b/src/metrics/points.rs index 5b9c69b..e59d89b 100644 --- a/src/metrics/points.rs +++ b/src/metrics/points.rs @@ -71,11 +71,31 @@ impl DataPoint for SlotMeasurement { const PAGE_SIZE: u64 = 4096; // x86_64 Linux +pub enum MemoryMeasurementKind { + Process, +} + +impl Display for MemoryMeasurementKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + MemoryMeasurementKind::Process => write!(f, "process"), + } + } +} + +impl From for influxdb::Type { + fn from(value: MemoryMeasurementKind) -> Self { + Self::Text(value.to_string()) + } +} + #[derive(InfluxDbWriteable)] pub struct MemoryMeasurement { time: Timestamp, rss_bytes: u64, virtual_bytes: u64, + #[influxdb(tag)] + kind: MemoryMeasurementKind, } impl MemoryMeasurement { @@ -88,6 +108,7 @@ impl MemoryMeasurement { time: now(), rss_bytes: rss_pages * PAGE_SIZE, virtual_bytes: virtual_pages * PAGE_SIZE, + kind: MemoryMeasurementKind::Process, }) } } diff --git a/src/packet_filter.rs b/src/packet_filter.rs index 071a483..595fa4e 100644 --- a/src/packet_filter.rs +++ b/src/packet_filter.rs @@ -60,7 +60,7 @@ impl PacketProcessor { } } struct PacketProcessorPool { - jobs: Vec>, + _jobs: Vec>, senders: Vec>, leader_schedule: LeaderScheduleSync, } @@ -88,7 +88,7 @@ impl PacketProcessorPool { } Self { - jobs, + _jobs: jobs, senders, leader_schedule, } diff --git a/src/repair/outstanding_timers.rs b/src/repair/outstanding_timers.rs index 80aac8e..759c78a 100644 --- a/src/repair/outstanding_timers.rs +++ b/src/repair/outstanding_timers.rs @@ -63,25 +63,31 @@ pub struct OutstandingTimerStore( BTreeMap, Instant)>, ); +struct OutstandingTimer { + kind: OutstandingRequestKind, + nonce: Nonce, + slot: u64, + shred_index: u32, + socket: SocketAddr, + sent_at: Instant, + timer: TimerActionOnce<()>, +} + impl OutstandingTimerStore { - pub fn insert( - &mut self, - slot: u64, - nonce: Nonce, - socket: SocketAddr, - req_type: OutstandingRequestKind, - shred_index: u32, - timer: TimerActionOnce<()>, - sent_at: Instant, - ) { + fn insert(&mut self, request: OutstandingTimer) { self.0.insert( - repair_key(slot, nonce, socket), - (req_type, shred_index, timer, sent_at), + repair_key(request.slot, request.nonce, request.socket), + ( + request.kind, + request.shred_index, + request.timer, + request.sent_at, + ), ); } - /// remove an outstanding timer - /// returning the request type and the time it was sent + /// Removes an outstanding timer. + /// Returns the request kind and send time. pub fn remove( &mut self, slot: u64, @@ -130,20 +136,20 @@ pub async fn start_outstanding_requests_loop( let sent_at = req.sent_at; let (slot, nonce, socket, kind, shred) = (req.slot, req.nonce, req.socket, req.kind, req.shred); - outstanding_timers.borrow_mut().insert( + outstanding_timers.borrow_mut().insert(OutstandingTimer { slot, nonce, socket, kind, - shred, - TimerActionOnce::do_in( + shred_index: shred, + timer: TimerActionOnce::do_in( REPAIR_REQUEST_TIMEOUT, enclose!((outstanding_tx) async move { _ = outstanding_tx.try_send(OutstandingRequestMsg::Timeout(req)); }), ), sent_at, - ); + }); } OutstandingRequestMsg::Timeout(mut req) => { let removed = outstanding_timers @@ -176,20 +182,20 @@ pub async fn start_outstanding_requests_loop( let sent_at = req.sent_at; let (slot, nonce, socket, kind, shred) = (req.slot, req.nonce, req.socket, req.kind, req.shred); - outstanding_timers.borrow_mut().insert( + outstanding_timers.borrow_mut().insert(OutstandingTimer { slot, nonce, socket, kind, - shred, - TimerActionOnce::do_in( + shred_index: shred, + timer: TimerActionOnce::do_in( REPAIR_REQUEST_TIMEOUT, enclose!((outstanding_tx) async move { _ = outstanding_tx.try_send(OutstandingRequestMsg::Timeout(req)); }), ), sent_at, - ); + }); } } } diff --git a/src/store/shred.rs b/src/store/shred.rs index 3d7fc1a..21eafdb 100644 --- a/src/store/shred.rs +++ b/src/store/shred.rs @@ -20,11 +20,7 @@ use crate::{ pub const SHRED_KEYSPACE: &str = "shred_store"; const RETENTION_SLOTS: u64 = 72_000; // ~ 8 hrs - -pub struct ShredRes { - data: Option, - code: Option, -} +type CompactionFilterFactories = Arc Option> + Send + Sync>; #[derive(Clone)] pub struct SlotRaw { @@ -68,9 +64,7 @@ impl Factory for ShredCutoffFactory { } } -pub fn compaction_filter_factories( - cutoff_slot: Arc, -) -> Arc Option> + Send + Sync> { +pub fn compaction_filter_factories(cutoff_slot: Arc) -> CompactionFilterFactories { Arc::new(move |keyspace| match keyspace { SHRED_KEYSPACE => Some(Arc::new(ShredCutoffFactory(cutoff_slot.clone()))), _ => None, diff --git a/src/store/slot_meta.rs b/src/store/slot_meta.rs index 604ebb4..15079fa 100644 --- a/src/store/slot_meta.rs +++ b/src/store/slot_meta.rs @@ -1,7 +1,3 @@ -// use pure in-memory data structures -// we'll use scc::HashMap -// FOR later: investigate queues - use std::{ collections::{BTreeSet, HashMap}, rc::Rc, @@ -34,7 +30,7 @@ pub struct SlotMetadata { pub timestamp_ms: u64, pub completed_batches: BTreeSet, pub required_batches: Option, - // highest shred index seen + // Highest shred index seen. pub max_inclusive_shred: u32, pub shreds: Option>, pub timed_out: bool, @@ -108,8 +104,8 @@ impl SlotMetadata { missing_shreds } - /// find required shreds to complete the slot - /// returning None if the last shred hasn't been seen yet + /// Finds required shreds to complete the slot. + /// Returns unbounded repairs until the last shred is seen. pub fn calculate_missing_shreds(&self) -> RepairReq { if let Some(shreds) = self.calculate_missing_shreds_bounded() { return RepairReq::MissingBoundedShreds { @@ -141,14 +137,14 @@ pub struct SlotMetadataStore { } enum SlotTimerMsg { - ShredInsertion { slot: u64 }, - ShredCompletion { slot: u64 }, - ShredTimeout { slot: u64 }, + Insertion { slot: u64 }, + Completion { slot: u64 }, + Timeout { slot: u64 }, } impl SlotMetadataStore { pub fn new(version: u16) -> Self { - // stores the last 4096 slots only + // Stores the last 4096 slots. let hash_cache = scc::HashCache::with_capacity(0, 4096); Self { inner: Arc::new(hash_cache), @@ -190,12 +186,12 @@ impl SlotMetadataStore { let timer_msg = match store_res { SlotMetaStoreRes::Ignored => continue, SlotMetaStoreRes::Complete(raw_slot) => { - _ = timer_tx.try_send(SlotTimerMsg::ShredCompletion { slot }); + _ = timer_tx.try_send(SlotTimerMsg::Completion { slot }); _ = store_tx.send(raw_slot.clone()).await; _ = grpc_tx.send(raw_slot).await; continue; } - SlotMetaStoreRes::Incomplete => SlotTimerMsg::ShredInsertion { slot }, + SlotMetaStoreRes::Incomplete => SlotTimerMsg::Insertion { slot }, }; _ = timer_tx.try_send(timer_msg); } @@ -206,7 +202,7 @@ impl SlotMetadataStore { let mut timers: HashMap> = HashMap::new(); while let Some(msg) = timer_rx.recv().await { match msg { - SlotTimerMsg::ShredInsertion { slot } => { + SlotTimerMsg::Insertion { slot } => { timers .entry(slot) .and_modify(|timer| { @@ -215,11 +211,11 @@ impl SlotMetadataStore { .or_insert_with(|| { let timer_tx = timer_tx.clone(); TimerActionOnce::do_in(DEFER_REPAIR_THRESHOLD, async move { - _ = timer_tx.send(SlotTimerMsg::ShredTimeout { slot }).await; + _ = timer_tx.send(SlotTimerMsg::Timeout { slot }).await; }) }); } - SlotTimerMsg::ShredCompletion { slot } => { + SlotTimerMsg::Completion { slot } => { if let Some(timer) = timers.remove(&slot) { timer.destroy(); } @@ -234,7 +230,7 @@ impl SlotMetadataStore { _ = repair_tx.send(RepairReq::CancelRepair { slot }).await; } } - SlotTimerMsg::ShredTimeout { slot } => { + SlotTimerMsg::Timeout { slot } => { timers.remove(&slot); let Some(mut slot_meta) = self.inner.get_sync(&slot) else { continue; @@ -257,7 +253,7 @@ impl SlotMetadataStore { ); } - // stores the shred, returning whether slots are complete or not + // Stores a shred and reports whether the slot is complete. fn store_shred(&self, hdr: ShredCommonHeader, shred: PacketInfo) -> SlotMetaStoreRes { let slot = hdr.slot; let fec_index = hdr.fec_set_index;