From bb4951a7b6a71757ec7b3af5607bef29400aa068 Mon Sep 17 00:00:00 2001 From: zuchka Date: Fri, 8 May 2026 18:31:23 -0700 Subject: [PATCH 1/6] feat(evaluator): humanize_duration + default template helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pure functions, not yet wired into renderMessage — that lands in the next commit. Covered by 19 table-driven test cases across both helpers. humanize_duration: any numeric seconds value through time.Duration.String() default: returns fallback only on nil or empty string (narrower than sprig — 0 and false pass through to avoid {{ .exit_code | default 0 }} footgun) Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/evaluator/template_funcs.go | 57 +++++++++++++++++++ internal/evaluator/template_funcs_test.go | 68 +++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 internal/evaluator/template_funcs.go create mode 100644 internal/evaluator/template_funcs_test.go diff --git a/internal/evaluator/template_funcs.go b/internal/evaluator/template_funcs.go new file mode 100644 index 0000000..6de1d9c --- /dev/null +++ b/internal/evaluator/template_funcs.go @@ -0,0 +1,57 @@ +package evaluator + +import ( + "fmt" + "time" +) + +// humanizeDuration converts a numeric seconds value to a human-readable +// duration string using Go's native time.Duration.String() format +// (e.g., 30m43s, 4m7.3s, 500ms). Non-numeric inputs pass through via +// fmt.Sprint so templates don't crash on bad data; users see something +// obviously wrong (e.g. "<nil>", "[1 2]") and can fix the template. +// +// Used as a text/template FuncMap entry. 
Pipe form in templates: +// +// {{ .duration_seconds | humanize_duration }} +func humanizeDuration(seconds interface{}) string { + var f float64 + switch v := seconds.(type) { + case float64: + f = v + case float32: + f = float64(v) + case int: + f = float64(v) + case int32: + f = float64(v) + case int64: + f = float64(v) + case uint: + f = float64(v) + case uint32: + f = float64(v) + case uint64: + f = float64(v) + default: + return fmt.Sprint(seconds) + } + return time.Duration(f * float64(time.Second)).String() +} + +// defaultValue returns fallback when value is nil or the empty string; +// otherwise returns value unchanged. Intentionally narrower than sprig's +// default to avoid the "0/false fall back" footgun. +// +// Used as a text/template FuncMap entry. Pipe form in templates: +// +// {{ .branch | default "unknown" }} +func defaultValue(fallback interface{}, value interface{}) interface{} { + if value == nil { + return fallback + } + if s, ok := value.(string); ok && s == "" { + return fallback + } + return value +} diff --git a/internal/evaluator/template_funcs_test.go b/internal/evaluator/template_funcs_test.go new file mode 100644 index 0000000..89a4df3 --- /dev/null +++ b/internal/evaluator/template_funcs_test.go @@ -0,0 +1,68 @@ +package evaluator + +import ( + "testing" +) + +// TestHumanizeDuration verifies the duration humanization helper renders +// numeric seconds values via Go's native time.Duration.String() format +// (e.g. 30m43s, 4m7.3s, 500ms), and that non-numeric input passes through +// via fmt.Sprint so templates don't crash on bad data. 
+func TestHumanizeDuration(t *testing.T) { + tests := []struct { + name string + in interface{} + want string + }{ + {"zero", 0, "0s"}, + {"subsecond_float", 0.5, "500ms"}, + {"int_seconds", 7, "7s"}, + {"float_minutes_with_decimal", 247.3, "4m7.3s"}, + {"int_half_hour_plus", 1843, "30m43s"}, + {"int_two_hours_with_zero_minutes", 7245, "2h0m45s"}, + {"negative_seconds", -30, "-30s"}, + {"explicit_float64", float64(60), "1m0s"}, + {"explicit_int64", int64(60), "1m0s"}, + {"explicit_uint", uint(120), "2m0s"}, + {"non_numeric_string_passthrough", "hello", "hello"}, + {"nil_passthrough", nil, "<nil>"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := humanizeDuration(tt.in) + if got != tt.want { + t.Errorf("humanizeDuration(%v) = %q, want %q", tt.in, got, tt.want) + } + }) + } +} + +// TestDefault verifies the default helper returns the fallback only when +// the value is nil or the empty string. We intentionally do NOT replicate +// sprig's "0/false/empty-collection falls back" behavior — those values +// are real and would create footguns for {{ .exit_code | default 0 }}- +// style rules. 
+func TestDefault(t *testing.T) { + tests := []struct { + name string + fallback interface{} + value interface{} + want interface{} + }{ + {"nil_value_uses_fallback", "main", nil, "main"}, + {"empty_string_uses_fallback", "main", "", "main"}, + {"non_empty_string_passes_through", "main", "dev", "dev"}, + {"zero_int_passes_through", 99, 0, 0}, + {"false_passes_through", true, false, false}, + {"string_zero_passes_through", "fallback", "0", "0"}, + {"string_false_passes_through", "fallback", "false", "false"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := defaultValue(tt.fallback, tt.value) + if got != tt.want { + t.Errorf("defaultValue(%v, %v) = %v, want %v", tt.fallback, tt.value, got, tt.want) + } + }) + } +} From 06bc624b3e69678cf6bfab382f675a5a7f8e6164 Mon Sep 17 00:00:00 2001 From: zuchka Date: Fri, 8 May 2026 18:38:49 -0700 Subject: [PATCH 2/6] feat(evaluator): wire template helpers into renderMessage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds template.FuncMap registration for humanize_duration and default plus Option("missingkey=zero") so missing fields surface via Go's zero interface handling. Two new integration tests exercise both helpers through the full renderMessage pipeline. Note: missingkey=zero on map[string]interface{} still renders missing keys as <no value> (nil interface prints that way in Go templates), so TestRenderMessage_MissingFieldStillRendersGracefully required no update — its existing assertion remains correct. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- internal/evaluator/engine.go | 8 ++++- internal/evaluator/template_funcs_test.go | 44 +++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/internal/evaluator/engine.go b/internal/evaluator/engine.go index 4363366..14e9376 100644 --- a/internal/evaluator/engine.go +++ b/internal/evaluator/engine.go @@ -457,7 +457,13 @@ func renderMessage(tmpl string, alert Alert) string { if tmpl == "" { return fmt.Sprintf("rule %q fired (metric=%s value=%v)", alert.Rule, alert.Metric, alert.Value) } - t, err := template.New("msg").Parse(tmpl) + t, err := template.New("msg"). + Option("missingkey=zero"). + Funcs(template.FuncMap{ + "humanize_duration": humanizeDuration, + "default": defaultValue, + }). + Parse(tmpl) if err != nil { return tmpl // return raw if template is invalid } diff --git a/internal/evaluator/template_funcs_test.go b/internal/evaluator/template_funcs_test.go index 89a4df3..1a11b32 100644 --- a/internal/evaluator/template_funcs_test.go +++ b/internal/evaluator/template_funcs_test.go @@ -2,6 +2,7 @@ package evaluator import ( "testing" + "time" ) // TestHumanizeDuration verifies the duration humanization helper renders @@ -66,3 +67,46 @@ func TestDefault(t *testing.T) { }) } } + +// TestRenderMessage_DefaultHelper_FillsMissingField proves the default +// helper works through the full renderMessage pipeline for a field that +// is not in Labels, Floats, or the alert struct. After missingkey=zero +// is wired, the missing field surfaces as nil in the template scope and +// default returns the fallback. 
+func TestRenderMessage_DefaultHelper_FillsMissingField(t *testing.T) { + alert := Alert{ + Rule: "build_failed", + Metric: "run.exit", + Value: 1, + Labels: map[string]string{"runner": "github-actions"}, + FiredAt: time.Now(), + } + + got := renderMessage(`Build on {{ .branch | default "unknown" }} failed`, alert) + + want := "Build on unknown failed" + if got != want { + t.Errorf("renderMessage default helper: got %q, want %q", got, want) + } +} + +// TestRenderMessage_HumanizeDuration_ThroughPipeline proves humanize_duration +// works through the full renderMessage pipeline against the synthetic +// run.exit event shape (duration_seconds in Floats). +func TestRenderMessage_HumanizeDuration_ThroughPipeline(t *testing.T) { + alert := Alert{ + Rule: "training_failed", + Metric: "run.exit", + Value: 1, + Labels: map[string]string{"runner": "mlflow"}, + Floats: map[string]float64{"duration_seconds": 1843}, + FiredAt: time.Now(), + } + + got := renderMessage("Run failed after {{ .duration_seconds | humanize_duration }}", alert) + + want := "Run failed after 30m43s" + if got != want { + t.Errorf("renderMessage humanize_duration: got %q, want %q", got, want) + } +} From 2a862bd5e36fc4a892466b2f02ccc3f54ca8e772 Mon Sep 17 00:00:00 2001 From: zuchka Date: Fri, 8 May 2026 18:54:50 -0700 Subject: [PATCH 3/6] docs(recipes): use humanize_duration for duration_seconds Sweeps the {{ .duration_seconds }}s pattern across all 9 platform recipes, the canonical _template.md, and ding.yaml.example. Renders human-readable durations (30m43s, 4m7.3s) instead of raw float seconds (1843, 247.3) in the example alert messages. Verified end-to-end via ding test-rule with a synthetic run.exit event. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ding.yaml.example | 2 +- docs/recipes/_template.md | 2 +- docs/recipes/argo-workflows.md | 2 +- docs/recipes/buildkite.md | 2 +- docs/recipes/gitlab-ci.md | 2 +- docs/recipes/jenkins.md | 2 +- docs/recipes/kubernetes-jobs.md | 4 ++-- docs/recipes/mlflow.md | 2 +- docs/recipes/modal.md | 2 +- docs/recipes/ray.md | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/ding.yaml.example b/ding.yaml.example index b1be8a7..691c007 100644 --- a/ding.yaml.example +++ b/ding.yaml.example @@ -143,7 +143,7 @@ rules: # match: # metric: run.exit # condition: value > 0 - # message: "job failed: exit code {{ .value }} after {{ .duration_seconds }}s" + # message: "job failed: exit code {{ .value }} after {{ .duration_seconds | humanize_duration }}" # alert: # - notifier: github_actions diff --git a/docs/recipes/_template.md b/docs/recipes/_template.md index 01ebcf4..20aa0cc 100644 --- a/docs/recipes/_template.md +++ b/docs/recipes/_template.md @@ -32,7 +32,7 @@ rules: match: metric: run.exit condition: value > 0 - message: "Job failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)" + message: "Job failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})" alert: - notifier: slack ``` diff --git a/docs/recipes/argo-workflows.md b/docs/recipes/argo-workflows.md index 22d90bb..cb23ea9 100644 --- a/docs/recipes/argo-workflows.md +++ b/docs/recipes/argo-workflows.md @@ -49,7 +49,7 @@ data: match: metric: run.exit condition: value > 0 - message: "Argo step {{ .pod }} (workflow {{ .workflow }}) failed with exit {{ .exit_code }} after {{ .duration_seconds }}s" + message: "Argo step {{ .pod }} (workflow {{ .workflow }}) failed with exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }}" alert: - notifier: slack --- diff --git a/docs/recipes/buildkite.md b/docs/recipes/buildkite.md index 05902bf..1009bc0 100644 --- a/docs/recipes/buildkite.md +++ b/docs/recipes/buildkite.md @@ 
-33,7 +33,7 @@ rules: match: metric: run.exit condition: value > 0 - message: "{{ .repo }}@{{ .branch }} failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)" + message: "{{ .repo }}@{{ .branch }} failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})" alert: - notifier: slack ``` diff --git a/docs/recipes/gitlab-ci.md b/docs/recipes/gitlab-ci.md index 7a6704b..2693ca9 100644 --- a/docs/recipes/gitlab-ci.md +++ b/docs/recipes/gitlab-ci.md @@ -35,7 +35,7 @@ rules: match: metric: run.exit condition: value > 0 - message: "Pipeline {{ .branch }} failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)" + message: "Pipeline {{ .branch }} failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})" alert: - notifier: slack ``` diff --git a/docs/recipes/jenkins.md b/docs/recipes/jenkins.md index f1d906a..19e3496 100644 --- a/docs/recipes/jenkins.md +++ b/docs/recipes/jenkins.md @@ -47,7 +47,7 @@ rules: match: metric: run.exit condition: value > 0 - message: "{{ .job }} build {{ .build }} failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)" + message: "{{ .job }} build {{ .build }} failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})" alert: - notifier: slack ``` diff --git a/docs/recipes/kubernetes-jobs.md b/docs/recipes/kubernetes-jobs.md index 6898509..741a616 100644 --- a/docs/recipes/kubernetes-jobs.md +++ b/docs/recipes/kubernetes-jobs.md @@ -58,7 +58,7 @@ data: match: metric: run.exit condition: value > 0 - message: "{{ .pod }} (Job {{ .job_name }}) failed with exit {{ .exit_code }} after {{ .duration_seconds }}s" + message: "{{ .pod }} (Job {{ .job_name }}) failed with exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }}" alert: - notifier: slack --- @@ -225,7 +225,7 @@ rules: - name: job_failed match: { metric: run.exit } condition: value > 0 - message: "Job failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)" + message: "Job failed (exit 
{{ .exit_code }} after {{ .duration_seconds | humanize_duration }})" alert: - notifier: k8s ``` diff --git a/docs/recipes/mlflow.md b/docs/recipes/mlflow.md index 98e767b..07903d2 100644 --- a/docs/recipes/mlflow.md +++ b/docs/recipes/mlflow.md @@ -49,7 +49,7 @@ rules: match: { metric: run.exit } condition: value > 0 message: | - MLflow run failed (exit {{ .exit_code }} after {{ .duration_seconds }}s) + MLflow run failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }}) <{{ .tracking_uri }}/#/experiments/{{ .experiment_id }}/runs/{{ .run_id }}|View run in MLflow UI> alert: - notifier: slack diff --git a/docs/recipes/modal.md b/docs/recipes/modal.md index acb9781..5c53990 100644 --- a/docs/recipes/modal.md +++ b/docs/recipes/modal.md @@ -67,7 +67,7 @@ rules: - name: training_failed match: { metric: run.exit } condition: value > 0 - message: "Modal function {{ .function_name }} (task {{ .modal_task_id }}) failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)" + message: "Modal function {{ .function_name }} (task {{ .modal_task_id }}) failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})" alert: - notifier: slack ``` diff --git a/docs/recipes/ray.md b/docs/recipes/ray.md index f1d445f..0736546 100644 --- a/docs/recipes/ray.md +++ b/docs/recipes/ray.md @@ -39,7 +39,7 @@ rules: - name: training_failed match: { metric: run.exit } condition: value > 0 - message: "Ray job {{ .run_id }} failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)" + message: "Ray job {{ .run_id }} failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})" alert: - notifier: slack ``` From 0cfccb39ea29c62fd7c5c04ff095eda7201171ad Mon Sep 17 00:00:00 2001 From: zuchka Date: Fri, 8 May 2026 18:58:27 -0700 Subject: [PATCH 4/6] docs(recipes): update rendered-output examples to match humanize_duration The "What you get" prose example in modal.md and ray.md showed raw seconds (287s, 1843s) but the YAML now produces 
humanized output via humanize_duration. Update prose to match: 287s -> 4m47s, 1843s -> 30m43s. mlflow.md:90 was already consistent (42 humanizes to 42s, no change). Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/recipes/modal.md | 2 +- docs/recipes/ray.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/recipes/modal.md b/docs/recipes/modal.md index 5c53990..d4a722c 100644 --- a/docs/recipes/modal.md +++ b/docs/recipes/modal.md @@ -91,7 +91,7 @@ A Slack message during training when `val_loss` exceeds threshold: …and on function exit: > 🔔 `training_failed` -> Modal function trainer (task ta-abc123def) failed (exit 1 after 287s) +> Modal function trainer (task ta-abc123def) failed (exit 1 after 4m47s) The `modal_task_id` matches the task ID visible in the Modal dashboard, so the Slack alert is one click away from the function's logs and metrics. diff --git a/docs/recipes/ray.md b/docs/recipes/ray.md index 0736546..c01d296 100644 --- a/docs/recipes/ray.md +++ b/docs/recipes/ray.md @@ -112,7 +112,7 @@ A Slack message during training when `val_loss` exceeds threshold: …and on training-process exit: > 🔔 `training_failed` -> Ray job raysubmit_abcdef1234567890 failed (exit 1 after 1843s) +> Ray job raysubmit_abcdef1234567890 failed (exit 1 after 30m43s) All Path A alerts are auto-tagged with `run_id` + `runner=ray`. The `run_id` matches the UUID printed by `ray job list`. From d37dfbdee64fd911dcde5f4bb1769626ced66dee Mon Sep 17 00:00:00 2001 From: zuchka Date: Fri, 8 May 2026 18:59:22 -0700 Subject: [PATCH 5/6] docs(configuration): document template helpers section Adds a new ### Template helpers subsection covering humanize_duration and default, with input/output tables matching the implementation contract and a note on the deliberate divergence from sprig's default semantics for 0/false values. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/configuration.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/docs/configuration.md b/docs/configuration.md index bb0a8c6..af2f0cd 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -362,6 +362,43 @@ wall-clock windows in long-running serve deployments. | `.sum` | windowed | Sum over window | | `.count` | windowed | Event count over window | +### Template helpers + +Two helper functions are available in message templates beyond Go's default `text/template` syntax: + +#### `humanize_duration` + +Renders a numeric seconds value as a human-readable duration string using Go's native `time.Duration.String()` format. Useful for the `{{ .duration_seconds }}` field on `run.exit` events. + +```yaml +message: "Job failed after {{ .duration_seconds | humanize_duration }}" +``` + +| Input (seconds) | Rendered | +|----------------:|----------| +| `0` | `0s` | +| `0.5` | `500ms` | +| `7` | `7s` | +| `247.3` | `4m7.3s` | +| `1843` | `30m43s` | +| `7245` | `2h0m45s` | + +Accepts `int`, `int32`, `int64`, `uint`, `uint32`, `uint64`, `float32`, and `float64` values, interpreted as seconds. Any other input (including narrower integer types such as `int8` or `uint16`) passes through unchanged via `fmt.Sprint`, so a typo or a missing field renders something visibly wrong rather than crashing the template. + +#### `default` + +Returns a fallback when the piped value is `nil` (typically a missing field) or the empty string. Numeric `0` and boolean `false` pass through unchanged — they are real values, not absences. This is intentionally narrower than sprig's `default` to avoid the `{{ .exit_code | default 0 }}` footgun. + +```yaml +message: "Build on {{ .branch | default \"unknown\" }} failed" +``` + +| `.branch` value | Rendered | +|---|---| +| `"main"` | `Build on main failed` | +| `""` | `Build on unknown failed` | +| missing | `Build on unknown failed` | + ### Per-label-set cooldowns Cooldowns are tracked independently per unique label combination. 
A noisy `web-01` does not suppress alerts from `web-02`. From 7af63c9c707800b4cfea937ed4a4a0aceacad96d Mon Sep 17 00:00:00 2001 From: zuchka Date: Fri, 8 May 2026 19:07:03 -0700 Subject: [PATCH 6/6] docs(readme): use humanize_duration in run.exit example The README's run.exit example still showed the raw {{ .duration_seconds }}s form. Final-review caught the inconsistency: the recipe sweep updated all 9 platform recipes + _template.md + ding.yaml.example, but README.md contains the canonical run.exit example a new user reads first. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8cd3a36..bc98c91 100644 --- a/README.md +++ b/README.md @@ -167,7 +167,7 @@ Match it like any other metric: - name: nonzero_exit match: { metric: run.exit } condition: value > 0 - message: "job failed with exit code {{ .value }} after {{ .duration_seconds }}s" + message: "job failed with exit code {{ .value }} after {{ .duration_seconds | humanize_duration }}" alert: [{ notifier: github_actions }] ```