ding-labs · zuchka · May 9, 2026 · May 9, 2026 · May 9, 2026 · May 9, 2026
diff --git a/README.md b/README.md
@@ -167,7 +167,7 @@ Match it like any other metric:
 - name: nonzero_exit
   match: { metric: run.exit }
   condition: value > 0
-  message: "job failed with exit code {{ .value }} after {{ .duration_seconds }}s"
+  message: "job failed with exit code {{ .value }} after {{ .duration_seconds | humanize_duration }}"
   alert: [{ notifier: github_actions }]
 ```
 

diff --git a/ding.yaml.example b/ding.yaml.example
@@ -143,7 +143,7 @@ rules:
   #   match:
   #     metric: run.exit
   #   condition: value > 0
-  #   message: "job failed: exit code {{ .value }} after {{ .duration_seconds }}s"
+  #   message: "job failed: exit code {{ .value }} after {{ .duration_seconds | humanize_duration }}"
   #   alert:
   #     - notifier: github_actions
 

diff --git a/docs/configuration.md b/docs/configuration.md
@@ -362,6 +362,43 @@ wall-clock windows in long-running serve deployments.
 | `.sum` | windowed | Sum over window |
 | `.count` | windowed | Event count over window |
 
+### Template helpers
+
+Two helper functions are available in message templates beyond Go's default `text/template` syntax:
+
+#### `humanize_duration`
+
+Renders a numeric seconds value as a human-readable duration string using Go's native `time.Duration.String()` format. Useful for the `{{ .duration_seconds }}` field on `run.exit` events.
+
+```yaml
+message: "Job failed after {{ .duration_seconds | humanize_duration }}"
+```
+
+| Input (seconds) | Rendered |
+|----------------:|----------|
+| `0` | `0s` |
+| `0.5` | `500ms` |
+| `7` | `7s` |
+| `247.3` | `4m7.3s` |
+| `1843` | `30m43s` |
+| `7245` | `2h0m45s` |
+
+Accepts any numeric type (int, int64, float64, etc.) interpreted as seconds. Non-numeric inputs pass through unchanged via `fmt.Sprint`, so a typo or a missing field renders something visibly wrong rather than crashing the template.
+
+#### `default`
+
+Returns a fallback when the piped value is `nil` (typically a missing field) or the empty string. Numeric `0` and boolean `false` pass through unchanged — they are real values, not absences. This is intentionally narrower than sprig's `default`, which also replaces `0` and `false`: under sprig semantics, `{{ .exit_code | default "unknown" }}` would replace a genuine numeric exit code of `0` with `unknown`.
+
+```yaml
+message: "Build on {{ .branch | default \"unknown\" }} failed"
+```
+
+| `.branch` value | Rendered |
+|---|---|
+| `"main"` | `Build on main failed` |
+| `""` | `Build on unknown failed` |
+| missing | `Build on unknown failed` |
+
 ### Per-label-set cooldowns
 
 Cooldowns are tracked independently per unique label combination. A noisy `web-01` does not suppress alerts from `web-02`.

diff --git a/docs/recipes/_template.md b/docs/recipes/_template.md
@@ -32,7 +32,7 @@ rules:
     match:
       metric: run.exit
     condition: value > 0
-    message: "Job failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)"
+    message: "Job failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})"
     alert:
       - notifier: slack
 ```

diff --git a/docs/recipes/argo-workflows.md b/docs/recipes/argo-workflows.md
@@ -49,7 +49,7 @@ data:
         match:
           metric: run.exit
         condition: value > 0
-        message: "Argo step {{ .pod }} (workflow {{ .workflow }}) failed with exit {{ .exit_code }} after {{ .duration_seconds }}s"
+        message: "Argo step {{ .pod }} (workflow {{ .workflow }}) failed with exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }}"
         alert:
           - notifier: slack
 ---

diff --git a/docs/recipes/buildkite.md b/docs/recipes/buildkite.md
@@ -33,7 +33,7 @@ rules:
     match:
       metric: run.exit
     condition: value > 0
-    message: "{{ .repo }}@{{ .branch }} failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)"
+    message: "{{ .repo }}@{{ .branch }} failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})"
     alert:
       - notifier: slack
 ```

diff --git a/docs/recipes/gitlab-ci.md b/docs/recipes/gitlab-ci.md
@@ -35,7 +35,7 @@ rules:
     match:
       metric: run.exit
     condition: value > 0
-    message: "Pipeline {{ .branch }} failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)"
+    message: "Pipeline {{ .branch }} failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})"
     alert:
       - notifier: slack
 ```

diff --git a/docs/recipes/jenkins.md b/docs/recipes/jenkins.md
@@ -47,7 +47,7 @@ rules:
     match:
       metric: run.exit
     condition: value > 0
-    message: "{{ .job }} build {{ .build }} failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)"
+    message: "{{ .job }} build {{ .build }} failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})"
     alert:
       - notifier: slack
 ```

diff --git a/docs/recipes/kubernetes-jobs.md b/docs/recipes/kubernetes-jobs.md
@@ -58,7 +58,7 @@ data:
         match:
           metric: run.exit
         condition: value > 0
-        message: "{{ .pod }} (Job {{ .job_name }}) failed with exit {{ .exit_code }} after {{ .duration_seconds }}s"
+        message: "{{ .pod }} (Job {{ .job_name }}) failed with exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }}"
         alert:
           - notifier: slack
 ---
@@ -225,7 +225,7 @@ rules:
   - name: job_failed
     match: { metric: run.exit }
     condition: value > 0
-    message: "Job failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)"
+    message: "Job failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})"
     alert:
       - notifier: k8s
 ```

diff --git a/docs/recipes/mlflow.md b/docs/recipes/mlflow.md
@@ -49,7 +49,7 @@ rules:
     match: { metric: run.exit }
     condition: value > 0
     message: |
-      MLflow run failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)
+      MLflow run failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})
       <{{ .tracking_uri }}/#/experiments/{{ .experiment_id }}/runs/{{ .run_id }}|View run in MLflow UI>
     alert:
       - notifier: slack

diff --git a/docs/recipes/modal.md b/docs/recipes/modal.md
@@ -67,7 +67,7 @@ rules:
   - name: training_failed
     match: { metric: run.exit }
     condition: value > 0
-    message: "Modal function {{ .function_name }} (task {{ .modal_task_id }}) failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)"
+    message: "Modal function {{ .function_name }} (task {{ .modal_task_id }}) failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})"
     alert:
       - notifier: slack
 ```
@@ -91,7 +91,7 @@ A Slack message during training when `val_loss` exceeds threshold:
 …and on function exit:
 
 > 🔔 `training_failed`
-> Modal function trainer (task ta-abc123def) failed (exit 1 after 287s)
+> Modal function trainer (task ta-abc123def) failed (exit 1 after 4m47s)
 
 The `modal_task_id` matches the task ID visible in the Modal dashboard, so the Slack alert is one click away from the function's logs and metrics.
 

diff --git a/docs/recipes/ray.md b/docs/recipes/ray.md
@@ -39,7 +39,7 @@ rules:
   - name: training_failed
     match: { metric: run.exit }
     condition: value > 0
-    message: "Ray job {{ .run_id }} failed (exit {{ .exit_code }} after {{ .duration_seconds }}s)"
+    message: "Ray job {{ .run_id }} failed (exit {{ .exit_code }} after {{ .duration_seconds | humanize_duration }})"
     alert:
       - notifier: slack
 ```
@@ -112,7 +112,7 @@ A Slack message during training when `val_loss` exceeds threshold:
 …and on training-process exit:
 
 > 🔔 `training_failed`
-> Ray job raysubmit_abcdef1234567890 failed (exit 1 after 1843s)
+> Ray job raysubmit_abcdef1234567890 failed (exit 1 after 30m43s)
 
 All Path A alerts are auto-tagged with `run_id` + `runner=ray`. The `run_id` matches the UUID printed by `ray job list`.
 

diff --git a/internal/evaluator/engine.go b/internal/evaluator/engine.go
@@ -457,7 +457,13 @@ func renderMessage(tmpl string, alert Alert) string {
 	if tmpl == "" {
 		return fmt.Sprintf("rule %q fired (metric=%s value=%v)", alert.Rule, alert.Metric, alert.Value)
 	}
-	t, err := template.New("msg").Parse(tmpl)
+	t, err := template.New("msg").
+		Option("missingkey=zero").
+		Funcs(template.FuncMap{
+			"humanize_duration": humanizeDuration,
+			"default":           defaultValue,
+		}).
+		Parse(tmpl)
 	if err != nil {
 		return tmpl // return raw if template is invalid
 	}

diff --git a/internal/evaluator/template_funcs.go b/internal/evaluator/template_funcs.go
@@ -0,0 +1,57 @@
+package evaluator
+
+import (
+	"fmt"
+	"math"
+	"time"
+)
+
+// humanizeDuration renders a numeric seconds value as a human-readable
+// duration string in Go's native time.Duration.String() format
+// (e.g., 30m43s, 4m7.3s, 500ms).
+//
+// Every built-in integer and floating-point type is accepted and
+// interpreted as seconds. The value is rounded to the nearest nanosecond
+// rather than truncated, so binary floating-point error cannot surface as
+// a duration that is one nanosecond short.
+//
+// Anything that cannot be represented as a time.Duration falls back to
+// fmt.Sprint so templates don't crash on bad data: non-numeric inputs,
+// NaN, ±Inf, and magnitudes beyond the int64 nanosecond range (roughly
+// ±292 years). Users see something obviously wrong (e.g. "<nil>",
+// "[1 2]", "NaN") and can fix the template.
+//
+// Used as a text/template FuncMap entry. Pipe form in templates:
+//
+//	{{ .duration_seconds | humanize_duration }}
+func humanizeDuration(seconds interface{}) string {
+	var f float64
+	switch v := seconds.(type) {
+	case float64:
+		f = v
+	case float32:
+		f = float64(v)
+	case int:
+		f = float64(v)
+	case int8:
+		f = float64(v)
+	case int16:
+		f = float64(v)
+	case int32:
+		f = float64(v)
+	case int64:
+		f = float64(v)
+	case uint:
+		f = float64(v)
+	case uint8:
+		f = float64(v)
+	case uint16:
+		f = float64(v)
+	case uint32:
+		f = float64(v)
+	case uint64:
+		f = float64(v)
+	default:
+		return fmt.Sprint(seconds)
+	}
+
+	// Round before converting: a plain float-to-integer conversion
+	// truncates toward zero.
+	ns := math.Round(f * float64(time.Second))
+
+	// Converting a NaN or out-of-range float to an integer type yields an
+	// implementation-dependent value in Go, so reject those inputs instead
+	// of printing an arbitrary duration. math.MaxInt64 rounds up to 2^63
+	// as a float64, hence >= on the upper bound.
+	if math.IsNaN(ns) || ns < math.MinInt64 || ns >= math.MaxInt64 {
+		return fmt.Sprint(seconds)
+	}
+	return time.Duration(ns).String()
+}
+
+// defaultValue substitutes fallback for an absent value. A value counts as
+// absent only when it is nil or the empty string; every other value,
+// including numeric 0 and boolean false, is returned untouched. That makes
+// it deliberately stricter than sprig's default, which would also replace
+// 0 and false.
+//
+// Used as a text/template FuncMap entry. In a pipeline the piped value
+// arrives as the last argument:
+//
+//	{{ .branch | default "unknown" }}
+func defaultValue(fallback interface{}, value interface{}) interface{} {
+	text, isString := value.(string)
+	if value == nil || (isString && text == "") {
+		return fallback
+	}
+	return value
+}
diff --git a/internal/evaluator/template_funcs_test.go b/internal/evaluator/template_funcs_test.go
@@ -0,0 +1,112 @@
+package evaluator
+
+import (
+	"testing"
+	"time"
+)
+
+// TestHumanizeDuration verifies the duration humanization helper renders
+// numeric seconds values via Go's native time.Duration.String() format
+// (e.g. 30m43s, 4m7.3s, 500ms), and that non-numeric input passes through
+// via fmt.Sprint so templates don't crash on bad data.
+func TestHumanizeDuration(t *testing.T) {
+	tests := []struct {
+		name string
+		in   interface{}
+		want string
+	}{
+		{"zero", 0, "0s"},
+		{"subsecond_float", 0.5, "500ms"},
+		{"int_seconds", 7, "7s"},
+		{"float_minutes_with_decimal", 247.3, "4m7.3s"},
+		{"int_half_hour_plus", 1843, "30m43s"},
+		{"int_two_hours_with_zero_minutes", 7245, "2h0m45s"},
+		{"negative_seconds", -30, "-30s"},
+		{"explicit_float64", float64(60), "1m0s"},
+		{"explicit_int64", int64(60), "1m0s"},
+		{"explicit_uint", uint(120), "2m0s"},
+		{"non_numeric_string_passthrough", "hello", "hello"},
+		{"nil_passthrough", nil, "<nil>"},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := humanizeDuration(tt.in)
+			if got != tt.want {
+				t.Errorf("humanizeDuration(%v) = %q, want %q", tt.in, got, tt.want)
+			}
+		})
+	}
+}
+
+// TestDefault verifies the default helper returns the fallback only when
+// the value is nil or the empty string. We intentionally do NOT replicate
+// sprig's "0/false/empty-collection falls back" behavior — those values
+// are real and would create footguns for {{ .exit_code | default 0 }}-
+// style rules.
+func TestDefault(t *testing.T) {
+	tests := []struct {
+		name     string
+		fallback interface{}
+		value    interface{}
+		want     interface{}
+	}{
+		{"nil_value_uses_fallback", "main", nil, "main"},
+		{"empty_string_uses_fallback", "main", "", "main"},
+		{"non_empty_string_passes_through", "main", "dev", "dev"},
+		{"zero_int_passes_through", 99, 0, 0},
+		{"false_passes_through", true, false, false},
+		{"string_zero_passes_through", "fallback", "0", "0"},
+		{"string_false_passes_through", "fallback", "false", "false"},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := defaultValue(tt.fallback, tt.value)
+			if got != tt.want {
+				t.Errorf("defaultValue(%v, %v) = %v, want %v", tt.fallback, tt.value, got, tt.want)
+			}
+		})
+	}
+}
+
+// TestRenderMessage_DefaultHelper_FillsMissingField proves the default
+// helper works through the full renderMessage pipeline for a field that
+// is not in Labels, Floats, or the alert struct. After missingkey=zero
+// is wired, the missing field surfaces as nil in the template scope and
+// default returns the fallback.
+func TestRenderMessage_DefaultHelper_FillsMissingField(t *testing.T) {
+	alert := Alert{
+		Rule:    "build_failed",
+		Metric:  "run.exit",
+		Value:   1,
+		Labels:  map[string]string{"runner": "github-actions"},
+		FiredAt: time.Now(),
+	}
+
+	got := renderMessage(`Build on {{ .branch | default "unknown" }} failed`, alert)
+
+	want := "Build on unknown failed"
+	if got != want {
+		t.Errorf("renderMessage default helper: got %q, want %q", got, want)
+	}
+}
+
+// TestRenderMessage_HumanizeDuration_ThroughPipeline proves humanize_duration
+// works through the full renderMessage pipeline against the synthetic
+// run.exit event shape (duration_seconds in Floats).
+func TestRenderMessage_HumanizeDuration_ThroughPipeline(t *testing.T) {
+	alert := Alert{
+		Rule:    "training_failed",
+		Metric:  "run.exit",
+		Value:   1,
+		Labels:  map[string]string{"runner": "mlflow"},
+		Floats:  map[string]float64{"duration_seconds": 1843},
+		FiredAt: time.Now(),
+	}
+
+	got := renderMessage("Run failed after {{ .duration_seconds | humanize_duration }}", alert)
+
+	want := "Run failed after 30m43s"
+	if got != want {
+		t.Errorf("renderMessage humanize_duration: got %q, want %q", got, want)
+	}
+}