agentcore/errors.go at main · voocel/agentcore · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
package agentcore

import (
	"context"
	"errors"
	"fmt"
	"sort"
	"strings"
	"time"
)

// Errors produced by agentcore fall into two layers:
//
//   1. Agent layer — loop control, agent state machine, context management.
//      Surfaced as sentinel errors (Err*) or typed errors (*XxxError).
//      Match with errors.Is(err, ErrXxx), or with errors.As(err, &target)
//      where target is declared as `var target *XxxError`.
//
//   2. Model layer — provider errors. The kernel does not import any LLM SDK;
//      model adapters classify their SDK's errors at the boundary by mapping
//      them onto this package's sentinels (ErrContextOverflow, ErrProvider*)
//      via errors.Is and the RetryableError / RetryHinter interfaces. Adapters
//      Unwrap to the original SDK error, so callers that DO know the SDK can
//      still match it with errors.As.
//
// User cancellation surfaces as context.Canceled, not a custom sentinel —
// use errors.Is(err, context.Canceled) for abort detection.

// Sentinel errors. Match them with errors.Is rather than ==: four of them are
// also reported through typed errors declared in this file, whose Is methods
// equate the typed error with its sentinel:
//
//   - ErrMaxTurns is matched by *MaxTurnsError.
//   - ErrStreamPartial is matched by *PartialStreamError.
//   - ErrContextOverflow is matched by *ContextOverflowError.
//   - ErrToolValidation is matched by *ToolValidationError.
var (
	ErrMaxTurns         = errors.New("max turns reached")
	ErrNoModel          = errors.New("no model configured")
	ErrNoMessages       = errors.New("cannot continue: no messages in context")
	ErrAlreadyRunning   = errors.New("agent is already running")
	ErrBadContinuation  = errors.New("cannot continue from this message role without queued messages")
	ErrStopGuard        = errors.New("stop guard escalated: run terminated")
	ErrContextOverflow  = errors.New("context window overflow")
	ErrStreamPartial    = errors.New("stream closed without done event")
	ErrToolValidation   = errors.New("tool argument validation failed")
	ErrInjectNilMessage = errors.New("inject message is nil")
)

// Provider runtime sentinels. These categorize errors returned by the model
// adapter at call time (provider API errors, network failures, server
// responses). Use ClassifyProvider to derive the most specific sentinel from
// an error chain, or match directly with errors.Is.
//
// ErrorKind renders them as the labels "rate_limit", "timeout", "stream_idle",
// "network" and "auth". IsFailoverEligible treats every one except
// ErrProviderAuth as a transient condition.
var (
	ErrProviderRateLimit  = errors.New("provider rate limit")
	ErrProviderTimeout    = errors.New("provider timeout")
	ErrProviderStreamIdle = errors.New("provider stream idle")
	ErrProviderNetwork    = errors.New("provider network")
	ErrProviderAuth       = errors.New("provider auth")
)

// RetryableError, when implemented by an error in the chain, tells the loop
// whether re-issuing the identical request may succeed. Model adapters
// implement it so the kernel decides same-provider retries without importing
// any LLM SDK. Errors that do not implement it are treated as non-retryable
// (the loop still has its own message-pattern classification as a fallback).
//
// Within this package the interface is consulted through isRetryable, which
// locates an implementation in the chain with errors.As.
type RetryableError interface {
	// Retryable reports whether re-issuing the identical request may succeed.
	Retryable() bool
}

// RetryHinter, when implemented by an error in the chain, supplies a
// provider-specified backoff hint (e.g. a Retry-After header). The loop honors
// it for the next retry delay, capped at its own maximum. A zero duration
// means "no hint, use backoff".
//
// Within this package the interface is consulted through retryAfterHint,
// which locates an implementation in the chain with errors.As.
type RetryHinter interface {
	// RetryAfter returns the suggested delay before the next attempt, or 0
	// when the provider gave no hint.
	RetryAfter() time.Duration
}

// isRetryable reports whether err should be retried according to its own
// chain: it looks up a RetryableError with errors.As and returns that error's
// Retryable verdict. A chain that carries no RetryableError yields false.
func isRetryable(err error) bool {
	var marker RetryableError
	if !errors.As(err, &marker) {
		return false
	}
	return marker.Retryable()
}

// retryAfterHint returns the backoff hint advertised by a RetryHinter found in
// err's chain via errors.As. The result is 0 when the chain carries no
// RetryHinter.
func retryAfterHint(err error) time.Duration {
	var hinter RetryHinter
	if !errors.As(err, &hinter) {
		return 0
	}
	return hinter.RetryAfter()
}

// MaxTurnsError is the typed form of ErrMaxTurns. It records the configured
// turn limit, and errors.Is(err, ErrMaxTurns) succeeds for it.
type MaxTurnsError struct {
	Limit int // turn limit reported in the message
}

// Error renders the limit, e.g. "max turns (8) reached".
func (e *MaxTurnsError) Error() string {
	return fmt.Sprintf("max turns (%d) reached", e.Limit)
}

// Is makes the error match the ErrMaxTurns sentinel under errors.Is.
func (e *MaxTurnsError) Is(target error) bool {
	return ErrMaxTurns == target
}

// PartialStreamError reports a stream that ended without a terminal done
// event. errors.Is(err, ErrStreamPartial) succeeds for it.
//
// Partial holds whatever content arrived before truncation. It is meant for
// diagnostics only: callers MUST NOT persist it as a completed message,
// because the stream did not finish cleanly (missing StopReason, possibly
// truncated tool_call args, unclosed thinking blocks).
type PartialStreamError struct {
	Partial Message // content received before the stream was cut off
}

// Error returns the fixed message for a truncated stream.
func (e *PartialStreamError) Error() string {
	return "stream closed without done event"
}

// Is makes the error match the ErrStreamPartial sentinel under errors.Is.
func (e *PartialStreamError) Is(target error) bool {
	return ErrStreamPartial == target
}

// ContextOverflowError is the typed form of ErrContextOverflow. It wraps the
// underlying cause (typically a provider error): errors.Is matches
// ErrContextOverflow, and Unwrap exposes the raw cause so callers can extract
// provider-specific details if needed.
type ContextOverflowError struct {
	Cause error // underlying error; may be nil
}

// Error returns "context window overflow", followed by ": " and the cause's
// message when a cause is present.
func (e *ContextOverflowError) Error() string {
	const prefix = "context window overflow"
	if cause := e.Cause; cause != nil {
		return prefix + ": " + cause.Error()
	}
	return prefix
}

// Unwrap returns the wrapped cause, which may be nil.
func (e *ContextOverflowError) Unwrap() error {
	return e.Cause
}

// Is makes the error match the ErrContextOverflow sentinel under errors.Is.
func (e *ContextOverflowError) Is(target error) bool {
	return ErrContextOverflow == target
}

// ToolValidationError is the typed form of ErrToolValidation, returned when
// tool call arguments fail schema validation. The agent loop surfaces it as a
// tool_result with IsError=true rather than as a fatal loop error, so the
// model can self-correct on the next turn.
type ToolValidationError struct {
	ToolName string            // tool named in the rendered message
	Issues   []ValidationIssue // individual mismatches to render
}

// Error renders the tool name and issues through formatValidationIssues.
func (e *ToolValidationError) Error() string {
	name, issues := e.ToolName, e.Issues
	return formatValidationIssues(name, issues)
}

// Is makes the error match the ErrToolValidation sentinel under errors.Is.
func (e *ToolValidationError) Is(target error) bool {
	return ErrToolValidation == target
}

// ValidationIssue describes a single schema mismatch from tool arg validation.
// A ToolValidationError carries a slice of these and renders them through
// formatValidationIssues.
type ValidationIssue struct {
	Kind     string // IssueMissing or IssueType; other kinds are not rendered
	Path     string // parameter identifier quoted in the rendered message
	Expected string // for IssueType only
	Received string // for IssueType only
	Hint     string // optional fix hint, appended to the rendered message
}

// Values for ValidationIssue.Kind. IssueMissing is rendered as "The required
// parameter ... is missing"; IssueType is rendered as an expected-versus-
// provided type mismatch.
const (
	IssueMissing = "missing"
	IssueType    = "type"
)

// IsContextOverflow reports whether err signals a "request too big" condition,
// regardless of the layer it came from. The agentcore wrapper
// (*ContextOverflowError) and adapter-classified provider errors both map onto
// ErrContextOverflow, so one errors.Is check covers them all.
func IsContextOverflow(err error) bool {
	overflow := errors.Is(err, ErrContextOverflow)
	return overflow
}

// ErrorKind maps err onto a stable, log-friendly label. The possible labels
// are "canceled", "stop_guard", "max_turns", "context_overflow",
// "stream_partial", "tool_validation", "stream_idle", "rate_limit", "timeout",
// "auth" and "network". A nil err yields "" and an error that matches nothing
// yields "unknown".
//
// Labels are part of the public API contract — they will not change between
// minor versions, so harnesses can key alert routing and log filters on them
// instead of matching error strings.
func ErrorKind(err error) string {
	if err == nil {
		return ""
	}

	// Agent-layer conditions are tested first; the first match in this order
	// decides the label.
	if errors.Is(err, context.Canceled) {
		return "canceled"
	}
	if errors.Is(err, ErrStopGuard) {
		return "stop_guard"
	}
	if errors.Is(err, ErrMaxTurns) {
		return "max_turns"
	}
	if IsContextOverflow(err) {
		return "context_overflow"
	}
	if errors.Is(err, ErrStreamPartial) {
		return "stream_partial"
	}
	if errors.Is(err, ErrToolValidation) {
		return "tool_validation"
	}

	// Otherwise fall back to the provider classification and translate the
	// sentinel it picked into its label.
	providerLabels := [...]struct {
		sentinel error
		label    string
	}{
		{ErrProviderStreamIdle, "stream_idle"},
		{ErrProviderRateLimit, "rate_limit"},
		{ErrProviderTimeout, "timeout"},
		{ErrProviderAuth, "auth"},
		{ErrProviderNetwork, "network"},
	}
	if picked := classifyProviderSentinel(err); picked != nil {
		for _, entry := range providerLabels {
			if picked == entry.sentinel {
				return entry.label
			}
		}
	}
	return "unknown"
}

// streamIdleMsgPattern is the lower-case marker looked for in the rendered
// message of a stream-idle abort.
const streamIdleMsgPattern = "stream idle timeout"

// IsStreamIdleMessage reports whether s carries the stream idle-timeout
// marker, compared case-insensitively. It exists for situations where only
// the error string survives (sub-agent JSON results, structured event
// payloads that flatten the chain).
func IsStreamIdleMessage(s string) bool {
	lowered := strings.ToLower(s)
	return strings.Contains(lowered, streamIdleMsgPattern)
}

// ClassifyProvider reduces an LLM/provider error to the most specific
// ErrProvider* sentinel that applies. A nil err yields nil. When no
// classification applies, err itself is returned unchanged so callers can wrap
// it with their own context.
//
// Stream-idle is checked before generic timeout: it is a stuck connection that
// failover can typically rescue, whereas a generic timeout may just be a slow
// model. Both error-chain matching and message pattern matching are
// supported, because sub-agent JSON results flatten the original error to a
// plain string.
//
// Context overflow is intentionally never returned here — use
// IsContextOverflow, which covers both the agentcore wrapper and
// adapter-classified errors.
func ClassifyProvider(err error) error {
	if err == nil {
		return nil
	}
	sentinel := classifyProviderSentinel(err)
	if sentinel == nil {
		return err
	}
	return sentinel
}

// classifyProviderSentinel maps a non-nil err onto one of the ErrProvider*
// sentinels, or returns nil when nothing applies.
//
// Classification runs in two passes:
//
//  1. Error chain. Any provider sentinel found with errors.Is wins, as does
//     context.DeadlineExceeded (reported as ErrProviderTimeout). This is the
//     adapter's explicit classification, so it takes precedence over whatever
//     the message happens to say, and it works for sentinels whose own text
//     matches none of the patterns below (ErrProviderAuth, ErrProviderNetwork).
//  2. Rendered message. Lower-cased substring matching, for errors that were
//     flattened to a plain string before reaching this function.
func classifyProviderSentinel(err error) error {
	// Pass 1: explicit classification carried by the chain. Stream-idle and
	// the context deadline come first; the remaining sentinels follow in the
	// same order as the message patterns below.
	switch {
	case errors.Is(err, ErrProviderStreamIdle):
		return ErrProviderStreamIdle
	case errors.Is(err, context.DeadlineExceeded):
		return ErrProviderTimeout
	case errors.Is(err, ErrProviderRateLimit):
		return ErrProviderRateLimit
	case errors.Is(err, ErrProviderTimeout):
		return ErrProviderTimeout
	case errors.Is(err, ErrProviderAuth):
		return ErrProviderAuth
	case errors.Is(err, ErrProviderNetwork):
		return ErrProviderNetwork
	}

	// Pass 2: fall back to the rendered message.
	msg := strings.ToLower(err.Error())
	switch {
	case strings.Contains(msg, streamIdleMsgPattern):
		return ErrProviderStreamIdle
	case containsAny(msg, "rate limit", "too many requests", "429"):
		return ErrProviderRateLimit
	case strings.Contains(msg, "tls handshake timeout"):
		// Must precede the generic timeout patterns: this message contains
		// "timeout", so it would otherwise never be classified as a network
		// failure.
		return ErrProviderNetwork
	case containsAny(msg, "deadline exceeded", "timeout", "timed out"):
		return ErrProviderTimeout
	case containsAny(msg, "invalid api key", "incorrect api key", "unauthorized", "authentication failed", "forbidden", "401", "403"):
		return ErrProviderAuth
	case containsAny(msg, "connection refused", "connection reset", "no such host", "dial tcp", "server misbehaving", "broken pipe", "eof"):
		return ErrProviderNetwork
	}
	return nil
}

// IsFailoverEligible reports whether err is a transient provider error that
// cross-provider failover may rescue. The eligible set is rate_limit, timeout,
// network and stream_idle, as determined by ClassifyProvider.
//
// A nil err and user cancellation (context.Canceled) are rejected up front.
// Errors whose classification is outside the eligible set — auth failures,
// context overflow, unclassified errors — yield false.
func IsFailoverEligible(err error) bool {
	if err == nil {
		return false
	}
	if errors.Is(err, context.Canceled) {
		return false
	}

	classified := ClassifyProvider(err)
	transient := [...]error{
		ErrProviderRateLimit,
		ErrProviderTimeout,
		ErrProviderNetwork,
		ErrProviderStreamIdle,
	}
	for _, sentinel := range transient {
		if errors.Is(classified, sentinel) {
			return true
		}
	}
	return false
}

// FailoverReason returns a stable short label ("rate_limit" / "timeout" /
// "stream_idle" / "network") suitable for structured logging. Returns "" when
// err is not failover-eligible.
//
// The label is derived from ClassifyProvider. Stream-idle is tested first so
// that it is reported in preference to the other labels.
func FailoverReason(err error) string {
	// Apply the same exclusions as IsFailoverEligible. Without the
	// cancellation check, a user-cancelled request whose message matches a
	// provider pattern (e.g. "dial tcp ...: operation was canceled") would be
	// labelled "network" even though it is not failover-eligible.
	if err == nil || errors.Is(err, context.Canceled) {
		return ""
	}
	classified := ClassifyProvider(err)
	switch {
	case errors.Is(classified, ErrProviderStreamIdle):
		return "stream_idle"
	case errors.Is(classified, ErrProviderRateLimit):
		return "rate_limit"
	case errors.Is(classified, ErrProviderTimeout):
		return "timeout"
	case errors.Is(classified, ErrProviderNetwork):
		return "network"
	}
	return ""
}

// containsAny reports whether msg contains at least one of patterns as a
// substring. With no patterns the result is false.
func containsAny(msg string, patterns ...string) bool {
	for i := range patterns {
		if !strings.Contains(msg, patterns[i]) {
			continue
		}
		return true
	}
	return false
}

// formatValidationIssues renders issues as a single multi-line block: a header
// naming toolName, followed by one line per issue.
//
// Missing params come first (most fundamental error), then type mismatches;
// within each group, paths sort alphabetically for stable output. Issues whose
// Kind is neither IssueMissing nor IssueType are not rendered. A non-empty
// Hint is appended to its issue's line. The caller's slice is never modified.
func formatValidationIssues(toolName string, issues []ValidationIssue) string {
	// Keep only the kinds that can be rendered, copying them into a fresh
	// slice so Error() never mutates the caller's Issues. Filtering must
	// happen BEFORE sorting: with an unrecognised kind in the slice the
	// comparator below is not a consistent ordering, and the skipped entry
	// could leave the rendered issues out of alphabetical order.
	known := make([]ValidationIssue, 0, len(issues))
	for _, it := range issues {
		if it.Kind == IssueMissing || it.Kind == IssueType {
			known = append(known, it)
		}
	}
	sort.SliceStable(known, func(i, j int) bool {
		if known[i].Kind != known[j].Kind {
			return known[i].Kind == IssueMissing
		}
		return known[i].Path < known[j].Path
	})

	lines := make([]string, 0, len(known))
	for _, it := range known {
		var line string
		switch it.Kind {
		case IssueMissing:
			line = fmt.Sprintf("The required parameter `%s` is missing", it.Path)
		case IssueType:
			line = fmt.Sprintf(
				"The parameter `%s` type is expected as `%s` but provided as `%s`",
				it.Path, it.Expected, it.Received,
			)
		}
		if it.Hint != "" {
			line += ". " + it.Hint
		}
		lines = append(lines, line)
	}

	// Pluralise on the number of rendered lines, not on len(issues).
	noun := "issue"
	if len(lines) > 1 {
		noun = "issues"
	}
	header := fmt.Sprintf("InputValidationError: %s failed due to the following %s:", toolName, noun)
	return header + "\n" + strings.Join(lines, "\n")
}