diff --git a/cli/src/commands/autopilot/mod.rs b/cli/src/commands/autopilot/mod.rs index ba7bf0934..61ea148c4 100644 --- a/cli/src/commands/autopilot/mod.rs +++ b/cli/src/commands/autopilot/mod.rs @@ -1648,6 +1648,9 @@ impl stakpak_gateway::dispatcher::RunOverrideResolver for ProfileRunOverrideReso .map(stakpak_gateway::client::AutoApproveOverride::AllowList), system_prompt: resolved.system_prompt, max_turns: resolved.max_turns, + context_window: resolved.context_window, + context_budget_threshold: resolved.context_budget_threshold, + keep_last_n_assistant_messages: resolved.keep_last_n_assistant_messages, }; if overrides.is_empty() { diff --git a/cli/src/commands/watch/commands/run.rs b/cli/src/commands/watch/commands/run.rs index f40ef77f8..0cd9810b1 100644 --- a/cli/src/commands/watch/commands/run.rs +++ b/cli/src/commands/watch/commands/run.rs @@ -901,6 +901,9 @@ fn resolve_schedule_profile_overrides( auto_approve: normalized_auto_approve, system_prompt: resolved.system_prompt, max_turns: resolved.max_turns, + context_window: resolved.context_window, + context_budget_threshold: resolved.context_budget_threshold, + keep_last_n_assistant_messages: resolved.keep_last_n_assistant_messages, }; let overrides = if overrides.is_empty() { diff --git a/cli/src/config/app.rs b/cli/src/config/app.rs index 798188a5a..49a9ea65c 100644 --- a/cli/src/config/app.rs +++ b/cli/src/config/app.rs @@ -55,6 +55,12 @@ pub struct AppConfig { pub system_prompt: Option, /// Optional max turn override for sessions using this profile. pub max_turns: Option, + /// Optional context window override for sessions using this profile. + pub context_window: Option, + /// Optional context budget threshold for sessions using this profile. + pub context_budget_threshold: Option, + /// Optional keep-last-N-assistant-messages for sessions using this profile. 
+ pub keep_last_n_assistant_messages: Option, /// Unique ID for anonymous telemetry pub anonymous_id: Option, /// Whether to collect telemetry data @@ -170,6 +176,9 @@ impl AppConfig { subagent: profile_config.subagent, system_prompt: profile_config.system_prompt, max_turns: profile_config.max_turns, + context_window: profile_config.context_window, + context_budget_threshold: profile_config.context_budget_threshold, + keep_last_n_assistant_messages: profile_config.keep_last_n_assistant_messages, anonymous_id: settings.anonymous_id, collect_telemetry: settings.collect_telemetry, editor: settings.editor, diff --git a/cli/src/config/profile.rs b/cli/src/config/profile.rs index 64ee536f1..c15123291 100644 --- a/cli/src/config/profile.rs +++ b/cli/src/config/profile.rs @@ -99,6 +99,23 @@ pub struct ProfileConfig { #[serde(skip_serializing_if = "Option::is_none")] pub max_turns: Option, + /// Override the model's context window size (in tokens). + /// When set, this value replaces the model's default context window + /// for budget and trimming calculations. + #[serde(skip_serializing_if = "Option::is_none")] + pub context_window: Option, + + /// Fraction of the context window at which context trimming triggers. + /// Range: 0.1–1.0 (e.g. 0.8 = start trimming at 80% of context window). + /// Default: 0.8 (80%). + #[serde(skip_serializing_if = "Option::is_none")] + pub context_budget_threshold: Option, + + /// Number of recent assistant messages to keep untrimmed during context + /// trimming. Default: 5. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub keep_last_n_assistant_messages: Option, + // ========================================================================= // Legacy model fields - kept for backward compatibility during migration // These are read but deprecated (will migrate to 'model' field) @@ -140,6 +157,9 @@ impl ProfileConfig { recent_models: default.recent_models.clone(), system_prompt: default.system_prompt.clone(), max_turns: default.max_turns, + context_window: default.context_window, + context_budget_threshold: default.context_budget_threshold, + keep_last_n_assistant_messages: default.keep_last_n_assistant_messages, // Enable warden for readonly sandboxed execution warden: Some(WardenConfig::readonly_profile()), // Don't copy allowed_tools/auto_approve - readonly has its own restrictions @@ -437,6 +457,15 @@ impl ProfileConfig { max_turns: self .max_turns .or_else(|| other.and_then(|config| config.max_turns)), + context_window: self + .context_window + .or_else(|| other.and_then(|config| config.context_window)), + context_budget_threshold: self + .context_budget_threshold + .or_else(|| other.and_then(|config| config.context_budget_threshold)), + keep_last_n_assistant_messages: self + .keep_last_n_assistant_messages + .or_else(|| other.and_then(|config| config.keep_last_n_assistant_messages)), // Legacy fields - kept for reading only, not merged eco_model: None, smart_model: None, diff --git a/cli/src/config/profile_resolver.rs b/cli/src/config/profile_resolver.rs index 1ddb9ce79..af0bdc06d 100644 --- a/cli/src/config/profile_resolver.rs +++ b/cli/src/config/profile_resolver.rs @@ -4,13 +4,16 @@ use stakpak_shared::utils::normalize_optional_string; use super::AppConfig; -#[derive(Debug, Clone, Default, PartialEq, Eq)] +#[derive(Debug, Clone, Default, PartialEq)] pub(crate) struct ResolvedProfileOverrides { pub model: Option, pub auto_approve: Option>, pub allowed_tools: Option>, pub system_prompt: Option, pub max_turns: Option, + pub 
context_window: Option, + pub context_budget_threshold: Option, + pub keep_last_n_assistant_messages: Option, } pub(crate) fn resolve_profile_run_overrides( @@ -24,12 +27,18 @@ pub(crate) fn resolve_profile_run_overrides( let allowed_tools = normalize_tool_list(config.allowed_tools); let system_prompt = normalize_optional_string(config.system_prompt); let max_turns = config.max_turns; + let context_window = config.context_window; + let context_budget_threshold = config.context_budget_threshold; + let keep_last_n_assistant_messages = config.keep_last_n_assistant_messages; if model.is_none() && auto_approve.is_none() && allowed_tools.is_none() && system_prompt.is_none() && max_turns.is_none() + && context_window.is_none() + && context_budget_threshold.is_none() + && keep_last_n_assistant_messages.is_none() { return None; } @@ -40,6 +49,9 @@ pub(crate) fn resolve_profile_run_overrides( allowed_tools, system_prompt, max_turns, + context_window, + context_budget_threshold, + keep_last_n_assistant_messages, }) } diff --git a/libs/api/src/client/mod.rs b/libs/api/src/client/mod.rs index 8683ea732..d2e5b01ed 100644 --- a/libs/api/src/client/mod.rs +++ b/libs/api/src/client/mod.rs @@ -61,6 +61,17 @@ pub struct AgentClientConfig { pub store_path: Option, /// Hook registry for lifecycle events pub hook_registry: Option>, + /// How many recent assistant messages to keep untrimmed when context + /// trimming is triggered (default: 5). + pub keep_last_n_assistant_messages: Option, + /// Fraction of the context window at which trimming triggers + /// (e.g. 0.8 = 80%, default: 0.8). + pub context_budget_threshold: Option, + /// Override the model's context window size (in tokens). + /// When set, replaces the model's built-in `context_window` for budget + /// calculations. Useful for local/custom models where the window may + /// not be auto-detected correctly. 
+ pub context_window: Option, } impl AgentClientConfig { @@ -94,6 +105,24 @@ impl AgentClientConfig { self.hook_registry = Some(registry); self } + + /// Set context trimming: number of recent assistant messages to preserve + pub fn with_keep_last_n_assistant_messages(mut self, n: usize) -> Self { + self.keep_last_n_assistant_messages = Some(n); + self + } + + /// Set context trimming: budget threshold (0.0–1.0) + pub fn with_context_budget_threshold(mut self, threshold: f32) -> Self { + self.context_budget_threshold = Some(threshold); + self + } + + /// Set context window override (in tokens) + pub fn with_context_window(mut self, window: u64) -> Self { + self.context_window = Some(window); + self + } } // ============================================================================= @@ -221,8 +250,9 @@ impl AgentClient { hook_registry.register( LifecycleEvent::BeforeInference, Box::new(TaskBoardContextHook::new(TaskBoardContextHookOptions { - keep_last_n_assistant_messages: Some(5), // Keep the last 5 assistant messages in context - context_budget_threshold: Some(0.8), // defaults to 0.8 (80%) + keep_last_n_assistant_messages: config.keep_last_n_assistant_messages.or(Some(5)), + context_budget_threshold: config.context_budget_threshold.or(Some(0.8)), + context_window: config.context_window, })), ); let hook_registry = Arc::new(hook_registry); diff --git a/libs/api/src/local/context_managers/task_board_context_manager.rs b/libs/api/src/local/context_managers/task_board_context_manager.rs index 80a9b96ef..2c7a0154f 100644 --- a/libs/api/src/local/context_managers/task_board_context_manager.rs +++ b/libs/api/src/local/context_managers/task_board_context_manager.rs @@ -8,6 +8,7 @@ use stakpak_shared::models::{ pub struct TaskBoardContextManager { keep_last_n_assistant_messages: usize, context_budget_threshold: f32, + context_window: Option, } impl super::ContextManager for TaskBoardContextManager { @@ -466,6 +467,7 @@ mod tests { 
TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 2, // Only keep last 2 assistant messages untrimmed context_budget_threshold: 0.8, + context_window: None, }) } @@ -1306,6 +1308,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 2, context_budget_threshold: 0.8, + context_window: None, }); // Build: user, assistant, user, assistant, user, user, assistant @@ -1410,6 +1413,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 2, context_budget_threshold: 0.8, + context_window: None, }); // Build 10 turns of user/assistant @@ -1514,6 +1518,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 1, context_budget_threshold: 0.8, + context_window: None, }); // Realistic agent flow: @@ -1618,6 +1623,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 10, context_budget_threshold: 0.8, + context_window: None, }); // Only 3 assistant messages but keep_last_n = 10 @@ -1682,6 +1688,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 0, context_budget_threshold: 0.8, + context_window: None, }); let messages = vec![ @@ -1735,6 +1742,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 1, context_budget_threshold: 0.8, + context_window: None, }); // Use large assistant messages and small user messages so that trimming @@ -1812,6 +1820,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 2, context_budget_threshold: 0.8, + context_window: None, }); // 10 turns → 20 messages, small window → establishes a trim index @@ -1848,6 +1857,7 @@ mod tests { let cm_generous = 
TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 8, context_budget_threshold: 0.8, + context_window: None, }); let (_, metadata2) = cm_generous.reduce_context_with_budget(messages, 100, metadata1, None); @@ -1870,6 +1880,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 2, context_budget_threshold: 0.8, + context_window: None, }); // Build a conversation that's just under threshold without tools @@ -1974,6 +1985,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 3, context_budget_threshold: 0.8, + context_window: None, }); // 5 turns of: user → assistant(tool_call) → tool(result) → assistant(follow-up) @@ -2080,6 +2092,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 50, context_budget_threshold: 0.8, + context_window: None, }); // Simulate a 200k context window model (like Claude) @@ -2362,6 +2375,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 50, context_budget_threshold: 0.3, + context_window: None, }); // Simulate a session with 10 turns (20 messages) — well under 50 @@ -2435,6 +2449,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 3, context_budget_threshold: 0.8, + context_window: None, }); // 6 turns: user + assistant with large content. @@ -2519,6 +2534,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 20, context_budget_threshold: 0.8, + context_window: None, }); // Simulate a session with 30 turns of tool-heavy interaction. 
@@ -2651,6 +2667,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 2, context_budget_threshold: 0.8, + context_window: None, }); // Build conversation that exceeds threshold @@ -2742,6 +2759,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 10, context_budget_threshold: 0.8, + context_window: None, }); let messages = vec![ @@ -2790,6 +2808,7 @@ mod tests { let cm = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: 50, // high keep_last_n, like production context_budget_threshold: 0.3, + context_window: None, }); // 5 turns with large assistant responses @@ -2882,6 +2901,11 @@ pub struct TaskBoardContextManagerOptions { pub keep_last_n_assistant_messages: usize, /// Fraction of context window at which trimming triggers (e.g., 0.8 = 80%) pub context_budget_threshold: f32, + /// Override the model's context window size (in tokens). + /// When set, replaces the model's built-in `context_window` for budget + /// calculations. Useful for local/custom models where the window may + /// not be auto-detected correctly. + pub context_window: Option, } impl TaskBoardContextManager { @@ -2889,6 +2913,7 @@ impl TaskBoardContextManager { Self { keep_last_n_assistant_messages: options.keep_last_n_assistant_messages, context_budget_threshold: options.context_budget_threshold, + context_window: options.context_window, } } } diff --git a/libs/api/src/local/hooks/task_board_context/mod.rs b/libs/api/src/local/hooks/task_board_context/mod.rs index 797a4dae4..05c3e44d7 100644 --- a/libs/api/src/local/hooks/task_board_context/mod.rs +++ b/libs/api/src/local/hooks/task_board_context/mod.rs @@ -21,6 +21,11 @@ pub struct TaskBoardContextHookOptions { pub keep_last_n_assistant_messages: Option, /// Fraction of the context window at which trimming triggers (e.g. 0.8 = 80%). 
pub context_budget_threshold: Option, + /// Override the model's context window size (in tokens). + /// When set, replaces the model's built-in `context_window` for budget + /// calculations. Useful for local/custom models where the window may + /// not be auto-detected correctly. + pub context_window: Option, } impl TaskBoardContextHook { @@ -28,6 +33,7 @@ impl TaskBoardContextHook { let context_manager = TaskBoardContextManager::new(TaskBoardContextManagerOptions { keep_last_n_assistant_messages: options.keep_last_n_assistant_messages.unwrap_or(50), context_budget_threshold: options.context_budget_threshold.unwrap_or(0.8), + context_window: options.context_window, }); Self { context_manager } @@ -42,9 +48,16 @@ define_hook!( return Ok(HookAction::Continue); } - let model = ctx.state.active_model.clone(); + let mut model = ctx.state.active_model.clone(); let max_output_tokens: u64 = 16000; + // Apply context_window override if configured, so the budget-aware + // context trimming uses the user-specified window instead of the + // model's built-in limit. + if let Some(override_window) = self.context_manager.context_window { + model.limit.context = override_window; + } + // Subtract fixed overhead from context window so the trimmer budgets // only the space actually available for chat messages. 
// - System prompt: added after trimming (line 67+), not in message list diff --git a/libs/gateway/src/dispatcher.rs b/libs/gateway/src/dispatcher.rs index 5d3546899..0b1f2a8bf 100644 --- a/libs/gateway/src/dispatcher.rs +++ b/libs/gateway/src/dispatcher.rs @@ -3198,6 +3198,7 @@ mod tests { auto_approve: Some(AutoApproveOverride::AllowList(vec!["view".to_string()])), system_prompt: Some("ops prompt".to_string()), max_turns: Some(16), + ..RunOverrides::default() }, )]), }; diff --git a/libs/server/src/openapi.rs b/libs/server/src/openapi.rs index 6fab0c99e..815f09148 100644 --- a/libs/server/src/openapi.rs +++ b/libs/server/src/openapi.rs @@ -206,6 +206,12 @@ pub struct RunOverridesDoc { pub auto_approve: Option, pub system_prompt: Option, pub max_turns: Option, + /// Override the model's context window size in tokens (1000–2000000) + pub context_window: Option, + /// Fraction of the context window at which trimming triggers (0.1–1.0) + pub context_budget_threshold: Option, + /// Number of recent assistant messages to keep untrimmed during trimming + pub keep_last_n_assistant_messages: Option, } #[derive(Debug, Serialize, Deserialize, ToSchema)] diff --git a/libs/server/src/routes.rs b/libs/server/src/routes.rs index 329c2b7fe..63877fe72 100644 --- a/libs/server/src/routes.rs +++ b/libs/server/src/routes.rs @@ -250,6 +250,12 @@ const DEFAULT_MAX_TURNS: usize = 64; const MIN_MAX_TURNS: usize = 1; const MAX_MAX_TURNS: usize = 256; const MAX_SYSTEM_PROMPT_CHARS: usize = 32 * 1024; +const DEFAULT_CONTEXT_BUDGET_THRESHOLD: f32 = 0.8; +const DEFAULT_KEEP_LAST_N_ASSISTANT_MESSAGES: usize = 5; +const MIN_CONTEXT_BUDGET_THRESHOLD: f32 = 0.1; +const MAX_CONTEXT_BUDGET_THRESHOLD: f32 = 1.0; +const MIN_CONTEXT_WINDOW: u64 = 1_000; +const MAX_CONTEXT_WINDOW: u64 = 2_000_000; pub fn router(state: AppState, auth: AuthConfig) -> Router { public_router() @@ -574,12 +580,23 @@ async fn sessions_message_handler( // bypasses validate_session_message_request. 
.clamp(MIN_MAX_TURNS, MAX_MAX_TURNS); + let context_window_override = overrides.and_then(|value| value.context_window); + let context_budget_threshold = overrides + .and_then(|value| value.context_budget_threshold) + .unwrap_or(DEFAULT_CONTEXT_BUDGET_THRESHOLD); + let keep_last_n_assistant_messages = overrides + .and_then(|value| value.keep_last_n_assistant_messages) + .unwrap_or(DEFAULT_KEEP_LAST_N_ASSISTANT_MESSAGES); + let run_config = RunConfig { model, inference: state.inference.clone(), tool_approval_policy, system_prompt: system_prompt_override, max_turns, + context_window: context_window_override, + context_budget_threshold: Some(context_budget_threshold), + keep_last_n_assistant_messages: Some(keep_last_n_assistant_messages), }; let caller_context = map_caller_context_inputs(request.context.as_deref()); @@ -1098,6 +1115,30 @@ fn validate_session_message_request(request: &SessionMessageRequest) -> Option, pub max_turns: usize, + /// Override the model's context window size (in tokens). + /// When set, replaces the model's default `limit.context` for budget calculations. + pub context_window: Option, + /// Fraction of the context window at which context trimming triggers (0.0–1.0). + /// Default when not set: 0.8. + pub context_budget_threshold: Option, + /// Number of most recent assistant messages to keep untrimmed during trimming. + /// Default when not set: 5. 
+ pub keep_last_n_assistant_messages: Option, } impl std::fmt::Debug for RunConfig { @@ -42,6 +51,12 @@ impl std::fmt::Debug for RunConfig { .field("tool_approval_policy", &self.tool_approval_policy) .field("system_prompt", &self.system_prompt) .field("max_turns", &self.max_turns) + .field("context_window", &self.context_window) + .field("context_budget_threshold", &self.context_budget_threshold) + .field( + "keep_last_n_assistant_messages", + &self.keep_last_n_assistant_messages, + ) .field("inference", &"") .finish() } diff --git a/libs/shared/src/models/overrides.rs b/libs/shared/src/models/overrides.rs index 44ea6cd0c..1050ce833 100644 --- a/libs/shared/src/models/overrides.rs +++ b/libs/shared/src/models/overrides.rs @@ -10,7 +10,7 @@ pub enum AutoApproveOverride { } /// Per-request run overrides merged with runtime defaults. -#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)] pub struct RunOverrides { #[serde(default, skip_serializing_if = "Option::is_none")] pub model: Option, @@ -20,6 +20,22 @@ pub struct RunOverrides { pub system_prompt: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub max_turns: Option, + /// Override the model's context window size (in tokens). + /// When set, this value replaces the model's default context window + /// for budget and trimming calculations. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub context_window: Option, + /// Fraction of the context window at which context trimming triggers. + /// Range: 0.1–1.0 (e.g. 0.8 = start trimming at 80% of context window). + /// Default when not set: 0.8. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub context_budget_threshold: Option, + /// Number of most recent assistant messages to keep untrimmed when + /// context trimming is triggered. 
Only assistant and tool messages are + /// trimmed; user and system messages are always preserved in full. + /// Default when not set: 5. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub keep_last_n_assistant_messages: Option, } impl RunOverrides { @@ -28,6 +44,9 @@ impl RunOverrides { && self.auto_approve.is_none() && self.system_prompt.is_none() && self.max_turns.is_none() + && self.context_window.is_none() + && self.context_budget_threshold.is_none() + && self.keep_last_n_assistant_messages.is_none() } } @@ -63,10 +82,28 @@ mod tests { ])), system_prompt: Some("hello".to_string()), max_turns: Some(24), + context_window: Some(200_000), + context_budget_threshold: Some(0.7), + keep_last_n_assistant_messages: Some(10), }; let encoded = serde_json::to_string(&overrides).expect("serialize overrides"); let decoded: RunOverrides = serde_json::from_str(&encoded).expect("deserialize overrides"); assert_eq!(decoded, overrides); } + + #[test] + fn run_overrides_partial_context_fields() { + // Verify PartialEq works correctly when only some fields are set + let a = RunOverrides { + context_window: Some(100_000), + ..RunOverrides::default() + }; + let b = RunOverrides { + context_window: Some(100_000), + ..RunOverrides::default() + }; + assert_eq!(a, b); + assert!(!a.is_empty()); + } }