diff --git a/README.md b/README.md index 8f6b55b..6be4388 100644 --- a/README.md +++ b/README.md @@ -142,7 +142,7 @@ async fn main() -> Result<(), CanoError> { .add_exit_state(FlowState::Complete); // 5. Run. - let result = workflow.orchestrate(FlowState::Start).await?; + let result = workflow.orchestrate(FlowState::Start, CancellationToken::disabled()).await?; println!("Workflow finished: {:?}", result); Ok(()) diff --git a/cano-e2e/src/bin/cano_workflow_app.rs b/cano-e2e/src/bin/cano_workflow_app.rs index 260aad7..3b609ea 100644 --- a/cano-e2e/src/bin/cano_workflow_app.rs +++ b/cano-e2e/src/bin/cano_workflow_app.rs @@ -13,6 +13,7 @@ use std::str::FromStr; use std::sync::Arc; +use cano::CancellationToken; use cano_e2e::{Faults, Phase, PostgresCheckpointStore, StdoutTracer, build_workflow}; #[tokio::main] @@ -62,8 +63,16 @@ async fn main() -> anyhow::Result<()> { emit(&format!("READY {workflow_id} {mode}")); let result = match mode.as_str() { - "resume" => workflow.resume_from(workflow_id.clone()).await, - "run" => workflow.orchestrate(Phase::Reserve).await, + "resume" => { + workflow + .resume_from(workflow_id.clone(), CancellationToken::disabled()) + .await + } + "run" => { + workflow + .orchestrate(Phase::Reserve, CancellationToken::disabled()) + .await + } other => anyhow::bail!("unknown mode {other:?}"), }; match result { diff --git a/cano-macros/tests/batch_task_impl.rs b/cano-macros/tests/batch_task_impl.rs index ff59645..9aec98e 100644 --- a/cano-macros/tests/batch_task_impl.rs +++ b/cano-macros/tests/batch_task_impl.rs @@ -105,7 +105,10 @@ async fn inherent_inferred_integrates_with_workflow() { .register(Step::Process, InherentInferred) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Process).await.unwrap(); + let result = workflow + .orchestrate(Step::Process, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } @@ -197,7 +200,10 @@ async fn inherent_with_key_integrates_with_workflow() { 
.register(Step::Process, InherentWithKey) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Process).await.unwrap(); + let result = workflow + .orchestrate(Step::Process, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } @@ -394,7 +400,10 @@ async fn trait_form_integrates_with_workflow() { .register(Step::Process, TraitBatch) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Process).await.unwrap(); + let result = workflow + .orchestrate(Step::Process, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } @@ -534,7 +543,10 @@ async fn end_to_end_workflow_load_process_finish() { .register(Step::Process, LoadStep) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Process).await.unwrap(); + let result = workflow + .orchestrate(Step::Process, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); let output: Vec = store.get("output").unwrap(); diff --git a/cano-macros/tests/compensatable_task_impl.rs b/cano-macros/tests/compensatable_task_impl.rs index 6f8bde2..3172527 100644 --- a/cano-macros/tests/compensatable_task_impl.rs +++ b/cano-macros/tests/compensatable_task_impl.rs @@ -72,7 +72,10 @@ async fn inherent_compensatable_impl_registers_and_compensates() { .register(Step::Boom, Boom) .add_exit_state(Step::Done); - let err = workflow.orchestrate(Step::Reserve).await.unwrap_err(); + let err = workflow + .orchestrate(Step::Reserve, CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.message(), "boom"); // clean rollback -> the original failure is surfaced assert!( compensated.load(Ordering::SeqCst), @@ -85,7 +88,10 @@ async fn inherent_compensatable_impl_registers_and_compensates() { .register_with_compensation(Step::Reserve, ReserveNamed) .add_exit_state(Step::Done); assert_eq!( - workflow.orchestrate(Step::Reserve).await.unwrap(), + workflow + .orchestrate(Step::Reserve, 
CancellationToken::disabled()) + .await + .unwrap(), Step::Done ); } diff --git a/cano-macros/tests/poll_task_impl.rs b/cano-macros/tests/poll_task_impl.rs index 7b467da..b27622e 100644 --- a/cano-macros/tests/poll_task_impl.rs +++ b/cano-macros/tests/poll_task_impl.rs @@ -219,7 +219,10 @@ async fn inherent_poller_integrates_with_workflow() { .register(Step::Poll, InherentPoller) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Poll).await.unwrap(); + let result = workflow + .orchestrate(Step::Poll, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } @@ -229,7 +232,10 @@ async fn trait_poller_integrates_with_workflow() { .register(Step::Poll, TraitPoller) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Poll).await.unwrap(); + let result = workflow + .orchestrate(Step::Poll, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } diff --git a/cano-macros/tests/router_task_impl.rs b/cano-macros/tests/router_task_impl.rs index 30ae1ff..00b0f17 100644 --- a/cano-macros/tests/router_task_impl.rs +++ b/cano-macros/tests/router_task_impl.rs @@ -192,7 +192,10 @@ async fn inherent_router_integrates_with_workflow() { .register(Step::PathA, PathATask) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Route).await.unwrap(); + let result = workflow + .orchestrate(Step::Route, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } @@ -203,6 +206,9 @@ async fn trait_router_integrates_with_workflow() { .register(Step::PathA, PathATask) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Route).await.unwrap(); + let result = workflow + .orchestrate(Step::Route, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } diff --git a/cano-macros/tests/stepped_task_impl.rs b/cano-macros/tests/stepped_task_impl.rs index 3f55a28..f26c827 100644 --- 
a/cano-macros/tests/stepped_task_impl.rs +++ b/cano-macros/tests/stepped_task_impl.rs @@ -380,7 +380,10 @@ async fn stepped_task_in_workflow() { .register(MyState::Work, stepper) .add_exit_state(MyState::Done); - let result = workflow.orchestrate(MyState::Work).await.unwrap(); + let result = workflow + .orchestrate(MyState::Work, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, MyState::Done); } diff --git a/cano-macros/tests/task_impl_name.rs b/cano-macros/tests/task_impl_name.rs index 5a2c747..cc3bf3c 100644 --- a/cano-macros/tests/task_impl_name.rs +++ b/cano-macros/tests/task_impl_name.rs @@ -55,5 +55,11 @@ async fn task_still_runs_in_a_workflow() { let workflow = Workflow::bare() .register(Step::Start, NamedInherentTask) .add_exit_state(Step::Done); - assert_eq!(workflow.orchestrate(Step::Start).await.unwrap(), Step::Done); + assert_eq!( + workflow + .orchestrate(Step::Start, CancellationToken::disabled()) + .await + .unwrap(), + Step::Done + ); } diff --git a/cano-macros/tests/timer_task_impl.rs b/cano-macros/tests/timer_task_impl.rs index 5ef14f2..d663f77 100644 --- a/cano-macros/tests/timer_task_impl.rs +++ b/cano-macros/tests/timer_task_impl.rs @@ -212,7 +212,10 @@ async fn inherent_timer_integrates_with_workflow() { .register(Step::Wait, TraitTimer) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Wait).await.unwrap(); + let result = workflow + .orchestrate(Step::Wait, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } diff --git a/cano/Cargo.toml b/cano/Cargo.toml index 3ffae3b..13691c3 100644 --- a/cano/Cargo.toml +++ b/cano/Cargo.toml @@ -101,6 +101,11 @@ name = "scheduler_graceful_shutdown" path = "examples/scheduler_graceful_shutdown.rs" required-features = ["scheduler"] +[[example]] +name = "scheduler_cancellation" +path = "examples/scheduler_cancellation.rs" +required-features = ["scheduler"] + [[example]] name = "scheduler_mixed_workflows" path = 
"examples/scheduler_mixed_workflows.rs" @@ -201,6 +206,10 @@ path = "examples/saga_payment.rs" name = "workflow_total_timeout" path = "examples/workflow_total_timeout.rs" +[[example]] +name = "workflow_cancellation" +path = "examples/workflow_cancellation.rs" + [[example]] name = "router_task" path = "examples/router_task.rs" diff --git a/cano/benches/workflow_performance.rs b/cano/benches/workflow_performance.rs index ee4cbd5..f85e700 100644 --- a/cano/benches/workflow_performance.rs +++ b/cano/benches/workflow_performance.rs @@ -577,7 +577,9 @@ fn bench_orchestrate_overhead(c: &mut Criterion) { b.to_async(&runtime).iter(|| { let workflow = Arc::clone(&workflow); async move { - let _ = workflow.orchestrate(S::Done).await; + let _ = workflow + .orchestrate(S::Done, CancellationToken::disabled()) + .await; } }); }); @@ -617,7 +619,9 @@ fn bench_large_split_collect(c: &mut Criterion) { .add_exit_state(S::Done); b.to_async(&runtime).iter(|| async { - let _ = workflow.orchestrate(S::Start).await; + let _ = workflow + .orchestrate(S::Start, CancellationToken::disabled()) + .await; }); }); } @@ -659,7 +663,9 @@ fn bench_tracing_overhead(c: &mut Criterion) { b.to_async(&runtime).iter(|| { let workflow = Arc::clone(&workflow); async move { - let _ = workflow.orchestrate(S::Done).await; + let _ = workflow + .orchestrate(S::Done, CancellationToken::disabled()) + .await; } }); }); diff --git a/cano/examples/ai_workflow_yes_and.rs b/cano/examples/ai_workflow_yes_and.rs index 96ba928..3849df7 100644 --- a/cano/examples/ai_workflow_yes_and.rs +++ b/cano/examples/ai_workflow_yes_and.rs @@ -318,7 +318,9 @@ async fn main() -> Result<(), CanoError> { println!("Starting improvised story...\n"); - let final_state = workflow.orchestrate(ConversationState::Start).await?; + let final_state = workflow + .orchestrate(ConversationState::Start, CancellationToken::disabled()) + .await?; println!("\nStory completed with state: {final_state:?}"); diff --git a/cano/examples/batch_task.rs 
b/cano/examples/batch_task.rs index a93230b..6726e78 100644 --- a/cano/examples/batch_task.rs +++ b/cano/examples/batch_task.rs @@ -240,7 +240,9 @@ async fn main() -> CanoResult<()> { .register(Step::Summarise, Summarise { url_count }) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::ParseUrls).await?; + let result = workflow + .orchestrate(Step::ParseUrls, CancellationToken::disabled()) + .await?; assert_eq!(result, Step::Done); println!("\ncompleted at {result:?}"); diff --git a/cano/examples/circuit_breaker.rs b/cano/examples/circuit_breaker.rs index 5c7ad63..80283bb 100644 --- a/cano/examples/circuit_breaker.rs +++ b/cano/examples/circuit_breaker.rs @@ -98,7 +98,9 @@ async fn main() -> Result<(), CanoError> { println!("Phase 1 — dependency unhealthy, threshold = 3."); for attempt in 1..=5 { - let outcome = workflow.orchestrate(Step::Call).await; + let outcome = workflow + .orchestrate(Step::Call, CancellationToken::disabled()) + .await; // `orchestrate` wraps task failures in `WithStateContext`; unwrap one layer // before pattern-matching on the underlying variant. 
let label = match outcome { @@ -125,7 +127,10 @@ async fn main() -> Result<(), CanoError> { println!("Phase 3 — half-open trial probes the dependency, then closes the breaker."); for attempt in 1..=3 { - match workflow.orchestrate(Step::Call).await { + match workflow + .orchestrate(Step::Call, CancellationToken::disabled()) + .await + { Ok(_) => println!( " recovery call {attempt}: ok | state={:?}", breaker.state() diff --git a/cano/examples/circuit_breaker_manual.rs b/cano/examples/circuit_breaker_manual.rs index e0539b3..902713b 100644 --- a/cano/examples/circuit_breaker_manual.rs +++ b/cano/examples/circuit_breaker_manual.rs @@ -151,7 +151,9 @@ async fn main() -> Result<(), CanoError> { // ------------------------------------------------------------------ println!("Phase 1: dependency unhealthy (threshold = 3 consecutive failures)"); for attempt in 1..=5 { - let outcome = workflow.orchestrate(Step::Call).await; + let outcome = workflow + .orchestrate(Step::Call, CancellationToken::disabled()) + .await; // `orchestrate` wraps task failures in `WithStateContext`; unwrap one layer // before pattern-matching on the underlying variant. 
let label = match &outcome { @@ -185,7 +187,10 @@ async fn main() -> Result<(), CanoError> { // ------------------------------------------------------------------ println!("\nPhase 3: half-open trial — one probe closes the breaker"); for attempt in 1..=3 { - match workflow.orchestrate(Step::Call).await { + match workflow + .orchestrate(Step::Call, CancellationToken::disabled()) + .await + { Ok(_) => println!(" call {attempt}: ok | breaker={:?}", breaker.state()), Err(e) => println!(" call {attempt}: err: {e} | breaker={:?}", breaker.state()), } diff --git a/cano/examples/custom_checkpoint_store.rs b/cano/examples/custom_checkpoint_store.rs index 7ff97dc..49f0e74 100644 --- a/cano/examples/custom_checkpoint_store.rs +++ b/cano/examples/custom_checkpoint_store.rs @@ -173,7 +173,10 @@ async fn main() -> Result<(), Box> { // --- First run: Process crashes; the checkpoint log is kept. ------- println!("=== run 1: Process will crash ==="); - match workflow.orchestrate(Step::Init).await { + match workflow + .orchestrate(Step::Init, CancellationToken::disabled()) + .await + { Ok(s) => println!(" completed at {s:?} (unexpected)"), Err(e) => println!(" stopped with error: {e}"), } @@ -201,7 +204,9 @@ async fn main() -> Result<(), Box> { // --- Second run: resume from last checkpoint (Process, attempt 2). 
--- println!("\n=== run 2: resume_from ==="); - let final_state = workflow.resume_from(run_id).await?; + let final_state = workflow + .resume_from(run_id, CancellationToken::disabled()) + .await?; println!(" reached {final_state:?}"); assert_eq!(final_state, Step::Done); diff --git a/cano/examples/join_strategies.rs b/cano/examples/join_strategies.rs index 76cd54e..c550357 100644 --- a/cano/examples/join_strategies.rs +++ b/cano/examples/join_strategies.rs @@ -110,7 +110,9 @@ async fn run_strategy(label: &str, strategy: JoinStrategy) -> CanoResult<()> { .add_exit_state(Step::Done); let start = Instant::now(); - let result = workflow.orchestrate(Step::Parallel).await?; + let result = workflow + .orchestrate(Step::Parallel, CancellationToken::disabled()) + .await?; let elapsed = start.elapsed(); // Count how many workers managed to log a result before being cancelled. diff --git a/cano/examples/metrics_demo.rs b/cano/examples/metrics_demo.rs index e249564..8ba6f01 100644 --- a/cano/examples/metrics_demo.rs +++ b/cano/examples/metrics_demo.rs @@ -60,7 +60,7 @@ async fn main() { // Run the workflow 3 times directly. for _ in 0..3 { workflow() - .orchestrate(Step::Fetch) + .orchestrate(Step::Fetch, CancellationToken::disabled()) .await .expect("workflow run"); } diff --git a/cano/examples/metrics_tracing_context.rs b/cano/examples/metrics_tracing_context.rs index 49c2426..8c55b4a 100644 --- a/cano/examples/metrics_tracing_context.rs +++ b/cano/examples/metrics_tracing_context.rs @@ -94,7 +94,7 @@ async fn main() { // Path 1: Cano's own `workflow_orchestrate` span carries `workflow_id`. 
workflow() .with_workflow_id("demo-run-1") - .orchestrate(Step::Fetch) + .orchestrate(Step::Fetch, CancellationToken::disabled()) .await .expect("workflow run"); @@ -103,7 +103,7 @@ async fn main() { let span = info_span!("api_request", request_id = "abc"); let _enter = span.enter(); workflow() - .orchestrate(Step::Fetch) + .orchestrate(Step::Fetch, CancellationToken::disabled()) .await .expect("workflow run"); } diff --git a/cano/examples/mixed_workflow.rs b/cano/examples/mixed_workflow.rs index 276b664..77b66c5 100644 --- a/cano/examples/mixed_workflow.rs +++ b/cano/examples/mixed_workflow.rs @@ -240,7 +240,10 @@ async fn main() -> CanoResult<()> { .register(WorkflowState::GenerateReport, ReportTask) .add_exit_states(vec![WorkflowState::Complete]); - match workflow.orchestrate(WorkflowState::GenerateData).await { + match workflow + .orchestrate(WorkflowState::GenerateData, CancellationToken::disabled()) + .await + { Ok(_final_state) => { println!("\nWorkflow completed successfully!"); diff --git a/cano/examples/observer_metrics.rs b/cano/examples/observer_metrics.rs index 04ff78a..6e7d678 100644 --- a/cano/examples/observer_metrics.rs +++ b/cano/examples/observer_metrics.rs @@ -141,7 +141,10 @@ async fn main() -> Result<(), Box> { .with_observer(metrics.clone()); for run in 1..=2 { - match workflow.orchestrate(Step::Start).await { + match workflow + .orchestrate(Step::Start, CancellationToken::disabled()) + .await + { Ok(state) => println!("run {run}: reached {state:?}"), Err(error) => println!("run {run}: stopped — {error}"), } diff --git a/cano/examples/panic_safety.rs b/cano/examples/panic_safety.rs index 9d26a18..6b3359f 100644 --- a/cano/examples/panic_safety.rs +++ b/cano/examples/panic_safety.rs @@ -127,7 +127,10 @@ async fn main() -> Result<(), Box> { .register(Step::PanicTask, Panicker) .add_exit_state(Step::Done); - match workflow.orchestrate(Step::PanicTask).await { + match workflow + .orchestrate(Step::PanicTask, CancellationToken::disabled()) + 
.await + { Ok(s) => println!(" outcome: Ok({s:?}) (unexpected)"), Err(e) => { println!(" outcome: Err(\"{e}\")"); @@ -157,7 +160,10 @@ async fn main() -> Result<(), Box> { .register(Step::PanicTask, PanicAfterReserve) .add_exit_state(Step::Done); - match workflow.orchestrate(Step::Reserve).await { + match workflow + .orchestrate(Step::Reserve, CancellationToken::disabled()) + .await + { Ok(s) => println!(" outcome: Ok({s:?}) (unexpected)"), Err(e) => { println!(" outcome: Err(\"{e}\")"); diff --git a/cano/examples/poll_retry_on_error.rs b/cano/examples/poll_retry_on_error.rs index 7b652e2..1e4263c 100644 --- a/cano/examples/poll_retry_on_error.rs +++ b/cano/examples/poll_retry_on_error.rs @@ -115,7 +115,10 @@ async fn main() -> Result<(), Box> { .register(Step::Poll, poller) .add_exit_state(Step::Done); - match workflow.orchestrate(Step::Poll).await { + match workflow + .orchestrate(Step::Poll, CancellationToken::disabled()) + .await + { Ok(state) => println!(" result: Ok({state:?}) -- loop tolerated the streak\n"), Err(e) => println!(" result: Err({e}) -- unexpected failure\n"), } @@ -135,7 +138,10 @@ async fn main() -> Result<(), Box> { .register(Step::Poll, poller) .add_exit_state(Step::Done); - match workflow.orchestrate(Step::Poll).await { + match workflow + .orchestrate(Step::Poll, CancellationToken::disabled()) + .await + { Ok(state) => println!(" result: Ok({state:?}) -- unexpected success\n"), Err(e) => println!(" result: Err(\"{e}\") -- loop aborted after streak > cap\n"), } @@ -195,7 +201,10 @@ async fn main() -> Result<(), Box> { ) .add_exit_state(Step::Done); - match workflow.orchestrate(Step::Poll).await { + match workflow + .orchestrate(Step::Poll, CancellationToken::disabled()) + .await + { Ok(state) => println!(" result: Ok({state:?}) -- Pending reset the counter\n"), Err(e) => println!(" result: Err({e}) -- unexpected failure\n"), } diff --git a/cano/examples/poll_task.rs b/cano/examples/poll_task.rs index 8026ccb..0ede9af 100644 --- 
a/cano/examples/poll_task.rs +++ b/cano/examples/poll_task.rs @@ -160,7 +160,9 @@ async fn main() -> CanoResult<()> { .register(Step::Process, Process) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::AwaitJob).await?; + let result = workflow + .orchestrate(Step::AwaitJob, CancellationToken::disabled()) + .await?; assert_eq!(result, Step::Done); println!("\ncompleted at {result:?}"); diff --git a/cano/examples/processing_models_tour.rs b/cano/examples/processing_models_tour.rs index 457dd21..dc5fb6d 100644 --- a/cano/examples/processing_models_tour.rs +++ b/cano/examples/processing_models_tour.rs @@ -242,7 +242,9 @@ async fn main() -> Result<(), Box> { .with_checkpoint_store(checkpoint_store.clone()) .with_workflow_id(run_id); - let result = workflow.orchestrate(Stage::Route).await?; + let result = workflow + .orchestrate(Stage::Route, CancellationToken::disabled()) + .await?; assert_eq!(result, Stage::Done); println!("\ncompleted at {result:?}"); diff --git a/cano/examples/router_task.rs b/cano/examples/router_task.rs index 1af745e..151999c 100644 --- a/cano/examples/router_task.rs +++ b/cano/examples/router_task.rs @@ -123,13 +123,17 @@ fn build_workflow(use_fast_path: bool) -> Workflow { async fn main() -> CanoResult<()> { println!("=== fast-path run ==="); let workflow = build_workflow(true); - let result = workflow.orchestrate(Step::Classify).await?; + let result = workflow + .orchestrate(Step::Classify, CancellationToken::disabled()) + .await?; assert_eq!(result, Step::Done); println!("completed at {result:?}\n"); println!("=== slow-path run ==="); let workflow = build_workflow(false); - let result = workflow.orchestrate(Step::Classify).await?; + let result = workflow + .orchestrate(Step::Classify, CancellationToken::disabled()) + .await?; assert_eq!(result, Step::Done); println!("completed at {result:?}"); diff --git a/cano/examples/saga_payment.rs b/cano/examples/saga_payment.rs index 445050b..b3bb217 100644 --- 
a/cano/examples/saga_payment.rs +++ b/cano/examples/saga_payment.rs @@ -118,7 +118,10 @@ async fn main() -> Result<(), Box> { .register(Step::Ship, ShipOrder) // plain — and it fails .add_exit_state(Step::Done); - match workflow.orchestrate(Step::Reserve).await { + match workflow + .orchestrate(Step::Reserve, CancellationToken::disabled()) + .await + { Ok(state) => println!("\ncompleted at {state:?}"), Err(error) => println!("\nfailed, rolled back: {error}"), } diff --git a/cano/examples/saga_recovery.rs b/cano/examples/saga_recovery.rs index 1d0d574..577f50a 100644 --- a/cano/examples/saga_recovery.rs +++ b/cano/examples/saga_recovery.rs @@ -161,7 +161,10 @@ async fn main() -> Result<(), Box> { // --- (a) First run: Charge fails; compensations drain LIFO. ---------- println!("--- run: Reserve → Authorize → Charge (fails) → compensate LIFO ---\n"); - match workflow.orchestrate(Step::Reserve).await { + match workflow + .orchestrate(Step::Reserve, CancellationToken::disabled()) + .await + { Ok(s) => println!("\ncompleted at {s:?} (unexpected)"), Err(e) => println!("\nfailed + rolled back: {e}"), } diff --git a/cano/examples/scheduler_book_prepositions.rs b/cano/examples/scheduler_book_prepositions.rs index 9ff257c..8d14917 100644 --- a/cano/examples/scheduler_book_prepositions.rs +++ b/cano/examples/scheduler_book_prepositions.rs @@ -273,7 +273,9 @@ async fn main() -> CanoResult<()> { ) .add_exit_states(vec![WorkflowPhase::Analyze, WorkflowPhase::Complete]); - let _ = workflow1.orchestrate(WorkflowPhase::Download).await?; + let _ = workflow1 + .orchestrate(WorkflowPhase::Download, CancellationToken::disabled()) + .await?; // Book 2: Alice's Adventures in Wonderland let workflow2 = Workflow::new(Resources::new().insert("store", store.clone())) @@ -283,7 +285,9 @@ async fn main() -> CanoResult<()> { ) .add_exit_states(vec![WorkflowPhase::Analyze, WorkflowPhase::Complete]); - let _ = workflow2.orchestrate(WorkflowPhase::Download).await?; + let _ = workflow2 + 
.orchestrate(WorkflowPhase::Download, CancellationToken::disabled()) + .await?; // Book 3: A Christmas Carol let workflow3 = Workflow::new(Resources::new().insert("store", store.clone())) @@ -293,7 +297,9 @@ async fn main() -> CanoResult<()> { ) .add_exit_states(vec![WorkflowPhase::Analyze, WorkflowPhase::Complete]); - let _ = workflow3.orchestrate(WorkflowPhase::Download).await?; + let _ = workflow3 + .orchestrate(WorkflowPhase::Download, CancellationToken::disabled()) + .await?; // Analyze and rank the downloaded books println!("\nAnalyzing and ranking books...\n"); @@ -304,7 +310,7 @@ async fn main() -> CanoResult<()> { .add_exit_state(WorkflowPhase::Complete); analysis_workflow - .orchestrate(WorkflowPhase::Analyze) + .orchestrate(WorkflowPhase::Analyze, CancellationToken::disabled()) .await?; println!("\nBook preposition analysis complete!"); diff --git a/cano/examples/scheduler_cancellation.rs b/cano/examples/scheduler_cancellation.rs new file mode 100644 index 0000000..8a3f3ad --- /dev/null +++ b/cano/examples/scheduler_cancellation.rs @@ -0,0 +1,119 @@ +#![cfg(feature = "scheduler")] +//! # Scheduler cooperative cancellation +//! +//! Demonstrates [`RunningScheduler::cancel_flow`](cano::RunningScheduler::cancel_flow): +//! a manually-triggered saga `Reserve → Charge → Ship → Done` whose `Ship` step +//! runs long. A sibling task calls `cancel_flow` once `Ship` is in flight; the +//! engine aborts it at its next await, the saga compensation stack drains in +//! reverse (`Charge` then `Reserve`), and the flow returns to `Idle` — a +//! deliberate cancel is **not** counted as a backoff failure. Graceful `stop()` +//! cancels in-flight flows the same way. +//! +//! Run with: +//! ```bash +//! cargo run --example scheduler_cancellation --features scheduler +//! 
``` + +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; + +use cano::prelude::*; +use cano::scheduler::Status; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +enum Step { + Reserve, + Charge, + Ship, + Done, +} + +struct Reserve; +struct Charge; + +#[saga::task(state = Step)] +impl Reserve { + type Output = u32; + async fn run(&self, _res: &Resources) -> Result<(TaskResult, u32), CanoError> { + println!("reserve : holding inventory (ticket #42)"); + Ok((TaskResult::Single(Step::Charge), 42)) + } + async fn compensate(&self, _res: &Resources, ticket: u32) -> Result<(), CanoError> { + println!("reserve : releasing ticket #{ticket} (rollback)"); + Ok(()) + } +} + +#[saga::task(state = Step)] +impl Charge { + type Output = String; + async fn run(&self, _res: &Resources) -> Result<(TaskResult, String), CanoError> { + println!("charge : capturing $42.00 (auth auth-XYZ)"); + Ok((TaskResult::Single(Step::Ship), "auth-XYZ".to_string())) + } + async fn compensate(&self, _res: &Resources, auth: String) -> Result<(), CanoError> { + println!("charge : refunding auth {auth} (rollback)"); + Ok(()) + } +} + +/// Long-running, non-compensatable step. Flips `started` so the sibling +/// canceller fires deterministically while this task is parked in its sleep. 
+struct Ship { + started: Arc, +} +#[task(state = Step)] +impl Ship { + fn config(&self) -> TaskConfig { + TaskConfig::minimal() + } + async fn run_bare(&self) -> Result, CanoError> { + println!("ship : dispatching shipment… (cancel_flow will stop this)"); + self.started.store(true, Ordering::SeqCst); + tokio::time::sleep(Duration::from_secs(10)).await; + println!("ship : this line should never print"); + Ok(TaskResult::Single(Step::Done)) + } +} + +#[tokio::main] +async fn main() -> CanoResult<()> { + let ship_started = Arc::new(AtomicBool::new(false)); + + let workflow = Workflow::bare() + .register_with_compensation(Step::Reserve, Reserve) + .register_with_compensation(Step::Charge, Charge) + .register( + Step::Ship, + Ship { + started: ship_started.clone(), + }, + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler.manual("order", workflow, Step::Reserve)?; + let running = scheduler.start().await?; + + // Kick off the saga, then cancel it once Ship is in flight. + running.trigger("order").await?; + while !ship_started.load(Ordering::SeqCst) { + tokio::time::sleep(Duration::from_millis(5)).await; + } + println!("\n>>> cancelling the in-flight flow…\n"); + running.cancel_flow("order").await?; + + // Wait for the cancelled run to settle (saga drained, status back to Idle). 
+ loop { + let status = running.status("order").await.map(|i| i.status); + if status != Some(Status::Running) { + println!("\norder flow status after cancel: {status:?} (Idle — not a failure)"); + break; + } + tokio::time::sleep(Duration::from_millis(5)).await; + } + + running.stop().await?; + Ok(()) +} diff --git a/cano/examples/split_bulkhead.rs b/cano/examples/split_bulkhead.rs index 8780062..522b823 100644 --- a/cano/examples/split_bulkhead.rs +++ b/cano/examples/split_bulkhead.rs @@ -167,7 +167,9 @@ async fn main() -> Result<(), CanoError> { .register(Step::Summarize, Summarize) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::ParallelWork).await?; + let result = workflow + .orchestrate(Step::ParallelWork, CancellationToken::disabled()) + .await?; println!("\ncompleted at {result:?}"); println!("\n=== Done ==="); diff --git a/cano/examples/stepped_task.rs b/cano/examples/stepped_task.rs index d084406..71588cf 100644 --- a/cano/examples/stepped_task.rs +++ b/cano/examples/stepped_task.rs @@ -140,7 +140,9 @@ async fn main() -> Result<(), Box> { .with_checkpoint_store(checkpoint_store.clone()) .with_workflow_id(run_id); - let result = workflow.orchestrate(Stage::Crunch).await?; + let result = workflow + .orchestrate(Stage::Crunch, CancellationToken::disabled()) + .await?; assert_eq!(result, Stage::Done); println!("\ncompleted at {result:?}"); diff --git a/cano/examples/store_custom_backend.rs b/cano/examples/store_custom_backend.rs index 379987d..dae181e 100644 --- a/cano/examples/store_custom_backend.rs +++ b/cano/examples/store_custom_backend.rs @@ -221,7 +221,9 @@ async fn main() -> Result<(), CanoError> { .add_exit_state(Step::Done); println!("\n-- Part 2: MemoryStore::get_shared (Arc zero-copy sharing) --"); - let result = workflow.orchestrate(Step::WriteA).await?; + let result = workflow + .orchestrate(Step::WriteA, CancellationToken::disabled()) + .await?; println!("\ncompleted at {result:?}"); println!("\n=== Done ==="); diff --git 
a/cano/examples/task_interface_demo.rs b/cano/examples/task_interface_demo.rs index 8ac5085..622e676 100644 --- a/cano/examples/task_interface_demo.rs +++ b/cano/examples/task_interface_demo.rs @@ -141,7 +141,10 @@ async fn main() -> Result<(), CanoError> { println!("Executing workflow..."); println!(); - match workflow.orchestrate(TaskState::Start).await { + match workflow + .orchestrate(TaskState::Start, CancellationToken::disabled()) + .await + { Ok(final_state) => { println!(); println!("Workflow completed successfully!"); diff --git a/cano/examples/task_simple.rs b/cano/examples/task_simple.rs index cfc10a1..273a768 100644 --- a/cano/examples/task_simple.rs +++ b/cano/examples/task_simple.rs @@ -82,7 +82,10 @@ async fn main() -> CanoResult<()> { .register(Action::Count, CounterTask) .add_exit_states(vec![Action::Complete]); - match workflow.orchestrate(Action::Generate).await { + match workflow + .orchestrate(Action::Generate, CancellationToken::disabled()) + .await + { Ok(_final_state) => { println!("Workflow completed!"); println!("Final Results:"); diff --git a/cano/examples/testing_helpers.rs b/cano/examples/testing_helpers.rs index 1f4333e..cdd08c3 100644 --- a/cano/examples/testing_helpers.rs +++ b/cano/examples/testing_helpers.rs @@ -117,7 +117,9 @@ async fn main() -> Result<(), Box> { .with_checkpoint_store(checkpoints.clone()) .with_workflow_id("demo-run"); - let final_state = workflow.orchestrate(Step::Start).await?; + let final_state = workflow + .orchestrate(Step::Start, CancellationToken::disabled()) + .await?; assert_eq!(final_state, Step::Done); // The observer captured the whole path and the checkpoint appends along the way. 
@@ -135,7 +137,10 @@ async fn main() -> Result<(), Box> { let panicky = Workflow::bare() .register(Step::Start, panic_on_attempt(1, Step::Done)) .add_exit_state(Step::Done); - match panicky.orchestrate(Step::Start).await { + match panicky + .orchestrate(Step::Start, CancellationToken::disabled()) + .await + { Ok(_) => unreachable!("the task panics on its first attempt"), Err(e) => println!("panic_on_attempt surfaced as error: {e}"), } @@ -149,7 +154,9 @@ async fn main() -> Result<(), Box> { .register_with_compensation(Step::Work, Charge) .register(Step::Finish, Boom) // fails → drains the compensation stack in reverse .add_exit_state(Step::Done); - let _ = saga.orchestrate(Step::Start).await; // expected to fail and roll back + let _ = saga + .orchestrate(Step::Start, CancellationToken::disabled()) + .await; // expected to fail and roll back let ran = handle.0.lock().unwrap().clone(); // Charge ran last, so it compensates first; then Reserve. diff --git a/cano/examples/timer_task.rs b/cano/examples/timer_task.rs index b28e67b..7956ec7 100644 --- a/cano/examples/timer_task.rs +++ b/cano/examples/timer_task.rs @@ -100,7 +100,9 @@ async fn main() -> CanoResult<()> { .register(Step::Process, Process) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::CoolDown).await?; + let result = workflow + .orchestrate(Step::CoolDown, CancellationToken::disabled()) + .await?; assert_eq!(result, Step::Done); println!("\ncompleted at {result:?}"); diff --git a/cano/examples/tracing_demo.rs b/cano/examples/tracing_demo.rs index 533042c..49af079 100644 --- a/cano/examples/tracing_demo.rs +++ b/cano/examples/tracing_demo.rs @@ -213,7 +213,9 @@ async fn main() -> CanoResult<()> { .add_exit_states(vec![WorkflowState::Complete, WorkflowState::Error]); info!("Starting workflow execution..."); - let result = workflow.orchestrate(WorkflowState::Start).await?; + let result = workflow + .orchestrate(WorkflowState::Start, CancellationToken::disabled()) + .await?; 
info!(final_state = ?result, "Workflow completed"); println!("Basic workflow completed with state: {result:?}\n"); @@ -248,7 +250,9 @@ async fn main() -> CanoResult<()> { .with_tracing_span(workflow_span); info!("Starting task-based workflow execution (with custom span)..."); - let result = task_workflow.orchestrate(WorkflowState::Start).await?; + let result = task_workflow + .orchestrate(WorkflowState::Start, CancellationToken::disabled()) + .await?; let math_result: i32 = store.get("math_result").unwrap_or(0); let completed_by: String = store.get("task_completed_by").unwrap_or_default(); @@ -329,7 +333,9 @@ async fn main() -> CanoResult<()> { .add_exit_states(vec![WorkflowState::Complete, WorkflowState::Error]); info!("Starting workflow that will encounter validation failure..."); - let result = error_workflow.orchestrate(WorkflowState::Start).await?; + let result = error_workflow + .orchestrate(WorkflowState::Start, CancellationToken::disabled()) + .await?; println!("Error workflow completed with state: {result:?}"); @@ -363,7 +369,9 @@ async fn main() -> CanoResult<()> { // One line: re-emit lifecycle/failure events as `tracing` events. 
.with_observer(Arc::new(TracingObserver::new())); - let result = observed_workflow.orchestrate(WorkflowState::Start).await?; + let result = observed_workflow + .orchestrate(WorkflowState::Start, CancellationToken::disabled()) + .await?; println!("Observed workflow completed with state: {result:?}"); println!( " (look for `task started` / `task succeeded` events; filter with RUST_LOG=cano::observer=debug)\n" diff --git a/cano/examples/workflow_ad_exchange.rs b/cano/examples/workflow_ad_exchange.rs index 20a4fd5..d55ab8d 100644 --- a/cano/examples/workflow_ad_exchange.rs +++ b/cano/examples/workflow_ad_exchange.rs @@ -596,7 +596,10 @@ async fn main() -> Result<(), CanoError> { let start = tokio::time::Instant::now(); // Execute workflow - if splits timeout or fail, transition to NoFill - let result = match workflow.orchestrate(AdExchangeState::Start).await { + let result = match workflow + .orchestrate(AdExchangeState::Start, CancellationToken::disabled()) + .await + { Ok(state) => state, Err(e) => { // If workflow fails due to split timeout/error, handle as NoFill. @@ -606,7 +609,12 @@ async fn main() -> Result<(), CanoError> { store.put("error_reason", e.to_string())?; println!("\n⚠️ Handling as No Fill due to error\n"); - workflow.orchestrate(AdExchangeState::ErrorTracking).await? + workflow + .orchestrate( + AdExchangeState::ErrorTracking, + CancellationToken::disabled(), + ) + .await? 
} }; diff --git a/cano/examples/workflow_bare.rs b/cano/examples/workflow_bare.rs index b416b81..3f19384 100644 --- a/cano/examples/workflow_bare.rs +++ b/cano/examples/workflow_bare.rs @@ -87,7 +87,10 @@ async fn main() -> CanoResult<()> { .register(Stage::Sanitize, SanitizeTask) .add_exit_states(vec![Stage::Persist, Stage::Done]); - match workflow.orchestrate(Stage::Validate).await { + match workflow + .orchestrate(Stage::Validate, CancellationToken::disabled()) + .await + { Ok(final_state) => println!("\nBare workflow reached: {final_state:?}\n"), Err(e) => { eprintln!("Workflow failed: {e}"); @@ -105,7 +108,10 @@ async fn main() -> CanoResult<()> { .register(Stage::Persist, PersistTask) // resource task .add_exit_states(vec![Stage::Done]); - match workflow.orchestrate(Stage::Validate).await { + match workflow + .orchestrate(Stage::Validate, CancellationToken::disabled()) + .await + { Ok(final_state) => { println!("\nMixed workflow reached: {final_state:?}"); if let Ok(v) = store.get::("sanitized_value") { diff --git a/cano/examples/workflow_book_prepositions.rs b/cano/examples/workflow_book_prepositions.rs index 753814a..a79024e 100644 --- a/cano/examples/workflow_book_prepositions.rs +++ b/cano/examples/workflow_book_prepositions.rs @@ -507,7 +507,13 @@ async fn run_workflow() -> Result<(), CanoError> { println!(" BookRankingByPrepositionTask (Ranking phase)"); // Execute the entire workflow using Workflow orchestration - match workflow.orchestrate(BookPrepositionAction::Download).await { + match workflow + .orchestrate( + BookPrepositionAction::Download, + CancellationToken::disabled(), + ) + .await + { Ok(final_state) => { match final_state { BookPrepositionAction::Complete => { diff --git a/cano/examples/workflow_cancellation.rs b/cano/examples/workflow_cancellation.rs new file mode 100644 index 0000000..2a74277 --- /dev/null +++ b/cano/examples/workflow_cancellation.rs @@ -0,0 +1,120 @@ +//! # Cooperative cancellation — saga rollback on cancel +//! +//! 
Demonstrates [`Workflow::orchestrate`](cano::Workflow::orchestrate) with a live token: +//! a 3-step saga `Reserve → Charge → Ship → Done` where a sibling task fires a +//! [`CancellationHandle`](cano::CancellationHandle) once `Ship` is in flight. The in-flight +//! task is aborted at its next await point, the saga compensation stack drains in reverse +//! (`Charge` then `Reserve`), and the call returns +//! [`CanoError::Cancelled`](cano::CanoError::Cancelled). +//! +//! Run with: +//! ```bash +//! cargo run --example workflow_cancellation +//! ``` +//! +//! Expected output (timings will vary): +//! ```text +//! reserve : holding inventory (ticket #42) +//! charge : capturing $42.00 (auth auth-XYZ) +//! ship : dispatching shipment… (a sibling task will cancel this) +//! charge : refunding auth auth-XYZ (rollback) +//! reserve : releasing ticket #42 (rollback) +//! workflow cancelled, rolled back: state=Ship attempt=0 path=[Reserve, Charge, Ship] caused by: Workflow cancelled +//! ``` + +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::time::Duration; + +use cano::CancellationToken; +use cano::prelude::*; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +enum Step { + Reserve, + Charge, + Ship, + Done, +} + +struct Reserve; +struct Charge; + +#[saga::task(state = Step)] +impl Reserve { + type Output = u32; + async fn run(&self, _res: &Resources) -> Result<(TaskResult, u32), CanoError> { + let ticket = 42; + println!("reserve : holding inventory (ticket #{ticket})"); + Ok((TaskResult::Single(Step::Charge), ticket)) + } + async fn compensate(&self, _res: &Resources, ticket: u32) -> Result<(), CanoError> { + println!("reserve : releasing ticket #{ticket} (rollback)"); + Ok(()) + } +} + +#[saga::task(state = Step)] +impl Charge { + type Output = String; + async fn run(&self, _res: &Resources) -> Result<(TaskResult, String), CanoError> { + let auth = "auth-XYZ".to_string(); + println!("charge : capturing $42.00 (auth {auth})"); + 
Ok((TaskResult::Single(Step::Ship), auth)) + } + async fn compensate(&self, _res: &Resources, auth: String) -> Result<(), CanoError> { + println!("charge : refunding auth {auth} (rollback)"); + Ok(()) + } +} + +// Plain (non-compensatable) long-running task. It flips `started` so the sibling +// canceller fires deterministically while this task is parked in its sleep. +struct Ship { + started: Arc, +} +#[task(state = Step)] +impl Ship { + fn config(&self) -> TaskConfig { + TaskConfig::minimal() + } + async fn run_bare(&self) -> Result, CanoError> { + println!("ship : dispatching shipment… (a sibling task will cancel this)"); + self.started.store(true, Ordering::SeqCst); + tokio::time::sleep(Duration::from_secs(2)).await; + println!("ship : this line should never print"); + Ok(TaskResult::Single(Step::Done)) + } +} + +#[tokio::main] +async fn main() { + let ship_started = Arc::new(AtomicBool::new(false)); + let workflow = Workflow::bare() + .register_with_compensation(Step::Reserve, Reserve) + .register_with_compensation(Step::Charge, Charge) + .register( + Step::Ship, + Ship { + started: ship_started.clone(), + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + + // Sibling task: cancel as soon as `Ship` is in flight. 
+ let canceller = tokio::spawn(async move { + while !ship_started.load(Ordering::SeqCst) { + tokio::time::sleep(Duration::from_millis(5)).await; + } + handle.cancel(); + }); + + match workflow.orchestrate(Step::Reserve, token).await { + Ok(state) => println!("\nworkflow completed at {state:?}"), + Err(error) => println!("\nworkflow cancelled, rolled back: {error}"), + } + + canceller.await.expect("canceller task panicked"); +} diff --git a/cano/examples/workflow_negotiation.rs b/cano/examples/workflow_negotiation.rs index e0c2c14..cd19cf4 100644 --- a/cano/examples/workflow_negotiation.rs +++ b/cano/examples/workflow_negotiation.rs @@ -250,7 +250,13 @@ async fn run_negotiation_workflow() -> Result<(), CanoError> { ]); // Execute the negotiation workflow - match workflow.orchestrate(NegotiationAction::StartSelling).await { + match workflow + .orchestrate( + NegotiationAction::StartSelling, + CancellationToken::disabled(), + ) + .await + { Ok(final_state) => { println!("{}", "=".repeat(50)); diff --git a/cano/examples/workflow_observer.rs b/cano/examples/workflow_observer.rs index 42d224e..d8b076e 100644 --- a/cano/examples/workflow_observer.rs +++ b/cano/examples/workflow_observer.rs @@ -158,7 +158,9 @@ async fn main() -> Result<(), CanoError> { ) .add_exit_state(Step::Done) .with_observer(observer.clone()); - let final_state = workflow.orchestrate(Step::Load).await?; + let final_state = workflow + .orchestrate(Step::Load, CancellationToken::disabled()) + .await?; println!(" → workflow finished in state {final_state:?}\n"); // -- Scenario B ------------------------------------------------------- @@ -181,7 +183,10 @@ async fn main() -> Result<(), CanoError> { ) .add_exit_state(Step::Done) .with_observer(observer.clone()); - match guarded.orchestrate(Step::Probe).await { + match guarded + .orchestrate(Step::Probe, CancellationToken::disabled()) + .await + { Ok(s) => println!(" → unexpectedly finished in {s:?}\n"), Err(e) => println!(" → workflow errored as expected: 
{e}\n"), } diff --git a/cano/examples/workflow_on_request.rs b/cano/examples/workflow_on_request.rs index 3445196..5c25bfd 100644 --- a/cano/examples/workflow_on_request.rs +++ b/cano/examples/workflow_on_request.rs @@ -142,7 +142,7 @@ fn build_workflow(resources: Resources) -> Workflow { .register(TextPipelineState::Parse, ParseTask) .register(TextPipelineState::Transform, TransformTask) .add_exit_state(TextPipelineState::Done) - .with_timeout(Duration::from_secs(5)) + .with_total_timeout(Duration::from_secs(5)) } // ============================================================================ @@ -165,7 +165,7 @@ async fn process_handler( // Run the FSM to completion. workflow - .orchestrate(TextPipelineState::Parse) + .orchestrate(TextPipelineState::Parse, CancellationToken::disabled()) .await .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; diff --git a/cano/examples/workflow_partial_results.rs b/cano/examples/workflow_partial_results.rs index 411ee94..01038a6 100644 --- a/cano/examples/workflow_partial_results.rs +++ b/cano/examples/workflow_partial_results.rs @@ -103,7 +103,9 @@ async fn main() -> Result<(), CanoError> { println!("Starting workflow..."); let start = std::time::Instant::now(); - let result = workflow.orchestrate(ApiState::Start).await?; + let result = workflow + .orchestrate(ApiState::Start, CancellationToken::disabled()) + .await?; let duration = start.elapsed(); println!( diff --git a/cano/examples/workflow_recovery.rs b/cano/examples/workflow_recovery.rs index 0201d16..da2bd5c 100644 --- a/cano/examples/workflow_recovery.rs +++ b/cano/examples/workflow_recovery.rs @@ -109,7 +109,10 @@ async fn main() -> Result<(), Box> { .with_observer(Arc::new(Watcher)); println!("── run 1: orchestrate (Process will crash) ──"); - if let Err(e) = workflow.orchestrate(Step::Start).await { + if let Err(e) = workflow + .orchestrate(Step::Start, CancellationToken::disabled()) + .await + { println!(" stopped: {e}"); } println!("checkpoint log after run 1 (the 
crash left it intact):"); @@ -118,7 +121,9 @@ async fn main() -> Result<(), Box> { } println!("\n── run 2: resume_from ──"); - let final_state = workflow.resume_from(run_id).await?; + let final_state = workflow + .resume_from(run_id, CancellationToken::disabled()) + .await?; println!(" reached {final_state:?} — checkpoint log cleared on success"); assert_eq!(final_state, Step::Done); assert!(checkpoint_store.load_run(run_id).await?.is_empty()); diff --git a/cano/examples/workflow_resources.rs b/cano/examples/workflow_resources.rs index c30de2e..a822ae6 100644 --- a/cano/examples/workflow_resources.rs +++ b/cano/examples/workflow_resources.rs @@ -320,7 +320,9 @@ async fn main() -> Result<(), CanoError> { .add_exit_state(Step::Done); println!("Running workflow..."); - let final_state = workflow.orchestrate(Step::Init).await?; + let final_state = workflow + .orchestrate(Step::Init, CancellationToken::disabled()) + .await?; assert_eq!(final_state, Step::Done); let result: u32 = store.get("result")?; diff --git a/cano/examples/workflow_simd_matrix_pipeline.rs b/cano/examples/workflow_simd_matrix_pipeline.rs index 6077d90..8035db7 100644 --- a/cano/examples/workflow_simd_matrix_pipeline.rs +++ b/cano/examples/workflow_simd_matrix_pipeline.rs @@ -470,7 +470,9 @@ async fn main() -> Result<(), Box> { println!("Pipeline: Generate -> Multiply -> Transform -> Statistics -> Complete\n"); // Execute the workflow - let _final_state = workflow.orchestrate(PipelineState::Generate).await?; + let _final_state = workflow + .orchestrate(PipelineState::Generate, CancellationToken::disabled()) + .await?; let total_duration = start_time.elapsed(); println!("\nSIMD Matrix Processing Pipeline completed!"); diff --git a/cano/examples/workflow_simple.rs b/cano/examples/workflow_simple.rs index 1c24e80..ca8348b 100644 --- a/cano/examples/workflow_simple.rs +++ b/cano/examples/workflow_simple.rs @@ -152,7 +152,10 @@ async fn main() -> Result<(), CanoError> { .register(WorkflowAction::Count, 
CounterTask) .add_exit_states(vec![WorkflowAction::Complete, WorkflowAction::Error]); - match workflow.orchestrate(WorkflowAction::Generate).await { + match workflow + .orchestrate(WorkflowAction::Generate, CancellationToken::disabled()) + .await + { Ok(WorkflowAction::Complete) => { println!("\nWorkflow completed successfully!"); match store.get::("number_count") { diff --git a/cano/examples/workflow_split_join.rs b/cano/examples/workflow_split_join.rs index d4c14e7..ad070a6 100644 --- a/cano/examples/workflow_split_join.rs +++ b/cano/examples/workflow_split_join.rs @@ -142,7 +142,9 @@ async fn main() -> Result<(), CanoError> { .register(DataProcessingState::Aggregate, Aggregator) .add_exit_state(DataProcessingState::Complete); - let result = workflow.orchestrate(DataProcessingState::Start).await?; + let result = workflow + .orchestrate(DataProcessingState::Start, CancellationToken::disabled()) + .await?; let final_result: i32 = store.get("final_result")?; println!("Final result: {}", final_result); @@ -172,7 +174,9 @@ async fn main() -> Result<(), CanoError> { .register(DataProcessingState::Aggregate, Aggregator) .add_exit_state(DataProcessingState::Complete); - let result = workflow.orchestrate(DataProcessingState::Start).await?; + let result = workflow + .orchestrate(DataProcessingState::Start, CancellationToken::disabled()) + .await?; let processor_count: usize = store.get("processor_count")?; println!( @@ -205,7 +209,9 @@ async fn main() -> Result<(), CanoError> { .register(DataProcessingState::Aggregate, Aggregator) .add_exit_state(DataProcessingState::Complete); - let result = workflow.orchestrate(DataProcessingState::Start).await?; + let result = workflow + .orchestrate(DataProcessingState::Start, CancellationToken::disabled()) + .await?; println!("Workflow completed with Any strategy: {:?}\n", result); } @@ -236,7 +242,9 @@ async fn main() -> Result<(), CanoError> { .register(DataProcessingState::Aggregate, Aggregator) 
.add_exit_state(DataProcessingState::Complete); - let result = workflow.orchestrate(DataProcessingState::Start).await?; + let result = workflow + .orchestrate(DataProcessingState::Start, CancellationToken::disabled()) + .await?; let processor_count: usize = store.get("processor_count")?; println!("Processors completed: {} (66% threshold)", processor_count); @@ -267,7 +275,10 @@ async fn main() -> Result<(), CanoError> { .register(DataProcessingState::Aggregate, Aggregator) .add_exit_state(DataProcessingState::Complete); - match workflow.orchestrate(DataProcessingState::Start).await { + match workflow + .orchestrate(DataProcessingState::Start, CancellationToken::disabled()) + .await + { Ok(result) => println!("Workflow completed: {:?}", result), Err(e) => println!("Workflow failed (expected timeout): {}", e), } diff --git a/cano/examples/workflow_stack_store.rs b/cano/examples/workflow_stack_store.rs index 00393d1..ca053ac 100644 --- a/cano/examples/workflow_stack_store.rs +++ b/cano/examples/workflow_stack_store.rs @@ -159,7 +159,9 @@ async fn main() -> CanoResult<()> { // Execute workflow println!("\nStarting workflow...\n"); - let final_state = workflow.orchestrate(RequestState::Start).await?; + let final_state = workflow + .orchestrate(RequestState::Start, CancellationToken::disabled()) + .await?; // Display results println!("\nFinal Results:"); @@ -198,7 +200,9 @@ async fn main() -> CanoResult<()> { // Execute workflow println!("\nStarting workflow...\n"); - let final_state = workflow.orchestrate(RequestState::Start).await?; + let final_state = workflow + .orchestrate(RequestState::Start, CancellationToken::disabled()) + .await?; // Display results println!("\nFinal Results:"); @@ -237,7 +241,9 @@ async fn main() -> CanoResult<()> { // Execute workflow println!("\nStarting workflow...\n"); - let final_state = workflow.orchestrate(RequestState::Start).await?; + let final_state = workflow + .orchestrate(RequestState::Start, CancellationToken::disabled()) + 
.await?; // Display results println!("\nFinal Results:"); diff --git a/cano/examples/workflow_total_timeout.rs b/cano/examples/workflow_total_timeout.rs index 94f8e31..4dcf2a7 100644 --- a/cano/examples/workflow_total_timeout.rs +++ b/cano/examples/workflow_total_timeout.rs @@ -91,7 +91,10 @@ async fn main() { .register(Step::Ship, Ship) .add_exit_state(Step::Done); - match workflow.orchestrate(Step::Reserve).await { + match workflow + .orchestrate(Step::Reserve, CancellationToken::disabled()) + .await + { Ok(state) => println!("\nworkflow completed at {state:?}"), Err(error) => println!("\nworkflow failed, rolled back: {error}"), } diff --git a/cano/examples/workflow_validation.rs b/cano/examples/workflow_validation.rs index 2409b2e..384ae68 100644 --- a/cano/examples/workflow_validation.rs +++ b/cano/examples/workflow_validation.rs @@ -102,7 +102,9 @@ async fn main() -> Result<(), Box> { Err(e) => println!(" validate_initial_state(Prepare) -> Err: {e}"), } - let result = workflow.orchestrate(Step::Prepare).await?; + let result = workflow + .orchestrate(Step::Prepare, CancellationToken::disabled()) + .await?; println!(" orchestrate -> {result:?}\n"); } diff --git a/cano/src/bin/recovery_resume.rs b/cano/src/bin/recovery_resume.rs index 6173382..cc1a2a1 100644 --- a/cano/src/bin/recovery_resume.rs +++ b/cano/src/bin/recovery_resume.rs @@ -125,8 +125,16 @@ async fn main() -> Result<(), Box> { })); let final_state = match mode { - "resume" => workflow.resume_from(workflow_id).await?, - _ => workflow.orchestrate(Step::Start).await?, + "resume" => { + workflow + .resume_from(workflow_id, CancellationToken::disabled()) + .await? + } + _ => { + workflow + .orchestrate(Step::Start, CancellationToken::disabled()) + .await? 
+ } }; println!("DONE {final_state:?}"); let _ = std::io::stdout().flush(); diff --git a/cano/src/bin/stepped_resume.rs b/cano/src/bin/stepped_resume.rs index f410286..c0e5309 100644 --- a/cano/src/bin/stepped_resume.rs +++ b/cano/src/bin/stepped_resume.rs @@ -107,13 +107,17 @@ async fn main() -> Result<(), Box> { let final_state = match mode { "resume" => { - let result = workflow.resume_from(WORKFLOW_ID).await?; + let result = workflow + .resume_from(WORKFLOW_ID, CancellationToken::disabled()) + .await?; println!("RESUME COMPLETE final={result:?}"); let _ = std::io::stdout().flush(); result } _ => { - let result = workflow.orchestrate(State::Crunch).await?; + let result = workflow + .orchestrate(State::Crunch, CancellationToken::disabled()) + .await?; println!("RUN COMPLETE final={result:?}"); let _ = std::io::stdout().flush(); result diff --git a/cano/src/cancel.rs b/cano/src/cancel.rs new file mode 100644 index 0000000..ae6e860 --- /dev/null +++ b/cano/src/cancel.rs @@ -0,0 +1,264 @@ +//! Cooperative cancellation for workflow runs. +//! +//! [`CancellationToken`] / [`CancellationHandle`] form a clonable signal pair built on +//! [`tokio::sync::watch`] — no extra dependency. Hand a token to +//! [`Workflow::orchestrate`](crate::workflow::Workflow::orchestrate) +//! (or [`resume_from`](crate::workflow::Workflow::resume_from)) and keep +//! the handle; calling [`CancellationHandle::cancel`] aborts the in-flight cancellable task at its +//! next await point, drains the saga compensation stack, and surfaces +//! [`CanoError::Cancelled`](crate::error::CanoError::Cancelled). To opt a run out of cancellation +//! entirely, pass [`CancellationToken::disabled`]. +//! +//! Cancellation is **cooperative**: the engine drops the running task future at its next `.await`, +//! so a task doing uninterrupted synchronous/CPU work is not interrupted until it next yields. A +//! [`CompensatableTask`](crate::saga::CompensatableTask) is deliberately *never* interrupted +//! 
mid-run (that would orphan a committed side effect with no entry to roll back) — it runs to +//! completion and the cancel is honoured at the next state boundary. The compensation drain itself +//! is uncancellable. +//! +//! ``` +//! use cano::prelude::*; +//! use cano::CancellationToken; +//! +//! #[derive(Clone, Debug, PartialEq, Eq, Hash)] +//! enum Step { Start, Done } +//! +//! struct Noop; +//! #[task] +//! impl Task for Noop { +//! async fn run_bare(&self) -> Result, CanoError> { +//! Ok(TaskResult::Single(Step::Done)) +//! } +//! } +//! +//! # #[tokio::main] +//! # async fn main() { +//! let (handle, token) = CancellationToken::new(); +//! let workflow = Workflow::bare() +//! .register(Step::Start, Noop) +//! .add_exit_state(Step::Done); +//! +//! // Cancel from anywhere (another task, a signal handler, …): +//! handle.cancel(); +//! +//! let result = workflow.orchestrate(Step::Start, token).await; +//! assert!(matches!(result, Err(e) if e.category() == "cancelled")); +//! # } +//! ``` + +/// The observing half of a cancellation signal. Clonable and cheap to pass into a workflow. +/// +/// A token built via [`CancellationToken::new`] observes its paired [`CancellationHandle`]; a +/// [`disabled`](CancellationToken::disabled) token never fires and adds no overhead, so a run +/// passed one opts out of cancellation entirely. +#[derive(Clone, Debug)] +pub struct CancellationToken { + rx: Option>, +} + +/// The controlling half of a cancellation signal. Call [`cancel`](Self::cancel) to fire it. +/// +/// Clonable, so several owners can trigger cancellation; [`cancel`](Self::cancel) is idempotent. +#[derive(Clone, Debug)] +pub struct CancellationHandle { + tx: tokio::sync::watch::Sender, +} + +impl CancellationToken { + /// Create a fresh handle/token pair. The token is not cancelled until the handle's + /// [`cancel`](CancellationHandle::cancel) is called. 
+ #[must_use] + pub fn new() -> (CancellationHandle, CancellationToken) { + let (tx, rx) = tokio::sync::watch::channel(false); + ( + CancellationHandle { tx }, + CancellationToken { rx: Some(rx) }, + ) + } + + /// A token that can never be cancelled — pass it to + /// [`orchestrate`](crate::workflow::Workflow::orchestrate) / + /// [`resume_from`](crate::workflow::Workflow::resume_from) to opt a run out of cancellation. + /// No channel is allocated, so this path stays allocation- and overhead-free: the FSM skips + /// the cancellation `select!` entirely. + #[must_use] + pub fn disabled() -> Self { + Self { rx: None } + } + + /// Whether this token has already been cancelled. Non-blocking poll. + #[must_use] + pub fn is_cancelled(&self) -> bool { + self.rx.as_ref().is_some_and(|rx| *rx.borrow()) + } + + /// Whether this token can ever observe a cancellation. `false` for the internal "never" + /// token, letting the FSM hot path skip the cancellation `select!` entirely. + pub(crate) fn can_cancel(&self) -> bool { + self.rx.is_some() + } + + /// Resolve once the token is cancelled. A "never" token (or one whose handle was dropped + /// without cancelling) stays pending forever — making it safe to use as a `select!` branch + /// that simply never wins. + pub async fn cancelled(&self) { + match &self.rx { + None => std::future::pending::<()>().await, + Some(rx) => { + let mut rx = rx.clone(); + if *rx.borrow() { + return; + } + while rx.changed().await.is_ok() { + if *rx.borrow() { + return; + } + } + // Sender dropped without ever sending `true`: never cancels. + std::future::pending::<()>().await; + } + } + } +} + +impl CancellationHandle { + /// Signal cancellation to every token observing this handle. Idempotent — calling it again + /// after the first cancel is a no-op. 
+ pub fn cancel(&self) { + let _ = self.tx.send(true); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::Duration; + + #[tokio::test] + async fn cancel_propagates_to_receivers() { + let (handle, token) = CancellationToken::new(); + assert!(!token.is_cancelled()); + handle.cancel(); + // `cancelled()` resolves promptly. + tokio::time::timeout(Duration::from_secs(1), token.cancelled()) + .await + .expect("cancelled() should resolve after cancel"); + assert!(token.is_cancelled()); + } + + #[tokio::test] + async fn clone_after_cancel_still_observes() { + let (handle, token) = CancellationToken::new(); + handle.cancel(); + let cloned = token.clone(); + assert!(cloned.is_cancelled()); + tokio::time::timeout(Duration::from_secs(1), cloned.cancelled()) + .await + .expect("a clone made after cancel still observes it"); + } + + #[test] + fn is_cancelled_polls_without_await() { + let (handle, token) = CancellationToken::new(); + assert!(!token.is_cancelled()); + handle.cancel(); + assert!(token.is_cancelled()); + } + + #[tokio::test] + async fn cancel_is_idempotent() { + let (handle, token) = CancellationToken::new(); + handle.cancel(); + handle.cancel(); // second call is a no-op + assert!(token.is_cancelled()); + } + + #[tokio::test] + async fn dropping_handle_without_cancel_keeps_token_pending() { + let (handle, token) = CancellationToken::new(); + drop(handle); + assert!(!token.is_cancelled()); + // `cancelled()` must NOT resolve just because the sender dropped. 
+ let res = tokio::time::timeout(Duration::from_millis(50), token.cancelled()).await; + assert!( + res.is_err(), + "cancelled() should stay pending after handle drop" + ); + } + + #[tokio::test] + async fn disabled_is_never_cancelled_and_can_cancel_false() { + let token = CancellationToken::disabled(); + assert!(!token.is_cancelled()); + assert!(!token.can_cancel()); + let res = tokio::time::timeout(Duration::from_millis(50), token.cancelled()).await; + assert!(res.is_err(), "disabled token should stay pending"); + } + + #[test] + fn can_cancel_true_for_new_token() { + let (_handle, token) = CancellationToken::new(); + assert!(token.can_cancel()); + } + + // Cancel fires *after* the await begins — exercises the `rx.changed().await` + // wakeup path (the other tests cancel first and hit the fast-path `borrow()`). + #[tokio::test] + async fn cancelled_resolves_when_cancel_fires_while_awaiting() { + let (handle, token) = CancellationToken::new(); + let waiter = tokio::spawn(async move { token.cancelled().await }); + // Let the waiter park inside `changed().await` before cancelling. + tokio::time::sleep(Duration::from_millis(20)).await; + handle.cancel(); + tokio::time::timeout(Duration::from_secs(1), waiter) + .await + .expect("waiter should wake on cancel") + .expect("waiter task should not panic"); + } + + // A clone taken *before* the cancel still observes it (both halves share state). + #[tokio::test] + async fn clone_before_cancel_is_observed() { + let (handle, token) = CancellationToken::new(); + let cloned = token.clone(); + assert!(!cloned.is_cancelled()); + handle.cancel(); + assert!(cloned.is_cancelled()); + assert!(token.is_cancelled()); + tokio::time::timeout(Duration::from_secs(1), cloned.cancelled()) + .await + .expect("a clone made before cancel still resolves"); + } + + // `CancellationHandle` is `Clone`; cancelling via a clone still fires, even after + // the original handle is dropped. 
+ #[tokio::test] + async fn cloned_handle_triggers_cancellation() { + let (handle, token) = CancellationToken::new(); + let handle2 = handle.clone(); + drop(handle); // only the clone remains + assert!(!token.is_cancelled()); + handle2.cancel(); + assert!(token.is_cancelled()); + } + + // One cancel wakes every concurrent awaiter. + #[tokio::test] + async fn multiple_awaiters_all_wake_on_cancel() { + let (handle, token) = CancellationToken::new(); + let waiters: Vec<_> = (0..5) + .map(|_| { + let t = token.clone(); + tokio::spawn(async move { t.cancelled().await }) + }) + .collect(); + tokio::time::sleep(Duration::from_millis(20)).await; + handle.cancel(); + for w in waiters { + tokio::time::timeout(Duration::from_secs(1), w) + .await + .expect("every awaiter should wake") + .expect("awaiter task should not panic"); + } + } +} diff --git a/cano/src/error.rs b/cano/src/error.rs index e58a56e..666605e 100644 --- a/cano/src/error.rs +++ b/cano/src/error.rs @@ -205,6 +205,17 @@ pub enum CanoError { limit: std::time::Duration, }, + /// A run was cancelled via a [`CancellationToken`](crate::cancel::CancellationToken). + /// + /// Emitted by [`orchestrate`](crate::workflow::Workflow::orchestrate) + /// and [`resume_from`](crate::workflow::Workflow::resume_from) when the paired + /// [`CancellationHandle`](crate::cancel::CancellationHandle) fires. The in-flight + /// cancellable task is dropped at its next await point and the compensation stack is drained + /// before this error surfaces. Like every task error from the FSM it is wrapped in + /// [`CanoError::WithStateContext`] (clean rollback); a dirty rollback yields + /// [`CanoError::CompensationFailed`] whose `errors[0]` carries the wrapped `Cancelled`. + Cancelled, + /// A call was rejected because the circuit breaker is open. 
/// /// Emitted by [`crate::circuit::CircuitBreaker::try_acquire`] (and surfaced through the @@ -382,6 +393,11 @@ impl CanoError { CanoError::WorkflowTimeout { elapsed, limit } } + /// Create a new cancellation error. + pub fn cancelled() -> Self { + CanoError::Cancelled + } + /// Create a new circuit-open error pub fn circuit_open>(msg: S) -> Self { CanoError::CircuitOpen(msg.into()) @@ -510,6 +526,7 @@ impl CanoError { CanoError::RetryExhausted { source, .. } => source.message(), CanoError::Timeout(msg) => msg, CanoError::WorkflowTimeout { .. } => "workflow total timeout exceeded", + CanoError::Cancelled => "workflow cancelled", CanoError::CircuitOpen(msg) => msg, CanoError::RateLimited { .. } => "rate limited", CanoError::CheckpointStore(msg) => msg, @@ -588,6 +605,7 @@ impl CanoError { CanoError::RetryExhausted { .. } => "retry_exhausted", CanoError::Timeout(_) => "timeout", CanoError::WorkflowTimeout { .. } => "workflow_timeout", + CanoError::Cancelled => "cancelled", CanoError::CircuitOpen(_) => "circuit_open", CanoError::RateLimited { .. 
} => "rate_limited", CanoError::CheckpointStore(_) => "checkpoint_store", @@ -618,6 +636,7 @@ impl std::fmt::Display for CanoError { f, "Workflow total timeout exceeded: elapsed={elapsed:?} limit={limit:?}" ), + CanoError::Cancelled => write!(f, "Workflow cancelled"), CanoError::CircuitOpen(msg) => write!(f, "Circuit open: {msg}"), CanoError::RateLimited { tier, retry_after } => { write!( @@ -713,6 +732,7 @@ impl PartialEq for CanoError { limit: l2, }, ) => e1 == e2 && l1 == l2, + (CanoError::Cancelled, CanoError::Cancelled) => true, (CanoError::CircuitOpen(a), CanoError::CircuitOpen(b)) => a == b, ( CanoError::RateLimited { @@ -991,6 +1011,43 @@ mod tests { assert_ne!(timeout, workflow); } + #[test] + fn test_cancelled_constructor_category_display_and_eq() { + let err = CanoError::cancelled(); + assert_eq!(err.message(), "workflow cancelled"); + assert_eq!(err.category(), "cancelled"); + assert_eq!(err.outer_category(), "cancelled"); + assert_eq!(format!("{err}"), "Workflow cancelled"); + assert_eq!(CanoError::cancelled(), CanoError::Cancelled); + assert_ne!(CanoError::cancelled(), CanoError::timeout("x")); + } + + #[test] + fn test_cancelled_wrapped_in_state_context() { + // How a cancel actually surfaces from orchestrate: wrapped with FSM context. + let wrapped = CanoError::with_state_context( + "Ship", + 0, + vec!["Reserve".into(), "Ship".into()], + CanoError::cancelled(), + ); + // `category()` unwraps `WithStateContext` so alerting still buckets on the cause. + assert_eq!(wrapped.category(), "cancelled"); + assert_eq!(wrapped.outer_category(), "with_state_context"); + assert!(matches!(wrapped.inner(), CanoError::Cancelled)); + // A dirty rollback nests it under CompensationFailed with errors[0] = the wrapped cancel. 
+ let dirty = CanoError::compensation_failed(vec![ + wrapped, + CanoError::task_execution("compensator boom"), + ]); + assert_eq!(dirty.category(), "compensation_failed"); + if let CanoError::CompensationFailed { errors } = &dirty { + assert_eq!(errors[0].category(), "cancelled"); + } else { + panic!("expected CompensationFailed"); + } + } + #[test] fn test_circuit_open_constructor_and_category() { let err = CanoError::circuit_open("breaker tripped"); diff --git a/cano/src/lib.rs b/cano/src/lib.rs index f08514c..3fbc78b 100644 --- a/cano/src/lib.rs +++ b/cano/src/lib.rs @@ -57,7 +57,7 @@ //! .register(Step::Process, ProcessTask) //! .add_exit_state(Step::Done); //! -//! let final_state = workflow.orchestrate(Step::Fetch).await?; +//! let final_state = workflow.orchestrate(Step::Fetch, CancellationToken::disabled()).await?; //! assert_eq!(final_state, Step::Done); //! //! // The sum of 1..=3 is 6. @@ -95,7 +95,7 @@ //! .register(Step::Compute, ComputeTask) //! .add_exit_state(Step::Done); //! -//! let final_state = workflow.orchestrate(Step::Compute).await?; +//! let final_state = workflow.orchestrate(Step::Compute, CancellationToken::disabled()).await?; //! assert_eq!(final_state, Step::Done); //! # Ok(()) //! # } @@ -199,13 +199,25 @@ //! //! ### Timeouts //! -//! Three layered budgets bound a run. [`TaskConfig::with_attempt_timeout`](task::TaskConfig::with_attempt_timeout) +//! Two layered budgets bound a run. [`TaskConfig::with_attempt_timeout`](task::TaskConfig::with_attempt_timeout) //! caps each individual task attempt. [`Workflow::with_total_timeout`] sets a wall-clock //! budget for the entire [`orchestrate`](Workflow::orchestrate) / [`resume_from`](Workflow::resume_from) //! call; when it elapses the in-flight task is aborted, the saga compensation stack drains //! against its own bounded budget (configurable via [`Workflow::with_compensation_timeout`]), -//! and the call returns [`CanoError::WorkflowTimeout`]. 
Contrast with [`Workflow::with_timeout`], -//! a blunt outer `tokio::time::timeout` that offers no graceful compensation. +//! and the call returns [`CanoError::WorkflowTimeout`]. To stop a run on an external signal rather +//! than a deadline, use [cooperative cancellation](#cooperative-cancellation). +//! +//! ### Cooperative cancellation +//! +//! [`Workflow::orchestrate`] (and [`resume_from`](Workflow::resume_from)) +//! take a [`CancellationToken`] obtained from [`CancellationToken::new`]; firing the paired +//! [`CancellationHandle`] aborts the in-flight cancellable task at its next `.await`, drains the +//! saga compensation stack, and returns [`CanoError::Cancelled`]. Cancellation is *cooperative* +//! (a task in tight synchronous work isn't interrupted until it yields) and *saga-safe* (a +//! [`CompensatableTask`] always runs to completion so its rollback entry is recorded; the cancel +//! is honoured at the next state boundary). The compensation drain itself is uncancellable. +//! To opt a run out of cancellation, pass [`CancellationToken::disabled`] — it never fires and is +//! zero-cost (the FSM skips the cancellation `select!`). See the [`cancel`] module. //! //! ## Module Overview //! @@ -215,6 +227,7 @@ //! - [`task::timer`]: The [`TimerTask`] trait — wait-then-transition via `wait()`/`after_wait()`; registered with [`Workflow::register`] //! - [`task::batch`]: The [`BatchTask`] trait — fan-out over data items via `load`/`process_item`/`finish`; registered with [`Workflow::register`] //! - [`task::stepped`]: The [`SteppedTask`] trait — resumable iterative work via `step()` with a serializable cursor; registered with [`Workflow::register_stepped`] (persists the cursor when a checkpoint store is attached) +//! - [`cancel`]: [`CancellationToken`] / [`CancellationHandle`] — cooperative cancellation for [`orchestrate`](Workflow::orchestrate) //! - [`workflow`]: [`Workflow`] — FSM orchestration with Split/Join support //! 
- `scheduler` (requires `scheduler` feature): `Scheduler` (builder) and `RunningScheduler` (live handle) — cron and interval scheduling //! - [`mod@resource`]: [`Resource`] trait, [`Resources`] dictionary, and [`HealthStatus`] — lifecycle-aware resource management and health probes @@ -240,6 +253,7 @@ //! 2. Read the module docs — each module has detailed documentation and examples //! 3. Run benchmarks: `cargo bench --bench workflow_performance` +pub mod cancel; pub mod circuit; pub mod error; pub mod observer; @@ -261,6 +275,7 @@ pub mod scheduler; pub mod testing; // Core public API - simplified imports +pub use cancel::{CancellationHandle, CancellationToken}; pub use circuit::{CircuitBreaker, CircuitPolicy, CircuitState, Permit as CircuitPermit}; pub use error::{CanoError, CanoResult}; pub use observer::WorkflowObserver; @@ -353,14 +368,14 @@ pub mod prelude { //! Use `use cano::prelude::*;` to import the most commonly used types and traits. pub use crate::{ - BatchTask, CanoError, CanoResult, CheckpointRow, CheckpointStore, CircuitBreaker, - CircuitPermit, CircuitPolicy, CircuitState, CompensatableTask, HealthStatus, JoinConfig, - JoinStrategy, MemoryStore, Meter, MeterStatus, MultiPermit, MultiRateLimiter, - PollErrorPolicy, PollOutcome, PollTask, RateLimiter, RateLimiterPermit, RateLimiterPolicy, - Reservation, Resource, Resources, RetryMode, RouterTask, RowKind, SplitResult, - SplitTaskResult, StateEntry, StepOutcome, SteppedTask, Task, TaskConfig, TaskObject, - TaskResult, Tier, TimerOutcome, TimerTask, WindowPermit, WindowPolicy, WindowedRateLimiter, - Workflow, WorkflowObserver, run_stepped, + BatchTask, CancellationHandle, CancellationToken, CanoError, CanoResult, CheckpointRow, + CheckpointStore, CircuitBreaker, CircuitPermit, CircuitPolicy, CircuitState, + CompensatableTask, HealthStatus, JoinConfig, JoinStrategy, MemoryStore, Meter, MeterStatus, + MultiPermit, MultiRateLimiter, PollErrorPolicy, PollOutcome, PollTask, RateLimiter, + RateLimiterPermit, 
RateLimiterPolicy, Reservation, Resource, Resources, RetryMode, + RouterTask, RowKind, SplitResult, SplitTaskResult, StateEntry, StepOutcome, SteppedTask, + Task, TaskConfig, TaskObject, TaskResult, Tier, TimerOutcome, TimerTask, WindowPermit, + WindowPolicy, WindowedRateLimiter, Workflow, WorkflowObserver, run_stepped, }; #[cfg(feature = "scheduler")] diff --git a/cano/src/metrics.rs b/cano/src/metrics.rs index c08098d..60a1b1c 100644 --- a/cano/src/metrics.rs +++ b/cano/src/metrics.rs @@ -102,6 +102,7 @@ pub const OBSERVED_WORKFLOW_TIMEOUT_LIMIT_SECONDS: &str = pub const OBSERVED_WORKFLOW_TIMEOUT_ELAPSED_SECONDS: &str = "cano_observed_workflow_timeout_elapsed_seconds"; pub const OBSERVED_UNKNOWN_RESUME_STATES_TOTAL: &str = "cano_observed_unknown_resume_states_total"; +pub const OBSERVED_CANCELLATIONS_TOTAL: &str = "cano_observed_cancellations_total"; // Always-on direct instrumentation: pub const WORKFLOW_RUNS_TOTAL: &str = "cano_workflow_runs_total"; @@ -194,11 +195,16 @@ pub fn describe() { Unit::Count, "Checkpoint rows whose state label is not registered on the current workflow (emitted by MetricsObserver during resume_from)" ); + describe_counter!( + OBSERVED_CANCELLATIONS_TOTAL, + Unit::Count, + "Workflow runs cancelled via a CancellationToken (emitted by MetricsObserver via on_cancelled)" + ); describe_counter!( WORKFLOW_RUNS_TOTAL, Unit::Count, - "Workflow runs (via Workflow::orchestrate/resume_from), by terminal outcome (completed|failed|timeout)" + "Workflow runs (via Workflow::orchestrate/resume_from), by terminal outcome (completed|failed)" ); describe_histogram!( WORKFLOW_DURATION_SECONDS, @@ -402,6 +408,9 @@ pub(crate) fn observed_workflow_timeout(elapsed: Duration, limit: Duration) { pub(crate) fn observed_unknown_resume_state() { counter!(OBSERVED_UNKNOWN_RESUME_STATES_TOTAL).increment(1); } +pub(crate) fn observed_cancellation() { + counter!(OBSERVED_CANCELLATIONS_TOTAL).increment(1); +} // ----- workflow run ----- diff --git 
a/cano/src/observer.rs b/cano/src/observer.rs index e32d6e0..6099b65 100644 --- a/cano/src/observer.rs +++ b/cano/src/observer.rs @@ -107,6 +107,15 @@ pub trait WorkflowObserver: Send + Sync + 'static { /// wrapped timeout (dirty rollback). fn on_workflow_timeout(&self, _elapsed: std::time::Duration, _limit: std::time::Duration) {} + /// Called when a run is cancelled via a + /// [`CancellationToken`](crate::cancel::CancellationToken) — either observed at a state + /// boundary or while a cancellable task was in flight. `state` is the `Debug` rendering of the + /// state the cancellation was observed at. Fires exactly once per cancelled run, immediately + /// before the compensation stack is drained. Followed on the public API's return by a + /// `CanoError::WithStateContext` wrapping a `CanoError::Cancelled` (clean rollback), or a + /// `CanoError::CompensationFailed` whose `errors[0]` is the wrapped `Cancelled` (dirty rollback). + fn on_cancelled(&self, _state: &str) {} + /// Called when the engine attempted to clear a checkpoint log (after a /// successful run or after a clean compensation drain) and the backend /// returned an error. 
@@ -236,6 +245,9 @@ impl WorkflowObserver for TracingObserver { "workflow total timeout exceeded" ); } + fn on_cancelled(&self, state: &str) { + tracing::warn!(state, "workflow cancelled"); + } fn on_checkpoint_clear_failed(&self, workflow_id: &str, error: &CanoError) { tracing::warn!(workflow_id, error = %error, "checkpoint log clear failed"); } @@ -297,6 +309,9 @@ impl WorkflowObserver for MetricsObserver { fn on_workflow_timeout(&self, elapsed: std::time::Duration, limit: std::time::Duration) { crate::metrics::observed_workflow_timeout(elapsed, limit); } + fn on_cancelled(&self, _state: &str) { + crate::metrics::observed_cancellation(); + } fn on_checkpoint_clear_failed(&self, _workflow_id: &str, _error: &CanoError) { crate::metrics::checkpoint_clear(false); } @@ -382,7 +397,13 @@ mod tests { .add_exit_state(S::Done) .with_observer(Arc::new(obs)); - assert_eq!(workflow.orchestrate(S::Start).await.unwrap(), S::Done); + assert_eq!( + workflow + .orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap(), + S::Done + ); let events = rec.labels(); assert!( @@ -415,7 +436,12 @@ mod tests { .add_exit_state(S::Done) .with_observer(Arc::new(obs)); - assert!(workflow.orchestrate(S::Start).await.is_err()); + assert!( + workflow + .orchestrate(S::Start, CancellationToken::disabled()) + .await + .is_err() + ); let events = rec.labels(); assert!( @@ -463,7 +489,10 @@ mod tests { .add_exit_state(S::Done) .with_observer(Arc::new(obs)); - let err = workflow.orchestrate(S::Start).await.unwrap_err(); + let err = workflow + .orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap_err(); // The FSM wraps the failure with state context; the inner is CircuitOpen. 
assert!(matches!(err.inner(), CanoError::CircuitOpen(_)), "{err}"); @@ -484,7 +513,13 @@ mod tests { let workflow = Workflow::bare() .register(S::Start, OkTask) .add_exit_state(S::Done); - assert_eq!(workflow.orchestrate(S::Start).await.unwrap(), S::Done); + assert_eq!( + workflow + .orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap(), + S::Done + ); } #[test] @@ -580,7 +615,7 @@ mod metrics_observer_tests { .register(S::Start, GoTo(S::Mid)) .register(S::Mid, GoTo(S::Done)) .add_exit_state(S::Done) - .orchestrate(S::Start) + .orchestrate(S::Start, CancellationToken::disabled()) .await }); assert_eq!(res.unwrap(), S::Done); @@ -633,7 +668,7 @@ mod metrics_observer_tests { .with_total_timeout(std::time::Duration::from_millis(20)) .register(S::Start, SlowTask) .add_exit_state(S::Done) - .orchestrate(S::Start) + .orchestrate(S::Start, CancellationToken::disabled()) .await }); assert!(res.is_err()); @@ -681,7 +716,7 @@ mod metrics_observer_tests { .with_observer(Arc::new(MetricsObserver::new())) .register(S::Start, Flaky(n2)) .add_exit_state(S::Done) - .orchestrate(S::Start) + .orchestrate(S::Start, CancellationToken::disabled()) .await }); assert_eq!(res.unwrap(), S::Done); diff --git a/cano/src/scheduler.rs b/cano/src/scheduler.rs index 560db9f..6433bce 100644 --- a/cano/src/scheduler.rs +++ b/cano/src/scheduler.rs @@ -30,6 +30,7 @@ mod backoff; pub use backoff::BackoffPolicy; +use crate::cancel::CancellationHandle; use crate::error::CanoResult; use crate::workflow::Workflow; use chrono::{DateTime, Utc}; @@ -53,6 +54,12 @@ enum SchedulerCommand { id: Arc, response: oneshot::Sender>, }, + /// Request cooperative cancellation of a flow's in-flight run. A no-op when + /// the flow isn't currently running. 
+ Cancel { + id: Arc, + response: oneshot::Sender>, + }, } /// Simplified scheduling options @@ -134,6 +141,11 @@ where schedule: ParsedSchedule, info: Arc>, policy: Arc, + /// Cancellation handle for the flow's *currently executing* run, published + /// by `execute_reserved_flow` while a run is in flight and cleared when it + /// finishes. `None` when the flow is idle. Lets `cancel_flow` and graceful + /// shutdown cooperatively cancel an in-flight run. + cancel: Arc>>, } impl Clone for FlowData @@ -148,6 +160,7 @@ where schedule: self.schedule.clone(), info: Arc::clone(&self.info), policy: self.policy.clone(), + cancel: Arc::clone(&self.cancel), } } } diff --git a/cano/src/scheduler/builder.rs b/cano/src/scheduler/builder.rs index 6378fd6..e9a2f29 100644 --- a/cano/src/scheduler/builder.rs +++ b/cano/src/scheduler/builder.rs @@ -184,6 +184,7 @@ where schedule, info, policy: Arc::new(BackoffPolicy::default()), + cancel: Arc::new(RwLock::new(None)), }, ); self.flow_order.push(id); @@ -323,6 +324,7 @@ where let initial_state = fd.initial_state.clone(); let info = Arc::clone(&fd.info); let policy = fd.policy.clone(); + let cancel = Arc::clone(&fd.cancel); let running_clone = Arc::clone(&running); let notify_clone = Arc::clone(&stop_notify); @@ -334,6 +336,7 @@ where initial_state, info, policy, + cancel, running_clone, notify_clone, interval, @@ -347,6 +350,7 @@ where initial_state, info, policy, + cancel, running_clone, notify_clone, cron_schedule, diff --git a/cano/src/scheduler/loops.rs b/cano/src/scheduler/loops.rs index 6df6dff..bd5af6d 100644 --- a/cano/src/scheduler/loops.rs +++ b/cano/src/scheduler/loops.rs @@ -13,6 +13,7 @@ use tokio::sync::{Notify, RwLock, mpsc, watch}; use tokio::task::{AbortHandle, JoinHandle}; use tokio::time::{Duration, sleep}; +use crate::cancel::{CancellationHandle, CancellationToken}; use crate::error::{CanoError, CanoResult}; use crate::workflow::Workflow; @@ -73,11 +74,13 @@ async fn sleep_unless_stopped( /// Per-flow 
`Every`-schedule loop body. Lives outside `start` so the driver /// task and the loops are decoupled — the driver owns the workflows /// HashMap, the loops just see the data they need. +#[allow(clippy::too_many_arguments)] pub(super) async fn spawn_every_loop( workflow: Arc>, initial_state: TState, info: Arc>, policy: Arc, + cancel: Arc>>, running: Arc>, stop_notify: Arc, interval: Duration, @@ -98,6 +101,7 @@ pub(super) async fn spawn_every_loop( initial_state.clone(), Arc::clone(&info), &policy, + Arc::clone(&cancel), ) .await; } @@ -134,6 +138,7 @@ pub(super) async fn spawn_every_loop( initial_state.clone(), Arc::clone(&info), &policy, + Arc::clone(&cancel), ) .await; } @@ -141,11 +146,13 @@ pub(super) async fn spawn_every_loop( /// Per-flow `Cron`-schedule loop body. See [`spawn_every_loop`] for the /// rationale on splitting the loop bodies out of `start`. +#[allow(clippy::too_many_arguments)] pub(super) async fn spawn_cron_loop( workflow: Arc>, initial_state: TState, info: Arc>, policy: Arc, + cancel: Arc>>, running: Arc>, stop_notify: Arc, schedule: Box, @@ -208,6 +215,7 @@ pub(super) async fn spawn_cron_loop( initial_state.clone(), Arc::clone(&info), &policy, + Arc::clone(&cancel), ) .await; } @@ -258,8 +266,16 @@ pub(super) async fn driver_task( let initial_state = flow.initial_state.clone(); let info = Arc::clone(&flow.info); let policy = Arc::clone(&flow.policy); + let cancel = Arc::clone(&flow.cancel); let handle = tokio::spawn(async move { - execute_reserved_flow(workflow, initial_state, info, &policy).await; + execute_reserved_flow( + workflow, + initial_state, + info, + &policy, + cancel, + ) + .await; }); let mut tasks = scheduler_tasks.write().await; tasks.retain(|h| !h.is_finished()); @@ -299,6 +315,24 @@ pub(super) async fn driver_task( ))) }; + let _ = response.send(outcome); + } + SchedulerCommand::Cancel { id, response } => { + let outcome = if let Some(flow) = workflows.get(&id) { + // Fire the in-flight run's cancellation handle, if any. 
The + // run observes `Cancelled` at its next await, drains its saga, + // and `apply_outcome` returns the flow to `Idle`. A flow that + // isn't currently running has no handle — an idempotent no-op. + if let Some(h) = flow.cancel.read().await.as_ref() { + h.cancel(); + } + Ok(()) + } else { + Err(CanoError::Workflow(format!( + "No workflow registered with id '{id}'" + ))) + }; + let _ = response.send(outcome); } } @@ -313,6 +347,17 @@ pub(super) async fn driver_task( // how long an in-flight workflow takes — not by the schedule interval. stop_notify.notify_waiters(); + // Cooperatively cancel every in-flight run so shutdown latency is bounded by + // the time to the next await + the saga drain, not by how long the workflow + // would naturally take. Each cancelled run drains its compensation stack and + // returns `Cancelled` (recorded as Idle, not a failure, by `apply_outcome`). + // The bounded wait below still caps the total drain time. + for flow in workflows.values() { + if let Some(h) = flow.cancel.read().await.as_ref() { + h.cancel(); + } + } + // Wait for all scheduler loop tasks to finish. // // Pop with a short-lived write lock per iteration (rather than holding @@ -387,6 +432,7 @@ async fn execute_flow( initial_state: TState, info: Arc>, policy: &BackoffPolicy, + cancel: Arc>>, ) where TState: Clone + Send + Sync + 'static + std::fmt::Debug + std::hash::Hash + Eq, TResourceKey: Hash + Eq + Send + Sync + 'static, @@ -398,7 +444,7 @@ async fn execute_flow( return; } - execute_reserved_flow(workflow, initial_state, info, policy).await; + execute_reserved_flow(workflow, initial_state, info, policy, cancel).await; } /// Result of attempting to reserve a flow for dispatch. 
The Tripped and @@ -434,6 +480,7 @@ async fn execute_reserved_flow( initial_state: TState, info: Arc>, policy: &BackoffPolicy, + cancel: Arc>>, ) where TState: Clone + Send + Sync + 'static + std::fmt::Debug + std::hash::Hash + Eq, TResourceKey: Hash + Eq + Send + Sync + 'static, @@ -455,6 +502,13 @@ async fn execute_reserved_flow( .total_timeout .map(|d| (std::time::Instant::now(), d)); + // Publish a fresh cancellation handle for this run so `cancel_flow` and + // graceful shutdown can cooperatively stop it (and drain its saga). A fresh + // token per run means cancelling one run never poisons a later one. Cleared + // below once the run finishes, so a `cancel_flow` on an idle flow is a no-op. + let (handle, token) = CancellationToken::new(); + *cancel.write().await = Some(handle); + // Wrap the workflow future in `catch_unwind`. A panic inside any path // that bypasses the FSM's own `catch_unwind` (e.g. an observer that // panics, a custom checkpoint store that panics) would otherwise abort @@ -465,10 +519,10 @@ async fn execute_reserved_flow( // `BackoffPolicy`. #[cfg(feature = "tracing")] let workflow_fut = workflow - .execute_workflow(initial_state, total_budget) + .execute_workflow(initial_state, total_budget, token) .instrument(tracing::info_span!("execute_flow")); #[cfg(not(feature = "tracing"))] - let workflow_fut = workflow.execute_workflow(initial_state, total_budget); + let workflow_fut = workflow.execute_workflow(initial_state, total_budget, token); let result = match AssertUnwindSafe(workflow_fut).catch_unwind().await { Ok(inner) => inner, @@ -482,6 +536,10 @@ async fn execute_reserved_flow( } }; + // The run is over: drop the handle so a later `cancel_flow` on this now-idle + // flow is a clean no-op rather than firing a stale token. 
+ *cancel.write().await = None; + #[cfg(feature = "metrics")] crate::metrics::scheduler_flow_run(&_flow_id, result.is_ok(), _started.elapsed()); @@ -504,6 +562,14 @@ async fn apply_outcome( info_guard.failure_streak = 0; info_guard.next_eligible = None; } + // A deliberate cancellation (via `cancel_flow` or graceful shutdown) is + // not a fault: return the flow to `Idle` without touching the failure + // streak or backoff window, so its next scheduled run fires normally. A + // *dirty* cancel whose rollback itself failed surfaces as + // `compensation_failed`, which falls through to the backoff arm below. + Err(ref e) if e.category() == "cancelled" => { + info_guard.status = Status::Idle; + } Err(e) => { let err_str: Arc = Arc::from(e.to_string()); let new_streak = info_guard.failure_streak.saturating_add(1); diff --git a/cano/src/scheduler/running.rs b/cano/src/scheduler/running.rs index bdd6e0f..91d4d14 100644 --- a/cano/src/scheduler/running.rs +++ b/cano/src/scheduler/running.rs @@ -210,6 +210,44 @@ where })? } + /// Request cooperative cancellation of a flow's in-flight run. + /// + /// Fires the run's [`CancellationToken`](crate::cancel::CancellationToken): + /// the in-flight workflow aborts at its next await point, drains its saga + /// compensation stack, and returns [`CanoError::Cancelled`]. The flow then + /// returns to [`Status::Idle`](crate::scheduler::Status::Idle) — a deliberate + /// cancel is **not** counted as a failure against the [`BackoffPolicy`](crate::scheduler::BackoffPolicy), + /// so the next scheduled run fires normally. + /// + /// A **no-op** (returns `Ok`) when the flow exists but isn't currently + /// running. Graceful [`stop`](Self::stop) cancels every in-flight flow this + /// same way before draining. + /// + /// # Errors + /// + /// - [`CanoError::Workflow`] — the scheduler is not running, `id` is unknown, + /// or the command queue is full. 
+ pub async fn cancel_flow(&self, id: &str) -> CanoResult<()> { + let (response_tx, response_rx) = oneshot::channel(); + self.command_tx + .try_send(SchedulerCommand::Cancel { + id: Arc::from(id), + response: response_tx, + }) + .map_err(|e| match e { + mpsc::error::TrySendError::Closed(_) => CanoError::Workflow( + "Scheduler not running — call start() before cancel_flow()".to_string(), + ), + mpsc::error::TrySendError::Full(_) => { + CanoError::Workflow("Scheduler command queue full".to_string()) + } + })?; + + response_rx.await.map_err(|_| { + CanoError::Workflow("Scheduler stopped before cancel was processed".to_string()) + })? + } + /// Get a snapshot of the workflow status. pub async fn status(&self, id: &str) -> Option { let info = self.flows.get(id)?; @@ -722,74 +760,6 @@ mod tests { assert!(result.is_ok(), "Test timed out"); } - #[tokio::test(flavor = "multi_thread")] - async fn test_trigger_during_graceful_shutdown_window_reports_not_running() { - // While the driver task is parked waiting for a slow in-flight workflow - // to finish, a concurrent trigger() must surface "not running" instead - // of enqueueing into the closed command channel. - #[derive(Clone)] - struct SlowTask; - - #[task] - impl Task for SlowTask { - async fn run_bare(&self) -> Result, CanoError> { - // Hold Status::Running long enough to span the shutdown window. 
- sleep(Duration::from_millis(400)).await; - Ok(TaskResult::Single(TestState::Complete)) - } - } - - let timeout = Duration::from_secs(5); - let result = tokio::time::timeout(timeout, async { - let mut scheduler: Scheduler = Scheduler::::new(); - let slow_workflow = Workflow::bare() - .register(TestState::Start, SlowTask) - .add_exit_state(TestState::Complete) - .add_exit_state(TestState::Error); - scheduler - .manual("slow_task", slow_workflow, TestState::Start) - .unwrap(); - - let running = scheduler.start().await.unwrap(); - let probe = running.clone(); - - // Kick off the slow workflow and wait until it is actually Running. - probe.trigger("slow_task").await.unwrap(); - sleep(Duration::from_millis(50)).await; - assert!( - probe.has_running_flows().await, - "slow workflow should be Running before stop()" - ); - - // Spawn stop() so we can probe the shutdown window concurrently. - let stop_handle = tokio::spawn(async move { running.stop().await }); - - // Let the driver dequeue Stop and close the command channel. The - // slow workflow is still running (~400ms total), so the driver is - // parked inside has_running_flows() — the shutdown window we want - // to probe. - sleep(Duration::from_millis(50)).await; - assert!( - !stop_handle.is_finished(), - "stop() must still be parked while the slow workflow is in flight" - ); - - // During the window, trigger() must report not-running. - let err = probe.trigger("slow_task").await.unwrap_err(); - assert!( - err.to_string().contains("Scheduler not running"), - "expected not-running during shutdown window, got: {err}" - ); - - // stop() eventually returns Ok (teardown finishes). 
- let stop_result = stop_handle.await.expect("stop task should not panic"); - stop_result.expect("stop should succeed once slow workflow finishes"); - }) - .await; - - assert!(result.is_ok(), "Test timed out"); - } - #[tokio::test(flavor = "multi_thread")] async fn test_failed_workflow_registration() { // Registering a "failing" workflow (one whose post() returns Err) is a @@ -1534,97 +1504,229 @@ mod tests { assert!(result.is_ok(), "Test timed out"); } - #[tokio::test(flavor = "multi_thread")] - async fn drop_aborts_wedged_handle_currently_being_awaited_by_driver() { - // Regression for F9: when the driver pops a JoinHandle from - // `scheduler_tasks` and awaits it, the popped handle no longer lives - // in the Vec. A `Drop` firing while the await is in flight previously - // aborted `driver_handle` (cancelling the driver future, which then - // dropped the popped JoinHandle — detaching the underlying task - // instead of aborting it). The wedged task leaked indefinitely. - // - // Now `RunningScheduler::in_flight_drain` holds the popped handle's - // `AbortHandle` for the duration of the await, so Drop can reach the - // wedged task. This test triggers a workflow whose task sleeps for - // far longer than the test's tolerance, stops the scheduler so the - // driver enters its drain phase, drops the last clone, and asserts - // that the workflow's completion counter never advances. - use std::sync::atomic::{AtomicUsize, Ordering}; - - #[derive(Clone)] - struct SlowTask { - completions: Arc, + // A long-running, cancellable flow task that records when it starts and (if + // never cancelled) when it completes — used to verify graceful shutdown + // cooperatively cancels in-flight flows. 
+ #[derive(Clone)] + struct CancellableSlow { + started: std::sync::Arc, + completed: std::sync::Arc, + } + #[task] + impl Task for CancellableSlow { + fn config(&self) -> crate::task::TaskConfig { + crate::task::TaskConfig::minimal() } - #[task] - impl Task for SlowTask { - fn config(&self) -> crate::task::TaskConfig { - crate::task::TaskConfig::minimal() - } - async fn run_bare(&self) -> Result, CanoError> { - // Sleeps far longer than the test tolerance. If the abort - // doesn't reach this task, the counter eventually ticks up. - sleep(Duration::from_secs(30)).await; - self.completions.fetch_add(1, Ordering::SeqCst); - Ok(TaskResult::Single(TestState::Complete)) - } + async fn run_bare(&self) -> Result, CanoError> { + self.started.fetch_add(1, Ordering::SeqCst); + sleep(Duration::from_secs(30)).await; + self.completed.fetch_add(1, Ordering::SeqCst); + Ok(TaskResult::Single(TestState::Complete)) } + } - let timeout = Duration::from_secs(8); - let result = tokio::time::timeout(timeout, async { - let completions = Arc::new(AtomicUsize::new(0)); + #[tokio::test(flavor = "multi_thread")] + async fn graceful_stop_cancels_in_flight_flow() { + // Graceful shutdown cooperatively cancels a running flow instead of + // blocking until it finishes: `stop()` returns promptly (not after the + // task's 30s sleep) and the task never reaches completion. 
+ let result = tokio::time::timeout(Duration::from_secs(5), async { + let started = std::sync::Arc::new(AtomicU32::new(0)); + let completed = std::sync::Arc::new(AtomicU32::new(0)); let mut scheduler: Scheduler = Scheduler::new(); - scheduler - .manual( - "wedged", - Workflow::bare() - .register( - TestState::Start, - SlowTask { - completions: Arc::clone(&completions), - }, - ) - .add_exit_state(TestState::Complete) - .add_exit_state(TestState::Error), + let wf = Workflow::bare() + .register( TestState::Start, + CancellableSlow { + started: started.clone(), + completed: completed.clone(), + }, ) - .unwrap(); + .add_exit_state(TestState::Complete) + .add_exit_state(TestState::Error); + scheduler.manual("slow", wf, TestState::Start).unwrap(); let running = scheduler.start().await.unwrap(); - running.trigger("wedged").await.unwrap(); - // Give the spawn time to land in scheduler_tasks. - sleep(Duration::from_millis(100)).await; + running.trigger("slow").await.unwrap(); + // Wait until the flow is actually in flight. + while started.load(Ordering::SeqCst) == 0 { + sleep(Duration::from_millis(5)).await; + } + assert!(running.has_running_flows().await); + + let t0 = std::time::Instant::now(); + running.stop().await.expect("graceful stop should succeed"); + assert!( + t0.elapsed() < Duration::from_secs(5), + "stop() must cancel the in-flight flow, not wait for its 30s sleep" + ); + assert_eq!( + completed.load(Ordering::SeqCst), + 0, + "the in-flight flow must be cancelled, not run to completion" + ); + }) + .await; + assert!(result.is_ok(), "Test timed out"); + } - // Spawn stop() so the driver advances into its drain loop and - // pops the wedged Trigger handle. stop() will not return because - // the awaited handle is sleeping for 30s. 
- let running_for_stop = running.clone(); - let stop_handle = tokio::spawn(async move { running_for_stop.stop().await }); + #[tokio::test(flavor = "multi_thread")] + async fn trigger_after_graceful_stop_reports_not_running() { + // Once graceful shutdown has run, the command channel is closed, so a + // subsequent trigger() reports "not running" rather than enqueueing. + let result = tokio::time::timeout(Duration::from_secs(5), async { + let started = std::sync::Arc::new(AtomicU32::new(0)); + let completed = std::sync::Arc::new(AtomicU32::new(0)); + let mut scheduler: Scheduler = Scheduler::new(); + let wf = Workflow::bare() + .register( + TestState::Start, + CancellableSlow { + started: started.clone(), + completed: completed.clone(), + }, + ) + .add_exit_state(TestState::Complete) + .add_exit_state(TestState::Error); + scheduler.manual("slow", wf, TestState::Start).unwrap(); - // Let the driver actually enter the drain phase and pop the handle. - sleep(Duration::from_millis(200)).await; + let running = scheduler.start().await.unwrap(); + running.trigger("slow").await.unwrap(); + while started.load(Ordering::SeqCst) == 0 { + sleep(Duration::from_millis(5)).await; + } + running.stop().await.expect("graceful stop should succeed"); + + let err = running.trigger("slow").await.unwrap_err(); assert!( - !stop_handle.is_finished(), - "stop() should still be parked while the wedged trigger handle is in flight" + err.to_string().contains("Scheduler not running"), + "trigger after shutdown must report not-running, got: {err}" ); + }) + .await; + assert!(result.is_ok(), "Test timed out"); + } - // Drop every clone — the in_flight_drain slot's AbortHandle must - // be used to abort the popped, in-flight handle. 
- drop(stop_handle.abort_handle()); - stop_handle.abort(); - drop(running); + #[tokio::test(flavor = "multi_thread")] + async fn cancel_flow_cancels_in_flight_run_and_returns_to_idle() { + // `cancel_flow` cooperatively cancels the in-flight run; the flow returns + // to Idle (a deliberate cancel is NOT a failure, so the streak stays 0 and + // the flow does not trip) and the task never completes. + let result = tokio::time::timeout(Duration::from_secs(5), async { + let started = std::sync::Arc::new(AtomicU32::new(0)); + let completed = std::sync::Arc::new(AtomicU32::new(0)); + let mut scheduler: Scheduler = Scheduler::new(); + let wf = Workflow::bare() + .register( + TestState::Start, + CancellableSlow { + started: started.clone(), + completed: completed.clone(), + }, + ) + .add_exit_state(TestState::Complete) + .add_exit_state(TestState::Error); + scheduler.manual("slow", wf, TestState::Start).unwrap(); + + let running = scheduler.start().await.unwrap(); + running.trigger("slow").await.unwrap(); + while started.load(Ordering::SeqCst) == 0 { + sleep(Duration::from_millis(5)).await; + } + + running + .cancel_flow("slow") + .await + .expect("cancel_flow should succeed"); - // Wait long enough that, if abort had failed, the slow task - // could have advanced. With the fix, the task is aborted before - // it can increment completions. - sleep(Duration::from_secs(2)).await; + // Wait for the cancelled run's apply_outcome to settle the status. 
+ loop { + let st = running.status("slow").await.unwrap().status; + if st != crate::scheduler::Status::Running { + break; + } + sleep(Duration::from_millis(5)).await; + } + let info = running.status("slow").await.unwrap(); + assert_eq!( + info.status, + crate::scheduler::Status::Idle, + "a cancelled run returns to Idle" + ); + assert_eq!(info.failure_streak, 0, "cancel must not count as a failure"); assert_eq!( - completions.load(Ordering::SeqCst), + completed.load(Ordering::SeqCst), 0, - "wedged spawn must have been aborted by Drop's in_flight_drain abort path" + "task was cancelled, not completed" ); + running.stop().await.unwrap(); }) .await; + assert!(result.is_ok(), "Test timed out"); + } + + #[tokio::test(flavor = "multi_thread")] + async fn cancel_flow_on_idle_flow_is_noop() { + // Cancelling a registered flow that isn't running is an idempotent no-op. + let result = tokio::time::timeout(Duration::from_secs(5), async { + let started = std::sync::Arc::new(AtomicU32::new(0)); + let completed = std::sync::Arc::new(AtomicU32::new(0)); + let mut scheduler: Scheduler = Scheduler::new(); + let wf = Workflow::bare() + .register( + TestState::Start, + CancellableSlow { + started: started.clone(), + completed: completed.clone(), + }, + ) + .add_exit_state(TestState::Complete) + .add_exit_state(TestState::Error); + scheduler.manual("idle", wf, TestState::Start).unwrap(); + let running = scheduler.start().await.unwrap(); + // Never triggered → no in-flight run → cancel is a no-op Ok. 
+ running + .cancel_flow("idle") + .await + .expect("cancel on idle flow is a no-op"); + assert_eq!( + running.status("idle").await.unwrap().status, + crate::scheduler::Status::Idle + ); + running.stop().await.unwrap(); + }) + .await; + assert!(result.is_ok(), "Test timed out"); + } + + #[tokio::test(flavor = "multi_thread")] + async fn cancel_flow_unknown_flow_errors() { + let result = tokio::time::timeout(Duration::from_secs(5), async { + let started = std::sync::Arc::new(AtomicU32::new(0)); + let completed = std::sync::Arc::new(AtomicU32::new(0)); + let mut scheduler: Scheduler = Scheduler::new(); + let wf = Workflow::bare() + .register( + TestState::Start, + CancellableSlow { + started: started.clone(), + completed: completed.clone(), + }, + ) + .add_exit_state(TestState::Complete) + .add_exit_state(TestState::Error); + scheduler.manual("known", wf, TestState::Start).unwrap(); + + let running = scheduler.start().await.unwrap(); + let err = running.cancel_flow("nope").await.unwrap_err(); + assert!( + err.to_string().contains("No workflow registered"), + "unknown flow must error, got: {err}" + ); + running.stop().await.unwrap(); + }) + .await; assert!(result.is_ok(), "Test timed out"); } diff --git a/cano/src/task.rs b/cano/src/task.rs index 1ac6127..e2e7388 100644 --- a/cano/src/task.rs +++ b/cano/src/task.rs @@ -69,7 +69,7 @@ //! let result = Workflow::new(resources) //! .register(Step::Fetch, FetchTask) //! .add_exit_state(Step::Done) -//! .orchestrate(Step::Fetch) +//! .orchestrate(Step::Fetch, CancellationToken::disabled()) //! .await?; //! assert_eq!(result, Step::Done); //! # Ok(()) diff --git a/cano/src/task/batch.rs b/cano/src/task/batch.rs index 505d284..1b6d581 100644 --- a/cano/src/task/batch.rs +++ b/cano/src/task/batch.rs @@ -72,7 +72,7 @@ //! .register(Step::Process, CsvProcessor) //! .add_exit_state(Step::Done); //! -//! let result = workflow.orchestrate(Step::Process).await?; +//! 
let result = workflow.orchestrate(Step::Process, CancellationToken::disabled()).await?; //! assert_eq!(result, Step::Done); //! # Ok(()) //! # } @@ -119,7 +119,7 @@ //! let workflow = Workflow::bare() //! .register(Step::Process, TolerantProcessor) //! .add_exit_state(Step::Done); -//! let result = workflow.orchestrate(Step::Process).await?; +//! let result = workflow.orchestrate(Step::Process, CancellationToken::disabled()).await?; //! assert_eq!(result, Step::Done); //! # Ok(()) //! # } @@ -468,6 +468,7 @@ pub type BatchTaskObject> = #[cfg(test)] mod tests { use super::*; + use crate::cancel::CancellationToken; use crate::resource::Resources; use crate::task; use crate::task::Task; @@ -869,7 +870,10 @@ mod tests { .register(Step::Process, IndexedBatch { n: 3 }) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Process).await.unwrap(); + let result = workflow + .orchestrate(Step::Process, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } @@ -892,6 +896,7 @@ mod tests { #[cfg(all(test, feature = "metrics"))] mod metrics_tests { use super::*; + use crate::cancel::CancellationToken; use crate::metrics::test_support::*; use crate::task::Task; use crate::workflow::Workflow; @@ -939,7 +944,9 @@ mod metrics_tests { let workflow = Workflow::bare() .register(St::Process, ThreeItemBatch) .add_exit_state(St::Done); - workflow.orchestrate(St::Process).await + workflow + .orchestrate(St::Process, CancellationToken::disabled()) + .await }); assert!(result.is_ok(), "workflow should succeed: {result:?}"); assert_eq!( diff --git a/cano/src/task/poll.rs b/cano/src/task/poll.rs index 54a85c5..f0bf88b 100644 --- a/cano/src/task/poll.rs +++ b/cano/src/task/poll.rs @@ -47,7 +47,7 @@ //! .register(Step::Wait, counter) //! .add_exit_state(Step::Done); //! -//! let result = workflow.orchestrate(Step::Wait).await?; +//! let result = workflow.orchestrate(Step::Wait, CancellationToken::disabled()).await?; //! 
assert_eq!(result, Step::Done); //! # Ok(()) //! # } @@ -109,7 +109,7 @@ //! .register(Step::Poll, TraitPoller) //! .add_exit_state(Step::Done); //! -//! let result = workflow.orchestrate(Step::Poll).await?; +//! let result = workflow.orchestrate(Step::Poll, CancellationToken::disabled()).await?; //! assert_eq!(result, Step::Done); //! # Ok(()) //! # } @@ -344,6 +344,7 @@ pub type PollTaskObject> = #[cfg(test)] mod tests { use super::*; + use crate::cancel::CancellationToken; use crate::resource::Resources; use crate::task; use crate::task::Task; @@ -590,7 +591,10 @@ mod tests { // But wait: poll 1 => count becomes 1, 1 < 2 => Pending; poll 2 => count becomes 2, 2 >= 2 => Ready(Done) // But we registered Step::Done as exit state so Done is the final state // Actually CountingPoller returns Single(Step::Done) when ready, so we skip Next entirely - let result = workflow.orchestrate(Step::Wait).await.unwrap(); + let result = workflow + .orchestrate(Step::Wait, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } @@ -754,7 +758,10 @@ mod tests { .add_exit_state(Step::Done); let start = std::time::Instant::now(); - let err = workflow.orchestrate(Step::Wait).await.unwrap_err(); + let err = workflow + .orchestrate(Step::Wait, CancellationToken::disabled()) + .await + .unwrap_err(); let elapsed = start.elapsed(); // The FSM wraps the failure with state context; `.inner()` peels one layer. diff --git a/cano/src/task/router.rs b/cano/src/task/router.rs index 0f6819b..3cd590b 100644 --- a/cano/src/task/router.rs +++ b/cano/src/task/router.rs @@ -53,7 +53,7 @@ //! .register(Step::PathA, DoPathA) //! .add_exit_state(Step::Done); //! -//! let result = workflow.orchestrate(Step::Route).await?; +//! let result = workflow.orchestrate(Step::Route, CancellationToken::disabled()).await?; //! assert_eq!(result, Step::Done); //! # Ok(()) //! # } @@ -84,7 +84,7 @@ //! .register(Step::Route, SimpleRouter) //! .add_exit_state(Step::Done); //! -//! 
let result = workflow.orchestrate(Step::Route).await?; +//! let result = workflow.orchestrate(Step::Route, CancellationToken::disabled()).await?; //! assert_eq!(result, Step::Done); //! # Ok(()) //! # } @@ -209,6 +209,7 @@ pub type RouterTaskObject> = #[cfg(test)] mod tests { use super::*; + use crate::cancel::CancellationToken; use crate::resource::Resources; use crate::task; use crate::task::Task; @@ -357,7 +358,10 @@ mod tests { .register(Step::PathA, PathATask) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Decide).await.unwrap(); + let result = workflow + .orchestrate(Step::Decide, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, Step::Done); } diff --git a/cano/src/task/stepped.rs b/cano/src/task/stepped.rs index 639c525..9f9f751 100644 --- a/cano/src/task/stepped.rs +++ b/cano/src/task/stepped.rs @@ -56,7 +56,7 @@ //! .register(MyState::Process, scanner) //! .add_exit_state(MyState::Done); //! -//! let result = workflow.orchestrate(MyState::Process).await?; +//! let result = workflow.orchestrate(MyState::Process, CancellationToken::disabled()).await?; //! assert_eq!(result, MyState::Done); //! # Ok(()) //! # } @@ -98,7 +98,7 @@ //! .register(MyState::Process, TraitStepper) //! .add_exit_state(MyState::Done); //! -//! let result = workflow.orchestrate(MyState::Process).await?; +//! let result = workflow.orchestrate(MyState::Process, CancellationToken::disabled()).await?; //! assert_eq!(result, MyState::Done); //! # Ok(()) //! 
# } @@ -397,6 +397,7 @@ where #[cfg(test)] mod tests { use super::*; + use crate::cancel::CancellationToken; use crate::resource::Resources; use crate::task; use crate::task::Task; @@ -729,7 +730,10 @@ mod tests { .register(MyState::Next, NextTask) .add_exit_state(MyState::Done); - let result = workflow.orchestrate(MyState::Work).await.unwrap(); + let result = workflow + .orchestrate(MyState::Work, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, MyState::Done); } @@ -787,6 +791,7 @@ mod tests { #[cfg(all(test, feature = "metrics"))] mod metrics_tests { use super::*; + use crate::cancel::CancellationToken; use crate::metrics::test_support::*; use crate::task::Task; use crate::workflow::Workflow; @@ -849,7 +854,9 @@ mod metrics_tests { let workflow = Workflow::bare() .register_stepped(St::Work, TwoMoreOneDone) .add_exit_state(St::Done); - workflow.orchestrate(St::Work).await + workflow + .orchestrate(St::Work, CancellationToken::disabled()) + .await }); assert!(result.is_ok(), "workflow should succeed: {result:?}"); assert_eq!( diff --git a/cano/src/task/timer.rs b/cano/src/task/timer.rs index 9e4cfbc..bd3409d 100644 --- a/cano/src/task/timer.rs +++ b/cano/src/task/timer.rs @@ -63,7 +63,7 @@ //! .register(Step::Wait, CoolDown) //! .add_exit_state(Step::Done); //! -//! let result = workflow.orchestrate(Step::Wait).await?; +//! let result = workflow.orchestrate(Step::Wait, CancellationToken::disabled()).await?; //! assert_eq!(result, Step::Done); //! # Ok(()) //! # } diff --git a/cano/src/testing.rs b/cano/src/testing.rs index 8f1c079..954b82a 100644 --- a/cano/src/testing.rs +++ b/cano/src/testing.rs @@ -49,7 +49,7 @@ //! .register(S::Start, OkTask) //! .add_exit_state(S::Done) //! .with_observer(observer.clone()); -//! assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done); +//! assert_eq!(wf.orchestrate(S::Start, CancellationToken::disabled()).await.unwrap(), S::Done); //! observer.assert_path(&["Start", "Done"]); //! # } //! 
``` @@ -120,6 +120,11 @@ pub enum RecordedEvent { /// The sequence of the last persisted row. sequence: u64, }, + /// A run was cancelled via a [`CancellationToken`](crate::cancel::CancellationToken). + Cancelled { + /// The `Debug` rendering of the state cancellation was observed at. + state: String, + }, } /// A [`WorkflowObserver`] that records every event it @@ -291,6 +296,11 @@ impl WorkflowObserver for RecordingObserver { sequence, }); } + fn on_cancelled(&self, state: &str) { + self.events.lock().push(RecordedEvent::Cancelled { + state: state.into(), + }); + } } /// A process-local [`CheckpointStore`] for resume / @@ -549,7 +559,12 @@ mod tests { .register(S::Start, OkTask) .add_exit_state(S::Done) .with_observer(observer.clone()); - assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done); + assert_eq!( + wf.orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap(), + S::Done + ); observer.assert_path(&["Start", "Done"]); observer.assert_completed_with("Done"); assert!(observer.events().contains(&RecordedEvent::TaskSucceeded { @@ -564,7 +579,9 @@ mod tests { .register(S::Start, OkTask) .add_exit_state(S::Done) .with_observer(observer.clone()); - wf.orchestrate(S::Start).await.unwrap(); + wf.orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap(); assert!(!observer.events().is_empty()); observer.clear(); assert!(observer.events().is_empty()); @@ -687,7 +704,10 @@ mod tests { .register(S::Start, task) .add_exit_state(S::Done) .with_observer(observer.clone()); - let err = wf.orchestrate(S::Start).await.unwrap_err(); + let err = wf + .orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert!(err.to_string().contains("panic"), "{err}"); let retries = observer .events() @@ -710,7 +730,12 @@ mod tests { let wf = Workflow::bare() .register(S::Start, panic_on_attempt(0, S::Done)) .add_exit_state(S::Done); - assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done); + assert_eq!( + 
wf.orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap(), + S::Done + ); } #[test] @@ -734,7 +759,9 @@ mod tests { .register(S::B, Go(S::Done)) .add_exit_state(S::Done) .with_observer(observer.clone()); - wf.orchestrate(S::A).await.unwrap(); + wf.orchestrate(S::A, CancellationToken::disabled()) + .await + .unwrap(); observer .assert_all_states_entered(&[S::A, S::B, S::Done]) .expect("all states visited"); @@ -747,7 +774,9 @@ mod tests { .register(S::A, Go(S::Done)) .add_exit_state(S::Done) .with_observer(observer.clone()); - wf.orchestrate(S::A).await.unwrap(); + wf.orchestrate(S::A, CancellationToken::disabled()) + .await + .unwrap(); let missing = observer .assert_all_states_entered(&[S::A, S::B, S::C, S::Done]) .unwrap_err(); @@ -761,7 +790,9 @@ mod tests { .register(S::A, Go(S::Done)) .add_exit_state(S::Done) .with_observer(observer.clone()); - wf.orchestrate(S::A).await.unwrap(); + wf.orchestrate(S::A, CancellationToken::disabled()) + .await + .unwrap(); let missing = observer .assert_all_states_entered(&[S::A, S::A, S::B]) .unwrap_err(); @@ -776,7 +807,9 @@ mod tests { .register(S::B, Go(S::Done)) .add_exit_state(S::Done) .with_observer(observer.clone()); - wf.orchestrate(S::A).await.unwrap(); + wf.orchestrate(S::A, CancellationToken::disabled()) + .await + .unwrap(); observer .assert_registered_states_entered(&wf) .expect("all registered states visited"); @@ -790,7 +823,9 @@ mod tests { .register(S::C, Go(S::Done)) // never routed to .add_exit_state(S::Done) .with_observer(observer.clone()); - wf.orchestrate(S::A).await.unwrap(); + wf.orchestrate(S::A, CancellationToken::disabled()) + .await + .unwrap(); let missing = observer.assert_registered_states_entered(&wf).unwrap_err(); assert!(missing.contains(&"C".to_string()), "missing={missing:?}"); } diff --git a/cano/src/workflow.rs b/cano/src/workflow.rs index b448392..277f71f 100644 --- a/cano/src/workflow.rs +++ b/cano/src/workflow.rs @@ -77,6 +77,7 @@ use std::hash::Hash; use 
std::sync::{Arc, OnceLock}; use std::time::Duration; +use crate::cancel::CancellationToken; use crate::error::CanoError; use crate::observer::WorkflowObserver; use crate::recovery::CheckpointStore; @@ -213,8 +214,6 @@ where states: HashMap>>, /// Shared resources for all tasks pub(crate) resources: Arc>, - /// Global workflow timeout - workflow_timeout: Option, /// Total wall-clock budget for the entire `orchestrate` / `resume_from` call. /// When set, the FSM aborts the in-flight task at its next await point as soon /// as the budget elapses and drains the compensation stack against @@ -268,7 +267,6 @@ where Self { states: HashMap::new(), resources: Arc::new(resources), - workflow_timeout: None, total_timeout: None, compensation_timeout: None, exit_states: Vec::new(), @@ -283,27 +281,6 @@ where } } - /// Set a blunt wall-clock timeout for the entire `orchestrate` / - /// `resume_from` call. - /// - /// Implemented as a single `tokio::time::timeout` around the workflow - /// future. The in-flight task is dropped at its next await point and the - /// call returns `CanoError::Workflow("Workflow timeout exceeded")` — - /// compensation does **not** run. - /// - /// When [`with_total_timeout`](Self::with_total_timeout) is also set, the - /// engine treats this value as a *floor* on the total budget: the - /// effective wall-clock cap is `min(with_timeout, with_total_timeout)` - /// and the graceful total-timeout path drives it (compensation runs - /// under [`with_compensation_timeout`](Self::with_compensation_timeout), - /// `on_workflow_timeout` fires). This preserves the "with_timeout is a - /// hard upper bound" intent while avoiding a race between two outer - /// timeouts that would drop the inner compensation drain mid-flight. - pub fn with_timeout(mut self, timeout: Duration) -> Self { - self.workflow_timeout = Some(timeout); - self - } - /// Set a wall-clock budget for the entire `orchestrate` (or `resume_from`) call. 
/// /// When the budget elapses, the in-flight task is aborted at its next await @@ -343,7 +320,7 @@ where /// .add_exit_state(Step::Done); /// /// let err = workflow - /// .orchestrate(Step::Start) + /// .orchestrate(Step::Start, CancellationToken::disabled()) /// .await /// .expect_err("budget elapses before Done"); /// // The engine wraps task errors with state context; `.inner()` peels one layer. @@ -638,7 +615,7 @@ where /// .register(Step::Start, NoopTask) /// .add_exit_state(Step::Done) /// .with_observer(counter.clone()); - /// workflow.orchestrate(Step::Start).await?; + /// workflow.orchestrate(Step::Start, CancellationToken::disabled()).await?; /// assert_eq!(counter.0.load(Ordering::Relaxed), 1); /// # Ok(()) /// # } @@ -901,13 +878,35 @@ where /// /// Runs lifecycle setup before execution and teardown after, regardless of outcome. /// + /// `token` controls cooperative cancellation. Drive the run with a [`CancellationToken`] + /// obtained from [`CancellationToken::new`](crate::cancel::CancellationToken::new) and keep the + /// paired [`CancellationHandle`](crate::cancel::CancellationHandle); when the handle's + /// [`cancel`](crate::cancel::CancellationHandle::cancel) fires, the in-flight cancellable task + /// is dropped at its next await point, the saga compensation stack is drained, and the call + /// returns [`CanoError::Cancelled`] (wrapped in [`CanoError::WithStateContext`]; a dirty + /// rollback yields [`CanoError::CompensationFailed`] whose `errors[0]` is the wrapped cancel). + /// To opt a run out of cancellation, pass [`CancellationToken::disabled`] — it never fires and + /// is zero-cost (the FSM skips the cancellation `select!` entirely). 
+ /// + /// Cancellation is cooperative and saga-safe: a task is only interrupted at an `.await`, and a + /// [`CompensatableTask`](crate::saga::CompensatableTask) is never interrupted mid-run (it + /// completes so its rollback entry is recorded, and the cancel is honoured at the next state + /// boundary). The compensation drain itself is uncancellable. See the + /// [`cancel`](crate::cancel) module for the full semantics and precedence rules against + /// [`with_total_timeout`](Self::with_total_timeout). + /// /// # Errors /// /// - [`CanoError::Workflow`] -- no handler is registered for the current state, a single /// task returned a `TaskResult::Split` (use [`Workflow::register_split`] instead), the /// global workflow timeout was exceeded, or a split strategy was misconfigured + /// - [`CanoError::Cancelled`] -- the run was cancelled via `token` (see above) /// - Any [`CanoError`] variant propagated from a task during execution - pub async fn orchestrate(&self, initial_state: TState) -> Result { + pub async fn orchestrate( + &self, + initial_state: TState, + token: CancellationToken, + ) -> Result { #[cfg(feature = "tracing")] let workflow_span = self.tracing_span.clone().unwrap_or_else(|| { if tracing::enabled!(tracing::Level::INFO) { @@ -937,79 +936,48 @@ where self.validate_initial_state(&initial_state)?; self.resources.setup_all().await?; - let result = self.run_workflow(initial_state).await; + let result = self.run_workflow(initial_state, token).await; self.resources .teardown_range(0..self.resources.lifecycle_len()) .await; result } - async fn run_workflow(&self, initial_state: TState) -> Result { + async fn run_workflow( + &self, + initial_state: TState, + token: CancellationToken, + ) -> Result { #[cfg(feature = "metrics")] let _active = crate::metrics::WorkflowActiveGuard::new(); let started = std::time::Instant::now(); let total_budget = self.resolve_total_budget(started); - let workflow_future = self.execute_workflow(initial_state, total_budget); - 
self.await_with_outer_timeout(workflow_future, total_budget, started) - .await + let result = self + .execute_workflow(initial_state, total_budget, token) + .await; + Self::record_run_outcome(&result, started); + result } - /// Resolve the effective wall-clock budget for the entire FSM call. - /// - /// Precedence: - /// 1. Both `with_timeout` and `with_total_timeout` set → graceful - /// total-timeout path with `min(...)` as the budget. Treating - /// `with_timeout` as a floor preserves the user's intent that it is - /// a hard upper bound, while avoiding a race between two outer - /// timeouts that would drop the compensation drain mid-flight. - /// 2. Only total set → graceful total-timeout path. - /// 3. Only `with_timeout` set → legacy blunt `tokio::time::timeout` - /// wrapper applied externally; the FSM loop runs unbudgeted. - /// 4. Neither → zero-cost path. + /// Resolve the wall-clock budget for the entire FSM call: the + /// [`with_total_timeout`](Self::with_total_timeout) duration, or `None` + /// (the zero-cost path) when unset. pub(crate) fn resolve_total_budget( &self, started: std::time::Instant, ) -> Option<(std::time::Instant, Duration)> { - let effective = match (self.workflow_timeout, self.total_timeout) { - (Some(w), Some(t)) => Some(w.min(t)), - (_, Some(t)) => Some(t), - _ => None, - }; - effective.map(|d| (started, d)) - } - - /// Apply the legacy `with_timeout` outer wrapper when (and only when) the - /// graceful total-timeout path is NOT also active. Emits the workflow-run - /// outcome metric exactly once per invocation — on the legacy-timeout - /// path the early return ensures `outcome="timeout"` is recorded - /// *without* a follow-up `outcome="failed"` for the same run; on the - /// non-timeout paths the post-match emission records `completed`/`failed`. - /// - /// Used by both `run_workflow` (forward direction) and - /// `execute_resume_inner` (resume direction) so the precedence rule - /// lives in one place. 
- pub(crate) async fn await_with_outer_timeout( - &self, - fut: F, - total_budget: Option<(std::time::Instant, Duration)>, - #[allow(unused_variables)] started: std::time::Instant, - ) -> Result - where - F: std::future::Future>, - { - let result = match (self.workflow_timeout, total_budget) { - (Some(timeout_duration), None) => { - match tokio::time::timeout(timeout_duration, fut).await { - Ok(inner) => inner, - Err(_) => { - #[cfg(feature = "metrics")] - crate::metrics::workflow_run("timeout", started.elapsed()); - return Err(CanoError::workflow("Workflow timeout exceeded")); - } - } - } - _ => fut.await, - }; + self.total_timeout.map(|d| (started, d)) + } + + /// Emit the workflow-run outcome metric (`completed` / `failed`) once per + /// run. Called by both `run_workflow` (forward) and `execute_resume_inner` + /// (resume) so the emission lives in one place. No-op without the `metrics` + /// feature. + #[cfg_attr(not(feature = "metrics"), allow(unused_variables))] + pub(crate) fn record_run_outcome( + result: &Result, + started: std::time::Instant, + ) { #[cfg(feature = "metrics")] crate::metrics::workflow_run( if result.is_ok() { @@ -1019,7 +987,6 @@ where }, started.elapsed(), ); - result } } @@ -1032,7 +999,6 @@ where Self { states: self.states.clone(), resources: Arc::clone(&self.resources), - workflow_timeout: self.workflow_timeout, total_timeout: self.total_timeout, compensation_timeout: self.compensation_timeout, exit_states: self.exit_states.clone(), @@ -1080,7 +1046,7 @@ where /// let result = Workflow::bare() /// .register(Step::Start, NoopTask) /// .add_exit_state(Step::Done) - /// .orchestrate(Step::Start) + /// .orchestrate(Step::Start, CancellationToken::disabled()) /// .await?; /// assert_eq!(result, Step::Done); /// # Ok(()) @@ -1100,7 +1066,6 @@ where f.debug_struct("Workflow") .field("states", &format!("{} states", self.states.len())) .field("exit_states", &self.exit_states) - .field("workflow_timeout", &self.workflow_timeout) 
.field("total_timeout", &self.total_timeout) .field("compensation_timeout", &self.compensation_timeout) .field("workflow_id", &self.workflow_id) @@ -1151,7 +1116,11 @@ mod metrics_tests { #[test] fn successful_run_records_outcome_duration_and_clears_active_gauge() { - let (res, rows) = run_with_recorder(|| async { ok_workflow().orchestrate(S::Start).await }); + let (res, rows) = run_with_recorder(|| async { + ok_workflow() + .orchestrate(S::Start, CancellationToken::disabled()) + .await + }); assert_eq!(res.unwrap(), S::Done); assert_eq!( counter( @@ -1178,7 +1147,7 @@ mod metrics_tests { Workflow::bare() .register(S::Start, Boom) .add_exit_state(S::Done) - .orchestrate(S::Start) + .orchestrate(S::Start, CancellationToken::disabled()) .await }); assert!(res.is_err()); @@ -1190,47 +1159,12 @@ mod metrics_tests { } #[test] - fn legacy_timeout_on_orchestrate_only_increments_timeout_counter() { - // Regression sentinel for F8: when `with_timeout` fires inside - // `run_workflow`, the early return guarantees only `outcome="timeout"` - // is incremented — not both `timeout` and `failed`. This test asserts - // the forward direction, which has always been correct; a sibling test - // in `compensation::tests` covers the resume direction (which used to - // double-count). 
- struct Slow; - #[crate::task] - impl Task for Slow { - fn config(&self) -> TaskConfig { - TaskConfig::minimal() - } - async fn run_bare(&self) -> Result, CanoError> { - tokio::time::sleep(std::time::Duration::from_millis(500)).await; - Ok(TaskResult::Single(S::Done)) - } - } + fn per_state_task_durations_are_recorded_single_and_split() { let (res, rows) = run_with_recorder(|| async { - Workflow::bare() - .with_timeout(std::time::Duration::from_millis(20)) - .register(S::Start, Slow) - .add_exit_state(S::Done) - .orchestrate(S::Start) + ok_workflow() + .orchestrate(S::Start, CancellationToken::disabled()) .await }); - assert!(res.is_err()); - assert_eq!( - counter(&rows, "cano_workflow_runs_total", &[("outcome", "timeout")]), - 1 - ); - assert_eq!( - counter_opt(&rows, "cano_workflow_runs_total", &[("outcome", "failed")]).unwrap_or(0), - 0, - "legacy timeout must not double-count as both `timeout` and `failed`" - ); - } - - #[test] - fn per_state_task_durations_are_recorded_single_and_split() { - let (res, rows) = run_with_recorder(|| async { ok_workflow().orchestrate(S::Start).await }); assert_eq!(res.unwrap(), S::Done); assert_eq!( histogram_count( @@ -1282,7 +1216,7 @@ mod metrics_tests { JoinConfig::new(JoinStrategy::PartialResults(2), S::Done), ) .add_exit_state(S::Done) - .orchestrate(S::Start) + .orchestrate(S::Start, CancellationToken::disabled()) .await }); assert_eq!(res.unwrap(), S::Done); @@ -1345,7 +1279,10 @@ mod tests { .register(TestState::Start, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1356,7 +1293,10 @@ mod tests { .register(TestState::Process, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + 
let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1371,7 +1311,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); let data: String = store.get("test_key").unwrap(); @@ -1384,7 +1327,9 @@ mod tests { // upfront rather than reaching the FSM loop. let workflow = Workflow::::bare().add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await; + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; let err = result.unwrap_err(); assert_eq!(err.category(), "configuration"); assert!(err.to_string().contains("no registered state handlers")); @@ -1470,7 +1415,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1486,7 +1434,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1496,7 +1447,10 @@ mod tests { let workflow = Workflow::bare() .register(TestState::Start, SimpleTask::new(TestState::Complete)) .add_exit_states([TestState::Complete, TestState::Error]); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1506,7 +1460,10 @@ mod tests { 
let workflow = Workflow::bare() .register(TestState::Start, SimpleTask::new(TestState::Complete)) .add_exit_states([TestState::Complete, TestState::Complete].into_iter()); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1610,7 +1567,7 @@ mod tests { let result = Workflow::bare() .register(TestState::Start, BareWorkflowTask) .add_exit_state(TestState::Complete) - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .unwrap(); assert_eq!(result, TestState::Complete); @@ -1644,7 +1601,7 @@ mod tests { .register_router(TestState::Start, RouteToProcess) .register(TestState::Process, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete) - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .unwrap(); @@ -1770,7 +1727,7 @@ mod tests { .register(TestState::Start, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete) .with_observer(Arc::new(PanickyObserver)) - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .expect("orchestrate must complete despite observer panic"); assert_eq!(result, TestState::Complete); @@ -1800,7 +1757,7 @@ mod tests { .register(TestState::Process, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete) .with_observer(Arc::new(CountThenPanic(Arc::clone(&count)))) - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .expect("orchestrate must complete despite repeated observer panics"); assert_eq!(result, TestState::Complete); @@ -1821,7 +1778,10 @@ mod tests { let wf = Workflow::bare() .register(TestState::Start, start.clone()) .add_exit_state(TestState::Complete); - let result = 
wf.orchestrate(TestState::Complete).await.unwrap(); + let result = wf + .orchestrate(TestState::Complete, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); assert_eq!( start.count(), @@ -1839,7 +1799,10 @@ mod tests { .register(TestState::Start, SimpleTask::new(TestState::Process)) .register(TestState::Process, process.clone()) .add_exit_state(TestState::Process); - let result = wf.orchestrate(TestState::Start).await.unwrap(); + let result = wf + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Process); assert_eq!( process.count(), @@ -1855,7 +1818,10 @@ mod tests { let wf = Workflow::bare() .register(TestState::Start, SimpleTask::new(TestState::Process)) .add_exit_state(TestState::Complete); - let err = wf.orchestrate(TestState::Start).await.unwrap_err(); + let err = wf + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert!(err.to_string().contains("No task registered"), "got: {err}"); } @@ -1866,7 +1832,10 @@ mod tests { let wf = Workflow::bare() .register(TestState::Start, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete); - let err = wf.orchestrate(TestState::Process).await.unwrap_err(); + let err = wf + .orchestrate(TestState::Process, CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.category(), "configuration"); assert!( err.to_string() @@ -1889,7 +1858,10 @@ mod tests { let wf = Workflow::bare() .register(TestState::Start, ReturnsSplit) .add_exit_state(TestState::Complete); - let err = wf.orchestrate(TestState::Start).await.unwrap_err(); + let err = wf + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert!(err.to_string().contains("use register_split"), "got: {err}"); } @@ -1900,7 +1872,10 @@ mod tests { .register(TestState::Start, first.clone()) .register(TestState::Start, SimpleTask::new(TestState::Complete)) // 
replaces `first` .add_exit_state(TestState::Complete); - let result = wf.orchestrate(TestState::Start).await.unwrap(); + let result = wf + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); // the second handler ran assert_eq!(first.count(), 0, "the replaced handler must not run"); } @@ -1936,7 +1911,10 @@ mod tests { }, ) .add_exit_state(TestState::Complete); - let result = wf.orchestrate(TestState::Start).await.unwrap(); + let result = wf + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); assert_eq!(count.load(std::sync::atomic::Ordering::SeqCst), 5); } @@ -1955,7 +1933,7 @@ mod tests { .add_exit_state(TestState::Complete); let result = tokio::time::timeout( std::time::Duration::from_secs(5), - wf.orchestrate(TestState::Start), + wf.orchestrate(TestState::Start, CancellationToken::disabled()), ) .await .expect("orchestrate of an empty split must not hang"); @@ -1993,170 +1971,8 @@ mod tests { } } -/// Edge-case unit tests for `await_with_outer_timeout`. Integration-level -/// coverage already pins down the metric-emission shape end-to-end -/// (`legacy_timeout_on_orchestrate_only_increments_timeout_counter`, -/// `legacy_timeout_on_resume_only_increments_timeout_counter`, -/// `with_timeout_acts_as_floor_when_combined_with_with_total_timeout`, -/// `with_timeout_alone_still_uses_legacy_blunt_timeout`); these tests pin -/// down the helper's behavior in isolation across every -/// `(workflow_timeout, total_budget)` permutation. 
-#[cfg(test)] -mod await_with_outer_timeout_tests { - use super::test_support::TestState; - use super::*; - use std::time::Duration; - - fn workflow_with( - workflow_timeout: Option, - total_timeout: Option, - ) -> Workflow { - let mut w = Workflow::::bare(); - if let Some(d) = workflow_timeout { - w = w.with_timeout(d); - } - if let Some(d) = total_timeout { - w = w.with_total_timeout(d); - } - w - } - - #[tokio::test] - async fn neither_timeout_just_awaits_future() { - let w = workflow_with(None, None); - let started = std::time::Instant::now(); - let out = w - .await_with_outer_timeout( - async { Ok::(TestState::Complete) }, - None, - started, - ) - .await - .unwrap(); - assert_eq!(out, TestState::Complete); - } - - #[tokio::test] - async fn only_with_timeout_passes_through_when_future_is_fast() { - let w = workflow_with(Some(Duration::from_secs(60)), None); - let started = std::time::Instant::now(); - let out = w - .await_with_outer_timeout( - async { Ok::(TestState::Complete) }, - None, - started, - ) - .await - .unwrap(); - assert_eq!(out, TestState::Complete); - } - - #[tokio::test] - async fn only_with_timeout_fires_legacy_timeout_on_slow_future() { - // Slow future + small `with_timeout` and no total_budget → the legacy - // arm fires. Surfaces the documented `CanoError::Workflow("Workflow - // timeout exceeded")` and the helper returns early so the post-match - // emission does not also fire. 
- let w = workflow_with(Some(Duration::from_millis(10)), None); - let started = std::time::Instant::now(); - let err = w - .await_with_outer_timeout( - async { - tokio::time::sleep(Duration::from_secs(1)).await; - Ok::(TestState::Complete) - }, - None, - started, - ) - .await - .expect_err("legacy timeout must fire"); - assert!( - matches!(err, CanoError::Workflow(ref m) if m.contains("Workflow timeout exceeded")), - "expected legacy shape, got: {err}" - ); - assert!( - started.elapsed() < Duration::from_millis(500), - "must bound to the legacy timeout, not the inner sleep" - ); - } - - #[tokio::test] - async fn only_total_budget_skips_legacy_path() { - // The helper's match has `(Some, None)` only — when total_budget is - // Some, the legacy arm is never taken. The slow future is allowed to - // run; here we stop it via a quick inner Ok so the test is fast. - let w = workflow_with(None, Some(Duration::from_millis(10))); - let total_budget = Some((std::time::Instant::now(), Duration::from_millis(10))); - let started = std::time::Instant::now(); - let out = w - .await_with_outer_timeout( - async { Ok::(TestState::Complete) }, - total_budget, - started, - ) - .await - .unwrap(); - assert_eq!(out, TestState::Complete); - } - - #[tokio::test] - async fn both_timeouts_set_skips_legacy_path() { - // Both timeouts: the graceful path drives. Legacy wrapper must NOT - // wrap, otherwise the inner total-budget drain could be cancelled - // mid-flight. Verify by using a slow inner future and a Some - // total_budget: the helper passes the future through without timing - // out (since legacy doesn't apply and we don't simulate the graceful - // path here — that's the FSM loop's job). 
- let w = workflow_with( - Some(Duration::from_millis(5)), - Some(Duration::from_secs(60)), - ); - let total_budget = Some((std::time::Instant::now(), Duration::from_secs(60))); - let started = std::time::Instant::now(); - let out = w - .await_with_outer_timeout( - async { - // ~20ms — well beyond `with_timeout(5ms)` — to prove the - // legacy wrapper isn't applied. - tokio::time::sleep(Duration::from_millis(20)).await; - Ok::(TestState::Complete) - }, - total_budget, - started, - ) - .await - .unwrap(); - assert_eq!(out, TestState::Complete); - assert!( - started.elapsed() >= Duration::from_millis(20), - "future must run to completion; legacy wrapper must NOT be applied" - ); - } - - #[tokio::test] - async fn legacy_path_propagates_inner_errors_unchanged() { - // When the inner future returns Err before the deadline, the helper - // must surface that error unchanged — not convert it to a timeout. - let w = workflow_with(Some(Duration::from_secs(60)), None); - let started = std::time::Instant::now(); - let err = w - .await_with_outer_timeout( - async { Err::(CanoError::task_execution("inner boom")) }, - None, - started, - ) - .await - .expect_err("inner err must propagate"); - assert!( - matches!(err, CanoError::TaskExecution(ref m) if m == "inner boom"), - "must propagate verbatim, got: {err}" - ); - } -} - -/// Edge-case unit tests for `resolve_total_budget`. Verifies the precedence -/// rules separately from the integration-level -/// `with_timeout_acts_as_floor_when_combined_with_with_total_timeout`. +/// Edge-case unit tests for `resolve_total_budget` — the budget is simply the +/// `with_total_timeout` duration (or `None`). 
#[cfg(test)] mod resolve_total_budget_tests { use super::test_support::TestState; @@ -2164,50 +1980,19 @@ mod resolve_total_budget_tests { use std::time::Duration; #[test] - fn neither_set_returns_none() { + fn unset_returns_none() { let w = Workflow::::bare(); assert!(w.resolve_total_budget(std::time::Instant::now()).is_none()); } #[test] - fn only_with_timeout_set_returns_none() { - let w = Workflow::::bare().with_timeout(Duration::from_secs(1)); - assert!( - w.resolve_total_budget(std::time::Instant::now()).is_none(), - "with_timeout alone goes through the legacy wrapper; FSM gets no budget" - ); - } - - #[test] - fn only_total_timeout_set_returns_total() { + fn total_timeout_set_returns_total() { let w = Workflow::::bare().with_total_timeout(Duration::from_secs(7)); let now = std::time::Instant::now(); let (start, limit) = w.resolve_total_budget(now).unwrap(); assert_eq!(start, now); assert_eq!(limit, Duration::from_secs(7)); } - - #[test] - fn both_set_returns_min_via_with_timeout_as_floor() { - // F5: when both are configured the smaller bounds the FSM, so the - // legacy hard cap still applies (graceful path). - let w = Workflow::::bare() - .with_timeout(Duration::from_millis(50)) - .with_total_timeout(Duration::from_secs(60)); - let now = std::time::Instant::now(); - let (_, limit) = w.resolve_total_budget(now).unwrap(); - assert_eq!(limit, Duration::from_millis(50)); - } - - #[test] - fn both_set_total_smaller_returns_total() { - // Symmetric case: total smaller than legacy. - let w = Workflow::::bare() - .with_timeout(Duration::from_secs(60)) - .with_total_timeout(Duration::from_millis(50)); - let (_, limit) = w.resolve_total_budget(std::time::Instant::now()).unwrap(); - assert_eq!(limit, Duration::from_millis(50)); - } } /// Edge-case unit tests for `catch_panic_to_error`. 
The integration-level diff --git a/cano/src/workflow/compensation.rs b/cano/src/workflow/compensation.rs index 71b50d6..63e56fe 100644 --- a/cano/src/workflow/compensation.rs +++ b/cano/src/workflow/compensation.rs @@ -13,6 +13,7 @@ use std::sync::Arc; use futures_util::FutureExt; +use crate::cancel::CancellationToken; use crate::error::CanoError; use crate::recovery::RowKind; use crate::saga::{CompensationEntry, ErasedCompensatable}; @@ -505,6 +506,13 @@ where /// Rows with other kinds — ordinary [`RowKind::StateEntry`] rows and /// [`RowKind::StepCursor`] rows — are ignored by the rehydration. /// + /// `token` controls cooperative cancellation exactly as in + /// [`orchestrate`](Self::orchestrate): firing the paired + /// [`CancellationHandle`](crate::cancel::CancellationHandle) aborts the resumed run at the next + /// await point and drains the rehydrated compensation stack, returning + /// [`CanoError::Cancelled`]. Pass [`CancellationToken::disabled`] to opt out. See the + /// [`cancel`](crate::cancel) module for the full cancellation semantics. + /// /// # Errors /// /// - [`CanoError::Configuration`] — no checkpoint store attached, or the workflow @@ -513,8 +521,13 @@ where /// rows for `workflow_id`. /// - [`CanoError::Workflow`] — the recorded state label doesn't match any state of /// this workflow (e.g. resuming against a different workflow definition). + /// - [`CanoError::Cancelled`] — the resumed run was cancelled via `token`. /// - Any [`CanoError`] propagated from a task during the resumed execution. - pub async fn resume_from(&self, workflow_id: impl Into>) -> Result { + pub async fn resume_from( + &self, + workflow_id: impl Into>, + token: CancellationToken, + ) -> Result { let workflow_id: Arc = workflow_id.into(); #[cfg(feature = "tracing")] @@ -553,7 +566,8 @@ where // run teardown — even the rehydration `?`/`return Err` early-returns // between here and `execute_workflow_from`. 
Wrap the body so a single // `teardown_range` call at the bottom handles every path uniformly. - let result: Result = self.execute_resume_inner(workflow_id, store).await; + let result: Result = + self.execute_resume_inner(workflow_id, store, token).await; self.resources .teardown_range(0..self.resources.lifecycle_len()) .await; @@ -570,6 +584,7 @@ where &self, workflow_id: Arc, store: Arc, + token: CancellationToken, ) -> Result { let mut rows = store.load_run(&workflow_id).await.map_err(|e| { CanoError::checkpoint_store(format!("load checkpoint run {workflow_id:?}: {e}")) @@ -662,15 +677,15 @@ where resume_cursors, prior_transitions, total_budget, + token, ); // Teardown happens in the outer `resume_from` after this function // returns, so this branch only produces the error value and lets - // the caller clean up. `await_with_outer_timeout` owns the - // workflow_run metric emission for both timeout and - // completed/failed outcomes — see its docstring for the - // precedence rules between `with_timeout` and `with_total_timeout`. - self.await_with_outer_timeout(exec, total_budget, started) - .await + // the caller clean up. Emit the workflow-run outcome metric here, the + // same way `run_workflow` does for the forward direction. 
+ let result = exec.await; + Self::record_run_outcome(&result, started); + result } } @@ -707,7 +722,10 @@ mod tests { .with_workflow_id("run-1"); assert_eq!( - workflow.orchestrate(TestState::Start).await.unwrap(), + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); @@ -741,10 +759,16 @@ mod tests { .register(TestState::Start, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete); assert_eq!( - workflow.orchestrate(TestState::Start).await.unwrap(), + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); - let err = workflow.resume_from("whatever").await.unwrap_err(); + let err = workflow + .resume_from("whatever", CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.category(), "configuration"); assert!(err.message().contains("checkpoint store")); } @@ -773,7 +797,10 @@ mod tests { .with_observer(Arc::new(observer)); assert_eq!( - workflow.resume_from("run-2").await.unwrap(), + workflow + .resume_from("run-2", CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); assert_eq!( @@ -822,7 +849,10 @@ mod tests { .with_checkpoint_store(store.clone()); assert_eq!( - workflow.resume_from("done-run").await.unwrap(), + workflow + .resume_from("done-run", CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); assert_eq!( @@ -846,7 +876,10 @@ mod tests { .with_workflow_id("split-run"); assert_eq!( - workflow.orchestrate(TestState::Start).await.unwrap(), + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); @@ -875,7 +908,10 @@ mod tests { .register(TestState::Start, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, 
CancellationToken::disabled()) + .await + .unwrap_err(); // Errors raised inside `execute_workflow_from` are wrapped with state context; // `.inner()` peels one layer back to the underlying checkpoint_store error. assert_eq!(err.inner().category(), "checkpoint_store"); @@ -889,7 +925,10 @@ mod tests { .register(TestState::Start, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); - let err = workflow.resume_from("never-ran").await.unwrap_err(); + let err = workflow + .resume_from("never-ran", CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.category(), "checkpoint_store"); assert!(err.message().contains("no checkpoint rows")); } @@ -908,7 +947,10 @@ mod tests { .register(TestState::Start, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); - let err = workflow.resume_from("wrong-defn").await.unwrap_err(); + let err = workflow + .resume_from("wrong-defn", CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.category(), "workflow"); assert!(err.message().contains("is not a registered or exit state")); } @@ -948,7 +990,10 @@ mod tests { .with_observer(Arc::new(obs)); assert_eq!( - workflow.orchestrate(TestState::Start).await.unwrap(), + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); @@ -1009,7 +1054,10 @@ mod tests { .with_checkpoint_store(store.clone()); assert_eq!( - workflow.resume_from("router-resume").await.unwrap(), + workflow + .resume_from("router-resume", CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); assert_eq!( @@ -1063,7 +1111,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); // Clean rollback → the original 
failure is surfaced, wrapped with state context. assert_eq!(err.inner().category(), "task_execution"); assert_eq!(err.message(), "D forward failed"); @@ -1106,7 +1157,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.message(), "D forward failed"); // Only the two compensatable tasks rolled back — the plain `Process` task didn't. assert_eq!( @@ -1151,7 +1205,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); match err { CanoError::CompensationFailed { errors } => { // [original (D forward failed, wrapped with state context), B's compensate failure]. @@ -1204,7 +1261,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); match err { CanoError::CompensationFailed { errors } => { assert!( @@ -1296,7 +1356,10 @@ mod tests { .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); - let err = workflow.resume_from("saga-run").await.unwrap_err(); + let err = workflow + .resume_from("saga-run", CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.message(), "C forward failed"); // The rehydrated stack [A=7, B=8] drains in reverse, using the outputs persisted // before the crash. C never produced an output (it failed forward). 
@@ -1390,7 +1453,10 @@ mod tests { .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); - let err = workflow.resume_from("mixed-run").await.unwrap_err(); + let err = workflow + .resume_from("mixed-run", CancellationToken::disabled()) + .await + .unwrap_err(); // C failed forward (original error is "C forward failed"). assert_eq!(err.message(), "C forward failed"); @@ -1457,7 +1523,7 @@ mod tests { let s = Arc::clone(&store); handles.push(tokio::spawn(async move { three_state_checkpointed(s, format!("run-{i}")) - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await })); } @@ -1491,7 +1557,7 @@ mod tests { let s = Arc::clone(&store); handles.push(tokio::spawn(async move { three_state_checkpointed(s, "dup") - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await })); } @@ -1530,7 +1596,7 @@ mod tests { .unwrap(); let err = three_state_checkpointed(store.clone(), "run") - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .unwrap_err(); assert_eq!(err.inner().category(), "checkpoint_store"); @@ -1589,7 +1655,10 @@ mod tests { .with_checkpoint_store(store.clone()) .with_workflow_id("disk"); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); match err { CanoError::CompensationFailed { errors } => { // [the append failure that ended the run (now wrapped with state context), @@ -1687,7 +1756,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); match err { CanoError::CompensationFailed { errors } => { assert_eq!(errors[0].message(), "C forward failed"); @@ -1734,7 +1806,10 @@ mod tests { 
.add_exit_state(TestState::Complete); let started = std::time::Instant::now(); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert!( started.elapsed() < Duration::from_secs(5), "a hanging compensator must be bounded, not block the drain forever" @@ -1895,7 +1970,10 @@ mod tests { .add_exit_state(TestState::Complete); let started = std::time::Instant::now(); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); let elapsed = started.elapsed(); // With the bounded drain (50ms cap) the test finishes well under @@ -2091,7 +2169,10 @@ mod tests { .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); - let err = workflow.resume_from("crash-after-b").await.unwrap_err(); + let err = workflow + .resume_from("crash-after-b", CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.message(), "C forward failed"); // B re-ran on resume and re-pushed its entry; the persisted B-completion row at the // resume point must NOT be replayed too, or B would compensate twice. 
Expect exactly @@ -2138,7 +2219,10 @@ mod tests { .with_checkpoint_store(store.clone()); let started = std::time::Instant::now(); - let err = workflow.resume_from("resume-budget").await.unwrap_err(); + let err = workflow + .resume_from("resume-budget", CancellationToken::disabled()) + .await + .unwrap_err(); let elapsed = started.elapsed(); assert!( @@ -2299,7 +2383,10 @@ mod tests { .with_checkpoint_store(store.clone()) .with_workflow_id("tour-interop"); - let result = workflow.orchestrate(TourStage::Route).await.unwrap(); + let result = workflow + .orchestrate(TourStage::Route, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TourStage::Done); // --- assertions on the audit log --- @@ -2375,7 +2462,10 @@ mod tests { .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); - let err = workflow.resume_from("mid-a").await.unwrap_err(); + let err = workflow + .resume_from("mid-a", CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.message(), "C forward failed"); // A re-ran (fresh output 11), B ran (22), C failed → drain B then A. assert_eq!( @@ -2407,7 +2497,10 @@ mod tests { .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()) .with_workflow_id("nope"); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.inner().category(), "task_execution"); // The Start checkpoint row is kept (empty stack ⇒ original error, no `clear`) — // so the run can still be resumed. @@ -2458,7 +2551,10 @@ mod tests { ); } - let err = workflow.orchestrate(0).await.unwrap_err(); + let err = workflow + .orchestrate(0, CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.message(), format!("n{} failed", N - 1)); // States 0..N-1 succeeded forward (the last one failed), so 0..N-1 compensate in reverse. 
let expected: Vec = (0..N - 1).rev().collect(); @@ -2511,7 +2607,7 @@ mod tests { }, ); } - let result = workflow.orchestrate(0).await; + let result = workflow.orchestrate(0, CancellationToken::disabled()).await; if fail_at == N { assert_eq!(result.unwrap(), N, "no failure ⇒ run completes"); assert!( @@ -2590,7 +2686,10 @@ mod tests { .with_workflow_id("step-fwd"); assert_eq!( - workflow.orchestrate(TestState::Start).await.unwrap(), + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); assert_eq!(calls.load(AtomicOrdering::Relaxed), 4); @@ -2660,7 +2759,10 @@ mod tests { .with_checkpoint_store(store.clone()); assert_eq!( - workflow.resume_from("step-resume").await.unwrap(), + workflow + .resume_from("step-resume", CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); // Only 2 step calls: cursor=2→More(3), cursor=3→Done. @@ -2684,7 +2786,10 @@ mod tests { .with_checkpoint_store(store.clone()) .with_workflow_id("dense"); - workflow.orchestrate(TestState::Start).await.unwrap(); + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); let audit = store.audit_rows("dense"); // seq 0: Start (StateEntry), seq 1: cursor=1, seq 2: cursor=2, seq 3: Complete (exit) @@ -2721,7 +2826,10 @@ mod tests { .with_checkpoint_store(store.clone()); assert_eq!( - workflow.resume_from("step-fresh-resume").await.unwrap(), + workflow + .resume_from("step-fresh-resume", CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); // Full 2 steps: None→1, 1→2, 2→Done = 3 calls @@ -2736,7 +2844,7 @@ mod tests { let result = Workflow::bare() .register_stepped(TestState::Start, stepper) .add_exit_state(TestState::Complete) - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .unwrap(); assert_eq!(result, TestState::Complete); @@ -2808,7 +2916,10 @@ mod tests { .add_exit_state(TestState::Complete) 
.with_checkpoint_store(store.clone()); - let err = workflow.resume_from("mixed-stepped").await.unwrap_err(); + let err = workflow + .resume_from("mixed-stepped", CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.message(), "stepper failed"); // A must have been compensated with value 42 from the rehydrated stack. assert_eq!( @@ -2828,7 +2939,10 @@ mod tests { .with_checkpoint_store(store.clone()) .with_workflow_id("ver-run") .with_workflow_version(7); - workflow.orchestrate(TestState::Start).await.unwrap(); + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); let rows = store.audit_rows("ver-run"); assert!(rows.iter().all(|r| r.workflow_version == 7)); assert!(!rows.is_empty(), "expected at least one appended row"); @@ -2849,7 +2963,10 @@ mod tests { .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()) .with_workflow_version(2); - let err = workflow.resume_from("ver-mismatch").await.unwrap_err(); + let err = workflow + .resume_from("ver-mismatch", CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err, CanoError::workflow_version_mismatch(1, 2)); } @@ -2878,7 +2995,7 @@ mod tests { .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); let out = workflow - .resume_from("wf-no-se") + .resume_from("wf-no-se", CancellationToken::disabled()) .await .expect("resume should fall back instead of refusing on missing StateEntry"); assert_eq!(out, TestState::Complete); @@ -2917,7 +3034,7 @@ mod tests { .with_checkpoint_store(store.clone()) .with_workflow_version(1); let out = workflow - .resume_from("mixed-ver") + .resume_from("mixed-ver", CancellationToken::disabled()) .await .expect("mixed-version log with matching tail must resume cleanly"); assert_eq!(out, TestState::Complete); @@ -3007,7 +3124,10 @@ mod tests { .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); - let _ = workflow.resume_from("unsorted").await.unwrap_err(); 
+ let _ = workflow + .resume_from("unsorted", CancellationToken::disabled()) + .await + .unwrap_err(); // LIFO drain: B compensates first (output 2), then A (output 1). // Without the engine-side sort, the rehydrated stack would have been // built in reverse and `A` would have compensated before `B`. @@ -3069,7 +3189,10 @@ mod tests { .register(TestState::Process, FailTask::new(true)) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); // The drain ran compensate (it started) but timed out before finishing. assert!(began.load(Ordering::SeqCst), "compensate must have started"); assert!( @@ -3131,7 +3254,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert!( started.load(Ordering::SeqCst), "task body must have started" @@ -3193,7 +3319,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert!( committed.load(Ordering::SeqCst), "commit must have happened" @@ -3244,7 +3373,10 @@ mod tests { .with_observer(Arc::new(obs)); // Successful run that triggers the clear-on-success path. 
- workflow.orchestrate(TestState::Start).await.unwrap(); + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); let calls = rec.clear_failures(); assert_eq!(calls.len(), 1, "expected one clear-failure event"); @@ -3315,7 +3447,10 @@ mod tests { .add_exit_state(TestState::Complete); let started = std::time::Instant::now(); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); let elapsed = started.elapsed(); assert!( ran.load(std::sync::atomic::Ordering::SeqCst), @@ -3376,7 +3511,10 @@ mod tests { .register_with_compensation(TestState::Start, PanickyCompensate) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); match err.inner() { CanoError::CompensationFailed { errors } => { assert!( @@ -3439,7 +3577,10 @@ mod tests { let workflow = Workflow::bare() .register_with_compensation(TestState::Start, InlineFailingCompensate) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); let errors = match err { CanoError::CompensationFailed { errors } => errors, other => panic!("expected CompensationFailed, got: {other:?}"), @@ -3542,7 +3683,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert!(charged.load(Ordering::SeqCst)); // The error mentions the split rejection. 
assert!(err.message().contains("split"), "got: {err}"); @@ -3611,7 +3755,10 @@ mod tests { .register_with_compensation(TestState::Start, LeakyCharge { log: log.clone() }) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); // The serialize error is what surfaces (wrapped with state context). assert!(err.message().contains("serialize"), "got: {err}"); @@ -3692,7 +3839,10 @@ mod tests { .with_checkpoint_store(store.clone()) .with_observer(Arc::new(obs)); - let err = workflow.resume_from("f3").await.unwrap_err(); + let err = workflow + .resume_from("f3", CancellationToken::disabled()) + .await + .unwrap_err(); // Setup ran and failed. assert!(triggered.load(Ordering::SeqCst)); // The resource error surfaces (not wrapped in WithStateContext, not @@ -3753,7 +3903,10 @@ mod tests { .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); - let err = workflow.resume_from("orphan").await.unwrap_err(); + let err = workflow + .resume_from("orphan", CancellationToken::disabled()) + .await + .unwrap_err(); // The drain produced two errors (original + orphan) → CompensationFailed. let inner = match &err { CanoError::CompensationFailed { errors } => errors.clone(), @@ -3834,7 +3987,10 @@ mod tests { .add_exit_state(TestState::Complete) .with_checkpoint_store(store.clone()); - let err = workflow.resume_from("f9").await.unwrap_err(); + let err = workflow + .resume_from("f9", CancellationToken::disabled()) + .await + .unwrap_err(); // The drain ran two compensators (clean rollback) so the surfaced // error is just the original wrapped failure. let ctx = match &err { @@ -3906,7 +4062,9 @@ mod tests { .with_observer(Arc::new(obs)); // Resume succeeds (label dropped from path; gap surfaced via observer). 
- let _ = workflow.resume_from("f-rename").await; + let _ = workflow + .resume_from("f-rename", CancellationToken::disabled()) + .await; let recorded = rec.unknown_states(); assert_eq!( recorded.len(), @@ -3963,7 +4121,10 @@ mod tests { .with_checkpoint_store(store.clone()); // Case 1: no rows for the id — `load_run` returns empty. - let err = workflow.resume_from("never-existed").await.unwrap_err(); + let err = workflow + .resume_from("never-existed", CancellationToken::disabled()) + .await + .unwrap_err(); assert!(err.message().contains("no checkpoint rows"), "got: {err}"); assert_eq!(setups.load(Ordering::SeqCst), 1, "setup must have run"); assert_eq!( @@ -3980,7 +4141,10 @@ mod tests { ) .await .unwrap(); - let err = workflow.resume_from("ver-mismatch").await.unwrap_err(); + let err = workflow + .resume_from("ver-mismatch", CancellationToken::disabled()) + .await + .unwrap_err(); assert!(matches!(err, CanoError::WorkflowVersionMismatch { .. })); assert_eq!(setups.load(Ordering::SeqCst), 2); assert_eq!( @@ -3997,7 +4161,10 @@ mod tests { ) .await .unwrap(); - let err = workflow.resume_from("bad-label").await.unwrap_err(); + let err = workflow + .resume_from("bad-label", CancellationToken::disabled()) + .await + .unwrap_err(); assert!(err.message().contains("is not a registered or exit state")); assert_eq!(setups.load(Ordering::SeqCst), 3); assert_eq!( @@ -4046,7 +4213,7 @@ mod tests { .with_checkpoint_store(store.clone()); let out = workflow - .resume_from("missing-entry") + .resume_from("missing-entry", CancellationToken::disabled()) .await .expect("resume should fall back rather than refuse on missing StateEntry"); assert_eq!(out, TestState::Complete); @@ -4084,7 +4251,7 @@ mod tests { .with_workflow_version(2); let out = workflow - .resume_from("mixed-ver") + .resume_from("mixed-ver", CancellationToken::disabled()) .await .expect("post-F3, mixed-version log with matching tail must resume"); assert_eq!(out, TestState::Complete); @@ -4348,63 +4515,12 @@ mod 
rehydrated_run_tests { mod metrics_tests { use crate::metrics::test_support::*; use crate::prelude::*; - use crate::recovery::CheckpointRow; use crate::task::TaskConfig; use crate::workflow::test_support::{CompLog, CompTask, MemCheckpoints, SimpleTask, TestState}; use std::sync::{Arc, Mutex}; // ---- Recovery: checkpoint append + clear counters ---- - #[test] - fn legacy_timeout_on_resume_only_increments_timeout_counter() { - // Regression for F8: previously `execute_resume_inner`'s legacy - // `with_timeout` arm recorded `outcome="timeout"` and then fell - // through to the unconditional `workflow_run("timeout"|"failed")` - // emission at the bottom of the function — double-counting the same - // invocation. `run_workflow` in the orchestrate direction already - // used `return Err(...)` to avoid this; the resume path now matches. - struct Slow; - #[crate::task] - impl Task for Slow { - fn config(&self) -> TaskConfig { - TaskConfig::minimal() - } - async fn run_bare(&self) -> Result, CanoError> { - tokio::time::sleep(std::time::Duration::from_millis(500)).await; - Ok(TaskResult::Single(TestState::Complete)) - } - } - - let (res, rows) = run_with_recorder(|| async { - let store = Arc::new(MemCheckpoints::default()); - // Pre-populate so resume_from has something to rehydrate. 
- store - .append( - "wf-legacy-timeout", - CheckpointRow::new(0, "Start", "S").with_workflow_version(0), - ) - .await - .unwrap(); - let workflow = Workflow::bare() - .with_checkpoint_store(store.clone()) - .with_timeout(std::time::Duration::from_millis(20)) - .register(TestState::Start, Slow) - .add_exit_state(TestState::Complete); - workflow.resume_from("wf-legacy-timeout").await - }); - assert!(res.is_err()); - assert_eq!( - counter(&rows, "cano_workflow_runs_total", &[("outcome", "timeout")]), - 1, - "exactly one `outcome=timeout` row should be recorded" - ); - assert_eq!( - counter_opt(&rows, "cano_workflow_runs_total", &[("outcome", "failed")]).unwrap_or(0), - 0, - "resume timeout must not also be counted as `outcome=failed`" - ); - } - #[test] fn checkpoint_append_and_clear_counters_on_successful_run() { let (res, rows) = run_with_recorder(|| async { @@ -4415,7 +4531,9 @@ mod metrics_tests { .register(TestState::Start, SimpleTask::new(TestState::Process)) .register(TestState::Process, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete); - workflow.orchestrate(TestState::Start).await + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await }); assert_eq!(res.unwrap(), TestState::Complete); // At minimum one append per state entered (Start, Process, Complete = 3 rows) @@ -4465,7 +4583,9 @@ mod metrics_tests { ) .register(TestState::Process, AlwaysFailTask) .add_exit_state(TestState::Complete); - workflow.orchestrate(TestState::Start).await + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await }); // Clean rollback: the original error is returned (not CompensationFailed) assert!(res.is_err(), "expected workflow to fail"); diff --git a/cano/src/workflow/execution.rs b/cano/src/workflow/execution.rs index b1e895d..366ca3d 100644 --- a/cano/src/workflow/execution.rs +++ b/cano/src/workflow/execution.rs @@ -14,6 +14,7 @@ use std::sync::Arc; use futures_util::FutureExt; +use 
crate::cancel::CancellationToken; use crate::error::CanoError; use crate::recovery::CheckpointRow; use crate::saga::{CompensationEntry, ErasedCompensatable}; @@ -159,6 +160,7 @@ where &self, initial_state: TState, total_budget: Option<(std::time::Instant, std::time::Duration)>, + token: CancellationToken, ) -> Result { self.execute_workflow_from( initial_state, @@ -168,6 +170,7 @@ where HashMap::new(), Vec::new(), total_budget, + token, ) .await } @@ -214,6 +217,11 @@ where // deadline. `None` is the zero-cost path — dispatch is awaited directly // with no `timeout_at` wrapper. total_budget: Option<(std::time::Instant, std::time::Duration)>, + // Cooperative-cancellation signal. The internal "never" token (used by + // `orchestrate`/`resume_from`) reports `can_cancel() == false`, so the + // dispatch path skips the cancellation `select!` entirely — the existing + // zero-cost behaviour is preserved bit-for-bit. + token: CancellationToken, ) -> Result { let mut current_state = initial_state; let mut sequence = start_sequence; @@ -258,6 +266,28 @@ where }); loop { + // Cooperative cancellation observed at a state boundary: stop before + // entering this state, fire `on_cancelled` once, and drain whatever + // compensatable work has completed so far. `is_cancelled()` is a + // non-blocking poll and is `false` for the "never" token, so this is + // free on the no-token path. `current_state` is not yet pushed onto + // `transitions_so_far`, so the reported path stops *before* the state + // we declined to run. + if token.is_cancelled() { + let label = format!("{current_state:?}"); + notify_observers(&self.observers, |o| o.on_cancelled(&label)); + return self + .wrap_and_drain( + workflow_id.as_deref(), + compensation_stack, + &current_state, + &transitions_so_far, + CanoError::cancelled(), + total_budget, + ) + .await; + } + // The `Debug` label of the state being entered.
Needed for observer // `on_state_enter`, checkpoint rows, and the `metrics` feature's // `state` label; skipped (no allocation) when none are in play — the @@ -402,12 +432,19 @@ where StateEntry::Single { task, config } => { let task_name = task.name(); let fut = self.execute_single_task(task.clone(), Arc::clone(config)); - Self::dispatch_with_budget(step_budget, &self.observers, fut, |o, err| { - // `execute_single_task` fired `on_task_start` inside the - // dropped future; pair it with `on_task_failure` so observer - // gauges (`active_tasks` etc.) remain balanced. - o.on_task_failure(task_name.as_ref(), err); - }) + Self::dispatch_with_budget( + step_budget, + &token, + state_label.as_deref(), + &self.observers, + fut, + |o, err| { + // `execute_single_task` fired `on_task_start` inside the + // dropped future; pair it with `on_task_failure` so observer + // gauges (`active_tasks` etc.) remain balanced. + o.on_task_failure(task_name.as_ref(), err); + }, + ) .await } StateEntry::Router { task, config } => { @@ -415,9 +452,16 @@ where // block above (is_router guard). let task_name = task.name(); let fut = self.execute_single_task(task.clone(), Arc::clone(config)); - Self::dispatch_with_budget(step_budget, &self.observers, fut, |o, err| { - o.on_task_failure(task_name.as_ref(), err); - }) + Self::dispatch_with_budget( + step_budget, + &token, + state_label.as_deref(), + &self.observers, + fut, + |o, err| { + o.on_task_failure(task_name.as_ref(), err); + }, + ) .await } StateEntry::Split { @@ -434,10 +478,11 @@ where // helper invokes `task_failure_fan_out` once per observer, so // formatting inside the closure would re-allocate every time. // `execute_split_join` fires `on_task_start` per branch; on - // outer cancellation those branches are dropped, so we fire a - // synthetic per-branch `on_task_failure` to keep observer - // gauges balanced. 
- let branch_ids: Vec = if step_budget.is_some() { + // outer cancellation OR a total-timeout trip those branches are + // dropped, so we fire a synthetic per-branch `on_task_failure` + // to keep observer gauges balanced — needed whenever the + // dispatch can be aborted (a budget deadline or a live token). + let branch_ids: Vec = if step_budget.is_some() || token.can_cancel() { tasks .iter() .enumerate() @@ -446,11 +491,18 @@ where } else { Vec::new() }; - Self::dispatch_with_budget(step_budget, &self.observers, fut, |o, err| { - for id in &branch_ids { - o.on_task_failure(id, err); - } - }) + Self::dispatch_with_budget( + step_budget, + &token, + state_label.as_deref(), + &self.observers, + fut, + |o, err| { + for id in &branch_ids { + o.on_task_failure(id, err); + } + }, + ) .await } StateEntry::CompensatableSingle { task, config } => { @@ -535,9 +587,16 @@ where &mut sequence, resume_cursor, ); - Self::dispatch_with_budget(step_budget, &self.observers, fut, |o, err| { - o.on_task_failure(task_name.as_ref(), err); - }) + Self::dispatch_with_budget( + step_budget, + &token, + state_label.as_deref(), + &self.observers, + fut, + |o, err| { + o.on_task_failure(task_name.as_ref(), err); + }, + ) .await } }; @@ -562,6 +621,9 @@ where current_state = match step { Ok(s) => s, Err(e) => { + // `on_cancelled` for a mid-task cancel already fired inside + // `dispatch_with_budget` (the between-state case fires in the + // top-of-loop guard), so this arm stays generic. // Route through `wrap_and_drain` so the wrap + bounded-vs- // unbounded decision live in one place (it derives the // attempt count from `e` itself). The bounded drain bounds @@ -585,8 +647,9 @@ where } } - /// Wrap a state-dispatch future in the per-iteration step-budget, or - /// pass it through unchanged when no total budget is active. 
+ /// Wrap a state-dispatch future in the per-iteration step-budget and race it + /// against the cancellation `token`, or pass it through unchanged when neither + /// is active. /// /// When the wrapped future trips the deadline, the engine synthesizes a /// `WorkflowTimeout` error, fires `on_workflow_timeout` once per @@ -597,14 +660,22 @@ where /// `on_task_start` already fired by the dropped inner future is paired /// with a matching `on_task_failure`). /// - /// `fut.await` is the zero-cost path when `step_budget` is `None`; no - /// observer plumbing runs in that case. + /// When the `token` fires first, the dispatch is dropped, the same + /// `task_failure_fan_out` runs (gauge balance), `on_cancelled(state_label)` + /// fires once, and `CanoError::Cancelled` is returned. This is the *only* + /// place a token-driven mid-task cancel is recognized — the caller's error + /// arm stays generic. + /// + /// `fut.await` is the zero-cost path when `step_budget` is `None` and the + /// token can never fire; no observer plumbing runs in that case. async fn dispatch_with_budget( step_budget: Option<( std::time::Instant, std::time::Duration, tokio::time::Instant, )>, + token: &CancellationToken, + state_label: Option<&str>, observers: &[Arc], fut: F, task_failure_fan_out: impl Fn(&dyn crate::observer::WorkflowObserver, &CanoError), @@ -612,20 +683,51 @@ where where F: std::future::Future>, { - let Some((start, limit, deadline)) = step_budget else { - return fut.await; + let fan = &task_failure_fan_out; + // The existing budget logic, untouched: `timeout_at` when a budget is set, + // otherwise a bare `fut.await`. Captured as a future so the cancellation + // arm below can race it. `async move` takes ownership of `fut`; `observers` + // and `fan` are `Copy` references, so they remain usable in the cancel arm. 
+ let budgeted = async move { + let Some((start, limit, deadline)) = step_budget else { + return fut.await; + }; + match tokio::time::timeout_at(deadline, fut).await { + Ok(inner) => inner, + Err(_) => { + let elapsed = start.elapsed(); + let err = CanoError::workflow_timeout(elapsed, limit); + notify_observers(observers, |o| { + fan(o, &err); + o.on_workflow_timeout(elapsed, limit); + }); + Err(err) + } + } }; - match tokio::time::timeout_at(deadline, fut).await { - Ok(inner) => inner, - Err(_) => { - let elapsed = start.elapsed(); - let err = CanoError::workflow_timeout(elapsed, limit); - notify_observers(observers, |o| { - task_failure_fan_out(o, &err); - o.on_workflow_timeout(elapsed, limit); - }); + + // Zero-cost path: the "never" token can't fire, so skip the `select!` + // entirely and run the budgeted future exactly as before. + if !token.can_cancel() { + return budgeted.await; + } + + // Race cancellation against the budgeted dispatch. `biased` checks the + // cancel arm first so cancellation deterministically wins a tie against + // the per-state timeout. On cancel the inner `fut` is dropped (for splits + // this drops the `JoinSet`, aborting its children); we fire the same + // per-task fan-out the timeout path uses so observer gauges stay balanced. + tokio::select! { + biased; + _ = token.cancelled() => { + let err = CanoError::cancelled(); + notify_observers(observers, |o| fan(o, &err)); + if let Some(label) = state_label { + notify_observers(observers, |o| o.on_cancelled(label)); + } Err(err) } + res = budgeted => res, } } @@ -709,7 +811,9 @@ where pub(super) fn attempts_from_error(err: &CanoError) -> u32 { match err { CanoError::RetryExhausted { attempts, .. } => *attempts, - CanoError::CircuitOpen(_) | CanoError::WorkflowTimeout { .. } => 0, + CanoError::CircuitOpen(_) + | CanoError::WorkflowTimeout { .. } + | CanoError::Cancelled => 0, CanoError::WithStateContext { source, .. 
} => Self::attempts_from_error(source), _ => 1, } @@ -1215,7 +1319,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1233,7 +1340,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1252,7 +1362,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1272,7 +1385,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1290,7 +1406,9 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await; + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; assert!(result.is_err()); } @@ -1309,7 +1427,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + 
.orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1336,35 +1457,13 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await; + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; assert!(result.is_err()); assert!(result.unwrap_err().to_string().contains("timeout")); } - #[tokio::test] - async fn test_workflow_timeout() { - // Task that sleeps longer than workflow timeout - #[derive(Clone)] - struct SlowTask; - - #[task] - impl Task for SlowTask { - async fn run_bare(&self) -> Result, CanoError> { - tokio::time::sleep(Duration::from_millis(200)).await; - Ok(TaskResult::Single(TestState::Complete)) - } - } - - let workflow = Workflow::bare() - .with_timeout(Duration::from_millis(50)) - .register(TestState::Start, SlowTask) - .add_exit_state(TestState::Complete); - - let result = workflow.orchestrate(TestState::Start).await; - assert!(result.is_err()); - assert!(result.unwrap_err().to_string().contains("Workflow timeout")); - } - #[tokio::test] async fn test_split_with_data_sharing() { let store = crate::store::MemoryStore::new(); @@ -1382,7 +1481,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); // Verify all tasks wrote their data @@ -1416,7 +1518,10 @@ mod tests { .register(TestState::Process, SimpleTask::new(TestState::Complete)) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); 
assert_eq!(result, TestState::Complete); // Verify all data was written @@ -1473,7 +1578,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1528,7 +1636,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1556,7 +1667,9 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await; + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; assert!(result.is_err()); assert!(result.unwrap_err().to_string().contains("timeout")); } @@ -1606,7 +1719,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1662,7 +1778,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1696,7 +1815,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = 
workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1721,7 +1843,9 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await; + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; assert!(result.is_err()); assert!( result @@ -1738,7 +1862,10 @@ mod tests { let workflow = Workflow::bare() .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1750,7 +1877,10 @@ mod tests { let workflow = Workflow::bare() .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert!( matches!(err, CanoError::Configuration(_)), "expected Configuration error, got {err:?}" @@ -1768,7 +1898,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); assert_eq!( - workflow.orchestrate(TestState::Start).await.unwrap(), + workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(), TestState::Complete ); @@ -1777,7 +1910,12 @@ mod tests { let workflow2 = Workflow::bare() .register_split(TestState::Start, tasks_fail, join_config2) .add_exit_state(TestState::Complete); - assert!(workflow2.orchestrate(TestState::Start).await.is_err()); + assert!( + workflow2 + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await 
+ .is_err() + ); } #[tokio::test] @@ -1791,7 +1929,10 @@ mod tests { let workflow = Workflow::bare() .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert!( matches!(err, CanoError::Configuration(_)), "expected Configuration error, got {err:?}" @@ -1805,7 +1946,10 @@ mod tests { let workflow = Workflow::bare() .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1816,7 +1960,10 @@ mod tests { let workflow = Workflow::bare() .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -1825,7 +1972,9 @@ mod tests { // No exit states means validate() rejects the workflow before any task runs. 
let workflow = Workflow::bare().register(TestState::Start, SimpleTask::new(TestState::Complete)); - let result = workflow.orchestrate(TestState::Start).await; + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; let err = result.unwrap_err(); assert_eq!(err.category(), "configuration"); assert!(err.to_string().contains("no exit states")); @@ -1846,7 +1995,9 @@ mod tests { let workflow = Workflow::bare() .register(TestState::Start, SplitReturningTask) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await; + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; assert!(result.is_err()); assert!(result.unwrap_err().to_string().contains("register_split")); } @@ -1883,7 +2034,9 @@ mod tests { .register(TestState::Start, task) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await; + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; assert!(result.is_err()); // With max_retries=2, there should be exactly 3 attempts (1 initial + 2 retries). @@ -1936,7 +2089,9 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await; + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; assert!(result.is_ok(), "workflow should succeed after retries"); assert_eq!( call_count.load(Ordering::SeqCst), @@ -1969,7 +2124,7 @@ mod tests { .add_exit_state(TestState::Complete); let err = workflow - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .expect_err("panic must surface as Err"); // The FSM wraps the failure with state context; `.inner()` peels one layer. 
@@ -1993,7 +2148,7 @@ mod tests { .add_exit_state(TestState::Complete); let err = workflow - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .expect_err("split panic must surface as Err"); let msg = err.to_string(); @@ -2056,7 +2211,10 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); let observed = max.load(Ordering::SeqCst); assert!( @@ -2075,7 +2233,7 @@ mod tests { .add_exit_state(TestState::Complete); let err = workflow - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .expect_err("bulkhead=0 must error"); assert!(matches!(err, CanoError::Configuration(_)), "got {err:?}"); @@ -2104,7 +2262,7 @@ mod tests { let err = Workflow::bare() .register(TestState::Start, SlowTask) .add_exit_state(TestState::Complete) - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .expect_err("expected attempt timeout to exhaust retries"); // The FSM wraps the failure with state context; `.inner()` peels one layer. @@ -2158,7 +2316,9 @@ mod tests { .register_split(TestState::Start, tasks, join_config) .add_exit_state(TestState::Complete); - let _ = workflow.orchestrate(TestState::Start).await; + let _ = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; assert!( matches!(breaker.state(), CircuitState::Open { .. 
}), "shared breaker must trip after 4 concurrent failures, got {:?}", @@ -2203,7 +2363,7 @@ mod tests { let result = Workflow::bare() .register_stepped(TestState::Start, Counter { target: 5 }) .add_exit_state(TestState::Complete) - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .unwrap(); assert_eq!(result, TestState::Complete); @@ -2234,7 +2394,7 @@ mod tests { let err = Workflow::bare() .register_stepped(TestState::Start, SplitStepper) .add_exit_state(TestState::Complete) - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .expect_err("split result from stepped must error"); // The FSM wraps the failure with state context; `.inner()` peels one layer. @@ -2270,7 +2430,10 @@ mod tests { .register(TestState::Process, FailNoRetry) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); match err { CanoError::WithStateContext { state, @@ -2306,7 +2469,7 @@ mod tests { let err = Workflow::bare() .register(TestState::Start, AlwaysFails) .add_exit_state(TestState::Complete) - .orchestrate(TestState::Start) + .orchestrate(TestState::Start, CancellationToken::disabled()) .await .unwrap_err(); match err { @@ -2338,7 +2501,10 @@ mod tests { .register(TestState::Process, FailTask::new(true)) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); match err { CanoError::WithStateContext { transitions_so_far, @@ -2392,7 +2558,10 @@ mod tests { }, ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + 
.unwrap_err(); match err { CanoError::WithStateContext { attempt, source, .. @@ -2422,7 +2591,10 @@ mod tests { }, ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); // The engine wraps the timeout with state context; `.inner()` peels one layer. assert!( matches!(err.inner(), CanoError::WorkflowTimeout { .. }), @@ -2430,64 +2602,6 @@ mod tests { ); } - #[tokio::test] - async fn with_timeout_acts_as_floor_when_combined_with_with_total_timeout() { - // Regression for F5: previously, when both `with_timeout(d1)` and - // `with_total_timeout(d2)` were set, the legacy `with_timeout` was - // silently disabled (the total-timeout path won outright). Users who - // composed them expecting `with_timeout` to act as a hard upper bound - // would lose that guardrail. The engine now treats their min as the - // effective graceful budget, so `with_timeout=10ms` still bounds the - // run even when `with_total_timeout=60s` is configured. - let workflow = Workflow::bare() - .with_timeout(Duration::from_millis(10)) - .with_total_timeout(Duration::from_secs(60)) - .register( - TestState::Start, - SleepyTask { - sleep_ms: 1_000, - next: TestState::Complete, - }, - ) - .add_exit_state(TestState::Complete); - let started = std::time::Instant::now(); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); - let elapsed = started.elapsed(); - assert!( - elapsed < Duration::from_millis(500), - "with_timeout (10ms) must still bound the run when total_timeout is also set; took {elapsed:?}" - ); - // The graceful path produces `WorkflowTimeout`; the legacy path produces - // `CanoError::Workflow("Workflow timeout exceeded")`. The composition - // takes the graceful path. - assert!( - matches!(err.inner(), CanoError::WorkflowTimeout { .. 
}), - "expected graceful WorkflowTimeout, got: {err}" - ); - } - - #[tokio::test] - async fn with_timeout_alone_still_uses_legacy_blunt_timeout() { - // Sanity for F5: when only `with_timeout` is set the legacy path is - // unchanged — surface `CanoError::Workflow("Workflow timeout exceeded")` - // (the documented legacy shape) and run no compensation. - let workflow = Workflow::bare() - .with_timeout(Duration::from_millis(10)) - .register( - TestState::Start, - SleepyTask { - sleep_ms: 1_000, - next: TestState::Complete, - }, - ) - .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); - assert!( - matches!(err, CanoError::Workflow(ref m) if m.contains("Workflow timeout exceeded")), - "with_timeout alone must surface the legacy blunt-timeout shape, got: {err}" - ); - } - #[tokio::test] async fn total_timeout_fires_on_workflow_timeout_observer_hook() { let (obs, rec) = EventLog::new(); @@ -2502,7 +2616,9 @@ mod tests { }, ) .add_exit_state(TestState::Complete); - let _ = workflow.orchestrate(TestState::Start).await; + let _ = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await; let events = rec.timeouts(); assert_eq!(events.len(), 1, "hook should fire exactly once"); let (elapsed, limit) = events[0]; @@ -2568,7 +2684,10 @@ mod tests { ) .add_exit_state(TestState::Complete); - let err = workflow.orchestrate(TestState::Start).await.unwrap_err(); + let err = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap_err(); // Clean rollback → original error surfaced, wrapped with state context. assert!( matches!(err.inner(), CanoError::WorkflowTimeout { .. }), @@ -2632,7 +2751,10 @@ mod tests { // The compensatable task runs ~80ms despite the 20ms total budget — and the // workflow exits cleanly (no compensation runs, no error surfaces) because // there's no further state for the budget to cancel. 
- let outcome = workflow.orchestrate(TestState::Start).await.unwrap(); + let outcome = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(outcome, TestState::Complete); let log_entries = log.lock().unwrap().clone(); assert_eq!(log_entries, vec!["ran".to_string()]); @@ -2854,6 +2976,8 @@ mod dispatch_with_budget_tests { let call_count = Arc::new(AtomicUsize::new(0)); let cc = Arc::clone(&call_count); let result = Workflow::::dispatch_with_budget( + None, + &CancellationToken::disabled(), None, &observers, async { Ok::(TestState::Complete) }, @@ -2884,6 +3008,8 @@ mod dispatch_with_budget_tests { let cc = Arc::clone(&call_count); let result = Workflow::::dispatch_with_budget( budget_for(Duration::from_secs(60)), + &CancellationToken::disabled(), + None, &observers, async { Ok::(TestState::Complete) }, |_o, _err| { @@ -2915,6 +3041,8 @@ mod dispatch_with_budget_tests { Duration::from_millis(50), tokio::time::Instant::now() - Duration::from_millis(1), )), + &CancellationToken::disabled(), + None, &observers, async { tokio::time::sleep(Duration::from_secs(60)).await; @@ -2959,6 +3087,8 @@ mod dispatch_with_budget_tests { Duration::from_millis(10), tokio::time::Instant::now() - Duration::from_millis(1), )), + &CancellationToken::disabled(), + None, &observers, async { tokio::time::sleep(Duration::from_secs(60)).await; @@ -2993,6 +3123,8 @@ mod dispatch_with_budget_tests { Duration::from_millis(5), tokio::time::Instant::now() - Duration::from_millis(1), )), + &CancellationToken::disabled(), + None, &observers, async { tokio::time::sleep(Duration::from_secs(60)).await; @@ -3027,6 +3159,8 @@ mod dispatch_with_budget_tests { let observers: [Arc; 0] = []; let err = Workflow::::dispatch_with_budget( budget_for(Duration::from_secs(60)), + &CancellationToken::disabled(), + None, &observers, async { Err::(CanoError::task_execution("custom inner err")) }, |_o, _err| {}, @@ -3038,4 +3172,139 @@ mod dispatch_with_budget_tests { 
"must propagate the inner err verbatim, got: {err}" ); } + + // ----- cancellation path (token can fire ⇒ the `select!` arm is taken) ----- + + #[tokio::test] + async fn live_token_uncancelled_passes_future_through() { + // A real (not `never`) token that never fires takes the `select!` path, but the + // `res = budgeted` arm must still return the future's Ok with no observer events. + let (observer, rec) = EventLog::new(); + let observer_dyn: Arc = Arc::new(observer); + let observers = [observer_dyn]; + let (_handle, token) = CancellationToken::new(); + let result = Workflow::::dispatch_with_budget( + None, + &token, + None, + &observers, + async { Ok::(TestState::Complete) }, + |o, err| o.on_task_failure("synthetic", err), + ) + .await + .expect("uncancelled live token returns the future's Ok"); + assert_eq!(result, TestState::Complete); + assert!( + rec.is_empty(), + "no observer events when the task completes normally" + ); + } + + #[tokio::test] + async fn live_token_with_expired_budget_still_times_out() { + // Regression: adding the cancellation race must not break the budget timeout. + // A live-but-uncancelled token + an already-expired deadline must still trip + // `WorkflowTimeout` (and fire the timeout hooks, not `on_cancelled`). + let (observer, rec) = EventLog::new(); + let observer_dyn: Arc = Arc::new(observer); + let observers = [observer_dyn]; + let (_handle, token) = CancellationToken::new(); + let err = Workflow::::dispatch_with_budget( + Some(( + std::time::Instant::now() - Duration::from_secs(1), + Duration::from_millis(5), + tokio::time::Instant::now() - Duration::from_millis(1), + )), + &token, + Some("StateX"), + &observers, + async { + tokio::time::sleep(Duration::from_secs(60)).await; + Ok::(TestState::Complete) + }, + |o, err| o.on_task_failure("synthetic", err), + ) + .await + .expect_err("expired budget must trip even with a live token"); + assert!( + matches!(err, CanoError::WorkflowTimeout { .. 
}), + "got: {err}" + ); + let labels = rec.labels(); + assert!( + labels.iter().any(|l| l.starts_with("workflow_timeout:")), + "timeout hook fired: {labels:?}" + ); + assert!( + !labels.iter().any(|l| l.starts_with("cancelled:")), + "on_cancelled must NOT fire on a timeout: {labels:?}" + ); + } + + #[tokio::test] + async fn precancelled_token_returns_cancelled_and_fires_hooks() { + // The new cancel arm: a pre-cancelled token (biased `select!` picks it) must drop + // the inner future, return `Cancelled`, fire the per-task fan-out (gauge balance) + // AND `on_cancelled(state_label)` — but not the timeout hook. + let (observer, rec) = EventLog::new(); + let observer_dyn: Arc = Arc::new(observer); + let observers = [observer_dyn]; + let (handle, token) = CancellationToken::new(); + handle.cancel(); + let err = Workflow::::dispatch_with_budget( + None, + &token, + Some("StateX"), + &observers, + async { + tokio::time::sleep(Duration::from_secs(60)).await; + Ok::(TestState::Complete) + }, + |o, err| o.on_task_failure("synthetic", err), + ) + .await + .expect_err("pre-cancelled token returns Cancelled"); + assert!(matches!(err, CanoError::Cancelled), "got: {err}"); + let labels = rec.labels(); + assert!( + labels.iter().any(|l| l == "task_failure:synthetic"), + "fan-out (gauge balance) fired: {labels:?}" + ); + assert!( + labels.iter().any(|l| l == "cancelled:StateX"), + "on_cancelled fired with the state label: {labels:?}" + ); + assert!( + !labels.iter().any(|l| l.starts_with("workflow_timeout:")), + "timeout hook must NOT fire on a cancel: {labels:?}" + ); + } + + #[tokio::test] + async fn precancelled_token_without_state_label_skips_on_cancelled() { + // When no state label is available (no observers/checkpoint/metrics need it), + // the cancel arm still returns `Cancelled` and runs the fan-out, but does not + // attempt to fire `on_cancelled`. 
+ let (observer, rec) = EventLog::new(); + let observer_dyn: Arc = Arc::new(observer); + let observers = [observer_dyn]; + let (handle, token) = CancellationToken::new(); + handle.cancel(); + let err = Workflow::::dispatch_with_budget( + None, + &token, + None, // no label + &observers, + async { Ok::(TestState::Complete) }, + |o, err| o.on_task_failure("synthetic", err), + ) + .await + .expect_err("pre-cancelled token returns Cancelled"); + assert!(matches!(err, CanoError::Cancelled), "got: {err}"); + let labels = rec.labels(); + assert!( + !labels.iter().any(|l| l.starts_with("cancelled:")), + "on_cancelled must not fire without a state label: {labels:?}" + ); + } } diff --git a/cano/src/workflow/test_support.rs b/cano/src/workflow/test_support.rs index e280ac1..9a7f78c 100644 --- a/cano/src/workflow/test_support.rs +++ b/cano/src/workflow/test_support.rs @@ -265,6 +265,9 @@ pub(crate) enum TestEvent { elapsed: Duration, limit: Duration, }, + Cancelled { + state: String, + }, CheckpointClearFailed { workflow_id: String, error: String, @@ -334,6 +337,11 @@ impl WorkflowObserver for EventLog { fn on_workflow_timeout(&self, elapsed: Duration, limit: Duration) { self.0.record(TestEvent::WorkflowTimeout { elapsed, limit }); } + fn on_cancelled(&self, state: &str) { + self.0.record(TestEvent::Cancelled { + state: state.to_string(), + }); + } fn on_checkpoint_clear_failed(&self, workflow_id: &str, error: &CanoError) { self.0.record(TestEvent::CheckpointClearFailed { workflow_id: workflow_id.to_string(), @@ -377,6 +385,7 @@ impl Recorder { TestEvent::WorkflowTimeout { limit, .. } => { format!("workflow_timeout:{}ms", limit.as_millis()) } + TestEvent::Cancelled { state } => format!("cancelled:{state}"), TestEvent::CheckpointClearFailed { workflow_id, .. 
} => { format!("checkpoint_clear_failed:{workflow_id}") } diff --git a/cano/tests/cancellation.rs b/cano/tests/cancellation.rs new file mode 100644 index 0000000..712a942 --- /dev/null +++ b/cano/tests/cancellation.rs @@ -0,0 +1,991 @@ +//! Integration tests for cooperative cancellation (QoL-2). +//! +//! Covers: mid-task cancel, cancel-before-start, saga drain on cancel, dirty rollback, +//! saga-safety (compensatable tasks run to completion), split child abort, exactly-once +//! `on_cancelled`, idempotency, precedence over `with_total_timeout`, an uncancellable drain, +//! pass-through equivalence with `orchestrate`, and resume cancellation. + +use cano::prelude::*; +use cano::{CancellationHandle, CancellationToken}; +use std::borrow::Cow; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +enum Step { + Reserve, + Charge, + Ship, + Park, + Done, +} + +// ---- a recording observer (no `testing` feature dependency) ---- +#[derive(Default)] +struct Rec { + events: Mutex>, +} +impl Rec { + fn push(&self, s: String) { + self.events.lock().unwrap().push(s); + } + fn snapshot(&self) -> Vec { + self.events.lock().unwrap().clone() + } + fn count_prefix(&self, prefix: &str) -> usize { + self.snapshot() + .iter() + .filter(|e| e.starts_with(prefix)) + .count() + } + fn has(&self, s: &str) -> bool { + self.snapshot().iter().any(|e| e == s) + } +} +impl WorkflowObserver for Rec { + fn on_task_start(&self, task_id: &str) { + self.push(format!("start:{task_id}")); + } + fn on_task_failure(&self, task_id: &str, _err: &CanoError) { + self.push(format!("failure:{task_id}")); + } + fn on_cancelled(&self, state: &str) { + self.push(format!("cancelled:{state}")); + } +} + +// ---- a long-running, non-compensatable task: records start/completion ---- +struct LongTask { + started: Arc, + completed: Arc, + next: Step, +} +#[task(state = Step)] +impl LongTask { + 
fn config(&self) -> TaskConfig { + TaskConfig::minimal() + } + async fn run_bare(&self) -> Result, CanoError> { + self.started.store(true, Ordering::SeqCst); + tokio::time::sleep(Duration::from_secs(10)).await; + self.completed.store(true, Ordering::SeqCst); + Ok(TaskResult::Single(self.next.clone())) + } +} + +// ---- a compensatable step with a per-instance name (so the compensator registry keys +// don't collide), optional pre-completion sleep, and an optionally-failing compensator ---- +struct CompStep { + name: &'static str, + next: Step, + log: Arc>>, + started: Arc, + completed: Arc, + run_sleep_ms: u64, + comp_sleep_ms: u64, + fail_comp: bool, +} +#[saga::task(state = Step)] +impl CompStep { + type Output = (); + fn name(&self) -> Cow<'static, str> { + Cow::Borrowed(self.name) + } + async fn run(&self, _res: &Resources) -> Result<(TaskResult, ()), CanoError> { + self.started.store(true, Ordering::SeqCst); + if self.run_sleep_ms > 0 { + tokio::time::sleep(Duration::from_millis(self.run_sleep_ms)).await; + } + self.log.lock().unwrap().push(format!("run:{}", self.name)); + self.completed.store(true, Ordering::SeqCst); + Ok((TaskResult::Single(self.next.clone()), ())) + } + async fn compensate(&self, _res: &Resources, _out: ()) -> Result<(), CanoError> { + if self.comp_sleep_ms > 0 { + tokio::time::sleep(Duration::from_millis(self.comp_sleep_ms)).await; + } + self.log + .lock() + .unwrap() + .push(format!("rollback:{}", self.name)); + if self.fail_comp { + return Err(CanoError::task_execution(format!( + "comp {} boom", + self.name + ))); + } + Ok(()) + } +} + +fn flag() -> Arc { + Arc::new(AtomicBool::new(false)) +} + +/// Cancel via `handle` as soon as `flag` flips true (deterministic: fire while the +/// target task is parked in its sleep). 
+fn cancel_when(flag: Arc, handle: CancellationHandle) -> tokio::task::JoinHandle<()> { + tokio::spawn(async move { + while !flag.load(Ordering::SeqCst) { + tokio::time::sleep(Duration::from_millis(2)).await; + } + handle.cancel(); + }) +} + +#[tokio::test] +async fn cancel_mid_long_running_task_returns_cancelled() { + let started = flag(); + let completed = flag(); + let wf = Workflow::bare() + .register( + Step::Ship, + LongTask { + started: started.clone(), + completed: completed.clone(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + let canceller = cancel_when(started.clone(), handle); + + let t0 = Instant::now(); + let result = wf.orchestrate(Step::Ship, token).await; + canceller.await.unwrap(); + + assert_eq!(result.unwrap_err().category(), "cancelled"); + assert!( + !completed.load(Ordering::SeqCst), + "task should not complete" + ); + assert!( + t0.elapsed() < Duration::from_secs(2), + "should abort promptly" + ); +} + +#[tokio::test] +async fn cancel_before_orchestrate_returns_immediately_without_running_any_task() { + let started = flag(); + let completed = flag(); + let rec = Arc::new(Rec::default()); + let wf = Workflow::bare() + .with_observer(rec.clone()) + .register( + Step::Ship, + LongTask { + started: started.clone(), + completed: completed.clone(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + handle.cancel(); // pre-cancel before running + + let result = wf.orchestrate(Step::Ship, token).await; + + assert_eq!(result.unwrap_err().category(), "cancelled"); + assert!(!started.load(Ordering::SeqCst), "task must not start"); + assert_eq!(rec.count_prefix("start:"), 0, "no on_task_start fired"); + assert_eq!(rec.count_prefix("cancelled:"), 1, "on_cancelled fired once"); +} + +#[tokio::test] +async fn cancel_with_compensation_drains_stack_then_returns_cancelled() { + let log = Arc::new(Mutex::new(Vec::new())); + let 
ship_started = flag(); + let ignore = flag(); + let wf = Workflow::bare() + .register_with_compensation( + Step::Reserve, + CompStep { + name: "reserve", + next: Step::Charge, + log: log.clone(), + started: flag(), + completed: flag(), + run_sleep_ms: 0, + comp_sleep_ms: 0, + fail_comp: false, + }, + ) + .register_with_compensation( + Step::Charge, + CompStep { + name: "charge", + next: Step::Ship, + log: log.clone(), + started: flag(), + completed: flag(), + run_sleep_ms: 0, + comp_sleep_ms: 0, + fail_comp: false, + }, + ) + .register( + Step::Ship, + LongTask { + started: ship_started.clone(), + completed: ignore.clone(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + let canceller = cancel_when(ship_started.clone(), handle); + + let result = wf.orchestrate(Step::Reserve, token).await; + canceller.await.unwrap(); + + let err = result.unwrap_err(); + assert_eq!(err.category(), "cancelled"); + assert!(matches!(err.inner(), CanoError::Cancelled)); + // Both compensatable steps ran, then rolled back in reverse order. 
+ let events = log.lock().unwrap().clone(); + assert_eq!( + events, + vec![ + "run:reserve".to_string(), + "run:charge".to_string(), + "rollback:charge".to_string(), + "rollback:reserve".to_string(), + ] + ); +} + +#[tokio::test] +async fn cancel_with_failing_compensator_surfaces_compensation_failed() { + let log = Arc::new(Mutex::new(Vec::new())); + let ship_started = flag(); + let wf = Workflow::bare() + .register_with_compensation( + Step::Reserve, + CompStep { + name: "reserve", + next: Step::Ship, + log: log.clone(), + started: flag(), + completed: flag(), + run_sleep_ms: 0, + comp_sleep_ms: 0, + fail_comp: true, // its rollback fails + }, + ) + .register( + Step::Ship, + LongTask { + started: ship_started.clone(), + completed: flag(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + let canceller = cancel_when(ship_started.clone(), handle); + + let result = wf.orchestrate(Step::Reserve, token).await; + canceller.await.unwrap(); + + match result.unwrap_err() { + CanoError::CompensationFailed { errors } => { + // errors[0] is the original (wrapped) cancellation. + assert_eq!(errors[0].category(), "cancelled"); + assert!(errors.len() >= 2, "must also carry the compensator error"); + } + other => panic!("expected CompensationFailed, got {other:?}"), + } +} + +#[tokio::test] +async fn compensatable_task_not_interrupted_midflight() { + // Cancel WHILE a CompensatableSingle is running. Saga safety requires it to run to + // completion (so its rollback entry is recorded), with the cancel honoured at the next + // boundary — draining that entry. The downstream Park task must never start. 
+ let log = Arc::new(Mutex::new(Vec::new())); + let hold_started = flag(); + let hold_completed = flag(); + let park_started = flag(); + let wf = Workflow::bare() + .register_with_compensation( + Step::Reserve, + CompStep { + name: "hold", + next: Step::Park, + log: log.clone(), + started: hold_started.clone(), + completed: hold_completed.clone(), + run_sleep_ms: 150, // long enough to be cancelled mid-run + comp_sleep_ms: 0, + fail_comp: false, + }, + ) + .register( + Step::Park, + LongTask { + started: park_started.clone(), + completed: flag(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + let canceller = cancel_when(hold_started.clone(), handle); // cancel during Hold's run + + let result = wf.orchestrate(Step::Reserve, token).await; + canceller.await.unwrap(); + + assert_eq!(result.unwrap_err().category(), "cancelled"); + assert!( + hold_completed.load(Ordering::SeqCst), + "compensatable task must run to completion, not be interrupted" + ); + assert!( + !park_started.load(Ordering::SeqCst), + "downstream task must not start" + ); + assert_eq!( + log.lock().unwrap().clone(), + vec!["run:hold".to_string(), "rollback:hold".to_string()] + ); +} + +// A long-running split child. 
+struct SplitChild { + started: Arc, + completed: Arc, +} +#[task(state = Step)] +impl SplitChild { + fn config(&self) -> TaskConfig { + TaskConfig::minimal() + } + async fn run_bare(&self) -> Result, CanoError> { + self.started.fetch_add(1, Ordering::SeqCst); + tokio::time::sleep(Duration::from_secs(10)).await; + self.completed.fetch_add(1, Ordering::SeqCst); + Ok(TaskResult::Single(Step::Done)) + } +} + +#[tokio::test] +async fn cancel_mid_split_aborts_children_and_returns_cancelled() { + let started = Arc::new(AtomicUsize::new(0)); + let completed = Arc::new(AtomicUsize::new(0)); + let children: Vec = (0..3) + .map(|_| SplitChild { + started: started.clone(), + completed: completed.clone(), + }) + .collect(); + // No `with_total_timeout` here on purpose: cancellation is the *only* abort path, + // which is exactly the case where the synthetic per-branch `on_task_failure` + // fan-out must still fire to keep observer gauges balanced. + let rec = Arc::new(Rec::default()); + let wf = Workflow::bare() + .with_observer(rec.clone()) + .register_split( + Step::Ship, + children, + JoinConfig::new(JoinStrategy::All, Step::Done), + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + // Cancel only once all three children have started (so all three fired on_task_start). + let started_probe = started.clone(); + let canceller = tokio::spawn(async move { + while started_probe.load(Ordering::SeqCst) < 3 { + tokio::time::sleep(Duration::from_millis(2)).await; + } + handle.cancel(); + }); + + let t0 = Instant::now(); + let result = wf.orchestrate(Step::Ship, token).await; + canceller.await.unwrap(); + + assert_eq!(result.unwrap_err().category(), "cancelled"); + assert!( + t0.elapsed() < Duration::from_secs(2), + "split should abort promptly" + ); + // Give any aborted children a moment; none should have completed. 
+ tokio::time::sleep(Duration::from_millis(50)).await; + assert_eq!( + completed.load(Ordering::SeqCst), + 0, + "split children must be aborted, not completed" + ); + // Observer gauge balance: every on_task_start must be paired with an on_task_failure, + // even though no total-timeout budget was set (the cancel path is the only abort route). + assert_eq!(rec.count_prefix("start:"), 3, "all branches started"); + assert_eq!( + rec.count_prefix("failure:"), + rec.count_prefix("start:"), + "each started branch must get a paired on_task_failure on cancel (gauge balance)" + ); +} + +#[tokio::test] +async fn on_cancelled_fires_exactly_once() { + let started = flag(); + let rec = Arc::new(Rec::default()); + let wf = Workflow::bare() + .with_observer(rec.clone()) + .register( + Step::Ship, + LongTask { + started: started.clone(), + completed: flag(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + let canceller = cancel_when(started.clone(), handle); + let result = wf.orchestrate(Step::Ship, token).await; + canceller.await.unwrap(); + + assert_eq!(result.unwrap_err().category(), "cancelled"); + assert_eq!( + rec.count_prefix("cancelled:"), + 1, + "on_cancelled exactly once" + ); + assert!( + rec.has("cancelled:Ship"), + "fired with the right state label" + ); +} + +#[tokio::test] +async fn on_cancelled_does_not_fire_on_successful_run() { + let rec = Arc::new(Rec::default()); + let log = Arc::new(Mutex::new(Vec::new())); + let wf = Workflow::bare() + .with_observer(rec.clone()) + .register_with_compensation( + Step::Reserve, + CompStep { + name: "reserve", + next: Step::Done, + log, + started: flag(), + completed: flag(), + run_sleep_ms: 0, + comp_sleep_ms: 0, + fail_comp: false, + }, + ) + .add_exit_state(Step::Done); + + let (_handle, token) = CancellationToken::new(); // armed but never fired + let result = wf.orchestrate(Step::Reserve, token).await; + + assert_eq!(result.unwrap(), Step::Done); + 
assert_eq!( + rec.count_prefix("cancelled:"), + 0, + "on_cancelled must not fire on a successful run" + ); +} + +#[tokio::test] +async fn uncancelled_token_behaves_like_orchestrate() { + let build = || { + Workflow::bare() + .register_with_compensation( + Step::Reserve, + // A quick compensatable task (no sleep) that transitions straight to Done; + // a successful run never triggers its compensator. + CompStep { + name: "reserve", + next: Step::Done, + log: Arc::new(Mutex::new(Vec::new())), + started: flag(), + completed: flag(), + run_sleep_ms: 0, + comp_sleep_ms: 0, + fail_comp: false, + }, + ) + .add_exit_state(Step::Done) + }; + + let plain = build() + .orchestrate(Step::Reserve, CancellationToken::disabled()) + .await; + let (_handle, token) = CancellationToken::new(); // never cancelled + let with_cancel = build().orchestrate(Step::Reserve, token).await; + + assert_eq!(plain.unwrap(), Step::Done); + assert_eq!(with_cancel.unwrap(), Step::Done); +} + +#[tokio::test] +async fn double_cancel_is_idempotent() { + let started = flag(); + let rec = Arc::new(Rec::default()); + let wf = Workflow::bare() + .with_observer(rec.clone()) + .register( + Step::Ship, + LongTask { + started: started.clone(), + completed: flag(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + let probe = started.clone(); + let canceller = tokio::spawn(async move { + while !probe.load(Ordering::SeqCst) { + tokio::time::sleep(Duration::from_millis(2)).await; + } + handle.cancel(); + handle.cancel(); // second cancel is a no-op + }); + + let result = wf.orchestrate(Step::Ship, token).await; + canceller.await.unwrap(); + + assert_eq!(result.unwrap_err().category(), "cancelled"); + assert_eq!( + rec.count_prefix("cancelled:"), + 1, + "still exactly one on_cancelled" + ); +} + +#[tokio::test] +async fn cancellation_precedence_over_total_timeout() { + let started = flag(); + let wf = Workflow::bare() + 
.with_total_timeout(Duration::from_secs(30)) // would not fire before the cancel + .register( + Step::Ship, + LongTask { + started: started.clone(), + completed: flag(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + let canceller = cancel_when(started.clone(), handle); + let result = wf.orchestrate(Step::Ship, token).await; + canceller.await.unwrap(); + + let err = result.unwrap_err(); + assert_eq!( + err.category(), + "cancelled", + "cancellation wins over the budget" + ); +} + +#[tokio::test] +async fn compensation_drain_completes_fully_under_cancellation() { + // The drain is uncancellable: both (slow) compensators run to completion even though the + // token stays cancelled throughout the rollback. + let log = Arc::new(Mutex::new(Vec::new())); + let ship_started = flag(); + let wf = Workflow::bare() + .register_with_compensation( + Step::Reserve, + CompStep { + name: "reserve", + next: Step::Charge, + log: log.clone(), + started: flag(), + completed: flag(), + run_sleep_ms: 0, + comp_sleep_ms: 40, + fail_comp: false, + }, + ) + .register_with_compensation( + Step::Charge, + CompStep { + name: "charge", + next: Step::Ship, + log: log.clone(), + started: flag(), + completed: flag(), + run_sleep_ms: 0, + comp_sleep_ms: 40, + fail_comp: false, + }, + ) + .register( + Step::Ship, + LongTask { + started: ship_started.clone(), + completed: flag(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + let canceller = cancel_when(ship_started.clone(), handle); + let result = wf.orchestrate(Step::Reserve, token).await; + canceller.await.unwrap(); + + assert_eq!(result.unwrap_err().category(), "cancelled"); + let events = log.lock().unwrap().clone(); + assert!(events.contains(&"rollback:charge".to_string())); + assert!(events.contains(&"rollback:reserve".to_string())); +} + +// A resource whose teardown is observable, to prove cleanup runs even 
on cancel.
+struct TeardownProbe {
+    // Bumped once per `teardown` call so the test can count teardowns.
+    tore_down: Arc<AtomicUsize>,
+}
+#[resource]
+impl Resource for TeardownProbe {
+    async fn teardown(&self) -> Result<(), CanoError> {
+        self.tore_down.fetch_add(1, Ordering::SeqCst);
+        Ok(())
+    }
+}
+
+#[tokio::test]
+async fn resources_are_torn_down_on_cancel() {
+    let tore_down = Arc::new(AtomicUsize::new(0));
+    let started = flag();
+    let resources = Resources::new().insert(
+        "probe",
+        TeardownProbe {
+            tore_down: tore_down.clone(),
+        },
+    );
+    let wf = Workflow::new(resources)
+        .register(
+            Step::Ship,
+            LongTask {
+                started: started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    // The run must report `cancelled`, and the probe must have been torn down exactly once.
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert_eq!(
+        tore_down.load(Ordering::SeqCst),
+        1,
+        "resource teardown must still run when a run is cancelled"
+    );
+}
+
+// =====================================================================================
+// Cancellation across every processing model. Each model is dispatched through the same
+// `dispatch_with_budget` race, so each must be interruptible mid-flight and surface
+// `Cancelled` promptly rather than running to completion.
+// =====================================================================================
+
+// RouterTask: cancel while a route lookup is in flight.
+struct SlowRouter {
+    // Set as soon as `route` is entered.
+    started: Arc<AtomicBool>,
+}
+#[task::router(state = Step)]
+impl SlowRouter {
+    // Flags `started`, then sleeps 10s before routing to `Step::Done`.
+    async fn route(&self, _res: &Resources) -> Result<TaskResult<Step>, CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        tokio::time::sleep(Duration::from_secs(10)).await;
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+#[tokio::test]
+async fn cancel_during_router_task_returns_cancelled() {
+    let started = flag();
+    let wf = Workflow::bare()
+        .register_router(
+            Step::Reserve,
+            SlowRouter {
+                started: started.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Reserve, token).await;
+    canceller.await.unwrap();
+
+    // The 2s bound is far below the router's 10s sleep, so it only holds if the run was cut short.
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "router must abort promptly"
+    );
+}
+
+// PollTask: cancel while the poll loop is parked between Pending polls.
+struct ForeverPoll {
+    // Set on every `poll` call.
+    started: Arc<AtomicBool>,
+}
+#[task::poll(state = Step)]
+impl ForeverPoll {
+    // Always answers `Pending` with a 50ms delay, so this task never finishes on its own.
+    async fn poll(&self, _res: &Resources) -> Result<PollOutcome<Step>, CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        Ok(PollOutcome::Pending { delay_ms: 50 })
+    }
+}
+
+#[tokio::test]
+async fn cancel_during_poll_task_returns_cancelled() {
+    let started = flag();
+    let wf = Workflow::bare()
+        .register(
+            Step::Ship,
+            ForeverPoll {
+                started: started.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "poll loop must abort promptly"
+    );
+}
+
+// TimerTask: cancel during the scheduled sleep.
+struct SlowTimer {
+    // Set when `wait` is called.
+    started: Arc<AtomicBool>,
+    // Set when `after_wait` is called; the test asserts it stays false.
+    fired: Arc<AtomicBool>,
+}
+#[task::timer(state = Step)]
+impl SlowTimer {
+    // Flags `started` and requests a 10s wait.
+    async fn wait(&self, _res: &Resources) -> Result<TimerOutcome, CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        Ok(TimerOutcome::Duration(Duration::from_secs(10)))
+    }
+    // Flags `fired` and moves on to `Step::Done`.
+    async fn after_wait(&self, _res: &Resources) -> Result<TaskResult<Step>, CanoError> {
+        self.fired.store(true, Ordering::SeqCst);
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+#[tokio::test]
+async fn cancel_during_timer_task_returns_cancelled() {
+    let started = flag();
+    let fired = flag();
+    let wf = Workflow::bare()
+        .register(
+            Step::Ship,
+            SlowTimer {
+                started: started.clone(),
+                fired: fired.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "timer must abort promptly"
+    );
+    assert!(
+        !fired.load(Ordering::SeqCst),
+        "after_wait must not run when the timer is cancelled mid-sleep"
+    );
+}
+
+// BatchTask: cancel while items are being processed.
+struct SlowBatch {
+    // Set when any item starts processing.
+    started: Arc<AtomicBool>,
+    // Set when `finish` is called; the test asserts it stays false.
+    finished: Arc<AtomicBool>,
+}
+#[task::batch(state = Step)]
+impl SlowBatch {
+    type Item = u32;
+    type ItemOutput = ();
+    async fn load(&self, _res: &Resources) -> Result<Vec<u32>, CanoError> {
+        Ok(vec![0, 1, 2])
+    }
+    // Flags `started`, then sleeps 10s per item.
+    async fn process_item(&self, _item: &u32) -> Result<(), CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        tokio::time::sleep(Duration::from_secs(10)).await;
+        Ok(())
+    }
+    async fn finish(
+        &self,
+        _res: &Resources,
+        _outputs: Vec<Result<(), CanoError>>,
+    ) -> Result<TaskResult<Step>, CanoError> {
+        self.finished.store(true, Ordering::SeqCst);
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+#[tokio::test]
+async fn cancel_during_batch_task_returns_cancelled() {
+    let started = flag();
+    let finished = flag();
+    let wf = Workflow::bare()
+        .register(
+            Step::Ship,
+            SlowBatch {
+                started: started.clone(),
+                finished: finished.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "batch must abort promptly"
+    );
+    assert!(
+        !finished.load(Ordering::SeqCst),
+        "finish must not run when the batch is cancelled mid-processing"
+    );
+}
+
+// SteppedTask: cancel mid-step (no checkpoint store ⇒ cursor is in-memory only).
+struct SlowStepper { + started: Arc, +} +#[task::stepped(state = Step)] +impl SlowStepper { + async fn step( + &self, + _res: &Resources, + cursor: Option, + ) -> Result, CanoError> { + self.started.store(true, Ordering::SeqCst); + tokio::time::sleep(Duration::from_millis(50)).await; + let n = cursor.unwrap_or(0); + if n >= 10_000 { + Ok(StepOutcome::Done(TaskResult::Single(Step::Done))) + } else { + Ok(StepOutcome::More(n + 1)) + } + } +} + +#[tokio::test] +async fn cancel_during_stepped_task_returns_cancelled() { + let started = flag(); + let wf = Workflow::bare() + .register_stepped( + Step::Ship, + SlowStepper { + started: started.clone(), + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + let canceller = cancel_when(started.clone(), handle); + let t0 = Instant::now(); + let result = wf.orchestrate(Step::Ship, token).await; + canceller.await.unwrap(); + + assert_eq!(result.unwrap_err().category(), "cancelled"); + assert!( + t0.elapsed() < Duration::from_secs(2), + "stepped loop must abort promptly" + ); +} + +#[cfg(feature = "testing")] +#[tokio::test] +async fn resume_from_honors_precancelled_token() { + use cano::testing::InMemoryCheckpointStore; + + let store = Arc::new(InMemoryCheckpointStore::new()); + let run_count = Arc::new(AtomicUsize::new(0)); + + // Run 1: cancel mid-Ship so a checkpoint log is left behind (Ship's StateEntry row was + // written before the task ran). Ship counts its runs. 
+ struct CountingLong { + started: Arc, + runs: Arc, + } + #[task(state = Step)] + impl CountingLong { + fn config(&self) -> TaskConfig { + TaskConfig::minimal() + } + async fn run_bare(&self) -> Result, CanoError> { + self.runs.fetch_add(1, Ordering::SeqCst); + self.started.store(true, Ordering::SeqCst); + tokio::time::sleep(Duration::from_secs(10)).await; + Ok(TaskResult::Single(Step::Done)) + } + } + + let started = flag(); + let wf = Workflow::bare() + .with_checkpoint_store(store.clone()) + .with_workflow_id("run-1") + .register( + Step::Ship, + CountingLong { + started: started.clone(), + runs: run_count.clone(), + }, + ) + .add_exit_state(Step::Done); + + let (handle, token) = CancellationToken::new(); + let canceller = cancel_when(started.clone(), handle); + let r1 = wf.orchestrate(Step::Ship, token).await; + canceller.await.unwrap(); + assert_eq!(r1.unwrap_err().category(), "cancelled"); + assert_eq!(run_count.load(Ordering::SeqCst), 1); + + // Run 2: resume with a pre-cancelled token — the resumed run must cancel at the Ship + // boundary WITHOUT re-running the task. 
+ let (handle2, token2) = CancellationToken::new(); + handle2.cancel(); + let r2 = wf.resume_from("run-1", token2).await; + assert_eq!(r2.unwrap_err().category(), "cancelled"); + assert_eq!( + run_count.load(Ordering::SeqCst), + 1, + "resumed task must not re-run when cancelled at the boundary" + ); +} diff --git a/cano/tests/recovery_hardening.rs b/cano/tests/recovery_hardening.rs index d4a552c..b5d21f3 100644 --- a/cano/tests/recovery_hardening.rs +++ b/cano/tests/recovery_hardening.rs @@ -112,7 +112,7 @@ async fn many_workflows_one_redb_file_concurrently() { let sidefx = dir.path().join(format!("sfx-{i}.log")); handles.push(tokio::spawn(async move { build(store, &format!("run-{i}"), &sidefx) - .orchestrate(St::Start) + .orchestrate(St::Start, CancellationToken::disabled()) .await })); } @@ -148,7 +148,9 @@ async fn racing_runs_of_one_id_on_a_real_file() { let store = Arc::clone(&store); let sidefx = dir.path().join(format!("sfx-{i}.log")); handles.push(tokio::spawn(async move { - build(store, "dup", &sidefx).orchestrate(St::Start).await + build(store, "dup", &sidefx) + .orchestrate(St::Start, CancellationToken::disabled()) + .await })); } let mut completed = 0; @@ -183,7 +185,7 @@ async fn crash_mid_work_then_resume_does_not_re_run_start() { ok_appends: AtomicUsize::new(2), }); let err = build(store, wf_id, &sidefx) - .orchestrate(St::Start) + .orchestrate(St::Start, CancellationToken::disabled()) .await .expect_err("generation 1 must crash before reaching Done"); assert_eq!(err.inner().category(), "checkpoint_store"); @@ -199,7 +201,12 @@ async fn crash_mid_work_then_resume_does_not_re_run_start() { { let store: Arc = Arc::new(RedbCheckpointStore::new(&db).unwrap()); let wf = build(store, wf_id, &sidefx); - assert_eq!(wf.resume_from(wf_id).await.unwrap(), St::Done); + assert_eq!( + wf.resume_from(wf_id, CancellationToken::disabled()) + .await + .unwrap(), + St::Done + ); } assert_eq!( side_effects(&sidefx), diff --git a/cano/tests/recovery_version_compat.rs 
b/cano/tests/recovery_version_compat.rs index 26aa2be..14a5802 100644 --- a/cano/tests/recovery_version_compat.rs +++ b/cano/tests/recovery_version_compat.rs @@ -98,7 +98,7 @@ async fn legacy_redb_row_resumes_under_default_workflow_version() { .with_checkpoint_store(store); let final_state = workflow - .resume_from(wf_id) + .resume_from(wf_id, CancellationToken::disabled()) .await .expect("legacy row must resume cleanly under default workflow_version"); assert_eq!(final_state, St::Done); @@ -124,7 +124,7 @@ async fn legacy_redb_row_rejected_when_workflow_version_bumped() { .with_workflow_version(1); let err = workflow - .resume_from(wf_id) + .resume_from(wf_id, CancellationToken::disabled()) .await .expect_err("bumped workflow_version must reject the legacy row"); assert_eq!(err, CanoError::workflow_version_mismatch(0, 1)); diff --git a/cano/tests/saga_hardening.rs b/cano/tests/saga_hardening.rs index 21390f7..fd1348b 100644 --- a/cano/tests/saga_hardening.rs +++ b/cano/tests/saga_hardening.rs @@ -103,7 +103,9 @@ async fn many_compensatable_workflows_concurrent_with_injected_failures() { // Sweep the failure point across all steps (3 of every 4 runs roll back). let fail_at = i % (STEPS + 1); handles.push(tokio::spawn(async move { - let result = saga(ledger.clone(), &account, fail_at).orchestrate(0).await; + let result = saga(ledger.clone(), &account, fail_at) + .orchestrate(0, CancellationToken::disabled()) + .await; (account, fail_at, result) })); } @@ -259,7 +261,10 @@ mod recovery { .add_exit_state(St::Done) .with_checkpoint_store(store.clone()); - let err = wf.resume_from("g").await.unwrap_err(); + let err = wf + .resume_from("g", CancellationToken::disabled()) + .await + .unwrap_err(); assert_eq!(err.message(), "C failed"); // B re-ran on resume and re-pushed its own entry; its persisted completion row at the // resume point must NOT be replayed too, or B would compensate twice. 
diff --git a/cano/tests/scheduler_cancellation.rs b/cano/tests/scheduler_cancellation.rs
new file mode 100644
index 0000000..dc872e8
--- /dev/null
+++ b/cano/tests/scheduler_cancellation.rs
@@ -0,0 +1,667 @@
+#![cfg(feature = "scheduler")]
+//! Scheduler cancellation across processing models and schedule types.
+//!
+//! The scheduler fires the engine's `CancellationToken` uniformly for every flow,
+//! so `cancel_flow` / cancel-on-shutdown must work for *every* processing model
+//! (base Task, saga, split, stepped, timer, poll, batch) and *every* schedule
+//! type (manual, every, cron). These tests exercise that scheduler-specific wiring
+//! (token publish/clear in `execute_reserved_flow`, the `Cancel` command, the
+//! shutdown-cancel sweep, and `apply_outcome`'s cancel→Idle mapping) — the
+//! orchestrate-level per-model cancellation lives in `tests/cancellation.rs`.
+
+use cano::prelude::*;
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering::SeqCst};
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+enum Step {
+    Reserve,
+    Charge,
+    Ship,
+    Work,
+    Done,
+}
+
+fn flag() -> Arc<AtomicBool> {
+    Arc::new(AtomicBool::new(false))
+}
+fn counter() -> Arc<AtomicUsize> {
+    Arc::new(AtomicUsize::new(0))
+}
+
+/// Spin until `flag` is true (a flow's task has started). Panics after 15s so a
+/// wiring slip or a cancellation regression fails fast instead of hanging (some
+/// of these flows — poll/stepped — never terminate on their own).
+async fn await_flag(flag: &Arc<AtomicBool>) {
+    let t_start = Instant::now();
+    while !flag.load(SeqCst) {
+        assert!(
+            t_start.elapsed() < Duration::from_secs(15),
+            "flow task never started (wiring bug, or the flow errored before this state)"
+        );
+        tokio::time::sleep(Duration::from_millis(2)).await;
+    }
+}
+
+/// Spin until the flow leaves `Status::Running` (its cancelled run settled).
+/// Panics after 15s so a cancellation regression fails fast instead of hanging.
+async fn await_not_running(running: &RunningScheduler, id: &str) {
+    // NOTE(review): if `RunningScheduler` takes type parameters they are not visible
+    // here — confirm the parameter type against the crate before relying on it.
+    let t_start = Instant::now();
+    loop {
+        if running.status(id).await.map(|i| i.status) != Some(Status::Running) {
+            return;
+        }
+        assert!(
+            t_start.elapsed() < Duration::from_secs(15),
+            "flow '{id}' never left Running after cancel — cancellation regression"
+        );
+        tokio::time::sleep(Duration::from_millis(2)).await;
+    }
+}
+
+// ---- base long-running task (the cancellable building block) ----
+#[derive(Clone)]
+struct Long {
+    started: Arc<AtomicBool>,
+    completed: Arc<AtomicBool>,
+    next: Step,
+}
+#[task(state = Step)]
+impl Long {
+    fn config(&self) -> TaskConfig {
+        TaskConfig::minimal()
+    }
+    // Flags `started`, sleeps 30s, then flags `completed` and moves on to `next`.
+    async fn run_bare(&self) -> Result<TaskResult<Step>, CanoError> {
+        self.started.store(true, SeqCst);
+        tokio::time::sleep(Duration::from_secs(30)).await;
+        self.completed.store(true, SeqCst);
+        Ok(TaskResult::Single(self.next.clone()))
+    }
+}
+
+// ---- saga steps (distinct types ⇒ distinct compensator keys) ----
+#[derive(Clone)]
+struct Reserve {
+    // Shared event log; `run` and `compensate` each append one entry.
+    log: Arc<Mutex<Vec<String>>>,
+    // When true, `compensate` returns an error after logging.
+    fail_comp: bool,
+}
+#[saga::task(state = Step)]
+impl Reserve {
+    type Output = ();
+    async fn run(&self, _res: &Resources) -> Result<(TaskResult<Step>, ()), CanoError> {
+        self.log.lock().unwrap().push("run:reserve".into());
+        Ok((TaskResult::Single(Step::Charge), ()))
+    }
+    async fn compensate(&self, _res: &Resources, _o: ()) -> Result<(), CanoError> {
+        self.log.lock().unwrap().push("rollback:reserve".into());
+        if self.fail_comp {
+            return Err(CanoError::task_execution("reserve rollback boom"));
+        }
+        Ok(())
+    }
+}
+
+#[derive(Clone)]
+struct Charge {
+    // Shared event log; `run` and `compensate` each append one entry.
+    log: Arc<Mutex<Vec<String>>>,
+}
+#[saga::task(state = Step)]
+impl Charge {
+    type Output = ();
+    async fn run(&self, _res: &Resources) -> Result<(TaskResult<Step>, ()), CanoError> {
+        self.log.lock().unwrap().push("run:charge".into());
+        Ok((TaskResult::Single(Step::Ship), ()))
+    }
+    async fn compensate(&self, _res: &Resources, _o: ()) -> Result<(), CanoError> {
+        self.log.lock().unwrap().push("rollback:charge".into());
+        Ok(())
+    }
+}
+
+#[tokio::test(flavor = "multi_thread")] +async fn cancel_flow_drains_scheduled_saga_and_returns_to_idle() { + let log = Arc::new(Mutex::new(Vec::new())); + let ship_started = flag(); + let wf = Workflow::bare() + .register_with_compensation( + Step::Reserve, + Reserve { + log: log.clone(), + fail_comp: false, + }, + ) + .register_with_compensation(Step::Charge, Charge { log: log.clone() }) + .register( + Step::Ship, + Long { + started: ship_started.clone(), + completed: flag(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler.manual("order", wf, Step::Reserve).unwrap(); + let running = scheduler.start().await.unwrap(); + + running.trigger("order").await.unwrap(); + await_flag(&ship_started).await; + running.cancel_flow("order").await.unwrap(); + await_not_running(&running, "order").await; + + let info = running.status("order").await.unwrap(); + assert_eq!(info.status, Status::Idle, "clean cancel → Idle"); + assert_eq!(info.failure_streak, 0, "cancel is not a backoff failure"); + assert_eq!( + *log.lock().unwrap(), + vec![ + "run:reserve".to_string(), + "run:charge".to_string(), + "rollback:charge".to_string(), + "rollback:reserve".to_string(), + ], + "saga must roll back in reverse on a scheduled cancel" + ); + running.stop().await.unwrap(); +} + +#[tokio::test(flavor = "multi_thread")] +async fn cancel_flow_dirty_rollback_saga_parks_in_backoff() { + // A cancel whose compensator FAILS surfaces as `compensation_failed`, which is + // a genuine fault: the flow lands in Backoff (default policy never trips), NOT Idle. 
+ let log = Arc::new(Mutex::new(Vec::new())); + let ship_started = flag(); + let wf = Workflow::bare() + .register_with_compensation( + Step::Reserve, + Reserve { + log: log.clone(), + fail_comp: true, // Reserve's rollback fails ⇒ dirty rollback + }, + ) + // Reserve.run transitions to Charge, so Charge must be registered for the + // chain to reach the long Ship step where the cancel lands. + .register_with_compensation(Step::Charge, Charge { log: log.clone() }) + .register( + Step::Ship, + Long { + started: ship_started.clone(), + completed: flag(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler.manual("order", wf, Step::Reserve).unwrap(); + let running = scheduler.start().await.unwrap(); + + running.trigger("order").await.unwrap(); + await_flag(&ship_started).await; + running.cancel_flow("order").await.unwrap(); + await_not_running(&running, "order").await; + + let info = running.status("order").await.unwrap(); + assert!( + matches!(info.status, Status::Backoff { .. 
}), + "a dirty rollback is a failure → Backoff, got {:?}", + info.status + ); + assert_eq!(info.failure_streak, 1); + running.stop().await.unwrap(); +} + +#[tokio::test(flavor = "multi_thread")] +async fn graceful_stop_rolls_back_in_flight_saga() { + let log = Arc::new(Mutex::new(Vec::new())); + let ship_started = flag(); + let wf = Workflow::bare() + .register_with_compensation( + Step::Reserve, + Reserve { + log: log.clone(), + fail_comp: false, + }, + ) + .register_with_compensation(Step::Charge, Charge { log: log.clone() }) + .register( + Step::Ship, + Long { + started: ship_started.clone(), + completed: flag(), + next: Step::Done, + }, + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler.manual("order", wf, Step::Reserve).unwrap(); + let running = scheduler.start().await.unwrap(); + + running.trigger("order").await.unwrap(); + await_flag(&ship_started).await; + + let t_start = Instant::now(); + running.stop().await.expect("graceful stop succeeds"); + assert!( + t_start.elapsed() < Duration::from_secs(5), + "shutdown must cancel + drain, not wait 30s" + ); + let events = log.lock().unwrap().clone(); + assert!(events.contains(&"rollback:charge".to_string())); + assert!(events.contains(&"rollback:reserve".to_string())); +} + +// ---- split ---- +#[derive(Clone)] +struct SplitChild { + started: Arc, + completed: Arc, +} +#[task(state = Step)] +impl SplitChild { + fn config(&self) -> TaskConfig { + TaskConfig::minimal() + } + async fn run_bare(&self) -> Result, CanoError> { + self.started.fetch_add(1, SeqCst); + tokio::time::sleep(Duration::from_secs(30)).await; + self.completed.fetch_add(1, SeqCst); + Ok(TaskResult::Single(Step::Done)) + } +} + +#[tokio::test(flavor = "multi_thread")] +async fn cancel_flow_cancels_in_flight_split_flow() { + let started = counter(); + let completed = counter(); + let children: Vec = (0..3) + .map(|_| SplitChild { + started: started.clone(), + completed: completed.clone(), + }) + .collect(); + let 
wf = Workflow::bare() + .register_split( + Step::Work, + children, + JoinConfig::new(JoinStrategy::All, Step::Done), + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler.manual("split", wf, Step::Work).unwrap(); + let running = scheduler.start().await.unwrap(); + + running.trigger("split").await.unwrap(); + let t_start = Instant::now(); + while started.load(SeqCst) < 3 { + assert!( + t_start.elapsed() < Duration::from_secs(15), + "split children never all started" + ); + tokio::time::sleep(Duration::from_millis(2)).await; + } + running.cancel_flow("split").await.unwrap(); + await_not_running(&running, "split").await; + + tokio::time::sleep(Duration::from_millis(50)).await; + assert_eq!( + completed.load(SeqCst), + 0, + "split children must be aborted, not completed" + ); + assert_eq!(running.status("split").await.unwrap().status, Status::Idle); + running.stop().await.unwrap(); +} + +// ---- stepped ---- +#[derive(Clone)] +struct SlowStepper { + started: Arc, +} +#[task::stepped(state = Step)] +impl SlowStepper { + async fn step( + &self, + _res: &Resources, + cursor: Option, + ) -> Result, CanoError> { + self.started.store(true, SeqCst); + tokio::time::sleep(Duration::from_millis(50)).await; + let n = cursor.unwrap_or(0); + if n >= 100_000 { + Ok(StepOutcome::Done(TaskResult::Single(Step::Done))) + } else { + Ok(StepOutcome::More(n + 1)) + } + } +} + +#[tokio::test(flavor = "multi_thread")] +async fn cancel_flow_cancels_in_flight_stepped_flow() { + let started = flag(); + let wf = Workflow::bare() + .register_stepped( + Step::Work, + SlowStepper { + started: started.clone(), + }, + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler.manual("stepped", wf, Step::Work).unwrap(); + let running = scheduler.start().await.unwrap(); + + running.trigger("stepped").await.unwrap(); + await_flag(&started).await; + let t_start = Instant::now(); + running.cancel_flow("stepped").await.unwrap(); + 
await_not_running(&running, "stepped").await; + + assert!( + t_start.elapsed() < Duration::from_secs(5), + "stepped loop aborts promptly" + ); + assert_eq!( + running.status("stepped").await.unwrap().status, + Status::Idle + ); + running.stop().await.unwrap(); +} + +// ---- timer ---- +#[derive(Clone)] +struct SlowTimer { + started: Arc, + fired: Arc, +} +#[task::timer(state = Step)] +impl SlowTimer { + async fn wait(&self, _res: &Resources) -> Result { + self.started.store(true, SeqCst); + Ok(TimerOutcome::Duration(Duration::from_secs(30))) + } + async fn after_wait(&self, _res: &Resources) -> Result, CanoError> { + self.fired.store(true, SeqCst); + Ok(TaskResult::Single(Step::Done)) + } +} + +#[tokio::test(flavor = "multi_thread")] +async fn cancel_flow_cancels_in_flight_timer_flow() { + let started = flag(); + let fired = flag(); + let wf = Workflow::bare() + .register( + Step::Work, + SlowTimer { + started: started.clone(), + fired: fired.clone(), + }, + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler.manual("timer", wf, Step::Work).unwrap(); + let running = scheduler.start().await.unwrap(); + + running.trigger("timer").await.unwrap(); + await_flag(&started).await; + running.cancel_flow("timer").await.unwrap(); + await_not_running(&running, "timer").await; + + assert!( + !fired.load(SeqCst), + "after_wait must not run when the timer is cancelled" + ); + assert_eq!(running.status("timer").await.unwrap().status, Status::Idle); + running.stop().await.unwrap(); +} + +// ---- poll ---- +#[derive(Clone)] +struct ForeverPoll { + started: Arc, +} +#[task::poll(state = Step)] +impl ForeverPoll { + async fn poll(&self, _res: &Resources) -> Result, CanoError> { + self.started.store(true, SeqCst); + Ok(PollOutcome::Pending { delay_ms: 50 }) + } +} + +#[tokio::test(flavor = "multi_thread")] +async fn cancel_flow_cancels_in_flight_poll_flow() { + let started = flag(); + let wf = Workflow::bare() + .register( + Step::Work, + ForeverPoll 
{ + started: started.clone(), + }, + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler.manual("poll", wf, Step::Work).unwrap(); + let running = scheduler.start().await.unwrap(); + + running.trigger("poll").await.unwrap(); + await_flag(&started).await; + let t_start = Instant::now(); + running.cancel_flow("poll").await.unwrap(); + await_not_running(&running, "poll").await; + + assert!( + t_start.elapsed() < Duration::from_secs(5), + "poll loop aborts promptly" + ); + assert_eq!(running.status("poll").await.unwrap().status, Status::Idle); + running.stop().await.unwrap(); +} + +// ---- batch ---- +#[derive(Clone)] +struct SlowBatch { + started: Arc, + finished: Arc, +} +#[task::batch(state = Step)] +impl SlowBatch { + type Item = u32; + type ItemOutput = (); + async fn load(&self, _res: &Resources) -> Result, CanoError> { + Ok(vec![0, 1, 2]) + } + async fn process_item(&self, _item: &u32) -> Result<(), CanoError> { + self.started.store(true, SeqCst); + tokio::time::sleep(Duration::from_secs(30)).await; + Ok(()) + } + async fn finish( + &self, + _res: &Resources, + _outputs: Vec>, + ) -> Result, CanoError> { + self.finished.store(true, SeqCst); + Ok(TaskResult::Single(Step::Done)) + } +} + +#[tokio::test(flavor = "multi_thread")] +async fn cancel_flow_cancels_in_flight_batch_flow() { + let started = flag(); + let finished = flag(); + let wf = Workflow::bare() + .register( + Step::Work, + SlowBatch { + started: started.clone(), + finished: finished.clone(), + }, + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler.manual("batch", wf, Step::Work).unwrap(); + let running = scheduler.start().await.unwrap(); + + running.trigger("batch").await.unwrap(); + await_flag(&started).await; + let t_start = Instant::now(); + running.cancel_flow("batch").await.unwrap(); + await_not_running(&running, "batch").await; + + assert!( + t_start.elapsed() < Duration::from_secs(5), + "batch aborts promptly" + ); + 
assert!(
+        !finished.load(SeqCst),
+        "finish must not run when the batch is cancelled"
+    );
+    assert_eq!(running.status("batch").await.unwrap().status, Status::Idle);
+    running.stop().await.unwrap();
+}
+
+// ---- schedule types: every / cron ----
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_on_every_flow_cancels_current_run_and_keeps_scheduling() {
+    // An interval flow: cancelling the in-flight run returns it to Idle and the
+    // loop keeps firing — a deliberate cancel must not stop future scheduled runs.
+    let started = counter();
+    let wf = Workflow::bare()
+        .register(
+            Step::Work,
+            CountingLong {
+                started: started.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler
+        .every("ticker", wf, Step::Work, Duration::from_millis(80))
+        .unwrap();
+    let running = scheduler.start().await.unwrap();
+
+    // Wait for the first run to be in flight. The wait is bounded so that a
+    // scheduler which never dispatches fails the test instead of hanging it.
+    let t_first = Instant::now();
+    while started.load(SeqCst) < 1 {
+        assert!(
+            t_first.elapsed() < Duration::from_secs(15),
+            "interval flow never started its first run"
+        );
+        tokio::time::sleep(Duration::from_millis(2)).await;
+    }
+    running.cancel_flow("ticker").await.unwrap();
+
+    // The loop must dispatch a *second* run after the cancel returns it to Idle.
+    let t_start = Instant::now();
+    while started.load(SeqCst) < 2 {
+        assert!(
+            t_start.elapsed() < Duration::from_secs(5),
+            "interval flow must keep scheduling after a cancel"
+        );
+        tokio::time::sleep(Duration::from_millis(5)).await;
+    }
+    assert!(
+        !matches!(
+            running.status("ticker").await.unwrap().status,
+            Status::Tripped { .. }
+        ),
+        "cancel must not trip an interval flow"
+    );
+    running.stop().await.unwrap();
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_cancels_in_flight_cron_flow() {
+    // Cover the cron loop path (`spawn_cron_loop`): a per-second cron flow whose
+    // run is cancelled mid-flight returns to Idle.
+ let started = counter(); + let wf = Workflow::bare() + .register( + Step::Work, + CountingLong { + started: started.clone(), + }, + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler + .cron("cronflow", wf, Step::Work, "* * * * * *") + .unwrap(); + let running = scheduler.start().await.unwrap(); + + // First per-second tick starts a run within ~1s. + let t_start = Instant::now(); + while started.load(SeqCst) < 1 { + assert!( + t_start.elapsed() < Duration::from_secs(3), + "cron tick should fire" + ); + tokio::time::sleep(Duration::from_millis(5)).await; + } + running.cancel_flow("cronflow").await.unwrap(); + await_not_running(&running, "cronflow").await; + assert_eq!( + running.status("cronflow").await.unwrap().status, + Status::Idle + ); + running.stop().await.unwrap(); +} + +// A long task that counts how many runs have started — for interval/cron tests. +#[derive(Clone)] +struct CountingLong { + started: Arc, +} +#[task(state = Step)] +impl CountingLong { + fn config(&self) -> TaskConfig { + TaskConfig::minimal() + } + async fn run_bare(&self) -> Result, CanoError> { + self.started.fetch_add(1, SeqCst); + tokio::time::sleep(Duration::from_secs(30)).await; + Ok(TaskResult::Single(Step::Done)) + } +} + +// ---- stepped + checkpoint store (recovery/testing) ---- +#[cfg(feature = "testing")] +#[tokio::test(flavor = "multi_thread")] +async fn cancel_flow_cancels_checkpointed_stepped_flow() { + use cano::testing::InMemoryCheckpointStore; + + let started = flag(); + let store = Arc::new(InMemoryCheckpointStore::new()); + let wf = Workflow::bare() + .with_checkpoint_store(store.clone()) + .with_workflow_id("stepped-ckpt") + .register_stepped( + Step::Work, + SlowStepper { + started: started.clone(), + }, + ) + .add_exit_state(Step::Done); + + let mut scheduler = Scheduler::new(); + scheduler.manual("stepped", wf, Step::Work).unwrap(); + let running = scheduler.start().await.unwrap(); + + running.trigger("stepped").await.unwrap(); + 
await_flag(&started).await; + running.cancel_flow("stepped").await.unwrap(); + await_not_running(&running, "stepped").await; + + assert_eq!( + running.status("stepped").await.unwrap().status, + Status::Idle + ); + running.stop().await.unwrap(); +} diff --git a/cano/tests/testing_module_e2e.rs b/cano/tests/testing_module_e2e.rs index 3c879ba..8f95164 100644 --- a/cano/tests/testing_module_e2e.rs +++ b/cano/tests/testing_module_e2e.rs @@ -41,7 +41,12 @@ async fn recording_observer_captures_path_and_checkpoints() { .with_checkpoint_store(store.clone()) .with_workflow_id("run-1"); - assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done); + assert_eq!( + wf.orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap(), + S::Done + ); observer.assert_path(&["Start", "Work", "Done"]); observer.assert_completed_with("Done"); @@ -88,7 +93,7 @@ async fn in_memory_store_supports_resume_after_failure() { // Generation 1: crashes at Work. The log keeps Start + Work entries. build(runs.clone()) - .orchestrate(S::Start) + .orchestrate(S::Start, CancellationToken::disabled()) .await .expect_err("generation 1 must fail at Work"); assert_eq!(store.load_run(wf_id).await.unwrap().len(), 2); @@ -96,7 +101,13 @@ async fn in_memory_store_supports_resume_after_failure() { // Generation 2: resume re-enters at Work, re-runs it (now succeeds), reaches Done. 
let observer = Arc::new(RecordingObserver::new()); let resumed = build(runs.clone()).with_observer(observer.clone()); - assert_eq!(resumed.resume_from(wf_id).await.unwrap(), S::Done); + assert_eq!( + resumed + .resume_from(wf_id, CancellationToken::disabled()) + .await + .unwrap(), + S::Done + ); assert!( observer .events() @@ -115,7 +126,10 @@ async fn panic_on_attempt_fails_fast_with_panic_error() { .register(S::Start, panic_on_attempt(1, S::Done)) .add_exit_state(S::Done) .with_observer(observer.clone()); - let err = wf.orchestrate(S::Start).await.unwrap_err(); + let err = wf + .orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap_err(); assert!(err.to_string().contains("panic"), "{err}"); // Panics are not retried — no retry event recorded. assert!( @@ -191,7 +205,9 @@ async fn assert_compensation_ran_matches_reverse_order() { .register_with_compensation(S::Work, Step2) .register(S::Finish, Fail) .add_exit_state(S::Done); - let _ = saga.orchestrate(S::Start).await; // fails, rolls back + let _ = saga + .orchestrate(S::Start, CancellationToken::disabled()) + .await; // fails, rolls back let ran = handle.0.lock().unwrap().clone(); // Step2 completed last, so it compensates first; then Step1. 
diff --git a/cano/tests/testing_state_coverage.rs b/cano/tests/testing_state_coverage.rs index 98699f0..24b579b 100644 --- a/cano/tests/testing_state_coverage.rs +++ b/cano/tests/testing_state_coverage.rs @@ -47,7 +47,9 @@ async fn router_hop_is_counted() { .register(S::Worker, Go(S::Done)) .add_exit_state(S::Done) .with_observer(observer.clone()); - wf.orchestrate(S::Start).await.unwrap(); + wf.orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap(); observer .assert_registered_states_entered(&wf) .expect("router hops counted"); @@ -61,7 +63,9 @@ async fn unregistered_routed_state_returns_err_does_not_panic() { .register(S::Orphan, Go(S::Done)) .add_exit_state(S::Done) .with_observer(observer.clone()); - wf.orchestrate(S::Start).await.unwrap(); + wf.orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap(); let missing = observer.assert_registered_states_entered(&wf).unwrap_err(); assert!(missing.contains(&"Orphan".to_string()), "got: {missing:?}"); } @@ -73,7 +77,9 @@ async fn multi_state_with_explicit_list() { .register(S::Start, Go(S::Done)) .add_exit_state(S::Done) .with_observer(observer.clone()); - wf.orchestrate(S::Start).await.unwrap(); + wf.orchestrate(S::Start, CancellationToken::disabled()) + .await + .unwrap(); let missing = observer .assert_all_states_entered(&[S::Start, S::Route, S::Worker]) .unwrap_err(); diff --git a/cano/tests/timer_task_e2e.rs b/cano/tests/timer_task_e2e.rs index 6569b22..bac917a 100644 --- a/cano/tests/timer_task_e2e.rs +++ b/cano/tests/timer_task_e2e.rs @@ -51,7 +51,10 @@ async fn duration_timer_fires_after_expected_delay() { .register(Step::Process, Process) .add_exit_state(Step::Done); - let result = workflow.orchestrate(Step::Wait).await.unwrap(); + let result = workflow + .orchestrate(Step::Wait, CancellationToken::disabled()) + .await + .unwrap(); let elapsed = start.elapsed(); assert_eq!(result, Step::Done); @@ -92,7 +95,7 @@ async fn instant_timer_fires_at_correct_instant() { 
.register(Step::Wait, NearFutureInstantTimer) .register(Step::Process, Process) .add_exit_state(Step::Done) - .orchestrate(Step::Wait) + .orchestrate(Step::Wait, CancellationToken::disabled()) .await .unwrap(); @@ -132,7 +135,7 @@ async fn attempt_timeout_cancels_long_timer() { .register(Step::Wait, OneHourTimer) .register(Step::Process, Process) .add_exit_state(Step::Done) - .orchestrate(Step::Wait) + .orchestrate(Step::Wait, CancellationToken::disabled()) .await .unwrap_err(); diff --git a/cano/tests/tracing_tests.rs b/cano/tests/tracing_tests.rs index 0aaa896..b6cbfe3 100644 --- a/cano/tests/tracing_tests.rs +++ b/cano/tests/tracing_tests.rs @@ -51,7 +51,10 @@ async fn test_workflow_with_tracing_span() { .register(TestState::Processing, TestTask::new("processing")) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -71,7 +74,10 @@ async fn test_concurrent_workflow_with_tracing_span() { .register(TestState::Processing, TestTask::new("processing")) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -117,7 +123,10 @@ async fn test_workflow_tracing_without_custom_span() { .register(TestState::Processing, TestTask::new("processing")) .add_exit_state(TestState::Complete); - let result = workflow.orchestrate(TestState::Start).await.unwrap(); + let result = workflow + .orchestrate(TestState::Start, CancellationToken::disabled()) + .await + .unwrap(); assert_eq!(result, TestState::Complete); } @@ -217,7 +226,13 @@ async fn test_tracing_observer_runs_workflow() { ) .add_exit_state(Flow::Done) .with_observer(Arc::new(TracingObserver::new())); - 
assert_eq!(workflow.orchestrate(Flow::A).await.unwrap(), Flow::Done); + assert_eq!( + workflow + .orchestrate(Flow::A, CancellationToken::disabled()) + .await + .unwrap(), + Flow::Done + ); } #[derive(Clone)] @@ -255,7 +270,12 @@ async fn test_tracing_observer_captures_events() { .register(Flow::A, AlwaysFail) .add_exit_state(Flow::Done) .with_observer(Arc::new(TracingObserver::new())); - assert!(fail_wf.orchestrate(Flow::A).await.is_err()); + assert!( + fail_wf + .orchestrate(Flow::A, CancellationToken::disabled()) + .await + .is_err() + ); // Pre-tripped breaker → on_circuit_open + on_task_failure. let breaker = Arc::new(CircuitBreaker::new(CircuitPolicy { @@ -274,7 +294,10 @@ async fn test_tracing_observer_captures_events() { ) .add_exit_state(Flow::Done) .with_observer(Arc::new(TracingObserver::new())); - let cb_err = cb_wf.orchestrate(Flow::B).await.unwrap_err(); + let cb_err = cb_wf + .orchestrate(Flow::B, CancellationToken::disabled()) + .await + .unwrap_err(); // The FSM wraps task failures with state context; `.inner()` peels one layer. assert!(matches!(cb_err.inner(), CanoError::CircuitOpen(_))); diff --git a/docs/content/_index.md b/docs/content/_index.md index d199521..41fbfe7 100644 --- a/docs/content/_index.md +++ b/docs/content/_index.md @@ -202,7 +202,7 @@ async fn main() -> Result<(), CanoError> { .register(WorkflowState::Process, DoneTask) .add_exit_state(WorkflowState::Complete); - workflow.orchestrate(WorkflowState::Start).await?; + workflow.orchestrate(WorkflowState::Start, CancellationToken::disabled()).await?; Ok(()) } diff --git a/docs/content/metrics/_index.md b/docs/content/metrics/_index.md index 34fa5c7..359700d 100644 --- a/docs/content/metrics/_index.md +++ b/docs/content/metrics/_index.md @@ -108,6 +108,7 @@ Workflow::bare()
  • cano_circuit_open_events_total{task} — on on_circuit_open
  • cano_checkpoints_observed_total — on on_checkpoint
  • cano_resumes_total — on on_resume
  • +
  • cano_observed_cancellations_total — on on_cancelled (run cancelled via a CancellationToken)
  • on_task_start is intentionally not counted — every dispatch already shows up in @@ -408,7 +409,7 @@ async fn main() { // 2. Run the workflow a few times directly. for _ in 0..3 { workflow() - .orchestrate(Step::Fetch) + .orchestrate(Step::Fetch, CancellationToken::disabled()) .await .expect("workflow run"); } diff --git a/docs/content/observers/_index.md b/docs/content/observers/_index.md index 8da2137..a2fba75 100644 --- a/docs/content/observers/_index.md +++ b/docs/content/observers/_index.md @@ -86,7 +86,7 @@ async fn main() -> Result<(), CanoError> { .add_exit_state(Step::Done) .with_observer(counter.clone()); - workflow.orchestrate(Step::Start).await?; + workflow.orchestrate(Step::Start, CancellationToken::disabled()).await?; assert_eq!(counter.0.load(Ordering::Relaxed), 1); Ok(()) } @@ -151,6 +151,15 @@ before the compensation drain runs. Followed on the public API's return by CanoError::WithStateContext wrapping a CanoError::WorkflowTimeout (clean rollback) or CanoError::CompensationFailed if a compensate also fails (its errors[0] carries the wrapped timeout).

    + +

    on_cancelled(state: &str)

    +

    Fired when a run is cancelled via a +CancellationToken — either observed at a state +boundary or while a cancellable task was in flight. state is the Debug +rendering of the state the cancel was observed at. Fires exactly once per cancelled run, before the +compensation drain. Followed on the public API's return by CanoError::WithStateContext +wrapping a CanoError::Cancelled (clean rollback) or CanoError::CompensationFailed +whose errors[0] is the wrapped cancel (dirty rollback).

    @@ -260,7 +269,7 @@ async fn main() -> Result<(), CanoError> { .add_exit_state(Step::Done) .with_observer(Arc::new(TracingObserver::new())); - workflow.orchestrate(Step::Start).await?; + workflow.orchestrate(Step::Start, CancellationToken::disabled()).await?; Ok(()) } ``` @@ -284,6 +293,7 @@ Because the events carry the cano::observer target, you can filter on_checkpointDEBUG"checkpoint appended"workflow_id, sequence on_resumeINFO"workflow resumed from checkpoint"workflow_id, sequence on_workflow_timeoutWARN"workflow total timeout exceeded"elapsed_ms, limit_ms +on_cancelledWARN"workflow cancelled"state
    @@ -420,7 +430,7 @@ async fn main() -> Result<(), CanoError> { .add_exit_state(Step::Done) .with_observer(observer.clone()); - workflow.orchestrate(Step::Load).await?; + workflow.orchestrate(Step::Load, CancellationToken::disabled()).await?; assert_eq!(observer.failures.load(Ordering::Relaxed), 0); Ok(()) } diff --git a/docs/content/recovery/_index.md b/docs/content/recovery/_index.md index 6f96d09..ab6b4f8 100644 --- a/docs/content/recovery/_index.md +++ b/docs/content/recovery/_index.md @@ -85,7 +85,7 @@ async fn main() -> Result<(), CanoError> { .with_checkpoint_store(checkpoint_store) .with_workflow_id("run-42"); - workflow.orchestrate(Step::Start).await?; + workflow.orchestrate(Step::Start, CancellationToken::disabled()).await?; Ok(()) } ``` @@ -175,7 +175,7 @@ async fn main() -> Result<(), CanoError> { .with_checkpoint_store(checkpoint_store); // Some earlier process crashed mid-run; pick up where it left off. - let final_state = workflow.resume_from("run-42").await?; + let final_state = workflow.resume_from("run-42", CancellationToken::disabled()).await?; assert_eq!(final_state, Step::Done); Ok(()) } @@ -435,10 +435,10 @@ async fn main() -> Result<(), CanoError> { .with_workflow_id("demo-run"); // Run 1: crashes inside ProcessTask. The Start and Process rows are already durable. - let _ = workflow.orchestrate(Step::Start).await; + let _ = workflow.orchestrate(Step::Start, CancellationToken::disabled()).await; // Run 2: resume — re-runs ProcessTask (now it succeeds) and finishes at Done. 
- let final_state = workflow.resume_from("demo-run").await?; + let final_state = workflow.resume_from("demo-run", CancellationToken::disabled()).await?; assert_eq!(final_state, Step::Done); // The append-only log: Start, Process (crash), Process (re-run), Finalize, Done — diff --git a/docs/content/resilience/_index.md b/docs/content/resilience/_index.md index 72a2fb4..2816c84 100644 --- a/docs/content/resilience/_index.md +++ b/docs/content/resilience/_index.md @@ -30,7 +30,8 @@ probes — lives in Recovery, Saga
  • Workflow Total Timeout
  • Compensation drain budget
  • Observer hook
  • -
  • The three timeout knobs
  • +
  • The two timeout knobs
  • +
  • Cooperative Cancellation
  • Circuit Breakers & Rate Limiting
  • Bulkheads (split concurrency)
  • Panic Safety
  • @@ -101,8 +102,7 @@ impl CallTask { ```

    Distinct from Workflow::with_total_timeout (the wall-clock budget for the entire -orchestration — see below) and from the legacy Workflow::with_timeout (a blunt outer -tokio::time::timeout with no graceful compensation). The full TaskConfig / +orchestration — see below). The full TaskConfig / RetryMode API — including how attempt timeouts compose with each retry mode — lives in Tasks → Configuration & Retries.


    @@ -158,19 +158,18 @@ fields under the cano::observer target. See The three timeout knobs +

    The two timeout knobs

    -
    APIScopeOn expiryCompensation drain
    TaskConfig::with_attempt_timeoutOne attempt of one taskCanoError::Timeout — retried like any other failure; final timeout becomes RetryExhaustedTriggered like any other terminal task error (unbounded)
    Workflow::with_total_timeoutThe entire orchestrate / resume_from callIn-flight task aborted; CanoError::WorkflowTimeout (wrapped in WithStateContext)Bounded by with_compensation_timeout or the default min(remaining/2, 30s)
    Workflow::with_timeout (legacy)The whole orchestration futureCanoError::Workflow("Workflow timeout exceeded") — no graceful abortNone — the future is dropped abruptly

    -Pick with_total_timeout for any new code that needs a workflow-wide budget. The legacy -with_timeout remains for backward compatibility and composes naturally — if both are -set, whichever fires first wins. +Reach for with_attempt_timeout to bound a single attempt of a task and with_total_timeout +for a workflow-wide budget; they compose. To stop a run on an external signal rather than a deadline, +use cooperative cancellation.

    @@ -180,6 +179,78 @@ and the final error is the wrapped WorkflowTimeout.


    +

    Cooperative Cancellation

    +

    +Where a total timeout aborts a run on a deadline, cancellation aborts it on a signal +you control — a shutdown handler, a user "stop" button, a parent task giving up. +Workflow::orchestrate(start, token) (and resume_from(id, token)) always take +a CancellationToken; firing the paired CancellationHandle aborts the +in-flight task at its next await point, drains the +saga compensation stack, and returns CanoError::Cancelled wrapped +in CanoError::WithStateContext (or CanoError::CompensationFailed if a +compensate also fails). +

    + +```rust +use cano::CancellationToken; + +let (handle, token) = CancellationToken::new(); + +// Cancel from anywhere — a signal handler, a sibling task, a timer: +let canceller = tokio::spawn(async move { + shutdown_signal().await; + handle.cancel(); // idempotent; the handle is Clone, so many owners can trigger it +}); + +let result = workflow.orchestrate(Step::Reserve, token).await; +assert!(matches!(result, Err(e) if e.category() == "cancelled")); +``` + +

    +To opt a run out of cancellation, pass CancellationToken::disabled() instead of a live +token: workflow.orchestrate(start, CancellationToken::disabled()). A disabled token +never fires and is zero-cost — the cancellation select! is skipped entirely. +

    + +
    +

    Cancellation is cooperative. The engine drops the running task's future at its next +.await. A task spinning in a tight synchronous/CPU loop with no .await is not +interrupted until it next yields — design long-running task bodies to .await periodically +if they must be cancellable.

    +
    + +

    Saga safety

    +

    +A compensatable task is never interrupted mid-run. Aborting it +after an in-task side effect committed but before its Output reached the compensation +stack would orphan that side effect with nothing to roll back. So a CompensatableTask +always runs to completion (recording its rollback entry); the cancellation is honoured at the +next state boundary, which then drains the now-complete stack. The compensation drain itself +is uncancellable — a cancel that lands during rollback does not abort the remaining compensators. +

    + +

    Observer hook & precedence

    +

    +A WorkflowObserver receives one on_cancelled(state) call when the cancel is +observed, before the drain runs; TracingObserver re-emits it as a WARN event +and MetricsObserver increments cano_observed_cancellations_total. Against +with_total_timeout, cancellation wins: it is checked deterministically at each state +boundary and biased ahead of the per-state budget mid-task. +

    + +
    +

    The scheduler builds on this: RunningScheduler::cancel_flow(id) +cancels an in-flight scheduled run, and graceful stop() cancels every in-flight flow +(rolling back their sagas) instead of waiting for them to finish.

    +
    + +
    +

    Runnable example: cargo run --example workflow_cancellation — a 3-step saga where a +sibling task cancels the shipping step mid-flight; the prior steps' compensations run in reverse and +the final error is the wrapped Cancelled.

    +
    +
    +

    Circuit Breakers & Rate Limiting

    Two of the most-used resilience primitives have their own dedicated pages:

      diff --git a/docs/content/resources/_index.md b/docs/content/resources/_index.md index c51f4f1..06fc3d1 100644 --- a/docs/content/resources/_index.md +++ b/docs/content/resources/_index.md @@ -86,7 +86,7 @@ async fn main() -> Result<(), CanoError> { .register(Step::Init, InitTask) .add_exit_state(Step::Done); - workflow.orchestrate(Step::Init).await?; + workflow.orchestrate(Step::Init, CancellationToken::disabled()).await?; Ok(()) } diff --git a/docs/content/saga/_index.md b/docs/content/saga/_index.md index b4c5e0a..31cd2d7 100644 --- a/docs/content/saga/_index.md +++ b/docs/content/saga/_index.md @@ -325,7 +325,7 @@ async fn main() { .register(Step::Ship, ShipOrder) // plain — and it fails .add_exit_state(Step::Done); - match workflow.orchestrate(Step::Reserve).await { + match workflow.orchestrate(Step::Reserve, CancellationToken::disabled()).await { Ok(state) => println!("completed at {state:?}"), Err(error) => println!("failed, rolled back: {error}"), // "courier unavailable" — the original error } diff --git a/docs/content/scheduler/_index.md b/docs/content/scheduler/_index.md index 471759c..b733da6 100644 --- a/docs/content/scheduler/_index.md +++ b/docs/content/scheduler/_index.md @@ -22,6 +22,7 @@ template = "section.html"
    • Manual Triggering
    • Mixed Scheduling
    • Backoff & Trip State
    • +
    • Cancelling a Flow
    • Graceful Shutdown
    • Advanced: Multi-Level Map-Reduce
    • @@ -57,7 +58,7 @@ override it per flow via set_backoff. Scheduler is RunningScheduler is the live handle returned by scheduler.start().await?. It owns the spawned driver and per-flow loop tasks. It is cheap to clone — every clone shares the same command channel and flow registry, so you can call trigger, -status, list, reset_flow, and stop from any task. +status, list, reset_flow, cancel_flow, and stop from any task.

    start consumes the builder, so the compiler prevents you from starting the same scheduler @@ -490,24 +491,46 @@ on a dedicated page:


    +

    Cancelling a Flow

    +

    +RunningScheduler::cancel_flow(id) requests cooperative cancellation of a flow's in-flight +run. The running workflow aborts at its next await point, its saga compensation +stack drains (rolling back completed steps), and the flow returns to Status::Idle. A +deliberate cancel is not counted as a failure against the +BackoffPolicy — the streak is left untouched and the flow never trips, so +its next scheduled run fires normally. Cancelling a flow that isn't currently running is an idempotent +no-op. +

    + +```rust +// Stop the in-flight run of a flow; its saga rolls back and the flow goes Idle. +running.cancel_flow("order").await?; +``` + +
    +

    Runnable example: cargo run --example scheduler_cancellation --features scheduler — +triggers a saga, cancels it mid-flight, and watches the compensators roll back in reverse.

    +
    +
    +

    Graceful Shutdown

    -The scheduler supports graceful shutdown, allowing currently running workflows to complete before stopping. -This includes workflows started by interval or cron triggers as well as manually-triggered workflows. -All active executions are tracked and included in the shutdown wait. +When stop() is called, the scheduler signals all scheduling loops to stop and then +cooperatively cancels every in-flight flow — interval, cron, and manual alike — the +same way cancel_flow does: each running workflow aborts at its next await and drains its +saga before the scheduler runs resource teardown (reverse registration order) and returns. +Shutdown latency is therefore bounded by the time to the next await plus the compensation drain, not by +how long the workflows would naturally take.

    ```rust -// Stop the scheduler and wait for running flows to finish. +// Stop the scheduler: in-flight flows are cancelled + rolled back, then teardown runs. running.stop().await?; - ```

    -When stop() is called, the scheduler signals all scheduling loops to stop, -waits up to 30 seconds for any in-progress workflow executions to finish, and runs each -workflow's resource teardown_all in reverse registration order before returning. -A second stop() call after success is idempotent — it returns the same cached result. +A bounded wait (up to 30 seconds) caps the drain; a second stop() after success is +idempotent — it returns the same cached result.

    diff --git a/docs/content/split-join/_index.md b/docs/content/split-join/_index.md index c193bd6..6b18360 100644 --- a/docs/content/split-join/_index.md +++ b/docs/content/split-join/_index.md @@ -253,7 +253,7 @@ async fn main() -> Result<(), CanoError> { .register(DataState::Aggregate, Aggregator) .add_exit_state(DataState::Complete); - let result = workflow.orchestrate(DataState::Start).await?; + let result = workflow.orchestrate(DataState::Start, CancellationToken::disabled()).await?; let final_result: i32 = store.get("final_result")?; println!("Workflow completed: {:?} — total {}", result, final_result); Ok(()) diff --git a/docs/content/split-join/parallel-patterns.md b/docs/content/split-join/parallel-patterns.md index 9d3d5d2..b98a0e8 100644 --- a/docs/content/split-join/parallel-patterns.md +++ b/docs/content/split-join/parallel-patterns.md @@ -128,7 +128,7 @@ async fn main() -> Result<(), CanoError> { .add_exit_state(QueueState::Complete); loop { - let result = workflow.orchestrate(QueueState::PullBatch).await?; + let result = workflow.orchestrate(QueueState::PullBatch, CancellationToken::disabled()).await?; if result == QueueState::Complete && queue.lock().await.is_empty() { break; } @@ -208,7 +208,7 @@ async fn main() -> Result<(), CanoError> { .register(DataState::Aggregate, FinishAggregate) .add_exit_state(DataState::Complete); - workflow.orchestrate(DataState::LoadRecords).await?; + workflow.orchestrate(DataState::LoadRecords, CancellationToken::disabled()).await?; Ok(()) } ``` diff --git a/docs/content/stepped-task/_index.md b/docs/content/stepped-task/_index.md index 798ad73..74a5b32 100644 --- a/docs/content/stepped-task/_index.md +++ b/docs/content/stepped-task/_index.md @@ -197,7 +197,7 @@ let workflow = Workflow::new(resources) .add_exit_state(Stage::Done); // First run crashes after 600/1000 steps. 
Restart: -// let result = workflow.resume_from("nightly-crunch").await?; +// let result = workflow.resume_from("nightly-crunch", CancellationToken::disabled()).await?; // step() is first called with Some(Progress { processed: 600, .. }) — not None. ```
    diff --git a/docs/content/store/_index.md b/docs/content/store/_index.md index a9aec8a..95e3437 100644 --- a/docs/content/store/_index.md +++ b/docs/content/store/_index.md @@ -300,7 +300,7 @@ async fn main() -> Result<(), CanoError> { .register(Stage::Transform, TransformTask) .add_exit_state(Stage::Complete); - workflow.orchestrate(Stage::Ingest).await?; + workflow.orchestrate(Stage::Ingest, CancellationToken::disabled()).await?; // Read results after the workflow completes let result: Vec = store.get("result")?; diff --git a/docs/content/testing/_index.md b/docs/content/testing/_index.md index 0c1875d..d28da0b 100644 --- a/docs/content/testing/_index.md +++ b/docs/content/testing/_index.md @@ -86,7 +86,7 @@ async fn recording_observer_captures_the_path() { .add_exit_state(S::Done) .with_observer(observer.clone()); - assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done); + assert_eq!(wf.orchestrate(S::Start, CancellationToken::disabled()).await.unwrap(), S::Done); // Assert the whole path, or inspect events directly. observer.assert_path(&["Start", "Done"]); @@ -144,7 +144,7 @@ async fn every_registered_state_is_reached() { .register(S::Worker, Go(S::Done)) .add_exit_state(S::Done) .with_observer(observer.clone()); - wf.orchestrate(S::Start).await.unwrap(); + wf.orchestrate(S::Start, CancellationToken::disabled()).await.unwrap(); // Every registered handler was actually reached — no dead states. 
observer.assert_registered_states_entered(&wf).expect("no dead states"); @@ -189,7 +189,7 @@ async fn checkpoints_run_in_memory() { .add_exit_state(S::Done) .with_checkpoint_store(store.clone()) .with_workflow_id("run-1"); - assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done); + assert_eq!(wf.orchestrate(S::Start, CancellationToken::disabled()).await.unwrap(), S::Done); } ``` @@ -240,7 +240,7 @@ async fn panicking_task_fails_fast() { .register(S::Start, panic_on_attempt(1, S::Done)) .add_exit_state(S::Done); - let err = wf.orchestrate(S::Start).await.unwrap_err(); + let err = wf.orchestrate(S::Start, CancellationToken::disabled()).await.unwrap_err(); assert!(err.to_string().contains("panic")); } ``` diff --git a/docs/content/tracing/_index.md b/docs/content/tracing/_index.md index 655bf0e..2e53387 100644 --- a/docs/content/tracing/_index.md +++ b/docs/content/tracing/_index.md @@ -382,7 +382,7 @@ async fn main() -> Result<(), CanoError> { // 3. Run info!("Submitting order..."); - workflow.orchestrate(State::Start).await?; + workflow.orchestrate(State::Start, CancellationToken::disabled()).await?; Ok(()) } diff --git a/docs/content/workflows/_index.md b/docs/content/workflows/_index.md index dc4bb19..ec5ca68 100644 --- a/docs/content/workflows/_index.md +++ b/docs/content/workflows/_index.md @@ -14,6 +14,7 @@ template = "section.html"
    1. Defining States
    2. Building a Workflow
    3. +
    4. Cancelling a Run
    5. Builder Pattern
    6. Validation & Errors
    7. Parallel Tasks: Split & Join
    8. @@ -124,7 +125,7 @@ async fn main() -> Result<(), CanoError> { .add_exit_states(vec![OrderState::Complete, OrderState::Failed]); // 3. Execute - let result = workflow.orchestrate(OrderState::Start).await?; + let result = workflow.orchestrate(OrderState::Start, CancellationToken::disabled()).await?; println!("Final State: {:?}", result); Ok(()) @@ -138,6 +139,41 @@ like the one above.


      +

      Cancelling a Run

      +

      +orchestrate always takes a CancellationToken as its second argument. The +example above passes CancellationToken::disabled() — a token that never fires, opting +the run out of cancellation at zero cost. To stop a run early on a signal you control — a +shutdown handler, a user "stop" button, a parent task giving up — pass a live token from +CancellationToken::new() instead and keep its paired CancellationHandle. +Firing the handle aborts the in-flight task at its next .await, drains the +saga compensation stack, and returns CanoError::Cancelled (wrapped in CanoError::WithStateContext). +

      + +```rust +use cano::prelude::*; + +let (handle, token) = CancellationToken::new(); + +// Cancel from anywhere — the handle is Clone and cancel() is idempotent: +tokio::spawn(async move { + shutdown_signal().await; + handle.cancel(); +}); + +let result = workflow.orchestrate(OrderState::Start, token).await; +assert!(matches!(result, Err(e) if e.category() == "cancelled")); +``` + +

      +Cancellation is cooperative (a task is interrupted only at an .await) and +saga-safe (a compensatable task is never interrupted mid-run). The +Resilience → Cooperative Cancellation page covers the full +semantics, the on_cancelled observer hook, and precedence against +with_total_timeout. +

      +
      +

      Builder Pattern and #[must_use]

      Workflow uses a builder pattern where the register* methods and @@ -319,21 +355,21 @@ fn build_workflow(store: MemoryStore) -> Workflow { .register(TextPipelineState::Parse, ParseTask) .register(TextPipelineState::Transform, TransformTask) .add_exit_state(TextPipelineState::Done) - .with_timeout(Duration::from_secs(5)) + .with_total_timeout(Duration::from_secs(5)) } // Inside an HTTP handler: let store = MemoryStore::new(); // fresh store — full isolation store.put("input_text", text)?; let workflow = build_workflow(store.clone()); -let final_state = workflow.orchestrate(TextPipelineState::Parse).await?; // which terminal branch ran +let final_state = workflow.orchestrate(TextPipelineState::Parse, CancellationToken::disabled()).await?; // which terminal branch ran let word_count: usize = store.get("word_count")?; ```

      Tip

      -Use .with_timeout() on the workflow to keep a hung request from blocking indefinitely. For +Use .with_total_timeout() on the workflow to keep a hung request from blocking indefinitely. For read-heavy workloads with shared reference data, pre-populate one store, share it via Arc, and use per-request keys to avoid collisions. The full Axum version is in cargo run --example workflow_on_request. diff --git a/docs/content/workflows/validation-and-errors.md b/docs/content/workflows/validation-and-errors.md index 0c3dabd..d6e956d 100644 --- a/docs/content/workflows/validation-and-errors.md +++ b/docs/content/workflows/validation-and-errors.md @@ -90,7 +90,7 @@ async fn main() -> Result<(), CanoError> { workflow.validate_initial_state(&State::Start)?; // Safe to orchestrate - let _result = workflow.orchestrate(State::Start).await?; + let _result = workflow.orchestrate(State::Start, CancellationToken::disabled()).await?; Ok(()) } ``` @@ -133,9 +133,9 @@ during execution. Understanding these errors helps you build robust error recove Increase with_total_timeout() or speed up the workflow; see Resilience → Workflow Total Timeout -CanoError::Workflow -Legacy with_timeout() outer tokio::time::timeout elapsed (no graceful compensation) -Prefer with_total_timeout() for new code; otherwise increase with_timeout() or optimize task execution time +CanoError::Cancelled +Run cancelled via a live CancellationToken passed to orchestrate / resume_from; in-flight task aborted, compensation stack drained. Surfaced under CanoError::WithStateContext (or CompensationFailed on a dirty rollback). +Expected when you cancel deliberately; see Resilience → Cooperative Cancellation CanoError::Configuration @@ -181,7 +181,7 @@ through the join strategy.

      ```rust -match workflow.orchestrate(State::Start).await { +match workflow.orchestrate(State::Start, CancellationToken::disabled()).await { Ok(final_state) => println!("Completed: {:?}", final_state), Err(CanoError::Workflow(msg)) => eprintln!("Workflow error: {}", msg), Err(CanoError::Configuration(msg)) => eprintln!("Config error: {}", msg),