diff --git a/README.md b/README.md
index 8f6b55b..6be4388 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,7 @@ async fn main() -> Result<(), CanoError> {
         .add_exit_state(FlowState::Complete);
 
     // 5. Run.
-    let result = workflow.orchestrate(FlowState::Start).await?;
+    let result = workflow.orchestrate(FlowState::Start, CancellationToken::disabled()).await?;
     println!("Workflow finished: {:?}", result);
 
     Ok(())
diff --git a/cano-e2e/src/bin/cano_workflow_app.rs b/cano-e2e/src/bin/cano_workflow_app.rs
index 260aad7..3b609ea 100644
--- a/cano-e2e/src/bin/cano_workflow_app.rs
+++ b/cano-e2e/src/bin/cano_workflow_app.rs
@@ -13,6 +13,7 @@
 use std::str::FromStr;
 use std::sync::Arc;
 
+use cano::CancellationToken;
 use cano_e2e::{Faults, Phase, PostgresCheckpointStore, StdoutTracer, build_workflow};
 
 #[tokio::main]
@@ -62,8 +63,16 @@ async fn main() -> anyhow::Result<()> {
     emit(&format!("READY {workflow_id} {mode}"));
 
     let result = match mode.as_str() {
-        "resume" => workflow.resume_from(workflow_id.clone()).await,
-        "run" => workflow.orchestrate(Phase::Reserve).await,
+        "resume" => {
+            workflow
+                .resume_from(workflow_id.clone(), CancellationToken::disabled())
+                .await
+        }
+        "run" => {
+            workflow
+                .orchestrate(Phase::Reserve, CancellationToken::disabled())
+                .await
+        }
         other => anyhow::bail!("unknown mode {other:?}"),
     };
     match result {
diff --git a/cano-macros/tests/batch_task_impl.rs b/cano-macros/tests/batch_task_impl.rs
index ff59645..9aec98e 100644
--- a/cano-macros/tests/batch_task_impl.rs
+++ b/cano-macros/tests/batch_task_impl.rs
@@ -105,7 +105,10 @@ async fn inherent_inferred_integrates_with_workflow() {
         .register(Step::Process, InherentInferred)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::Process).await.unwrap();
+    let result = workflow
+        .orchestrate(Step::Process, CancellationToken::disabled())
+        .await
+        .unwrap();
     assert_eq!(result, Step::Done);
 }
 
@@ -197,7 +200,10 @@ async fn inherent_with_key_integrates_with_workflow() {
         .register(Step::Process, InherentWithKey)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::Process).await.unwrap();
+    let result = workflow
+        .orchestrate(Step::Process, CancellationToken::disabled())
+        .await
+        .unwrap();
     assert_eq!(result, Step::Done);
 }
 
@@ -394,7 +400,10 @@ async fn trait_form_integrates_with_workflow() {
         .register(Step::Process, TraitBatch)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::Process).await.unwrap();
+    let result = workflow
+        .orchestrate(Step::Process, CancellationToken::disabled())
+        .await
+        .unwrap();
     assert_eq!(result, Step::Done);
 }
 
@@ -534,7 +543,10 @@ async fn end_to_end_workflow_load_process_finish() {
         .register(Step::Process, LoadStep)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::Process).await.unwrap();
+    let result = workflow
+        .orchestrate(Step::Process, CancellationToken::disabled())
+        .await
+        .unwrap();
     assert_eq!(result, Step::Done);
 
     let output: Vec<u32> = store.get("output").unwrap();
diff --git a/cano-macros/tests/compensatable_task_impl.rs b/cano-macros/tests/compensatable_task_impl.rs
index 6f8bde2..3172527 100644
--- a/cano-macros/tests/compensatable_task_impl.rs
+++ b/cano-macros/tests/compensatable_task_impl.rs
@@ -72,7 +72,10 @@ async fn inherent_compensatable_impl_registers_and_compensates() {
         .register(Step::Boom, Boom)
         .add_exit_state(Step::Done);
 
-    let err = workflow.orchestrate(Step::Reserve).await.unwrap_err();
+    let err = workflow
+        .orchestrate(Step::Reserve, CancellationToken::disabled())
+        .await
+        .unwrap_err();
     assert_eq!(err.message(), "boom"); // clean rollback -> the original failure is surfaced
     assert!(
         compensated.load(Ordering::SeqCst),
@@ -85,7 +88,10 @@ async fn inherent_compensatable_impl_registers_and_compensates() {
         .register_with_compensation(Step::Reserve, ReserveNamed)
         .add_exit_state(Step::Done);
     assert_eq!(
-        workflow.orchestrate(Step::Reserve).await.unwrap(),
+        workflow
+            .orchestrate(Step::Reserve, CancellationToken::disabled())
+            .await
+            .unwrap(),
         Step::Done
     );
 }
diff --git a/cano-macros/tests/poll_task_impl.rs b/cano-macros/tests/poll_task_impl.rs
index 7b467da..b27622e 100644
--- a/cano-macros/tests/poll_task_impl.rs
+++ b/cano-macros/tests/poll_task_impl.rs
@@ -219,7 +219,10 @@ async fn inherent_poller_integrates_with_workflow() {
         .register(Step::Poll, InherentPoller)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::Poll).await.unwrap();
+    let result = workflow
+        .orchestrate(Step::Poll, CancellationToken::disabled())
+        .await
+        .unwrap();
     assert_eq!(result, Step::Done);
 }
 
@@ -229,7 +232,10 @@ async fn trait_poller_integrates_with_workflow() {
         .register(Step::Poll, TraitPoller)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::Poll).await.unwrap();
+    let result = workflow
+        .orchestrate(Step::Poll, CancellationToken::disabled())
+        .await
+        .unwrap();
     assert_eq!(result, Step::Done);
 }
 
diff --git a/cano-macros/tests/router_task_impl.rs b/cano-macros/tests/router_task_impl.rs
index 30ae1ff..00b0f17 100644
--- a/cano-macros/tests/router_task_impl.rs
+++ b/cano-macros/tests/router_task_impl.rs
@@ -192,7 +192,10 @@ async fn inherent_router_integrates_with_workflow() {
         .register(Step::PathA, PathATask)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::Route).await.unwrap();
+    let result = workflow
+        .orchestrate(Step::Route, CancellationToken::disabled())
+        .await
+        .unwrap();
     assert_eq!(result, Step::Done);
 }
 
@@ -203,6 +206,9 @@ async fn trait_router_integrates_with_workflow() {
         .register(Step::PathA, PathATask)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::Route).await.unwrap();
+    let result = workflow
+        .orchestrate(Step::Route, CancellationToken::disabled())
+        .await
+        .unwrap();
     assert_eq!(result, Step::Done);
 }
diff --git a/cano-macros/tests/stepped_task_impl.rs b/cano-macros/tests/stepped_task_impl.rs
index 3f55a28..f26c827 100644
--- a/cano-macros/tests/stepped_task_impl.rs
+++ b/cano-macros/tests/stepped_task_impl.rs
@@ -380,7 +380,10 @@ async fn stepped_task_in_workflow() {
         .register(MyState::Work, stepper)
         .add_exit_state(MyState::Done);
 
-    let result = workflow.orchestrate(MyState::Work).await.unwrap();
+    let result = workflow
+        .orchestrate(MyState::Work, CancellationToken::disabled())
+        .await
+        .unwrap();
     assert_eq!(result, MyState::Done);
 }
 
diff --git a/cano-macros/tests/task_impl_name.rs b/cano-macros/tests/task_impl_name.rs
index 5a2c747..cc3bf3c 100644
--- a/cano-macros/tests/task_impl_name.rs
+++ b/cano-macros/tests/task_impl_name.rs
@@ -55,5 +55,11 @@ async fn task_still_runs_in_a_workflow() {
     let workflow = Workflow::bare()
         .register(Step::Start, NamedInherentTask)
         .add_exit_state(Step::Done);
-    assert_eq!(workflow.orchestrate(Step::Start).await.unwrap(), Step::Done);
+    assert_eq!(
+        workflow
+            .orchestrate(Step::Start, CancellationToken::disabled())
+            .await
+            .unwrap(),
+        Step::Done
+    );
 }
diff --git a/cano-macros/tests/timer_task_impl.rs b/cano-macros/tests/timer_task_impl.rs
index 5ef14f2..d663f77 100644
--- a/cano-macros/tests/timer_task_impl.rs
+++ b/cano-macros/tests/timer_task_impl.rs
@@ -212,7 +212,10 @@ async fn inherent_timer_integrates_with_workflow() {
         .register(Step::Wait, TraitTimer)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::Wait).await.unwrap();
+    let result = workflow
+        .orchestrate(Step::Wait, CancellationToken::disabled())
+        .await
+        .unwrap();
     assert_eq!(result, Step::Done);
 }
 
diff --git a/cano/Cargo.toml b/cano/Cargo.toml
index 3ffae3b..13691c3 100644
--- a/cano/Cargo.toml
+++ b/cano/Cargo.toml
@@ -101,6 +101,11 @@ name = "scheduler_graceful_shutdown"
 path = "examples/scheduler_graceful_shutdown.rs"
 required-features = ["scheduler"]
 
+[[example]]
+name = "scheduler_cancellation"
+path = "examples/scheduler_cancellation.rs"
+required-features = ["scheduler"]
+
 [[example]]
 name = "scheduler_mixed_workflows"
 path = "examples/scheduler_mixed_workflows.rs"
@@ -201,6 +206,10 @@ path = "examples/saga_payment.rs"
 name = "workflow_total_timeout"
 path = "examples/workflow_total_timeout.rs"
 
+[[example]]
+name = "workflow_cancellation"
+path = "examples/workflow_cancellation.rs"
+
 [[example]]
 name = "router_task"
 path = "examples/router_task.rs"
diff --git a/cano/benches/workflow_performance.rs b/cano/benches/workflow_performance.rs
index ee4cbd5..f85e700 100644
--- a/cano/benches/workflow_performance.rs
+++ b/cano/benches/workflow_performance.rs
@@ -577,7 +577,9 @@ fn bench_orchestrate_overhead(c: &mut Criterion) {
         b.to_async(&runtime).iter(|| {
             let workflow = Arc::clone(&workflow);
             async move {
-                let _ = workflow.orchestrate(S::Done).await;
+                let _ = workflow
+                    .orchestrate(S::Done, CancellationToken::disabled())
+                    .await;
             }
         });
     });
@@ -617,7 +619,9 @@ fn bench_large_split_collect(c: &mut Criterion) {
                 .add_exit_state(S::Done);
 
             b.to_async(&runtime).iter(|| async {
-                let _ = workflow.orchestrate(S::Start).await;
+                let _ = workflow
+                    .orchestrate(S::Start, CancellationToken::disabled())
+                    .await;
             });
         });
     }
@@ -659,7 +663,9 @@ fn bench_tracing_overhead(c: &mut Criterion) {
         b.to_async(&runtime).iter(|| {
             let workflow = Arc::clone(&workflow);
             async move {
-                let _ = workflow.orchestrate(S::Done).await;
+                let _ = workflow
+                    .orchestrate(S::Done, CancellationToken::disabled())
+                    .await;
             }
         });
     });
diff --git a/cano/examples/ai_workflow_yes_and.rs b/cano/examples/ai_workflow_yes_and.rs
index 96ba928..3849df7 100644
--- a/cano/examples/ai_workflow_yes_and.rs
+++ b/cano/examples/ai_workflow_yes_and.rs
@@ -318,7 +318,9 @@ async fn main() -> Result<(), CanoError> {
 
     println!("Starting improvised story...\n");
 
-    let final_state = workflow.orchestrate(ConversationState::Start).await?;
+    let final_state = workflow
+        .orchestrate(ConversationState::Start, CancellationToken::disabled())
+        .await?;
 
     println!("\nStory completed with state: {final_state:?}");
 
diff --git a/cano/examples/batch_task.rs b/cano/examples/batch_task.rs
index a93230b..6726e78 100644
--- a/cano/examples/batch_task.rs
+++ b/cano/examples/batch_task.rs
@@ -240,7 +240,9 @@ async fn main() -> CanoResult<()> {
         .register(Step::Summarise, Summarise { url_count })
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::ParseUrls).await?;
+    let result = workflow
+        .orchestrate(Step::ParseUrls, CancellationToken::disabled())
+        .await?;
     assert_eq!(result, Step::Done);
     println!("\ncompleted at {result:?}");
 
diff --git a/cano/examples/circuit_breaker.rs b/cano/examples/circuit_breaker.rs
index 5c7ad63..80283bb 100644
--- a/cano/examples/circuit_breaker.rs
+++ b/cano/examples/circuit_breaker.rs
@@ -98,7 +98,9 @@ async fn main() -> Result<(), CanoError> {
 
     println!("Phase 1 — dependency unhealthy, threshold = 3.");
     for attempt in 1..=5 {
-        let outcome = workflow.orchestrate(Step::Call).await;
+        let outcome = workflow
+            .orchestrate(Step::Call, CancellationToken::disabled())
+            .await;
         // `orchestrate` wraps task failures in `WithStateContext`; unwrap one layer
         // before pattern-matching on the underlying variant.
         let label = match outcome {
@@ -125,7 +127,10 @@ async fn main() -> Result<(), CanoError> {
 
     println!("Phase 3 — half-open trial probes the dependency, then closes the breaker.");
     for attempt in 1..=3 {
-        match workflow.orchestrate(Step::Call).await {
+        match workflow
+            .orchestrate(Step::Call, CancellationToken::disabled())
+            .await
+        {
             Ok(_) => println!(
                 "  recovery call {attempt}: ok | state={:?}",
                 breaker.state()
diff --git a/cano/examples/circuit_breaker_manual.rs b/cano/examples/circuit_breaker_manual.rs
index e0539b3..902713b 100644
--- a/cano/examples/circuit_breaker_manual.rs
+++ b/cano/examples/circuit_breaker_manual.rs
@@ -151,7 +151,9 @@ async fn main() -> Result<(), CanoError> {
     // ------------------------------------------------------------------
     println!("Phase 1: dependency unhealthy (threshold = 3 consecutive failures)");
     for attempt in 1..=5 {
-        let outcome = workflow.orchestrate(Step::Call).await;
+        let outcome = workflow
+            .orchestrate(Step::Call, CancellationToken::disabled())
+            .await;
         // `orchestrate` wraps task failures in `WithStateContext`; unwrap one layer
         // before pattern-matching on the underlying variant.
         let label = match &outcome {
@@ -185,7 +187,10 @@ async fn main() -> Result<(), CanoError> {
     // ------------------------------------------------------------------
     println!("\nPhase 3: half-open trial — one probe closes the breaker");
     for attempt in 1..=3 {
-        match workflow.orchestrate(Step::Call).await {
+        match workflow
+            .orchestrate(Step::Call, CancellationToken::disabled())
+            .await
+        {
             Ok(_) => println!("  call {attempt}: ok | breaker={:?}", breaker.state()),
             Err(e) => println!("  call {attempt}: err: {e} | breaker={:?}", breaker.state()),
         }
diff --git a/cano/examples/custom_checkpoint_store.rs b/cano/examples/custom_checkpoint_store.rs
index 7ff97dc..49f0e74 100644
--- a/cano/examples/custom_checkpoint_store.rs
+++ b/cano/examples/custom_checkpoint_store.rs
@@ -173,7 +173,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     // --- First run: Process crashes; the checkpoint log is kept. -------
     println!("=== run 1: Process will crash ===");
-    match workflow.orchestrate(Step::Init).await {
+    match workflow
+        .orchestrate(Step::Init, CancellationToken::disabled())
+        .await
+    {
         Ok(s) => println!("  completed at {s:?} (unexpected)"),
         Err(e) => println!("  stopped with error: {e}"),
     }
@@ -201,7 +204,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     // --- Second run: resume from last checkpoint (Process, attempt 2). ---
     println!("\n=== run 2: resume_from ===");
-    let final_state = workflow.resume_from(run_id).await?;
+    let final_state = workflow
+        .resume_from(run_id, CancellationToken::disabled())
+        .await?;
     println!("  reached {final_state:?}");
     assert_eq!(final_state, Step::Done);
 
diff --git a/cano/examples/join_strategies.rs b/cano/examples/join_strategies.rs
index 76cd54e..c550357 100644
--- a/cano/examples/join_strategies.rs
+++ b/cano/examples/join_strategies.rs
@@ -110,7 +110,9 @@ async fn run_strategy(label: &str, strategy: JoinStrategy) -> CanoResult<()> {
         .add_exit_state(Step::Done);
 
     let start = Instant::now();
-    let result = workflow.orchestrate(Step::Parallel).await?;
+    let result = workflow
+        .orchestrate(Step::Parallel, CancellationToken::disabled())
+        .await?;
     let elapsed = start.elapsed();
 
     // Count how many workers managed to log a result before being cancelled.
diff --git a/cano/examples/metrics_demo.rs b/cano/examples/metrics_demo.rs
index e249564..8ba6f01 100644
--- a/cano/examples/metrics_demo.rs
+++ b/cano/examples/metrics_demo.rs
@@ -60,7 +60,7 @@ async fn main() {
     // Run the workflow 3 times directly.
     for _ in 0..3 {
         workflow()
-            .orchestrate(Step::Fetch)
+            .orchestrate(Step::Fetch, CancellationToken::disabled())
             .await
             .expect("workflow run");
     }
diff --git a/cano/examples/metrics_tracing_context.rs b/cano/examples/metrics_tracing_context.rs
index 49c2426..8c55b4a 100644
--- a/cano/examples/metrics_tracing_context.rs
+++ b/cano/examples/metrics_tracing_context.rs
@@ -94,7 +94,7 @@ async fn main() {
     // Path 1: Cano's own `workflow_orchestrate` span carries `workflow_id`.
     workflow()
         .with_workflow_id("demo-run-1")
-        .orchestrate(Step::Fetch)
+        .orchestrate(Step::Fetch, CancellationToken::disabled())
         .await
         .expect("workflow run");
 
@@ -103,7 +103,7 @@ async fn main() {
         let span = info_span!("api_request", request_id = "abc");
         let _enter = span.enter();
         workflow()
-            .orchestrate(Step::Fetch)
+            .orchestrate(Step::Fetch, CancellationToken::disabled())
             .await
             .expect("workflow run");
     }
diff --git a/cano/examples/mixed_workflow.rs b/cano/examples/mixed_workflow.rs
index 276b664..77b66c5 100644
--- a/cano/examples/mixed_workflow.rs
+++ b/cano/examples/mixed_workflow.rs
@@ -240,7 +240,10 @@ async fn main() -> CanoResult<()> {
         .register(WorkflowState::GenerateReport, ReportTask)
         .add_exit_states(vec![WorkflowState::Complete]);
 
-    match workflow.orchestrate(WorkflowState::GenerateData).await {
+    match workflow
+        .orchestrate(WorkflowState::GenerateData, CancellationToken::disabled())
+        .await
+    {
         Ok(_final_state) => {
             println!("\nWorkflow completed successfully!");
 
diff --git a/cano/examples/observer_metrics.rs b/cano/examples/observer_metrics.rs
index 04ff78a..6e7d678 100644
--- a/cano/examples/observer_metrics.rs
+++ b/cano/examples/observer_metrics.rs
@@ -141,7 +141,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         .with_observer(metrics.clone());
 
     for run in 1..=2 {
-        match workflow.orchestrate(Step::Start).await {
+        match workflow
+            .orchestrate(Step::Start, CancellationToken::disabled())
+            .await
+        {
             Ok(state) => println!("run {run}: reached {state:?}"),
             Err(error) => println!("run {run}: stopped — {error}"),
         }
diff --git a/cano/examples/panic_safety.rs b/cano/examples/panic_safety.rs
index 9d26a18..6b3359f 100644
--- a/cano/examples/panic_safety.rs
+++ b/cano/examples/panic_safety.rs
@@ -127,7 +127,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
             .register(Step::PanicTask, Panicker)
             .add_exit_state(Step::Done);
 
-        match workflow.orchestrate(Step::PanicTask).await {
+        match workflow
+            .orchestrate(Step::PanicTask, CancellationToken::disabled())
+            .await
+        {
             Ok(s) => println!("  outcome: Ok({s:?})  (unexpected)"),
             Err(e) => {
                 println!("  outcome: Err(\"{e}\")");
@@ -157,7 +160,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
             .register(Step::PanicTask, PanicAfterReserve)
             .add_exit_state(Step::Done);
 
-        match workflow.orchestrate(Step::Reserve).await {
+        match workflow
+            .orchestrate(Step::Reserve, CancellationToken::disabled())
+            .await
+        {
             Ok(s) => println!("  outcome: Ok({s:?})  (unexpected)"),
             Err(e) => {
                 println!("  outcome: Err(\"{e}\")");
diff --git a/cano/examples/poll_retry_on_error.rs b/cano/examples/poll_retry_on_error.rs
index 7b652e2..1e4263c 100644
--- a/cano/examples/poll_retry_on_error.rs
+++ b/cano/examples/poll_retry_on_error.rs
@@ -115,7 +115,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
             .register(Step::Poll, poller)
             .add_exit_state(Step::Done);
 
-        match workflow.orchestrate(Step::Poll).await {
+        match workflow
+            .orchestrate(Step::Poll, CancellationToken::disabled())
+            .await
+        {
             Ok(state) => println!("  result: Ok({state:?})  -- loop tolerated the streak\n"),
             Err(e) => println!("  result: Err({e})  -- unexpected failure\n"),
         }
@@ -135,7 +138,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
             .register(Step::Poll, poller)
             .add_exit_state(Step::Done);
 
-        match workflow.orchestrate(Step::Poll).await {
+        match workflow
+            .orchestrate(Step::Poll, CancellationToken::disabled())
+            .await
+        {
             Ok(state) => println!("  result: Ok({state:?})  -- unexpected success\n"),
             Err(e) => println!("  result: Err(\"{e}\")  -- loop aborted after streak > cap\n"),
         }
@@ -195,7 +201,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
             )
             .add_exit_state(Step::Done);
 
-        match workflow.orchestrate(Step::Poll).await {
+        match workflow
+            .orchestrate(Step::Poll, CancellationToken::disabled())
+            .await
+        {
             Ok(state) => println!("  result: Ok({state:?})  -- Pending reset the counter\n"),
             Err(e) => println!("  result: Err({e})  -- unexpected failure\n"),
         }
diff --git a/cano/examples/poll_task.rs b/cano/examples/poll_task.rs
index 8026ccb..0ede9af 100644
--- a/cano/examples/poll_task.rs
+++ b/cano/examples/poll_task.rs
@@ -160,7 +160,9 @@ async fn main() -> CanoResult<()> {
         .register(Step::Process, Process)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::AwaitJob).await?;
+    let result = workflow
+        .orchestrate(Step::AwaitJob, CancellationToken::disabled())
+        .await?;
     assert_eq!(result, Step::Done);
     println!("\ncompleted at {result:?}");
 
diff --git a/cano/examples/processing_models_tour.rs b/cano/examples/processing_models_tour.rs
index 457dd21..dc5fb6d 100644
--- a/cano/examples/processing_models_tour.rs
+++ b/cano/examples/processing_models_tour.rs
@@ -242,7 +242,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         .with_checkpoint_store(checkpoint_store.clone())
         .with_workflow_id(run_id);
 
-    let result = workflow.orchestrate(Stage::Route).await?;
+    let result = workflow
+        .orchestrate(Stage::Route, CancellationToken::disabled())
+        .await?;
     assert_eq!(result, Stage::Done);
 
     println!("\ncompleted at {result:?}");
diff --git a/cano/examples/router_task.rs b/cano/examples/router_task.rs
index 1af745e..151999c 100644
--- a/cano/examples/router_task.rs
+++ b/cano/examples/router_task.rs
@@ -123,13 +123,17 @@ fn build_workflow(use_fast_path: bool) -> Workflow<Step> {
 async fn main() -> CanoResult<()> {
     println!("=== fast-path run ===");
     let workflow = build_workflow(true);
-    let result = workflow.orchestrate(Step::Classify).await?;
+    let result = workflow
+        .orchestrate(Step::Classify, CancellationToken::disabled())
+        .await?;
     assert_eq!(result, Step::Done);
     println!("completed at {result:?}\n");
 
     println!("=== slow-path run ===");
     let workflow = build_workflow(false);
-    let result = workflow.orchestrate(Step::Classify).await?;
+    let result = workflow
+        .orchestrate(Step::Classify, CancellationToken::disabled())
+        .await?;
     assert_eq!(result, Step::Done);
     println!("completed at {result:?}");
 
diff --git a/cano/examples/saga_payment.rs b/cano/examples/saga_payment.rs
index 445050b..b3bb217 100644
--- a/cano/examples/saga_payment.rs
+++ b/cano/examples/saga_payment.rs
@@ -118,7 +118,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         .register(Step::Ship, ShipOrder) // plain — and it fails
         .add_exit_state(Step::Done);
 
-    match workflow.orchestrate(Step::Reserve).await {
+    match workflow
+        .orchestrate(Step::Reserve, CancellationToken::disabled())
+        .await
+    {
         Ok(state) => println!("\ncompleted at {state:?}"),
         Err(error) => println!("\nfailed, rolled back: {error}"),
     }
diff --git a/cano/examples/saga_recovery.rs b/cano/examples/saga_recovery.rs
index 1d0d574..577f50a 100644
--- a/cano/examples/saga_recovery.rs
+++ b/cano/examples/saga_recovery.rs
@@ -161,7 +161,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     // --- (a) First run: Charge fails; compensations drain LIFO. ----------
     println!("--- run: Reserve → Authorize → Charge (fails) → compensate LIFO ---\n");
-    match workflow.orchestrate(Step::Reserve).await {
+    match workflow
+        .orchestrate(Step::Reserve, CancellationToken::disabled())
+        .await
+    {
         Ok(s) => println!("\ncompleted at {s:?} (unexpected)"),
         Err(e) => println!("\nfailed + rolled back: {e}"),
     }
diff --git a/cano/examples/scheduler_book_prepositions.rs b/cano/examples/scheduler_book_prepositions.rs
index 9ff257c..8d14917 100644
--- a/cano/examples/scheduler_book_prepositions.rs
+++ b/cano/examples/scheduler_book_prepositions.rs
@@ -273,7 +273,9 @@ async fn main() -> CanoResult<()> {
         )
         .add_exit_states(vec![WorkflowPhase::Analyze, WorkflowPhase::Complete]);
 
-    let _ = workflow1.orchestrate(WorkflowPhase::Download).await?;
+    let _ = workflow1
+        .orchestrate(WorkflowPhase::Download, CancellationToken::disabled())
+        .await?;
 
     // Book 2: Alice's Adventures in Wonderland
     let workflow2 = Workflow::new(Resources::new().insert("store", store.clone()))
@@ -283,7 +285,9 @@ async fn main() -> CanoResult<()> {
         )
         .add_exit_states(vec![WorkflowPhase::Analyze, WorkflowPhase::Complete]);
 
-    let _ = workflow2.orchestrate(WorkflowPhase::Download).await?;
+    let _ = workflow2
+        .orchestrate(WorkflowPhase::Download, CancellationToken::disabled())
+        .await?;
 
     // Book 3: A Christmas Carol
     let workflow3 = Workflow::new(Resources::new().insert("store", store.clone()))
@@ -293,7 +297,9 @@ async fn main() -> CanoResult<()> {
         )
         .add_exit_states(vec![WorkflowPhase::Analyze, WorkflowPhase::Complete]);
 
-    let _ = workflow3.orchestrate(WorkflowPhase::Download).await?;
+    let _ = workflow3
+        .orchestrate(WorkflowPhase::Download, CancellationToken::disabled())
+        .await?;
 
     // Analyze and rank the downloaded books
     println!("\nAnalyzing and ranking books...\n");
@@ -304,7 +310,7 @@ async fn main() -> CanoResult<()> {
         .add_exit_state(WorkflowPhase::Complete);
 
     analysis_workflow
-        .orchestrate(WorkflowPhase::Analyze)
+        .orchestrate(WorkflowPhase::Analyze, CancellationToken::disabled())
         .await?;
 
     println!("\nBook preposition analysis complete!");
diff --git a/cano/examples/scheduler_cancellation.rs b/cano/examples/scheduler_cancellation.rs
new file mode 100644
index 0000000..8a3f3ad
--- /dev/null
+++ b/cano/examples/scheduler_cancellation.rs
@@ -0,0 +1,119 @@
+#![cfg(feature = "scheduler")]
+//! # Scheduler cooperative cancellation
+//!
+//! Demonstrates [`RunningScheduler::cancel_flow`](cano::RunningScheduler::cancel_flow):
+//! a manually-triggered saga `Reserve → Charge → Ship → Done` whose `Ship` step
+//! runs long. `main` calls `cancel_flow` once `Ship` is in flight; the engine
+//! aborts `Ship` at its next await, the saga compensation stack drains in
+//! reverse (`Charge` then `Reserve`), and the flow returns to `Idle` — a
+//! deliberate cancel is **not** counted as a backoff failure. Graceful `stop()`
+//! cancels in-flight flows the same way.
+//!
+//! Run with:
+//! ```bash
+//! cargo run --example scheduler_cancellation --features scheduler
+//! ```
+
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::time::Duration;
+
+use cano::prelude::*;
+use cano::scheduler::Status;
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+enum Step { // workflow states of the order saga used by this example
+    Reserve, // initial state handed to `scheduler.manual` in `main`
+    Charge, // next state returned by `Reserve::run`
+    Ship, // next state returned by `Charge::run`; handled by the long-running `Ship` task
+    Done, // registered as the workflow's exit state in `main`
+}
+
+struct Reserve; // stateless task registered with compensation for `Step::Reserve`
+struct Charge; // stateless task registered with compensation for `Step::Charge`
+
+#[saga::task(state = Step)]
+impl Reserve { // saga step for `Step::Reserve`: a forward action plus its rollback
+    type Output = u32; // same type as the ticket `run` returns and `compensate` receives
+    async fn run(&self, _res: &Resources) -> Result<(TaskResult<Step>, u32), CanoError> {
+        println!("reserve  : holding inventory (ticket #42)"); // demo output only; nothing is actually reserved
+        Ok((TaskResult::Single(Step::Charge), 42)) // move on to `Charge`, handing back ticket 42
+    }
+    async fn compensate(&self, _res: &Resources, ticket: u32) -> Result<(), CanoError> {
+        println!("reserve  : releasing ticket #{ticket}  (rollback)"); // rollback is simulated by printing the ticket
+        Ok(()) // always reports a successful rollback
+    }
+}
+
+#[saga::task(state = Step)]
+impl Charge { // saga step for `Step::Charge`: a forward action plus its rollback
+    type Output = String; // same type as the auth id `run` returns and `compensate` receives
+    async fn run(&self, _res: &Resources) -> Result<(TaskResult<Step>, String), CanoError> {
+        println!("charge   : capturing $42.00 (auth auth-XYZ)"); // demo output only; no payment call is made
+        Ok((TaskResult::Single(Step::Ship), "auth-XYZ".to_string())) // move on to `Ship`, handing back the auth id
+    }
+    async fn compensate(&self, _res: &Resources, auth: String) -> Result<(), CanoError> {
+        println!("charge   : refunding auth {auth}  (rollback)"); // rollback is simulated by printing the auth id
+        Ok(()) // always reports a successful rollback
+    }
+}
+
+/// Long-running step registered without compensation. Sets `started` so `main`
+/// knows to call `cancel_flow` while this task is parked in its 10-second sleep.
+struct Ship {
+    started: Arc<AtomicBool>, // shared with `main`, which polls it before cancelling
+}
+#[task(state = Step)]
+impl Ship { // task body for `Step::Ship`; expected to be cancelled before it finishes
+    fn config(&self) -> TaskConfig {
+        TaskConfig::minimal() // NOTE(review): presumably the no-retry configuration — confirm against `TaskConfig` docs
+    }
+    async fn run_bare(&self) -> Result<TaskResult<Step>, CanoError> {
+        println!("ship     : dispatching shipment…  (cancel_flow will stop this)");
+        self.started.store(true, Ordering::SeqCst); // tell `main` that Ship is now in flight
+        tokio::time::sleep(Duration::from_secs(10)).await; // the await at which the cancel is expected to land
+        println!("ship     : this line should never print"); // only reached if the flow was not cancelled in time
+        Ok(TaskResult::Single(Step::Done)) // normal completion path: advance to the exit state
+    }
+}
+
+#[tokio::main]
+async fn main() -> CanoResult<()> { // builds the saga, triggers it, cancels it mid-`Ship`, then stops the scheduler
+    let ship_started = Arc::new(AtomicBool::new(false)); // set to true by `Ship::run_bare`
+
+    let workflow = Workflow::bare()
+        .register_with_compensation(Step::Reserve, Reserve)
+        .register_with_compensation(Step::Charge, Charge)
+        .register( // `Ship` is registered without a compensation
+            Step::Ship,
+            Ship {
+                started: ship_started.clone(), // the task gets its own handle to the shared flag
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler.manual("order", workflow, Step::Reserve)?; // manually-triggered flow "order", entering at `Reserve`
+    let running = scheduler.start().await?;
+
+    // Kick off the saga, then poll every 5 ms until Ship has set `ship_started`.
+    running.trigger("order").await?;
+    while !ship_started.load(Ordering::SeqCst) { // no timeout: spins until `Ship` starts
+        tokio::time::sleep(Duration::from_millis(5)).await;
+    }
+    println!("\n>>> cancelling the in-flight flow…\n");
+    running.cancel_flow("order").await?; // request cancellation of the "order" flow
+
+    // Poll until the flow is no longer `Running` (expected: saga drained, status Idle).
+    loop {
+        let status = running.status("order").await.map(|i| i.status); // a `None` here also ends the loop below
+        if status != Some(Status::Running) {
+            println!("\norder flow status after cancel: {status:?}  (Idle — not a failure)"); // the "(Idle …)" label is printed whatever the status is
+            break;
+        }
+        tokio::time::sleep(Duration::from_millis(5)).await; // still running: check again in 5 ms
+    }
+
+    running.stop().await?; // graceful shutdown of the scheduler
+    Ok(())
+}
diff --git a/cano/examples/split_bulkhead.rs b/cano/examples/split_bulkhead.rs
index 8780062..522b823 100644
--- a/cano/examples/split_bulkhead.rs
+++ b/cano/examples/split_bulkhead.rs
@@ -167,7 +167,9 @@ async fn main() -> Result<(), CanoError> {
         .register(Step::Summarize, Summarize)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::ParallelWork).await?;
+    let result = workflow
+        .orchestrate(Step::ParallelWork, CancellationToken::disabled())
+        .await?;
     println!("\ncompleted at {result:?}");
 
     println!("\n=== Done ===");
diff --git a/cano/examples/stepped_task.rs b/cano/examples/stepped_task.rs
index d084406..71588cf 100644
--- a/cano/examples/stepped_task.rs
+++ b/cano/examples/stepped_task.rs
@@ -140,7 +140,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         .with_checkpoint_store(checkpoint_store.clone())
         .with_workflow_id(run_id);
 
-    let result = workflow.orchestrate(Stage::Crunch).await?;
+    let result = workflow
+        .orchestrate(Stage::Crunch, CancellationToken::disabled())
+        .await?;
     assert_eq!(result, Stage::Done);
 
     println!("\ncompleted at {result:?}");
diff --git a/cano/examples/store_custom_backend.rs b/cano/examples/store_custom_backend.rs
index 379987d..dae181e 100644
--- a/cano/examples/store_custom_backend.rs
+++ b/cano/examples/store_custom_backend.rs
@@ -221,7 +221,9 @@ async fn main() -> Result<(), CanoError> {
         .add_exit_state(Step::Done);
 
     println!("\n-- Part 2: MemoryStore::get_shared (Arc zero-copy sharing) --");
-    let result = workflow.orchestrate(Step::WriteA).await?;
+    let result = workflow
+        .orchestrate(Step::WriteA, CancellationToken::disabled())
+        .await?;
     println!("\ncompleted at {result:?}");
 
     println!("\n=== Done ===");
diff --git a/cano/examples/task_interface_demo.rs b/cano/examples/task_interface_demo.rs
index 8ac5085..622e676 100644
--- a/cano/examples/task_interface_demo.rs
+++ b/cano/examples/task_interface_demo.rs
@@ -141,7 +141,10 @@ async fn main() -> Result<(), CanoError> {
     println!("Executing workflow...");
     println!();
 
-    match workflow.orchestrate(TaskState::Start).await {
+    match workflow
+        .orchestrate(TaskState::Start, CancellationToken::disabled())
+        .await
+    {
         Ok(final_state) => {
             println!();
             println!("Workflow completed successfully!");
diff --git a/cano/examples/task_simple.rs b/cano/examples/task_simple.rs
index cfc10a1..273a768 100644
--- a/cano/examples/task_simple.rs
+++ b/cano/examples/task_simple.rs
@@ -82,7 +82,10 @@ async fn main() -> CanoResult<()> {
         .register(Action::Count, CounterTask)
         .add_exit_states(vec![Action::Complete]);
 
-    match workflow.orchestrate(Action::Generate).await {
+    match workflow
+        .orchestrate(Action::Generate, CancellationToken::disabled())
+        .await
+    {
         Ok(_final_state) => {
             println!("Workflow completed!");
             println!("Final Results:");
diff --git a/cano/examples/testing_helpers.rs b/cano/examples/testing_helpers.rs
index 1f4333e..cdd08c3 100644
--- a/cano/examples/testing_helpers.rs
+++ b/cano/examples/testing_helpers.rs
@@ -117,7 +117,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         .with_checkpoint_store(checkpoints.clone())
         .with_workflow_id("demo-run");
 
-    let final_state = workflow.orchestrate(Step::Start).await?;
+    let final_state = workflow
+        .orchestrate(Step::Start, CancellationToken::disabled())
+        .await?;
     assert_eq!(final_state, Step::Done);
 
     // The observer captured the whole path and the checkpoint appends along the way.
@@ -135,7 +137,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     let panicky = Workflow::bare()
         .register(Step::Start, panic_on_attempt(1, Step::Done))
         .add_exit_state(Step::Done);
-    match panicky.orchestrate(Step::Start).await {
+    match panicky
+        .orchestrate(Step::Start, CancellationToken::disabled())
+        .await
+    {
         Ok(_) => unreachable!("the task panics on its first attempt"),
         Err(e) => println!("panic_on_attempt surfaced as error: {e}"),
     }
@@ -149,7 +154,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         .register_with_compensation(Step::Work, Charge)
         .register(Step::Finish, Boom) // fails → drains the compensation stack in reverse
         .add_exit_state(Step::Done);
-    let _ = saga.orchestrate(Step::Start).await; // expected to fail and roll back
+    let _ = saga
+        .orchestrate(Step::Start, CancellationToken::disabled())
+        .await; // expected to fail and roll back
 
     let ran = handle.0.lock().unwrap().clone();
     // Charge ran last, so it compensates first; then Reserve.
diff --git a/cano/examples/timer_task.rs b/cano/examples/timer_task.rs
index b28e67b..7956ec7 100644
--- a/cano/examples/timer_task.rs
+++ b/cano/examples/timer_task.rs
@@ -100,7 +100,9 @@ async fn main() -> CanoResult<()> {
         .register(Step::Process, Process)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::CoolDown).await?;
+    let result = workflow
+        .orchestrate(Step::CoolDown, CancellationToken::disabled())
+        .await?;
     assert_eq!(result, Step::Done);
     println!("\ncompleted at {result:?}");
 
diff --git a/cano/examples/tracing_demo.rs b/cano/examples/tracing_demo.rs
index 533042c..49af079 100644
--- a/cano/examples/tracing_demo.rs
+++ b/cano/examples/tracing_demo.rs
@@ -213,7 +213,9 @@ async fn main() -> CanoResult<()> {
             .add_exit_states(vec![WorkflowState::Complete, WorkflowState::Error]);
 
         info!("Starting workflow execution...");
-        let result = workflow.orchestrate(WorkflowState::Start).await?;
+        let result = workflow
+            .orchestrate(WorkflowState::Start, CancellationToken::disabled())
+            .await?;
         info!(final_state = ?result, "Workflow completed");
 
         println!("Basic workflow completed with state: {result:?}\n");
@@ -248,7 +250,9 @@ async fn main() -> CanoResult<()> {
             .with_tracing_span(workflow_span);
 
         info!("Starting task-based workflow execution (with custom span)...");
-        let result = task_workflow.orchestrate(WorkflowState::Start).await?;
+        let result = task_workflow
+            .orchestrate(WorkflowState::Start, CancellationToken::disabled())
+            .await?;
 
         let math_result: i32 = store.get("math_result").unwrap_or(0);
         let completed_by: String = store.get("task_completed_by").unwrap_or_default();
@@ -329,7 +333,9 @@ async fn main() -> CanoResult<()> {
             .add_exit_states(vec![WorkflowState::Complete, WorkflowState::Error]);
 
         info!("Starting workflow that will encounter validation failure...");
-        let result = error_workflow.orchestrate(WorkflowState::Start).await?;
+        let result = error_workflow
+            .orchestrate(WorkflowState::Start, CancellationToken::disabled())
+            .await?;
 
         println!("Error workflow completed with state: {result:?}");
 
@@ -363,7 +369,9 @@ async fn main() -> CanoResult<()> {
             // One line: re-emit lifecycle/failure events as `tracing` events.
             .with_observer(Arc::new(TracingObserver::new()));
 
-        let result = observed_workflow.orchestrate(WorkflowState::Start).await?;
+        let result = observed_workflow
+            .orchestrate(WorkflowState::Start, CancellationToken::disabled())
+            .await?;
         println!("Observed workflow completed with state: {result:?}");
         println!(
             "   (look for `task started` / `task succeeded` events; filter with RUST_LOG=cano::observer=debug)\n"
diff --git a/cano/examples/workflow_ad_exchange.rs b/cano/examples/workflow_ad_exchange.rs
index 20a4fd5..d55ab8d 100644
--- a/cano/examples/workflow_ad_exchange.rs
+++ b/cano/examples/workflow_ad_exchange.rs
@@ -596,7 +596,10 @@ async fn main() -> Result<(), CanoError> {
     let start = tokio::time::Instant::now();
 
     // Execute workflow - if splits timeout or fail, transition to NoFill
-    let result = match workflow.orchestrate(AdExchangeState::Start).await {
+    let result = match workflow
+        .orchestrate(AdExchangeState::Start, CancellationToken::disabled())
+        .await
+    {
         Ok(state) => state,
         Err(e) => {
             // If workflow fails due to split timeout/error, handle as NoFill.
@@ -606,7 +609,12 @@ async fn main() -> Result<(), CanoError> {
             store.put("error_reason", e.to_string())?;
             println!("\n⚠️  Handling as No Fill due to error\n");
 
-            workflow.orchestrate(AdExchangeState::ErrorTracking).await?
+            workflow
+                .orchestrate(
+                    AdExchangeState::ErrorTracking,
+                    CancellationToken::disabled(),
+                )
+                .await?
         }
     };
 
diff --git a/cano/examples/workflow_bare.rs b/cano/examples/workflow_bare.rs
index b416b81..3f19384 100644
--- a/cano/examples/workflow_bare.rs
+++ b/cano/examples/workflow_bare.rs
@@ -87,7 +87,10 @@ async fn main() -> CanoResult<()> {
         .register(Stage::Sanitize, SanitizeTask)
         .add_exit_states(vec![Stage::Persist, Stage::Done]);
 
-    match workflow.orchestrate(Stage::Validate).await {
+    match workflow
+        .orchestrate(Stage::Validate, CancellationToken::disabled())
+        .await
+    {
         Ok(final_state) => println!("\nBare workflow reached: {final_state:?}\n"),
         Err(e) => {
             eprintln!("Workflow failed: {e}");
@@ -105,7 +108,10 @@ async fn main() -> CanoResult<()> {
         .register(Stage::Persist, PersistTask) // resource task
         .add_exit_states(vec![Stage::Done]);
 
-    match workflow.orchestrate(Stage::Validate).await {
+    match workflow
+        .orchestrate(Stage::Validate, CancellationToken::disabled())
+        .await
+    {
         Ok(final_state) => {
             println!("\nMixed workflow reached: {final_state:?}");
             if let Ok(v) = store.get::<i32>("sanitized_value") {
diff --git a/cano/examples/workflow_book_prepositions.rs b/cano/examples/workflow_book_prepositions.rs
index 753814a..a79024e 100644
--- a/cano/examples/workflow_book_prepositions.rs
+++ b/cano/examples/workflow_book_prepositions.rs
@@ -507,7 +507,13 @@ async fn run_workflow() -> Result<(), CanoError> {
     println!("  BookRankingByPrepositionTask (Ranking phase)");
 
     // Execute the entire workflow using Workflow orchestration
-    match workflow.orchestrate(BookPrepositionAction::Download).await {
+    match workflow
+        .orchestrate(
+            BookPrepositionAction::Download,
+            CancellationToken::disabled(),
+        )
+        .await
+    {
         Ok(final_state) => {
             match final_state {
                 BookPrepositionAction::Complete => {
diff --git a/cano/examples/workflow_cancellation.rs b/cano/examples/workflow_cancellation.rs
new file mode 100644
index 0000000..2a74277
--- /dev/null
+++ b/cano/examples/workflow_cancellation.rs
@@ -0,0 +1,120 @@
+//! # Cooperative cancellation — saga rollback on cancel
+//!
+//! Demonstrates [`Workflow::orchestrate`](cano::Workflow::orchestrate) with a live token:
+//! a 3-step saga `Reserve → Charge → Ship → Done` where a sibling task fires a
+//! [`CancellationHandle`](cano::CancellationHandle) once `Ship` is in flight. The in-flight
+//! task is aborted at its next await point, the saga compensation stack drains in reverse
+//! (`Charge` then `Reserve`), and the call returns
+//! [`CanoError::Cancelled`](cano::CanoError::Cancelled).
+//!
+//! Run with:
+//! ```bash
+//! cargo run --example workflow_cancellation
+//! ```
+//!
+//! Expected output (timings will vary):
+//! ```text
+//! reserve  : holding inventory (ticket #42)
+//! charge   : capturing $42.00 (auth auth-XYZ)
+//! ship     : dispatching shipment…  (a sibling task will cancel this)
+//! charge   : refunding auth auth-XYZ  (rollback)
+//! reserve  : releasing ticket #42  (rollback)
+//! workflow cancelled, rolled back: state=Ship attempt=0 path=[Reserve, Charge, Ship] caused by: Workflow cancelled
+//! ```
+
+use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, Ordering};
+use std::time::Duration;
+
+use cano::CancellationToken;
+use cano::prelude::*;
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+enum Step {
+    Reserve, // initial state; handled by the compensatable `Reserve` task
+    Charge, // handled by the compensatable `Charge` task
+    Ship, // handled by the plain (non-compensatable) `Ship` task
+    Done, // exit state
+}
+
+struct Reserve; // saga step: holds inventory; its rollback releases the ticket
+struct Charge; // saga step: captures payment; its rollback refunds the auth
+
+#[saga::task(state = Step)]
+impl Reserve {
+    type Output = u32; // ticket number; same type `compensate` receives as `ticket`
+    async fn run(&self, _res: &Resources) -> Result<(TaskResult<Step>, u32), CanoError> {
+        let ticket = 42; // hard-coded demo value
+        println!("reserve  : holding inventory (ticket #{ticket})");
+        Ok((TaskResult::Single(Step::Charge), ticket)) // next state is Charge; the ticket is the step's output
+    }
+    async fn compensate(&self, _res: &Resources, ticket: u32) -> Result<(), CanoError> {
+        println!("reserve  : releasing ticket #{ticket}  (rollback)");
+        Ok(()) // the demo rollback only prints, so it always succeeds
+    }
+}
+
+#[saga::task(state = Step)]
+impl Charge {
+    type Output = String; // authorization id; same type `compensate` receives as `auth`
+    async fn run(&self, _res: &Resources) -> Result<(TaskResult<Step>, String), CanoError> {
+        let auth = "auth-XYZ".to_string(); // hard-coded demo value
+        println!("charge   : capturing $42.00 (auth {auth})");
+        Ok((TaskResult::Single(Step::Ship), auth)) // next state is Ship; the auth id is the step's output
+    }
+    async fn compensate(&self, _res: &Resources, auth: String) -> Result<(), CanoError> {
+        println!("charge   : refunding auth {auth}  (rollback)");
+        Ok(()) // the demo rollback only prints, so it always succeeds
+    }
+}
+
+// Plain (non-compensatable) long-running task. It flips `started` so the sibling
+// canceller fires deterministically while this task is parked in its sleep.
+struct Ship {
+    started: Arc<AtomicBool>, // shared with `main`; set to true at the start of `run_bare`
+}
+#[task(state = Step)]
+impl Ship {
+    fn config(&self) -> TaskConfig {
+        TaskConfig::minimal() // NOTE(review): presumably a no-retry config — confirm against `TaskConfig` docs
+    }
+    async fn run_bare(&self) -> Result<TaskResult<Step>, CanoError> {
+        println!("ship     : dispatching shipment…  (a sibling task will cancel this)");
+        self.started.store(true, Ordering::SeqCst); // tell the canceller in `main` that Ship is in flight
+        tokio::time::sleep(Duration::from_secs(2)).await; // the await point where the cancel is expected to land
+        println!("ship     : this line should never print");
+        Ok(TaskResult::Single(Step::Done)) // only reached if the run was not cancelled during the sleep
+    }
+}
+
+#[tokio::main]
+async fn main() {
+    let ship_started = Arc::new(AtomicBool::new(false)); // set by `Ship::run_bare`, polled by the canceller
+    let workflow = Workflow::bare()
+        .register_with_compensation(Step::Reserve, Reserve)
+        .register_with_compensation(Step::Charge, Charge)
+        .register(
+            Step::Ship,
+            Ship {
+                started: ship_started.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new(); // `handle` fires the cancel; `token` is passed to the run
+
+    // Sibling task: cancel as soon as `Ship` is in flight.
+    let canceller = tokio::spawn(async move {
+        while !ship_started.load(Ordering::SeqCst) { // NOTE(review): exits only once `Ship` has started
+            tokio::time::sleep(Duration::from_millis(5)).await;
+        }
+        handle.cancel();
+    });
+
+    match workflow.orchestrate(Step::Reserve, token).await {
+        Ok(state) => println!("\nworkflow completed at {state:?}"),
+        Err(error) => println!("\nworkflow cancelled, rolled back: {error}"), // NOTE(review): every error lands here, not only cancellation
+    }
+
+    canceller.await.expect("canceller task panicked"); // join the spawned task; waits until its loop has exited
+}
diff --git a/cano/examples/workflow_negotiation.rs b/cano/examples/workflow_negotiation.rs
index e0c2c14..cd19cf4 100644
--- a/cano/examples/workflow_negotiation.rs
+++ b/cano/examples/workflow_negotiation.rs
@@ -250,7 +250,13 @@ async fn run_negotiation_workflow() -> Result<(), CanoError> {
         ]);
 
     // Execute the negotiation workflow
-    match workflow.orchestrate(NegotiationAction::StartSelling).await {
+    match workflow
+        .orchestrate(
+            NegotiationAction::StartSelling,
+            CancellationToken::disabled(),
+        )
+        .await
+    {
         Ok(final_state) => {
             println!("{}", "=".repeat(50));
 
diff --git a/cano/examples/workflow_observer.rs b/cano/examples/workflow_observer.rs
index 42d224e..d8b076e 100644
--- a/cano/examples/workflow_observer.rs
+++ b/cano/examples/workflow_observer.rs
@@ -158,7 +158,9 @@ async fn main() -> Result<(), CanoError> {
         )
         .add_exit_state(Step::Done)
         .with_observer(observer.clone());
-    let final_state = workflow.orchestrate(Step::Load).await?;
+    let final_state = workflow
+        .orchestrate(Step::Load, CancellationToken::disabled())
+        .await?;
     println!("  → workflow finished in state {final_state:?}\n");
 
     // -- Scenario B -------------------------------------------------------
@@ -181,7 +183,10 @@ async fn main() -> Result<(), CanoError> {
         )
         .add_exit_state(Step::Done)
         .with_observer(observer.clone());
-    match guarded.orchestrate(Step::Probe).await {
+    match guarded
+        .orchestrate(Step::Probe, CancellationToken::disabled())
+        .await
+    {
         Ok(s) => println!("  → unexpectedly finished in {s:?}\n"),
         Err(e) => println!("  → workflow errored as expected: {e}\n"),
     }
diff --git a/cano/examples/workflow_on_request.rs b/cano/examples/workflow_on_request.rs
index 3445196..5c25bfd 100644
--- a/cano/examples/workflow_on_request.rs
+++ b/cano/examples/workflow_on_request.rs
@@ -142,7 +142,7 @@ fn build_workflow(resources: Resources) -> Workflow<TextPipelineState> {
         .register(TextPipelineState::Parse, ParseTask)
         .register(TextPipelineState::Transform, TransformTask)
         .add_exit_state(TextPipelineState::Done)
-        .with_timeout(Duration::from_secs(5))
+        .with_total_timeout(Duration::from_secs(5))
 }
 
 // ============================================================================
@@ -165,7 +165,7 @@ async fn process_handler(
 
     // Run the FSM to completion.
     workflow
-        .orchestrate(TextPipelineState::Parse)
+        .orchestrate(TextPipelineState::Parse, CancellationToken::disabled())
         .await
         .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
 
diff --git a/cano/examples/workflow_partial_results.rs b/cano/examples/workflow_partial_results.rs
index 411ee94..01038a6 100644
--- a/cano/examples/workflow_partial_results.rs
+++ b/cano/examples/workflow_partial_results.rs
@@ -103,7 +103,9 @@ async fn main() -> Result<(), CanoError> {
     println!("Starting workflow...");
     let start = std::time::Instant::now();
 
-    let result = workflow.orchestrate(ApiState::Start).await?;
+    let result = workflow
+        .orchestrate(ApiState::Start, CancellationToken::disabled())
+        .await?;
 
     let duration = start.elapsed();
     println!(
diff --git a/cano/examples/workflow_recovery.rs b/cano/examples/workflow_recovery.rs
index 0201d16..da2bd5c 100644
--- a/cano/examples/workflow_recovery.rs
+++ b/cano/examples/workflow_recovery.rs
@@ -109,7 +109,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         .with_observer(Arc::new(Watcher));
 
     println!("── run 1: orchestrate (Process will crash) ──");
-    if let Err(e) = workflow.orchestrate(Step::Start).await {
+    if let Err(e) = workflow
+        .orchestrate(Step::Start, CancellationToken::disabled())
+        .await
+    {
         println!("  stopped: {e}");
     }
     println!("checkpoint log after run 1 (the crash left it intact):");
@@ -118,7 +121,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     }
 
     println!("\n── run 2: resume_from ──");
-    let final_state = workflow.resume_from(run_id).await?;
+    let final_state = workflow
+        .resume_from(run_id, CancellationToken::disabled())
+        .await?;
     println!("  reached {final_state:?} — checkpoint log cleared on success");
     assert_eq!(final_state, Step::Done);
     assert!(checkpoint_store.load_run(run_id).await?.is_empty());
diff --git a/cano/examples/workflow_resources.rs b/cano/examples/workflow_resources.rs
index c30de2e..a822ae6 100644
--- a/cano/examples/workflow_resources.rs
+++ b/cano/examples/workflow_resources.rs
@@ -320,7 +320,9 @@ async fn main() -> Result<(), CanoError> {
         .add_exit_state(Step::Done);
 
     println!("Running workflow...");
-    let final_state = workflow.orchestrate(Step::Init).await?;
+    let final_state = workflow
+        .orchestrate(Step::Init, CancellationToken::disabled())
+        .await?;
     assert_eq!(final_state, Step::Done);
 
     let result: u32 = store.get("result")?;
diff --git a/cano/examples/workflow_simd_matrix_pipeline.rs b/cano/examples/workflow_simd_matrix_pipeline.rs
index 6077d90..8035db7 100644
--- a/cano/examples/workflow_simd_matrix_pipeline.rs
+++ b/cano/examples/workflow_simd_matrix_pipeline.rs
@@ -470,7 +470,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
     println!("Pipeline: Generate -> Multiply -> Transform -> Statistics -> Complete\n");
 
     // Execute the workflow
-    let _final_state = workflow.orchestrate(PipelineState::Generate).await?;
+    let _final_state = workflow
+        .orchestrate(PipelineState::Generate, CancellationToken::disabled())
+        .await?;
 
     let total_duration = start_time.elapsed();
     println!("\nSIMD Matrix Processing Pipeline completed!");
diff --git a/cano/examples/workflow_simple.rs b/cano/examples/workflow_simple.rs
index 1c24e80..ca8348b 100644
--- a/cano/examples/workflow_simple.rs
+++ b/cano/examples/workflow_simple.rs
@@ -152,7 +152,10 @@ async fn main() -> Result<(), CanoError> {
         .register(WorkflowAction::Count, CounterTask)
         .add_exit_states(vec![WorkflowAction::Complete, WorkflowAction::Error]);
 
-    match workflow.orchestrate(WorkflowAction::Generate).await {
+    match workflow
+        .orchestrate(WorkflowAction::Generate, CancellationToken::disabled())
+        .await
+    {
         Ok(WorkflowAction::Complete) => {
             println!("\nWorkflow completed successfully!");
             match store.get::<usize>("number_count") {
diff --git a/cano/examples/workflow_split_join.rs b/cano/examples/workflow_split_join.rs
index d4c14e7..ad070a6 100644
--- a/cano/examples/workflow_split_join.rs
+++ b/cano/examples/workflow_split_join.rs
@@ -142,7 +142,9 @@ async fn main() -> Result<(), CanoError> {
             .register(DataProcessingState::Aggregate, Aggregator)
             .add_exit_state(DataProcessingState::Complete);
 
-        let result = workflow.orchestrate(DataProcessingState::Start).await?;
+        let result = workflow
+            .orchestrate(DataProcessingState::Start, CancellationToken::disabled())
+            .await?;
 
         let final_result: i32 = store.get("final_result")?;
         println!("Final result: {}", final_result);
@@ -172,7 +174,9 @@ async fn main() -> Result<(), CanoError> {
             .register(DataProcessingState::Aggregate, Aggregator)
             .add_exit_state(DataProcessingState::Complete);
 
-        let result = workflow.orchestrate(DataProcessingState::Start).await?;
+        let result = workflow
+            .orchestrate(DataProcessingState::Start, CancellationToken::disabled())
+            .await?;
 
         let processor_count: usize = store.get("processor_count")?;
         println!(
@@ -205,7 +209,9 @@ async fn main() -> Result<(), CanoError> {
             .register(DataProcessingState::Aggregate, Aggregator)
             .add_exit_state(DataProcessingState::Complete);
 
-        let result = workflow.orchestrate(DataProcessingState::Start).await?;
+        let result = workflow
+            .orchestrate(DataProcessingState::Start, CancellationToken::disabled())
+            .await?;
 
         println!("Workflow completed with Any strategy: {:?}\n", result);
     }
@@ -236,7 +242,9 @@ async fn main() -> Result<(), CanoError> {
             .register(DataProcessingState::Aggregate, Aggregator)
             .add_exit_state(DataProcessingState::Complete);
 
-        let result = workflow.orchestrate(DataProcessingState::Start).await?;
+        let result = workflow
+            .orchestrate(DataProcessingState::Start, CancellationToken::disabled())
+            .await?;
 
         let processor_count: usize = store.get("processor_count")?;
         println!("Processors completed: {} (66% threshold)", processor_count);
@@ -267,7 +275,10 @@ async fn main() -> Result<(), CanoError> {
             .register(DataProcessingState::Aggregate, Aggregator)
             .add_exit_state(DataProcessingState::Complete);
 
-        match workflow.orchestrate(DataProcessingState::Start).await {
+        match workflow
+            .orchestrate(DataProcessingState::Start, CancellationToken::disabled())
+            .await
+        {
             Ok(result) => println!("Workflow completed: {:?}", result),
             Err(e) => println!("Workflow failed (expected timeout): {}", e),
         }
diff --git a/cano/examples/workflow_stack_store.rs b/cano/examples/workflow_stack_store.rs
index 00393d1..ca053ac 100644
--- a/cano/examples/workflow_stack_store.rs
+++ b/cano/examples/workflow_stack_store.rs
@@ -159,7 +159,9 @@ async fn main() -> CanoResult<()> {
 
         // Execute workflow
         println!("\nStarting workflow...\n");
-        let final_state = workflow.orchestrate(RequestState::Start).await?;
+        let final_state = workflow
+            .orchestrate(RequestState::Start, CancellationToken::disabled())
+            .await?;
 
         // Display results
         println!("\nFinal Results:");
@@ -198,7 +200,9 @@ async fn main() -> CanoResult<()> {
 
         // Execute workflow
         println!("\nStarting workflow...\n");
-        let final_state = workflow.orchestrate(RequestState::Start).await?;
+        let final_state = workflow
+            .orchestrate(RequestState::Start, CancellationToken::disabled())
+            .await?;
 
         // Display results
         println!("\nFinal Results:");
@@ -237,7 +241,9 @@ async fn main() -> CanoResult<()> {
 
         // Execute workflow
         println!("\nStarting workflow...\n");
-        let final_state = workflow.orchestrate(RequestState::Start).await?;
+        let final_state = workflow
+            .orchestrate(RequestState::Start, CancellationToken::disabled())
+            .await?;
 
         // Display results
         println!("\nFinal Results:");
diff --git a/cano/examples/workflow_total_timeout.rs b/cano/examples/workflow_total_timeout.rs
index 94f8e31..4dcf2a7 100644
--- a/cano/examples/workflow_total_timeout.rs
+++ b/cano/examples/workflow_total_timeout.rs
@@ -91,7 +91,10 @@ async fn main() {
         .register(Step::Ship, Ship)
         .add_exit_state(Step::Done);
 
-    match workflow.orchestrate(Step::Reserve).await {
+    match workflow
+        .orchestrate(Step::Reserve, CancellationToken::disabled())
+        .await
+    {
         Ok(state) => println!("\nworkflow completed at {state:?}"),
         Err(error) => println!("\nworkflow failed, rolled back: {error}"),
     }
diff --git a/cano/examples/workflow_validation.rs b/cano/examples/workflow_validation.rs
index 2409b2e..384ae68 100644
--- a/cano/examples/workflow_validation.rs
+++ b/cano/examples/workflow_validation.rs
@@ -102,7 +102,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
             Err(e) => println!("  validate_initial_state(Prepare) -> Err: {e}"),
         }
 
-        let result = workflow.orchestrate(Step::Prepare).await?;
+        let result = workflow
+            .orchestrate(Step::Prepare, CancellationToken::disabled())
+            .await?;
         println!("  orchestrate -> {result:?}\n");
     }
 
diff --git a/cano/src/bin/recovery_resume.rs b/cano/src/bin/recovery_resume.rs
index 6173382..cc1a2a1 100644
--- a/cano/src/bin/recovery_resume.rs
+++ b/cano/src/bin/recovery_resume.rs
@@ -125,8 +125,16 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
         }));
 
     let final_state = match mode {
-        "resume" => workflow.resume_from(workflow_id).await?,
-        _ => workflow.orchestrate(Step::Start).await?,
+        "resume" => {
+            workflow
+                .resume_from(workflow_id, CancellationToken::disabled())
+                .await?
+        }
+        _ => {
+            workflow
+                .orchestrate(Step::Start, CancellationToken::disabled())
+                .await?
+        }
     };
     println!("DONE {final_state:?}");
     let _ = std::io::stdout().flush();
diff --git a/cano/src/bin/stepped_resume.rs b/cano/src/bin/stepped_resume.rs
index f410286..c0e5309 100644
--- a/cano/src/bin/stepped_resume.rs
+++ b/cano/src/bin/stepped_resume.rs
@@ -107,13 +107,17 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
 
     let final_state = match mode {
         "resume" => {
-            let result = workflow.resume_from(WORKFLOW_ID).await?;
+            let result = workflow
+                .resume_from(WORKFLOW_ID, CancellationToken::disabled())
+                .await?;
             println!("RESUME COMPLETE final={result:?}");
             let _ = std::io::stdout().flush();
             result
         }
         _ => {
-            let result = workflow.orchestrate(State::Crunch).await?;
+            let result = workflow
+                .orchestrate(State::Crunch, CancellationToken::disabled())
+                .await?;
             println!("RUN COMPLETE final={result:?}");
             let _ = std::io::stdout().flush();
             result
diff --git a/cano/src/cancel.rs b/cano/src/cancel.rs
new file mode 100644
index 0000000..ae6e860
--- /dev/null
+++ b/cano/src/cancel.rs
@@ -0,0 +1,264 @@
+//! Cooperative cancellation for workflow runs.
+//!
+//! [`CancellationToken`] / [`CancellationHandle`] form a clonable signal pair built on
+//! [`tokio::sync::watch`] — no extra dependency. Hand a token to
+//! [`Workflow::orchestrate`](crate::workflow::Workflow::orchestrate)
+//! (or [`resume_from`](crate::workflow::Workflow::resume_from)) and keep
+//! the handle; calling [`CancellationHandle::cancel`] aborts the in-flight cancellable task at its
+//! next await point, drains the saga compensation stack, and surfaces
+//! [`CanoError::Cancelled`](crate::error::CanoError::Cancelled). To opt a run out of cancellation
+//! entirely, pass [`CancellationToken::disabled`].
+//!
+//! Cancellation is **cooperative**: the engine drops the running task future at its next `.await`,
+//! so a task doing uninterrupted synchronous/CPU work is not interrupted until it next yields. A
+//! [`CompensatableTask`](crate::saga::CompensatableTask) is deliberately *never* interrupted
+//! mid-run (that would orphan a committed side effect with no entry to roll back) — it runs to
+//! completion and the cancel is honoured at the next state boundary. The compensation drain itself
+//! is uncancellable.
+//!
+//! ```
+//! use cano::prelude::*;
+//! use cano::CancellationToken;
+//!
+//! #[derive(Clone, Debug, PartialEq, Eq, Hash)]
+//! enum Step { Start, Done }
+//!
+//! struct Noop;
+//! #[task]
+//! impl Task<Step> for Noop {
+//!     async fn run_bare(&self) -> Result<TaskResult<Step>, CanoError> {
+//!         Ok(TaskResult::Single(Step::Done))
+//!     }
+//! }
+//!
+//! # #[tokio::main]
+//! # async fn main() {
+//! let (handle, token) = CancellationToken::new();
+//! let workflow = Workflow::bare()
+//!     .register(Step::Start, Noop)
+//!     .add_exit_state(Step::Done);
+//!
+//! // Cancel from anywhere (another task, a signal handler, …):
+//! handle.cancel();
+//!
+//! let result = workflow.orchestrate(Step::Start, token).await;
+//! assert!(matches!(result, Err(e) if e.category() == "cancelled"));
+//! # }
+//! ```
+
+/// The observing half of a cancellation signal. Clonable and cheap to pass into a workflow.
+///
+/// A token built via [`CancellationToken::new`] observes its paired [`CancellationHandle`]; a
+/// [`disabled`](CancellationToken::disabled) token never fires and adds no overhead, so passing
+/// one to a run opts it out of cancellation entirely.
+#[derive(Clone, Debug)]
+pub struct CancellationToken {
+    rx: Option<tokio::sync::watch::Receiver<bool>>,
+}
+
+/// The controlling half of a cancellation signal. Call [`cancel`](Self::cancel) to fire it.
+///
+/// Clonable, so several owners can trigger cancellation; [`cancel`](Self::cancel) is idempotent.
+#[derive(Clone, Debug)]
+pub struct CancellationHandle {
+    tx: tokio::sync::watch::Sender<bool>,
+}
+
+impl CancellationToken {
+    /// Create a fresh handle/token pair. The token is not cancelled until the handle's
+    /// [`cancel`](CancellationHandle::cancel) is called.
+    #[must_use]
+    pub fn new() -> (CancellationHandle, CancellationToken) {
+        let (tx, rx) = tokio::sync::watch::channel(false);
+        (
+            CancellationHandle { tx },
+            CancellationToken { rx: Some(rx) },
+        )
+    }
+
+    /// A token that can never be cancelled — pass it to
+    /// [`orchestrate`](crate::workflow::Workflow::orchestrate) /
+    /// [`resume_from`](crate::workflow::Workflow::resume_from) to opt a run out of cancellation.
+    /// No channel is allocated, so this path stays allocation- and overhead-free: the FSM skips
+    /// the cancellation `select!` entirely.
+    #[must_use]
+    pub fn disabled() -> Self {
+        Self { rx: None }
+    }
+
+    /// Whether this token has already been cancelled. Non-blocking poll.
+    #[must_use]
+    pub fn is_cancelled(&self) -> bool {
+        self.rx.as_ref().is_some_and(|rx| *rx.borrow())
+    }
+
+    /// Whether this token can ever observe a cancellation. `false` for a
+    /// [`disabled`](Self::disabled) token, letting the FSM skip the cancellation `select!`.
+    pub(crate) fn can_cancel(&self) -> bool {
+        self.rx.is_some()
+    }
+
+    /// Resolve once the token is cancelled. A [`disabled`](Self::disabled) token (or one whose
+    /// handle was dropped without cancelling) stays pending forever — making it safe to use as a
+    /// `select!` branch that simply never wins.
+    pub async fn cancelled(&self) {
+        match &self.rx {
+            None => std::future::pending::<()>().await,
+            Some(rx) => {
+                let mut rx = rx.clone();
+                if *rx.borrow() {
+                    return;
+                }
+                while rx.changed().await.is_ok() {
+                    if *rx.borrow() {
+                        return;
+                    }
+                }
+                // Sender dropped without ever sending `true`: never cancels.
+                std::future::pending::<()>().await;
+            }
+        }
+    }
+}
+
+impl CancellationHandle {
+    /// Signal cancellation to every token observing this handle. Idempotent — calling it again
+    /// after the first cancel is a no-op.
+    pub fn cancel(&self) {
+        let _ = self.tx.send(true);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::time::Duration;
+
+    #[tokio::test]
+    async fn cancel_propagates_to_receivers() {
+        let (handle, token) = CancellationToken::new();
+        assert!(!token.is_cancelled());
+        handle.cancel();
+        // `cancelled()` resolves promptly.
+        tokio::time::timeout(Duration::from_secs(1), token.cancelled())
+            .await
+            .expect("cancelled() should resolve after cancel");
+        assert!(token.is_cancelled());
+    }
+
+    #[tokio::test]
+    async fn clone_after_cancel_still_observes() {
+        let (handle, token) = CancellationToken::new();
+        handle.cancel();
+        let cloned = token.clone();
+        assert!(cloned.is_cancelled());
+        tokio::time::timeout(Duration::from_secs(1), cloned.cancelled())
+            .await
+            .expect("a clone made after cancel still observes it");
+    }
+
+    #[test]
+    fn is_cancelled_polls_without_await() {
+        let (handle, token) = CancellationToken::new();
+        assert!(!token.is_cancelled());
+        handle.cancel();
+        assert!(token.is_cancelled());
+    }
+
+    #[tokio::test]
+    async fn cancel_is_idempotent() {
+        let (handle, token) = CancellationToken::new();
+        handle.cancel();
+        handle.cancel(); // second call is a no-op
+        assert!(token.is_cancelled());
+    }
+
+    #[tokio::test]
+    async fn dropping_handle_without_cancel_keeps_token_pending() {
+        let (handle, token) = CancellationToken::new();
+        drop(handle);
+        assert!(!token.is_cancelled());
+        // `cancelled()` must NOT resolve just because the sender dropped.
+        let res = tokio::time::timeout(Duration::from_millis(50), token.cancelled()).await;
+        assert!(
+            res.is_err(),
+            "cancelled() should stay pending after handle drop"
+        );
+    }
+
+    #[tokio::test]
+    async fn disabled_is_never_cancelled_and_can_cancel_false() {
+        let token = CancellationToken::disabled();
+        assert!(!token.is_cancelled());
+        assert!(!token.can_cancel());
+        let res = tokio::time::timeout(Duration::from_millis(50), token.cancelled()).await;
+        assert!(res.is_err(), "disabled token should stay pending");
+    }
+
+    #[test]
+    fn can_cancel_true_for_new_token() {
+        let (_handle, token) = CancellationToken::new();
+        assert!(token.can_cancel());
+    }
+
+    // Cancel fires *after* the await begins — exercises the `rx.changed().await`
+    // wakeup path (the other tests cancel first and hit the fast-path `borrow()`).
+    #[tokio::test]
+    async fn cancelled_resolves_when_cancel_fires_while_awaiting() {
+        let (handle, token) = CancellationToken::new();
+        let waiter = tokio::spawn(async move { token.cancelled().await });
+        // Let the waiter park inside `changed().await` before cancelling.
+        tokio::time::sleep(Duration::from_millis(20)).await;
+        handle.cancel();
+        tokio::time::timeout(Duration::from_secs(1), waiter)
+            .await
+            .expect("waiter should wake on cancel")
+            .expect("waiter task should not panic");
+    }
+
+    // A clone taken *before* the cancel still observes it (both halves share state).
+    #[tokio::test]
+    async fn clone_before_cancel_is_observed() {
+        let (handle, token) = CancellationToken::new();
+        let cloned = token.clone();
+        assert!(!cloned.is_cancelled());
+        handle.cancel();
+        assert!(cloned.is_cancelled());
+        assert!(token.is_cancelled());
+        tokio::time::timeout(Duration::from_secs(1), cloned.cancelled())
+            .await
+            .expect("a clone made before cancel still resolves");
+    }
+
+    // `CancellationHandle` is `Clone`; cancelling via a clone still fires, even after
+    // the original handle is dropped.
+    #[tokio::test]
+    async fn cloned_handle_triggers_cancellation() {
+        let (handle, token) = CancellationToken::new();
+        let handle2 = handle.clone();
+        drop(handle); // only the clone remains
+        assert!(!token.is_cancelled());
+        handle2.cancel();
+        assert!(token.is_cancelled());
+    }
+
+    // One cancel wakes every concurrent awaiter.
+    #[tokio::test]
+    async fn multiple_awaiters_all_wake_on_cancel() {
+        let (handle, token) = CancellationToken::new();
+        let waiters: Vec<_> = (0..5)
+            .map(|_| {
+                let t = token.clone();
+                tokio::spawn(async move { t.cancelled().await })
+            })
+            .collect();
+        tokio::time::sleep(Duration::from_millis(20)).await;
+        handle.cancel();
+        for w in waiters {
+            tokio::time::timeout(Duration::from_secs(1), w)
+                .await
+                .expect("every awaiter should wake")
+                .expect("awaiter task should not panic");
+        }
+    }
+}
diff --git a/cano/src/error.rs b/cano/src/error.rs
index e58a56e..666605e 100644
--- a/cano/src/error.rs
+++ b/cano/src/error.rs
@@ -205,6 +205,17 @@ pub enum CanoError {
         limit: std::time::Duration,
     },
 
+    /// A run was cancelled via a [`CancellationToken`](crate::cancel::CancellationToken).
+    ///
+    /// Emitted by [`orchestrate`](crate::workflow::Workflow::orchestrate)
+    /// and [`resume_from`](crate::workflow::Workflow::resume_from) when the paired
+    /// [`CancellationHandle`](crate::cancel::CancellationHandle) fires. The in-flight
+    /// cancellable task is dropped at its next await point and the compensation stack is drained
+    /// before this error surfaces. Like every task error from the FSM it is wrapped in
+    /// [`CanoError::WithStateContext`] (clean rollback); a dirty rollback yields
+    /// [`CanoError::CompensationFailed`] whose `errors[0]` carries the wrapped `Cancelled`.
+    Cancelled,
+
     /// A call was rejected because the circuit breaker is open.
     ///
     /// Emitted by [`crate::circuit::CircuitBreaker::try_acquire`] (and surfaced through the
@@ -382,6 +393,11 @@ impl CanoError {
         CanoError::WorkflowTimeout { elapsed, limit }
     }
 
+    /// Create a new cancellation error.
+    pub fn cancelled() -> Self {
+        CanoError::Cancelled
+    }
+
     /// Create a new circuit-open error
     pub fn circuit_open<S: Into<String>>(msg: S) -> Self {
         CanoError::CircuitOpen(msg.into())
@@ -510,6 +526,7 @@ impl CanoError {
             CanoError::RetryExhausted { source, .. } => source.message(),
             CanoError::Timeout(msg) => msg,
             CanoError::WorkflowTimeout { .. } => "workflow total timeout exceeded",
+            CanoError::Cancelled => "workflow cancelled",
             CanoError::CircuitOpen(msg) => msg,
             CanoError::RateLimited { .. } => "rate limited",
             CanoError::CheckpointStore(msg) => msg,
@@ -588,6 +605,7 @@ impl CanoError {
             CanoError::RetryExhausted { .. } => "retry_exhausted",
             CanoError::Timeout(_) => "timeout",
             CanoError::WorkflowTimeout { .. } => "workflow_timeout",
+            CanoError::Cancelled => "cancelled",
             CanoError::CircuitOpen(_) => "circuit_open",
             CanoError::RateLimited { .. } => "rate_limited",
             CanoError::CheckpointStore(_) => "checkpoint_store",
@@ -618,6 +636,7 @@ impl std::fmt::Display for CanoError {
                 f,
                 "Workflow total timeout exceeded: elapsed={elapsed:?} limit={limit:?}"
             ),
+            CanoError::Cancelled => write!(f, "Workflow cancelled"),
             CanoError::CircuitOpen(msg) => write!(f, "Circuit open: {msg}"),
             CanoError::RateLimited { tier, retry_after } => {
                 write!(
@@ -713,6 +732,7 @@ impl PartialEq for CanoError {
                     limit: l2,
                 },
             ) => e1 == e2 && l1 == l2,
+            (CanoError::Cancelled, CanoError::Cancelled) => true,
             (CanoError::CircuitOpen(a), CanoError::CircuitOpen(b)) => a == b,
             (
                 CanoError::RateLimited {
@@ -991,6 +1011,43 @@ mod tests {
         assert_ne!(timeout, workflow);
     }
 
+    #[test]
+    fn test_cancelled_constructor_category_display_and_eq() {
+        let err = CanoError::cancelled();
+        assert_eq!(err.message(), "workflow cancelled");
+        assert_eq!(err.category(), "cancelled");
+        assert_eq!(err.outer_category(), "cancelled");
+        assert_eq!(format!("{err}"), "Workflow cancelled");
+        assert_eq!(CanoError::cancelled(), CanoError::Cancelled);
+        assert_ne!(CanoError::cancelled(), CanoError::timeout("x"));
+    }
+
+    #[test]
+    fn test_cancelled_wrapped_in_state_context() {
+        // How a cancel actually surfaces from orchestrate: wrapped with FSM context.
+        let wrapped = CanoError::with_state_context(
+            "Ship",
+            0,
+            vec!["Reserve".into(), "Ship".into()],
+            CanoError::cancelled(),
+        );
+        // `category()` unwraps `WithStateContext` so alerting still buckets on the cause.
+        assert_eq!(wrapped.category(), "cancelled");
+        assert_eq!(wrapped.outer_category(), "with_state_context");
+        assert!(matches!(wrapped.inner(), CanoError::Cancelled));
+        // A dirty rollback nests it under CompensationFailed with errors[0] = the wrapped cancel.
+        let dirty = CanoError::compensation_failed(vec![
+            wrapped,
+            CanoError::task_execution("compensator boom"),
+        ]);
+        assert_eq!(dirty.category(), "compensation_failed");
+        if let CanoError::CompensationFailed { errors } = &dirty {
+            assert_eq!(errors[0].category(), "cancelled");
+        } else {
+            panic!("expected CompensationFailed");
+        }
+    }
+
     #[test]
     fn test_circuit_open_constructor_and_category() {
         let err = CanoError::circuit_open("breaker tripped");
diff --git a/cano/src/lib.rs b/cano/src/lib.rs
index f08514c..3fbc78b 100644
--- a/cano/src/lib.rs
+++ b/cano/src/lib.rs
@@ -57,7 +57,7 @@
 //!     .register(Step::Process, ProcessTask)
 //!     .add_exit_state(Step::Done);
 //!
-//! let final_state = workflow.orchestrate(Step::Fetch).await?;
+//! let final_state = workflow.orchestrate(Step::Fetch, CancellationToken::disabled()).await?;
 //! assert_eq!(final_state, Step::Done);
 //!
 //! // The sum of 1..=3 is 6.
@@ -95,7 +95,7 @@
 //!     .register(Step::Compute, ComputeTask)
 //!     .add_exit_state(Step::Done);
 //!
-//! let final_state = workflow.orchestrate(Step::Compute).await?;
+//! let final_state = workflow.orchestrate(Step::Compute, CancellationToken::disabled()).await?;
 //! assert_eq!(final_state, Step::Done);
 //! # Ok(())
 //! # }
@@ -199,13 +199,25 @@
 //!
 //! ### Timeouts
 //!
-//! Three layered budgets bound a run. [`TaskConfig::with_attempt_timeout`](task::TaskConfig::with_attempt_timeout)
+//! Two layered budgets bound a run. [`TaskConfig::with_attempt_timeout`](task::TaskConfig::with_attempt_timeout)
 //! caps each individual task attempt. [`Workflow::with_total_timeout`] sets a wall-clock
 //! budget for the entire [`orchestrate`](Workflow::orchestrate) / [`resume_from`](Workflow::resume_from)
 //! call; when it elapses the in-flight task is aborted, the saga compensation stack drains
 //! against its own bounded budget (configurable via [`Workflow::with_compensation_timeout`]),
-//! and the call returns [`CanoError::WorkflowTimeout`]. Contrast with [`Workflow::with_timeout`],
-//! a blunt outer `tokio::time::timeout` that offers no graceful compensation.
+//! and the call returns [`CanoError::WorkflowTimeout`]. To stop a run on an external signal rather
+//! than a deadline, use [cooperative cancellation](#cooperative-cancellation).
+//!
+//! ### Cooperative cancellation
+//!
+//! [`Workflow::orchestrate`] (and [`resume_from`](Workflow::resume_from))
+//! take a [`CancellationToken`] obtained from [`CancellationToken::new`]; firing the paired
+//! [`CancellationHandle`] aborts the in-flight cancellable task at its next `.await`, drains the
+//! saga compensation stack, and returns [`CanoError::Cancelled`]. Cancellation is *cooperative*
+//! (a task in tight synchronous work isn't interrupted until it yields) and *saga-safe* (a
+//! [`CompensatableTask`] always runs to completion so its rollback entry is recorded; the cancel
+//! is honoured at the next state boundary). The compensation drain itself is uncancellable.
+//! To opt a run out of cancellation, pass [`CancellationToken::disabled`] — it never fires and is
+//! zero-cost (the FSM skips the cancellation `select!`). See the [`cancel`] module.
 //!
 //! ## Module Overview
 //!
@@ -215,6 +227,7 @@
 //! - [`task::timer`]: The [`TimerTask`] trait — wait-then-transition via `wait()`/`after_wait()`; registered with [`Workflow::register`]
 //! - [`task::batch`]: The [`BatchTask`] trait — fan-out over data items via `load`/`process_item`/`finish`; registered with [`Workflow::register`]
 //! - [`task::stepped`]: The [`SteppedTask`] trait — resumable iterative work via `step()` with a serializable cursor; registered with [`Workflow::register_stepped`] (persists the cursor when a checkpoint store is attached)
+//! - [`cancel`]: [`CancellationToken`] / [`CancellationHandle`] — cooperative cancellation for [`orchestrate`](Workflow::orchestrate) / [`resume_from`](Workflow::resume_from)
 //! - [`workflow`]: [`Workflow`] — FSM orchestration with Split/Join support
 //! - `scheduler` (requires `scheduler` feature): `Scheduler` (builder) and `RunningScheduler` (live handle) — cron and interval scheduling
 //! - [`mod@resource`]: [`Resource`] trait, [`Resources`] dictionary, and [`HealthStatus`] — lifecycle-aware resource management and health probes
@@ -240,6 +253,7 @@
 //! 2. Read the module docs — each module has detailed documentation and examples
 //! 3. Run benchmarks: `cargo bench --bench workflow_performance`
 
+pub mod cancel;
 pub mod circuit;
 pub mod error;
 pub mod observer;
@@ -261,6 +275,7 @@ pub mod scheduler;
 pub mod testing;
 
 // Core public API - simplified imports
+pub use cancel::{CancellationHandle, CancellationToken};
 pub use circuit::{CircuitBreaker, CircuitPolicy, CircuitState, Permit as CircuitPermit};
 pub use error::{CanoError, CanoResult};
 pub use observer::WorkflowObserver;
@@ -353,14 +368,14 @@ pub mod prelude {
     //! Use `use cano::prelude::*;` to import the most commonly used types and traits.
 
     pub use crate::{
-        BatchTask, CanoError, CanoResult, CheckpointRow, CheckpointStore, CircuitBreaker,
-        CircuitPermit, CircuitPolicy, CircuitState, CompensatableTask, HealthStatus, JoinConfig,
-        JoinStrategy, MemoryStore, Meter, MeterStatus, MultiPermit, MultiRateLimiter,
-        PollErrorPolicy, PollOutcome, PollTask, RateLimiter, RateLimiterPermit, RateLimiterPolicy,
-        Reservation, Resource, Resources, RetryMode, RouterTask, RowKind, SplitResult,
-        SplitTaskResult, StateEntry, StepOutcome, SteppedTask, Task, TaskConfig, TaskObject,
-        TaskResult, Tier, TimerOutcome, TimerTask, WindowPermit, WindowPolicy, WindowedRateLimiter,
-        Workflow, WorkflowObserver, run_stepped,
+        BatchTask, CancellationHandle, CancellationToken, CanoError, CanoResult, CheckpointRow,
+        CheckpointStore, CircuitBreaker, CircuitPermit, CircuitPolicy, CircuitState,
+        CompensatableTask, HealthStatus, JoinConfig, JoinStrategy, MemoryStore, Meter, MeterStatus,
+        MultiPermit, MultiRateLimiter, PollErrorPolicy, PollOutcome, PollTask, RateLimiter,
+        RateLimiterPermit, RateLimiterPolicy, Reservation, Resource, Resources, RetryMode,
+        RouterTask, RowKind, SplitResult, SplitTaskResult, StateEntry, StepOutcome, SteppedTask,
+        Task, TaskConfig, TaskObject, TaskResult, Tier, TimerOutcome, TimerTask, WindowPermit,
+        WindowPolicy, WindowedRateLimiter, Workflow, WorkflowObserver, run_stepped,
     };
 
     #[cfg(feature = "scheduler")]
diff --git a/cano/src/metrics.rs b/cano/src/metrics.rs
index c08098d..60a1b1c 100644
--- a/cano/src/metrics.rs
+++ b/cano/src/metrics.rs
@@ -102,6 +102,7 @@ pub const OBSERVED_WORKFLOW_TIMEOUT_LIMIT_SECONDS: &str =
 pub const OBSERVED_WORKFLOW_TIMEOUT_ELAPSED_SECONDS: &str =
     "cano_observed_workflow_timeout_elapsed_seconds";
 pub const OBSERVED_UNKNOWN_RESUME_STATES_TOTAL: &str = "cano_observed_unknown_resume_states_total";
+pub const OBSERVED_CANCELLATIONS_TOTAL: &str = "cano_observed_cancellations_total";
 
 // Always-on direct instrumentation:
 pub const WORKFLOW_RUNS_TOTAL: &str = "cano_workflow_runs_total";
@@ -194,11 +195,16 @@ pub fn describe() {
         Unit::Count,
         "Checkpoint rows whose state label is not registered on the current workflow (emitted by MetricsObserver during resume_from)"
     );
+    describe_counter!(
+        OBSERVED_CANCELLATIONS_TOTAL,
+        Unit::Count,
+        "Workflow runs cancelled via a CancellationToken (emitted by MetricsObserver via on_cancelled)"
+    );
 
     describe_counter!(
         WORKFLOW_RUNS_TOTAL,
         Unit::Count,
-        "Workflow runs (via Workflow::orchestrate/resume_from), by terminal outcome (completed|failed|timeout)"
+        "Workflow runs (via Workflow::orchestrate/resume_from), by terminal outcome (completed|failed)"
     );
     describe_histogram!(
         WORKFLOW_DURATION_SECONDS,
@@ -402,6 +408,9 @@ pub(crate) fn observed_workflow_timeout(elapsed: Duration, limit: Duration) {
 pub(crate) fn observed_unknown_resume_state() {
     counter!(OBSERVED_UNKNOWN_RESUME_STATES_TOTAL).increment(1);
 }
+pub(crate) fn observed_cancellation() {
+    counter!(OBSERVED_CANCELLATIONS_TOTAL).increment(1);
+}
 
 // ----- workflow run -----
 
diff --git a/cano/src/observer.rs b/cano/src/observer.rs
index e32d6e0..6099b65 100644
--- a/cano/src/observer.rs
+++ b/cano/src/observer.rs
@@ -107,6 +107,15 @@ pub trait WorkflowObserver: Send + Sync + 'static {
     /// wrapped timeout (dirty rollback).
     fn on_workflow_timeout(&self, _elapsed: std::time::Duration, _limit: std::time::Duration) {}
 
+    /// Called when a run is cancelled via a
+    /// [`CancellationToken`](crate::cancel::CancellationToken) — either observed at a state
+    /// boundary or while a cancellable task was in flight. `state` is the `Debug` rendering of the
+    /// state the cancellation was observed at. Fires exactly once per cancelled run, immediately
+    /// before the compensation stack is drained. Followed on the public API's return by a
+    /// `CanoError::WithStateContext` wrapping a `CanoError::Cancelled` (clean rollback), or a
+    /// `CanoError::CompensationFailed` whose `errors[0]` is the wrapped `Cancelled` (dirty rollback).
+    fn on_cancelled(&self, _state: &str) {}
+
     /// Called when the engine attempted to clear a checkpoint log (after a
     /// successful run or after a clean compensation drain) and the backend
     /// returned an error.
@@ -236,6 +245,9 @@ impl WorkflowObserver for TracingObserver {
             "workflow total timeout exceeded"
         );
     }
+    fn on_cancelled(&self, state: &str) {
+        tracing::warn!(state, "workflow cancelled");
+    }
     fn on_checkpoint_clear_failed(&self, workflow_id: &str, error: &CanoError) {
         tracing::warn!(workflow_id, error = %error, "checkpoint log clear failed");
     }
@@ -297,6 +309,9 @@ impl WorkflowObserver for MetricsObserver {
     fn on_workflow_timeout(&self, elapsed: std::time::Duration, limit: std::time::Duration) {
         crate::metrics::observed_workflow_timeout(elapsed, limit);
     }
+    fn on_cancelled(&self, _state: &str) {
+        crate::metrics::observed_cancellation();
+    }
     fn on_checkpoint_clear_failed(&self, _workflow_id: &str, _error: &CanoError) {
         crate::metrics::checkpoint_clear(false);
     }
@@ -382,7 +397,13 @@ mod tests {
             .add_exit_state(S::Done)
             .with_observer(Arc::new(obs));
 
-        assert_eq!(workflow.orchestrate(S::Start).await.unwrap(), S::Done);
+        assert_eq!(
+            workflow
+                .orchestrate(S::Start, CancellationToken::disabled())
+                .await
+                .unwrap(),
+            S::Done
+        );
 
         let events = rec.labels();
         assert!(
@@ -415,7 +436,12 @@ mod tests {
             .add_exit_state(S::Done)
             .with_observer(Arc::new(obs));
 
-        assert!(workflow.orchestrate(S::Start).await.is_err());
+        assert!(
+            workflow
+                .orchestrate(S::Start, CancellationToken::disabled())
+                .await
+                .is_err()
+        );
 
         let events = rec.labels();
         assert!(
@@ -463,7 +489,10 @@ mod tests {
             .add_exit_state(S::Done)
             .with_observer(Arc::new(obs));
 
-        let err = workflow.orchestrate(S::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(S::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // The FSM wraps the failure with state context; the inner is CircuitOpen.
         assert!(matches!(err.inner(), CanoError::CircuitOpen(_)), "{err}");
 
@@ -484,7 +513,13 @@ mod tests {
         let workflow = Workflow::bare()
             .register(S::Start, OkTask)
             .add_exit_state(S::Done);
-        assert_eq!(workflow.orchestrate(S::Start).await.unwrap(), S::Done);
+        assert_eq!(
+            workflow
+                .orchestrate(S::Start, CancellationToken::disabled())
+                .await
+                .unwrap(),
+            S::Done
+        );
     }
 
     #[test]
@@ -580,7 +615,7 @@ mod metrics_observer_tests {
                 .register(S::Start, GoTo(S::Mid))
                 .register(S::Mid, GoTo(S::Done))
                 .add_exit_state(S::Done)
-                .orchestrate(S::Start)
+                .orchestrate(S::Start, CancellationToken::disabled())
                 .await
         });
         assert_eq!(res.unwrap(), S::Done);
@@ -633,7 +668,7 @@ mod metrics_observer_tests {
                 .with_total_timeout(std::time::Duration::from_millis(20))
                 .register(S::Start, SlowTask)
                 .add_exit_state(S::Done)
-                .orchestrate(S::Start)
+                .orchestrate(S::Start, CancellationToken::disabled())
                 .await
         });
         assert!(res.is_err());
@@ -681,7 +716,7 @@ mod metrics_observer_tests {
                 .with_observer(Arc::new(MetricsObserver::new()))
                 .register(S::Start, Flaky(n2))
                 .add_exit_state(S::Done)
-                .orchestrate(S::Start)
+                .orchestrate(S::Start, CancellationToken::disabled())
                 .await
         });
         assert_eq!(res.unwrap(), S::Done);
diff --git a/cano/src/scheduler.rs b/cano/src/scheduler.rs
index 560db9f..6433bce 100644
--- a/cano/src/scheduler.rs
+++ b/cano/src/scheduler.rs
@@ -30,6 +30,7 @@ mod backoff;
 
 pub use backoff::BackoffPolicy;
 
+use crate::cancel::CancellationHandle;
 use crate::error::CanoResult;
 use crate::workflow::Workflow;
 use chrono::{DateTime, Utc};
@@ -53,6 +54,12 @@ enum SchedulerCommand {
         id: Arc<str>,
         response: oneshot::Sender<CanoResult<()>>,
     },
+    /// Request cooperative cancellation of a flow's in-flight run. A no-op when
+    /// the flow isn't currently running.
+    Cancel {
+        id: Arc<str>,
+        response: oneshot::Sender<CanoResult<()>>,
+    },
 }
 
 /// Simplified scheduling options
@@ -134,6 +141,11 @@ where
     schedule: ParsedSchedule,
     info: Arc<RwLock<FlowInfo>>,
     policy: Arc<BackoffPolicy>,
+    /// Cancellation handle for the flow's *currently executing* run, published
+    /// by `execute_reserved_flow` while a run is in flight and cleared when it
+    /// finishes. `None` when the flow is idle. Lets `cancel_flow` and graceful
+    /// shutdown cooperatively cancel an in-flight run.
+    cancel: Arc<RwLock<Option<CancellationHandle>>>,
 }
 
 impl<TState, TResourceKey> Clone for FlowData<TState, TResourceKey>
@@ -148,6 +160,7 @@ where
             schedule: self.schedule.clone(),
             info: Arc::clone(&self.info),
             policy: self.policy.clone(),
+            cancel: Arc::clone(&self.cancel),
         }
     }
 }
diff --git a/cano/src/scheduler/builder.rs b/cano/src/scheduler/builder.rs
index 6378fd6..e9a2f29 100644
--- a/cano/src/scheduler/builder.rs
+++ b/cano/src/scheduler/builder.rs
@@ -184,6 +184,7 @@ where
                 schedule,
                 info,
                 policy: Arc::new(BackoffPolicy::default()),
+                cancel: Arc::new(RwLock::new(None)),
             },
         );
         self.flow_order.push(id);
@@ -323,6 +324,7 @@ where
                 let initial_state = fd.initial_state.clone();
                 let info = Arc::clone(&fd.info);
                 let policy = fd.policy.clone();
+                let cancel = Arc::clone(&fd.cancel);
                 let running_clone = Arc::clone(&running);
                 let notify_clone = Arc::clone(&stop_notify);
 
@@ -334,6 +336,7 @@ where
                             initial_state,
                             info,
                             policy,
+                            cancel,
                             running_clone,
                             notify_clone,
                             interval,
@@ -347,6 +350,7 @@ where
                             initial_state,
                             info,
                             policy,
+                            cancel,
                             running_clone,
                             notify_clone,
                             cron_schedule,
diff --git a/cano/src/scheduler/loops.rs b/cano/src/scheduler/loops.rs
index 6df6dff..bd5af6d 100644
--- a/cano/src/scheduler/loops.rs
+++ b/cano/src/scheduler/loops.rs
@@ -13,6 +13,7 @@ use tokio::sync::{Notify, RwLock, mpsc, watch};
 use tokio::task::{AbortHandle, JoinHandle};
 use tokio::time::{Duration, sleep};
 
+use crate::cancel::{CancellationHandle, CancellationToken};
 use crate::error::{CanoError, CanoResult};
 use crate::workflow::Workflow;
 
@@ -73,11 +74,13 @@ async fn sleep_unless_stopped(
 /// Per-flow `Every`-schedule loop body. Lives outside `start` so the driver
 /// task and the loops are decoupled — the driver owns the workflows
 /// HashMap, the loops just see the data they need.
+#[allow(clippy::too_many_arguments)]
 pub(super) async fn spawn_every_loop<TState, TResourceKey>(
     workflow: Arc<Workflow<TState, TResourceKey>>,
     initial_state: TState,
     info: Arc<RwLock<FlowInfo>>,
     policy: Arc<BackoffPolicy>,
+    cancel: Arc<RwLock<Option<CancellationHandle>>>,
     running: Arc<RwLock<bool>>,
     stop_notify: Arc<Notify>,
     interval: Duration,
@@ -98,6 +101,7 @@ pub(super) async fn spawn_every_loop<TState, TResourceKey>(
             initial_state.clone(),
             Arc::clone(&info),
             &policy,
+            Arc::clone(&cancel),
         )
         .await;
     }
@@ -134,6 +138,7 @@ pub(super) async fn spawn_every_loop<TState, TResourceKey>(
             initial_state.clone(),
             Arc::clone(&info),
             &policy,
+            Arc::clone(&cancel),
         )
         .await;
     }
@@ -141,11 +146,13 @@ pub(super) async fn spawn_every_loop<TState, TResourceKey>(
 
 /// Per-flow `Cron`-schedule loop body. See [`spawn_every_loop`] for the
 /// rationale on splitting the loop bodies out of `start`.
+#[allow(clippy::too_many_arguments)]
 pub(super) async fn spawn_cron_loop<TState, TResourceKey>(
     workflow: Arc<Workflow<TState, TResourceKey>>,
     initial_state: TState,
     info: Arc<RwLock<FlowInfo>>,
     policy: Arc<BackoffPolicy>,
+    cancel: Arc<RwLock<Option<CancellationHandle>>>,
     running: Arc<RwLock<bool>>,
     stop_notify: Arc<Notify>,
     schedule: Box<CronSchedule>,
@@ -208,6 +215,7 @@ pub(super) async fn spawn_cron_loop<TState, TResourceKey>(
             initial_state.clone(),
             Arc::clone(&info),
             &policy,
+            Arc::clone(&cancel),
         )
         .await;
     }
@@ -258,8 +266,16 @@ pub(super) async fn driver_task<TState, TResourceKey>(
                             let initial_state = flow.initial_state.clone();
                             let info = Arc::clone(&flow.info);
                             let policy = Arc::clone(&flow.policy);
+                            let cancel = Arc::clone(&flow.cancel);
                             let handle = tokio::spawn(async move {
-                                execute_reserved_flow(workflow, initial_state, info, &policy).await;
+                                execute_reserved_flow(
+                                    workflow,
+                                    initial_state,
+                                    info,
+                                    &policy,
+                                    cancel,
+                                )
+                                .await;
                             });
                             let mut tasks = scheduler_tasks.write().await;
                             tasks.retain(|h| !h.is_finished());
@@ -299,6 +315,24 @@ pub(super) async fn driver_task<TState, TResourceKey>(
                     )))
                 };
 
+                let _ = response.send(outcome);
+            }
+            SchedulerCommand::Cancel { id, response } => {
+                let outcome = if let Some(flow) = workflows.get(&id) {
+                    // Fire the in-flight run's cancellation handle, if any. The
+                    // run observes `Cancelled` at its next await, drains its saga,
+                    // and `apply_outcome` returns the flow to `Idle`. A flow that
+                    // isn't currently running has no handle — an idempotent no-op.
+                    if let Some(h) = flow.cancel.read().await.as_ref() {
+                        h.cancel();
+                    }
+                    Ok(())
+                } else {
+                    Err(CanoError::Workflow(format!(
+                        "No workflow registered with id '{id}'"
+                    )))
+                };
+
                 let _ = response.send(outcome);
             }
         }
@@ -313,6 +347,17 @@ pub(super) async fn driver_task<TState, TResourceKey>(
     // how long an in-flight workflow takes — not by the schedule interval.
     stop_notify.notify_waiters();
 
+    // Cooperatively cancel every in-flight run so shutdown latency is bounded by
+    // the time to the next await + the saga drain, not by how long the workflow
+    // would naturally take. Each cancelled run drains its compensation stack and
+    // returns `Cancelled` (recorded as Idle, not a failure, by `apply_outcome`).
+    // The bounded wait below still caps the total drain time.
+    for flow in workflows.values() {
+        if let Some(h) = flow.cancel.read().await.as_ref() {
+            h.cancel();
+        }
+    }
+
     // Wait for all scheduler loop tasks to finish.
     //
     // Pop with a short-lived write lock per iteration (rather than holding
@@ -387,6 +432,7 @@ async fn execute_flow<TState, TResourceKey>(
     initial_state: TState,
     info: Arc<RwLock<FlowInfo>>,
     policy: &BackoffPolicy,
+    cancel: Arc<RwLock<Option<CancellationHandle>>>,
 ) where
     TState: Clone + Send + Sync + 'static + std::fmt::Debug + std::hash::Hash + Eq,
     TResourceKey: Hash + Eq + Send + Sync + 'static,
@@ -398,7 +444,7 @@ async fn execute_flow<TState, TResourceKey>(
         return;
     }
 
-    execute_reserved_flow(workflow, initial_state, info, policy).await;
+    execute_reserved_flow(workflow, initial_state, info, policy, cancel).await;
 }
 
 /// Result of attempting to reserve a flow for dispatch. The Tripped and
@@ -434,6 +480,7 @@ async fn execute_reserved_flow<TState, TResourceKey>(
     initial_state: TState,
     info: Arc<RwLock<FlowInfo>>,
     policy: &BackoffPolicy,
+    cancel: Arc<RwLock<Option<CancellationHandle>>>,
 ) where
     TState: Clone + Send + Sync + 'static + std::fmt::Debug + std::hash::Hash + Eq,
     TResourceKey: Hash + Eq + Send + Sync + 'static,
@@ -455,6 +502,13 @@ async fn execute_reserved_flow<TState, TResourceKey>(
         .total_timeout
         .map(|d| (std::time::Instant::now(), d));
 
+    // Publish a fresh cancellation handle for this run so `cancel_flow` and
+    // graceful shutdown can cooperatively stop it (and drain its saga). A fresh
+    // token per run means cancelling one run never poisons a later one. Cleared
+    // below once the run finishes, so a `cancel_flow` on an idle flow is a no-op.
+    let (handle, token) = CancellationToken::new();
+    *cancel.write().await = Some(handle);
+
     // Wrap the workflow future in `catch_unwind`. A panic inside any path
     // that bypasses the FSM's own `catch_unwind` (e.g. an observer that
     // panics, a custom checkpoint store that panics) would otherwise abort
@@ -465,10 +519,10 @@ async fn execute_reserved_flow<TState, TResourceKey>(
     // `BackoffPolicy`.
     #[cfg(feature = "tracing")]
     let workflow_fut = workflow
-        .execute_workflow(initial_state, total_budget)
+        .execute_workflow(initial_state, total_budget, token)
         .instrument(tracing::info_span!("execute_flow"));
     #[cfg(not(feature = "tracing"))]
-    let workflow_fut = workflow.execute_workflow(initial_state, total_budget);
+    let workflow_fut = workflow.execute_workflow(initial_state, total_budget, token);
 
     let result = match AssertUnwindSafe(workflow_fut).catch_unwind().await {
         Ok(inner) => inner,
@@ -482,6 +536,10 @@ async fn execute_reserved_flow<TState, TResourceKey>(
         }
     };
 
+    // The run is over: drop the handle so a later `cancel_flow` on this now-idle
+    // flow is a clean no-op rather than firing a stale token.
+    *cancel.write().await = None;
+
     #[cfg(feature = "metrics")]
     crate::metrics::scheduler_flow_run(&_flow_id, result.is_ok(), _started.elapsed());
 
@@ -504,6 +562,14 @@ async fn apply_outcome(
             info_guard.failure_streak = 0;
             info_guard.next_eligible = None;
         }
+        // A deliberate cancellation (via `cancel_flow` or graceful shutdown) is
+        // not a fault: return the flow to `Idle` without touching the failure
+        // streak or backoff window, so its next scheduled run fires normally. A
+        // *dirty* cancel whose rollback itself failed surfaces as
+        // `compensation_failed`, which falls through to the backoff arm below.
+        Err(ref e) if e.category() == "cancelled" => {
+            info_guard.status = Status::Idle;
+        }
         Err(e) => {
             let err_str: Arc<str> = Arc::from(e.to_string());
             let new_streak = info_guard.failure_streak.saturating_add(1);
diff --git a/cano/src/scheduler/running.rs b/cano/src/scheduler/running.rs
index bdd6e0f..91d4d14 100644
--- a/cano/src/scheduler/running.rs
+++ b/cano/src/scheduler/running.rs
@@ -210,6 +210,44 @@ where
         })?
     }
 
+    /// Request cooperative cancellation of a flow's in-flight run.
+    ///
+    /// Asks the driver to fire the cancellation handle published for the flow's current
+    /// run: the run aborts at its next await point, drains its saga compensation stack,
+    /// and returns [`CanoError::Cancelled`]. The flow then returns to
+    /// [`Status::Idle`](crate::scheduler::Status::Idle); a deliberate cancel is **not**
+    /// counted as a failure against the [`BackoffPolicy`](crate::scheduler::BackoffPolicy).
+    ///
+    /// `Ok(())` means the request was delivered, not that the run has already stopped —
+    /// poll [`status`](Self::status) to watch it settle. A **no-op** (still `Ok`) when the
+    /// flow exists but isn't currently running. Graceful [`stop`](Self::stop) cancels
+    /// every in-flight flow this same way before draining.
+    ///
+    /// # Errors
+    ///
+    /// - [`CanoError::Workflow`] — the scheduler is not running (or stopped before it
+    ///   replied), `id` is unknown, or the command queue is full (the send never waits).
+    pub async fn cancel_flow(&self, id: &str) -> CanoResult<()> {
+        let (response_tx, response_rx) = oneshot::channel();
+        self.command_tx
+            .try_send(SchedulerCommand::Cancel {
+                id: Arc::from(id),
+                response: response_tx,
+            })
+            .map_err(|e| match e {
+                mpsc::error::TrySendError::Closed(_) => CanoError::Workflow(
+                    "Scheduler not running — call start() before cancel_flow()".to_string(),
+                ),
+                mpsc::error::TrySendError::Full(_) => {
+                    CanoError::Workflow("Scheduler command queue full".to_string())
+                }
+            })?;
+
+        response_rx.await.map_err(|_| {
+            CanoError::Workflow("Scheduler stopped before cancel was processed".to_string())
+        })?
+    }
+
     /// Get a snapshot of the workflow status.
     pub async fn status(&self, id: &str) -> Option<FlowInfo> {
         let info = self.flows.get(id)?;
@@ -722,74 +760,6 @@ mod tests {
         assert!(result.is_ok(), "Test timed out");
     }
 
-    #[tokio::test(flavor = "multi_thread")]
-    async fn test_trigger_during_graceful_shutdown_window_reports_not_running() {
-        // While the driver task is parked waiting for a slow in-flight workflow
-        // to finish, a concurrent trigger() must surface "not running" instead
-        // of enqueueing into the closed command channel.
-        #[derive(Clone)]
-        struct SlowTask;
-
-        #[task]
-        impl Task<TestState> for SlowTask {
-            async fn run_bare(&self) -> Result<TaskResult<TestState>, CanoError> {
-                // Hold Status::Running long enough to span the shutdown window.
-                sleep(Duration::from_millis(400)).await;
-                Ok(TaskResult::Single(TestState::Complete))
-            }
-        }
-
-        let timeout = Duration::from_secs(5);
-        let result = tokio::time::timeout(timeout, async {
-            let mut scheduler: Scheduler<TestState> = Scheduler::<TestState>::new();
-            let slow_workflow = Workflow::bare()
-                .register(TestState::Start, SlowTask)
-                .add_exit_state(TestState::Complete)
-                .add_exit_state(TestState::Error);
-            scheduler
-                .manual("slow_task", slow_workflow, TestState::Start)
-                .unwrap();
-
-            let running = scheduler.start().await.unwrap();
-            let probe = running.clone();
-
-            // Kick off the slow workflow and wait until it is actually Running.
-            probe.trigger("slow_task").await.unwrap();
-            sleep(Duration::from_millis(50)).await;
-            assert!(
-                probe.has_running_flows().await,
-                "slow workflow should be Running before stop()"
-            );
-
-            // Spawn stop() so we can probe the shutdown window concurrently.
-            let stop_handle = tokio::spawn(async move { running.stop().await });
-
-            // Let the driver dequeue Stop and close the command channel. The
-            // slow workflow is still running (~400ms total), so the driver is
-            // parked inside has_running_flows() — the shutdown window we want
-            // to probe.
-            sleep(Duration::from_millis(50)).await;
-            assert!(
-                !stop_handle.is_finished(),
-                "stop() must still be parked while the slow workflow is in flight"
-            );
-
-            // During the window, trigger() must report not-running.
-            let err = probe.trigger("slow_task").await.unwrap_err();
-            assert!(
-                err.to_string().contains("Scheduler not running"),
-                "expected not-running during shutdown window, got: {err}"
-            );
-
-            // stop() eventually returns Ok (teardown finishes).
-            let stop_result = stop_handle.await.expect("stop task should not panic");
-            stop_result.expect("stop should succeed once slow workflow finishes");
-        })
-        .await;
-
-        assert!(result.is_ok(), "Test timed out");
-    }
-
     #[tokio::test(flavor = "multi_thread")]
     async fn test_failed_workflow_registration() {
         // Registering a "failing" workflow (one whose post() returns Err) is a
@@ -1534,97 +1504,229 @@ mod tests {
         assert!(result.is_ok(), "Test timed out");
     }
 
-    #[tokio::test(flavor = "multi_thread")]
-    async fn drop_aborts_wedged_handle_currently_being_awaited_by_driver() {
-        // Regression for F9: when the driver pops a JoinHandle from
-        // `scheduler_tasks` and awaits it, the popped handle no longer lives
-        // in the Vec. A `Drop` firing while the await is in flight previously
-        // aborted `driver_handle` (cancelling the driver future, which then
-        // dropped the popped JoinHandle — detaching the underlying task
-        // instead of aborting it). The wedged task leaked indefinitely.
-        //
-        // Now `RunningScheduler::in_flight_drain` holds the popped handle's
-        // `AbortHandle` for the duration of the await, so Drop can reach the
-        // wedged task. This test triggers a workflow whose task sleeps for
-        // far longer than the test's tolerance, stops the scheduler so the
-        // driver enters its drain phase, drops the last clone, and asserts
-        // that the workflow's completion counter never advances.
-        use std::sync::atomic::{AtomicUsize, Ordering};
-
-        #[derive(Clone)]
-        struct SlowTask {
-            completions: Arc<AtomicUsize>,
+    // A long-running, cancellable flow task that records when it starts and (if
+    // never cancelled) when it completes — used to verify graceful shutdown
+    // cooperatively cancels in-flight flows.
+    #[derive(Clone)]
+    struct CancellableSlow {
+        started: std::sync::Arc<AtomicU32>,
+        completed: std::sync::Arc<AtomicU32>,
+    }
+    #[task]
+    impl Task<TestState> for CancellableSlow {
+        fn config(&self) -> crate::task::TaskConfig {
+            crate::task::TaskConfig::minimal()
         }
-        #[task]
-        impl Task<TestState> for SlowTask {
-            fn config(&self) -> crate::task::TaskConfig {
-                crate::task::TaskConfig::minimal()
-            }
-            async fn run_bare(&self) -> Result<TaskResult<TestState>, CanoError> {
-                // Sleeps far longer than the test tolerance. If the abort
-                // doesn't reach this task, the counter eventually ticks up.
-                sleep(Duration::from_secs(30)).await;
-                self.completions.fetch_add(1, Ordering::SeqCst);
-                Ok(TaskResult::Single(TestState::Complete))
-            }
+        async fn run_bare(&self) -> Result<TaskResult<TestState>, CanoError> {
+            self.started.fetch_add(1, Ordering::SeqCst);
+            sleep(Duration::from_secs(30)).await;
+            self.completed.fetch_add(1, Ordering::SeqCst);
+            Ok(TaskResult::Single(TestState::Complete))
         }
+    }
 
-        let timeout = Duration::from_secs(8);
-        let result = tokio::time::timeout(timeout, async {
-            let completions = Arc::new(AtomicUsize::new(0));
+    #[tokio::test(flavor = "multi_thread")]
+    async fn graceful_stop_cancels_in_flight_flow() {
+        // Graceful shutdown cooperatively cancels a running flow instead of
+        // blocking until it finishes: `stop()` returns promptly (not after the
+        // task's 30s sleep) and the task never reaches completion.
+        let result = tokio::time::timeout(Duration::from_secs(5), async {
+            let started = std::sync::Arc::new(AtomicU32::new(0));
+            let completed = std::sync::Arc::new(AtomicU32::new(0));
             let mut scheduler: Scheduler<TestState> = Scheduler::new();
-            scheduler
-                .manual(
-                    "wedged",
-                    Workflow::bare()
-                        .register(
-                            TestState::Start,
-                            SlowTask {
-                                completions: Arc::clone(&completions),
-                            },
-                        )
-                        .add_exit_state(TestState::Complete)
-                        .add_exit_state(TestState::Error),
+            let wf = Workflow::bare()
+                .register(
                     TestState::Start,
+                    CancellableSlow {
+                        started: started.clone(),
+                        completed: completed.clone(),
+                    },
                 )
-                .unwrap();
+                .add_exit_state(TestState::Complete)
+                .add_exit_state(TestState::Error);
+            scheduler.manual("slow", wf, TestState::Start).unwrap();
 
             let running = scheduler.start().await.unwrap();
-            running.trigger("wedged").await.unwrap();
-            // Give the spawn time to land in scheduler_tasks.
-            sleep(Duration::from_millis(100)).await;
+            running.trigger("slow").await.unwrap();
+            // Wait until the flow is actually in flight.
+            while started.load(Ordering::SeqCst) == 0 {
+                sleep(Duration::from_millis(5)).await;
+            }
+            assert!(running.has_running_flows().await);
+
+            let t0 = std::time::Instant::now();
+            running.stop().await.expect("graceful stop should succeed");
+            assert!(
+                t0.elapsed() < Duration::from_secs(5),
+                "stop() must cancel the in-flight flow, not wait for its 30s sleep"
+            );
+            assert_eq!(
+                completed.load(Ordering::SeqCst),
+                0,
+                "the in-flight flow must be cancelled, not run to completion"
+            );
+        })
+        .await;
+        assert!(result.is_ok(), "Test timed out");
+    }
 
-            // Spawn stop() so the driver advances into its drain loop and
-            // pops the wedged Trigger handle. stop() will not return because
-            // the awaited handle is sleeping for 30s.
-            let running_for_stop = running.clone();
-            let stop_handle = tokio::spawn(async move { running_for_stop.stop().await });
+    #[tokio::test(flavor = "multi_thread")]
+    async fn trigger_after_graceful_stop_reports_not_running() {
+        // Once graceful shutdown has run, the command channel is closed, so a
+        // subsequent trigger() reports "not running" rather than enqueueing.
+        let result = tokio::time::timeout(Duration::from_secs(5), async {
+            let started = std::sync::Arc::new(AtomicU32::new(0));
+            let completed = std::sync::Arc::new(AtomicU32::new(0));
+            let mut scheduler: Scheduler<TestState> = Scheduler::new();
+            let wf = Workflow::bare()
+                .register(
+                    TestState::Start,
+                    CancellableSlow {
+                        started: started.clone(),
+                        completed: completed.clone(),
+                    },
+                )
+                .add_exit_state(TestState::Complete)
+                .add_exit_state(TestState::Error);
+            scheduler.manual("slow", wf, TestState::Start).unwrap();
 
-            // Let the driver actually enter the drain phase and pop the handle.
-            sleep(Duration::from_millis(200)).await;
+            let running = scheduler.start().await.unwrap();
+            running.trigger("slow").await.unwrap();
+            while started.load(Ordering::SeqCst) == 0 {
+                sleep(Duration::from_millis(5)).await;
+            }
+            running.stop().await.expect("graceful stop should succeed");
+
+            let err = running.trigger("slow").await.unwrap_err();
             assert!(
-                !stop_handle.is_finished(),
-                "stop() should still be parked while the wedged trigger handle is in flight"
+                err.to_string().contains("Scheduler not running"),
+                "trigger after shutdown must report not-running, got: {err}"
             );
+        })
+        .await;
+        assert!(result.is_ok(), "Test timed out");
+    }
 
-            // Drop every clone — the in_flight_drain slot's AbortHandle must
-            // be used to abort the popped, in-flight handle.
-            drop(stop_handle.abort_handle());
-            stop_handle.abort();
-            drop(running);
+    #[tokio::test(flavor = "multi_thread")]
+    async fn cancel_flow_cancels_in_flight_run_and_returns_to_idle() {
+        // `cancel_flow` cooperatively cancels the in-flight run; the flow returns
+        // to Idle (a deliberate cancel is NOT a failure, so the streak stays 0 and
+        // the flow does not trip) and the task never completes.
+        let result = tokio::time::timeout(Duration::from_secs(5), async {
+            let started = std::sync::Arc::new(AtomicU32::new(0));
+            let completed = std::sync::Arc::new(AtomicU32::new(0));
+            let mut scheduler: Scheduler<TestState> = Scheduler::new();
+            let wf = Workflow::bare()
+                .register(
+                    TestState::Start,
+                    CancellableSlow {
+                        started: started.clone(),
+                        completed: completed.clone(),
+                    },
+                )
+                .add_exit_state(TestState::Complete)
+                .add_exit_state(TestState::Error);
+            scheduler.manual("slow", wf, TestState::Start).unwrap();
+
+            let running = scheduler.start().await.unwrap();
+            running.trigger("slow").await.unwrap();
+            while started.load(Ordering::SeqCst) == 0 {
+                sleep(Duration::from_millis(5)).await;
+            }
+
+            running
+                .cancel_flow("slow")
+                .await
+                .expect("cancel_flow should succeed");
 
-            // Wait long enough that, if abort had failed, the slow task
-            // could have advanced. With the fix, the task is aborted before
-            // it can increment completions.
-            sleep(Duration::from_secs(2)).await;
+            // Wait for the cancelled run's apply_outcome to settle the status.
+            loop {
+                let st = running.status("slow").await.unwrap().status;
+                if st != crate::scheduler::Status::Running {
+                    break;
+                }
+                sleep(Duration::from_millis(5)).await;
+            }
+            let info = running.status("slow").await.unwrap();
+            assert_eq!(
+                info.status,
+                crate::scheduler::Status::Idle,
+                "a cancelled run returns to Idle"
+            );
+            assert_eq!(info.failure_streak, 0, "cancel must not count as a failure");
             assert_eq!(
-                completions.load(Ordering::SeqCst),
+                completed.load(Ordering::SeqCst),
                 0,
-                "wedged spawn must have been aborted by Drop's in_flight_drain abort path"
+                "task was cancelled, not completed"
             );
+            running.stop().await.unwrap();
         })
         .await;
+        assert!(result.is_ok(), "Test timed out");
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn cancel_flow_on_idle_flow_is_noop() {
+        // cancel_flow on a registered, never-triggered flow returns Ok and leaves it Idle.
+        let result = tokio::time::timeout(Duration::from_secs(5), async {
+            let started = std::sync::Arc::new(AtomicU32::new(0));
+            let completed = std::sync::Arc::new(AtomicU32::new(0));
+            let mut scheduler: Scheduler<TestState> = Scheduler::new();
+            let wf = Workflow::bare()
+                .register(
+                    TestState::Start,
+                    CancellableSlow {
+                        started: started.clone(),
+                        completed: completed.clone(),
+                    },
+                )
+                .add_exit_state(TestState::Complete)
+                .add_exit_state(TestState::Error);
+            scheduler.manual("idle", wf, TestState::Start).unwrap();
 
+            let running = scheduler.start().await.unwrap();
+            // No trigger was sent, so there is no in-flight run for the cancel to reach.
+            running
+                .cancel_flow("idle")
+                .await
+                .expect("cancel on idle flow is a no-op");
+            assert_eq!(
+                running.status("idle").await.unwrap().status,
+                crate::scheduler::Status::Idle
+            );
+            running.stop().await.unwrap();
+        })
+        .await;
+        assert!(result.is_ok(), "Test timed out");
+    }
+
+    #[tokio::test(flavor = "multi_thread")]
+    async fn cancel_flow_unknown_flow_errors() {
+        let result = tokio::time::timeout(Duration::from_secs(5), async {
+            let started = std::sync::Arc::new(AtomicU32::new(0));
+            let completed = std::sync::Arc::new(AtomicU32::new(0));
+            let mut scheduler: Scheduler<TestState> = Scheduler::new();
+            let wf = Workflow::bare()
+                .register(
+                    TestState::Start,
+                    CancellableSlow {
+                        started: started.clone(),
+                        completed: completed.clone(),
+                    },
+                )
+                .add_exit_state(TestState::Complete)
+                .add_exit_state(TestState::Error);
+            scheduler.manual("known", wf, TestState::Start).unwrap();
+
+            let running = scheduler.start().await.unwrap();
+            let err = running.cancel_flow("nope").await.unwrap_err();
+            assert!(
+                err.to_string().contains("No workflow registered"),
+                "unknown flow must error, got: {err}"
+            );
+            running.stop().await.unwrap();
+        })
+        .await;
         assert!(result.is_ok(), "Test timed out");
     }
 
diff --git a/cano/src/task.rs b/cano/src/task.rs
index 1ac6127..e2e7388 100644
--- a/cano/src/task.rs
+++ b/cano/src/task.rs
@@ -69,7 +69,7 @@
 //! let result = Workflow::new(resources)
 //!     .register(Step::Fetch, FetchTask)
 //!     .add_exit_state(Step::Done)
-//!     .orchestrate(Step::Fetch)
+//!     .orchestrate(Step::Fetch, CancellationToken::disabled())
 //!     .await?;
 //! assert_eq!(result, Step::Done);
 //! # Ok(())
diff --git a/cano/src/task/batch.rs b/cano/src/task/batch.rs
index 505d284..1b6d581 100644
--- a/cano/src/task/batch.rs
+++ b/cano/src/task/batch.rs
@@ -72,7 +72,7 @@
 //!     .register(Step::Process, CsvProcessor)
 //!     .add_exit_state(Step::Done);
 //!
-//! let result = workflow.orchestrate(Step::Process).await?;
+//! let result = workflow.orchestrate(Step::Process, CancellationToken::disabled()).await?;
 //! assert_eq!(result, Step::Done);
 //! # Ok(())
 //! # }
@@ -119,7 +119,7 @@
 //! let workflow = Workflow::bare()
 //!     .register(Step::Process, TolerantProcessor)
 //!     .add_exit_state(Step::Done);
-//! let result = workflow.orchestrate(Step::Process).await?;
+//! let result = workflow.orchestrate(Step::Process, CancellationToken::disabled()).await?;
 //! assert_eq!(result, Step::Done);
 //! # Ok(())
 //! # }
@@ -468,6 +468,7 @@ pub type BatchTaskObject<TState, TResourceKey = Cow<'static, str>> =
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::cancel::CancellationToken;
     use crate::resource::Resources;
     use crate::task;
     use crate::task::Task;
@@ -869,7 +870,10 @@ mod tests {
             .register(Step::Process, IndexedBatch { n: 3 })
             .add_exit_state(Step::Done);
 
-        let result = workflow.orchestrate(Step::Process).await.unwrap();
+        let result = workflow
+            .orchestrate(Step::Process, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, Step::Done);
     }
 
@@ -892,6 +896,7 @@ mod tests {
 #[cfg(all(test, feature = "metrics"))]
 mod metrics_tests {
     use super::*;
+    use crate::cancel::CancellationToken;
     use crate::metrics::test_support::*;
     use crate::task::Task;
     use crate::workflow::Workflow;
@@ -939,7 +944,9 @@ mod metrics_tests {
             let workflow = Workflow::bare()
                 .register(St::Process, ThreeItemBatch)
                 .add_exit_state(St::Done);
-            workflow.orchestrate(St::Process).await
+            workflow
+                .orchestrate(St::Process, CancellationToken::disabled())
+                .await
         });
         assert!(result.is_ok(), "workflow should succeed: {result:?}");
         assert_eq!(
diff --git a/cano/src/task/poll.rs b/cano/src/task/poll.rs
index 54a85c5..f0bf88b 100644
--- a/cano/src/task/poll.rs
+++ b/cano/src/task/poll.rs
@@ -47,7 +47,7 @@
 //!     .register(Step::Wait, counter)
 //!     .add_exit_state(Step::Done);
 //!
-//! let result = workflow.orchestrate(Step::Wait).await?;
+//! let result = workflow.orchestrate(Step::Wait, CancellationToken::disabled()).await?;
 //! assert_eq!(result, Step::Done);
 //! # Ok(())
 //! # }
@@ -109,7 +109,7 @@
 //!     .register(Step::Poll, TraitPoller)
 //!     .add_exit_state(Step::Done);
 //!
-//! let result = workflow.orchestrate(Step::Poll).await?;
+//! let result = workflow.orchestrate(Step::Poll, CancellationToken::disabled()).await?;
 //! assert_eq!(result, Step::Done);
 //! # Ok(())
 //! # }
@@ -344,6 +344,7 @@ pub type PollTaskObject<TState, TResourceKey = Cow<'static, str>> =
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::cancel::CancellationToken;
     use crate::resource::Resources;
     use crate::task;
     use crate::task::Task;
@@ -590,7 +591,10 @@ mod tests {
         // But wait: poll 1 => count becomes 1, 1 < 2 => Pending; poll 2 => count becomes 2, 2 >= 2 => Ready(Done)
         // But we registered Step::Done as exit state so Done is the final state
         // Actually CountingPoller returns Single(Step::Done) when ready, so we skip Next entirely
-        let result = workflow.orchestrate(Step::Wait).await.unwrap();
+        let result = workflow
+            .orchestrate(Step::Wait, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, Step::Done);
     }
 
@@ -754,7 +758,10 @@ mod tests {
             .add_exit_state(Step::Done);
 
         let start = std::time::Instant::now();
-        let err = workflow.orchestrate(Step::Wait).await.unwrap_err();
+        let err = workflow
+            .orchestrate(Step::Wait, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         let elapsed = start.elapsed();
 
         // The FSM wraps the failure with state context; `.inner()` peels one layer.
diff --git a/cano/src/task/router.rs b/cano/src/task/router.rs
index 0f6819b..3cd590b 100644
--- a/cano/src/task/router.rs
+++ b/cano/src/task/router.rs
@@ -53,7 +53,7 @@
 //!     .register(Step::PathA, DoPathA)
 //!     .add_exit_state(Step::Done);
 //!
-//! let result = workflow.orchestrate(Step::Route).await?;
+//! let result = workflow.orchestrate(Step::Route, CancellationToken::disabled()).await?;
 //! assert_eq!(result, Step::Done);
 //! # Ok(())
 //! # }
@@ -84,7 +84,7 @@
 //!     .register(Step::Route, SimpleRouter)
 //!     .add_exit_state(Step::Done);
 //!
-//! let result = workflow.orchestrate(Step::Route).await?;
+//! let result = workflow.orchestrate(Step::Route, CancellationToken::disabled()).await?;
 //! assert_eq!(result, Step::Done);
 //! # Ok(())
 //! # }
@@ -209,6 +209,7 @@ pub type RouterTaskObject<TState, TResourceKey = Cow<'static, str>> =
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::cancel::CancellationToken;
     use crate::resource::Resources;
     use crate::task;
     use crate::task::Task;
@@ -357,7 +358,10 @@ mod tests {
             .register(Step::PathA, PathATask)
             .add_exit_state(Step::Done);
 
-        let result = workflow.orchestrate(Step::Decide).await.unwrap();
+        let result = workflow
+            .orchestrate(Step::Decide, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, Step::Done);
     }
 
diff --git a/cano/src/task/stepped.rs b/cano/src/task/stepped.rs
index 639c525..9f9f751 100644
--- a/cano/src/task/stepped.rs
+++ b/cano/src/task/stepped.rs
@@ -56,7 +56,7 @@
 //!     .register(MyState::Process, scanner)
 //!     .add_exit_state(MyState::Done);
 //!
-//! let result = workflow.orchestrate(MyState::Process).await?;
+//! let result = workflow.orchestrate(MyState::Process, CancellationToken::disabled()).await?;
 //! assert_eq!(result, MyState::Done);
 //! # Ok(())
 //! # }
@@ -98,7 +98,7 @@
 //!     .register(MyState::Process, TraitStepper)
 //!     .add_exit_state(MyState::Done);
 //!
-//! let result = workflow.orchestrate(MyState::Process).await?;
+//! let result = workflow.orchestrate(MyState::Process, CancellationToken::disabled()).await?;
 //! assert_eq!(result, MyState::Done);
 //! # Ok(())
 //! # }
@@ -397,6 +397,7 @@ where
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::cancel::CancellationToken;
     use crate::resource::Resources;
     use crate::task;
     use crate::task::Task;
@@ -729,7 +730,10 @@ mod tests {
             .register(MyState::Next, NextTask)
             .add_exit_state(MyState::Done);
 
-        let result = workflow.orchestrate(MyState::Work).await.unwrap();
+        let result = workflow
+            .orchestrate(MyState::Work, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, MyState::Done);
     }
 
@@ -787,6 +791,7 @@ mod tests {
 #[cfg(all(test, feature = "metrics"))]
 mod metrics_tests {
     use super::*;
+    use crate::cancel::CancellationToken;
     use crate::metrics::test_support::*;
     use crate::task::Task;
     use crate::workflow::Workflow;
@@ -849,7 +854,9 @@ mod metrics_tests {
             let workflow = Workflow::bare()
                 .register_stepped(St::Work, TwoMoreOneDone)
                 .add_exit_state(St::Done);
-            workflow.orchestrate(St::Work).await
+            workflow
+                .orchestrate(St::Work, CancellationToken::disabled())
+                .await
         });
         assert!(result.is_ok(), "workflow should succeed: {result:?}");
         assert_eq!(
diff --git a/cano/src/task/timer.rs b/cano/src/task/timer.rs
index 9e4cfbc..bd3409d 100644
--- a/cano/src/task/timer.rs
+++ b/cano/src/task/timer.rs
@@ -63,7 +63,7 @@
 //!     .register(Step::Wait, CoolDown)
 //!     .add_exit_state(Step::Done);
 //!
-//! let result = workflow.orchestrate(Step::Wait).await?;
+//! let result = workflow.orchestrate(Step::Wait, CancellationToken::disabled()).await?;
 //! assert_eq!(result, Step::Done);
 //! # Ok(())
 //! # }
diff --git a/cano/src/testing.rs b/cano/src/testing.rs
index 8f1c079..954b82a 100644
--- a/cano/src/testing.rs
+++ b/cano/src/testing.rs
@@ -49,7 +49,7 @@
 //!     .register(S::Start, OkTask)
 //!     .add_exit_state(S::Done)
 //!     .with_observer(observer.clone());
-//! assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done);
+//! assert_eq!(wf.orchestrate(S::Start, CancellationToken::disabled()).await.unwrap(), S::Done);
 //! observer.assert_path(&["Start", "Done"]);
 //! # }
 //! ```
@@ -120,6 +120,11 @@ pub enum RecordedEvent {
         /// The sequence of the last persisted row.
         sequence: u64,
     },
+    /// A run was cancelled via a [`CancellationToken`](crate::cancel::CancellationToken).
+    Cancelled {
+        /// The `Debug` rendering of the state at which cancellation was observed.
+        state: String,
+    },
 }
 
 /// A [`WorkflowObserver`] that records every event it
@@ -291,6 +296,11 @@ impl WorkflowObserver for RecordingObserver {
             sequence,
         });
     }
+    fn on_cancelled(&self, state: &str) {
+        self.events.lock().push(RecordedEvent::Cancelled {
+            state: state.into(),
+        });
+    }
 }
 
 /// A process-local [`CheckpointStore`] for resume /
@@ -549,7 +559,12 @@ mod tests {
             .register(S::Start, OkTask)
             .add_exit_state(S::Done)
             .with_observer(observer.clone());
-        assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done);
+        assert_eq!(
+            wf.orchestrate(S::Start, CancellationToken::disabled())
+                .await
+                .unwrap(),
+            S::Done
+        );
         observer.assert_path(&["Start", "Done"]);
         observer.assert_completed_with("Done");
         assert!(observer.events().contains(&RecordedEvent::TaskSucceeded {
@@ -564,7 +579,9 @@ mod tests {
             .register(S::Start, OkTask)
             .add_exit_state(S::Done)
             .with_observer(observer.clone());
-        wf.orchestrate(S::Start).await.unwrap();
+        wf.orchestrate(S::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert!(!observer.events().is_empty());
         observer.clear();
         assert!(observer.events().is_empty());
@@ -687,7 +704,10 @@ mod tests {
             .register(S::Start, task)
             .add_exit_state(S::Done)
             .with_observer(observer.clone());
-        let err = wf.orchestrate(S::Start).await.unwrap_err();
+        let err = wf
+            .orchestrate(S::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(err.to_string().contains("panic"), "{err}");
         let retries = observer
             .events()
@@ -710,7 +730,12 @@ mod tests {
         let wf = Workflow::bare()
             .register(S::Start, panic_on_attempt(0, S::Done))
             .add_exit_state(S::Done);
-        assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done);
+        assert_eq!(
+            wf.orchestrate(S::Start, CancellationToken::disabled())
+                .await
+                .unwrap(),
+            S::Done
+        );
     }
 
     #[test]
@@ -734,7 +759,9 @@ mod tests {
             .register(S::B, Go(S::Done))
             .add_exit_state(S::Done)
             .with_observer(observer.clone());
-        wf.orchestrate(S::A).await.unwrap();
+        wf.orchestrate(S::A, CancellationToken::disabled())
+            .await
+            .unwrap();
         observer
             .assert_all_states_entered(&[S::A, S::B, S::Done])
             .expect("all states visited");
@@ -747,7 +774,9 @@ mod tests {
             .register(S::A, Go(S::Done))
             .add_exit_state(S::Done)
             .with_observer(observer.clone());
-        wf.orchestrate(S::A).await.unwrap();
+        wf.orchestrate(S::A, CancellationToken::disabled())
+            .await
+            .unwrap();
         let missing = observer
             .assert_all_states_entered(&[S::A, S::B, S::C, S::Done])
             .unwrap_err();
@@ -761,7 +790,9 @@ mod tests {
             .register(S::A, Go(S::Done))
             .add_exit_state(S::Done)
             .with_observer(observer.clone());
-        wf.orchestrate(S::A).await.unwrap();
+        wf.orchestrate(S::A, CancellationToken::disabled())
+            .await
+            .unwrap();
         let missing = observer
             .assert_all_states_entered(&[S::A, S::A, S::B])
             .unwrap_err();
@@ -776,7 +807,9 @@ mod tests {
             .register(S::B, Go(S::Done))
             .add_exit_state(S::Done)
             .with_observer(observer.clone());
-        wf.orchestrate(S::A).await.unwrap();
+        wf.orchestrate(S::A, CancellationToken::disabled())
+            .await
+            .unwrap();
         observer
             .assert_registered_states_entered(&wf)
             .expect("all registered states visited");
@@ -790,7 +823,9 @@ mod tests {
             .register(S::C, Go(S::Done)) // never routed to
             .add_exit_state(S::Done)
             .with_observer(observer.clone());
-        wf.orchestrate(S::A).await.unwrap();
+        wf.orchestrate(S::A, CancellationToken::disabled())
+            .await
+            .unwrap();
         let missing = observer.assert_registered_states_entered(&wf).unwrap_err();
         assert!(missing.contains(&"C".to_string()), "missing={missing:?}");
     }
diff --git a/cano/src/workflow.rs b/cano/src/workflow.rs
index b448392..277f71f 100644
--- a/cano/src/workflow.rs
+++ b/cano/src/workflow.rs
@@ -77,6 +77,7 @@ use std::hash::Hash;
 use std::sync::{Arc, OnceLock};
 use std::time::Duration;
 
+use crate::cancel::CancellationToken;
 use crate::error::CanoError;
 use crate::observer::WorkflowObserver;
 use crate::recovery::CheckpointStore;
@@ -213,8 +214,6 @@ where
     states: HashMap<TState, Arc<StateEntry<TState, TResourceKey>>>,
     /// Shared resources for all tasks
     pub(crate) resources: Arc<Resources<TResourceKey>>,
-    /// Global workflow timeout
-    workflow_timeout: Option<Duration>,
     /// Total wall-clock budget for the entire `orchestrate` / `resume_from` call.
     /// When set, the FSM aborts the in-flight task at its next await point as soon
     /// as the budget elapses and drains the compensation stack against
@@ -268,7 +267,6 @@ where
         Self {
             states: HashMap::new(),
             resources: Arc::new(resources),
-            workflow_timeout: None,
             total_timeout: None,
             compensation_timeout: None,
             exit_states: Vec::new(),
@@ -283,27 +281,6 @@ where
         }
     }
 
-    /// Set a blunt wall-clock timeout for the entire `orchestrate` /
-    /// `resume_from` call.
-    ///
-    /// Implemented as a single `tokio::time::timeout` around the workflow
-    /// future. The in-flight task is dropped at its next await point and the
-    /// call returns `CanoError::Workflow("Workflow timeout exceeded")` —
-    /// compensation does **not** run.
-    ///
-    /// When [`with_total_timeout`](Self::with_total_timeout) is also set, the
-    /// engine treats this value as a *floor* on the total budget: the
-    /// effective wall-clock cap is `min(with_timeout, with_total_timeout)`
-    /// and the graceful total-timeout path drives it (compensation runs
-    /// under [`with_compensation_timeout`](Self::with_compensation_timeout),
-    /// `on_workflow_timeout` fires). This preserves the "with_timeout is a
-    /// hard upper bound" intent while avoiding a race between two outer
-    /// timeouts that would drop the inner compensation drain mid-flight.
-    pub fn with_timeout(mut self, timeout: Duration) -> Self {
-        self.workflow_timeout = Some(timeout);
-        self
-    }
-
     /// Set a wall-clock budget for the entire `orchestrate` (or `resume_from`) call.
     ///
     /// When the budget elapses, the in-flight task is aborted at its next await
@@ -343,7 +320,7 @@ where
     ///     .add_exit_state(Step::Done);
     ///
     /// let err = workflow
-    ///     .orchestrate(Step::Start)
+    ///     .orchestrate(Step::Start, CancellationToken::disabled())
     ///     .await
     ///     .expect_err("budget elapses before Done");
     /// // The engine wraps task errors with state context; `.inner()` peels one layer.
@@ -638,7 +615,7 @@ where
     ///     .register(Step::Start, NoopTask)
     ///     .add_exit_state(Step::Done)
     ///     .with_observer(counter.clone());
-    /// workflow.orchestrate(Step::Start).await?;
+    /// workflow.orchestrate(Step::Start, CancellationToken::disabled()).await?;
     /// assert_eq!(counter.0.load(Ordering::Relaxed), 1);
     /// # Ok(())
     /// # }
@@ -901,13 +878,35 @@ where
     ///
     /// Runs lifecycle setup before execution and teardown after, regardless of outcome.
     ///
+    /// `token` controls cooperative cancellation. Drive the run with a [`CancellationToken`]
+    /// obtained from [`CancellationToken::new`](crate::cancel::CancellationToken::new) and keep the
+    /// paired [`CancellationHandle`](crate::cancel::CancellationHandle); when the handle's
+    /// [`cancel`](crate::cancel::CancellationHandle::cancel) fires, the in-flight cancellable task
+    /// is dropped at its next await point, the saga compensation stack is drained, and the call
+    /// returns [`CanoError::Cancelled`] (wrapped in [`CanoError::WithStateContext`]; a dirty
+    /// rollback yields [`CanoError::CompensationFailed`] whose `errors[0]` is the wrapped cancel).
+    /// To opt a run out of cancellation, pass [`CancellationToken::disabled`] — it never fires and
+    /// is zero-cost (the FSM skips the cancellation `select!` entirely).
+    ///
+    /// Cancellation is cooperative and saga-safe: a task is only interrupted at an `.await`, and a
+    /// [`CompensatableTask`](crate::saga::CompensatableTask) is never interrupted mid-run (it
+    /// completes so its rollback entry is recorded, and the cancel is honoured at the next state
+    /// boundary). The compensation drain itself is uncancellable. See the
+    /// [`cancel`](crate::cancel) module for the full semantics and precedence rules against
+    /// [`with_total_timeout`](Self::with_total_timeout).
+    ///
     /// # Errors
     ///
     /// - [`CanoError::Workflow`] -- no handler is registered for the current state, a single
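-    ///   task returned a `TaskResult::Split` (use [`Workflow::register_split`] instead), the
-    ///   global workflow timeout was exceeded, or a split strategy was misconfigured
+    ///   task returned a `TaskResult::Split` (use [`Workflow::register_split`] instead), or a
+    ///   split strategy was misconfigured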
+    /// - [`CanoError::Cancelled`] -- the run was cancelled via `token` (see above)
     /// - Any [`CanoError`] variant propagated from a task during execution
-    pub async fn orchestrate(&self, initial_state: TState) -> Result<TState, CanoError> {
+    pub async fn orchestrate(
+        &self,
+        initial_state: TState,
+        token: CancellationToken,
+    ) -> Result<TState, CanoError> {
         #[cfg(feature = "tracing")]
         let workflow_span = self.tracing_span.clone().unwrap_or_else(|| {
             if tracing::enabled!(tracing::Level::INFO) {
@@ -937,79 +936,48 @@ where
         self.validate_initial_state(&initial_state)?;
 
         self.resources.setup_all().await?;
-        let result = self.run_workflow(initial_state).await;
+        let result = self.run_workflow(initial_state, token).await;
         self.resources
             .teardown_range(0..self.resources.lifecycle_len())
             .await;
         result
     }
 
-    async fn run_workflow(&self, initial_state: TState) -> Result<TState, CanoError> {
+    async fn run_workflow(
+        &self,
+        initial_state: TState,
+        token: CancellationToken,
+    ) -> Result<TState, CanoError> {
         #[cfg(feature = "metrics")]
         let _active = crate::metrics::WorkflowActiveGuard::new();
         let started = std::time::Instant::now();
         let total_budget = self.resolve_total_budget(started);
-        let workflow_future = self.execute_workflow(initial_state, total_budget);
-        self.await_with_outer_timeout(workflow_future, total_budget, started)
-            .await
+        let result = self
+            .execute_workflow(initial_state, total_budget, token)
+            .await;
+        Self::record_run_outcome(&result, started);
+        result
     }
 
-    /// Resolve the effective wall-clock budget for the entire FSM call.
-    ///
-    /// Precedence:
-    /// 1. Both `with_timeout` and `with_total_timeout` set → graceful
-    ///    total-timeout path with `min(...)` as the budget. Treating
-    ///    `with_timeout` as a floor preserves the user's intent that it is
-    ///    a hard upper bound, while avoiding a race between two outer
-    ///    timeouts that would drop the compensation drain mid-flight.
-    /// 2. Only total set → graceful total-timeout path.
-    /// 3. Only `with_timeout` set → legacy blunt `tokio::time::timeout`
-    ///    wrapper applied externally; the FSM loop runs unbudgeted.
-    /// 4. Neither → zero-cost path.
+    /// Resolve the wall-clock budget for the entire FSM call: the
+    /// [`with_total_timeout`](Self::with_total_timeout) duration, or `None`
+    /// (the zero-cost path) when unset.
     pub(crate) fn resolve_total_budget(
         &self,
         started: std::time::Instant,
     ) -> Option<(std::time::Instant, Duration)> {
-        let effective = match (self.workflow_timeout, self.total_timeout) {
-            (Some(w), Some(t)) => Some(w.min(t)),
-            (_, Some(t)) => Some(t),
-            _ => None,
-        };
-        effective.map(|d| (started, d))
-    }
-
-    /// Apply the legacy `with_timeout` outer wrapper when (and only when) the
-    /// graceful total-timeout path is NOT also active. Emits the workflow-run
-    /// outcome metric exactly once per invocation — on the legacy-timeout
-    /// path the early return ensures `outcome="timeout"` is recorded
-    /// *without* a follow-up `outcome="failed"` for the same run; on the
-    /// non-timeout paths the post-match emission records `completed`/`failed`.
-    ///
-    /// Used by both `run_workflow` (forward direction) and
-    /// `execute_resume_inner` (resume direction) so the precedence rule
-    /// lives in one place.
-    pub(crate) async fn await_with_outer_timeout<F, T>(
-        &self,
-        fut: F,
-        total_budget: Option<(std::time::Instant, Duration)>,
-        #[allow(unused_variables)] started: std::time::Instant,
-    ) -> Result<T, CanoError>
-    where
-        F: std::future::Future<Output = Result<T, CanoError>>,
-    {
-        let result = match (self.workflow_timeout, total_budget) {
-            (Some(timeout_duration), None) => {
-                match tokio::time::timeout(timeout_duration, fut).await {
-                    Ok(inner) => inner,
-                    Err(_) => {
-                        #[cfg(feature = "metrics")]
-                        crate::metrics::workflow_run("timeout", started.elapsed());
-                        return Err(CanoError::workflow("Workflow timeout exceeded"));
-                    }
-                }
-            }
-            _ => fut.await,
-        };
+        self.total_timeout.map(|d| (started, d))
+    }
+
+    /// Emit the workflow-run outcome metric (`completed` / `failed`) once per
+    /// run. Called by both `run_workflow` (forward) and `execute_resume_inner`
+    /// (resume) so the emission lives in one place. No-op without the `metrics`
+    /// feature.
+    #[cfg_attr(not(feature = "metrics"), allow(unused_variables))]
+    pub(crate) fn record_run_outcome<T>(
+        result: &Result<T, CanoError>,
+        started: std::time::Instant,
+    ) {
         #[cfg(feature = "metrics")]
         crate::metrics::workflow_run(
             if result.is_ok() {
@@ -1019,7 +987,6 @@ where
             },
             started.elapsed(),
         );
-        result
     }
 }
 
@@ -1032,7 +999,6 @@ where
         Self {
             states: self.states.clone(),
             resources: Arc::clone(&self.resources),
-            workflow_timeout: self.workflow_timeout,
             total_timeout: self.total_timeout,
             compensation_timeout: self.compensation_timeout,
             exit_states: self.exit_states.clone(),
@@ -1080,7 +1046,7 @@ where
     /// let result = Workflow::bare()
     ///     .register(Step::Start, NoopTask)
     ///     .add_exit_state(Step::Done)
-    ///     .orchestrate(Step::Start)
+    ///     .orchestrate(Step::Start, CancellationToken::disabled())
     ///     .await?;
     /// assert_eq!(result, Step::Done);
     /// # Ok(())
@@ -1100,7 +1066,6 @@ where
         f.debug_struct("Workflow")
             .field("states", &format!("{} states", self.states.len()))
             .field("exit_states", &self.exit_states)
-            .field("workflow_timeout", &self.workflow_timeout)
             .field("total_timeout", &self.total_timeout)
             .field("compensation_timeout", &self.compensation_timeout)
             .field("workflow_id", &self.workflow_id)
@@ -1151,7 +1116,11 @@ mod metrics_tests {
 
     #[test]
     fn successful_run_records_outcome_duration_and_clears_active_gauge() {
-        let (res, rows) = run_with_recorder(|| async { ok_workflow().orchestrate(S::Start).await });
+        let (res, rows) = run_with_recorder(|| async {
+            ok_workflow()
+                .orchestrate(S::Start, CancellationToken::disabled())
+                .await
+        });
         assert_eq!(res.unwrap(), S::Done);
         assert_eq!(
             counter(
@@ -1178,7 +1147,7 @@ mod metrics_tests {
             Workflow::bare()
                 .register(S::Start, Boom)
                 .add_exit_state(S::Done)
-                .orchestrate(S::Start)
+                .orchestrate(S::Start, CancellationToken::disabled())
                 .await
         });
         assert!(res.is_err());
@@ -1190,47 +1159,12 @@ mod metrics_tests {
     }
 
     #[test]
-    fn legacy_timeout_on_orchestrate_only_increments_timeout_counter() {
-        // Regression sentinel for F8: when `with_timeout` fires inside
-        // `run_workflow`, the early return guarantees only `outcome="timeout"`
-        // is incremented — not both `timeout` and `failed`. This test asserts
-        // the forward direction, which has always been correct; a sibling test
-        // in `compensation::tests` covers the resume direction (which used to
-        // double-count).
-        struct Slow;
-        #[crate::task]
-        impl Task<S> for Slow {
-            fn config(&self) -> TaskConfig {
-                TaskConfig::minimal()
-            }
-            async fn run_bare(&self) -> Result<TaskResult<S>, CanoError> {
-                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
-                Ok(TaskResult::Single(S::Done))
-            }
-        }
+    fn per_state_task_durations_are_recorded_single_and_split() {
         let (res, rows) = run_with_recorder(|| async {
-            Workflow::bare()
-                .with_timeout(std::time::Duration::from_millis(20))
-                .register(S::Start, Slow)
-                .add_exit_state(S::Done)
-                .orchestrate(S::Start)
+            ok_workflow()
+                .orchestrate(S::Start, CancellationToken::disabled())
                 .await
         });
-        assert!(res.is_err());
-        assert_eq!(
-            counter(&rows, "cano_workflow_runs_total", &[("outcome", "timeout")]),
-            1
-        );
-        assert_eq!(
-            counter_opt(&rows, "cano_workflow_runs_total", &[("outcome", "failed")]).unwrap_or(0),
-            0,
-            "legacy timeout must not double-count as both `timeout` and `failed`"
-        );
-    }
-
-    #[test]
-    fn per_state_task_durations_are_recorded_single_and_split() {
-        let (res, rows) = run_with_recorder(|| async { ok_workflow().orchestrate(S::Start).await });
         assert_eq!(res.unwrap(), S::Done);
         assert_eq!(
             histogram_count(
@@ -1282,7 +1216,7 @@ mod metrics_tests {
                     JoinConfig::new(JoinStrategy::PartialResults(2), S::Done),
                 )
                 .add_exit_state(S::Done)
-                .orchestrate(S::Start)
+                .orchestrate(S::Start, CancellationToken::disabled())
                 .await
         });
         assert_eq!(res.unwrap(), S::Done);
@@ -1345,7 +1279,10 @@ mod tests {
             .register(TestState::Start, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1356,7 +1293,10 @@ mod tests {
             .register(TestState::Process, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1371,7 +1311,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
 
         let data: String = store.get("test_key").unwrap();
@@ -1384,7 +1327,9 @@ mod tests {
         // upfront rather than reaching the FSM loop.
         let workflow = Workflow::<TestState>::bare().add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await;
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         let err = result.unwrap_err();
         assert_eq!(err.category(), "configuration");
         assert!(err.to_string().contains("no registered state handlers"));
@@ -1470,7 +1415,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1486,7 +1434,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1496,7 +1447,10 @@ mod tests {
         let workflow = Workflow::bare()
             .register(TestState::Start, SimpleTask::new(TestState::Complete))
             .add_exit_states([TestState::Complete, TestState::Error]);
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1506,7 +1460,10 @@ mod tests {
         let workflow = Workflow::bare()
             .register(TestState::Start, SimpleTask::new(TestState::Complete))
             .add_exit_states([TestState::Complete, TestState::Complete].into_iter());
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1610,7 +1567,7 @@ mod tests {
         let result = Workflow::bare()
             .register(TestState::Start, BareWorkflowTask)
             .add_exit_state(TestState::Complete)
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .unwrap();
         assert_eq!(result, TestState::Complete);
@@ -1644,7 +1601,7 @@ mod tests {
             .register_router(TestState::Start, RouteToProcess)
             .register(TestState::Process, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete)
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .unwrap();
 
@@ -1770,7 +1727,7 @@ mod tests {
             .register(TestState::Start, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete)
             .with_observer(Arc::new(PanickyObserver))
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .expect("orchestrate must complete despite observer panic");
         assert_eq!(result, TestState::Complete);
@@ -1800,7 +1757,7 @@ mod tests {
             .register(TestState::Process, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete)
             .with_observer(Arc::new(CountThenPanic(Arc::clone(&count))))
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .expect("orchestrate must complete despite repeated observer panics");
         assert_eq!(result, TestState::Complete);
@@ -1821,7 +1778,10 @@ mod tests {
         let wf = Workflow::bare()
             .register(TestState::Start, start.clone())
             .add_exit_state(TestState::Complete);
-        let result = wf.orchestrate(TestState::Complete).await.unwrap();
+        let result = wf
+            .orchestrate(TestState::Complete, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
         assert_eq!(
             start.count(),
@@ -1839,7 +1799,10 @@ mod tests {
             .register(TestState::Start, SimpleTask::new(TestState::Process))
             .register(TestState::Process, process.clone())
             .add_exit_state(TestState::Process);
-        let result = wf.orchestrate(TestState::Start).await.unwrap();
+        let result = wf
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Process);
         assert_eq!(
             process.count(),
@@ -1855,7 +1818,10 @@ mod tests {
         let wf = Workflow::bare()
             .register(TestState::Start, SimpleTask::new(TestState::Process))
             .add_exit_state(TestState::Complete);
-        let err = wf.orchestrate(TestState::Start).await.unwrap_err();
+        let err = wf
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(err.to_string().contains("No task registered"), "got: {err}");
     }
 
@@ -1866,7 +1832,10 @@ mod tests {
         let wf = Workflow::bare()
             .register(TestState::Start, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete);
-        let err = wf.orchestrate(TestState::Process).await.unwrap_err();
+        let err = wf
+            .orchestrate(TestState::Process, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.category(), "configuration");
         assert!(
             err.to_string()
@@ -1889,7 +1858,10 @@ mod tests {
         let wf = Workflow::bare()
             .register(TestState::Start, ReturnsSplit)
             .add_exit_state(TestState::Complete);
-        let err = wf.orchestrate(TestState::Start).await.unwrap_err();
+        let err = wf
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(err.to_string().contains("use register_split"), "got: {err}");
     }
 
@@ -1900,7 +1872,10 @@ mod tests {
             .register(TestState::Start, first.clone())
             .register(TestState::Start, SimpleTask::new(TestState::Complete)) // replaces `first`
             .add_exit_state(TestState::Complete);
-        let result = wf.orchestrate(TestState::Start).await.unwrap();
+        let result = wf
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete); // the second handler ran
         assert_eq!(first.count(), 0, "the replaced handler must not run");
     }
@@ -1936,7 +1911,10 @@ mod tests {
                 },
             )
             .add_exit_state(TestState::Complete);
-        let result = wf.orchestrate(TestState::Start).await.unwrap();
+        let result = wf
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
         assert_eq!(count.load(std::sync::atomic::Ordering::SeqCst), 5);
     }
@@ -1955,7 +1933,7 @@ mod tests {
             .add_exit_state(TestState::Complete);
         let result = tokio::time::timeout(
             std::time::Duration::from_secs(5),
-            wf.orchestrate(TestState::Start),
+            wf.orchestrate(TestState::Start, CancellationToken::disabled()),
         )
         .await
         .expect("orchestrate of an empty split must not hang");
@@ -1993,170 +1971,8 @@ mod tests {
     }
 }
 
-/// Edge-case unit tests for `await_with_outer_timeout`. Integration-level
-/// coverage already pins down the metric-emission shape end-to-end
-/// (`legacy_timeout_on_orchestrate_only_increments_timeout_counter`,
-/// `legacy_timeout_on_resume_only_increments_timeout_counter`,
-/// `with_timeout_acts_as_floor_when_combined_with_with_total_timeout`,
-/// `with_timeout_alone_still_uses_legacy_blunt_timeout`); these tests pin
-/// down the helper's behavior in isolation across every
-/// `(workflow_timeout, total_budget)` permutation.
-#[cfg(test)]
-mod await_with_outer_timeout_tests {
-    use super::test_support::TestState;
-    use super::*;
-    use std::time::Duration;
-
-    fn workflow_with(
-        workflow_timeout: Option<Duration>,
-        total_timeout: Option<Duration>,
-    ) -> Workflow<TestState> {
-        let mut w = Workflow::<TestState>::bare();
-        if let Some(d) = workflow_timeout {
-            w = w.with_timeout(d);
-        }
-        if let Some(d) = total_timeout {
-            w = w.with_total_timeout(d);
-        }
-        w
-    }
-
-    #[tokio::test]
-    async fn neither_timeout_just_awaits_future() {
-        let w = workflow_with(None, None);
-        let started = std::time::Instant::now();
-        let out = w
-            .await_with_outer_timeout(
-                async { Ok::<TestState, CanoError>(TestState::Complete) },
-                None,
-                started,
-            )
-            .await
-            .unwrap();
-        assert_eq!(out, TestState::Complete);
-    }
-
-    #[tokio::test]
-    async fn only_with_timeout_passes_through_when_future_is_fast() {
-        let w = workflow_with(Some(Duration::from_secs(60)), None);
-        let started = std::time::Instant::now();
-        let out = w
-            .await_with_outer_timeout(
-                async { Ok::<TestState, CanoError>(TestState::Complete) },
-                None,
-                started,
-            )
-            .await
-            .unwrap();
-        assert_eq!(out, TestState::Complete);
-    }
-
-    #[tokio::test]
-    async fn only_with_timeout_fires_legacy_timeout_on_slow_future() {
-        // Slow future + small `with_timeout` and no total_budget → the legacy
-        // arm fires. Surfaces the documented `CanoError::Workflow("Workflow
-        // timeout exceeded")` and the helper returns early so the post-match
-        // emission does not also fire.
-        let w = workflow_with(Some(Duration::from_millis(10)), None);
-        let started = std::time::Instant::now();
-        let err = w
-            .await_with_outer_timeout(
-                async {
-                    tokio::time::sleep(Duration::from_secs(1)).await;
-                    Ok::<TestState, CanoError>(TestState::Complete)
-                },
-                None,
-                started,
-            )
-            .await
-            .expect_err("legacy timeout must fire");
-        assert!(
-            matches!(err, CanoError::Workflow(ref m) if m.contains("Workflow timeout exceeded")),
-            "expected legacy shape, got: {err}"
-        );
-        assert!(
-            started.elapsed() < Duration::from_millis(500),
-            "must bound to the legacy timeout, not the inner sleep"
-        );
-    }
-
-    #[tokio::test]
-    async fn only_total_budget_skips_legacy_path() {
-        // The helper's match has `(Some, None)` only — when total_budget is
-        // Some, the legacy arm is never taken. The slow future is allowed to
-        // run; here we stop it via a quick inner Ok so the test is fast.
-        let w = workflow_with(None, Some(Duration::from_millis(10)));
-        let total_budget = Some((std::time::Instant::now(), Duration::from_millis(10)));
-        let started = std::time::Instant::now();
-        let out = w
-            .await_with_outer_timeout(
-                async { Ok::<TestState, CanoError>(TestState::Complete) },
-                total_budget,
-                started,
-            )
-            .await
-            .unwrap();
-        assert_eq!(out, TestState::Complete);
-    }
-
-    #[tokio::test]
-    async fn both_timeouts_set_skips_legacy_path() {
-        // Both timeouts: the graceful path drives. Legacy wrapper must NOT
-        // wrap, otherwise the inner total-budget drain could be cancelled
-        // mid-flight. Verify by using a slow inner future and a Some
-        // total_budget: the helper passes the future through without timing
-        // out (since legacy doesn't apply and we don't simulate the graceful
-        // path here — that's the FSM loop's job).
-        let w = workflow_with(
-            Some(Duration::from_millis(5)),
-            Some(Duration::from_secs(60)),
-        );
-        let total_budget = Some((std::time::Instant::now(), Duration::from_secs(60)));
-        let started = std::time::Instant::now();
-        let out = w
-            .await_with_outer_timeout(
-                async {
-                    // ~20ms — well beyond `with_timeout(5ms)` — to prove the
-                    // legacy wrapper isn't applied.
-                    tokio::time::sleep(Duration::from_millis(20)).await;
-                    Ok::<TestState, CanoError>(TestState::Complete)
-                },
-                total_budget,
-                started,
-            )
-            .await
-            .unwrap();
-        assert_eq!(out, TestState::Complete);
-        assert!(
-            started.elapsed() >= Duration::from_millis(20),
-            "future must run to completion; legacy wrapper must NOT be applied"
-        );
-    }
-
-    #[tokio::test]
-    async fn legacy_path_propagates_inner_errors_unchanged() {
-        // When the inner future returns Err before the deadline, the helper
-        // must surface that error unchanged — not convert it to a timeout.
-        let w = workflow_with(Some(Duration::from_secs(60)), None);
-        let started = std::time::Instant::now();
-        let err = w
-            .await_with_outer_timeout(
-                async { Err::<TestState, _>(CanoError::task_execution("inner boom")) },
-                None,
-                started,
-            )
-            .await
-            .expect_err("inner err must propagate");
-        assert!(
-            matches!(err, CanoError::TaskExecution(ref m) if m == "inner boom"),
-            "must propagate verbatim, got: {err}"
-        );
-    }
-}
-
-/// Edge-case unit tests for `resolve_total_budget`. Verifies the precedence
-/// rules separately from the integration-level
-/// `with_timeout_acts_as_floor_when_combined_with_with_total_timeout`.
+/// Edge-case unit tests for `resolve_total_budget`: it returns `None` when no total timeout
+/// is set, otherwise `(start, limit)` with `limit` equal to the `with_total_timeout` duration.
 #[cfg(test)]
 mod resolve_total_budget_tests {
     use super::test_support::TestState;
@@ -2164,50 +1980,19 @@ mod resolve_total_budget_tests {
     use std::time::Duration;
 
     #[test]
-    fn neither_set_returns_none() {
+    fn unset_returns_none() {
         let w = Workflow::<TestState>::bare();
         assert!(w.resolve_total_budget(std::time::Instant::now()).is_none());
     }
 
     #[test]
-    fn only_with_timeout_set_returns_none() {
-        let w = Workflow::<TestState>::bare().with_timeout(Duration::from_secs(1));
-        assert!(
-            w.resolve_total_budget(std::time::Instant::now()).is_none(),
-            "with_timeout alone goes through the legacy wrapper; FSM gets no budget"
-        );
-    }
-
-    #[test]
-    fn only_total_timeout_set_returns_total() {
+    fn total_timeout_set_returns_total() {
         let w = Workflow::<TestState>::bare().with_total_timeout(Duration::from_secs(7));
         let now = std::time::Instant::now();
         let (start, limit) = w.resolve_total_budget(now).unwrap();
         assert_eq!(start, now);
         assert_eq!(limit, Duration::from_secs(7));
     }
-
-    #[test]
-    fn both_set_returns_min_via_with_timeout_as_floor() {
-        // F5: when both are configured the smaller bounds the FSM, so the
-        // legacy hard cap still applies (graceful path).
-        let w = Workflow::<TestState>::bare()
-            .with_timeout(Duration::from_millis(50))
-            .with_total_timeout(Duration::from_secs(60));
-        let now = std::time::Instant::now();
-        let (_, limit) = w.resolve_total_budget(now).unwrap();
-        assert_eq!(limit, Duration::from_millis(50));
-    }
-
-    #[test]
-    fn both_set_total_smaller_returns_total() {
-        // Symmetric case: total smaller than legacy.
-        let w = Workflow::<TestState>::bare()
-            .with_timeout(Duration::from_secs(60))
-            .with_total_timeout(Duration::from_millis(50));
-        let (_, limit) = w.resolve_total_budget(std::time::Instant::now()).unwrap();
-        assert_eq!(limit, Duration::from_millis(50));
-    }
 }
 
 /// Edge-case unit tests for `catch_panic_to_error`. The integration-level
diff --git a/cano/src/workflow/compensation.rs b/cano/src/workflow/compensation.rs
index 71b50d6..63e56fe 100644
--- a/cano/src/workflow/compensation.rs
+++ b/cano/src/workflow/compensation.rs
@@ -13,6 +13,7 @@ use std::sync::Arc;
 
 use futures_util::FutureExt;
 
+use crate::cancel::CancellationToken;
 use crate::error::CanoError;
 use crate::recovery::RowKind;
 use crate::saga::{CompensationEntry, ErasedCompensatable};
@@ -505,6 +506,13 @@ where
     /// Rows with other kinds — ordinary [`RowKind::StateEntry`] rows and
     /// [`RowKind::StepCursor`] rows — are ignored by the rehydration.
     ///
+    /// `token` controls cooperative cancellation exactly as in
+    /// [`orchestrate`](Self::orchestrate): firing the paired
+    /// [`CancellationHandle`](crate::cancel::CancellationHandle) aborts the resumed run at the next
+    /// await point and drains the rehydrated compensation stack, returning
+    /// [`CanoError::Cancelled`]. Pass [`CancellationToken::disabled`] to opt out. See the
+    /// [`cancel`](crate::cancel) module for the full cancellation semantics.
+    ///
     /// # Errors
     ///
     /// - [`CanoError::Configuration`] — no checkpoint store attached, or the workflow
@@ -513,8 +521,13 @@ where
     ///   rows for `workflow_id`.
     /// - [`CanoError::Workflow`] — the recorded state label doesn't match any state of
     ///   this workflow (e.g. resuming against a different workflow definition).
+    /// - [`CanoError::Cancelled`] — the resumed run was cancelled via `token`.
     /// - Any [`CanoError`] propagated from a task during the resumed execution.
-    pub async fn resume_from(&self, workflow_id: impl Into<Arc<str>>) -> Result<TState, CanoError> {
+    pub async fn resume_from(
+        &self,
+        workflow_id: impl Into<Arc<str>>,
+        token: CancellationToken,
+    ) -> Result<TState, CanoError> {
         let workflow_id: Arc<str> = workflow_id.into();
 
         #[cfg(feature = "tracing")]
@@ -553,7 +566,8 @@ where
         // run teardown — even the rehydration `?`/`return Err` early-returns
         // between here and `execute_workflow_from`. Wrap the body so a single
         // `teardown_range` call at the bottom handles every path uniformly.
-        let result: Result<TState, CanoError> = self.execute_resume_inner(workflow_id, store).await;
+        let result: Result<TState, CanoError> =
+            self.execute_resume_inner(workflow_id, store, token).await;
         self.resources
             .teardown_range(0..self.resources.lifecycle_len())
             .await;
@@ -570,6 +584,7 @@ where
         &self,
         workflow_id: Arc<str>,
         store: Arc<dyn crate::recovery::CheckpointStore>,
+        token: CancellationToken,
     ) -> Result<TState, CanoError> {
         let mut rows = store.load_run(&workflow_id).await.map_err(|e| {
             CanoError::checkpoint_store(format!("load checkpoint run {workflow_id:?}: {e}"))
@@ -662,15 +677,15 @@ where
             resume_cursors,
             prior_transitions,
             total_budget,
+            token,
         );
         // Teardown happens in the outer `resume_from` after this function
         // returns, so this branch only produces the error value and lets
-        // the caller clean up. `await_with_outer_timeout` owns the
-        // workflow_run metric emission for both timeout and
-        // completed/failed outcomes — see its docstring for the
-        // precedence rules between `with_timeout` and `with_total_timeout`.
-        self.await_with_outer_timeout(exec, total_budget, started)
-            .await
+        // the caller clean up. Emit the workflow-run outcome metric here, the
+        // same way `run_workflow` does for the forward direction.
+        let result = exec.await;
+        Self::record_run_outcome(&result, started);
+        result
     }
 }
 
@@ -707,7 +722,10 @@ mod tests {
             .with_workflow_id("run-1");
 
         assert_eq!(
-            workflow.orchestrate(TestState::Start).await.unwrap(),
+            workflow
+                .orchestrate(TestState::Start, CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
 
@@ -741,10 +759,16 @@ mod tests {
             .register(TestState::Start, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete);
         assert_eq!(
-            workflow.orchestrate(TestState::Start).await.unwrap(),
+            workflow
+                .orchestrate(TestState::Start, CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
-        let err = workflow.resume_from("whatever").await.unwrap_err();
+        let err = workflow
+            .resume_from("whatever", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.category(), "configuration");
         assert!(err.message().contains("checkpoint store"));
     }
@@ -773,7 +797,10 @@ mod tests {
             .with_observer(Arc::new(observer));
 
         assert_eq!(
-            workflow.resume_from("run-2").await.unwrap(),
+            workflow
+                .resume_from("run-2", CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
         assert_eq!(
@@ -822,7 +849,10 @@ mod tests {
             .with_checkpoint_store(store.clone());
 
         assert_eq!(
-            workflow.resume_from("done-run").await.unwrap(),
+            workflow
+                .resume_from("done-run", CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
         assert_eq!(
@@ -846,7 +876,10 @@ mod tests {
             .with_workflow_id("split-run");
 
         assert_eq!(
-            workflow.orchestrate(TestState::Start).await.unwrap(),
+            workflow
+                .orchestrate(TestState::Start, CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
 
@@ -875,7 +908,10 @@ mod tests {
             .register(TestState::Start, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // Errors raised inside `execute_workflow_from` are wrapped with state context;
         // `.inner()` peels one layer back to the underlying checkpoint_store error.
         assert_eq!(err.inner().category(), "checkpoint_store");
@@ -889,7 +925,10 @@ mod tests {
             .register(TestState::Start, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
-        let err = workflow.resume_from("never-ran").await.unwrap_err();
+        let err = workflow
+            .resume_from("never-ran", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.category(), "checkpoint_store");
         assert!(err.message().contains("no checkpoint rows"));
     }
@@ -908,7 +947,10 @@ mod tests {
             .register(TestState::Start, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
-        let err = workflow.resume_from("wrong-defn").await.unwrap_err();
+        let err = workflow
+            .resume_from("wrong-defn", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.category(), "workflow");
         assert!(err.message().contains("is not a registered or exit state"));
     }
@@ -948,7 +990,10 @@ mod tests {
             .with_observer(Arc::new(obs));
 
         assert_eq!(
-            workflow.orchestrate(TestState::Start).await.unwrap(),
+            workflow
+                .orchestrate(TestState::Start, CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
 
@@ -1009,7 +1054,10 @@ mod tests {
             .with_checkpoint_store(store.clone());
 
         assert_eq!(
-            workflow.resume_from("router-resume").await.unwrap(),
+            workflow
+                .resume_from("router-resume", CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
         assert_eq!(
@@ -1063,7 +1111,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // Clean rollback → the original failure is surfaced, wrapped with state context.
         assert_eq!(err.inner().category(), "task_execution");
         assert_eq!(err.message(), "D forward failed");
@@ -1106,7 +1157,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.message(), "D forward failed");
         // Only the two compensatable tasks rolled back — the plain `Process` task didn't.
         assert_eq!(
@@ -1151,7 +1205,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         match err {
             CanoError::CompensationFailed { errors } => {
                 // [original (D forward failed, wrapped with state context), B's compensate failure].
@@ -1204,7 +1261,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         match err {
             CanoError::CompensationFailed { errors } => {
                 assert!(
@@ -1296,7 +1356,10 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
 
-        let err = workflow.resume_from("saga-run").await.unwrap_err();
+        let err = workflow
+            .resume_from("saga-run", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.message(), "C forward failed");
         // The rehydrated stack [A=7, B=8] drains in reverse, using the outputs persisted
         // before the crash. C never produced an output (it failed forward).
@@ -1390,7 +1453,10 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
 
-        let err = workflow.resume_from("mixed-run").await.unwrap_err();
+        let err = workflow
+            .resume_from("mixed-run", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // C failed forward (original error is "C forward failed").
         assert_eq!(err.message(), "C forward failed");
 
@@ -1457,7 +1523,7 @@ mod tests {
             let s = Arc::clone(&store);
             handles.push(tokio::spawn(async move {
                 three_state_checkpointed(s, format!("run-{i}"))
-                    .orchestrate(TestState::Start)
+                    .orchestrate(TestState::Start, CancellationToken::disabled())
                     .await
             }));
         }
@@ -1491,7 +1557,7 @@ mod tests {
             let s = Arc::clone(&store);
             handles.push(tokio::spawn(async move {
                 three_state_checkpointed(s, "dup")
-                    .orchestrate(TestState::Start)
+                    .orchestrate(TestState::Start, CancellationToken::disabled())
                     .await
             }));
         }
@@ -1530,7 +1596,7 @@ mod tests {
             .unwrap();
 
         let err = three_state_checkpointed(store.clone(), "run")
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .unwrap_err();
         assert_eq!(err.inner().category(), "checkpoint_store");
@@ -1589,7 +1655,10 @@ mod tests {
             .with_checkpoint_store(store.clone())
             .with_workflow_id("disk");
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         match err {
             CanoError::CompensationFailed { errors } => {
                 // [the append failure that ended the run (now wrapped with state context),
@@ -1687,7 +1756,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         match err {
             CanoError::CompensationFailed { errors } => {
                 assert_eq!(errors[0].message(), "C forward failed");
@@ -1734,7 +1806,10 @@ mod tests {
             .add_exit_state(TestState::Complete);
 
         let started = std::time::Instant::now();
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(
             started.elapsed() < Duration::from_secs(5),
             "a hanging compensator must be bounded, not block the drain forever"
@@ -1895,7 +1970,10 @@ mod tests {
             .add_exit_state(TestState::Complete);
 
         let started = std::time::Instant::now();
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         let elapsed = started.elapsed();
 
         // With the bounded drain (50ms cap) the test finishes well under
@@ -2091,7 +2169,10 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
 
-        let err = workflow.resume_from("crash-after-b").await.unwrap_err();
+        let err = workflow
+            .resume_from("crash-after-b", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.message(), "C forward failed");
         // B re-ran on resume and re-pushed its entry; the persisted B-completion row at the
         // resume point must NOT be replayed too, or B would compensate twice. Expect exactly
@@ -2138,7 +2219,10 @@ mod tests {
             .with_checkpoint_store(store.clone());
 
         let started = std::time::Instant::now();
-        let err = workflow.resume_from("resume-budget").await.unwrap_err();
+        let err = workflow
+            .resume_from("resume-budget", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         let elapsed = started.elapsed();
 
         assert!(
@@ -2299,7 +2383,10 @@ mod tests {
             .with_checkpoint_store(store.clone())
             .with_workflow_id("tour-interop");
 
-        let result = workflow.orchestrate(TourStage::Route).await.unwrap();
+        let result = workflow
+            .orchestrate(TourStage::Route, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TourStage::Done);
 
         // --- assertions on the audit log ---
@@ -2375,7 +2462,10 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
 
-        let err = workflow.resume_from("mid-a").await.unwrap_err();
+        let err = workflow
+            .resume_from("mid-a", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.message(), "C forward failed");
         // A re-ran (fresh output 11), B ran (22), C failed → drain B then A.
         assert_eq!(
@@ -2407,7 +2497,10 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone())
             .with_workflow_id("nope");
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.inner().category(), "task_execution");
         // The Start checkpoint row is kept (empty stack ⇒ original error, no `clear`) —
         // so the run can still be resumed.
@@ -2458,7 +2551,10 @@ mod tests {
             );
         }
 
-        let err = workflow.orchestrate(0).await.unwrap_err();
+        let err = workflow
+            .orchestrate(0, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.message(), format!("n{} failed", N - 1));
         // States 0..N-1 succeeded forward (the last one failed), so 0..N-1 compensate in reverse.
         let expected: Vec<u32> = (0..N - 1).rev().collect();
@@ -2511,7 +2607,7 @@ mod tests {
                     },
                 );
             }
-            let result = workflow.orchestrate(0).await;
+            let result = workflow.orchestrate(0, CancellationToken::disabled()).await;
             if fail_at == N {
                 assert_eq!(result.unwrap(), N, "no failure ⇒ run completes");
                 assert!(
@@ -2590,7 +2686,10 @@ mod tests {
             .with_workflow_id("step-fwd");
 
         assert_eq!(
-            workflow.orchestrate(TestState::Start).await.unwrap(),
+            workflow
+                .orchestrate(TestState::Start, CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
         assert_eq!(calls.load(AtomicOrdering::Relaxed), 4);
@@ -2660,7 +2759,10 @@ mod tests {
             .with_checkpoint_store(store.clone());
 
         assert_eq!(
-            workflow.resume_from("step-resume").await.unwrap(),
+            workflow
+                .resume_from("step-resume", CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
         // Only 2 step calls: cursor=2→More(3), cursor=3→Done.
@@ -2684,7 +2786,10 @@ mod tests {
             .with_checkpoint_store(store.clone())
             .with_workflow_id("dense");
 
-        workflow.orchestrate(TestState::Start).await.unwrap();
+        workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
 
         let audit = store.audit_rows("dense");
         // seq 0: Start (StateEntry), seq 1: cursor=1, seq 2: cursor=2, seq 3: Complete (exit)
@@ -2721,7 +2826,10 @@ mod tests {
             .with_checkpoint_store(store.clone());
 
         assert_eq!(
-            workflow.resume_from("step-fresh-resume").await.unwrap(),
+            workflow
+                .resume_from("step-fresh-resume", CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
         // Full 2 steps: None→1, 1→2, 2→Done = 3 calls
@@ -2736,7 +2844,7 @@ mod tests {
         let result = Workflow::bare()
             .register_stepped(TestState::Start, stepper)
             .add_exit_state(TestState::Complete)
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .unwrap();
         assert_eq!(result, TestState::Complete);
@@ -2808,7 +2916,10 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
 
-        let err = workflow.resume_from("mixed-stepped").await.unwrap_err();
+        let err = workflow
+            .resume_from("mixed-stepped", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.message(), "stepper failed");
         // A must have been compensated with value 42 from the rehydrated stack.
         assert_eq!(
@@ -2828,7 +2939,10 @@ mod tests {
             .with_checkpoint_store(store.clone())
             .with_workflow_id("ver-run")
             .with_workflow_version(7);
-        workflow.orchestrate(TestState::Start).await.unwrap();
+        workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         let rows = store.audit_rows("ver-run");
         assert!(rows.iter().all(|r| r.workflow_version == 7));
         assert!(!rows.is_empty(), "expected at least one appended row");
@@ -2849,7 +2963,10 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone())
             .with_workflow_version(2);
-        let err = workflow.resume_from("ver-mismatch").await.unwrap_err();
+        let err = workflow
+            .resume_from("ver-mismatch", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err, CanoError::workflow_version_mismatch(1, 2));
     }
 
@@ -2878,7 +2995,7 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
         let out = workflow
-            .resume_from("wf-no-se")
+            .resume_from("wf-no-se", CancellationToken::disabled())
             .await
             .expect("resume should fall back instead of refusing on missing StateEntry");
         assert_eq!(out, TestState::Complete);
@@ -2917,7 +3034,7 @@ mod tests {
             .with_checkpoint_store(store.clone())
             .with_workflow_version(1);
         let out = workflow
-            .resume_from("mixed-ver")
+            .resume_from("mixed-ver", CancellationToken::disabled())
             .await
             .expect("mixed-version log with matching tail must resume cleanly");
         assert_eq!(out, TestState::Complete);
@@ -3007,7 +3124,10 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
 
-        let _ = workflow.resume_from("unsorted").await.unwrap_err();
+        let _ = workflow
+            .resume_from("unsorted", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // LIFO drain: B compensates first (output 2), then A (output 1).
         // Without the engine-side sort, the rehydrated stack would have been
         // built in reverse and `A` would have compensated before `B`.
@@ -3069,7 +3189,10 @@ mod tests {
             .register(TestState::Process, FailTask::new(true))
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // The drain ran compensate (it started) but timed out before finishing.
         assert!(began.load(Ordering::SeqCst), "compensate must have started");
         assert!(
@@ -3131,7 +3254,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(
             started.load(Ordering::SeqCst),
             "task body must have started"
@@ -3193,7 +3319,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(
             committed.load(Ordering::SeqCst),
             "commit must have happened"
@@ -3244,7 +3373,10 @@ mod tests {
             .with_observer(Arc::new(obs));
 
         // Successful run that triggers the clear-on-success path.
-        workflow.orchestrate(TestState::Start).await.unwrap();
+        workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
 
         let calls = rec.clear_failures();
         assert_eq!(calls.len(), 1, "expected one clear-failure event");
@@ -3315,7 +3447,10 @@ mod tests {
             .add_exit_state(TestState::Complete);
 
         let started = std::time::Instant::now();
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         let elapsed = started.elapsed();
         assert!(
             ran.load(std::sync::atomic::Ordering::SeqCst),
@@ -3376,7 +3511,10 @@ mod tests {
             .register_with_compensation(TestState::Start, PanickyCompensate)
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         match err.inner() {
             CanoError::CompensationFailed { errors } => {
                 assert!(
@@ -3439,7 +3577,10 @@ mod tests {
         let workflow = Workflow::bare()
             .register_with_compensation(TestState::Start, InlineFailingCompensate)
             .add_exit_state(TestState::Complete);
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         let errors = match err {
             CanoError::CompensationFailed { errors } => errors,
             other => panic!("expected CompensationFailed, got: {other:?}"),
@@ -3542,7 +3683,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(charged.load(Ordering::SeqCst));
         // The error mentions the split rejection.
         assert!(err.message().contains("split"), "got: {err}");
@@ -3611,7 +3755,10 @@ mod tests {
             .register_with_compensation(TestState::Start, LeakyCharge { log: log.clone() })
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // The serialize error is what surfaces (wrapped with state context).
         assert!(err.message().contains("serialize"), "got: {err}");
 
@@ -3692,7 +3839,10 @@ mod tests {
             .with_checkpoint_store(store.clone())
             .with_observer(Arc::new(obs));
 
-        let err = workflow.resume_from("f3").await.unwrap_err();
+        let err = workflow
+            .resume_from("f3", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // Setup ran and failed.
         assert!(triggered.load(Ordering::SeqCst));
         // The resource error surfaces (not wrapped in WithStateContext, not
@@ -3753,7 +3903,10 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
 
-        let err = workflow.resume_from("orphan").await.unwrap_err();
+        let err = workflow
+            .resume_from("orphan", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // The drain produced two errors (original + orphan) → CompensationFailed.
         let inner = match &err {
             CanoError::CompensationFailed { errors } => errors.clone(),
@@ -3834,7 +3987,10 @@ mod tests {
             .add_exit_state(TestState::Complete)
             .with_checkpoint_store(store.clone());
 
-        let err = workflow.resume_from("f9").await.unwrap_err();
+        let err = workflow
+            .resume_from("f9", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // The drain ran two compensators (clean rollback) so the surfaced
         // error is just the original wrapped failure.
         let ctx = match &err {
@@ -3906,7 +4062,9 @@ mod tests {
             .with_observer(Arc::new(obs));
 
         // Resume succeeds (label dropped from path; gap surfaced via observer).
-        let _ = workflow.resume_from("f-rename").await;
+        let _ = workflow
+            .resume_from("f-rename", CancellationToken::disabled())
+            .await;
         let recorded = rec.unknown_states();
         assert_eq!(
             recorded.len(),
@@ -3963,7 +4121,10 @@ mod tests {
             .with_checkpoint_store(store.clone());
 
         // Case 1: no rows for the id — `load_run` returns empty.
-        let err = workflow.resume_from("never-existed").await.unwrap_err();
+        let err = workflow
+            .resume_from("never-existed", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(err.message().contains("no checkpoint rows"), "got: {err}");
         assert_eq!(setups.load(Ordering::SeqCst), 1, "setup must have run");
         assert_eq!(
@@ -3980,7 +4141,10 @@ mod tests {
             )
             .await
             .unwrap();
-        let err = workflow.resume_from("ver-mismatch").await.unwrap_err();
+        let err = workflow
+            .resume_from("ver-mismatch", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(matches!(err, CanoError::WorkflowVersionMismatch { .. }));
         assert_eq!(setups.load(Ordering::SeqCst), 2);
         assert_eq!(
@@ -3997,7 +4161,10 @@ mod tests {
             )
             .await
             .unwrap();
-        let err = workflow.resume_from("bad-label").await.unwrap_err();
+        let err = workflow
+            .resume_from("bad-label", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(err.message().contains("is not a registered or exit state"));
         assert_eq!(setups.load(Ordering::SeqCst), 3);
         assert_eq!(
@@ -4046,7 +4213,7 @@ mod tests {
             .with_checkpoint_store(store.clone());
 
         let out = workflow
-            .resume_from("missing-entry")
+            .resume_from("missing-entry", CancellationToken::disabled())
             .await
             .expect("resume should fall back rather than refuse on missing StateEntry");
         assert_eq!(out, TestState::Complete);
@@ -4084,7 +4251,7 @@ mod tests {
             .with_workflow_version(2);
 
         let out = workflow
-            .resume_from("mixed-ver")
+            .resume_from("mixed-ver", CancellationToken::disabled())
             .await
             .expect("post-F3, mixed-version log with matching tail must resume");
         assert_eq!(out, TestState::Complete);
@@ -4348,63 +4515,12 @@ mod rehydrated_run_tests {
 mod metrics_tests {
     use crate::metrics::test_support::*;
     use crate::prelude::*;
-    use crate::recovery::CheckpointRow;
     use crate::task::TaskConfig;
     use crate::workflow::test_support::{CompLog, CompTask, MemCheckpoints, SimpleTask, TestState};
     use std::sync::{Arc, Mutex};
 
     // ---- Recovery: checkpoint append + clear counters ----
 
-    #[test]
-    fn legacy_timeout_on_resume_only_increments_timeout_counter() {
-        // Regression for F8: previously `execute_resume_inner`'s legacy
-        // `with_timeout` arm recorded `outcome="timeout"` and then fell
-        // through to the unconditional `workflow_run("timeout"|"failed")`
-        // emission at the bottom of the function — double-counting the same
-        // invocation. `run_workflow` in the orchestrate direction already
-        // used `return Err(...)` to avoid this; the resume path now matches.
-        struct Slow;
-        #[crate::task]
-        impl Task<TestState> for Slow {
-            fn config(&self) -> TaskConfig {
-                TaskConfig::minimal()
-            }
-            async fn run_bare(&self) -> Result<TaskResult<TestState>, CanoError> {
-                tokio::time::sleep(std::time::Duration::from_millis(500)).await;
-                Ok(TaskResult::Single(TestState::Complete))
-            }
-        }
-
-        let (res, rows) = run_with_recorder(|| async {
-            let store = Arc::new(MemCheckpoints::default());
-            // Pre-populate so resume_from has something to rehydrate.
-            store
-                .append(
-                    "wf-legacy-timeout",
-                    CheckpointRow::new(0, "Start", "S").with_workflow_version(0),
-                )
-                .await
-                .unwrap();
-            let workflow = Workflow::bare()
-                .with_checkpoint_store(store.clone())
-                .with_timeout(std::time::Duration::from_millis(20))
-                .register(TestState::Start, Slow)
-                .add_exit_state(TestState::Complete);
-            workflow.resume_from("wf-legacy-timeout").await
-        });
-        assert!(res.is_err());
-        assert_eq!(
-            counter(&rows, "cano_workflow_runs_total", &[("outcome", "timeout")]),
-            1,
-            "exactly one `outcome=timeout` row should be recorded"
-        );
-        assert_eq!(
-            counter_opt(&rows, "cano_workflow_runs_total", &[("outcome", "failed")]).unwrap_or(0),
-            0,
-            "resume timeout must not also be counted as `outcome=failed`"
-        );
-    }
-
     #[test]
     fn checkpoint_append_and_clear_counters_on_successful_run() {
         let (res, rows) = run_with_recorder(|| async {
@@ -4415,7 +4531,9 @@ mod metrics_tests {
                 .register(TestState::Start, SimpleTask::new(TestState::Process))
                 .register(TestState::Process, SimpleTask::new(TestState::Complete))
                 .add_exit_state(TestState::Complete);
-            workflow.orchestrate(TestState::Start).await
+            workflow
+                .orchestrate(TestState::Start, CancellationToken::disabled())
+                .await
         });
         assert_eq!(res.unwrap(), TestState::Complete);
         // At minimum one append per state entered (Start, Process, Complete = 3 rows)
@@ -4465,7 +4583,9 @@ mod metrics_tests {
                 )
                 .register(TestState::Process, AlwaysFailTask)
                 .add_exit_state(TestState::Complete);
-            workflow.orchestrate(TestState::Start).await
+            workflow
+                .orchestrate(TestState::Start, CancellationToken::disabled())
+                .await
         });
         // Clean rollback: the original error is returned (not CompensationFailed)
         assert!(res.is_err(), "expected workflow to fail");
diff --git a/cano/src/workflow/execution.rs b/cano/src/workflow/execution.rs
index b1e895d..366ca3d 100644
--- a/cano/src/workflow/execution.rs
+++ b/cano/src/workflow/execution.rs
@@ -14,6 +14,7 @@ use std::sync::Arc;
 
 use futures_util::FutureExt;
 
+use crate::cancel::CancellationToken;
 use crate::error::CanoError;
 use crate::recovery::CheckpointRow;
 use crate::saga::{CompensationEntry, ErasedCompensatable};
@@ -159,6 +160,7 @@ where
         &self,
         initial_state: TState,
         total_budget: Option<(std::time::Instant, std::time::Duration)>,
+        token: CancellationToken,
     ) -> Result<TState, CanoError> {
         self.execute_workflow_from(
             initial_state,
@@ -168,6 +170,7 @@ where
             HashMap::new(),
             Vec::new(),
             total_budget,
+            token,
         )
         .await
     }
@@ -214,6 +217,11 @@ where
         // deadline. `None` is the zero-cost path — dispatch is awaited directly
         // with no `timeout_at` wrapper.
         total_budget: Option<(std::time::Instant, std::time::Duration)>,
+        // Cooperative-cancellation signal, supplied by the caller of
+        // `orchestrate`/`resume_from`. The "never" token reports
+        // `can_cancel() == false`, so the dispatch path skips the cancellation
+        // `select!` entirely — the pre-existing zero-cost path is preserved.
+        token: CancellationToken,
     ) -> Result<TState, CanoError> {
         let mut current_state = initial_state;
         let mut sequence = start_sequence;
@@ -258,6 +266,28 @@ where
         });
 
         loop {
+            // Cooperative cancellation observed at a state boundary: stop before
+            // entering this state, fire `on_cancelled` once, and drain whatever
+            // compensatable work has completed so far. `is_cancelled()` is a
+            // non-blocking poll and is `false` for the "never" token, so this is
+            // free on the no-token path. `current_state` is not yet pushed onto
+            // `transitions_so_far`, so the reported path stops *before* the state
+            // we declined to run.
+            if token.is_cancelled() {
+                let label = format!("{current_state:?}");
+                notify_observers(&self.observers, |o| o.on_cancelled(&label));
+                return self
+                    .wrap_and_drain(
+                        workflow_id.as_deref(),
+                        compensation_stack,
+                        &current_state,
+                        &transitions_so_far,
+                        CanoError::cancelled(),
+                        total_budget,
+                    )
+                    .await;
+            }
+
             // The `Debug` label of the state being entered. Needed for observer
             // `on_state_enter`, checkpoint rows, and the `metrics` feature's
             // `state` label; skipped (no allocation) when none are in play — the
@@ -402,12 +432,19 @@ where
                 StateEntry::Single { task, config } => {
                     let task_name = task.name();
                     let fut = self.execute_single_task(task.clone(), Arc::clone(config));
-                    Self::dispatch_with_budget(step_budget, &self.observers, fut, |o, err| {
-                        // `execute_single_task` fired `on_task_start` inside the
-                        // dropped future; pair it with `on_task_failure` so observer
-                        // gauges (`active_tasks` etc.) remain balanced.
-                        o.on_task_failure(task_name.as_ref(), err);
-                    })
+                    Self::dispatch_with_budget(
+                        step_budget,
+                        &token,
+                        state_label.as_deref(),
+                        &self.observers,
+                        fut,
+                        |o, err| {
+                            // `execute_single_task` fired `on_task_start` inside the
+                            // dropped future; pair it with `on_task_failure` so observer
+                            // gauges (`active_tasks` etc.) remain balanced.
+                            o.on_task_failure(task_name.as_ref(), err);
+                        },
+                    )
                     .await
                 }
                 StateEntry::Router { task, config } => {
@@ -415,9 +452,16 @@ where
                     // block above (is_router guard).
                     let task_name = task.name();
                     let fut = self.execute_single_task(task.clone(), Arc::clone(config));
-                    Self::dispatch_with_budget(step_budget, &self.observers, fut, |o, err| {
-                        o.on_task_failure(task_name.as_ref(), err);
-                    })
+                    Self::dispatch_with_budget(
+                        step_budget,
+                        &token,
+                        state_label.as_deref(),
+                        &self.observers,
+                        fut,
+                        |o, err| {
+                            o.on_task_failure(task_name.as_ref(), err);
+                        },
+                    )
                     .await
                 }
                 StateEntry::Split {
@@ -434,10 +478,11 @@ where
                     // helper invokes `task_failure_fan_out` once per observer, so
                     // formatting inside the closure would re-allocate every time.
                     // `execute_split_join` fires `on_task_start` per branch; on
-                    // outer cancellation those branches are dropped, so we fire a
-                    // synthetic per-branch `on_task_failure` to keep observer
-                    // gauges balanced.
-                    let branch_ids: Vec<String> = if step_budget.is_some() {
+                    // outer cancellation OR a total-timeout trip those branches are
+                    // dropped, so we fire a synthetic per-branch `on_task_failure`
+                    // to keep observer gauges balanced — needed whenever the
+                    // dispatch can be aborted (a budget deadline or a live token).
+                    let branch_ids: Vec<String> = if step_budget.is_some() || token.can_cancel() {
                         tasks
                             .iter()
                             .enumerate()
@@ -446,11 +491,18 @@ where
                     } else {
                         Vec::new()
                     };
-                    Self::dispatch_with_budget(step_budget, &self.observers, fut, |o, err| {
-                        for id in &branch_ids {
-                            o.on_task_failure(id, err);
-                        }
-                    })
+                    Self::dispatch_with_budget(
+                        step_budget,
+                        &token,
+                        state_label.as_deref(),
+                        &self.observers,
+                        fut,
+                        |o, err| {
+                            for id in &branch_ids {
+                                o.on_task_failure(id, err);
+                            }
+                        },
+                    )
                     .await
                 }
                 StateEntry::CompensatableSingle { task, config } => {
@@ -535,9 +587,16 @@ where
                         &mut sequence,
                         resume_cursor,
                     );
-                    Self::dispatch_with_budget(step_budget, &self.observers, fut, |o, err| {
-                        o.on_task_failure(task_name.as_ref(), err);
-                    })
+                    Self::dispatch_with_budget(
+                        step_budget,
+                        &token,
+                        state_label.as_deref(),
+                        &self.observers,
+                        fut,
+                        |o, err| {
+                            o.on_task_failure(task_name.as_ref(), err);
+                        },
+                    )
                     .await
                 }
             };
@@ -562,6 +621,9 @@ where
             current_state = match step {
                 Ok(s) => s,
                 Err(e) => {
+                    // `on_cancelled` for a mid-task cancel already fired inside
+                    // `dispatch_with_budget` (the between-state case fires in the
+                    // top-of-loop guard), so this arm stays generic.
                     // Route through `wrap_and_drain` so the wrap + bounded-vs-
                     // unbounded decision live in one place (it derives the
                     // attempt count from `e` itself). The bounded drain bounds
@@ -585,8 +647,9 @@ where
         }
     }
 
-    /// Wrap a state-dispatch future in the per-iteration step-budget, or
-    /// pass it through unchanged when no total budget is active.
+    /// Wrap a state-dispatch future in the per-iteration step-budget and race it
+    /// against the cancellation `token`, or pass it through unchanged when neither
+    /// is active.
     ///
     /// When the wrapped future trips the deadline, the engine synthesizes a
     /// `WorkflowTimeout` error, fires `on_workflow_timeout` once per
@@ -597,14 +660,22 @@ where
     /// `on_task_start` already fired by the dropped inner future is paired
     /// with a matching `on_task_failure`).
     ///
-    /// `fut.await` is the zero-cost path when `step_budget` is `None`; no
-    /// observer plumbing runs in that case.
+    /// When the `token` fires first, the dispatch is dropped, the same
+    /// `task_failure_fan_out` runs (gauge balance), `on_cancelled(state_label)`
+    /// fires once, and `CanoError::Cancelled` is returned. This is the *only*
+    /// place a token-driven mid-task cancel is recognized — the caller's error
+    /// arm stays generic.
+    ///
+    /// `fut.await` is the zero-cost path when `step_budget` is `None` and the
+    /// token can never fire; no observer plumbing runs in that case.
     async fn dispatch_with_budget<T, F>(
         step_budget: Option<(
             std::time::Instant,
             std::time::Duration,
             tokio::time::Instant,
         )>,
+        token: &CancellationToken,
+        state_label: Option<&str>,
         observers: &[Arc<dyn crate::observer::WorkflowObserver>],
         fut: F,
         task_failure_fan_out: impl Fn(&dyn crate::observer::WorkflowObserver, &CanoError),
@@ -612,20 +683,51 @@ where
     where
         F: std::future::Future<Output = Result<T, CanoError>>,
     {
-        let Some((start, limit, deadline)) = step_budget else {
-            return fut.await;
+        let fan = &task_failure_fan_out;
+        // The existing budget logic, untouched: `timeout_at` when a budget is set,
+        // otherwise a bare `fut.await`. Captured as a future so the cancellation
+        // arm below can race it. `async move` takes ownership of `fut`; `observers`
+        // and `fan` are `Copy` references, so they remain usable in the cancel arm.
+        let budgeted = async move {
+            let Some((start, limit, deadline)) = step_budget else {
+                return fut.await;
+            };
+            match tokio::time::timeout_at(deadline, fut).await {
+                Ok(inner) => inner,
+                Err(_) => {
+                    let elapsed = start.elapsed();
+                    let err = CanoError::workflow_timeout(elapsed, limit);
+                    notify_observers(observers, |o| {
+                        fan(o, &err);
+                        o.on_workflow_timeout(elapsed, limit);
+                    });
+                    Err(err)
+                }
+            }
         };
-        match tokio::time::timeout_at(deadline, fut).await {
-            Ok(inner) => inner,
-            Err(_) => {
-                let elapsed = start.elapsed();
-                let err = CanoError::workflow_timeout(elapsed, limit);
-                notify_observers(observers, |o| {
-                    task_failure_fan_out(o, &err);
-                    o.on_workflow_timeout(elapsed, limit);
-                });
+
+        // Zero-cost path: the "never" token can't fire, so skip the `select!`
+        // entirely and run the budgeted future exactly as before.
+        if !token.can_cancel() {
+            return budgeted.await;
+        }
+
+        // Race cancellation against the budgeted dispatch. `biased` checks the
+        // cancel arm first so cancellation deterministically wins a tie against
+        // the per-state timeout. On cancel the inner `fut` is dropped (for splits
+        // this drops the `JoinSet`, aborting its children); we fire the same
+        // per-task fan-out the timeout path uses so observer gauges stay balanced.
+        tokio::select! {
+            biased;
+            _ = token.cancelled() => {
+                let err = CanoError::cancelled();
+                notify_observers(observers, |o| fan(o, &err));
+                if let Some(label) = state_label {
+                    notify_observers(observers, |o| o.on_cancelled(label));
+                }
                 Err(err)
             }
+            res = budgeted => res,
         }
     }
 
@@ -709,7 +811,9 @@ where
     pub(super) fn attempts_from_error(err: &CanoError) -> u32 {
         match err {
             CanoError::RetryExhausted { attempts, .. } => *attempts,
-            CanoError::CircuitOpen(_) | CanoError::WorkflowTimeout { .. } => 0,
+            CanoError::CircuitOpen(_)
+            | CanoError::WorkflowTimeout { .. }
+            | CanoError::Cancelled => 0,
             CanoError::WithStateContext { source, .. } => Self::attempts_from_error(source),
             _ => 1,
         }
@@ -1215,7 +1319,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1233,7 +1340,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1252,7 +1362,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1272,7 +1385,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1290,7 +1406,9 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await;
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         assert!(result.is_err());
     }
 
@@ -1309,7 +1427,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1336,35 +1457,13 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await;
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         assert!(result.is_err());
         assert!(result.unwrap_err().to_string().contains("timeout"));
     }
 
-    #[tokio::test]
-    async fn test_workflow_timeout() {
-        // Task that sleeps longer than workflow timeout
-        #[derive(Clone)]
-        struct SlowTask;
-
-        #[task]
-        impl Task<TestState> for SlowTask {
-            async fn run_bare(&self) -> Result<TaskResult<TestState>, CanoError> {
-                tokio::time::sleep(Duration::from_millis(200)).await;
-                Ok(TaskResult::Single(TestState::Complete))
-            }
-        }
-
-        let workflow = Workflow::bare()
-            .with_timeout(Duration::from_millis(50))
-            .register(TestState::Start, SlowTask)
-            .add_exit_state(TestState::Complete);
-
-        let result = workflow.orchestrate(TestState::Start).await;
-        assert!(result.is_err());
-        assert!(result.unwrap_err().to_string().contains("Workflow timeout"));
-    }
-
     #[tokio::test]
     async fn test_split_with_data_sharing() {
         let store = crate::store::MemoryStore::new();
@@ -1382,7 +1481,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
 
         // Verify all tasks wrote their data
@@ -1416,7 +1518,10 @@ mod tests {
             .register(TestState::Process, SimpleTask::new(TestState::Complete))
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
 
         // Verify all data was written
@@ -1473,7 +1578,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1528,7 +1636,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1556,7 +1667,9 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await;
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         assert!(result.is_err());
         assert!(result.unwrap_err().to_string().contains("timeout"));
     }
@@ -1606,7 +1719,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1662,7 +1778,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1696,7 +1815,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1721,7 +1843,9 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await;
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         assert!(result.is_err());
         assert!(
             result
@@ -1738,7 +1862,10 @@ mod tests {
         let workflow = Workflow::bare()
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1750,7 +1877,10 @@ mod tests {
         let workflow = Workflow::bare()
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(
             matches!(err, CanoError::Configuration(_)),
             "expected Configuration error, got {err:?}"
@@ -1768,7 +1898,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
         assert_eq!(
-            workflow.orchestrate(TestState::Start).await.unwrap(),
+            workflow
+                .orchestrate(TestState::Start, CancellationToken::disabled())
+                .await
+                .unwrap(),
             TestState::Complete
         );
 
@@ -1777,7 +1910,12 @@ mod tests {
         let workflow2 = Workflow::bare()
             .register_split(TestState::Start, tasks_fail, join_config2)
             .add_exit_state(TestState::Complete);
-        assert!(workflow2.orchestrate(TestState::Start).await.is_err());
+        assert!(
+            workflow2
+                .orchestrate(TestState::Start, CancellationToken::disabled())
+                .await
+                .is_err()
+        );
     }
 
     #[tokio::test]
@@ -1791,7 +1929,10 @@ mod tests {
         let workflow = Workflow::bare()
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert!(
             matches!(err, CanoError::Configuration(_)),
             "expected Configuration error, got {err:?}"
@@ -1805,7 +1946,10 @@ mod tests {
         let workflow = Workflow::bare()
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1816,7 +1960,10 @@ mod tests {
         let workflow = Workflow::bare()
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
     }
 
@@ -1825,7 +1972,9 @@ mod tests {
         // No exit states means validate() rejects the workflow before any task runs.
         let workflow =
             Workflow::bare().register(TestState::Start, SimpleTask::new(TestState::Complete));
-        let result = workflow.orchestrate(TestState::Start).await;
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         let err = result.unwrap_err();
         assert_eq!(err.category(), "configuration");
         assert!(err.to_string().contains("no exit states"));
@@ -1846,7 +1995,9 @@ mod tests {
         let workflow = Workflow::bare()
             .register(TestState::Start, SplitReturningTask)
             .add_exit_state(TestState::Complete);
-        let result = workflow.orchestrate(TestState::Start).await;
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         assert!(result.is_err());
         assert!(result.unwrap_err().to_string().contains("register_split"));
     }
@@ -1883,7 +2034,9 @@ mod tests {
             .register(TestState::Start, task)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await;
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         assert!(result.is_err());
 
         // With max_retries=2, there should be exactly 3 attempts (1 initial + 2 retries).
@@ -1936,7 +2089,9 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await;
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         assert!(result.is_ok(), "workflow should succeed after retries");
         assert_eq!(
             call_count.load(Ordering::SeqCst),
@@ -1969,7 +2124,7 @@ mod tests {
             .add_exit_state(TestState::Complete);
 
         let err = workflow
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .expect_err("panic must surface as Err");
         // The FSM wraps the failure with state context; `.inner()` peels one layer.
@@ -1993,7 +2148,7 @@ mod tests {
             .add_exit_state(TestState::Complete);
 
         let err = workflow
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .expect_err("split panic must surface as Err");
         let msg = err.to_string();
@@ -2056,7 +2211,10 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let result = workflow.orchestrate(TestState::Start).await.unwrap();
+        let result = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(result, TestState::Complete);
         let observed = max.load(Ordering::SeqCst);
         assert!(
@@ -2075,7 +2233,7 @@ mod tests {
             .add_exit_state(TestState::Complete);
 
         let err = workflow
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .expect_err("bulkhead=0 must error");
         assert!(matches!(err, CanoError::Configuration(_)), "got {err:?}");
@@ -2104,7 +2262,7 @@ mod tests {
         let err = Workflow::bare()
             .register(TestState::Start, SlowTask)
             .add_exit_state(TestState::Complete)
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .expect_err("expected attempt timeout to exhaust retries");
         // The FSM wraps the failure with state context; `.inner()` peels one layer.
@@ -2158,7 +2316,9 @@ mod tests {
             .register_split(TestState::Start, tasks, join_config)
             .add_exit_state(TestState::Complete);
 
-        let _ = workflow.orchestrate(TestState::Start).await;
+        let _ = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         assert!(
             matches!(breaker.state(), CircuitState::Open { .. }),
             "shared breaker must trip after 4 concurrent failures, got {:?}",
@@ -2203,7 +2363,7 @@ mod tests {
         let result = Workflow::bare()
             .register_stepped(TestState::Start, Counter { target: 5 })
             .add_exit_state(TestState::Complete)
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .unwrap();
         assert_eq!(result, TestState::Complete);
@@ -2234,7 +2394,7 @@ mod tests {
         let err = Workflow::bare()
             .register_stepped(TestState::Start, SplitStepper)
             .add_exit_state(TestState::Complete)
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .expect_err("split result from stepped must error");
         // The FSM wraps the failure with state context; `.inner()` peels one layer.
@@ -2270,7 +2430,10 @@ mod tests {
             .register(TestState::Process, FailNoRetry)
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         match err {
             CanoError::WithStateContext {
                 state,
@@ -2306,7 +2469,7 @@ mod tests {
         let err = Workflow::bare()
             .register(TestState::Start, AlwaysFails)
             .add_exit_state(TestState::Complete)
-            .orchestrate(TestState::Start)
+            .orchestrate(TestState::Start, CancellationToken::disabled())
             .await
             .unwrap_err();
         match err {
@@ -2338,7 +2501,10 @@ mod tests {
             .register(TestState::Process, FailTask::new(true))
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         match err {
             CanoError::WithStateContext {
                 transitions_so_far,
@@ -2392,7 +2558,10 @@ mod tests {
                 },
             )
             .add_exit_state(TestState::Complete);
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         match err {
             CanoError::WithStateContext {
                 attempt, source, ..
@@ -2422,7 +2591,10 @@ mod tests {
                 },
             )
             .add_exit_state(TestState::Complete);
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // The engine wraps the timeout with state context; `.inner()` peels one layer.
         assert!(
             matches!(err.inner(), CanoError::WorkflowTimeout { .. }),
@@ -2430,64 +2602,6 @@ mod tests {
         );
     }
 
-    #[tokio::test]
-    async fn with_timeout_acts_as_floor_when_combined_with_with_total_timeout() {
-        // Regression for F5: previously, when both `with_timeout(d1)` and
-        // `with_total_timeout(d2)` were set, the legacy `with_timeout` was
-        // silently disabled (the total-timeout path won outright). Users who
-        // composed them expecting `with_timeout` to act as a hard upper bound
-        // would lose that guardrail. The engine now treats their min as the
-        // effective graceful budget, so `with_timeout=10ms` still bounds the
-        // run even when `with_total_timeout=60s` is configured.
-        let workflow = Workflow::bare()
-            .with_timeout(Duration::from_millis(10))
-            .with_total_timeout(Duration::from_secs(60))
-            .register(
-                TestState::Start,
-                SleepyTask {
-                    sleep_ms: 1_000,
-                    next: TestState::Complete,
-                },
-            )
-            .add_exit_state(TestState::Complete);
-        let started = std::time::Instant::now();
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
-        let elapsed = started.elapsed();
-        assert!(
-            elapsed < Duration::from_millis(500),
-            "with_timeout (10ms) must still bound the run when total_timeout is also set; took {elapsed:?}"
-        );
-        // The graceful path produces `WorkflowTimeout`; the legacy path produces
-        // `CanoError::Workflow("Workflow timeout exceeded")`. The composition
-        // takes the graceful path.
-        assert!(
-            matches!(err.inner(), CanoError::WorkflowTimeout { .. }),
-            "expected graceful WorkflowTimeout, got: {err}"
-        );
-    }
-
-    #[tokio::test]
-    async fn with_timeout_alone_still_uses_legacy_blunt_timeout() {
-        // Sanity for F5: when only `with_timeout` is set the legacy path is
-        // unchanged — surface `CanoError::Workflow("Workflow timeout exceeded")`
-        // (the documented legacy shape) and run no compensation.
-        let workflow = Workflow::bare()
-            .with_timeout(Duration::from_millis(10))
-            .register(
-                TestState::Start,
-                SleepyTask {
-                    sleep_ms: 1_000,
-                    next: TestState::Complete,
-                },
-            )
-            .add_exit_state(TestState::Complete);
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
-        assert!(
-            matches!(err, CanoError::Workflow(ref m) if m.contains("Workflow timeout exceeded")),
-            "with_timeout alone must surface the legacy blunt-timeout shape, got: {err}"
-        );
-    }
-
     #[tokio::test]
     async fn total_timeout_fires_on_workflow_timeout_observer_hook() {
         let (obs, rec) = EventLog::new();
@@ -2502,7 +2616,9 @@ mod tests {
                 },
             )
             .add_exit_state(TestState::Complete);
-        let _ = workflow.orchestrate(TestState::Start).await;
+        let _ = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await;
         let events = rec.timeouts();
         assert_eq!(events.len(), 1, "hook should fire exactly once");
         let (elapsed, limit) = events[0];
@@ -2568,7 +2684,10 @@ mod tests {
             )
             .add_exit_state(TestState::Complete);
 
-        let err = workflow.orchestrate(TestState::Start).await.unwrap_err();
+        let err = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap_err();
         // Clean rollback → original error surfaced, wrapped with state context.
         assert!(
             matches!(err.inner(), CanoError::WorkflowTimeout { .. }),
@@ -2632,7 +2751,10 @@ mod tests {
         // The compensatable task runs ~80ms despite the 20ms total budget — and the
         // workflow exits cleanly (no compensation runs, no error surfaces) because
         // there's no further state for the budget to cancel.
-        let outcome = workflow.orchestrate(TestState::Start).await.unwrap();
+        let outcome = workflow
+            .orchestrate(TestState::Start, CancellationToken::disabled())
+            .await
+            .unwrap();
         assert_eq!(outcome, TestState::Complete);
         let log_entries = log.lock().unwrap().clone();
         assert_eq!(log_entries, vec!["ran".to_string()]);
@@ -2854,6 +2976,8 @@ mod dispatch_with_budget_tests {
         let call_count = Arc::new(AtomicUsize::new(0));
         let cc = Arc::clone(&call_count);
         let result = Workflow::<TestState>::dispatch_with_budget(
+            None,
+            &CancellationToken::disabled(),
             None,
             &observers,
             async { Ok::<TestState, CanoError>(TestState::Complete) },
@@ -2884,6 +3008,8 @@ mod dispatch_with_budget_tests {
         let cc = Arc::clone(&call_count);
         let result = Workflow::<TestState>::dispatch_with_budget(
             budget_for(Duration::from_secs(60)),
+            &CancellationToken::disabled(),
+            None,
             &observers,
             async { Ok::<TestState, CanoError>(TestState::Complete) },
             |_o, _err| {
@@ -2915,6 +3041,8 @@ mod dispatch_with_budget_tests {
                 Duration::from_millis(50),
                 tokio::time::Instant::now() - Duration::from_millis(1),
             )),
+            &CancellationToken::disabled(),
+            None,
             &observers,
             async {
                 tokio::time::sleep(Duration::from_secs(60)).await;
@@ -2959,6 +3087,8 @@ mod dispatch_with_budget_tests {
                 Duration::from_millis(10),
                 tokio::time::Instant::now() - Duration::from_millis(1),
             )),
+            &CancellationToken::disabled(),
+            None,
             &observers,
             async {
                 tokio::time::sleep(Duration::from_secs(60)).await;
@@ -2993,6 +3123,8 @@ mod dispatch_with_budget_tests {
                 Duration::from_millis(5),
                 tokio::time::Instant::now() - Duration::from_millis(1),
             )),
+            &CancellationToken::disabled(),
+            None,
             &observers,
             async {
                 tokio::time::sleep(Duration::from_secs(60)).await;
@@ -3027,6 +3159,8 @@ mod dispatch_with_budget_tests {
         let observers: [Arc<dyn WorkflowObserver>; 0] = [];
         let err = Workflow::<TestState>::dispatch_with_budget(
             budget_for(Duration::from_secs(60)),
+            &CancellationToken::disabled(),
+            None,
             &observers,
             async { Err::<TestState, _>(CanoError::task_execution("custom inner err")) },
             |_o, _err| {},
@@ -3038,4 +3172,139 @@ mod dispatch_with_budget_tests {
             "must propagate the inner err verbatim, got: {err}"
         );
     }
+
+    // ----- cancellation path (token can fire ⇒ the `select!` arm is taken) -----
+
+    #[tokio::test]
+    async fn live_token_uncancelled_passes_future_through() {
+        // A live (not `disabled()`) token that never fires takes the `select!` path, but the
+        // `res = budgeted` arm must still return the future's Ok with no observer events.
+        let (observer, rec) = EventLog::new();
+        let observer_dyn: Arc<dyn WorkflowObserver> = Arc::new(observer);
+        let observers = [observer_dyn];
+        let (_handle, token) = CancellationToken::new();
+        let result = Workflow::<TestState>::dispatch_with_budget(
+            None,
+            &token,
+            None,
+            &observers,
+            async { Ok::<TestState, CanoError>(TestState::Complete) },
+            |o, err| o.on_task_failure("synthetic", err),
+        )
+        .await
+        .expect("uncancelled live token returns the future's Ok");
+        assert_eq!(result, TestState::Complete);
+        assert!(
+            rec.is_empty(),
+            "no observer events when the task completes normally"
+        );
+    }
+
+    #[tokio::test]
+    async fn live_token_with_expired_budget_still_times_out() {
+        // Regression: adding the cancellation race must not break the budget timeout.
+        // A live-but-uncancelled token + an already-expired deadline must still trip
+        // `WorkflowTimeout` (and fire the timeout hooks, not `on_cancelled`).
+        let (observer, rec) = EventLog::new();
+        let observer_dyn: Arc<dyn WorkflowObserver> = Arc::new(observer);
+        let observers = [observer_dyn];
+        let (_handle, token) = CancellationToken::new();
+        let err = Workflow::<TestState>::dispatch_with_budget(
+            Some((
+                std::time::Instant::now() - Duration::from_secs(1),
+                Duration::from_millis(5),
+                tokio::time::Instant::now() - Duration::from_millis(1),
+            )),
+            &token,
+            Some("StateX"),
+            &observers,
+            async {
+                tokio::time::sleep(Duration::from_secs(60)).await;
+                Ok::<TestState, CanoError>(TestState::Complete)
+            },
+            |o, err| o.on_task_failure("synthetic", err),
+        )
+        .await
+        .expect_err("expired budget must trip even with a live token");
+        assert!(
+            matches!(err, CanoError::WorkflowTimeout { .. }),
+            "got: {err}"
+        );
+        let labels = rec.labels();
+        assert!(
+            labels.iter().any(|l| l.starts_with("workflow_timeout:")),
+            "timeout hook fired: {labels:?}"
+        );
+        assert!(
+            !labels.iter().any(|l| l.starts_with("cancelled:")),
+            "on_cancelled must NOT fire on a timeout: {labels:?}"
+        );
+    }
+
+    #[tokio::test]
+    async fn precancelled_token_returns_cancelled_and_fires_hooks() {
+        // The new cancel arm: a pre-cancelled token (biased `select!` picks it) must drop
+        // the inner future, return `Cancelled`, fire the per-task fan-out (gauge balance)
+        // AND `on_cancelled(state_label)` — but not the timeout hook.
+        let (observer, rec) = EventLog::new();
+        let observer_dyn: Arc<dyn WorkflowObserver> = Arc::new(observer);
+        let observers = [observer_dyn];
+        let (handle, token) = CancellationToken::new();
+        handle.cancel();
+        let err = Workflow::<TestState>::dispatch_with_budget(
+            None,
+            &token,
+            Some("StateX"),
+            &observers,
+            async {
+                tokio::time::sleep(Duration::from_secs(60)).await;
+                Ok::<TestState, CanoError>(TestState::Complete)
+            },
+            |o, err| o.on_task_failure("synthetic", err),
+        )
+        .await
+        .expect_err("pre-cancelled token returns Cancelled");
+        assert!(matches!(err, CanoError::Cancelled), "got: {err}");
+        let labels = rec.labels();
+        assert!(
+            labels.iter().any(|l| l == "task_failure:synthetic"),
+            "fan-out (gauge balance) fired: {labels:?}"
+        );
+        assert!(
+            labels.iter().any(|l| l == "cancelled:StateX"),
+            "on_cancelled fired with the state label: {labels:?}"
+        );
+        assert!(
+            !labels.iter().any(|l| l.starts_with("workflow_timeout:")),
+            "timeout hook must NOT fire on a cancel: {labels:?}"
+        );
+    }
+
+    #[tokio::test]
+    async fn precancelled_token_without_state_label_skips_on_cancelled() {
+        // When no state label is available (no observers/checkpoint/metrics need it),
+        // the cancel arm still returns `Cancelled` and runs the fan-out, but does not
+        // attempt to fire `on_cancelled`.
+        let (observer, rec) = EventLog::new();
+        let observer_dyn: Arc<dyn WorkflowObserver> = Arc::new(observer);
+        let observers = [observer_dyn];
+        let (handle, token) = CancellationToken::new();
+        handle.cancel();
+        let err = Workflow::<TestState>::dispatch_with_budget(
+            None,
+            &token,
+            None, // no label
+            &observers,
+            async { Ok::<TestState, CanoError>(TestState::Complete) },
+            |o, err| o.on_task_failure("synthetic", err),
+        )
+        .await
+        .expect_err("pre-cancelled token returns Cancelled");
+        assert!(matches!(err, CanoError::Cancelled), "got: {err}");
+        let labels = rec.labels();
+        assert!(
+            !labels.iter().any(|l| l.starts_with("cancelled:")),
+            "on_cancelled must not fire without a state label: {labels:?}"
+        );
+    }
 }
diff --git a/cano/src/workflow/test_support.rs b/cano/src/workflow/test_support.rs
index e280ac1..9a7f78c 100644
--- a/cano/src/workflow/test_support.rs
+++ b/cano/src/workflow/test_support.rs
@@ -265,6 +265,9 @@ pub(crate) enum TestEvent {
         elapsed: Duration,
         limit: Duration,
     },
+    Cancelled {
+        state: String,
+    },
     CheckpointClearFailed {
         workflow_id: String,
         error: String,
@@ -334,6 +337,11 @@ impl WorkflowObserver for EventLog {
     fn on_workflow_timeout(&self, elapsed: Duration, limit: Duration) {
         self.0.record(TestEvent::WorkflowTimeout { elapsed, limit });
     }
+    fn on_cancelled(&self, state: &str) {
+        self.0.record(TestEvent::Cancelled {
+            state: state.to_string(),
+        });
+    }
     fn on_checkpoint_clear_failed(&self, workflow_id: &str, error: &CanoError) {
         self.0.record(TestEvent::CheckpointClearFailed {
             workflow_id: workflow_id.to_string(),
@@ -377,6 +385,7 @@ impl Recorder<TestEvent> {
                 TestEvent::WorkflowTimeout { limit, .. } => {
                     format!("workflow_timeout:{}ms", limit.as_millis())
                 }
+                TestEvent::Cancelled { state } => format!("cancelled:{state}"),
                 TestEvent::CheckpointClearFailed { workflow_id, .. } => {
                     format!("checkpoint_clear_failed:{workflow_id}")
                 }
diff --git a/cano/tests/cancellation.rs b/cano/tests/cancellation.rs
new file mode 100644
index 0000000..712a942
--- /dev/null
+++ b/cano/tests/cancellation.rs
@@ -0,0 +1,991 @@
+//! Integration tests for cooperative cancellation (QoL-2).
+//!
+//! Covers: mid-task cancel, cancel-before-start, saga drain on cancel, dirty rollback, saga-safety
+//! (compensatable tasks run to completion), split child abort, exactly-once `on_cancelled`,
+//! idempotency, precedence over `with_total_timeout`, uncancellable drain, pass-through equivalence
+//! with `orchestrate`, resource teardown, every processing model, and resume cancellation.
+
+use cano::prelude::*;
+use cano::{CancellationHandle, CancellationToken};
+use std::borrow::Cow;
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+enum Step {
+    Reserve,
+    Charge,
+    Ship,
+    Park,
+    Done,
+}
+
+// ---- a recording observer (no `testing` feature dependency) ----
+#[derive(Default)]
+struct Rec {
+    events: Mutex<Vec<String>>,
+}
+impl Rec {
+    fn push(&self, s: String) {
+        self.events.lock().unwrap().push(s);
+    }
+    fn snapshot(&self) -> Vec<String> {
+        self.events.lock().unwrap().clone()
+    }
+    fn count_prefix(&self, prefix: &str) -> usize {
+        self.snapshot()
+            .iter()
+            .filter(|e| e.starts_with(prefix))
+            .count()
+    }
+    fn has(&self, s: &str) -> bool {
+        self.snapshot().iter().any(|e| e == s)
+    }
+}
+impl WorkflowObserver for Rec {
+    fn on_task_start(&self, task_id: &str) {
+        self.push(format!("start:{task_id}"));
+    }
+    fn on_task_failure(&self, task_id: &str, _err: &CanoError) {
+        self.push(format!("failure:{task_id}"));
+    }
+    fn on_cancelled(&self, state: &str) {
+        self.push(format!("cancelled:{state}"));
+    }
+}
+
+// ---- a long-running, non-compensatable task: records start/completion ----
+struct LongTask {
+    started: Arc<AtomicBool>,
+    completed: Arc<AtomicBool>,
+    next: Step,
+}
+#[task(state = Step)]
+impl LongTask {
+    fn config(&self) -> TaskConfig {
+        TaskConfig::minimal()
+    }
+    async fn run_bare(&self) -> Result<TaskResult<Step>, CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        tokio::time::sleep(Duration::from_secs(10)).await;
+        self.completed.store(true, Ordering::SeqCst);
+        Ok(TaskResult::Single(self.next.clone()))
+    }
+}
+
+// ---- a compensatable step with a per-instance name (so the compensator registry keys
+// don't collide), optional run/compensate sleeps, and an optionally-failing compensator ----
+struct CompStep {
+    name: &'static str,
+    next: Step,
+    log: Arc<Mutex<Vec<String>>>,
+    started: Arc<AtomicBool>,
+    completed: Arc<AtomicBool>,
+    run_sleep_ms: u64,
+    comp_sleep_ms: u64,
+    fail_comp: bool,
+}
+#[saga::task(state = Step)]
+impl CompStep {
+    type Output = ();
+    fn name(&self) -> Cow<'static, str> {
+        Cow::Borrowed(self.name)
+    }
+    async fn run(&self, _res: &Resources) -> Result<(TaskResult<Step>, ()), CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        if self.run_sleep_ms > 0 {
+            tokio::time::sleep(Duration::from_millis(self.run_sleep_ms)).await;
+        }
+        self.log.lock().unwrap().push(format!("run:{}", self.name));
+        self.completed.store(true, Ordering::SeqCst);
+        Ok((TaskResult::Single(self.next.clone()), ()))
+    }
+    async fn compensate(&self, _res: &Resources, _out: ()) -> Result<(), CanoError> {
+        if self.comp_sleep_ms > 0 {
+            tokio::time::sleep(Duration::from_millis(self.comp_sleep_ms)).await;
+        }
+        self.log
+            .lock()
+            .unwrap()
+            .push(format!("rollback:{}", self.name));
+        if self.fail_comp {
+            return Err(CanoError::task_execution(format!(
+                "comp {} boom",
+                self.name
+            )));
+        }
+        Ok(())
+    }
+}
+
+fn flag() -> Arc<AtomicBool> {
+    Arc::new(AtomicBool::new(false))
+}
+
+/// Spawn a 2 ms poller that cancels via `handle` once `flag` flips true (deterministic: fires
+/// while the target task is parked in its sleep). Await the returned `JoinHandle` to join it.
+fn cancel_when(flag: Arc<AtomicBool>, handle: CancellationHandle) -> tokio::task::JoinHandle<()> {
+    tokio::spawn(async move {
+        while !flag.load(Ordering::SeqCst) {
+            tokio::time::sleep(Duration::from_millis(2)).await;
+        }
+        handle.cancel();
+    })
+}
+
+#[tokio::test]
+async fn cancel_mid_long_running_task_returns_cancelled() {
+    let started = flag();
+    let completed = flag();
+    let wf = Workflow::bare()
+        .register(
+            Step::Ship,
+            LongTask {
+                started: started.clone(),
+                completed: completed.clone(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        !completed.load(Ordering::SeqCst),
+        "task should not complete"
+    );
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "should abort promptly"
+    );
+}
+
+#[tokio::test]
+async fn cancel_before_orchestrate_returns_immediately_without_running_any_task() {
+    let started = flag();
+    let completed = flag();
+    let rec = Arc::new(Rec::default());
+    let wf = Workflow::bare()
+        .with_observer(rec.clone())
+        .register(
+            Step::Ship,
+            LongTask {
+                started: started.clone(),
+                completed: completed.clone(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    handle.cancel(); // pre-cancel before running
+
+    let result = wf.orchestrate(Step::Ship, token).await;
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(!started.load(Ordering::SeqCst), "task must not start");
+    assert_eq!(rec.count_prefix("start:"), 0, "no on_task_start fired");
+    assert_eq!(rec.count_prefix("cancelled:"), 1, "on_cancelled fired once");
+}
+
+#[tokio::test]
+async fn cancel_with_compensation_drains_stack_then_returns_cancelled() {
+    let log = Arc::new(Mutex::new(Vec::new()));
+    let ship_started = flag();
+    let ignore = flag();
+    let wf = Workflow::bare()
+        .register_with_compensation(
+            Step::Reserve,
+            CompStep {
+                name: "reserve",
+                next: Step::Charge,
+                log: log.clone(),
+                started: flag(),
+                completed: flag(),
+                run_sleep_ms: 0,
+                comp_sleep_ms: 0,
+                fail_comp: false,
+            },
+        )
+        .register_with_compensation(
+            Step::Charge,
+            CompStep {
+                name: "charge",
+                next: Step::Ship,
+                log: log.clone(),
+                started: flag(),
+                completed: flag(),
+                run_sleep_ms: 0,
+                comp_sleep_ms: 0,
+                fail_comp: false,
+            },
+        )
+        .register(
+            Step::Ship,
+            LongTask {
+                started: ship_started.clone(),
+                completed: ignore.clone(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(ship_started.clone(), handle);
+
+    let result = wf.orchestrate(Step::Reserve, token).await;
+    canceller.await.unwrap();
+
+    let err = result.unwrap_err();
+    assert_eq!(err.category(), "cancelled");
+    assert!(matches!(err.inner(), CanoError::Cancelled));
+    // Both compensatable steps ran, then rolled back in reverse order.
+    let events = log.lock().unwrap().clone();
+    assert_eq!(
+        events,
+        vec![
+            "run:reserve".to_string(),
+            "run:charge".to_string(),
+            "rollback:charge".to_string(),
+            "rollback:reserve".to_string(),
+        ]
+    );
+}
+
+#[tokio::test]
+async fn cancel_with_failing_compensator_surfaces_compensation_failed() {
+    // A failing compensator during the cancel drain must surface as `CompensationFailed`.
+    let log = Arc::new(Mutex::new(Vec::new()));
+    let ship_started = flag();
+    let wf = Workflow::bare()
+        .register_with_compensation(
+            Step::Reserve,
+            CompStep {
+                name: "reserve",
+                next: Step::Ship,
+                log: log.clone(),
+                started: flag(),
+                completed: flag(),
+                run_sleep_ms: 0,
+                comp_sleep_ms: 0,
+                fail_comp: true, // its rollback fails
+            },
+        )
+        .register(
+            Step::Ship,
+            LongTask {
+                started: ship_started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(ship_started.clone(), handle);
+    let result = wf.orchestrate(Step::Reserve, token).await;
+    canceller.await.unwrap();
+
+    match result.unwrap_err() {
+        CanoError::CompensationFailed { errors } => {
+            // errors[0] is the original (wrapped) cancellation; check the length before indexing.
+            assert!(errors.len() >= 2, "must also carry the compensator error");
+            assert_eq!(errors[0].category(), "cancelled");
+        }
+        other => panic!("expected CompensationFailed, got {other:?}"),
+    }
+}
+
+#[tokio::test]
+async fn compensatable_task_not_interrupted_midflight() {
+    // Cancel WHILE a CompensatableSingle is running. Saga safety requires it to run to
+    // completion (so its rollback entry is recorded), with the cancel honoured at the next
+    // boundary — draining that entry. The downstream Park task must never start.
+    let log = Arc::new(Mutex::new(Vec::new()));
+    let hold_started = flag();
+    let hold_completed = flag();
+    let park_started = flag();
+    let wf = Workflow::bare()
+        .register_with_compensation(
+            Step::Reserve,
+            CompStep {
+                name: "hold",
+                next: Step::Park,
+                log: log.clone(),
+                started: hold_started.clone(),
+                completed: hold_completed.clone(),
+                run_sleep_ms: 150, // long enough to be cancelled mid-run
+                comp_sleep_ms: 0,
+                fail_comp: false,
+            },
+        )
+        .register(
+            Step::Park,
+            LongTask {
+                started: park_started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(hold_started.clone(), handle); // cancel during Hold's run
+
+    let result = wf.orchestrate(Step::Reserve, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        hold_completed.load(Ordering::SeqCst),
+        "compensatable task must run to completion, not be interrupted"
+    );
+    assert!(
+        !park_started.load(Ordering::SeqCst),
+        "downstream task must not start"
+    );
+    assert_eq!(
+        log.lock().unwrap().clone(),
+        vec!["run:hold".to_string(), "rollback:hold".to_string()]
+    );
+}
+
+// A long-running split child.
+struct SplitChild {
+    started: Arc<AtomicUsize>,
+    completed: Arc<AtomicUsize>,
+}
+#[task(state = Step)]
+impl SplitChild {
+    fn config(&self) -> TaskConfig {
+        TaskConfig::minimal()
+    }
+    async fn run_bare(&self) -> Result<TaskResult<Step>, CanoError> {
+        self.started.fetch_add(1, Ordering::SeqCst);
+        tokio::time::sleep(Duration::from_secs(10)).await;
+        self.completed.fetch_add(1, Ordering::SeqCst);
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+#[tokio::test]
+async fn cancel_mid_split_aborts_children_and_returns_cancelled() {
+    let started = Arc::new(AtomicUsize::new(0));
+    let completed = Arc::new(AtomicUsize::new(0));
+    let children: Vec<SplitChild> = (0..3)
+        .map(|_| SplitChild {
+            started: started.clone(),
+            completed: completed.clone(),
+        })
+        .collect();
+    // No `with_total_timeout` here on purpose: cancellation is the *only* abort path,
+    // which is exactly the case where the synthetic per-branch `on_task_failure`
+    // fan-out must still fire to keep observer gauges balanced.
+    let rec = Arc::new(Rec::default());
+    let wf = Workflow::bare()
+        .with_observer(rec.clone())
+        .register_split(
+            Step::Ship,
+            children,
+            JoinConfig::new(JoinStrategy::All, Step::Done),
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    // Cancel only once all three children have started (so all three fired on_task_start).
+    let started_probe = started.clone();
+    let canceller = tokio::spawn(async move {
+        while started_probe.load(Ordering::SeqCst) < 3 {
+            tokio::time::sleep(Duration::from_millis(2)).await;
+        }
+        handle.cancel();
+    });
+
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "split should abort promptly"
+    );
+    // Give any aborted children a moment; none should have completed.
+    tokio::time::sleep(Duration::from_millis(50)).await;
+    assert_eq!(
+        completed.load(Ordering::SeqCst),
+        0,
+        "split children must be aborted, not completed"
+    );
+    // Observer gauge balance: every on_task_start must be paired with an on_task_failure,
+    // even though no total-timeout budget was set (the cancel path is the only abort route).
+    assert_eq!(rec.count_prefix("start:"), 3, "all branches started");
+    assert_eq!(
+        rec.count_prefix("failure:"),
+        rec.count_prefix("start:"),
+        "each started branch must get a paired on_task_failure on cancel (gauge balance)"
+    );
+}
+
+#[tokio::test]
+async fn on_cancelled_fires_exactly_once() {
+    let started = flag();
+    let rec = Arc::new(Rec::default());
+    let wf = Workflow::bare()
+        .with_observer(rec.clone())
+        .register(
+            Step::Ship,
+            LongTask {
+                started: started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert_eq!(
+        rec.count_prefix("cancelled:"),
+        1,
+        "on_cancelled exactly once"
+    );
+    assert!(
+        rec.has("cancelled:Ship"),
+        "fired with the right state label"
+    );
+}
+
+#[tokio::test]
+async fn on_cancelled_does_not_fire_on_successful_run() {
+    let rec = Arc::new(Rec::default());
+    let log = Arc::new(Mutex::new(Vec::new()));
+    let wf = Workflow::bare()
+        .with_observer(rec.clone())
+        .register_with_compensation(
+            Step::Reserve,
+            CompStep {
+                name: "reserve",
+                next: Step::Done,
+                log,
+                started: flag(),
+                completed: flag(),
+                run_sleep_ms: 0,
+                comp_sleep_ms: 0,
+                fail_comp: false,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (_handle, token) = CancellationToken::new(); // armed but never fired
+    let result = wf.orchestrate(Step::Reserve, token).await;
+
+    assert_eq!(result.unwrap(), Step::Done);
+    assert_eq!(
+        rec.count_prefix("cancelled:"),
+        0,
+        "on_cancelled must not fire on a successful run"
+    );
+}
+
+#[tokio::test]
+async fn uncancelled_token_behaves_like_orchestrate() {
+    let build = || {
+        Workflow::bare()
+            .register_with_compensation(
+                Step::Reserve,
+                // A quick compensatable task (no sleep) that transitions straight to Done;
+                // a successful run never triggers its compensator.
+                CompStep {
+                    name: "reserve",
+                    next: Step::Done,
+                    log: Arc::new(Mutex::new(Vec::new())),
+                    started: flag(),
+                    completed: flag(),
+                    run_sleep_ms: 0,
+                    comp_sleep_ms: 0,
+                    fail_comp: false,
+                },
+            )
+            .add_exit_state(Step::Done)
+    };
+
+    let plain = build()
+        .orchestrate(Step::Reserve, CancellationToken::disabled())
+        .await;
+    let (_handle, token) = CancellationToken::new(); // never cancelled
+    let with_cancel = build().orchestrate(Step::Reserve, token).await;
+
+    assert_eq!(plain.unwrap(), Step::Done);
+    assert_eq!(with_cancel.unwrap(), Step::Done);
+}
+
+#[tokio::test]
+async fn double_cancel_is_idempotent() {
+    let started = flag();
+    let rec = Arc::new(Rec::default());
+    let wf = Workflow::bare()
+        .with_observer(rec.clone())
+        .register(
+            Step::Ship,
+            LongTask {
+                started: started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let probe = started.clone();
+    let canceller = tokio::spawn(async move {
+        while !probe.load(Ordering::SeqCst) {
+            tokio::time::sleep(Duration::from_millis(2)).await;
+        }
+        handle.cancel();
+        handle.cancel(); // second cancel is a no-op
+    });
+
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert_eq!(
+        rec.count_prefix("cancelled:"),
+        1,
+        "still exactly one on_cancelled"
+    );
+}
+
+#[tokio::test]
+async fn cancellation_precedence_over_total_timeout() {
+    let started = flag();
+    let wf = Workflow::bare()
+        .with_total_timeout(Duration::from_secs(30)) // would not fire before the cancel
+        .register(
+            Step::Ship,
+            LongTask {
+                started: started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    let err = result.unwrap_err();
+    assert_eq!(
+        err.category(),
+        "cancelled",
+        "cancellation wins over the budget"
+    );
+}
+
+#[tokio::test]
+async fn compensation_drain_completes_fully_under_cancellation() {
+    // The drain is uncancellable: both (slow) compensators run to completion even though the
+    // token stays cancelled throughout the rollback.
+    let log = Arc::new(Mutex::new(Vec::new()));
+    let ship_started = flag();
+    let wf = Workflow::bare()
+        .register_with_compensation(
+            Step::Reserve,
+            CompStep {
+                name: "reserve",
+                next: Step::Charge,
+                log: log.clone(),
+                started: flag(),
+                completed: flag(),
+                run_sleep_ms: 0,
+                comp_sleep_ms: 40,
+                fail_comp: false,
+            },
+        )
+        .register_with_compensation(
+            Step::Charge,
+            CompStep {
+                name: "charge",
+                next: Step::Ship,
+                log: log.clone(),
+                started: flag(),
+                completed: flag(),
+                run_sleep_ms: 0,
+                comp_sleep_ms: 40,
+                fail_comp: false,
+            },
+        )
+        .register(
+            Step::Ship,
+            LongTask {
+                started: ship_started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(ship_started.clone(), handle);
+    let result = wf.orchestrate(Step::Reserve, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    let events = log.lock().unwrap().clone();
+    assert!(events.contains(&"rollback:charge".to_string()));
+    assert!(events.contains(&"rollback:reserve".to_string()));
+}
+
+// A resource whose teardown is observable, to prove cleanup runs even on cancel.
+struct TeardownProbe {
+    tore_down: Arc<AtomicUsize>,
+}
+#[resource]
+impl Resource for TeardownProbe {
+    async fn teardown(&self) -> Result<(), CanoError> {
+        self.tore_down.fetch_add(1, Ordering::SeqCst);
+        Ok(())
+    }
+}
+
+#[tokio::test]
+async fn resources_are_torn_down_on_cancel() {
+    let tore_down = Arc::new(AtomicUsize::new(0));
+    let started = flag();
+    let resources = Resources::new().insert(
+        "probe",
+        TeardownProbe {
+            tore_down: tore_down.clone(),
+        },
+    );
+    let wf = Workflow::new(resources)
+        .register(
+            Step::Ship,
+            LongTask {
+                started: started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert_eq!(
+        tore_down.load(Ordering::SeqCst),
+        1,
+        "resource teardown must still run when a run is cancelled"
+    );
+}
+
+// =====================================================================================
+// Cancellation across every processing model. Each model is dispatched through the same
+// `dispatch_with_budget` race, so each must be interruptible mid-flight and surface
+// `Cancelled` promptly rather than running to completion.
+// =====================================================================================
+
+// RouterTask: cancel while a route lookup is in flight.
+struct SlowRouter {
+    started: Arc<AtomicBool>,
+}
+#[task::router(state = Step)]
+impl SlowRouter {
+    async fn route(&self, _res: &Resources) -> Result<TaskResult<Step>, CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        tokio::time::sleep(Duration::from_secs(10)).await;
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+#[tokio::test]
+async fn cancel_during_router_task_returns_cancelled() {
+    let started = flag();
+    let wf = Workflow::bare()
+        .register_router(
+            Step::Reserve,
+            SlowRouter {
+                started: started.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Reserve, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "router must abort promptly"
+    );
+}
+
+// PollTask: cancel while the poll loop is parked between Pending polls.
+struct ForeverPoll {
+    started: Arc<AtomicBool>,
+}
+#[task::poll(state = Step)]
+impl ForeverPoll {
+    async fn poll(&self, _res: &Resources) -> Result<PollOutcome<Step>, CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        Ok(PollOutcome::Pending { delay_ms: 50 })
+    }
+}
+
+#[tokio::test]
+async fn cancel_during_poll_task_returns_cancelled() {
+    let started = flag();
+    let wf = Workflow::bare()
+        .register(
+            Step::Ship,
+            ForeverPoll {
+                started: started.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "poll loop must abort promptly"
+    );
+}
+
+// TimerTask: cancel during the scheduled sleep.
+struct SlowTimer {
+    started: Arc<AtomicBool>,
+    fired: Arc<AtomicBool>,
+}
+#[task::timer(state = Step)]
+impl SlowTimer {
+    async fn wait(&self, _res: &Resources) -> Result<TimerOutcome, CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        Ok(TimerOutcome::Duration(Duration::from_secs(10)))
+    }
+    async fn after_wait(&self, _res: &Resources) -> Result<TaskResult<Step>, CanoError> {
+        self.fired.store(true, Ordering::SeqCst);
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+#[tokio::test]
+async fn cancel_during_timer_task_returns_cancelled() {
+    let started = flag();
+    let fired = flag();
+    let wf = Workflow::bare()
+        .register(
+            Step::Ship,
+            SlowTimer {
+                started: started.clone(),
+                fired: fired.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "timer must abort promptly"
+    );
+    assert!(
+        !fired.load(Ordering::SeqCst),
+        "after_wait must not run when the timer is cancelled mid-sleep"
+    );
+}
+
+// BatchTask: cancel while items are being processed.
+struct SlowBatch {
+    started: Arc<AtomicBool>,
+    finished: Arc<AtomicBool>,
+}
+#[task::batch(state = Step)]
+impl SlowBatch {
+    type Item = u32;
+    type ItemOutput = ();
+    async fn load(&self, _res: &Resources) -> Result<Vec<u32>, CanoError> {
+        Ok(vec![0, 1, 2])
+    }
+    async fn process_item(&self, _item: &u32) -> Result<(), CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        tokio::time::sleep(Duration::from_secs(10)).await;
+        Ok(())
+    }
+    async fn finish(
+        &self,
+        _res: &Resources,
+        _outputs: Vec<Result<(), CanoError>>,
+    ) -> Result<TaskResult<Step>, CanoError> {
+        self.finished.store(true, Ordering::SeqCst);
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+#[tokio::test]
+async fn cancel_during_batch_task_returns_cancelled() {
+    let started = flag();
+    let finished = flag();
+    let wf = Workflow::bare()
+        .register(
+            Step::Ship,
+            SlowBatch {
+                started: started.clone(),
+                finished: finished.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "batch must abort promptly"
+    );
+    assert!(
+        !finished.load(Ordering::SeqCst),
+        "finish must not run when the batch is cancelled mid-processing"
+    );
+}
+
+// SteppedTask: cancel mid-step (no checkpoint store ⇒ cursor is in-memory only).
+struct SlowStepper {
+    started: Arc<AtomicBool>,
+}
+#[task::stepped(state = Step)]
+impl SlowStepper {
+    async fn step(
+        &self,
+        _res: &Resources,
+        cursor: Option<u32>,
+    ) -> Result<StepOutcome<u32, Step>, CanoError> {
+        self.started.store(true, Ordering::SeqCst);
+        tokio::time::sleep(Duration::from_millis(50)).await;
+        let n = cursor.unwrap_or(0);
+        if n >= 10_000 {
+            Ok(StepOutcome::Done(TaskResult::Single(Step::Done)))
+        } else {
+            Ok(StepOutcome::More(n + 1))
+        }
+    }
+}
+
+#[tokio::test]
+async fn cancel_during_stepped_task_returns_cancelled() {
+    let started = flag();
+    let wf = Workflow::bare()
+        .register_stepped(
+            Step::Ship,
+            SlowStepper {
+                started: started.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let t0 = Instant::now();
+    let result = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+
+    assert_eq!(result.unwrap_err().category(), "cancelled");
+    assert!(
+        t0.elapsed() < Duration::from_secs(2),
+        "stepped loop must abort promptly"
+    );
+}
+
+#[cfg(feature = "testing")]
+#[tokio::test]
+async fn resume_from_honors_precancelled_token() {
+    use cano::testing::InMemoryCheckpointStore;
+
+    let store = Arc::new(InMemoryCheckpointStore::new());
+    let run_count = Arc::new(AtomicUsize::new(0));
+
+    // Run 1: cancel mid-Ship so a checkpoint log is left behind (Ship's StateEntry row was
+    // written before the task ran). Ship counts its runs.
+    struct CountingLong {
+        started: Arc<AtomicBool>,
+        runs: Arc<AtomicUsize>,
+    }
+    #[task(state = Step)]
+    impl CountingLong {
+        fn config(&self) -> TaskConfig {
+            TaskConfig::minimal()
+        }
+        async fn run_bare(&self) -> Result<TaskResult<Step>, CanoError> {
+            self.runs.fetch_add(1, Ordering::SeqCst);
+            self.started.store(true, Ordering::SeqCst);
+            tokio::time::sleep(Duration::from_secs(10)).await;
+            Ok(TaskResult::Single(Step::Done))
+        }
+    }
+
+    let started = flag();
+    let wf = Workflow::bare()
+        .with_checkpoint_store(store.clone())
+        .with_workflow_id("run-1")
+        .register(
+            Step::Ship,
+            CountingLong {
+                started: started.clone(),
+                runs: run_count.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let (handle, token) = CancellationToken::new();
+    let canceller = cancel_when(started.clone(), handle);
+    let r1 = wf.orchestrate(Step::Ship, token).await;
+    canceller.await.unwrap();
+    assert_eq!(r1.unwrap_err().category(), "cancelled");
+    assert_eq!(run_count.load(Ordering::SeqCst), 1);
+
+    // Run 2: resume with a pre-cancelled token — the resumed run must cancel at the Ship
+    // boundary WITHOUT re-running the task.
+    let (handle2, token2) = CancellationToken::new();
+    handle2.cancel();
+    let r2 = wf.resume_from("run-1", token2).await;
+    assert_eq!(r2.unwrap_err().category(), "cancelled");
+    assert_eq!(
+        run_count.load(Ordering::SeqCst),
+        1,
+        "resumed task must not re-run when cancelled at the boundary"
+    );
+}
diff --git a/cano/tests/recovery_hardening.rs b/cano/tests/recovery_hardening.rs
index d4a552c..b5d21f3 100644
--- a/cano/tests/recovery_hardening.rs
+++ b/cano/tests/recovery_hardening.rs
@@ -112,7 +112,7 @@ async fn many_workflows_one_redb_file_concurrently() {
         let sidefx = dir.path().join(format!("sfx-{i}.log"));
         handles.push(tokio::spawn(async move {
             build(store, &format!("run-{i}"), &sidefx)
-                .orchestrate(St::Start)
+                .orchestrate(St::Start, CancellationToken::disabled())
                 .await
         }));
     }
@@ -148,7 +148,9 @@ async fn racing_runs_of_one_id_on_a_real_file() {
         let store = Arc::clone(&store);
         let sidefx = dir.path().join(format!("sfx-{i}.log"));
         handles.push(tokio::spawn(async move {
-            build(store, "dup", &sidefx).orchestrate(St::Start).await
+            build(store, "dup", &sidefx)
+                .orchestrate(St::Start, CancellationToken::disabled())
+                .await
         }));
     }
     let mut completed = 0;
@@ -183,7 +185,7 @@ async fn crash_mid_work_then_resume_does_not_re_run_start() {
             ok_appends: AtomicUsize::new(2),
         });
         let err = build(store, wf_id, &sidefx)
-            .orchestrate(St::Start)
+            .orchestrate(St::Start, CancellationToken::disabled())
             .await
             .expect_err("generation 1 must crash before reaching Done");
         assert_eq!(err.inner().category(), "checkpoint_store");
@@ -199,7 +201,12 @@ async fn crash_mid_work_then_resume_does_not_re_run_start() {
     {
         let store: Arc<dyn CheckpointStore> = Arc::new(RedbCheckpointStore::new(&db).unwrap());
         let wf = build(store, wf_id, &sidefx);
-        assert_eq!(wf.resume_from(wf_id).await.unwrap(), St::Done);
+        assert_eq!(
+            wf.resume_from(wf_id, CancellationToken::disabled())
+                .await
+                .unwrap(),
+            St::Done
+        );
     }
     assert_eq!(
         side_effects(&sidefx),
diff --git a/cano/tests/recovery_version_compat.rs b/cano/tests/recovery_version_compat.rs
index 26aa2be..14a5802 100644
--- a/cano/tests/recovery_version_compat.rs
+++ b/cano/tests/recovery_version_compat.rs
@@ -98,7 +98,7 @@ async fn legacy_redb_row_resumes_under_default_workflow_version() {
         .with_checkpoint_store(store);
 
     let final_state = workflow
-        .resume_from(wf_id)
+        .resume_from(wf_id, CancellationToken::disabled())
         .await
         .expect("legacy row must resume cleanly under default workflow_version");
     assert_eq!(final_state, St::Done);
@@ -124,7 +124,7 @@ async fn legacy_redb_row_rejected_when_workflow_version_bumped() {
         .with_workflow_version(1);
 
     let err = workflow
-        .resume_from(wf_id)
+        .resume_from(wf_id, CancellationToken::disabled())
         .await
         .expect_err("bumped workflow_version must reject the legacy row");
     assert_eq!(err, CanoError::workflow_version_mismatch(0, 1));
diff --git a/cano/tests/saga_hardening.rs b/cano/tests/saga_hardening.rs
index 21390f7..fd1348b 100644
--- a/cano/tests/saga_hardening.rs
+++ b/cano/tests/saga_hardening.rs
@@ -103,7 +103,9 @@ async fn many_compensatable_workflows_concurrent_with_injected_failures() {
         // Sweep the failure point across all steps (3 of every 4 runs roll back).
         let fail_at = i % (STEPS + 1);
         handles.push(tokio::spawn(async move {
-            let result = saga(ledger.clone(), &account, fail_at).orchestrate(0).await;
+            let result = saga(ledger.clone(), &account, fail_at)
+                .orchestrate(0, CancellationToken::disabled())
+                .await;
             (account, fail_at, result)
         }));
     }
@@ -259,7 +261,10 @@ mod recovery {
             .add_exit_state(St::Done)
             .with_checkpoint_store(store.clone());
 
-        let err = wf.resume_from("g").await.unwrap_err();
+        let err = wf
+            .resume_from("g", CancellationToken::disabled())
+            .await
+            .unwrap_err();
         assert_eq!(err.message(), "C failed");
         // B re-ran on resume and re-pushed its own entry; its persisted completion row at the
         // resume point must NOT be replayed too, or B would compensate twice.
diff --git a/cano/tests/scheduler_cancellation.rs b/cano/tests/scheduler_cancellation.rs
new file mode 100644
index 0000000..dc872e8
--- /dev/null
+++ b/cano/tests/scheduler_cancellation.rs
@@ -0,0 +1,667 @@
+#![cfg(feature = "scheduler")]
+//! Scheduler cancellation across processing models and schedule types.
+//!
+//! The scheduler fires the engine's `CancellationToken` uniformly for every flow,
+//! so `cancel_flow` / cancel-on-shutdown must work for *every* processing model
+//! (base Task, saga, split, stepped, timer, poll, batch) and *every* schedule
+//! type (manual, every, cron). These tests exercise that scheduler-specific wiring
+//! (token publish/clear in `execute_reserved_flow`, the `Cancel` command, the
+//! shutdown-cancel sweep, and `apply_outcome`'s cancel→Idle mapping) — the
+//! orchestrate-level per-model cancellation lives in `tests/cancellation.rs`.
+
+use cano::prelude::*;
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering::SeqCst};
+use std::sync::{Arc, Mutex};
+use std::time::{Duration, Instant};
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+enum Step {
+    Reserve, // saga step 1: `Reserve::run` moves on to `Charge`
+    Charge,  // saga step 2: `Charge::run` moves on to `Ship`
+    Ship,    // long-running step in the saga tests (registered with `Long`)
+    Work,    // entry state of the single-state flows (split, stepped, timer, poll, ...)
+    Done,    // exit state of every workflow built in this file
+}
+
+fn flag() -> Arc<AtomicBool> {
+    Arc::new(AtomicBool::default())
+}
+fn counter() -> Arc<AtomicUsize> {
+    Arc::new(AtomicUsize::default())
+}
+
+/// Poll `flag` every 2ms until it reads true (i.e. a flow's task has started).
+/// Panics once 15s have elapsed so a wiring slip or a cancellation regression fails
+/// fast instead of hanging (the poll/stepped flows never terminate on their own).
+async fn await_flag(flag: &Arc<AtomicBool>) {
+    let deadline = Instant::now() + Duration::from_secs(15);
+    while !flag.load(SeqCst) {
+        assert!(
+            Instant::now() < deadline,
+            "flow task never started (wiring bug, or the flow errored before this state)"
+        );
+        tokio::time::sleep(Duration::from_millis(2)).await;
+    }
+}
+
+/// Poll the scheduler every 2ms until flow `id` stops reporting `Status::Running`
+/// (its cancelled run settled). Panics once 15s have elapsed so a cancellation
+/// regression fails fast instead of hanging.
+async fn await_not_running(running: &RunningScheduler<Step>, id: &str) {
+    let deadline = Instant::now() + Duration::from_secs(15);
+    let still_running = Some(Status::Running);
+    // A missing status (`None`) compares unequal too, so it also ends the wait.
+    while running.status(id).await.map(|info| info.status) == still_running {
+        assert!(
+            Instant::now() < deadline,
+            "flow '{id}' never left Running after cancel — cancellation regression"
+        );
+        tokio::time::sleep(Duration::from_millis(2)).await;
+    }
+}
+
+// ---- base long-running task (the cancellable building block): 30s sleep between flags ----
+#[derive(Clone)]
+struct Long {
+    started: Arc<AtomicBool>,
+    completed: Arc<AtomicBool>,
+    next: Step,
+}
+#[task(state = Step)]
+impl Long {
+    fn config(&self) -> TaskConfig {
+        TaskConfig::minimal()
+    }
+    async fn run_bare(&self) -> Result<TaskResult<Step>, CanoError> {
+        self.started.store(true, SeqCst); // lets the test see that the task is in flight
+        tokio::time::sleep(Duration::from_secs(30)).await; // window in which tests cancel
+        self.completed.store(true, SeqCst); // only reached if the 30s sleep ran to completion
+        Ok(TaskResult::Single(self.next.clone())) // transition to the configured `next` state
+    }
+}
+
+// ---- saga steps (distinct types ⇒ distinct compensator keys) ----
+#[derive(Clone)]
+struct Reserve {
+    log: Arc<Mutex<Vec<String>>>,
+    fail_comp: bool,
+}
+#[saga::task(state = Step)]
+impl Reserve {
+    type Output = (); // nothing is handed to `compensate` (its `_o` parameter is `()`)
+    async fn run(&self, _res: &Resources) -> Result<(TaskResult<Step>, ()), CanoError> {
+        self.log.lock().unwrap().push("run:reserve".into()); // record the forward step
+        Ok((TaskResult::Single(Step::Charge), ())) // next state: Charge
+    }
+    async fn compensate(&self, _res: &Resources, _o: ()) -> Result<(), CanoError> {
+        self.log.lock().unwrap().push("rollback:reserve".into()); // logged even if we then fail
+        if self.fail_comp {
+            return Err(CanoError::task_execution("reserve rollback boom")); // simulated failure
+        }
+        Ok(())
+    }
+}
+
+#[derive(Clone)]
+struct Charge {
+    log: Arc<Mutex<Vec<String>>>,
+}
+#[saga::task(state = Step)]
+impl Charge {
+    type Output = (); // nothing is handed to `compensate` (its `_o` parameter is `()`)
+    async fn run(&self, _res: &Resources) -> Result<(TaskResult<Step>, ()), CanoError> {
+        self.log.lock().unwrap().push("run:charge".into()); // record the forward step
+        Ok((TaskResult::Single(Step::Ship), ())) // next state: Ship
+    }
+    async fn compensate(&self, _res: &Resources, _o: ()) -> Result<(), CanoError> {
+        self.log.lock().unwrap().push("rollback:charge".into()); // record the rollback
+        Ok(()) // this compensator never fails
+    }
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_drains_scheduled_saga_and_returns_to_idle() {
+    let log = Arc::new(Mutex::new(Vec::new())); // shared event log written by both saga steps
+    let ship_started = flag();
+    let wf = Workflow::bare()
+        .register_with_compensation(
+            Step::Reserve,
+            Reserve {
+                log: log.clone(),
+                fail_comp: false,
+            },
+        )
+        .register_with_compensation(Step::Charge, Charge { log: log.clone() })
+        .register(
+            Step::Ship,
+            Long {
+                started: ship_started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler.manual("order", wf, Step::Reserve).unwrap(); // the flow starts at Reserve
+    let running = scheduler.start().await.unwrap();
+
+    running.trigger("order").await.unwrap();
+    await_flag(&ship_started).await; // Reserve and Charge ran; Ship is now sleeping
+    running.cancel_flow("order").await.unwrap(); // cancel lands while Ship is in flight
+    await_not_running(&running, "order").await; // wait for the cancelled run to settle
+
+    let info = running.status("order").await.unwrap();
+    assert_eq!(info.status, Status::Idle, "clean cancel → Idle");
+    assert_eq!(info.failure_streak, 0, "cancel is not a backoff failure");
+    assert_eq!(
+        *log.lock().unwrap(),
+        vec![
+            "run:reserve".to_string(),
+            "run:charge".to_string(),
+            "rollback:charge".to_string(),
+            "rollback:reserve".to_string(),
+        ],
+        "saga must roll back in reverse on a scheduled cancel"
+    );
+    running.stop().await.unwrap();
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_dirty_rollback_saga_parks_in_backoff() {
+    // A cancel whose compensator FAILS surfaces as `compensation_failed`, which is
+    // a genuine fault: the flow lands in Backoff (default policy never trips), NOT Idle.
+    let log = Arc::new(Mutex::new(Vec::new()));
+    let ship_started = flag();
+    let wf = Workflow::bare()
+        .register_with_compensation(
+            Step::Reserve,
+            Reserve {
+                log: log.clone(),
+                fail_comp: true, // Reserve's rollback fails ⇒ dirty rollback
+            },
+        )
+        // Reserve.run transitions to Charge, so Charge must be registered for the
+        // chain to reach the long Ship step where the cancel lands.
+        .register_with_compensation(Step::Charge, Charge { log: log.clone() })
+        .register(
+            Step::Ship,
+            Long {
+                started: ship_started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler.manual("order", wf, Step::Reserve).unwrap(); // the flow starts at Reserve
+    let running = scheduler.start().await.unwrap();
+
+    running.trigger("order").await.unwrap();
+    await_flag(&ship_started).await; // Reserve and Charge ran; Ship is now sleeping
+    running.cancel_flow("order").await.unwrap(); // cancel lands while Ship is in flight
+    await_not_running(&running, "order").await; // wait for the cancelled run to settle
+
+    let info = running.status("order").await.unwrap();
+    assert!(
+        matches!(info.status, Status::Backoff { .. }),
+        "a dirty rollback is a failure → Backoff, got {:?}",
+        info.status
+    );
+    assert_eq!(info.failure_streak, 1); // the failed rollback counts as exactly one failure
+    running.stop().await.unwrap();
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn graceful_stop_rolls_back_in_flight_saga() {
+    let log = Arc::new(Mutex::new(Vec::new())); // shared event log written by both saga steps
+    let ship_started = flag();
+    let wf = Workflow::bare()
+        .register_with_compensation(
+            Step::Reserve,
+            Reserve {
+                log: log.clone(),
+                fail_comp: false,
+            },
+        )
+        .register_with_compensation(Step::Charge, Charge { log: log.clone() })
+        .register(
+            Step::Ship,
+            Long {
+                started: ship_started.clone(),
+                completed: flag(),
+                next: Step::Done,
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler.manual("order", wf, Step::Reserve).unwrap(); // the flow starts at Reserve
+    let running = scheduler.start().await.unwrap();
+
+    running.trigger("order").await.unwrap();
+    await_flag(&ship_started).await; // Reserve and Charge ran; Ship is now sleeping
+
+    let t_start = Instant::now(); // time the shutdown itself, not the run-up to it
+    running.stop().await.expect("graceful stop succeeds");
+    assert!(
+        t_start.elapsed() < Duration::from_secs(5),
+        "shutdown must cancel + drain, not wait 30s"
+    );
+    let events = log.lock().unwrap().clone(); // snapshot so the lock is released at once
+    assert!(events.contains(&"rollback:charge".to_string()));
+    assert!(events.contains(&"rollback:reserve".to_string()));
+}
+
+// ---- split: each child bumps `started`, sleeps 30s, then bumps `completed` ----
+#[derive(Clone)]
+struct SplitChild {
+    started: Arc<AtomicUsize>,
+    completed: Arc<AtomicUsize>,
+}
+#[task(state = Step)]
+impl SplitChild {
+    fn config(&self) -> TaskConfig {
+        TaskConfig::minimal()
+    }
+    async fn run_bare(&self) -> Result<TaskResult<Step>, CanoError> {
+        self.started.fetch_add(1, SeqCst); // counts children that began running
+        tokio::time::sleep(Duration::from_secs(30)).await; // window in which the test cancels
+        self.completed.fetch_add(1, SeqCst); // only reached if the sleep ran to completion
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_cancels_in_flight_split_flow() {
+    let started = counter(); // shared by all three children
+    let completed = counter(); // shared by all three children
+    let children: Vec<SplitChild> = (0..3)
+        .map(|_| SplitChild {
+            started: started.clone(),
+            completed: completed.clone(),
+        })
+        .collect();
+    let wf = Workflow::bare()
+        .register_split(
+            Step::Work,
+            children,
+            JoinConfig::new(JoinStrategy::All, Step::Done),
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler.manual("split", wf, Step::Work).unwrap();
+    let running = scheduler.start().await.unwrap();
+
+    running.trigger("split").await.unwrap();
+    let t_start = Instant::now();
+    while started.load(SeqCst) < 3 {
+        assert!(
+            t_start.elapsed() < Duration::from_secs(15),
+            "split children never all started"
+        );
+        tokio::time::sleep(Duration::from_millis(2)).await;
+    }
+    running.cancel_flow("split").await.unwrap(); // all three children are sleeping by now
+    await_not_running(&running, "split").await; // wait for the cancelled run to settle
+
+    tokio::time::sleep(Duration::from_millis(50)).await; // short grace period before checking
+    assert_eq!(
+        completed.load(SeqCst),
+        0,
+        "split children must be aborted, not completed"
+    );
+    assert_eq!(running.status("split").await.unwrap().status, Status::Idle);
+    running.stop().await.unwrap();
+}
+
+// ---- stepped: one 50ms step per call, finishing only once the cursor reaches 100_000 ----
+#[derive(Clone)]
+struct SlowStepper {
+    started: Arc<AtomicBool>,
+}
+#[task::stepped(state = Step)]
+impl SlowStepper {
+    async fn step(
+        &self,
+        _res: &Resources,
+        cursor: Option<u32>,
+    ) -> Result<StepOutcome<u32, Step>, CanoError> {
+        self.started.store(true, SeqCst); // set on every step; tests only need the first
+        tokio::time::sleep(Duration::from_millis(50)).await; // per-step delay
+        let n = cursor.unwrap_or(0); // a missing cursor is treated as step 0
+        if n >= 100_000 {
+            Ok(StepOutcome::Done(TaskResult::Single(Step::Done)))
+        } else {
+            Ok(StepOutcome::More(n + 1)) // hand back the cursor advanced by one
+        }
+    }
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_cancels_in_flight_stepped_flow() {
+    // `SlowStepper` sleeps 50ms per step and only finishes once its cursor reaches
+    // 100_000, so within this test's time budget the flow can only end via a cancel.
+    // The flag tells us when the first step has begun.
+    let stepping = flag();
+    let stepper = SlowStepper {
+        started: stepping.clone(),
+    };
+    let workflow = Workflow::bare()
+        .register_stepped(Step::Work, stepper)
+        .add_exit_state(Step::Done);
+
+    let mut sched = Scheduler::new();
+    sched.manual("stepped", workflow, Step::Work).unwrap();
+    let live = sched.start().await.unwrap();
+
+    live.trigger("stepped").await.unwrap();
+    await_flag(&stepping).await;
+    let cancelled_at = Instant::now();
+    live.cancel_flow("stepped").await.unwrap();
+    await_not_running(&live, "stepped").await;
+
+    // Measured from just before the cancel request until the run has settled.
+    assert!(
+        cancelled_at.elapsed() < Duration::from_secs(5),
+        "stepped loop aborts promptly"
+    );
+    let final_status = live.status("stepped").await.unwrap().status;
+    assert_eq!(final_status, Status::Idle);
+    live.stop().await.unwrap();
+}
+
+// ---- timer: `wait` requests a 30s delay; `after_wait` records that the timer fired ----
+#[derive(Clone)]
+struct SlowTimer {
+    started: Arc<AtomicBool>,
+    fired: Arc<AtomicBool>,
+}
+#[task::timer(state = Step)]
+impl SlowTimer {
+    async fn wait(&self, _res: &Resources) -> Result<TimerOutcome, CanoError> {
+        self.started.store(true, SeqCst); // lets the test see that the timer task was entered
+        Ok(TimerOutcome::Duration(Duration::from_secs(30))) // the delay the tests cancel
+    }
+    async fn after_wait(&self, _res: &Resources) -> Result<TaskResult<Step>, CanoError> {
+        self.fired.store(true, SeqCst); // the timer test asserts this never happens
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_cancels_in_flight_timer_flow() {
+    let started = flag(); // set by `SlowTimer::wait`
+    let fired = flag(); // set by `SlowTimer::after_wait`
+    let wf = Workflow::bare()
+        .register(
+            Step::Work,
+            SlowTimer {
+                started: started.clone(),
+                fired: fired.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler.manual("timer", wf, Step::Work).unwrap();
+    let running = scheduler.start().await.unwrap();
+
+    running.trigger("timer").await.unwrap();
+    await_flag(&started).await; // `SlowTimer::wait` has been called
+    running.cancel_flow("timer").await.unwrap();
+    await_not_running(&running, "timer").await; // wait for the cancelled run to settle
+
+    assert!(
+        !fired.load(SeqCst),
+        "after_wait must not run when the timer is cancelled"
+    );
+    assert_eq!(running.status("timer").await.unwrap().status, Status::Idle);
+    running.stop().await.unwrap();
+}
+
+// ---- poll: a poller that reports `Pending` on every call ----
+#[derive(Clone)]
+struct ForeverPoll {
+    started: Arc<AtomicBool>,
+}
+#[task::poll(state = Step)]
+impl ForeverPoll {
+    async fn poll(&self, _res: &Resources) -> Result<PollOutcome<Step>, CanoError> {
+        self.started.store(true, SeqCst); // set on every poll; tests only need the first
+        Ok(PollOutcome::Pending { delay_ms: 50 }) // never resolves: always Pending, 50ms delay
+    }
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_cancels_in_flight_poll_flow() {
+    // `ForeverPoll` always answers `Pending`, so this flow can only end via a cancel.
+    let polled = flag();
+    let poller = ForeverPoll {
+        started: polled.clone(),
+    };
+    let workflow = Workflow::bare()
+        .register(Step::Work, poller)
+        .add_exit_state(Step::Done);
+
+    let mut sched = Scheduler::new();
+    sched.manual("poll", workflow, Step::Work).unwrap();
+    let live = sched.start().await.unwrap();
+
+    live.trigger("poll").await.unwrap();
+    await_flag(&polled).await;
+    let cancelled_at = Instant::now();
+    live.cancel_flow("poll").await.unwrap();
+    await_not_running(&live, "poll").await;
+
+    assert!(
+        cancelled_at.elapsed() < Duration::from_secs(5),
+        "poll loop aborts promptly"
+    );
+    let final_status = live.status("poll").await.unwrap().status;
+    assert_eq!(final_status, Status::Idle);
+    live.stop().await.unwrap();
+}
+
+// ---- batch: three items, each taking 30s to process; `finish` records completion ----
+#[derive(Clone)]
+struct SlowBatch {
+    started: Arc<AtomicBool>,
+    finished: Arc<AtomicBool>,
+}
+#[task::batch(state = Step)]
+impl SlowBatch {
+    type Item = u32;
+    type ItemOutput = ();
+    async fn load(&self, _res: &Resources) -> Result<Vec<u32>, CanoError> {
+        Ok(vec![0, 1, 2]) // fixed three-item batch; the values themselves are never read
+    }
+    async fn process_item(&self, _item: &u32) -> Result<(), CanoError> {
+        self.started.store(true, SeqCst); // set by whichever item is processed first
+        tokio::time::sleep(Duration::from_secs(30)).await; // window in which the test cancels
+        Ok(())
+    }
+    async fn finish(
+        &self,
+        _res: &Resources,
+        _outputs: Vec<Result<(), CanoError>>,
+    ) -> Result<TaskResult<Step>, CanoError> {
+        self.finished.store(true, SeqCst); // the batch test asserts this never happens
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_cancels_in_flight_batch_flow() {
+    let started = flag(); // set by `SlowBatch::process_item`
+    let finished = flag(); // set by `SlowBatch::finish`
+    let wf = Workflow::bare()
+        .register(
+            Step::Work,
+            SlowBatch {
+                started: started.clone(),
+                finished: finished.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler.manual("batch", wf, Step::Work).unwrap();
+    let running = scheduler.start().await.unwrap();
+
+    running.trigger("batch").await.unwrap();
+    await_flag(&started).await; // at least one item has entered `process_item`
+    let t_start = Instant::now(); // time only the cancel-to-settled interval
+    running.cancel_flow("batch").await.unwrap();
+    await_not_running(&running, "batch").await; // wait for the cancelled run to settle
+
+    assert!(
+        t_start.elapsed() < Duration::from_secs(5),
+        "batch aborts promptly"
+    );
+    assert!(
+        !finished.load(SeqCst),
+        "finish must not run when the batch is cancelled"
+    );
+    assert_eq!(running.status("batch").await.unwrap().status, Status::Idle);
+    running.stop().await.unwrap();
+}
+
+// ---- schedule types: every / cron ----
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_on_every_flow_cancels_current_run_and_keeps_scheduling() {
+    // An interval flow: cancelling the in-flight run returns it to Idle and the
+    // loop keeps firing — a deliberate cancel must not stop future scheduled runs.
+    let started = counter();
+    let wf = Workflow::bare()
+        .register(
+            Step::Work,
+            CountingLong {
+                started: started.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler
+        .every("ticker", wf, Step::Work, Duration::from_millis(80))
+        .unwrap();
+    let running = scheduler.start().await.unwrap();
+
+    // First run is in flight — bounded, so a scheduling regression fails fast, not hangs.
+    let t0 = Instant::now();
+    while started.load(SeqCst) < 1 {
+        assert!(t0.elapsed() < Duration::from_secs(15), "first run missing");
+        tokio::time::sleep(Duration::from_millis(2)).await;
+    }
+    running.cancel_flow("ticker").await.unwrap();
+
+    // The loop must dispatch a *second* run after the cancel returns it to Idle.
+    let t_start = Instant::now();
+    while started.load(SeqCst) < 2 {
+        assert!(
+            t_start.elapsed() < Duration::from_secs(5),
+            "interval flow must keep scheduling after a cancel"
+        );
+        tokio::time::sleep(Duration::from_millis(5)).await;
+    }
+    let status = running.status("ticker").await.unwrap().status;
+    assert!(
+        !matches!(status, Status::Tripped { .. }),
+        "cancel must not trip an interval flow"
+    );
+    running.stop().await.unwrap();
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_cancels_in_flight_cron_flow() {
+    // Cover the cron loop path (`spawn_cron_loop`): a per-second cron flow whose
+    // run is cancelled mid-flight returns to Idle.
+    let started = counter(); // bumped by `CountingLong` each time a run begins
+    let wf = Workflow::bare()
+        .register(
+            Step::Work,
+            CountingLong {
+                started: started.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler
+        .cron("cronflow", wf, Step::Work, "* * * * * *")
+        .unwrap();
+    let running = scheduler.start().await.unwrap();
+
+    // First per-second tick starts a run within ~1s.
+    let t_start = Instant::now();
+    while started.load(SeqCst) < 1 {
+        assert!(
+            t_start.elapsed() < Duration::from_secs(3),
+            "cron tick should fire"
+        );
+        tokio::time::sleep(Duration::from_millis(5)).await;
+    }
+    running.cancel_flow("cronflow").await.unwrap(); // the first run is sleeping by now
+    await_not_running(&running, "cronflow").await; // wait for the cancelled run to settle
+    assert_eq!(
+        running.status("cronflow").await.unwrap().status,
+        Status::Idle
+    );
+    running.stop().await.unwrap();
+}
+
+// A 30s task that counts how many runs have started — used by the interval/cron tests.
+#[derive(Clone)]
+struct CountingLong {
+    started: Arc<AtomicUsize>,
+}
+#[task(state = Step)]
+impl CountingLong {
+    fn config(&self) -> TaskConfig {
+        TaskConfig::minimal()
+    }
+    async fn run_bare(&self) -> Result<TaskResult<Step>, CanoError> {
+        self.started.fetch_add(1, SeqCst); // one increment per run that begins
+        tokio::time::sleep(Duration::from_secs(30)).await; // window in which tests cancel
+        Ok(TaskResult::Single(Step::Done))
+    }
+}
+
+// ---- stepped + checkpoint store (recovery/testing) ----
+#[cfg(feature = "testing")]
+#[tokio::test(flavor = "multi_thread")]
+async fn cancel_flow_cancels_checkpointed_stepped_flow() {
+    use cano::testing::InMemoryCheckpointStore;
+
+    let started = flag();
+    let store = Arc::new(InMemoryCheckpointStore::new());
+    let wf = Workflow::bare()
+        .with_checkpoint_store(store) // moved: the test never reads the store afterwards
+        .with_workflow_id("stepped-ckpt")
+        .register_stepped(
+            Step::Work,
+            SlowStepper {
+                started: started.clone(),
+            },
+        )
+        .add_exit_state(Step::Done);
+
+    let mut scheduler = Scheduler::new();
+    scheduler.manual("stepped", wf, Step::Work).unwrap();
+    let running = scheduler.start().await.unwrap();
+
+    running.trigger("stepped").await.unwrap();
+    await_flag(&started).await; // the first step has begun
+    running.cancel_flow("stepped").await.unwrap();
+    await_not_running(&running, "stepped").await; // wait for the cancelled run to settle
+
+    assert_eq!(
+        running.status("stepped").await.unwrap().status,
+        Status::Idle
+    );
+    running.stop().await.unwrap();
+}
diff --git a/cano/tests/testing_module_e2e.rs b/cano/tests/testing_module_e2e.rs
index 3c879ba..8f95164 100644
--- a/cano/tests/testing_module_e2e.rs
+++ b/cano/tests/testing_module_e2e.rs
@@ -41,7 +41,12 @@ async fn recording_observer_captures_path_and_checkpoints() {
         .with_checkpoint_store(store.clone())
         .with_workflow_id("run-1");
 
-    assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done);
+    assert_eq!(
+        wf.orchestrate(S::Start, CancellationToken::disabled())
+            .await
+            .unwrap(),
+        S::Done
+    );
     observer.assert_path(&["Start", "Work", "Done"]);
     observer.assert_completed_with("Done");
 
@@ -88,7 +93,7 @@ async fn in_memory_store_supports_resume_after_failure() {
 
     // Generation 1: crashes at Work. The log keeps Start + Work entries.
     build(runs.clone())
-        .orchestrate(S::Start)
+        .orchestrate(S::Start, CancellationToken::disabled())
         .await
         .expect_err("generation 1 must fail at Work");
     assert_eq!(store.load_run(wf_id).await.unwrap().len(), 2);
@@ -96,7 +101,13 @@ async fn in_memory_store_supports_resume_after_failure() {
     // Generation 2: resume re-enters at Work, re-runs it (now succeeds), reaches Done.
     let observer = Arc::new(RecordingObserver::new());
     let resumed = build(runs.clone()).with_observer(observer.clone());
-    assert_eq!(resumed.resume_from(wf_id).await.unwrap(), S::Done);
+    assert_eq!(
+        resumed
+            .resume_from(wf_id, CancellationToken::disabled())
+            .await
+            .unwrap(),
+        S::Done
+    );
     assert!(
         observer
             .events()
@@ -115,7 +126,10 @@ async fn panic_on_attempt_fails_fast_with_panic_error() {
         .register(S::Start, panic_on_attempt(1, S::Done))
         .add_exit_state(S::Done)
         .with_observer(observer.clone());
-    let err = wf.orchestrate(S::Start).await.unwrap_err();
+    let err = wf
+        .orchestrate(S::Start, CancellationToken::disabled())
+        .await
+        .unwrap_err();
     assert!(err.to_string().contains("panic"), "{err}");
     // Panics are not retried — no retry event recorded.
     assert!(
@@ -191,7 +205,9 @@ async fn assert_compensation_ran_matches_reverse_order() {
         .register_with_compensation(S::Work, Step2)
         .register(S::Finish, Fail)
         .add_exit_state(S::Done);
-    let _ = saga.orchestrate(S::Start).await; // fails, rolls back
+    let _ = saga
+        .orchestrate(S::Start, CancellationToken::disabled())
+        .await; // fails, rolls back
 
     let ran = handle.0.lock().unwrap().clone();
     // Step2 completed last, so it compensates first; then Step1.
diff --git a/cano/tests/testing_state_coverage.rs b/cano/tests/testing_state_coverage.rs
index 98699f0..24b579b 100644
--- a/cano/tests/testing_state_coverage.rs
+++ b/cano/tests/testing_state_coverage.rs
@@ -47,7 +47,9 @@ async fn router_hop_is_counted() {
         .register(S::Worker, Go(S::Done))
         .add_exit_state(S::Done)
         .with_observer(observer.clone());
-    wf.orchestrate(S::Start).await.unwrap();
+    wf.orchestrate(S::Start, CancellationToken::disabled())
+        .await
+        .unwrap();
     observer
         .assert_registered_states_entered(&wf)
         .expect("router hops counted");
@@ -61,7 +63,9 @@ async fn unregistered_routed_state_returns_err_does_not_panic() {
         .register(S::Orphan, Go(S::Done))
         .add_exit_state(S::Done)
         .with_observer(observer.clone());
-    wf.orchestrate(S::Start).await.unwrap();
+    wf.orchestrate(S::Start, CancellationToken::disabled())
+        .await
+        .unwrap();
     let missing = observer.assert_registered_states_entered(&wf).unwrap_err();
     assert!(missing.contains(&"Orphan".to_string()), "got: {missing:?}");
 }
@@ -73,7 +77,9 @@ async fn multi_state_with_explicit_list() {
         .register(S::Start, Go(S::Done))
         .add_exit_state(S::Done)
         .with_observer(observer.clone());
-    wf.orchestrate(S::Start).await.unwrap();
+    wf.orchestrate(S::Start, CancellationToken::disabled())
+        .await
+        .unwrap();
     let missing = observer
         .assert_all_states_entered(&[S::Start, S::Route, S::Worker])
         .unwrap_err();
diff --git a/cano/tests/timer_task_e2e.rs b/cano/tests/timer_task_e2e.rs
index 6569b22..bac917a 100644
--- a/cano/tests/timer_task_e2e.rs
+++ b/cano/tests/timer_task_e2e.rs
@@ -51,7 +51,10 @@ async fn duration_timer_fires_after_expected_delay() {
         .register(Step::Process, Process)
         .add_exit_state(Step::Done);
 
-    let result = workflow.orchestrate(Step::Wait).await.unwrap();
+    let result = workflow
+        .orchestrate(Step::Wait, CancellationToken::disabled())
+        .await
+        .unwrap();
     let elapsed = start.elapsed();
 
     assert_eq!(result, Step::Done);
@@ -92,7 +95,7 @@ async fn instant_timer_fires_at_correct_instant() {
         .register(Step::Wait, NearFutureInstantTimer)
         .register(Step::Process, Process)
         .add_exit_state(Step::Done)
-        .orchestrate(Step::Wait)
+        .orchestrate(Step::Wait, CancellationToken::disabled())
         .await
         .unwrap();
 
@@ -132,7 +135,7 @@ async fn attempt_timeout_cancels_long_timer() {
         .register(Step::Wait, OneHourTimer)
         .register(Step::Process, Process)
         .add_exit_state(Step::Done)
-        .orchestrate(Step::Wait)
+        .orchestrate(Step::Wait, CancellationToken::disabled())
         .await
         .unwrap_err();
 
diff --git a/cano/tests/tracing_tests.rs b/cano/tests/tracing_tests.rs
index 0aaa896..b6cbfe3 100644
--- a/cano/tests/tracing_tests.rs
+++ b/cano/tests/tracing_tests.rs
@@ -51,7 +51,10 @@ async fn test_workflow_with_tracing_span() {
         .register(TestState::Processing, TestTask::new("processing"))
         .add_exit_state(TestState::Complete);
 
-    let result = workflow.orchestrate(TestState::Start).await.unwrap();
+    let result = workflow
+        .orchestrate(TestState::Start, CancellationToken::disabled())
+        .await
+        .unwrap();
 
     assert_eq!(result, TestState::Complete);
 }
@@ -71,7 +74,10 @@ async fn test_concurrent_workflow_with_tracing_span() {
         .register(TestState::Processing, TestTask::new("processing"))
         .add_exit_state(TestState::Complete);
 
-    let result = workflow.orchestrate(TestState::Start).await.unwrap();
+    let result = workflow
+        .orchestrate(TestState::Start, CancellationToken::disabled())
+        .await
+        .unwrap();
 
     assert_eq!(result, TestState::Complete);
 }
@@ -117,7 +123,10 @@ async fn test_workflow_tracing_without_custom_span() {
         .register(TestState::Processing, TestTask::new("processing"))
         .add_exit_state(TestState::Complete);
 
-    let result = workflow.orchestrate(TestState::Start).await.unwrap();
+    let result = workflow
+        .orchestrate(TestState::Start, CancellationToken::disabled())
+        .await
+        .unwrap();
 
     assert_eq!(result, TestState::Complete);
 }
@@ -217,7 +226,13 @@ async fn test_tracing_observer_runs_workflow() {
         )
         .add_exit_state(Flow::Done)
         .with_observer(Arc::new(TracingObserver::new()));
-    assert_eq!(workflow.orchestrate(Flow::A).await.unwrap(), Flow::Done);
+    assert_eq!(
+        workflow
+            .orchestrate(Flow::A, CancellationToken::disabled())
+            .await
+            .unwrap(),
+        Flow::Done
+    );
 }
 
 #[derive(Clone)]
@@ -255,7 +270,12 @@ async fn test_tracing_observer_captures_events() {
         .register(Flow::A, AlwaysFail)
         .add_exit_state(Flow::Done)
         .with_observer(Arc::new(TracingObserver::new()));
-    assert!(fail_wf.orchestrate(Flow::A).await.is_err());
+    assert!(
+        fail_wf
+            .orchestrate(Flow::A, CancellationToken::disabled())
+            .await
+            .is_err()
+    );
 
     // Pre-tripped breaker → on_circuit_open + on_task_failure.
     let breaker = Arc::new(CircuitBreaker::new(CircuitPolicy {
@@ -274,7 +294,10 @@ async fn test_tracing_observer_captures_events() {
         )
         .add_exit_state(Flow::Done)
         .with_observer(Arc::new(TracingObserver::new()));
-    let cb_err = cb_wf.orchestrate(Flow::B).await.unwrap_err();
+    let cb_err = cb_wf
+        .orchestrate(Flow::B, CancellationToken::disabled())
+        .await
+        .unwrap_err();
     // The FSM wraps task failures with state context; `.inner()` peels one layer.
     assert!(matches!(cb_err.inner(), CanoError::CircuitOpen(_)));
 
diff --git a/docs/content/_index.md b/docs/content/_index.md
index d199521..41fbfe7 100644
--- a/docs/content/_index.md
+++ b/docs/content/_index.md
@@ -202,7 +202,7 @@ async fn main() -> Result<(), CanoError> {
         .register(WorkflowState::Process, DoneTask)
         .add_exit_state(WorkflowState::Complete);
 
-    workflow.orchestrate(WorkflowState::Start).await?;
+    workflow.orchestrate(WorkflowState::Start, CancellationToken::disabled()).await?;
     Ok(())
 }
 
diff --git a/docs/content/metrics/_index.md b/docs/content/metrics/_index.md
index 34fa5c7..359700d 100644
--- a/docs/content/metrics/_index.md
+++ b/docs/content/metrics/_index.md
@@ -108,6 +108,7 @@ Workflow::bare()
 <li><code>cano_circuit_open_events_total{task}</code> — on <code>on_circuit_open</code></li>
 <li><code>cano_checkpoints_observed_total</code> — on <code>on_checkpoint</code></li>
 <li><code>cano_resumes_total</code> — on <code>on_resume</code></li>
+<li><code>cano_observed_cancellations_total</code> — on <code>on_cancelled</code> (run cancelled via a <code>CancellationToken</code>)</li>
 </ul>
 <p>
 <code>on_task_start</code> is intentionally <em>not</em> counted — every dispatch already shows up in
@@ -408,7 +409,7 @@ async fn main() {
     // 2. Run the workflow a few times directly.
     for _ in 0..3 {
         workflow()
-            .orchestrate(Step::Fetch)
+            .orchestrate(Step::Fetch, CancellationToken::disabled())
             .await
             .expect("workflow run");
     }
diff --git a/docs/content/observers/_index.md b/docs/content/observers/_index.md
index 8da2137..a2fba75 100644
--- a/docs/content/observers/_index.md
+++ b/docs/content/observers/_index.md
@@ -86,7 +86,7 @@ async fn main() -> Result<(), CanoError> {
         .add_exit_state(Step::Done)
         .with_observer(counter.clone());
 
-    workflow.orchestrate(Step::Start).await?;
+    workflow.orchestrate(Step::Start, CancellationToken::disabled()).await?;
     assert_eq!(counter.0.load(Ordering::Relaxed), 1);
     Ok(())
 }
@@ -151,6 +151,15 @@ before the compensation drain runs. Followed on the public API's return by
 <code>CanoError::WithStateContext</code> wrapping a <code>CanoError::WorkflowTimeout</code> (clean
 rollback) or <code>CanoError::CompensationFailed</code> if a <code>compensate</code> also fails
 (its <code>errors[0]</code> carries the wrapped timeout).</p>
+
+<h3 id="on-cancelled"><a href="#on-cancelled" class="anchor-link" aria-hidden="true">#</a><code>on_cancelled(state: &amp;str)</code></h3>
+<p>Fired when a run is cancelled via a
+<a href="../resilience/#cancellation"><code>CancellationToken</code></a> — either observed at a state
+boundary or while a cancellable task was in flight. <code>state</code> is the <code>Debug</code>
+rendering of the state the cancel was observed at. Fires exactly once per cancelled run, before the
+compensation drain. Followed on the public API's return by <code>CanoError::WithStateContext</code>
+wrapping a <code>CanoError::Cancelled</code> (clean rollback) or <code>CanoError::CompensationFailed</code>
+whose <code>errors[0]</code> is the wrapped cancel (dirty rollback).</p>
 </div>
 </div>
 
@@ -260,7 +269,7 @@ async fn main() -> Result<(), CanoError> {
         .add_exit_state(Step::Done)
         .with_observer(Arc::new(TracingObserver::new()));
 
-    workflow.orchestrate(Step::Start).await?;
+    workflow.orchestrate(Step::Start, CancellationToken::disabled()).await?;
     Ok(())
 }
 ```
@@ -284,6 +293,7 @@ Because the events carry the <code>cano::observer</code> target, you can filter
 <tr><td><code>on_checkpoint</code></td><td><code>DEBUG</code></td><td><code>"checkpoint appended"</code></td><td><code>workflow_id</code>, <code>sequence</code></td></tr>
 <tr><td><code>on_resume</code></td><td><code>INFO</code></td><td><code>"workflow resumed from checkpoint"</code></td><td><code>workflow_id</code>, <code>sequence</code></td></tr>
 <tr><td><code>on_workflow_timeout</code></td><td><code>WARN</code></td><td><code>"workflow total timeout exceeded"</code></td><td><code>elapsed_ms</code>, <code>limit_ms</code></td></tr>
+<tr><td><code>on_cancelled</code></td><td><code>WARN</code></td><td><code>"workflow cancelled"</code></td><td><code>state</code></td></tr>
 </tbody>
 </table>
 <hr class="section-divider">
@@ -420,7 +430,7 @@ async fn main() -> Result<(), CanoError> {
         .add_exit_state(Step::Done)
         .with_observer(observer.clone());
 
-    workflow.orchestrate(Step::Load).await?;
+    workflow.orchestrate(Step::Load, CancellationToken::disabled()).await?;
     assert_eq!(observer.failures.load(Ordering::Relaxed), 0);
     Ok(())
 }
diff --git a/docs/content/recovery/_index.md b/docs/content/recovery/_index.md
index 6f96d09..ab6b4f8 100644
--- a/docs/content/recovery/_index.md
+++ b/docs/content/recovery/_index.md
@@ -85,7 +85,7 @@ async fn main() -> Result<(), CanoError> {
         .with_checkpoint_store(checkpoint_store)
         .with_workflow_id("run-42");
 
-    workflow.orchestrate(Step::Start).await?;
+    workflow.orchestrate(Step::Start, CancellationToken::disabled()).await?;
     Ok(())
 }
 ```
@@ -175,7 +175,7 @@ async fn main() -> Result<(), CanoError> {
         .with_checkpoint_store(checkpoint_store);
 
     // Some earlier process crashed mid-run; pick up where it left off.
-    let final_state = workflow.resume_from("run-42").await?;
+    let final_state = workflow.resume_from("run-42", CancellationToken::disabled()).await?;
     assert_eq!(final_state, Step::Done);
     Ok(())
 }
@@ -435,10 +435,10 @@ async fn main() -> Result<(), CanoError> {
         .with_workflow_id("demo-run");
 
     // Run 1: crashes inside ProcessTask. The Start and Process rows are already durable.
-    let _ = workflow.orchestrate(Step::Start).await;
+    let _ = workflow.orchestrate(Step::Start, CancellationToken::disabled()).await;
 
     // Run 2: resume — re-runs ProcessTask (now it succeeds) and finishes at Done.
-    let final_state = workflow.resume_from("demo-run").await?;
+    let final_state = workflow.resume_from("demo-run", CancellationToken::disabled()).await?;
     assert_eq!(final_state, Step::Done);
 
     // The append-only log: Start, Process (crash), Process (re-run), Finalize, Done —
diff --git a/docs/content/resilience/_index.md b/docs/content/resilience/_index.md
index 72a2fb4..2816c84 100644
--- a/docs/content/resilience/_index.md
+++ b/docs/content/resilience/_index.md
@@ -30,7 +30,8 @@ probes — lives in <a href="../recovery/">Recovery</a>, <a href="../saga/">Saga
 <li><a href="#workflow-total-timeout">Workflow Total Timeout</a></li>
 <li class="toc-sub"><a href="#workflow-total-timeout-compensation">Compensation drain budget</a></li>
 <li class="toc-sub"><a href="#workflow-total-timeout-observer">Observer hook</a></li>
-<li class="toc-sub"><a href="#workflow-total-timeout-vs">The three timeout knobs</a></li>
+<li class="toc-sub"><a href="#workflow-total-timeout-vs">The two timeout knobs</a></li>
+<li><a href="#cancellation">Cooperative Cancellation</a></li>
 <li><a href="#cb-rl-guides">Circuit Breakers &amp; Rate Limiting</a></li>
 <li><a href="#bulkhead">Bulkheads (split concurrency)</a></li>
 <li><a href="#panic-safety">Panic Safety</a></li>
@@ -101,8 +102,7 @@ impl CallTask {
 ```
 
 <p>Distinct from <code>Workflow::with_total_timeout</code> (the wall-clock budget for the entire
-orchestration — see below) and from the legacy <code>Workflow::with_timeout</code> (a blunt outer
-<code>tokio::time::timeout</code> with no graceful compensation). The full <code>TaskConfig</code> /
+orchestration — see below). The full <code>TaskConfig</code> /
 <code>RetryMode</code> API — including how attempt timeouts compose with each retry mode — lives in
 <a href="../task/configuration/">Tasks → Configuration &amp; Retries</a>.</p>
 <hr class="section-divider">
@@ -158,19 +158,18 @@ fields under the <code>cano::observer</code> target. See <a href="../observers/#
 Lifecycle Events</a> for the full hook reference.
 </p>
 
-<h3 id="workflow-total-timeout-vs"><a href="#workflow-total-timeout-vs" class="anchor-link" aria-hidden="true">#</a>The three timeout knobs</h3>
+<h3 id="workflow-total-timeout-vs"><a href="#workflow-total-timeout-vs" class="anchor-link" aria-hidden="true">#</a>The two timeout knobs</h3>
 <table class="styled-table">
 <thead><tr><th>API</th><th>Scope</th><th>On expiry</th><th>Compensation drain</th></tr></thead>
 <tbody>
 <tr><td><code>TaskConfig::with_attempt_timeout</code></td><td>One attempt of one task</td><td><code>CanoError::Timeout</code> — retried like any other failure; final timeout becomes <code>RetryExhausted</code></td><td>Triggered like any other terminal task error (unbounded)</td></tr>
 <tr><td><code>Workflow::with_total_timeout</code></td><td>The entire <code>orchestrate</code> / <code>resume_from</code> call</td><td>In-flight task aborted; <code>CanoError::WorkflowTimeout</code> (wrapped in <code>WithStateContext</code>)</td><td>Bounded by <code>with_compensation_timeout</code> or the default <code>min(remaining/2, 30s)</code></td></tr>
-<tr><td><code>Workflow::with_timeout</code> <em>(legacy)</em></td><td>The whole orchestration future</td><td><code>CanoError::Workflow("Workflow timeout exceeded")</code> — no graceful abort</td><td><strong>None</strong> — the future is dropped abruptly</td></tr>
 </tbody>
 </table>
 <p>
-Pick <code>with_total_timeout</code> for any new code that needs a workflow-wide budget. The legacy
-<code>with_timeout</code> remains for backward compatibility and composes naturally — if both are
-set, whichever fires first wins.
+Reach for <code>with_attempt_timeout</code> to bound a single call and <code>with_total_timeout</code>
+for a workflow-wide budget; they compose. To stop a run on an external signal rather than a deadline,
+use <a href="#cancellation">cooperative cancellation</a>.
 </p>
 
 <div class="callout callout-tip">
@@ -180,6 +179,78 @@ and the final error is the wrapped <code>WorkflowTimeout</code>.</p>
 </div>
 <hr class="section-divider">
 
+<h2 id="cancellation"><a href="#cancellation" class="anchor-link" aria-hidden="true">#</a>Cooperative Cancellation</h2>
+<p>
+Where a total timeout aborts a run on a <em>deadline</em>, cancellation aborts it on a <em>signal</em>
+you control — a shutdown handler, a user "stop" button, a parent task giving up.
+<code>Workflow::orchestrate(start, token)</code> and <code>resume_from(id, token)</code> always take
+a <code>CancellationToken</code>; firing the paired <code>CancellationHandle</code> aborts the
+in-flight task at its next await point, drains the
+<a href="../saga/">saga compensation stack</a>, and returns <code>CanoError::Cancelled</code> wrapped
+in <code>CanoError::WithStateContext</code> (or <code>CanoError::CompensationFailed</code> if a
+<code>compensate</code> also fails).
+</p>
+
+```rust
+use cano::CancellationToken;
+
+let (handle, token) = CancellationToken::new();
+
+// Cancel from anywhere — a signal handler, a sibling task, a timer:
+let canceller = tokio::spawn(async move {
+    shutdown_signal().await;
+    handle.cancel(); // idempotent; the handle is Clone, so many owners can trigger it
+});
+
+let result = workflow.orchestrate(Step::Reserve, token).await;
+assert!(matches!(result, Err(e) if e.category() == "cancelled"));
+```
+
+<p>
+To opt a run out of cancellation, pass <code>CancellationToken::disabled()</code> instead of a live
+token: <code>workflow.orchestrate(start, CancellationToken::disabled())</code>. A disabled token
+never fires and is zero-cost — the cancellation <code>select!</code> is skipped entirely.
+</p>
+
+<div class="callout callout-warning">
+<p><strong>Cancellation is cooperative.</strong> The engine drops the running task's future at its next
+<code>.await</code>. A task spinning in a tight synchronous/CPU loop with no <code>.await</code> is not
+interrupted until it next yields — design long-running task bodies to <code>.await</code> periodically
+if they must be cancellable.</p>
+</div>
+
+<h3 id="cancellation-saga"><a href="#cancellation-saga" class="anchor-link" aria-hidden="true">#</a>Saga safety</h3>
+<p>
+A <a href="../saga/">compensatable task</a> is <strong>never</strong> interrupted mid-run. Aborting it
+after an in-task side effect committed but before its <code>Output</code> reached the compensation
+stack would orphan that side effect with nothing to roll back. So a <code>CompensatableTask</code>
+always runs to completion (recording its rollback entry); the cancellation is honoured at the
+<em>next</em> state boundary, which then drains the now-complete stack. The compensation drain itself
+is uncancellable — a cancel that lands during rollback does not abort the remaining compensators.
+</p>
+
+<h3 id="cancellation-observer"><a href="#cancellation-observer" class="anchor-link" aria-hidden="true">#</a>Observer hook &amp; precedence</h3>
+<p>
+A <code>WorkflowObserver</code> receives one <code>on_cancelled(state)</code> call when the cancel is
+observed, before the drain runs; <code>TracingObserver</code> re-emits it as a <code>WARN</code> event
+and <code>MetricsObserver</code> increments <code>cano_observed_cancellations_total</code>. Against
+<code>with_total_timeout</code>, cancellation wins: it is checked deterministically at each state
+boundary and biased ahead of the per-state budget mid-task.
+</p>
+
+<div class="callout callout-tip">
+<p>The scheduler builds on this: <a href="../scheduler/#cancellation"><code>RunningScheduler::cancel_flow(id)</code></a>
+cancels an in-flight scheduled run, and graceful <code>stop()</code> cancels every in-flight flow
+(rolling back their sagas) instead of waiting for them to finish.</p>
+</div>
+
+<div class="callout callout-tip">
+<p>Runnable example: <code>cargo run --example workflow_cancellation</code> — a 3-step saga where a
+sibling task cancels the shipping step mid-flight; the prior steps' compensations run in reverse and
+the final error is the wrapped <code>Cancelled</code>.</p>
+</div>
+<hr class="section-divider">
+
 <h2 id="cb-rl-guides"><a href="#cb-rl-guides" class="anchor-link" aria-hidden="true">#</a>Circuit Breakers &amp; Rate Limiting</h2>
 <p>Two of the most-used resilience primitives have their own dedicated pages:</p>
 <ul>
diff --git a/docs/content/resources/_index.md b/docs/content/resources/_index.md
index c51f4f1..06fc3d1 100644
--- a/docs/content/resources/_index.md
+++ b/docs/content/resources/_index.md
@@ -86,7 +86,7 @@ async fn main() -> Result<(), CanoError> {
         .register(Step::Init, InitTask)
         .add_exit_state(Step::Done);
 
-    workflow.orchestrate(Step::Init).await?;
+    workflow.orchestrate(Step::Init, CancellationToken::disabled()).await?;
     Ok(())
 }
 
diff --git a/docs/content/saga/_index.md b/docs/content/saga/_index.md
index b4c5e0a..31cd2d7 100644
--- a/docs/content/saga/_index.md
+++ b/docs/content/saga/_index.md
@@ -325,7 +325,7 @@ async fn main() {
         .register(Step::Ship, ShipOrder)                      // plain — and it fails
         .add_exit_state(Step::Done);
 
-    match workflow.orchestrate(Step::Reserve).await {
+    match workflow.orchestrate(Step::Reserve, CancellationToken::disabled()).await {
         Ok(state) => println!("completed at {state:?}"),
         Err(error) => println!("failed, rolled back: {error}"), // "courier unavailable" — the original error
     }
diff --git a/docs/content/scheduler/_index.md b/docs/content/scheduler/_index.md
index 471759c..b733da6 100644
--- a/docs/content/scheduler/_index.md
+++ b/docs/content/scheduler/_index.md
@@ -22,6 +22,7 @@ template = "section.html"
 <li class="toc-sub"><a href="#manual-triggering">Manual Triggering</a></li>
 <li class="toc-sub"><a href="#mixed-scheduling">Mixed Scheduling</a></li>
 <li><a href="#backoff-guide">Backoff &amp; Trip State</a></li>
+<li><a href="#cancellation">Cancelling a Flow</a></li>
 <li><a href="#graceful-shutdown">Graceful Shutdown</a></li>
 <li><a href="#multi-level-map-reduce">Advanced: Multi-Level Map-Reduce</a></li>
 </ol>
@@ -57,7 +58,7 @@ override it per flow via <code>set_backoff</code>. <code>Scheduler</code> is <st
 <li><strong><code>RunningScheduler</code></strong> is the <em>live handle</em> returned by
 <code>scheduler.start().await?</code>. It owns the spawned driver and per-flow loop tasks. It is cheap to
 clone — every clone shares the same command channel and flow registry, so you can call <code>trigger</code>,
-<code>status</code>, <code>list</code>, <code>reset_flow</code>, and <code>stop</code> from any task.</li>
+<code>status</code>, <code>list</code>, <code>reset_flow</code>, <code>cancel_flow</code>, and <code>stop</code> from any task.</li>
 </ul>
 <p>
 <code>start</code> consumes the builder, so the compiler prevents you from starting the same scheduler
@@ -490,24 +491,46 @@ on a dedicated page:</p>
 </ul>
 <hr class="section-divider">
 
+<h2 id="cancellation"><a href="#cancellation" class="anchor-link" aria-hidden="true">#</a>Cancelling a Flow</h2>
+<p>
+<code>RunningScheduler::cancel_flow(id)</code> requests cooperative cancellation of a flow's in-flight
+run. The running workflow aborts at its next await point, its <a href="../saga/">saga compensation
+stack</a> drains (rolling back completed steps), and the flow returns to <code>Status::Idle</code>. A
+deliberate cancel is <strong>not</strong> counted as a failure against the
+<a href="#backoff-guide">BackoffPolicy</a> — the failure streak is left untouched, so a cancel can never
+trip the flow and its next scheduled run fires normally. Cancelling a flow that isn't currently running
+is an idempotent no-op.
+</p>
+
+```rust
+// Stop the in-flight run of a flow; its saga rolls back and the flow goes Idle.
+running.cancel_flow("order").await?;
+```
+
+<div class="callout callout-tip">
+<p>Runnable example: <code>cargo run --example scheduler_cancellation --features scheduler</code> —
+triggers a saga, cancels it mid-flight, and watches the compensators roll back in reverse.</p>
+</div>
+<hr class="section-divider">
+
 <h2 id="graceful-shutdown"><a href="#graceful-shutdown" class="anchor-link" aria-hidden="true">#</a>Graceful Shutdown</h2>
 <p>
-The scheduler supports graceful shutdown, allowing currently running workflows to complete before stopping.
-This includes workflows started by interval or cron triggers as well as manually-triggered workflows.
-All active executions are tracked and included in the shutdown wait.
+When <code>stop()</code> is called, the scheduler signals all scheduling loops to stop and then
+<strong>cooperatively cancels every in-flight flow</strong> — interval, cron, and manual alike — the
+same way <code>cancel_flow</code> does: each running workflow aborts at its next await and drains its
+saga before the scheduler runs resource <code>teardown</code> (reverse registration order) and returns.
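+Shutdown latency is therefore bounded by the time to the next await (or, for a
+<a href="../resilience/#cancellation-saga">compensatable task</a>, the end of that task) plus the compensation drain — not by how long the workflows would naturally take.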
 </p>
 
 ```rust
-// Stop the scheduler and wait for running flows to finish.
+// Stop the scheduler: in-flight flows are cancelled + rolled back, then teardown runs.
 running.stop().await?;
-
 ```
 
 <p>
-When <code>stop()</code> is called, the scheduler signals all scheduling loops to stop,
-waits up to 30 seconds for any in-progress workflow executions to finish, and runs each
-workflow's resource <code>teardown_all</code> in reverse registration order before returning.
-A second <code>stop()</code> call after success is idempotent — it returns the same cached result.
+A bounded wait (up to 30 seconds) caps the drain; a second <code>stop()</code> after success is
+idempotent — it returns the same cached result.
 </p>
 
 <div class="callout callout-tip">
diff --git a/docs/content/split-join/_index.md b/docs/content/split-join/_index.md
index c193bd6..6b18360 100644
--- a/docs/content/split-join/_index.md
+++ b/docs/content/split-join/_index.md
@@ -253,7 +253,7 @@ async fn main() -> Result<(), CanoError> {
         .register(DataState::Aggregate, Aggregator)
         .add_exit_state(DataState::Complete);
 
-    let result = workflow.orchestrate(DataState::Start).await?;
+    let result = workflow.orchestrate(DataState::Start, CancellationToken::disabled()).await?;
     let final_result: i32 = store.get("final_result")?;
     println!("Workflow completed: {:?} — total {}", result, final_result);
     Ok(())
diff --git a/docs/content/split-join/parallel-patterns.md b/docs/content/split-join/parallel-patterns.md
index 9d3d5d2..b98a0e8 100644
--- a/docs/content/split-join/parallel-patterns.md
+++ b/docs/content/split-join/parallel-patterns.md
@@ -128,7 +128,7 @@ async fn main() -> Result<(), CanoError> {
         .add_exit_state(QueueState::Complete);
 
     loop {
-        let result = workflow.orchestrate(QueueState::PullBatch).await?;
+        let result = workflow.orchestrate(QueueState::PullBatch, CancellationToken::disabled()).await?;
         if result == QueueState::Complete && queue.lock().await.is_empty() {
             break;
         }
@@ -208,7 +208,7 @@ async fn main() -> Result<(), CanoError> {
         .register(DataState::Aggregate, FinishAggregate)
         .add_exit_state(DataState::Complete);
 
-    workflow.orchestrate(DataState::LoadRecords).await?;
+    workflow.orchestrate(DataState::LoadRecords, CancellationToken::disabled()).await?;
     Ok(())
 }
 ```
diff --git a/docs/content/stepped-task/_index.md b/docs/content/stepped-task/_index.md
index 798ad73..74a5b32 100644
--- a/docs/content/stepped-task/_index.md
+++ b/docs/content/stepped-task/_index.md
@@ -197,7 +197,7 @@ let workflow = Workflow::new(resources)
     .add_exit_state(Stage::Done);
 
 // First run crashes after 600/1000 steps. Restart:
-// let result = workflow.resume_from("nightly-crunch").await?;
+// let result = workflow.resume_from("nightly-crunch", CancellationToken::disabled()).await?;
 // step() is first called with Some(Progress { processed: 600, .. }) — not None.
 ```
 </div>
diff --git a/docs/content/store/_index.md b/docs/content/store/_index.md
index a9aec8a..95e3437 100644
--- a/docs/content/store/_index.md
+++ b/docs/content/store/_index.md
@@ -300,7 +300,7 @@ async fn main() -> Result<(), CanoError> {
         .register(Stage::Transform, TransformTask)
         .add_exit_state(Stage::Complete);
 
-    workflow.orchestrate(Stage::Ingest).await?;
+    workflow.orchestrate(Stage::Ingest, CancellationToken::disabled()).await?;
 
     // Read results after the workflow completes
     let result: Vec<u32> = store.get("result")?;
diff --git a/docs/content/testing/_index.md b/docs/content/testing/_index.md
index 0c1875d..d28da0b 100644
--- a/docs/content/testing/_index.md
+++ b/docs/content/testing/_index.md
@@ -86,7 +86,7 @@ async fn recording_observer_captures_the_path() {
         .add_exit_state(S::Done)
         .with_observer(observer.clone());
 
-    assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done);
+    assert_eq!(wf.orchestrate(S::Start, CancellationToken::disabled()).await.unwrap(), S::Done);
 
     // Assert the whole path, or inspect events directly.
     observer.assert_path(&["Start", "Done"]);
@@ -144,7 +144,7 @@ async fn every_registered_state_is_reached() {
         .register(S::Worker, Go(S::Done))
         .add_exit_state(S::Done)
         .with_observer(observer.clone());
-    wf.orchestrate(S::Start).await.unwrap();
+    wf.orchestrate(S::Start, CancellationToken::disabled()).await.unwrap();
 
     // Every registered handler was actually reached — no dead states.
     observer.assert_registered_states_entered(&wf).expect("no dead states");
@@ -189,7 +189,7 @@ async fn checkpoints_run_in_memory() {
         .add_exit_state(S::Done)
         .with_checkpoint_store(store.clone())
         .with_workflow_id("run-1");
-    assert_eq!(wf.orchestrate(S::Start).await.unwrap(), S::Done);
+    assert_eq!(wf.orchestrate(S::Start, CancellationToken::disabled()).await.unwrap(), S::Done);
 }
 ```
 
@@ -240,7 +240,7 @@ async fn panicking_task_fails_fast() {
         .register(S::Start, panic_on_attempt(1, S::Done))
         .add_exit_state(S::Done);
 
-    let err = wf.orchestrate(S::Start).await.unwrap_err();
+    let err = wf.orchestrate(S::Start, CancellationToken::disabled()).await.unwrap_err();
     assert!(err.to_string().contains("panic"));
 }
 ```
diff --git a/docs/content/tracing/_index.md b/docs/content/tracing/_index.md
index 655bf0e..2e53387 100644
--- a/docs/content/tracing/_index.md
+++ b/docs/content/tracing/_index.md
@@ -382,7 +382,7 @@ async fn main() -> Result<(), CanoError> {
 
     // 3. Run
     info!("Submitting order...");
-    workflow.orchestrate(State::Start).await?;
+    workflow.orchestrate(State::Start, CancellationToken::disabled()).await?;
 
     Ok(())
 }
diff --git a/docs/content/workflows/_index.md b/docs/content/workflows/_index.md
index dc4bb19..ec5ca68 100644
--- a/docs/content/workflows/_index.md
+++ b/docs/content/workflows/_index.md
@@ -14,6 +14,7 @@ template = "section.html"
 <ol>
 <li><a href="#defining-states">Defining States</a></li>
 <li><a href="#building-a-workflow">Building a Workflow</a></li>
+<li><a href="#cancellation">Cancelling a Run</a></li>
 <li><a href="#builder-pattern">Builder Pattern</a></li>
 <li><a href="#validation-guide">Validation &amp; Errors</a></li>
 <li><a href="#split-join">Parallel Tasks: Split &amp; Join</a></li>
@@ -124,7 +125,7 @@ async fn main() -> Result<(), CanoError> {
         .add_exit_states(vec![OrderState::Complete, OrderState::Failed]);
 
     // 3. Execute
-    let result = workflow.orchestrate(OrderState::Start).await?;
+    let result = workflow.orchestrate(OrderState::Start, CancellationToken::disabled()).await?;
 
     println!("Final State: {:?}", result);
     Ok(())
@@ -138,6 +139,41 @@ like the one above.</p>
 </div>
 <hr class="section-divider">
 
+<h2 id="cancellation"><a href="#cancellation" class="anchor-link" aria-hidden="true">#</a>Cancelling a Run</h2>
+<p>
+<code>orchestrate</code> always takes a <code>CancellationToken</code> as its second argument. The
+example above passes <code>CancellationToken::disabled()</code> — a token that never fires, opting
+the run out of cancellation at zero cost. To stop a run <em>early</em> on a signal you control — a
+shutdown handler, a user "stop" button, a parent task giving up — pass a live token from
+<code>CancellationToken::new()</code> instead and keep its paired <code>CancellationHandle</code>.
+Firing the handle aborts the in-flight task at its next <code>.await</code>, drains the
+<a href="../saga/">saga compensation stack</a>, and returns <code>CanoError::Cancelled</code> (wrapped in <code>CanoError::WithStateContext</code>).
+</p>
+
+```rust
+use cano::prelude::*;
+
+let (handle, token) = CancellationToken::new();
+
+// Cancel from anywhere — the handle is Clone and cancel() is idempotent:
+tokio::spawn(async move {
+    shutdown_signal().await;
+    handle.cancel();
+});
+
+let result = workflow.orchestrate(OrderState::Start, token).await;
+assert!(matches!(result, Err(e) if e.category() == "cancelled"));
+```
+
+<p>
+Cancellation is <em>cooperative</em> (a task is interrupted only at an <code>.await</code>) and
+<em>saga-safe</em> (a <a href="../saga/">compensatable task</a> is never interrupted mid-run). The
+<a href="../resilience/#cancellation">Resilience → Cooperative Cancellation</a> page covers the full
+semantics, the <code>on_cancelled</code> observer hook, and precedence against
+<code>with_total_timeout</code>.
+</p>
+<hr class="section-divider">
+
 <h2 id="builder-pattern"><a href="#builder-pattern" class="anchor-link" aria-hidden="true">#</a>Builder Pattern and #[must_use]</h2>
 <p>
 Workflow uses a builder pattern where the <code>register*</code> methods and
@@ -319,21 +355,21 @@ fn build_workflow(store: MemoryStore) -> Workflow<TextPipelineState> {
         .register(TextPipelineState::Parse, ParseTask)
         .register(TextPipelineState::Transform, TransformTask)
         .add_exit_state(TextPipelineState::Done)
-        .with_timeout(Duration::from_secs(5))
+        .with_total_timeout(Duration::from_secs(5))
 }
 
 // Inside an HTTP handler:
 let store = MemoryStore::new();              // fresh store — full isolation
 store.put("input_text", text)?;
 let workflow = build_workflow(store.clone());
-let final_state = workflow.orchestrate(TextPipelineState::Parse).await?; // which terminal branch ran
+let final_state = workflow.orchestrate(TextPipelineState::Parse, CancellationToken::disabled()).await?; // which terminal branch ran
 let word_count: usize = store.get("word_count")?;
 ```
 
 <div class="callout callout-tip">
 <span class="callout-label">Tip</span>
 <p>
-Use <code>.with_timeout()</code> on the workflow to keep a hung request from blocking indefinitely. For
+Use <code>.with_total_timeout()</code> on the workflow to keep a hung request from blocking indefinitely. For
 read-heavy workloads with shared reference data, pre-populate one store, share it via <code>Arc</code>,
 and use per-request keys to avoid collisions. The full Axum version is in
 <code>cargo run --example workflow_on_request</code>.
diff --git a/docs/content/workflows/validation-and-errors.md b/docs/content/workflows/validation-and-errors.md
index 0c3dabd..d6e956d 100644
--- a/docs/content/workflows/validation-and-errors.md
+++ b/docs/content/workflows/validation-and-errors.md
@@ -90,7 +90,7 @@ async fn main() -> Result<(), CanoError> {
     workflow.validate_initial_state(&State::Start)?;
 
     // Safe to orchestrate
-    let _result = workflow.orchestrate(State::Start).await?;
+    let _result = workflow.orchestrate(State::Start, CancellationToken::disabled()).await?;
     Ok(())
 }
 ```
@@ -133,9 +133,9 @@ during execution. Understanding these errors helps you build robust error recove
 <td>Increase <code>with_total_timeout()</code> or speed up the workflow; see <a href="../../resilience/#workflow-total-timeout">Resilience → Workflow Total Timeout</a></td>
 </tr>
 <tr>
-<td><code>CanoError::Workflow</code></td>
-<td>Legacy <code>with_timeout()</code> outer <code>tokio::time::timeout</code> elapsed (no graceful compensation)</td>
-<td>Prefer <code>with_total_timeout()</code> for new code; otherwise increase <code>with_timeout()</code> or optimize task execution time</td>
+<td><code>CanoError::Cancelled</code></td>
+<td>Run cancelled via a live <code>CancellationToken</code> passed to <code>orchestrate</code> / <code>resume_from</code>; in-flight task aborted, compensation stack drained. Surfaced under <code>CanoError::WithStateContext</code> (or <code>CompensationFailed</code> on a dirty rollback).</td>
+<td>Expected when you cancel deliberately; see <a href="../../resilience/#cancellation">Resilience → Cooperative Cancellation</a></td>
 </tr>
 <tr>
 <td><code>CanoError::Configuration</code></td>
@@ -181,7 +181,7 @@ through the join strategy.
 </div>
 
 ```rust
-match workflow.orchestrate(State::Start).await {
+match workflow.orchestrate(State::Start, CancellationToken::disabled()).await {
     Ok(final_state) => println!("Completed: {:?}", final_state),
     Err(CanoError::Workflow(msg)) => eprintln!("Workflow error: {}", msg),
     Err(CanoError::Configuration(msg)) => eprintln!("Config error: {}", msg),

| API | Scope | On expiry | Compensation drain |
| --- | --- | --- | --- |
| `TaskConfig::with_attempt_timeout` | One attempt of one task | `CanoError::Timeout` — retried like any other failure; final timeout becomes `RetryExhausted` | Triggered like any other terminal task error (unbounded) |
| `Workflow::with_total_timeout` | The entire `orchestrate` / `resume_from` call | In-flight task aborted; `CanoError::WorkflowTimeout` (wrapped in `WithStateContext`) | Bounded by `with_compensation_timeout` or the default `min(remaining/2, 30s)` |
| `Workflow::with_timeout` (legacy) | The whole orchestration future | `CanoError::Workflow("Workflow timeout exceeded")` — no graceful abort | None — the future is dropped abruptly |