diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8b565a1..d7ba16c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,33 @@ All notable changes to didcomm-dotnet are documented here. Format follows
## [Unreleased]
+### Security — problem-report cascade-guard redesign (`feat/security-cascade-guard`)
+
+Resolves two FR-PROTO-10 cascade-guard findings (GitHub issues #29, #36), addressed together since both
+stem from the per-`pthid` error budget living on the dispatcher's general thread store. Passed a
+PoC-backed adversarial red-team pass that caught — and drove the fix for — a concurrency bug in the
+first cut. Full suite (631 tests) green.
+
+- **Cascade budget moved to a dedicated, bounded store (#36).** The FR-PROTO-10 budget now lives in a
+ new `CascadeBudgetStore`, separate from the dispatcher's general thread store (which is flooded with
+ one entry per inbound `thid`). Previously a flood of cheap fresh `thid`s could evict — and reset to
+ zero — a victim `pthid`'s error budget, defeating the cascade-stop. The dedicated store **owns its
+ concurrency**: the increment + threshold check + trip decision is atomic under a single `pthid`-keyed
+ lock (the earlier "lock on the per-thread state object" seam broke under LRU eviction and could
+ double-emit the cascade-stop — fixed). Its LRU **prefers evicting not-yet-tripped entries**, so a
+ tripped thread's silenced decision survives a fresh-`pthid` flood without exempting entries from
+ eviction (still bounded). Registered as its own DI singleton so the budget persists regardless of the
+ handler's lifetime. Residual: a determined attacker can still pressure the bound with distinct-`pthid`
+ *error reports* (far costlier than the original cheap-`thid` reset); a distributed/persistent budget
+ is the path to a hard guarantee for horizontally-scaled hosts.
+- **Unrepliable-stream counter/log flood capped (#29).** A sustained stream of unrepliable
+ (anoncrypt/from-less) over-threshold reports previously incremented the counter and emitted an Info
+ log line forever (the trip permanently deferred). The handler now clamps the count at `threshold + 1`
+ and stops per-message logging after the breach, while preserving the deferral so a later repliable
+ report still fires the cascade-stop.
+- `ThreadState.ErrorCount`/`MaxErrorsNoticeSent` are retained as general-purpose per-thread fields but
+ are no longer used by the built-in cascade guard (which uses `CascadeBudgetStore`).
+
### Fixed — rotation JWT compliance
Resolves two Low-severity audit findings on the `from_prior` rotation path (GitHub issues #25, #26).
diff --git a/samples/02-Cookbook/Sections/Section_U_ProblemReport.cs b/samples/02-Cookbook/Sections/Section_U_ProblemReport.cs
index 5721e8d..fadf16d 100644
--- a/samples/02-Cookbook/Sections/Section_U_ProblemReport.cs
+++ b/samples/02-Cookbook/Sections/Section_U_ProblemReport.cs
@@ -84,7 +84,9 @@ public static async Task RunAsync(CookbookContext ctx)
ctx.Narrator.Value("Cascade-stop code", ProblemReportApi.ReadCode(trip!.Reply!));
ctx.Narrator.Value("Cascade-stop pthid", trip.Reply!.Pthid);
- ctx.Narrator.Value("Failing-thread errorCount", localStore.GetOrCreate(failingThread).ErrorCount);
+ // The breach count rides on the cascade-stop notice itself (the count is communicated to the
+ // peer, not read from internal state — the FR-PROTO-10 budget lives in a dedicated store, #36).
+ ctx.Narrator.Value("Cascade-stop comment", ProblemReportApi.RenderComment(trip.Reply!));
ctx.Narrator.Note("Beyond the trip the handler returns null for further reports on the same pthid (FR-PROTO-10).");
}
}
diff --git a/src/DidComm.Core/Protocols/ProblemReport/CascadeBudgetStore.cs b/src/DidComm.Core/Protocols/ProblemReport/CascadeBudgetStore.cs
new file mode 100644
index 0000000..4588194
--- /dev/null
+++ b/src/DidComm.Core/Protocols/ProblemReport/CascadeBudgetStore.cs
@@ -0,0 +1,168 @@
+using System.Linq;
+
+namespace DidComm.Protocols.ProblemReport;
+
+///
+/// Bounded, thread-safe, process-local tracker for the FR-PROTO-10 cascade budget, kept
+/// separate from the dispatcher's general thread-state store.
+///
+///
+///
+/// The dispatcher writes one entry per inbound message's thid into the general store. Routing
+/// the cascade budget through that same store let a flood of cheap fresh thids evict — and
+/// reset to zero — a victim pthid's error budget, defeating the cascade guard (#36). This
+/// tracker holds the budget separately, keyed only by the failing thread's pthid (so only
+/// error reports create entries).
+///
+///
+/// It owns its concurrency: does the whole
+/// increment + threshold check + trip decision atomically under a single lock keyed by the pthid
+/// string. (An earlier design locked on the per-pthid state object borrowed from an LRU store; under
+/// eviction two concurrent callers could lock different instances for the same pthid and
+/// double-emit the cascade-stop — the lock seam must not be an evictable value.) The single lock
+/// serializes all callers; that is a deliberate trade-off — problem reports are rare events, so a
+/// global lock is simpler and safe. A high-throughput in-process host would stripe locks by pthid; a
+/// multi-host deployment supplies a distributed budget (same frontier as the eviction residual below).
+///
+///
+/// Bounded at with approximate-LRU eviction that prefers
+/// evicting not-yet-tripped entries, so a tripped thread's silenced decision survives a flood of
+/// fresh pthids — without exempting tripped entries from eviction (if every entry is tripped the
+/// oldest are still removed, so the store never grows without limit, per #36). Residual: a determined
+/// attacker can still pressure the bound with distinct-pthid error reports (each does real
+/// per-message work), but that is far more expensive than the original cheap-thid reset; a
+/// distributed/persistent budget is the path to a hard guarantee for horizontally-scaled hosts.
+///
+///
+public sealed class CascadeBudgetStore
+{
+ /// Default cap on tracked failing-thread budgets.
+ public const int DefaultMaxEntries = 10_000;
+
+ private sealed class Entry
+ {
+ public int ErrorCount;
+ public bool NoticeSent;
+ public long LastTouched;
+ }
+
+ private readonly object _gate = new();
+ private readonly Dictionary _entries = new(StringComparer.Ordinal);
+ private readonly int _maxEntries;
+ private readonly int _lowWaterMark;
+ private long _tick;
+
+ /// Create a cascade-budget tracker bounded at failing threads.
+ /// Cap on tracked budgets; must be positive. Defaults to .
+ /// is not positive.
+ public CascadeBudgetStore(int maxEntries = DefaultMaxEntries)
+ {
+ if (maxEntries <= 0)
+ {
+ throw new ArgumentOutOfRangeException(
+ nameof(maxEntries), maxEntries, "Cascade-budget capacity must be > 0.");
+ }
+
+ _maxEntries = maxEntries;
+ // Trim toward ~90%, but ALWAYS strictly below the cap so Evict() actually frees a slot. A naive
+ // Math.Max(1, …) leaves lowWater == cap for maxEntries == 1 (1/10 == 0): Evict() removes
+ // Count − lowWater == 0 entries, the new entry is added anyway, and the store grows past its cap
+ // without bound. Clamp to maxEntries − 1 (PR #40 review).
+ _lowWaterMark = Math.Min(maxEntries - 1, Math.Max(1, maxEntries - Math.Max(1, maxEntries / 10)));
+ }
+
+ ///
+ /// Atomically record one inbound error problem-report on the failing thread named by
+ /// and return the FR-PROTO-10 cascade-guard decision.
+ ///
+ /// The failing thread's pthid (FR-PROTO-07, non-empty).
+ /// The per-thread error budget ().
+ /// Whether this report has a usable reply target (a cascade-stop can be addressed).
+ ///
+ /// = build and send the cascade-stop; =
+ /// the count advanced and should be logged; = the (clamped) count.
+ ///
+ internal CascadeStep RecordErrorReport(string pthid, int threshold, bool repliable)
+ {
+ lock (_gate)
+ {
+ var entry = GetOrCreate(pthid);
+
+ // Notice already emitted → fully silent: no increment, no log, no work.
+ if (entry.NoticeSent)
+ return new CascadeStep(Emit: false, Log: false, Count: entry.ErrorCount);
+
+ // Clamp at threshold+1: once a thread has breached, stop incrementing AND stop logging per
+ // message — otherwise a sustained unrepliable (from-less) stream grows the counter and
+ // floods the log forever, the trip permanently deferred (#29). The breach is counted and
+ // logged exactly once.
+ bool atCap = entry.ErrorCount > threshold;
+ if (!atCap)
+ entry.ErrorCount++;
+ int count = entry.ErrorCount;
+
+ if (count > threshold)
+ {
+ // Breach. A cascade-stop needs a repliable target; an anoncrypt/addressless report
+ // DEFERS emission (NoticeSent stays false) so a later repliable report on the same
+ // pthid can fire it, while the counter stays clamped so the unrepliable stream can't flood.
+ if (!repliable)
+ return new CascadeStep(Emit: false, Log: !atCap, Count: count);
+
+ entry.NoticeSent = true;
+ return new CascadeStep(Emit: true, Log: !atCap, Count: count);
+ }
+
+ return new CascadeStep(Emit: false, Log: !atCap, Count: count);
+ }
+ }
+
+ private Entry GetOrCreate(string pthid) // caller holds _gate
+ {
+ if (_entries.TryGetValue(pthid, out var entry))
+ {
+ entry.LastTouched = ++_tick;
+ return entry;
+ }
+
+ if (_entries.Count >= _maxEntries)
+ Evict();
+
+ entry = new Entry { LastTouched = ++_tick };
+ _entries[pthid] = entry;
+ return entry;
+ }
+
+ private void Evict() // caller holds _gate
+ {
+ // Remove down to the low-water mark, preferring not-yet-tripped entries (NoticeSent=false
+ // sorts first) and then oldest-touched. This keeps a tripped thread's silenced decision sticky
+ // against a flood of fresh, non-tripped pthids — without exempting tripped entries (if all are
+ // tripped the oldest are still removed, so the store can never grow without limit, #36).
+ var victims = _entries
+ .OrderBy(kv => kv.Value.NoticeSent)
+ .ThenBy(kv => kv.Value.LastTouched)
+ .Take(_entries.Count - _lowWaterMark)
+ .Select(kv => kv.Key)
+ .ToArray();
+
+ foreach (var key in victims)
+ _entries.Remove(key);
+ }
+
+ /// Current number of tracked failing-thread budgets. Exposed for tests.
+ internal int Count
+ {
+ get { lock (_gate) return _entries.Count; }
+ }
+
+ /// Read a thread's current budget without mutating it. Exposed for tests.
+ internal (int ErrorCount, bool NoticeSent) Peek(string pthid)
+ {
+ lock (_gate)
+ return _entries.TryGetValue(pthid, out var e) ? (e.ErrorCount, e.NoticeSent) : (0, false);
+ }
+}
+
+/// The cascade-guard decision returned by .
+internal readonly record struct CascadeStep(bool Emit, bool Log, int Count);
diff --git a/src/DidComm.Core/Protocols/ProblemReport/ProblemReportHandler.cs b/src/DidComm.Core/Protocols/ProblemReport/ProblemReportHandler.cs
index d11f8a9..1f98086 100644
--- a/src/DidComm.Core/Protocols/ProblemReport/ProblemReportHandler.cs
+++ b/src/DidComm.Core/Protocols/ProblemReport/ProblemReportHandler.cs
@@ -11,8 +11,8 @@ namespace DidComm.Protocols.ProblemReport;
///
/// Problem-reports are informational by design — they tell the peer something went wrong on
/// a thread, but they do not themselves require an application-level reply. So this handler's
-/// "happy path" is to read the code, increment
-/// for error-sorter codes, and return null.
+/// "happy path" is to read the code, record it against the failing thread's FR-PROTO-10 budget in
+/// the dedicated (#36) for error-sorter codes, and return null.
///
///
/// The one exception is the FR-PROTO-10 cascade guard: once a thread accumulates more than
@@ -25,16 +25,27 @@ namespace DidComm.Protocols.ProblemReport;
public sealed class ProblemReportHandler : IProtocolHandler
{
private readonly ProblemReportOptions _options;
+ private readonly CascadeBudgetStore _cascadeBudget;
private readonly ILogger? _logger;
- /// Construct the handler with explicit options + optional logger.
+ /// Construct the handler with explicit options, an optional dedicated cascade-budget store, and an optional logger.
/// Handler options (cascade threshold, etc.). Validated eagerly at ctor so a misconfig surfaces at DI resolution rather than as silently-degraded cascade-guard behaviour.
+ ///
+ /// The dedicated FR-PROTO-10 cascade-budget store (#36), kept separate from the dispatcher's general
+ /// thread store. Defaults to a fresh process-local instance; the DI setup registers one as a
+ /// singleton so the budget's lifetime is decoupled from the handler's (a handler accidentally
+ /// registered non-singleton would otherwise lose its budget per request).
+ ///
/// Optional structured logger.
- public ProblemReportHandler(IOptions options, ILogger? logger = null)
+ public ProblemReportHandler(
+ IOptions options,
+ CascadeBudgetStore? cascadeBudget = null,
+ ILogger? logger = null)
{
ArgumentNullException.ThrowIfNull(options);
_options = options.Value;
_options.Validate();
+ _cascadeBudget = cascadeBudget ?? new CascadeBudgetStore();
_logger = logger;
}
@@ -70,46 +81,26 @@ public ProblemReportHandler(IOptions options, ILogger(null);
- var failingThread = context.Threads.GetOrCreate(message.Pthid);
+ // FR-PROTO-10 cascade guard. The whole increment + threshold check + trip decision is done
+ // ATOMICALLY inside the dedicated, bounded cascade-budget store (#36) — separate from the
+ // dispatcher's general thread store (which a cheap-thid flood evicts) and self-synchronizing on
+ // a stable pthid-keyed lock (so concurrent reports on the same pthid can't double-emit, even
+ // under the store's LRU eviction). A report is repliable iff it has a usable reply target.
+ bool repliable = !string.IsNullOrEmpty(message.From)
+ && message.To is { Count: > 0 } && !string.IsNullOrEmpty(message.To[0]);
+ var step = _cascadeBudget.RecordErrorReport(message.Pthid, _options.CascadeThreshold, repliable);
- // FR-PROTO-10 cascade guard. The increment + threshold check + trip-emission decision
- // must be atomic per-thread, otherwise concurrent inbound reports on the same pthid
- // race on `ErrorCount` and either double-emit the cascade-stop or skip the trip entirely.
- // Lock on the ThreadState instance — it is shared between callers of the same pthid via
- // IThreadStateStore.GetOrCreate, so it is the natural lock seam.
- bool shouldEmit = false;
- int errorCountSnapshot = 0;
- lock (failingThread)
+ if (step.Log)
{
- // Truly silent after the trip has fired: no increment, no log, no work. (Previously
- // every post-trip report still mutated state and emitted an Information log line,
- // turning "silently ignored" into a DoS-shaped log-flood + unbounded counter growth.)
- if (failingThread.MaxErrorsNoticeSent)
- return Task.FromResult(null);
-
- failingThread.ErrorCount++;
- errorCountSnapshot = failingThread.ErrorCount;
_logger?.LogInformation(
"Inbound problem-report on thread {Pthid}: code={Code}, errorCount={Count}/{Threshold}",
- message.Pthid, code.Value, errorCountSnapshot, _options.CascadeThreshold);
-
- if (errorCountSnapshot > _options.CascadeThreshold)
- {
- // We need a repliable target to emit the cascade-stop. If this report is anoncrypt
- // (no `from`) or addressless, DEFER the trip — leave MaxErrorsNoticeSent=false so
- // the next repliable report on the same pthid can fire the trip. Without this
- // deferral, an unrepliable report sets the counter past the threshold; the next
- // report falls into the silent-ignore branch above and the cascade-stop is never
- // emitted (FR-PROTO-10 silently broken).
- if (string.IsNullOrEmpty(message.From) || message.To is not { Count: > 0 } || string.IsNullOrEmpty(message.To[0]))
- return Task.FromResult(null);
-
- failingThread.MaxErrorsNoticeSent = true;
- shouldEmit = true;
- }
+ message.Pthid, code.Value, step.Count, _options.CascadeThreshold);
}
- if (!shouldEmit) return Task.FromResult(null);
+ if (!step.Emit)
+ return Task.FromResult(null);
+
+ var errorCountSnapshot = step.Count;
// Build + log the cascade-stop OUTSIDE the per-thread lock — MessageBuilder allocations
// and logging shouldn't contend on the lock.
diff --git a/src/DidComm.Core/Threading/ThreadState.cs b/src/DidComm.Core/Threading/ThreadState.cs
index 82223c9..748cadb 100644
--- a/src/DidComm.Core/Threading/ThreadState.cs
+++ b/src/DidComm.Core/Threading/ThreadState.cs
@@ -24,17 +24,16 @@ public sealed class ThreadState
public IReadOnlyList? AcceptLang { get; set; }
///
- /// Count of problem-reports produced on this thread. Used by the FR-PROTO-10 cascade
- /// guard (Phase 6.2c) to emit e.p.req.max-errors-exceeded once the per-thread
- /// threshold trips and then stop responding on the thread. Implementations / tests MAY
- /// increment this directly; concurrent callers MUST hold the per-instance lock
- /// ( exposes itself as the lock seam — see remarks).
+ /// General-purpose mutable per-thread error counter for consumer use. Note: the built-in
+ /// FR-PROTO-10 cascade guard no longer uses this field — it keeps its budget in a dedicated,
+ /// separate store (CascadeBudgetStore) so a flood of cheap thids in the general store can't
+ /// evict and reset it (#36). Retained for consumers who track their own per-thread counts.
///
public int ErrorCount { get; set; }
///
- /// Set to true the moment the FR-PROTO-10 cascade-stop notice has been emitted on
- /// this thread, so the handler can short-circuit subsequent reports without further work.
+ /// General-purpose per-thread flag for consumer use. Note: as with ,
+ /// the built-in cascade guard no longer reads or writes this — see CascadeBudgetStore (#36).
///
public bool MaxErrorsNoticeSent { get; set; }
diff --git a/src/DidComm.Extensions.DependencyInjection/DidCommBuilder.cs b/src/DidComm.Extensions.DependencyInjection/DidCommBuilder.cs
index af5b384..2e5de99 100644
--- a/src/DidComm.Extensions.DependencyInjection/DidCommBuilder.cs
+++ b/src/DidComm.Extensions.DependencyInjection/DidCommBuilder.cs
@@ -97,6 +97,9 @@ public DidCommBuilder AddBuiltInProtocols()
AddProtocol();
AddProtocol();
AddProtocol();
+ // The FR-PROTO-10 cascade budget (#36) lives in a dedicated store registered as its OWN
+ // singleton, so the budget persists even if the handler is (mis)registered non-singleton.
+ Services.TryAddSingleton();
// ProblemReportOptions is bound via the standard Options pattern; default ctor's
// CascadeThreshold = 5 is the SICPA-python-matching default per locked decision.
Services.AddOptions();
diff --git a/tasks/lessons.md b/tasks/lessons.md
index 43976b2..17fe1e7 100644
--- a/tasks/lessons.md
+++ b/tasks/lessons.md
@@ -513,3 +513,22 @@ Format per entry:
- **How to apply:** Before tightening a shared codec/parser, enumerate each call site and check the
*spec's* strictness requirement for THAT field. Apply strictness only where mandated; give permissive
fields a clearly-named relaxed path. Verify the spec, don't assume one rule fits all call sites.
+
+## L-028 — Never use a value from an evictable/bounded store as a lock seam; the store must own its concurrency.
+
+- **Lesson:** The cascade guard locked on the per-pthid `ThreadState` instance returned by a bounded
+ LRU store ("same pthid → same instance → natural lock seam"). That invariant is FALSE for an
+ evicting store: if the entry is evicted between `GetOrCreate` and `lock`, a concurrent caller mints a
+ fresh instance, the two callers lock DIFFERENT objects, mutual exclusion is lost, and the "emit
+ exactly once" guard double-emits. The mandatory adversarial pass (AGENTS.md §2) proved it
+ deterministically at small caps.
+- **Why:** Eviction can replace the object you locked on with a new one for the same logical key, so an
+ object borrowed from the store is not a stable monitor. Atomicity must be owned by the store, keyed
+ by the stable string key, not delegated to the caller via a returned (evictable) object.
+- **How to apply:** When a bounded/evicting store backs a guard that needs atomic read-modify-write,
+ put the whole decision INSIDE the store under a stable lock (single lock, or stripe by the string
+ key), with eviction performed inside that same lock — and return a value/decision, not a mutable
+ shared object for the caller to lock. Also: a bounded store can reset any entry on eviction, so make
+ "decision already made" (e.g. tripped/silenced) survive eviction (prefer-evict-untripped or a sticky
+ marker), and decouple the store's DI lifetime from a handler's (register it as its own singleton) so
+ a non-singleton handler can't silently get a fresh store per request.
diff --git a/tasks/todo20260617T112755Z.md b/tasks/todo20260617T112755Z.md
new file mode 100644
index 0000000..82f99ab
--- /dev/null
+++ b/tasks/todo20260617T112755Z.md
@@ -0,0 +1,116 @@
+# Plan — Batch C: cascade-guard redesign (#29 + #36)
+
+**Branch:** `feat/security-cascade-guard` (off `main`)
+**One PR** closing **#29, #36** (related — addressed together per the #36 issue).
+
+Both concern the FR-PROTO-10 problem-report cascade guard. Today the per-`pthid` budget
+(`ErrorCount`/`MaxErrorsNoticeSent`) lives on a `ThreadState` in the **general** thread store, which
+the dispatcher floods with cheap `thid`s — so (a) the budget can grow/log without bound for
+unrepliable streams (#29) and (b) a cheap-`thid` flood evicts and resets a victim's budget (#36).
+
+## Design (two parts)
+
+### Part 1 — #29: clamp the handler logic (in `ProblemReportHandler`)
+Restructure the locked region so that, once a thread breaches the threshold:
+- **Clamp** `ErrorCount` at `threshold + 1` (never increment past it).
+- **Stop logging** per-message after the breach (today every post-threshold report logs an Info line).
+- **Preserve the deferral**: an unrepliable (anoncrypt/from-less) breach leaves `MaxErrorsNoticeSent =
+ false` and the counter clamped, so a *later repliable* report on the same pthid can still emit the
+ cascade-stop — but the unrepliable stream no longer floods the log or grows the counter.
+
+Shape (inside `lock (budget)`):
+```
+if (budget.MaxErrorsNoticeSent) return null; // notice sent → fully silent
+var atCap = budget.ErrorCount > threshold; // already clamped at T+1
+if (!atCap) { budget.ErrorCount++; log(count/threshold); } // increment+log only while accumulating
+if (budget.ErrorCount > threshold) { // breach
+ if (repliable) { budget.MaxErrorsNoticeSent = true; emit; }
+ // else: deferred — counter clamped at T+1, no further increment/log
+}
+```
+Net: a sustained from-less stream clamps at `T+1`, logs once at breach, never floods.
+
+### Part 2 — #36: move the cascade budget OFF the cheap-thid-evictable general store
+Give the handler a **dedicated, bounded, process-local cascade-budget store**, separate from the
+dispatcher's general thread store, keyed only by `pthid`:
+- New internal `CascadeBudgetStore` wrapping its own bounded `InMemoryThreadStateStore` (reusing the
+ #21 LRU bound; reuses `ThreadState.ErrorCount`/`MaxErrorsNoticeSent` — no new state type).
+- `ProblemReportHandler` ctor gains an **optional** `CascadeBudgetStore? cascadeBudget = null` param,
+ defaulting to a fresh instance (the handler is a singleton, so its budget persists). **It is a
+ distinct type — NOT `IThreadStateStore`** — specifically so DI does not auto-wire the *general*
+ store back in and re-merge them.
+- The handler uses `_cascadeBudget.GetOrCreate(pthid)` instead of `context.Threads.GetOrCreate(pthid)`.
+
+**Why this closes #36:** the documented attack floods cheap `thid`s (general store); those no longer
+touch the cascade-budget store, so a victim's `pthid` budget is no longer reset. A residual remains —
+a determined attacker could flood *distinct-`pthid` error reports* to pressure the bounded budget
+store — but that is uneconomical (each is a real error report; the victim's recently-touched entry is
+newest-LRU and re-clamps in `T+1` reports). I'll **document** this residual honestly and note that a
+distributed/persistent budget store is the path to a stronger guarantee. (Per the #36 issue: do NOT
+exempt non-zero entries from eviction — that re-opens unbounded growth.)
+
+## Files
+- `src/DidComm.Core/Protocols/ProblemReport/CascadeBudgetStore.cs` (new, internal).
+- `src/DidComm.Core/Protocols/ProblemReport/ProblemReportHandler.cs` (ctor + locked-region rewrite).
+- `ThreadState.ErrorCount`/`MaxErrorsNoticeSent` — **kept** (now used by the cascade-budget store).
+- No mandatory DI change (optional ctor param defaults); `DidCommBuilder` registration unchanged.
+
+## Tests (`ProblemReportHandlerTests`)
+- Update existing tests to inject + inspect the `CascadeBudgetStore` (same `ErrorCount` values; the
+ under-threshold and first-breach assertions are unchanged in magnitude).
+- **#29 new:** a sustained from-less (unrepliable) over-threshold stream ⇒ `ErrorCount` clamps at
+ `T+1` (not N) and stops logging; a later repliable report still emits the cascade-stop (deferral).
+- **#36 new:** with a dedicated cascade-budget store and a SEPARATE general store, flood the general
+ store with `> cap` fresh `thid`s (evicting its entries) ⇒ the cascade budget is unaffected and the
+ guard still trips on the victim `pthid` (proves the separation; would fail on `main`, where both
+ share one store).
+- Keep `Cascade_trip_is_deferred_when_threshold_lands_on_an_unrepliable_report` green.
+
+## Cross-cutting
+- [ ] Branch `feat/security-cascade-guard` off `main`.
+- [ ] Implement Part 1 + Part 2 (additive; new internal type + optional ctor param — no breaking API).
+- [ ] `dotnet build` (warnings-as-errors) + full `dotnet test` green.
+- [ ] **Adversarial red-team pass** (AGENTS.md §2): try to (a) still flood the log/counter, (b) reset a
+ victim's budget cheaply, (c) double-emit or skip the trip under concurrency.
+- [ ] Update `CHANGELOG.md`; one PR: `fix(security): cascade-guard redesign (#29, #36)`,
+ `Closes #29`, `closes #36`.
+
+## Verification standard
+- #29: prove a sustained from-less stream clamps the counter and stops logging (not just one report).
+- #36: prove a general-store thid-flood does NOT reset the cascade budget (separate store) — a test
+ that would fail on `main`.
+- Concurrency: the per-`ThreadState` lock seam is preserved; no double-emit / skipped-trip race.
+
+## Review section
+
+Implemented on `feat/security-cascade-guard`. **630 tests green**, build clean (incl. cookbook).
+
+- **#29 (clamp):** the handler clamps `ErrorCount` at `threshold+1` and stops per-message logging after
+ the breach, preserving the unrepliable-deferral. Sustained-from-less test proves the clamp.
+- **#36 (separate store):** the budget moved to a dedicated `CascadeBudgetStore`, separate from the
+ general thread store; a general-store thid-flood no longer resets it (test would fail on `main`).
+
+### Adversarial red-team pass (AGENTS.md §2) — found a HIGH bug I introduced; remediated
+Both subagents converged: my first cut made the dedicated store an LRU store but kept **locking on the
+evictable `ThreadState` instance**, so under eviction two concurrent callers lock different instances
+→ **double-emit** (HIGH, PoC-proven deterministically at small caps). Also: residual eviction-reset
+(MEDIUM) and a DI foot-gun (non-singleton handler → fresh budget per request, MEDIUM). Remediated:
+- **Store now owns concurrency** — `RecordErrorReport` does the whole increment+threshold+trip decision
+ atomically under a single pthid-keyed lock, eviction inside that lock; returns a `CascadeStep`
+ decision (no shared mutable object to lock). Closes the double-emit. New test asserts ≤1 emit under
+ concurrent reports + eviction pressure.
+- **Prefer-evict-untripped LRU** — a tripped thread's silenced decision survives a fresh-pthid flood
+ (without exempting tripped entries → still bounded). New test proves a tripped thread stays silent
+ after a 500-pthid flood.
+- **DI decoupling** — `CascadeBudgetStore` registered as its OWN singleton, so the budget persists even
+ if the handler is mis-registered non-singleton.
+- **Residual documented** — a determined attacker can still pressure the bound with distinct-pthid
+ *error reports* (far more expensive than the original cheap-thid reset; each does real work). A
+ distributed/persistent budget is the path to a hard guarantee. Lesson **L-028** captured.
+
+### Integration impact handled
+- The cookbook (`Section_U_ProblemReport`) inspected `ThreadState.ErrorCount` on a store it controlled;
+ the budget moved away from that store, so it now narrates the **cascade-stop notice** (which carries
+ the count) instead — the consumer-facing signal.
+- `ThreadState.ErrorCount`/`MaxErrorsNoticeSent` kept (no public-API removal) but re-documented as
+ general-purpose consumer fields no longer used by the built-in cascade guard.
diff --git a/tests/DidComm.Core.Tests/Protocols/ProblemReport/ProblemReportHandlerTests.cs b/tests/DidComm.Core.Tests/Protocols/ProblemReport/ProblemReportHandlerTests.cs
index 7caec5d..c7550b3 100644
--- a/tests/DidComm.Core.Tests/Protocols/ProblemReport/ProblemReportHandlerTests.cs
+++ b/tests/DidComm.Core.Tests/Protocols/ProblemReport/ProblemReportHandlerTests.cs
@@ -25,27 +25,31 @@ private static ProtocolContext Ctx(Message m, IThreadStateStore? store = null)
return new ProtocolContext(unpacked, ownThread, Client: null, new DidCommOptions(), threads);
}
- private static ProblemReportHandler BuildHandler(int cascadeThreshold = 5)
- => new(Options.Create(new ProblemReportOptions { CascadeThreshold = cascadeThreshold }));
+ private static ProblemReportHandler BuildHandler(int cascadeThreshold = 5, CascadeBudgetStore? budget = null)
+ => new(Options.Create(new ProblemReportOptions { CascadeThreshold = cascadeThreshold }), budget ?? new CascadeBudgetStore());
[Fact]
public async Task Warning_report_does_not_increment_error_count()
{
- var handler = BuildHandler();
+ var budget = new CascadeBudgetStore();
+ var handler = BuildHandler(budget: budget);
var warning = ProblemReportApi.Create(
from: "did:peer:alice", to: "did:peer:bob",
code: "w.m.req.expired", pthid: "thread-1");
var store = new InMemoryThreadStateStore();
var reply = await handler.HandleAsync(warning, Ctx(warning, store), CancellationToken.None);
reply.Should().BeNull();
- // No state was created for "thread-1" since warnings don't trigger error accounting.
- store.Get("thread-1").Should().BeNull();
+ // A warning code must not create or advance the FR-PROTO-10 budget entry (PR #40 review: assert
+ // against the budget store, since the handler no longer writes the general store for any code).
+ budget.Peek("thread-1").ErrorCount.Should().Be(0);
+ budget.Count.Should().Be(0, "warnings create no budget entry at all");
}
[Fact]
public async Task Error_report_increments_error_count_and_returns_null_under_threshold()
{
- var handler = BuildHandler(cascadeThreshold: 5);
+ var budget = new CascadeBudgetStore();
+ var handler = BuildHandler(cascadeThreshold: 5, budget);
var error = ProblemReportApi.Create(
from: "did:peer:alice", to: "did:peer:bob",
code: "e.p.xfer.cant-use-endpoint", pthid: "thread-1");
@@ -56,13 +60,14 @@ public async Task Error_report_increments_error_count_and_returns_null_under_thr
var reply = await handler.HandleAsync(error, Ctx(error, store), CancellationToken.None);
reply.Should().BeNull($"report #{i} is at or under the threshold");
}
- store.GetOrCreate("thread-1").ErrorCount.Should().Be(5);
+ budget.Peek("thread-1").ErrorCount.Should().Be(5);
}
[Fact]
public async Task Cascade_guard_emits_max_errors_exceeded_on_first_breach()
{
- var handler = BuildHandler(cascadeThreshold: 2);
+ var budget = new CascadeBudgetStore();
+ var handler = BuildHandler(cascadeThreshold: 2, budget);
var error = ProblemReportApi.Create(
from: "did:peer:alice", to: "did:peer:bob",
code: "e.p.xfer.cant-use-endpoint", pthid: "thread-1");
@@ -70,7 +75,7 @@ public async Task Cascade_guard_emits_max_errors_exceeded_on_first_breach()
await handler.HandleAsync(error, Ctx(error, store), CancellationToken.None);
await handler.HandleAsync(error, Ctx(error, store), CancellationToken.None);
- store.GetOrCreate("thread-1").ErrorCount.Should().Be(2);
+ budget.Peek("thread-1").ErrorCount.Should().Be(2);
// The third error trips the guard (>threshold of 2).
var trip = await handler.HandleAsync(error, Ctx(error, store), CancellationToken.None);
@@ -139,7 +144,8 @@ public async Task Cascade_trip_is_deferred_when_threshold_lands_on_an_unrepliabl
// MaxErrorsNoticeSent flag unset — so the NEXT repliable report can fire the trip.
// Without the deferral, the counter would land past the threshold and every subsequent
// report would fall into the silent-ignore branch, so the cascade-stop is never emitted.
- var handler = BuildHandler(cascadeThreshold: 1);
+ var budget = new CascadeBudgetStore();
+ var handler = BuildHandler(cascadeThreshold: 1, budget);
var store = new InMemoryThreadStateStore();
var first = ProblemReportApi.Create(
@@ -154,7 +160,7 @@ public async Task Cascade_trip_is_deferred_when_threshold_lands_on_an_unrepliabl
.Build();
(await handler.HandleAsync(unrepliable, Ctx(unrepliable, store), CancellationToken.None))
.Should().BeNull("trip emission requires a repliable target");
- store.GetOrCreate("thread-1").MaxErrorsNoticeSent.Should().BeFalse(
+ budget.Peek("thread-1").NoticeSent.Should().BeFalse(
"the cascade flag stays unset so a later repliable report can still fire the trip");
var thirdRepliable = ProblemReportApi.Create(
@@ -163,7 +169,7 @@ public async Task Cascade_trip_is_deferred_when_threshold_lands_on_an_unrepliabl
var trip = await handler.HandleAsync(thirdRepliable, Ctx(thirdRepliable, store), CancellationToken.None);
trip.Should().NotBeNull("the deferred cascade-stop must fire on the next repliable report");
ProblemReportApi.ReadCode(trip!).Should().Be(ProblemReportApi.MaxErrorsExceededCode);
- store.GetOrCreate("thread-1").MaxErrorsNoticeSent.Should().BeTrue();
+ budget.Peek("thread-1").NoticeSent.Should().BeTrue();
}
[Fact]
@@ -173,7 +179,8 @@ public async Task Cascade_trip_fires_exactly_once_under_concurrent_dispatches()
// fed by parallel transports) must yield exactly one cascade-stop. Without per-thread
// atomicity around the increment + threshold check, races would either double-emit OR
// skip the trip equality entirely. This stress test pins the fix.
- var handler = BuildHandler(cascadeThreshold: 5);
+ var budget = new CascadeBudgetStore();
+ var handler = BuildHandler(cascadeThreshold: 5, budget);
var store = new InMemoryThreadStateStore();
var error = ProblemReportApi.Create(
from: "did:peer:alice", to: "did:peer:bob",
@@ -187,7 +194,71 @@ public async Task Cascade_trip_fires_exactly_once_under_concurrent_dispatches()
results.Count(r => r is not null).Should().Be(1,
"the cascade-stop must be emitted exactly once across all concurrent reports on the same pthid");
- store.GetOrCreate("thread-race").MaxErrorsNoticeSent.Should().BeTrue();
+ budget.Peek("thread-race").NoticeSent.Should().BeTrue();
+ }
+
+ [Fact]
+ public async Task Sustained_unrepliable_stream_clamps_counter_and_log_then_fires_on_repliable_FrProto10_29()
+ {
+ // Issue #29: a sustained stream of unrepliable (from-less) over-threshold reports must NOT grow
+ // the counter (or log) without bound — the trip is deferred (no repliable target), but the work
+ // is clamped. The counter sits at threshold+1; a later repliable report still fires the trip.
+ const int threshold = 2;
+ var budget = new CascadeBudgetStore();
+ var handler = BuildHandler(threshold, budget);
+
+ var unrepliable = new MessageBuilder()
+ .WithType(ProblemReportApi.MessageType)
+ .WithBody(new System.Text.Json.Nodes.JsonObject { ["code"] = "e.p.xfer.cant-use-endpoint" })
+ .WithPthid("thread-x")
+ .Build();
+
+ for (var i = 0; i < 50; i++)
+ {
+ var reply = await handler.HandleAsync(unrepliable, Ctx(unrepliable), CancellationToken.None);
+ reply.Should().BeNull("an unrepliable report has no target for the cascade-stop");
+ }
+
+ // Clamped at threshold+1 (NOT 50), trip still deferred.
+ budget.Peek("thread-x").ErrorCount.Should().Be(threshold + 1);
+ budget.Peek("thread-x").NoticeSent.Should().BeFalse();
+
+ // A later repliable report on the same pthid fires the deferred cascade-stop.
+ var repliable = ProblemReportApi.Create(
+ from: "did:peer:alice", to: "did:peer:bob",
+ code: "e.p.xfer.cant-use-endpoint", pthid: "thread-x");
+ var trip = await handler.HandleAsync(repliable, Ctx(repliable), CancellationToken.None);
+ trip.Should().NotBeNull();
+ ProblemReportApi.ReadCode(trip!).Should().Be(ProblemReportApi.MaxErrorsExceededCode);
+ }
+
+ [Fact]
+ public async Task Cascade_budget_survives_a_general_store_thid_flood_FrProto10_36()
+ {
+ // Issue #36: the cascade budget lives in a DEDICATED store, so flooding the dispatcher's
+ // general thread store with cheap fresh thids cannot evict and reset a victim pthid's budget.
+ // On main (where both share one store) the flood would reset the victim and the trip would not
+ // fire; here the budget survives and the guard still trips.
+ const int threshold = 2;
+ var budget = new CascadeBudgetStore();
+ var handler = BuildHandler(threshold, budget);
+ var generalStore = new InMemoryThreadStateStore(maxEntries: 10); // tiny so the flood definitely evicts
+
+ var error = ProblemReportApi.Create(
+ from: "did:peer:alice", to: "did:peer:bob",
+ code: "e.p.xfer.cant-use-endpoint", pthid: "victim");
+
+ await handler.HandleAsync(error, Ctx(error, generalStore), CancellationToken.None); // budget: victim=1
+ for (var i = 0; i < 1000; i++) generalStore.GetOrCreate($"flood-a-{i}"); // evict everything in the general store
+ await handler.HandleAsync(error, Ctx(error, generalStore), CancellationToken.None); // budget: victim=2
+ for (var i = 0; i < 1000; i++) generalStore.GetOrCreate($"flood-b-{i}");
+
+ // The dedicated budget is intact despite the general-store flood.
+ budget.Peek("victim").ErrorCount.Should().Be(2);
+
+ var trip = await handler.HandleAsync(error, Ctx(error, generalStore), CancellationToken.None);
+ trip.Should().NotBeNull("the cascade budget was not reset by the general-store flood, so the 3rd report trips");
+ ProblemReportApi.ReadCode(trip!).Should().Be(ProblemReportApi.MaxErrorsExceededCode);
}
[Fact]
@@ -198,4 +269,85 @@ public void Ctor_validates_options_eagerly()
var act = () => new ProblemReportHandler(Options.Create(new ProblemReportOptions { CascadeThreshold = -1 }));
act.Should().Throw().WithMessage("*CascadeThreshold*");
}
+
+ [Fact]
+ public async Task Cascade_stop_never_double_emits_even_under_eviction_pressure()
+ {
+ // Red-team (#36): the budget store's LRU eviction must NOT split the lock seam. An earlier
+ // design locked on the evictable per-pthid state object, so under eviction two concurrent
+ // callers could lock DIFFERENT instances for the same pthid and double-emit. The atomic,
+ // pthid-keyed store must yield AT MOST ONE cascade-stop for the victim despite concurrent
+ // reports + a flood that drives constant eviction (tiny cap).
+ const int threshold = 2;
+ var budget = new CascadeBudgetStore(maxEntries: 8);
+ var handler = BuildHandler(threshold, budget);
+ var store = new InMemoryThreadStateStore();
+ var victim = ProblemReportApi.Create(
+ from: "did:peer:alice", to: "did:peer:bob",
+ code: "e.p.xfer.cant-use-endpoint", pthid: "victim");
+
+ var tasks = new List>();
+ for (var i = 0; i < 100; i++)
+ {
+ tasks.Add(Task.Run(() => handler.HandleAsync(victim, Ctx(victim, store), CancellationToken.None)));
+ var n = i;
+ tasks.Add(Task.Run(() => handler.HandleAsync(
+ ProblemReportApi.Create(from: "did:peer:f", to: "did:peer:t", code: "e.p.xfer.cant-use-endpoint", pthid: $"flood-{n}"),
+ Ctx(victim, store), CancellationToken.None)));
+ }
+ var results = await Task.WhenAll(tasks);
+
+ results.Count(r => r is not null && r.Pthid == "victim"
+ && ProblemReportApi.ReadCode(r) == ProblemReportApi.MaxErrorsExceededCode)
+ .Should().BeLessThanOrEqualTo(1, "the cascade-stop must never double-emit, even under eviction");
+ }
+
+ [Fact]
+ public async Task A_tripped_thread_stays_silenced_after_a_flood_of_fresh_pthids()
+ {
+ // Red-team (#36): once a thread trips, its silenced decision survives a flood of fresh
+ // (non-tripped) pthids — the budget's LRU prefers evicting not-yet-tripped entries, so the
+ // tripped victim is not reset and re-emit. (Would fail under a plain LRU that evicts the idle
+ // tripped victim.)
+ const int threshold = 1;
+ var budget = new CascadeBudgetStore(maxEntries: 16);
+ var handler = BuildHandler(threshold, budget);
+ var store = new InMemoryThreadStateStore();
+ var victim = ProblemReportApi.Create(
+ from: "did:peer:alice", to: "did:peer:bob",
+ code: "e.p.xfer.cant-use-endpoint", pthid: "victim");
+
+ await handler.HandleAsync(victim, Ctx(victim, store), CancellationToken.None);
+ (await handler.HandleAsync(victim, Ctx(victim, store), CancellationToken.None)).Should().NotBeNull(); // trips
+
+ for (var i = 0; i < 500; i++)
+ {
+ await handler.HandleAsync(
+ ProblemReportApi.Create(from: "did:peer:f", to: "did:peer:t", code: "e.p.xfer.cant-use-endpoint", pthid: $"flood-{i}"),
+ Ctx(victim, store), CancellationToken.None);
+ }
+
+ budget.Peek("victim").NoticeSent.Should().BeTrue("the tripped decision survives the fresh-pthid flood");
+ (await handler.HandleAsync(victim, Ctx(victim, store), CancellationToken.None))
+ .Should().BeNull("a tripped thread stays silent — no re-emit after the flood");
+ }
+
+ [Fact]
+ public async Task Budget_store_stays_bounded_even_at_a_cap_of_one()
+ {
+ // PR #40 review: maxEntries=1 must actually bound the store. A naive low-water (== cap for
+ // maxEntries=1) made Evict() a no-op, so the store grew by one per distinct pthid without limit.
+ var budget = new CascadeBudgetStore(maxEntries: 1);
+ var handler = BuildHandler(budget: budget);
+ var store = new InMemoryThreadStateStore();
+
+ for (var i = 0; i < 200; i++)
+ {
+ var r = ProblemReportApi.Create(
+ from: "did:peer:a", to: "did:peer:b", code: "e.p.xfer.cant-use-endpoint", pthid: $"p-{i}");
+ await handler.HandleAsync(r, Ctx(r, store), CancellationToken.None);
+ }
+
+ budget.Count.Should().BeLessThanOrEqualTo(1, "the cap must hold even at maxEntries=1");
+ }
}