Leberkas-org · st0o0 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/docs/.vitepress/config.ts b/docs/.vitepress/config.ts
@@ -22,6 +22,7 @@ export default defineConfig({
 
         nav: [
             { text: 'Getting Started', link: '/getting-started/' },
+            { text: 'When to Use', link: '/when-to-use' },
             { text: 'Scenarios', link: '/scenarios' },
             { text: 'Client', link: '/client/' },
             { text: 'Server', link: '/server/' },
@@ -47,6 +48,16 @@ export default defineConfig({
                     text: 'Scenarios',
                     items: [
                         { text: 'Overview', link: '/scenarios' },
+                        { text: 'When to Use TurboHTTP', link: '/when-to-use' },
+                    ],
+                },
+            ],
+            '/when-to-use': [
+                {
+                    text: 'Scenarios',
+                    items: [
+                        { text: 'Overview', link: '/scenarios' },
+                        { text: 'When to Use TurboHTTP', link: '/when-to-use' },
                     ],
                 },
             ],

diff --git a/docs/when-to-use.md b/docs/when-to-use.md
@@ -0,0 +1,92 @@
+# When to Use TurboHTTP
+
+TurboHTTP is not a drop-in "faster HttpClient/Kestrel". It is an HTTP stack built on Akka.Streams
+whose strengths are **streaming, backpressure, large payloads under concurrency, and actor
+integration** — and whose trade-off is per-request overhead on tiny, latency-critical requests.
+This page summarizes where each side of the stack wins, based on the benchmark suite
+(BenchmarkDotNet, loopback, 2026-06).
+
+## TL;DR
+
+| Your workload | Recommendation |
+|---|---|
+| Many small GETs, lowest possible latency | HttpClient / Kestrel |
+| Large request bodies (uploads) under concurrency | **TurboHTTP client** (up to ~1.6× HttpClient on H2, ~3.5× on H3) |
+| Upload-heavy server endpoints (HTTP/1.1) | **TurboServer** (+10–34 % vs Kestrel) |
+| Streaming, SSE, backpressure end-to-end | **TurboHTTP (both sides)** |
+| Actor-based backends (Akka.NET) | **TurboServer** — shares your `ActorSystem` |
+| Bulk request pipelines (fire thousands, drain results) | **TurboHTTP client channel API** |
+
+## As a Client
+
+### Where it wins
+
+- **Concurrent uploads over HTTP/2 and HTTP/3.** With many in-flight POSTs, the multiplexed
+  upload path clearly beats `SocketsHttpHandler`: at 512–4096 concurrent 10 KB uploads the
+  benchmark shows **+12 % to +58 % (H2)** and **+123 % to +247 % (H3)** throughput, with up to
+  **84 % fewer allocations** (H2 at concurrency level CL=4096). Tail latency follows: p99 is
+  40–70 % lower in these scenarios.
+- **HTTP/1.1 uploads at scale** stay within ~30–40 % of HttpClient at high concurrency while
+  keeping memory bounded — the request body pump is backpressured against the socket instead
+  of buffering whole bodies.
+- **Resilience built into the pipeline.** Retries, reconnect with request replay, redirects,
+  cookies, HTTP caching, and content encoding are stream stages, not handler wrappers — and all
+  of it is observable through permanent `Servus.Senf` tracing.
+- **The channel API** (`client.Requests` / `client.Responses`) turns the client into a
+  backpressured pipeline: write thousands of requests, drain responses as they complete. Ideal
+  for crawlers, batch syncs, and fan-out jobs where aggregate throughput matters, not
+  per-request latency.
+
+### Where HttpClient is the better tool
+
+- **Single-request latency on light GETs.** A lone ~3 B GET costs ~150–160 µs vs HttpClient's
+  ~74 µs; light-GET fan-out at very high concurrency is also slower (H2/H3 light concurrent).
+- **The channel API has a latency floor** (~1.3–1.6 ms per isolated request) from its
+  stream-materialization hops — it amortizes over bulk work, not single calls.
+
+## As a Server
+
+### Where it wins
+
+- **HTTP/1.1 upload endpoints.** 1 MB POSTs run **+10 % to +34 %** faster than Kestrel
+  (sequential and CL=1 concurrent; +10–20 % at CL=64/256 sequential).
+- **HTTP/2 / HTTP/3 request handling at parity.** Plaintext/JSON/Fortunes sequential are within
+  ±5–15 % of Kestrel across protocols; several H2 concurrent scenarios (plaintext, JSON) are
+  ahead at p95/p99.
+- **Streaming responses with real backpressure.** Return an Akka Streams `Source` (SSE, long
+  downloads) and flow control runs end-to-end — a slow client slows the producer instead of
+  growing a buffer.
+- **Actor integration.** TurboServer reuses your `ActorSystem` from DI; HTTP connections and
+  domain actors share supervision, dispatchers, and tracing (see [Scenarios](/scenarios)).
+
+### Where Kestrel is the better tool
+
+- **Small-response throughput/latency records.** Plaintext/JSON-style endpoints are ~6–16 %
+  slower at p50 and allocate more per request (managed allocations are roughly 3–4× Kestrel's
+  2.7 KB; native/pooled buffers excluded on both sides).
+- **Very high fan-out on HTTP/3.** Light-request concurrency over QUIC currently trails Kestrel
+  significantly (-50 % to -74 %) — a known limitation of the shared pipeline, being worked on.
+
+## In Combination
+
+Running TurboHTTP on both ends pays off when the *pipeline* is the product:
+
+- **Service-to-service with large payloads.** TurboHTTP client → TurboServer keeps uploads
+  backpressured on both sides; neither end buffers whole bodies, so memory stays flat under
+  load spikes.
+- **End-to-end streaming.** An Akka Streams `Source` on the server feeds an Akka Streams
+  consumer on the client — one flow-controlled graph across the network, including SSE.
+- **Gateways and proxies.** Forward-proxy and CONNECT tunneling are supported; combined with
+  the channel API this makes backpressured relay/aggregation services straightforward.
+- **One ActorSystem everywhere.** Client stages, server connections, and your domain actors
+  share dispatchers, supervision, and `Servus.Senf` tracing categories — a single operational
+  surface from socket to business logic.
+
+## Benchmark Context
+
+Numbers above come from the repo's benchmark suite (`TurboHTTP.Benchmarks`): localhost loopback,
+BenchmarkDotNet, HTTP/1.1 + h2c cleartext, HTTP/3 with self-signed TLS, run 2026-06. Loopback
+isolates protocol-stack overhead and exaggerates per-request costs relative to real networks —
+over WAN latencies, the gaps on light requests shrink while the streaming/backpressure advantages
+remain. Memory figures count managed allocations only. Re-run with
+`dotnet run -c Release --project TurboHTTP.Benchmarks` to reproduce on your hardware.
diff --git a/notes/Bugs/H1-response-rate-monitor-leak.md b/notes/Bugs/H1-response-rate-monitor-leak.md
@@ -0,0 +1,75 @@
+---
+status: fixed
+component: Protocol/Http11/Server
+discovered: '2026-06-12'
+fixed: '2026-06-12'
+branch: release-next
+severity: critical
+tags:
+  - bug
+  - http11
+  - http10
+  - server
+  - data-rate
+  - connection-reset
+  - fixed
+---
+# H1.x Server Killed Idle Keep-Alive Connections (Response-Rate Entry Leak) — FIXED (2026-06-12)
+
+## Symptom
+
+The four zero-result rows in the 2026-06-12 server benchmark report (H1.1
+Plaintext/Fortunes/Upload_Concurrent) were BenchmarkDotNet (BDN) cases that failed with
+`SocketException 10054: connection forcibly closed by the remote host`. GET benchmarks
+mostly survived because SocketsHttpHandler silently retries requests that die on a reused
+connection before the response starts (BDN still measured 0.77 first-chance exceptions per
+op on Plaintext CL=256!); POST uploads are not retryable → the whole benchmark case errored.
+
+## Root cause
+
+`Http11ServerStateMachine.EmitBufferedBody` — the standard path for **buffered** response
+bodies (i.e. virtually every normal MapGet/MapPost response) — called
+`_responseRate.Observe(...)` per chunk but never `_responseRate.Remove(0)` on completion.
+Only the *streaming* response path removed the entry. The stale entry's measured rate
+decayed toward 0 B/s; once the connection sat idle on keep-alive longer than
+`MinResponseDataRateGracePeriod` (default 5 s), the periodic `data-rate-check` timer flagged
+a "violation" and set `ShouldComplete` → the server reset a perfectly healthy connection.
+
+Trace signature: `data rate violation (reqRate=0, respRate=1, paused=False)` at Warning level.
+
+Same leak in `Http10ServerStateMachine.HandleResponseBodyRead` (streaming completion,
+`bytesRead == 0` branch) — relevant for `Connection: keep-alive` H1.0 clients.
+H2/H3 are unaffected (per-stream entries removed in `CloseStream`).
+
+## Why it was intermittent
+
+Under continuous tight-loop load the Observe calls keep refreshing the rate, so no violation.
+BDN's pauses between iterations (and HttpClient's >64 pooled connections rotating in and out
+of use) created exactly the idle-past-grace windows. A 1000-round tight-loop repro stayed
+green while the same scenario under BDN failed — first-chance exception counting +
+Senf Warning tracing exposed it.
+
+## The fix
+
+- `EmitBufferedBody`: `_responseRate.Remove(0)` after `writer.CompleteAsync()`.
+- `Http10ServerStateMachine`: same removal in the streaming-completion branch.
+
+## Tests
+
+- `Http11DataRateSpec.Buffered_response_completion_should_not_flag_idle_keepalive_connection`
+  (FakeTimeProvider, buffered body via `TurboHttpResponseBodyFeature.Writer`, idle 10 s, two
+  data-rate-check fires → must NOT set ShouldComplete; failed pre-fix).
+- `Http10DataRateSpec.Completed_streaming_response_should_not_flag_idle_connection`.
+
+## Verification
+
+- Repro (HttpClient → TurboServer, 500×64 concurrent 1 MB uploads): pre-fix 13 connection
+  resets + violation traces; post-fix 0/0 across 32 000 uploads.
+- BDN `Upload_Concurrent` H1.1 CL=64/256: was NA (errored), post-fix produces results.
+
+## Lesson
+
+Rate-monitor entries are per-response state with mandatory removal on every completion path —
+buffered, streamed, and failed. When BDN shows a 0/NA row, check the per-case log for
+first-chance exceptions; HttpClient retry semantics can hide server connection-kills on GETs
+while only POST benchmarks fail.
diff --git a/notes/Bugs/H1.1-client-body-pump-backpressure.md b/notes/Bugs/H1.1-client-body-pump-backpressure.md
@@ -0,0 +1,65 @@
+---
+status: fixed
+component: Protocol/Http11/Client
+discovered: '2026-06-12'
+fixed: '2026-06-12'
+branch: release-next
+severity: high
+tags:
+  - bug
+  - http11
+  - client
+  - backpressure
+  - memory
+  - fixed
+---
+# H1.1 Client Body Pump Had No Backpressure — FIXED (2026-06-12)
+
+## Symptom
+
+Benchmark: TurboHTTP client allocated **~1 MB per 1 MB POST at CL=512** over HTTP/1.1
+(534 MB total vs HttpClient's 35 MB) and trailed HttpClient by 63% in throughput, while
+single-request heavy was fine (93 KB/op). Repro (in-process Kestrel + real client):
+
+| | no backpressure | HWM=2 (fix) |
+|---|---|---|
+| CL=1 | 770 K B/op, 104 req/s | 660 K B/op, 164 req/s |
+| CL=64 | 789 K B/op, 487 req/s | 287 K B/op, 1 016 req/s |
+| CL=512 | 1 001 K B/op, 738 req/s | 173 K B/op, 1 262 req/s |
+
+## Root cause
+
+`Http11ClientStateMachine.HandleBodyRead` pumped the next chunk immediately after
+`FlushAsync()` with no regard for whether the network had consumed the previous one.
+For memory-backed content the entire request body was copied into pooled 16–64 KB
+chunks instantly and parked in the connection stage's outbound queue. Under N concurrent
+uploads the aggregate rented working set (N × body size) far exceeded `ArrayPool`
+capacity, so nearly every body byte became a fresh allocation — and the resulting GC
+pressure also throttled throughput.
+
+## The fix
+
+High-water mark in `Http11ClientStateMachine`: count body chunks emitted but not yet
+flushed (`_unflushedBodyChunks`, HWM = 2). At the HWM the pump pauses instead of issuing
+the next read; the already-existing `IClientStateMachine.OnOutboundFlushed()` callback
+(invoked by `HttpConnectionStageLogic` on every push to the network out-port, previously
+ignored by H1.1) decrements and resumes. Counter incremented *before* `FlushAsync` because
+a free port flushes synchronously and re-enters `OnOutboundFlushed`. State reset in
+`StartBodyDrain`, `BodyReadFailed`, and `Cleanup`.
+
+HTTP/1.0 client is unaffected (it buffers the whole body into a single
+`BufferedBodyWriter` by design — no chunked framing in 1.0).
+
+## Tests
+
+- `TurboHTTP.Tests/Protocol/Syntax/Http11/Client/Http11ClientBodyBackpressureSpec.cs` —
+  counting body stream proves the pump stops issuing reads after the HWM without flush
+  signals (pre-fix: all 64 chunks pumped) and resumes on `OnOutboundFlushed` to complete
+  the full 1 MB body intact.
+
+## Follow-up ideas (not done)
+
+- Zero-copy fast path for visible-buffer `MemoryStream` content (mirror
+  `Http2ClientSessionManager` fast path A) would eliminate the remaining per-chunk copy.
+- The server response-body path has its own pause/resume; H2/H3 bound in-flight data via
+  flow-control windows — only H1.1 client lacked a bound.
diff --git a/notes/Bugs/H3-frame-buffer-leak.md b/notes/Bugs/H3-frame-buffer-leak.md
@@ -0,0 +1,61 @@
+---
+status: fixed
+component: Protocol/Http3
+discovered: '2026-06-12'
+fixed: '2026-06-12'
+branch: release-next
+severity: high
+tags:
+  - bug
+  - http3
+  - memory
+  - pooling
+  - fixed
+---
+# H3 Inbound Frame Buffer Leak — FIXED (2026-06-12)
+
+## Symptom
+
+Benchmark: TurboServer allocated **1.27 MB managed memory per 1 MB HTTP/3 upload** (Kestrel: 87 KB).
+Client side showed the same pathology for H3 response bodies. Unit repro measured 1.04 bytes
+allocated per body byte at steady state.
+
+## Root cause
+
+`FrameDecoder.DecodeDataFrame`/`DecodeHeadersFrame` (`Protocol/Syntax/Http3/FrameDecoder.cs`)
+copy each frame payload into a `MemoryPool<byte>.Shared` rental owned by the frame
+(`DataFrame`/`HeadersFrame` implement `IDisposable`). Neither consumer ever disposed the frames:
+
+- Server: `Http3ServerSessionManager.ProcessFrames` — handled frames in a `foreach`/`switch`, no dispose.
+- Client: `Http3ClientStateMachine.ProcessFrameData` — same.
+
+Rentals were never returned → the array pool drained permanently → every subsequent
+`Rent` allocated a fresh array → allocations ≈ full body size per request/response.
+
+## The fix
+
+1. Both frame loops dispose each frame in a per-frame `finally` after handling
+   (handling copies what it keeps: body bytes via `QueuedBodyReader.TryEnqueue`,
+   header strings via QPACK decode).
+2. **Prerequisite**: `QpackTableSync.TryDecodeOrBlock` used to retain the *caller's*
+   `ReadOnlyMemory<byte>` (aliasing the frame's pooled rental) in `_blockedStreams` —
+   disposal would have corrupted blocked header blocks on pool reuse. It now stores an
+   owned copy (`data.ToArray()`; blocked streams are rare and small).
+
+## Tests
+
+- `TurboHTTP.Tests/Protocol/Syntax/Http3/Server/SessionManager/Http3DataFrameBufferReleaseSpec.cs`
+  — allocation-budget spec (thread-local `GC.GetAllocatedBytesForCurrentThread`, warmup + steady
+  state, asserts < ¼ body size) + body round-trip integrity. Pre-fix: 4.35 MB for 4.19 MB body.
+- `TurboHTTP.Tests/Protocol/Syntax/Http3/Client/StateMachine/Http3ResponseFrameBufferReleaseSpec.cs`
+  — client-side equivalent. Pre-fix: 2.15 MB for 2.10 MB body.
+- `TurboHTTP.Tests/Protocol/Syntax/Http3/Qpack/QpackBlockedStreamBufferOwnershipSpec.cs`
+  — blocked header block must survive caller-buffer scribble.
+
+## Lesson
+
+Pooled-rent + copy paths must have an explicit owner with a deterministic dispose point.
+The same audit found the H1.1 client pump issue (see [[H1.1-client-body-pump-backpressure]]).
+A remaining (separate, optimization-level) issue: H3 still double-copies DATA payloads
+(FrameDecoder rental → QueuedBodyReader rental); H2 avoids the first copy by slicing its
+working buffer. Aligning H3 with H2 would cut another ~50% of transient copy traffic.
diff --git a/notes/Bugs/SendAsync-options-race.md b/notes/Bugs/SendAsync-options-race.md
@@ -0,0 +1,51 @@
+---
+status: fixed
+component: Client
+discovered: '2026-06-12'
+fixed: '2026-06-12'
+branch: release-next
+severity: high
+tags:
+  - bug
+  - client
+  - race-condition
+  - fixed
+---
+# SendAsync Mutated request.Options After Enqueue (Dictionary Corruption Race) — FIXED (2026-06-12)
+
+## Symptom
+
+Full benchmark run, `KestrelTurboSendAsyncConcurrentBenchmarks` H2 CL=4096: benchmark child
+process crashed (exit -1) with
+
+```
+InvalidOperationException: Operations that change non-concurrent collections must have
+exclusive access. ... at RequestEnricher.Enrich (line 88, Options.TryGetValue)
+→ MergeHub ProducerFailed → consumer ingress dies
+```
+
+## Root cause
+
+`TurboHttpClient.SendAsync` wrote the request into the channel (`Requests.WriteAsync`) and
+**afterwards** called `request.SetCancellationToken(cts.Token)` → `request.Options.Set(...)`
+on the caller thread. From the moment the request is enqueued, the pipeline's
+`RequestEnricher.Enrich` reads and mutates the same `HttpRequestOptions` (a plain
+`Dictionary<string, object?>`) on a MergeHub stream thread. Two unsynchronized writers →
+dictionary state corruption under high concurrency (CL=4096 reliably hit the window).
+
+## The fix
+
+Reordered `SendAsync`: the CTS is created (linked / pooled / fresh) and the cancellation
+token stamped into `request.Options` **before** `Requests.WriteAsync`. `CancelAfter` and the
+`UnsafeRegister` callback still happen after the write (they don't touch Options). Cleanup
+(TryReset/pool/dispose) moved to the outer finally so the early-created CTS is also released
+when the channel write throws.
+
+**Invariant to preserve:** nothing may touch `request.Options` after the channel write —
+the options dictionary is single-owner until enqueue, then owned by the stream side.
+
+## Tests
+
+- `TurboHttpClientSpec.SendAsync_should_set_cancellation_token_before_enqueueing` — a
+  capturing `ChannelWriter` asserts the token is present at the moment of `TryWrite`
+  (deterministic; failed pre-fix).
diff --git a/src/TurboHTTP.Tests.Shared/FakeServerOps.cs b/src/TurboHTTP.Tests.Shared/FakeServerOps.cs
@@ -15,11 +15,15 @@ internal sealed class FakeServerOps : IServerStageOperations
     public List<(string Name, TimeSpan Delay)> ScheduledTimers { get; } = [];
     public List<string> CancelledTimers { get; } = [];
 
+    /// <summary>Every OnScheduleTimer call in order, without the de-duplication applied to <see cref="ScheduledTimers"/>.</summary>
+    public List<(string Name, TimeSpan Delay)> ScheduleTimerCalls { get; } = [];
+
     public void OnRequest(IFeatureCollection features) => Requests.Add(features);
     public void OnOutbound(ITransportOutbound item) => Outbound.Add(item);
 
     public void OnScheduleTimer(string name, TimeSpan delay)
     {
+        ScheduleTimerCalls.Add((name, delay)); // raw call log: recorded before ScheduledTimers is collapsed to one entry per name below
         ScheduledTimers.RemoveAll(t => t.Name == name);
         ScheduledTimers.Add((name, delay));
     }