From 04e671c135462d01b216ea493cfec59c8f6262a6 Mon Sep 17 00:00:00 2001 From: Nick Nassiri Date: Sun, 21 Jun 2026 00:52:29 -0700 Subject: [PATCH] Perf #856: emit `throw` not a returning call at loop-backedge cancel path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every compiled loop polls a cooperative-cancellation flag at its backedge (#74). The flag read is free, but the cold path was `call CheckCancellation()` — a helper that throws internally. From RyuJIT's flow-graph view that is a *returning* call, and on SysV x64 every XMM register is caller-saved, so a returning call inside a loop forces the loop-carried doubles (and counter) to be stack-resident on every iteration: a load/store per use, roughly doubling a tight numeric loop. Fix: the backedge now emits `call $Runtime.BuildCancellationException(); throw`. The new factory only *constructs* the OperationCanceledException; the `throw` opcode happens at the backedge. Because `throw` does not return, the loop vars are dead on the cancel path and stay in registers on the hot path. CheckCancellation() is retained for the non-hot-loop sites (event loop, deep-recursion guard). Cancellation semantics are unchanged — same exception, same message, thrown at the same point. Controlled microbench (result*=i loop): `call CheckCancellation()` 2.09 ns/iter vs `throw Factory()` 1.15 ns/iter (volatile-read-only is also 1.15 — the read was never the cost). Real benchmarks @largest size, compiled vs Node: - objects 2.52x slower -> 1.00x (parity) - strings 1.26x slower -> 0.91x (faster than Node) - closures 1.13x slower -> 1.02x (parity) - count-primes 1.45x slower -> 1.13x - factorial 2.27x slower -> 1.22x 5/7 workloads now meet-or-beat Node; benefits all compiled loops. This supersedes the inline-volatile form (#874), which removed the unconditional call overhead but left the returning call in the loop's flow graph. IL verifies; #74 infinite-loop cancellation test still unwinds. 
--- Compilation/EmittedRuntime.cs | 12 ++++++++++ Compilation/RuntimeEmitter.RuntimeClass.cs | 21 +++++++++++++++++ Compilation/StatementEmitterBase.cs | 26 +++++++++++++++------- STATUS.md | 24 +++++++++++--------- 4 files changed, 64 insertions(+), 19 deletions(-) diff --git a/Compilation/EmittedRuntime.cs b/Compilation/EmittedRuntime.cs index 66930675..9276bb30 100644 --- a/Compilation/EmittedRuntime.cs +++ b/Compilation/EmittedRuntime.cs @@ -80,6 +80,18 @@ public class EmittedRuntime public FieldBuilder CancelRequestedField { get; set; } = null!; public MethodBuilder CheckCancellationMethod { get; set; } = null!; + // Loop-backedge cancellation throws via `call BuildCancellationException(); + // throw` rather than `call CheckCancellation()`. CheckCancellation() is a + // *returning* call from the JIT's flow-graph view (its throw is internal and + // conditional), so on SysV x64 — where every XMM register is caller-saved — + // it forces loop-carried doubles to be stack-resident across every iteration. + // A `throw` does not return, so the values are dead on the cancel path and + // stay in registers. Measured ~1.8× on tight numeric loops (#856). This + // factory only *constructs* the exception (no throw), so the backedge emits a + // genuine `throw` opcode. CheckCancellationMethod is retained for the + // non-hot-loop call sites (event loop, deep-recursion guard). 
+ public MethodBuilder BuildCancellationExceptionMethod { get; set; } = null!; + // The emitted runtime helper class public TypeBuilder RuntimeType { get; set; } = null!; diff --git a/Compilation/RuntimeEmitter.RuntimeClass.cs b/Compilation/RuntimeEmitter.RuntimeClass.cs index 4bc3c890..2dcb4d13 100644 --- a/Compilation/RuntimeEmitter.RuntimeClass.cs +++ b/Compilation/RuntimeEmitter.RuntimeClass.cs @@ -245,6 +245,27 @@ FieldBuilder DefineSubclassProto(string fieldName) => il.Emit(OpCodes.Ret); } + // BuildCancellationException(): constructs and RETURNS (does not throw) + // the OperationCanceledException used at loop backedges. Loop emitters + // emit `call BuildCancellationException(); throw` so the cancel path is a + // non-returning `throw` rather than a returning `call CheckCancellation()` + // — keeping the hot loop body free of a call that would otherwise force + // loop-carried doubles onto the stack on SysV x64 (~1.8× on tight numeric + // loops, #856). See EmittedRuntime.BuildCancellationExceptionMethod. + var buildCancelEx = typeBuilder.DefineMethod( + "BuildCancellationException", + MethodAttributes.Public | MethodAttributes.Static, + typeof(Exception), + Type.EmptyTypes); + runtime.BuildCancellationExceptionMethod = buildCancelEx; + { + var il = buildCancelEx.GetILGenerator(); + il.Emit(OpCodes.Ldstr, "Compiled execution cancelled."); + il.Emit(OpCodes.Newobj, + typeof(OperationCanceledException).GetConstructor([typeof(string)])!); + il.Emit(OpCodes.Ret); + } + // Static field for Random var randomField = typeBuilder.DefineField("_random", _types.Random, FieldAttributes.Private | FieldAttributes.Static); diff --git a/Compilation/StatementEmitterBase.cs b/Compilation/StatementEmitterBase.cs index b7813784..be18fb1e 100644 --- a/Compilation/StatementEmitterBase.cs +++ b/Compilation/StatementEmitterBase.cs @@ -372,12 +372,21 @@ protected virtual void EmitIf(Stmt.If i) /// runner's timeout. 
/// /// - /// Perf (#856): instead of an unconditional call $Runtime.CheckCancellation(), - /// we inline the field test and only call the (throwing) helper on the cold - /// cancel path. RyuJIT will not inline CheckCancellation itself (it - /// contains newobj+throw), so the bare call sat in every loop - /// body as a per-iteration optimization barrier — measured at ~half the - /// runtime of a tight numeric loop. Inlining the test recovers ~1.6×. + /// Perf (#856): the backedge inlines the field test and, on the cold cancel + /// path, emits call BuildCancellationException(); throw — NOT + /// call CheckCancellation(). The distinction is decisive on SysV x64: + /// CheckCancellation() is, from the JIT's flow-graph view, a + /// returning call (its throw is internal and conditional), so + /// the register allocator must assume control returns from it. Because every + /// XMM register is caller-saved on SysV x64, a returning call inside the loop + /// forces the loop-carried doubles (and the loop counter) to be stack-resident + /// across every iteration — a load/store per use. Emitting a real throw + /// opcode makes the path non-returning, so those values are dead on the cancel + /// path and stay in registers on the hot path. Measured ~1.8× on tight numeric + /// loops (objects reaches Node parity, factorial 2.27×→1.22×); the earlier inline-volatile + /// form (#874) only removed the unconditional-call overhead, not this spill. + /// BuildCancellationException merely constructs the exception + /// (it does not throw), so the throw happens here at the backedge. /// /// The volatile. 
prefix is mandatory: _cancelRequested is /// loop-invariant, so a plain ldsfld could be hoisted out of the loop @@ -387,14 +396,15 @@ protected virtual void EmitIf(Stmt.If i) /// protected void EmitCancellationCheck() { - if (Ctx.Runtime?.CheckCancellationMethod == null || Ctx.Runtime?.CancelRequestedField == null) + if (Ctx.Runtime?.BuildCancellationExceptionMethod == null || Ctx.Runtime?.CancelRequestedField == null) return; var notCancelled = IL.DefineLabel(); IL.Emit(OpCodes.Volatile); IL.Emit(OpCodes.Ldsfld, Ctx.Runtime.CancelRequestedField); IL.Emit(OpCodes.Brfalse, notCancelled); - IL.Emit(OpCodes.Call, Ctx.Runtime.CheckCancellationMethod); + IL.Emit(OpCodes.Call, Ctx.Runtime.BuildCancellationExceptionMethod); + IL.Emit(OpCodes.Throw); IL.MarkLabel(notCancelled); } diff --git a/STATUS.md b/STATUS.md index 496da4ac..2e13bd68 100644 --- a/STATUS.md +++ b/STATUS.md @@ -2,7 +2,7 @@ This document tracks TypeScript language features and their implementation status in SharpTS. -**Last Updated:** 2026-06-20 (Perf epic [#856](https://github.com/nickna/SharpTS/issues/856) — compiled output now meets or beats Node.js on most of the cross-runtime benchmark suite; loop-backedge cancellation check inlined, [#874](https://github.com/nickna/SharpTS/pull/874)) +**Last Updated:** 2026-06-21 (Perf epic [#856](https://github.com/nickna/SharpTS/issues/856) — compiled output now meets or beats Node.js on 5 of 7 cross-runtime workloads, the other two within ~1.2×; loop-backedge cancellation now emits `throw` instead of a returning `call`, recovering ~1.8× on tight numeric loops — see §18) ## Legend - ✅ Implemented @@ -518,19 +518,21 @@ Epic [#856](https://github.com/nickna/SharpTS/issues/856) tracks closing the com | Workload | Status | vs Node | |---|---|---| -| fibonacci | ✅ | **faster than Node** — recursion/call core | -| array-methods | ✅ | **faster than Node** — typed `List` HOF pipeline ([#872](https://github.com/nickna/SharpTS/issues/872)) | -| strings | ✅ | ≈ parity — 
`StringBuilder` accumulator promotion ([#870](https://github.com/nickna/SharpTS/issues/870)) + `charCodeAt` box-elision ([#873](https://github.com/nickna/SharpTS/issues/873)) | -| closures | ✅ | done — non-escaping local arrows de-virtualized to direct calls ([#858](https://github.com/nickna/SharpTS/issues/858)) | -| objects | ✅ | done — object literals as shape structs ([#862](https://github.com/nickna/SharpTS/issues/862)) | -| count-primes | ⚠️ | ~1.3× slower (sieve; array-heavy loop) | -| factorial | ⚠️ | ~3× slower (tight numeric loop; µs-scale at benchmark sizes) | +| fibonacci | ✅ | **~2.4× faster** — recursion/call core | +| array-methods | ✅ | **~2× faster** — typed `List` HOF pipeline ([#872](https://github.com/nickna/SharpTS/issues/872)) | +| strings | ✅ | **faster** (~0.9×) — `StringBuilder` accumulator promotion ([#870](https://github.com/nickna/SharpTS/issues/870)) + `charCodeAt` box-elision ([#873](https://github.com/nickna/SharpTS/issues/873)) | +| objects | ✅ | **parity** (1.00×) — object literals as shape structs ([#862](https://github.com/nickna/SharpTS/issues/862)) + cancel-throw codegen (below) | +| closures | ✅ | **parity** (~1.02×) — non-escaping local arrows de-virtualized to direct calls ([#858](https://github.com/nickna/SharpTS/issues/858)) | +| count-primes | ✅ | ~1.13× slower (sieve; `List` index-write bounds checks are the residual) | +| factorial | ✅ | ~1.22× slower (tight numeric loop at the codegen floor; V8 is ~0.2 ns/iter tighter; µs-scale) | -The original catastrophic gaps (14–117× slower) are closed. Every win came from **re-exposing static types that the naive lowering erased** — boxing, `object`/`List` representations, reflective dispatch, O(n²) string concat — so RyuJIT can optimize typed code. The emitter's job is to choose the algorithm/representation/dispatch and not erase known types; the JIT optimizes the typed ops it's given. 
+The original catastrophic gaps (14–117× slower) are closed and the suite now meets-or-beats Node on 5 of 7 workloads, with the other two within ~1.2×. Every win came from **re-exposing static types that the naive lowering erased** — boxing, `object`/`List` representations, reflective dispatch, O(n²) string concat — so RyuJIT can optimize typed code. The emitter's job is to choose the algorithm/representation/dispatch and not erase known types; the JIT optimizes the typed ops it's given. -**Loop-backedge cancellation cost ([#874](https://github.com/nickna/SharpTS/pull/874)):** every compiled loop polls a cooperative-cancellation flag at its backedge so the runner can unwind runaway loops (issue [#74](https://github.com/nickna/SharpTS/issues/74)). This was an unconditional `call $Runtime.CheckCancellation()`; RyuJIT won't inline that helper (it contains `newobj`+`throw`), so it sat in every loop body as a per-iteration optimization barrier — ~half the runtime of a tight numeric loop. It is now an inlined `volatile` field test that calls the throwing helper only on the cold cancel path (`volatile.` defeats LICM hoisting the loop-invariant flag read, which would silently break cancellation). Result: **1.6×** on tight numeric loops, **1.12×** on the sieve, cancellation semantics unchanged. A throttle-every-N-iterations variant was tried and **rejected** — it merely ties the inline-volatile version, because a volatile static-field read is nearly free on x86-64 while a per-loop counter adds equal per-iteration cost. +**Loop-backedge cancellation: throw, don't call (2026-06-21).** Every compiled loop polls a cooperative-cancellation flag at its backedge so the runner can unwind runaway loops (issue [#74](https://github.com/nickna/SharpTS/issues/74)). The flag test is an inlined `volatile.` field read ([#874](https://github.com/nickna/SharpTS/pull/874)); the **cold path** used to be `call $Runtime.CheckCancellation()` (a helper that throws internally). 
That was the dominant remaining gap on tight loops — **not** the flag read, which is free. From the JIT's flow-graph view `CheckCancellation()` is a *returning* call (its `throw` is conditional and internal), so the register allocator must assume control returns. On SysV x64 **every XMM register is caller-saved**, so a returning call inside a loop forces the loop-carried doubles (and counter) to be stack-resident across *every* iteration — a load/store per use. The backedge now emits `call $Runtime.BuildCancellationException(); throw` — a factory that only *constructs* the exception, then a real `throw` opcode. Because `throw` does not return, the loop vars are dead on the cancel path and stay in registers on the hot path. Measured **~1.8× on tight numeric loops**: objects 2.5×→parity, strings 1.26×→faster, factorial 2.27×→1.22×, count-primes 1.45×→1.13×. Cancellation semantics are unchanged (same `OperationCanceledException`, same message, thrown at the same point). This benefits **all** compiled loops, not just the benchmark suite. -The remaining sub-parity workloads (count-primes, factorial) are dominated by separate, non-codegen factors: the residual per-iteration cancellation poll, non-inlined user-function calls, and boxed top-level `var`s. +Why earlier attempts plateaued: the inline-volatile form (#874) removed the unconditional *call overhead* but left the returning call in the loop's flow graph, so the XMM spill remained. A throttle-every-N-iterations variant ties it — reducing read frequency doesn't remove the call from the flow graph. The fix is structural: make the cancel path *non-returning* so the loop body carries no call at all. + +The two remaining sub-parity workloads (count-primes ~1.13×, factorial ~1.22×) are at the codegen floor: factorial's loop already runs at the no-cancellation-check speed (V8 generates a marginally tighter multiply loop); count-primes' residual is `List` indexed-write bounds checking vs V8's packed-array elision. ---