From 98e7f8e7552783b498612ddc7c7a05dcfc98a9d4 Mon Sep 17 00:00:00 2001 From: Mark Scherer Date: Sat, 6 Jun 2026 18:06:57 +0200 Subject: [PATCH 1/6] Align heading-id slugs with jgm/djot#393; add opt-in asciiHeadingIds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Default heading-id generation now follows the settled #393 rule: replace each maximal run of non-alphanumeric ASCII with a single '-' and trim leading/trailing '-', preserving letter case and all non-ASCII characters. This drops the previous always-on ASCII transliteration and the '_' exception, so 'Über café' becomes 'Über-café', 'under_score' becomes 'under-score', 'a--b' becomes 'a-b'. A leading-digit result keeps the 'h-' prefix for CSS-selector safety (orthogonal to #393, which governs punctuation only). ASCII transliteration is now opt-in via a new asciiHeadingIds option on DjotConverter, threaded to both the renderer and the parser's reference-resolution pass so heading ids stay in parity. With it enabled, 'Über café' becomes 'uber-cafe'. The official djot test suite is unchanged (its id cases are simple ASCII). 
--- src/DjotConverter.php | 5 +- src/Parser/BlockParser.php | 13 +++- src/Renderer/HeadingIdTracker.php | 69 +++++++++++-------- src/Renderer/HtmlRenderer.php | 4 +- src/Renderer/RenderContext.php | 4 +- tests/TestCase/DjotConverterTest.php | 13 ++-- .../HeadingReferenceExtensionTest.php | 7 +- .../Renderer/HeadingIdTrackerTest.php | 56 ++++++++------- 8 files changed, 102 insertions(+), 69 deletions(-) diff --git a/src/DjotConverter.php b/src/DjotConverter.php index e4aaf702..f5057be1 100644 --- a/src/DjotConverter.php +++ b/src/DjotConverter.php @@ -146,6 +146,7 @@ public function __construct( bool $nestedBlocksInLists = false, bool $blocksInterruptParagraphs = false, bool $nestedListsWithoutBlankLine = false, + bool $asciiHeadingIds = false, ) { $this->collectWarnings = $warnings; $this->strictMode = $strict; @@ -154,14 +155,14 @@ public function __construct( if ($parser !== null) { $this->parser = $parser; } else { - $this->parser = new BlockParser($warnings, $strict, $significantNewlines, $nestedBlocksInLists, $blocksInterruptParagraphs, $nestedListsWithoutBlankLine); + $this->parser = new BlockParser($warnings, $strict, $significantNewlines, $nestedBlocksInLists, $blocksInterruptParagraphs, $nestedListsWithoutBlankLine, $asciiHeadingIds); } // Use provided renderer or create one from parameters if ($renderer !== null) { $this->renderer = $renderer; } else { - $this->renderer = new HtmlRenderer($xhtml); + $this->renderer = new HtmlRenderer($xhtml, $asciiHeadingIds); // Configure safe mode $this->setSafeMode($safeMode); diff --git a/src/Parser/BlockParser.php b/src/Parser/BlockParser.php index f4ba7ae2..d31c3d27 100644 --- a/src/Parser/BlockParser.php +++ b/src/Parser/BlockParser.php @@ -193,6 +193,13 @@ class BlockParser */ protected bool $nestedListsWithoutBlankLine = false; + /** + * When true, heading ids are ASCII-folded (opt-in). 
Mirrors the renderer's + * HeadingIdTracker setting so parser-side id computation (heading reference + * resolution) matches the rendered ids. + */ + protected bool $asciiHeadingIds = false; + public function __construct( bool $collectWarnings = false, bool $strictMode = false, @@ -200,9 +207,11 @@ public function __construct( bool $nestedBlocksInLists = false, bool $blocksInterruptParagraphs = false, bool $nestedListsWithoutBlankLine = false, + bool $asciiHeadingIds = false, ) { $this->collectWarnings = $collectWarnings; $this->strictMode = $strictMode; + $this->asciiHeadingIds = $asciiHeadingIds; // significantNewlines is the deprecated union of blocksInterruptParagraphs // and nestedListsWithoutBlankLine (NOT the broad nestedBlocksInLists). $this->blocksInterruptParagraphs = $blocksInterruptParagraphs || $significantNewlines; @@ -697,7 +706,7 @@ protected function extractAbbreviations(array $lines): void */ protected function extractHeadingReferences(array $lines): void { - $headingIdTracker = new HeadingIdTracker(); + $headingIdTracker = new HeadingIdTracker(null, $this->asciiHeadingIds); $pendingId = null; $count = count($lines); @@ -780,7 +789,7 @@ protected function extractHeadingReferences(array $lines): void */ protected function rewriteHeadingReferences(Document $document): void { - $tracker = new HeadingIdTracker(); + $tracker = new HeadingIdTracker(null, $this->asciiHeadingIds); $tracker->reserveExplicitIds($document); /** @var array $newUrlByLabel */ diff --git a/src/Renderer/HeadingIdTracker.php b/src/Renderer/HeadingIdTracker.php index 61bab9fb..0314667c 100644 --- a/src/Renderer/HeadingIdTracker.php +++ b/src/Renderer/HeadingIdTracker.php @@ -57,8 +57,17 @@ class HeadingIdTracker protected AsciiTransliterator $transliterator; - public function __construct(?AsciiTransliterator $transliterator = null) - { + /** + * @param \Djot\Renderer\AsciiTransliterator|null $transliterator + * @param bool $asciiHeadingIds When true, transliterate heading text to 
ASCII + * (Über -> Uber, café -> cafe) before slugging, for maximum URL/CSS + * portability. Off by default: per jgm/djot#393 the identifier preserves + * non-ASCII characters. + */ + public function __construct( + ?AsciiTransliterator $transliterator = null, + protected bool $asciiHeadingIds = false, + ) { $this->transliterator = $transliterator ?? new AsciiTransliterator(); } @@ -115,40 +124,42 @@ public function reserveExplicitIds(Node $node): void } /** - * Normalize text into a valid, link-safe CSS identifier string - * - * 1. Transliterate to ASCII (Über → Uber, café → cafe, Привет → Privet), - * so the ID survives being shared as a URL fragment through - * auto-linkers that truncate or mangle non-ASCII - * 2. Strip # characters entirely - * 3. Trim whitespace - * 4. Replace whitespace sequences with single dashes - * 5. Replace any remaining characters invalid in CSS identifiers - * (anything other than letters, numbers, hyphens, and underscores) - * with dashes - * 6. Collapse consecutive dashes and trim leading/trailing dashes - * 7. Prefix with 'h-' if the result starts with a digit, ensuring a valid - * CSS ident start (digits are not allowed as the first character) + * Normalize heading text into an identifier (jgm/djot#393) * - * Returns '' when nothing usable remains (e.g. all-punctuation text, or a - * script the transliterator cannot reduce to ASCII); the caller then - * falls back to a generated `s-N` id. + * 1. (Opt-in only, when $asciiHeadingIds) transliterate to ASCII + * (Über → Uber, café → cafe, Привет → Privet) for maximum URL/CSS + * portability. + * 2. Replace each maximal run of non-alphanumeric ASCII with a single '-' + * (covers whitespace, punctuation, '_', and runs of '-'); non-ASCII + * characters and letter case are preserved. + * 3. Trim leading/trailing '-'. + * 4. Prefix with 'h-' if the result starts with a digit, so the id is a + * valid bare CSS selector (querySelector('#9-x') would otherwise throw). 
+ * This is orthogonal to #393, which governs punctuation only. * - * Producing a valid CSS identifier ensures that consumers such as HTMX, - * which call `querySelector` with the section ID for scroll-restoration, - * do not throw a SyntaxError when headings contain inline code or special - * characters (e.g. `$this->t($key, $params = [], $fallback = '')`). + * Returns '' when nothing usable remains (all-punctuation text, or - in the + * opt-in ASCII mode - a script the transliterator cannot reduce to ASCII); + * the caller then falls back to a generated `s-N` id. */ public function normalizeId(string $text): string { - $id = $this->transliterator->transliterate($text); - $id = str_replace('#', '', $id); - $id = trim($id); - $id = preg_replace('/\s+/u', '-', $id) ?? $id; - $id = preg_replace('/[^\p{L}\p{N}_-]+/u', '-', $id) ?? $id; - $id = preg_replace('/-{2,}/', '-', $id) ?? $id; + $id = $text; + + // Opt-in: fold to ASCII first (Über -> Uber) for URL/CSS portability. + if ($this->asciiHeadingIds) { + $id = $this->transliterator->transliterate($id); + } + + // jgm/djot#393: replace each maximal run of non-alphanumeric ASCII with a + // single '-' and trim leading/trailing '-'. Non-ASCII characters (unicode + // letters/marks) and case are preserved; this also collapses spaces, + // punctuation, '_' and runs of '-'. + $id = preg_replace('/[^0-9A-Za-z\x{0080}-\x{10FFFF}]+/u', '-', $id) ?? $id; $id = trim($id, '-'); + // A leading digit is a valid HTML id but an invalid bare CSS selector + // (e.g. querySelector('#9-x') throws), so prefix 'h-'. This is orthogonal + // to #393, which governs punctuation, not the leading-digit case. if ($id !== '' && preg_match('/^\p{N}/u', $id)) { $id = 'h-' . 
$id; } diff --git a/src/Renderer/HtmlRenderer.php b/src/Renderer/HtmlRenderer.php index b068e3b9..ff3e2ed7 100644 --- a/src/Renderer/HtmlRenderer.php +++ b/src/Renderer/HtmlRenderer.php @@ -97,9 +97,9 @@ class HtmlRenderer implements RendererInterface */ protected const OL_ONLY_ATTRIBUTES = ['start', 'type', 'reversed']; - public function __construct(protected bool $xhtml = false) + public function __construct(protected bool $xhtml = false, bool $asciiHeadingIds = false) { - $this->sharedRenderContext = new RenderContext(); + $this->sharedRenderContext = new RenderContext(asciiHeadingIds: $asciiHeadingIds); $this->initNodeRenderers(); } diff --git a/src/Renderer/RenderContext.php b/src/Renderer/RenderContext.php index bde8e2a3..f58c3aa2 100644 --- a/src/Renderer/RenderContext.php +++ b/src/Renderer/RenderContext.php @@ -44,9 +44,9 @@ class RenderContext */ public array $inlineFootnoteRenderers = []; - public function __construct(?HeadingIdTracker $headingIdTracker = null) + public function __construct(?HeadingIdTracker $headingIdTracker = null, bool $asciiHeadingIds = false) { - $this->headingIdTracker = $headingIdTracker ?? new HeadingIdTracker(); + $this->headingIdTracker = $headingIdTracker ?? new HeadingIdTracker(null, $asciiHeadingIds); } public function reset(): void diff --git a/tests/TestCase/DjotConverterTest.php b/tests/TestCase/DjotConverterTest.php index e5ab9633..6e6619f5 100644 --- a/tests/TestCase/DjotConverterTest.php +++ b/tests/TestCase/DjotConverterTest.php @@ -1919,15 +1919,16 @@ public function testUnicodeInHeading(): void $djot = '# 日本語の見出し'; $result = $this->converter->convert($djot); - // The visible heading text is unchanged; only the ID is made - // ASCII-safe so it survives being shared as a URL fragment. + // jgm/djot#393: non-ASCII heading text is preserved in the id. $this->assertStringContainsString('

日本語の見出し

', $result); - $this->assertStringNotContainsString('id="日本語の見出し"', $result); - $this->assertMatchesRegularExpression('/
/', $result); + $this->assertStringContainsString('id="日本語の見出し"', $result); + // The opt-in asciiHeadingIds mode folds the id to ASCII for URL/CSS + // portability. + $ascii = (new DjotConverter(asciiHeadingIds: true))->convert($djot); + $this->assertMatchesRegularExpression('/
/', $ascii); if (class_exists(Transliterator::class)) { - // With ext-intl the CJK heading is romanized rather than dropped. - $this->assertStringContainsString('
', $result); + $this->assertStringContainsString('
', $ascii); } } diff --git a/tests/TestCase/Extension/HeadingReferenceExtensionTest.php b/tests/TestCase/Extension/HeadingReferenceExtensionTest.php index 15cfcc6a..46902dfc 100644 --- a/tests/TestCase/Extension/HeadingReferenceExtensionTest.php +++ b/tests/TestCase/Extension/HeadingReferenceExtensionTest.php @@ -174,14 +174,15 @@ public function testHeadingWithSmartQuotesMatchesStraightQuoteReference(): void // The parser converts straight quotes to smart quotes in heading text, // but reference targets keep straight quotes. The extension normalizes - // quotes for matching so this should resolve correctly. + // quotes for matching so the reference still resolves; the resulting id + // keeps the smart quotes (jgm/djot#393 preserves non-ASCII characters). $html = $converter->convert(<<<'DJOT' See [[Say "Hello"]]. # Say "Hello" DJOT); - $this->assertStringContainsString('href="#Say-Hello"', $html); + $this->assertStringContainsString('href="#Say-“Hello”"', $html); $this->assertStringNotContainsString('[[Say "Hello"]]', $html); } @@ -210,7 +211,7 @@ public function testHeadingWithApostropheResolvesCorrectly(): void # Bob's Guide DJOT); - $this->assertStringContainsString('href="#Bob-s-Guide"', $html); + $this->assertStringContainsString('href="#Bob’s-Guide"', $html); $this->assertStringNotContainsString('data-heading-ref=', $html); $this->assertStringNotContainsString('[[Bob\'s Guide]]', $html); } diff --git a/tests/TestCase/Renderer/HeadingIdTrackerTest.php b/tests/TestCase/Renderer/HeadingIdTrackerTest.php index 22a27ba0..75f8ba13 100644 --- a/tests/TestCase/Renderer/HeadingIdTrackerTest.php +++ b/tests/TestCase/Renderer/HeadingIdTrackerTest.php @@ -175,38 +175,48 @@ public function testNormalizeId(): void $this->assertSame('Multiple-Spaces', $this->tracker->normalizeId('Multiple Spaces')); $this->assertSame('this-t-key-params-fallback', $this->tracker->normalizeId("\$this->t(\$key, \$params = [], \$fallback = '')")); $this->assertSame('My-title', 
$this->tracker->normalizeId('My --- title')); - // Non-ASCII is transliterated to keep shared anchors link-safe; the - // Latin/Cyrillic output is deterministic with or without ext-intl. - $this->assertSame('Privet-mir', $this->tracker->normalizeId('Привет мир')); + // jgm/djot#393: non-ASCII is preserved (case kept), not transliterated. + $this->assertSame('Привет-мир', $this->tracker->normalizeId('Привет мир')); $this->assertSame('', $this->tracker->normalizeId('###')); $this->assertSame('h-123-Things', $this->tracker->normalizeId('123 Things')); $this->assertSame('h-1-Introduction', $this->tracker->normalizeId('1. Introduction')); } /** - * Pins djot-php's heading-ID behaviour around jgm/djot#391. - * - * djot-php replaces (not removes) mid-word punctuation, additionally - * replaces apostrophes / quotes / `;` / `:` so IDs are valid CSS - * identifiers, and transliterates non-ASCII to ASCII so the IDs survive - * being shared as URL fragments through auto-linkers. All cases below - * are deterministic with or without ext-intl. + * Pins djot-php's heading-ID behaviour to jgm/djot#393: each maximal run of + * non-alphanumeric ASCII is replaced with `-` and leading/trailing `-` are + * trimmed. Case and non-ASCII characters (Cyrillic, accented Latin, smart + * quotes) are preserved; `_` is replaced (no longer an exception). A + * leading-digit result keeps the `h-` prefix for CSS-selector safety + * (orthogonal to #393). ASCII-folding is opt-in (asciiHeadingIds). 
*/ public function testNormalizeIdSpecAlignmentEdgeCases(): void { $this->assertSame('A-B-C', $this->tracker->normalizeId('A+B=C')); $this->assertSame('Emphasis-strong', $this->tracker->normalizeId('Emphasis/strong')); $this->assertSame('That-s-all', $this->tracker->normalizeId("That's all")); - $this->assertSame('That-s-all', $this->tracker->normalizeId('That’s all')); + $this->assertSame('That’s-all', $this->tracker->normalizeId('That’s all')); $this->assertSame('foo-bar', $this->tracker->normalizeId('foo...bar')); $this->assertSame('Uber-uns', $this->tracker->normalizeId('Uber uns')); - $this->assertSame('Uber-uns', $this->tracker->normalizeId('Über uns')); - $this->assertSame('cafe-resume', $this->tracker->normalizeId('café résumé')); - $this->assertSame('Strasse', $this->tracker->normalizeId('Straße')); + $this->assertSame('Über-uns', $this->tracker->normalizeId('Über uns')); + $this->assertSame('café-résumé', $this->tracker->normalizeId('café résumé')); + $this->assertSame('Straße', $this->tracker->normalizeId('Straße')); $this->assertSame('h-2024-recap', $this->tracker->normalizeId('2024 recap')); $this->assertSame('', $this->tracker->normalizeId('!!!')); } + /** + * The opt-in asciiHeadingIds mode transliterates non-ASCII to ASCII before + * slugging, for maximum URL/CSS portability. + */ + public function testAsciiHeadingIdsOptInTransliterates(): void + { + $ascii = new HeadingIdTracker(null, asciiHeadingIds: true); + $this->assertSame('Privet-mir', $ascii->normalizeId('Привет мир')); + $this->assertSame('Uber-uns', $ascii->normalizeId('Über uns')); + $this->assertSame('cafe-resume', $ascii->normalizeId('café résumé')); + } + public function testGetPlainText(): void { $heading = new Heading(2); @@ -370,28 +380,28 @@ public function testSymbolsRetainedInPlainText(): void } /** - * djot.js keeps `_` (it is not in its punctuation denylist) and it is a - * valid CSS identifier character, so djot-php keeps it too. 
This pins the - * deliberate divergence from the looser #393 spec prose. + * jgm/djot#393 removes the per-character exceptions: `_` is non-alphanumeric + * ASCII, so it is replaced with `-` like any other punctuation. */ - public function testUnderscoreRetainedInId(): void + public function testUnderscoreReplacedInId(): void { $heading = new Heading(2); $heading->appendChild(new Text('foo_bar baz')); $id = $this->tracker->getIdForHeading($heading); - $this->assertSame('foo_bar-baz', $id); + $this->assertSame('foo-bar-baz', $id); } /** - * When transliteration removes the entire heading text (a script outside - * the baked map, no ext-intl), the heading must fall back to a stable - * generated `s-N` id — not the legacy `heading` sentinel. + * In the opt-in asciiHeadingIds mode, when transliteration removes the + * entire heading text (a script outside the baked map, no ext-intl), the + * heading must fall back to a stable generated `s-N` id. (By default the + * non-ASCII text is preserved instead, per #393, so no fallback occurs.) 
*/ public function testHeadingThatTransliteratesToNothingGetsFallbackId(): void { - $tracker = new HeadingIdTracker(new AsciiTransliterator(useIntl: false)); + $tracker = new HeadingIdTracker(new AsciiTransliterator(useIntl: false), asciiHeadingIds: true); $cjk = new Heading(2); $cjk->appendChild(new Text('日本語の見出し')); From cfd201155f98758b79ea461c90ec5140cf5c0d93 Mon Sep 17 00:00:00 2001 From: Mark Scherer Date: Sat, 6 Jun 2026 18:08:53 +0200 Subject: [PATCH 2/6] Add asciiHeadingIds to DjotConverter constructor docblock (phpcs) --- src/DjotConverter.php | 1 + 1 file changed, 1 insertion(+) diff --git a/src/DjotConverter.php b/src/DjotConverter.php index f5057be1..937e1f0a 100644 --- a/src/DjotConverter.php +++ b/src/DjotConverter.php @@ -131,6 +131,7 @@ public static function ansi(?BlockParser $parser = null): self * @param bool $nestedBlocksInLists Allow nested blocks in list items without blank lines (deprecated; prefer blocksInterruptParagraphs + nestedListsWithoutBlankLine) * @param bool $blocksInterruptParagraphs Allow top-level block elements to interrupt paragraphs without a blank line * @param bool $nestedListsWithoutBlankLine Allow sublists to nest in list items without a blank line + * @param bool $asciiHeadingIds */ public function __construct( bool $xhtml = false, From 37bb9a136f33b610977c4db75226b7efab0e0339 Mon Sep 17 00:00:00 2001 From: Mark Scherer Date: Sat, 6 Jun 2026 19:12:42 +0200 Subject: [PATCH 3/6] Make ASCII heading ids a pluggable transform + extension; unify prefix to s- Replace the baked-in asciiHeadingIds bool with a pluggable id transform on HeadingIdTracker (Closure(string): string). The core stays pure jgm/djot#393 (unicode-preserving); ASCII folding now ships as AsciiHeadingIdsExtension, which sets the transform on both the renderer's tracker and the parser's heading-reference resolution so section ids and [Heading][] link targets stay in parity. 
The transform runs over the spec id and is re-slugged afterwards, so a transform that reintroduces separators (e.g. CJK romanization "ri ben yu") still yields a clean id. Also unify the leading-digit prefix from h- to s- so it matches the empty-heading s-N fallback (one prefix convention). --- src/DjotConverter.php | 6 +- src/Extension/AsciiHeadingIdsExtension.php | 47 +++++++++++ src/Parser/BlockParser.php | 29 +++++-- src/Renderer/HeadingIdTracker.php | 80 ++++++++++--------- src/Renderer/HtmlRenderer.php | 4 +- src/Renderer/RenderContext.php | 4 +- tests/TestCase/DjotConverterTest.php | 8 +- .../AsciiHeadingIdsExtensionTest.php | 42 ++++++++++ .../Renderer/HeadingIdTrackerTest.php | 26 +++--- 9 files changed, 176 insertions(+), 70 deletions(-) create mode 100644 src/Extension/AsciiHeadingIdsExtension.php create mode 100644 tests/TestCase/Extension/AsciiHeadingIdsExtensionTest.php diff --git a/src/DjotConverter.php b/src/DjotConverter.php index 937e1f0a..e4aaf702 100644 --- a/src/DjotConverter.php +++ b/src/DjotConverter.php @@ -131,7 +131,6 @@ public static function ansi(?BlockParser $parser = null): self * @param bool $nestedBlocksInLists Allow nested blocks in list items without blank lines (deprecated; prefer blocksInterruptParagraphs + nestedListsWithoutBlankLine) * @param bool $blocksInterruptParagraphs Allow top-level block elements to interrupt paragraphs without a blank line * @param bool $nestedListsWithoutBlankLine Allow sublists to nest in list items without a blank line - * @param bool $asciiHeadingIds */ public function __construct( bool $xhtml = false, @@ -147,7 +146,6 @@ public function __construct( bool $nestedBlocksInLists = false, bool $blocksInterruptParagraphs = false, bool $nestedListsWithoutBlankLine = false, - bool $asciiHeadingIds = false, ) { $this->collectWarnings = $warnings; $this->strictMode = $strict; @@ -156,14 +154,14 @@ public function __construct( if ($parser !== null) { $this->parser = $parser; } else { - $this->parser = new 
BlockParser($warnings, $strict, $significantNewlines, $nestedBlocksInLists, $blocksInterruptParagraphs, $nestedListsWithoutBlankLine, $asciiHeadingIds); + $this->parser = new BlockParser($warnings, $strict, $significantNewlines, $nestedBlocksInLists, $blocksInterruptParagraphs, $nestedListsWithoutBlankLine); } // Use provided renderer or create one from parameters if ($renderer !== null) { $this->renderer = $renderer; } else { - $this->renderer = new HtmlRenderer($xhtml, $asciiHeadingIds); + $this->renderer = new HtmlRenderer($xhtml); // Configure safe mode $this->setSafeMode($safeMode); diff --git a/src/Extension/AsciiHeadingIdsExtension.php b/src/Extension/AsciiHeadingIdsExtension.php new file mode 100644 index 00000000..944d825e --- /dev/null +++ b/src/Extension/AsciiHeadingIdsExtension.php @@ -0,0 +1,47 @@ + Uber, café -> cafe, Привет -> + * Privet) for maximum URL/CSS-fragment portability. + * + * By default djot-php generates spec-faithful ids (jgm/djot#393) that preserve + * non-ASCII characters. Adding this extension layers an ASCII transliteration on top + * of that, as a pluggable id transform - it does not fork the core slugger. + * + * The transform is wired to BOTH the renderer's HeadingIdTracker and the parser's + * heading-reference resolution pass, so `
` values and implicit + * `[Heading][]` link targets stay in parity. + */ +class AsciiHeadingIdsExtension implements ExtensionInterface +{ + /** + * @param bool|null $useIntl Force the transliteration engine; null auto-detects + * ext-intl (ICU) and otherwise uses the built-in baked map. + */ + public function __construct(protected ?bool $useIntl = null) + { + } + + public function register(DjotConverter $converter): void + { + $transliterator = new AsciiTransliterator($this->useIntl); + $transform = static fn (string $id): string => $transliterator->transliterate($id); + + // Renderer side (section ids). getHeadingIdTracker() only exists for HTML. + if ($converter->getRenderer() instanceof HtmlRenderer) { + $converter->getHeadingIdTracker()->setIdTransformer($transform); + } + + // Parser side (implicit [Heading][] reference resolution) - keeps the link + // targets identical to the rendered section ids. + $converter->getParser()->setHeadingIdTransformer($transform); + } +} diff --git a/src/Parser/BlockParser.php b/src/Parser/BlockParser.php index d31c3d27..aae31b00 100644 --- a/src/Parser/BlockParser.php +++ b/src/Parser/BlockParser.php @@ -4,6 +4,7 @@ namespace Djot\Parser; +use Closure; use Djot\Exception\ParseException; use Djot\Exception\ParseWarning; use Djot\Node\Block\BlockQuote; @@ -194,11 +195,13 @@ class BlockParser protected bool $nestedListsWithoutBlankLine = false; /** - * When true, heading ids are ASCII-folded (opt-in). Mirrors the renderer's - * HeadingIdTracker setting so parser-side id computation (heading reference - * resolution) matches the rendered ids. + * Optional heading-id transform, mirrored from the renderer's + * HeadingIdTracker so parser-side id computation (heading reference + * resolution) matches the rendered ids. Set by AsciiHeadingIdsExtension. 
+ * + * @var (\Closure(string): string)|null + */ - protected bool $asciiHeadingIds = false; + protected ?Closure $headingIdTransformer = null; public function __construct( bool $collectWarnings = false, @@ -207,11 +210,9 @@ public function __construct( bool $nestedBlocksInLists = false, bool $blocksInterruptParagraphs = false, bool $nestedListsWithoutBlankLine = false, - bool $asciiHeadingIds = false, ) { $this->collectWarnings = $collectWarnings; $this->strictMode = $strictMode; - $this->asciiHeadingIds = $asciiHeadingIds; // significantNewlines is the deprecated union of blocksInterruptParagraphs // and nestedListsWithoutBlankLine (NOT the broad nestedBlocksInLists). $this->blocksInterruptParagraphs = $blocksInterruptParagraphs || $significantNewlines; @@ -223,6 +224,18 @@ public function __construct( $this->fencedBlockParser = new FencedBlockParser(); } + /** + * Set the optional heading-id transform used by the heading-reference + * resolution pass, so parser-computed ids match the renderer's. Set by + * AsciiHeadingIdsExtension to keep ids and `[Heading][]` links in parity. + * + * @param \Closure(string): string|null $transformer + */ + public function setHeadingIdTransformer(?Closure $transformer): void + { + $this->headingIdTransformer = $transformer; + } + /** * Enable or disable significant newlines mode. 
* @@ -706,7 +719,7 @@ protected function extractAbbreviations(array $lines): void */ protected function extractHeadingReferences(array $lines): void { - $headingIdTracker = new HeadingIdTracker(null, $this->asciiHeadingIds); + $headingIdTracker = new HeadingIdTracker($this->headingIdTransformer); $pendingId = null; $count = count($lines); @@ -789,7 +802,7 @@ protected function extractHeadingReferences(array $lines): void */ protected function rewriteHeadingReferences(Document $document): void { - $tracker = new HeadingIdTracker(null, $this->asciiHeadingIds); + $tracker = new HeadingIdTracker($this->headingIdTransformer); $tracker->reserveExplicitIds($document); /** @var array $newUrlByLabel */ diff --git a/src/Renderer/HeadingIdTracker.php b/src/Renderer/HeadingIdTracker.php index 0314667c..5608d4fa 100644 --- a/src/Renderer/HeadingIdTracker.php +++ b/src/Renderer/HeadingIdTracker.php @@ -4,6 +4,7 @@ namespace Djot\Renderer; +use Closure; use Djot\Node\Block\Heading; use Djot\Node\Inline\Code; use Djot\Node\Inline\FootnoteRef; @@ -55,20 +56,25 @@ class HeadingIdTracker */ protected array $resolvedTexts = []; - protected AsciiTransliterator $transliterator; + /** + * @param \Closure(string): string|null $idTransformer Optional transform applied + * to the spec-normalized id (e.g. ASCII transliteration for URL/CSS + * portability). Null (default) leaves the jgm/djot#393 unicode-preserving id + * unchanged. Set via an extension such as AsciiHeadingIdsExtension. + */ + public function __construct(protected ?Closure $idTransformer = null) + { + } /** - * @param \Djot\Renderer\AsciiTransliterator|null $transliterator - * @param bool $asciiHeadingIds When true, transliterate heading text to ASCII - * (Über -> Uber, café -> cafe) before slugging, for maximum URL/CSS - * portability. Off by default: per jgm/djot#393 the identifier preserves - * non-ASCII characters. + * Set the optional id transform (see the constructor). 
Used by extensions to + * adjust generated ids without forking the core spec slugger. + * + * @param \Closure(string): string|null $idTransformer */ - public function __construct( - ?AsciiTransliterator $transliterator = null, - protected bool $asciiHeadingIds = false, - ) { - $this->transliterator = $transliterator ?? new AsciiTransliterator(); + public function setIdTransformer(?Closure $idTransformer): void + { + $this->idTransformer = $idTransformer; } /** @@ -126,47 +132,43 @@ public function reserveExplicitIds(Node $node): void /** * Normalize heading text into an identifier (jgm/djot#393) * - * 1. (Opt-in only, when $asciiHeadingIds) transliterate to ASCII - * (Über → Uber, café → cafe, Привет → Privet) for maximum URL/CSS - * portability. - * 2. Replace each maximal run of non-alphanumeric ASCII with a single '-' - * (covers whitespace, punctuation, '_', and runs of '-'); non-ASCII - * characters and letter case are preserved. - * 3. Trim leading/trailing '-'. - * 4. Prefix with 'h-' if the result starts with a digit, so the id is a - * valid bare CSS selector (querySelector('#9-x') would otherwise throw). - * This is orthogonal to #393, which governs punctuation only. + * 1. Slug the text: replace each maximal run of non-alphanumeric ASCII with a + * single '-' and trim; non-ASCII characters and letter case are preserved. + * 2. If an id transform is set (e.g. ASCII transliteration via + * AsciiHeadingIdsExtension), apply it to the slug and re-slug the result + * (the transform may reintroduce spaces/punctuation, e.g. romanization). + * 3. Prefix with 's-' if the result starts with a digit, so the id is a valid + * bare CSS selector (querySelector('#9-x') would otherwise throw). This is + * orthogonal to #393, which governs punctuation only. * - * Returns '' when nothing usable remains (all-punctuation text, or - in the - * opt-in ASCII mode - a script the transliterator cannot reduce to ASCII); - * the caller then falls back to a generated `s-N` id. 
+ * Returns '' when nothing usable remains (all-punctuation text, or a transform + * that reduces the text to nothing); the caller then falls back to a generated + * `s-N` id. */ public function normalizeId(string $text): string { - $id = $text; + $id = $this->slug($text); - // Opt-in: fold to ASCII first (Über -> Uber) for URL/CSS portability. - if ($this->asciiHeadingIds) { - $id = $this->transliterator->transliterate($id); + if ($this->idTransformer !== null) { + $id = $this->slug(($this->idTransformer)($id)); } - // jgm/djot#393: replace each maximal run of non-alphanumeric ASCII with a - // single '-' and trim leading/trailing '-'. Non-ASCII characters (unicode - // letters/marks) and case are preserved; this also collapses spaces, - // punctuation, '_' and runs of '-'. - $id = preg_replace('/[^0-9A-Za-z\x{0080}-\x{10FFFF}]+/u', '-', $id) ?? $id; - $id = trim($id, '-'); - - // A leading digit is a valid HTML id but an invalid bare CSS selector - // (e.g. querySelector('#9-x') throws), so prefix 'h-'. This is orthogonal - // to #393, which governs punctuation, not the leading-digit case. if ($id !== '' && preg_match('/^\p{N}/u', $id)) { - $id = 'h-' . $id; + $id = 's-' . $id; } return $id; } + /** + * jgm/djot#393 slug step: replace each maximal run of non-alphanumeric ASCII + * with a single '-' and trim. Non-ASCII characters and letter case are kept. + */ + protected function slug(string $text): string + { + return trim(preg_replace('/[^0-9A-Za-z\x{0080}-\x{10FFFF}]+/u', '-', $text) ?? 
$text, '-'); + } + /** * Get plain text content of a node * diff --git a/src/Renderer/HtmlRenderer.php b/src/Renderer/HtmlRenderer.php index ff3e2ed7..b068e3b9 100644 --- a/src/Renderer/HtmlRenderer.php +++ b/src/Renderer/HtmlRenderer.php @@ -97,9 +97,9 @@ class HtmlRenderer implements RendererInterface */ protected const OL_ONLY_ATTRIBUTES = ['start', 'type', 'reversed']; - public function __construct(protected bool $xhtml = false, bool $asciiHeadingIds = false) + public function __construct(protected bool $xhtml = false) { - $this->sharedRenderContext = new RenderContext(asciiHeadingIds: $asciiHeadingIds); + $this->sharedRenderContext = new RenderContext(); $this->initNodeRenderers(); } diff --git a/src/Renderer/RenderContext.php b/src/Renderer/RenderContext.php index f58c3aa2..bde8e2a3 100644 --- a/src/Renderer/RenderContext.php +++ b/src/Renderer/RenderContext.php @@ -44,9 +44,9 @@ class RenderContext */ public array $inlineFootnoteRenderers = []; - public function __construct(?HeadingIdTracker $headingIdTracker = null, bool $asciiHeadingIds = false) + public function __construct(?HeadingIdTracker $headingIdTracker = null) { - $this->headingIdTracker = $headingIdTracker ?? new HeadingIdTracker(null, $asciiHeadingIds); + $this->headingIdTracker = $headingIdTracker ?? new HeadingIdTracker(); } public function reset(): void diff --git a/tests/TestCase/DjotConverterTest.php b/tests/TestCase/DjotConverterTest.php index 6e6619f5..05a34499 100644 --- a/tests/TestCase/DjotConverterTest.php +++ b/tests/TestCase/DjotConverterTest.php @@ -7,6 +7,7 @@ use Djot\DjotConverter; use Djot\Event\RenderEvent; use Djot\Exception\ParseException; +use Djot\Extension\AsciiHeadingIdsExtension; use Djot\Extension\HeadingLevelShiftExtension; use Djot\Extension\TabsExtension; use Djot\Node\Block\Heading; @@ -1923,9 +1924,10 @@ public function testUnicodeInHeading(): void $this->assertStringContainsString('

日本語の見出し

', $result); $this->assertStringContainsString('id="日本語の見出し"', $result); - // The opt-in asciiHeadingIds mode folds the id to ASCII for URL/CSS - // portability. - $ascii = (new DjotConverter(asciiHeadingIds: true))->convert($djot); + // AsciiHeadingIdsExtension folds the id to ASCII for URL/CSS portability. + $asciiConverter = new DjotConverter(); + $asciiConverter->addExtension(new AsciiHeadingIdsExtension()); + $ascii = $asciiConverter->convert($djot); $this->assertMatchesRegularExpression('/
/', $ascii); if (class_exists(Transliterator::class)) { $this->assertStringContainsString('
', $ascii); diff --git a/tests/TestCase/Extension/AsciiHeadingIdsExtensionTest.php b/tests/TestCase/Extension/AsciiHeadingIdsExtensionTest.php new file mode 100644 index 00000000..4e794a13 --- /dev/null +++ b/tests/TestCase/Extension/AsciiHeadingIdsExtensionTest.php @@ -0,0 +1,42 @@ +convert("# über café\n"); + + $this->assertStringContainsString('
', $html); + } + + public function testExtensionFoldsHeadingIdToAscii(): void + { + $converter = new DjotConverter(); + $converter->addExtension(new AsciiHeadingIdsExtension()); + + $html = $converter->convert("# über café\n"); + + $this->assertStringContainsString('
', $html); + } + + public function testExtensionKeepsImplicitReferenceInParity(): void + { + // The folded id must also be used by the `[Heading][]` link target, so the + // anchor still resolves (parser/renderer parity). + $converter = new DjotConverter(); + $converter->addExtension(new AsciiHeadingIdsExtension()); + + $html = $converter->convert("# über café\n\nsee [über café][]\n"); + + $this->assertStringContainsString('
', $html); + $this->assertStringContainsString('href="#uber-cafe"', $html); + } +} diff --git a/tests/TestCase/Renderer/HeadingIdTrackerTest.php b/tests/TestCase/Renderer/HeadingIdTrackerTest.php index 75f8ba13..16f32bb9 100644 --- a/tests/TestCase/Renderer/HeadingIdTrackerTest.php +++ b/tests/TestCase/Renderer/HeadingIdTrackerTest.php @@ -178,8 +178,8 @@ public function testNormalizeId(): void // jgm/djot#393: non-ASCII is preserved (case kept), not transliterated. $this->assertSame('Привет-мир', $this->tracker->normalizeId('Привет мир')); $this->assertSame('', $this->tracker->normalizeId('###')); - $this->assertSame('h-123-Things', $this->tracker->normalizeId('123 Things')); - $this->assertSame('h-1-Introduction', $this->tracker->normalizeId('1. Introduction')); + $this->assertSame('s-123-Things', $this->tracker->normalizeId('123 Things')); + $this->assertSame('s-1-Introduction', $this->tracker->normalizeId('1. Introduction')); } /** @@ -188,7 +188,7 @@ public function testNormalizeId(): void * trimmed. Case and non-ASCII characters (Cyrillic, accented Latin, smart * quotes) are preserved; `_` is replaced (no longer an exception). A * leading-digit result keeps the `h-` prefix for CSS-selector safety - * (orthogonal to #393). ASCII-folding is opt-in (asciiHeadingIds). + * (orthogonal to #393). ASCII-folding is opt-in via AsciiHeadingIdsExtension. 
*/ public function testNormalizeIdSpecAlignmentEdgeCases(): void { @@ -201,17 +201,18 @@ public function testNormalizeIdSpecAlignmentEdgeCases(): void $this->assertSame('Über-uns', $this->tracker->normalizeId('Über uns')); $this->assertSame('café-résumé', $this->tracker->normalizeId('café résumé')); $this->assertSame('Straße', $this->tracker->normalizeId('Straße')); - $this->assertSame('h-2024-recap', $this->tracker->normalizeId('2024 recap')); + $this->assertSame('s-2024-recap', $this->tracker->normalizeId('2024 recap')); $this->assertSame('', $this->tracker->normalizeId('!!!')); } /** - * The opt-in asciiHeadingIds mode transliterates non-ASCII to ASCII before - * slugging, for maximum URL/CSS portability. + * An id transform (e.g. the one set by AsciiHeadingIdsExtension) is applied to + * the spec id; here it transliterates non-ASCII to ASCII for portability. */ public function testAsciiHeadingIdsOptInTransliterates(): void { - $ascii = new HeadingIdTracker(null, asciiHeadingIds: true); + $transliterator = new AsciiTransliterator(); + $ascii = new HeadingIdTracker(static fn (string $id): string => $transliterator->transliterate($id)); $this->assertSame('Privet-mir', $ascii->normalizeId('Привет мир')); $this->assertSame('Uber-uns', $ascii->normalizeId('Über uns')); $this->assertSame('cafe-resume', $ascii->normalizeId('café résumé')); @@ -394,14 +395,15 @@ public function testUnderscoreReplacedInId(): void } /** - * In the opt-in asciiHeadingIds mode, when transliteration removes the - * entire heading text (a script outside the baked map, no ext-intl), the - * heading must fall back to a stable generated `s-N` id. (By default the - * non-ASCII text is preserved instead, per #393, so no fallback occurs.) + * With an ASCII-folding id transform, when transliteration removes the entire + * heading text (a script outside the baked map, no ext-intl), the heading must + * fall back to a stable generated `s-N` id. 
(By default the non-ASCII text is + * preserved instead, per #393, so no fallback occurs.) */ public function testHeadingThatTransliteratesToNothingGetsFallbackId(): void { - $tracker = new HeadingIdTracker(new AsciiTransliterator(useIntl: false), asciiHeadingIds: true); + $transliterator = new AsciiTransliterator(useIntl: false); + $tracker = new HeadingIdTracker(static fn (string $id): string => $transliterator->transliterate($id)); $cjk = new Heading(2); $cjk->appendChild(new Text('日本語の見出し')); From 6ea885523e2912e2981d87531b965bcd749d67c7 Mon Sep 17 00:00:00 2001 From: Mark Scherer Date: Sat, 6 Jun 2026 19:23:17 +0200 Subject: [PATCH 4/6] Fix @var annotation phpcbf mangled into invalid PHPStan syntax --- src/Parser/BlockParser.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Parser/BlockParser.php b/src/Parser/BlockParser.php index aae31b00..6f0e9e83 100644 --- a/src/Parser/BlockParser.php +++ b/src/Parser/BlockParser.php @@ -199,7 +199,7 @@ class BlockParser * HeadingIdTracker so parser-side id computation (heading reference * resolution) matches the rendered ids. Set by AsciiHeadingIdsExtension. * - * @var \Closure(string):|null string|null + * @var \Closure|null */ protected ?Closure $headingIdTransformer = null; From f7338c62975b161921b36b6521a2b5751baa1fbc Mon Sep 17 00:00:00 2001 From: Mark Scherer Date: Sat, 6 Jun 2026 19:33:34 +0200 Subject: [PATCH 5/6] docs: heading-id #393 default + AsciiHeadingIdsExtension Rewrite the heading-id reference section for the jgm/djot#393 default (letter case and non-ASCII preserved, s- prefix for leading digits, s-N empty fallback) and document AsciiHeadingIdsExtension as the opt-in ASCII-folding path, including its parser/renderer parity and the note that registration order does not matter. 
--- docs/extensions/index.md | 27 ++++++++++++ docs/reference/enhancements.md | 77 ++++++++++++++++++---------------- 2 files changed, 69 insertions(+), 35 deletions(-) diff --git a/docs/extensions/index.md b/docs/extensions/index.md index a9786da9..7bacdd19 100644 --- a/docs/extensions/index.md +++ b/docs/extensions/index.md @@ -7,6 +7,7 @@ Extensions provide a clean way to bundle related customizations together. Each e | Extension | Description | |-----------|-------------| | [AdmonitionExtension](#admonitionextension) | Transforms divs into semantic admonition markup with accessibility support | +| [AsciiHeadingIdsExtension](#asciiheadingidsextension) | Folds auto-generated heading ids to ASCII (`Über` → `Uber`) for URL/CSS-fragment portability | | [AutolinkExtension](#autolinkextension) | Auto-links bare URLs and email addresses | | [CodeGroupExtension](#codegroupextension) | Transforms code-group divs into tabbed code block interfaces | | [DefaultAttributesExtension](#defaultattributesextension) | Adds default attributes to elements by type | @@ -47,6 +48,7 @@ Extensions are applied in registration order. Generally, order doesn't matter, b - **AutolinkExtension** should be registered before **ExternalLinksExtension** if you want auto-linked URLs to also get external link attributes - **TableOfContentsExtension** should be registered before **HeadingPermalinksExtension** if you want clean heading text in the TOC (without permalink symbols) +- **AsciiHeadingIdsExtension** has no ordering requirement relative to the heading extensions (TableOfContents, HeadingPermalinks, HeadingReference) — it installs an id transform at registration that is consumed later during parsing/rendering, so the resulting ids are consistent regardless of registration order Extensions are reset per render, so reusing the same `DjotConverter` across multiple `convert()` calls will not carry per-document extension state such as collected TOC entries into the next output. 
@@ -226,6 +228,31 @@ $converter->addExtension(new AdmonitionExtension( )); ``` +## AsciiHeadingIdsExtension + +By default, auto-generated heading ids follow the [jgm/djot#393](https://github.com/jgm/djot/pull/393) rule and **preserve letter case and non-ASCII characters** (`# Über café` → `id="Über-café"`). That is valid HTML5 and resolves in browsers, but the URL fragment is percent-encoded when shared (`#%C3%9Cber-caf%C3%A9`). + +`AsciiHeadingIdsExtension` folds heading ids to ASCII for maximum portability — clean fragments, friendlier to legacy anchor tooling, and trivially safe as bare CSS/JS selectors: + +```php +use Djot\DjotConverter; +use Djot\Extension\AsciiHeadingIdsExtension; + +$converter = new DjotConverter(); +$converter->addExtension(new AsciiHeadingIdsExtension()); + +$converter->convert("# Über café\n"); +//
… (default would be id="Über-café") +``` + +It applies an ASCII transliteration on top of the spec slug and re-slugs the result, so a transform that reintroduces separators (e.g. CJK romanization) still yields a clean id. The transform is wired to **both** the renderer and the parser's `[Heading][]` reference resolution, so section ids and implicit heading-link targets stay in parity. + +`ext-intl` (ICU) is used when available and romanizes scripts the built-in map does not cover (Greek, CJK, Arabic, …); otherwise a baked Unicode→ASCII map is used. Pass `new AsciiHeadingIdsExtension(useIntl: false)` to force the baked map. A heading whose text reduces to nothing under transliteration falls back to a generated `s-N` id. + +Registration order relative to other heading extensions (TableOfContents, HeadingPermalinks, HeadingReference) does not matter: the transform is installed at registration and consumed later, during parsing and rendering. + +See [Heading ID Generation](/reference/enhancements#heading-id-generation) for the default rule and a full comparison table. + ## ExternalLinksExtension Adds `target="_blank"` and `rel="noopener noreferrer"` to external links (http/https URLs). diff --git a/docs/reference/enhancements.md b/docs/reference/enhancements.md index 149dc093..9ff3f0e9 100644 --- a/docs/reference/enhancements.md +++ b/docs/reference/enhancements.md @@ -155,25 +155,22 @@ content is a symbol falls back to a generated `s-N` ID. 
--- -## CSS-Safe Heading IDs +## Heading ID Generation -**Related:** [php-collective/djot-php#92](https://github.com/php-collective/djot-php/pull/92), [jgm/djot#391](https://github.com/jgm/djot/issues/391) +**Related:** [php-collective/djot-php#92](https://github.com/php-collective/djot-php/pull/92), [#224](https://github.com/php-collective/djot-php/pull/224), [jgm/djot#391](https://github.com/jgm/djot/issues/391), [jgm/djot#393](https://github.com/jgm/djot/pull/393) **Status:** Implemented in djot-php -Auto-generated heading IDs are normalized to be valid CSS selectors **and ASCII-only**, so they work with `querySelector()` / HTMX scroll restoration *and* survive being copied around as URL fragments (see [Why ASCII](#why-ascii) below). +Auto-generated heading IDs follow the settled [jgm/djot#393](https://github.com/jgm/djot/pull/393) rule and **preserve letter case and non-ASCII characters**. For ASCII-only anchors, opt in with [`AsciiHeadingIdsExtension`](#ascii-heading-ids-opt-in). -### Normalization Rules +### Normalization Rules (default) -1. **Transliterate to ASCII** — `Über`→`Uber`, `café`→`cafe`, `Привет`→`Privet`, smart quotes/dashes→`'"-` (then replaced) -2. **Strip `#` characters** — Prevents invalid selectors -3. **Trim whitespace** -4. **Whitespace to dashes** — Spaces become single `-` -5. **Invalid characters to dashes** — Anything other than letters, numbers, `-`, `_` becomes `-` -6. **Collapse consecutive dashes** — `foo--bar` becomes `foo-bar` -7. **Trim leading/trailing dashes** -8. **Prefix digits** — IDs starting with a digit get an `h-` prefix (CSS requirement) -9. **Fallback** — Empty results become `heading` (or a generated `s-N` for empty headings) +1. **Replace non-alphanumeric ASCII** — each maximal run of non-alphanumeric ASCII (spaces, punctuation, `_`, runs of `-`) becomes a single `-`. +2. **Trim** leading/trailing `-`. **Letter case and all non-ASCII characters (accented Latin, Cyrillic, CJK, smart quotes, …) are preserved.** +3. 
**Prefix `s-` for a leading digit** — a leading digit is a valid HTML id but an invalid *bare* CSS selector (`querySelector('#9-x')` throws), so it is prefixed. Orthogonal to #393, which governs punctuation only. +4. **Fallback** — an empty result (all-punctuation text) becomes a generated `s-N` id. + +Symbols (`:name:`) and footnote references are excluded from the id text (see [Section ID Excludes Footnote Markers and Symbols](#section-id-excludes-footnote-markers-and-symbols)). ### Examples @@ -181,26 +178,36 @@ Auto-generated heading IDs are normalized to be valid CSS selectors **and ASCII- |---------|--------------| | `# Hello World` | `Hello-World` | | `# Hello World!` | `Hello-World` | -| `# Über uns` | `Uber-uns` | -| `# café résumé` | `cafe-resume` | -| `# Привет мир` | `Privet-mir` | -| `# Bob's Guide` (smart quotes) | `Bob-s-Guide` | +| `# Über uns` | `Über-uns` | +| `# café résumé` | `café-résumé` | +| `# Привет мир` | `Привет-мир` | +| `# under_score` | `under-score` | | `# E=mc^2` | `E-mc-2` | -| `# 123 Numbers First` | `h-123-Numbers-First` | +| `# 123 Numbers First` | `s-123-Numbers-First` | | `# $this->method()` | `this-method` | -| `# ###` | `heading` | +| `# ###` | `s-1` | + +### ASCII heading IDs (opt-in) {#ascii-heading-ids-opt-in} + +Add `AsciiHeadingIdsExtension` to fold ids to ASCII (`Über uns` → `Uber-uns`) for maximum URL/CSS-fragment portability: + +```php +use Djot\Extension\AsciiHeadingIdsExtension; + +$converter->addExtension(new AsciiHeadingIdsExtension()); +``` -### Why ASCII {#why-ascii} +It applies an ASCII transliteration on top of the #393 slug (and re-slugs the result), wired to **both** the renderer and the parser's `[Heading][]` reference resolution so section ids and link targets stay in parity. Registration order relative to other heading extensions does not matter. 
-Heading IDs end up as URL fragments (`…/page#Über-uns`) that get copied into chat, email and other documents, where **auto-linkers re-detect the URL heuristically**. Non-ASCII fragments are routinely: +Unicode ids are valid HTML5 and resolve in browsers (the URL fragment is percent-encoded but functional), so ASCII is a portability choice, not a correctness requirement. You may want it because heading IDs end up as URL fragments (`…/page#Über-uns`) copied into chat, email and other documents, where **auto-linkers re-detect the URL heuristically**. Non-ASCII fragments are routinely: - **truncated** — the link is cut at the first non-ASCII byte (`#Über` → `#`), producing a silent dead link; - **percent-encoded inconsistently** — `’`→`%E2%80%99`, bloating and sometimes breaking the link; - **re-normalized differently** by the receiving app (NFC/NFD), so the pasted fragment no longer matches the page's `id`. -Transliterating to ASCII keeps shared deep links robust. It's a deliberate deviation from both the djot.js reference and the [jgm/djot#393](https://github.com/jgm/djot/pull/393) spec prose (both preserve non-ASCII) — see [Spec Alignment](#spec-alignment). +Transliterating to ASCII keeps such shared deep links robust. This is what the extension opts into; the default (no extension) preserves non-ASCII to match the spec — see [Spec Alignment](#spec-alignment). -### Transliteration engine & determinism +### Transliteration engine & determinism (extension) Two engines produce the ASCII form: @@ -223,23 +230,23 @@ Explicit IDs are used as-is without normalization or transliteration. ### Spec Alignment {#spec-alignment} -The remove-vs-replace question raised in [jgm/djot#391](https://github.com/jgm/djot/issues/391) was settled by [jgm/djot#393](https://github.com/jgm/djot/pull/393), which reworded the spec to: *"replacing each maximal run of non-alphanumeric ASCII characters with `-`, removing any leading or trailing `-`"*. 
#393 changed only the spec **prose**; the djot.js reference implementation is unchanged. +The remove-vs-replace question raised in [jgm/djot#391](https://github.com/jgm/djot/issues/391) was settled by [jgm/djot#393](https://github.com/jgm/djot/pull/393), which reworded the spec to: *"replacing each maximal run of non-alphanumeric ASCII characters with `-`, removing any leading or trailing `-`"*. -djot-php replaces (does not remove) mid-word punctuation — the direction #393 settled on — additionally replaces `' " ; :` so IDs are valid CSS identifiers, and **transliterates non-ASCII to ASCII** so IDs stay link-safe when shared. The last point is a deliberate deviation from *both* djot.js and the #393 prose, justified by the [Why ASCII](#why-ascii) failure mode. +djot-php's **default now matches #393**: it replaces every maximal run of non-alphanumeric ASCII (including `_`, `'`, `"`, `;`, `:`) with `-`, and preserves letter case and all non-ASCII characters. The only additions are orthogonal to #393's punctuation rule: a leading-digit `s-` prefix (CSS-selector safety) and an `s-N` fallback for empty results. ASCII transliteration — the previous always-on behavior — is now opt-in via `AsciiHeadingIdsExtension`. 
-| Aspect | djot.js reference impl | #393 spec prose | djot-php | -|--------|------------------------|-----------------|----------| +| Aspect | djot.js / #393 | djot-php default | with AsciiHeadingIdsExtension | +|--------|----------------|------------------|-------------------------------| | Mid-word punctuation (`A+B=C`) | `A-B-C` | `A-B-C` | `A-B-C` | -| Consecutive punctuation (`foo...bar`) | collapse → `foo-bar` | collapse → `foo-bar` | collapse → `foo-bar` | -| Underscore (`foo_bar`) | keep → `foo_bar` | strip → `foo-bar` | keep → `foo_bar` (CSS-valid, link-safe) | -| Apostrophe / `"` / `;` / `:` | preserve | replace | replace → `-` (CSS-safe) | -| Non-ASCII letters (`Über uns`) | preserve → `Über-uns` | preserve → `Über-uns` | **transliterate → `Uber-uns`** (link-safe) | -| Non-ASCII / smart quotes (`Bob’s`) | preserve → `Bob’s` | preserve → `Bob’s` | **transliterate → `Bob-s`** (link-safe) | -| Leading digit (`2024 recap`) | `2024-recap` | `2024-recap` | prefix → `h-2024-recap` (CSS requires non-digit start) | -| Empty result (`!!!`) | `s-N` family | unspecified | fallback → `heading` | +| Consecutive punctuation (`foo...bar`) | `foo-bar` | `foo-bar` | `foo-bar` | +| Underscore (`foo_bar`) | `foo-bar` | `foo-bar` | `foo-bar` | +| Apostrophe / `"` / `;` / `:` | replace → `-` | replace → `-` | replace → `-` | +| Non-ASCII letters (`Über uns`) | preserve → `Über-uns` | preserve → `Über-uns` | **fold → `Uber-uns`** | +| Smart quotes (`Bob’s`) | preserve → `Bob’s` | preserve → `Bob’s` | **fold → `Bob-s`** | +| Leading digit (`2024 recap`) | `2024-recap` | prefix → `s-2024-recap` | `s-2024-recap` | +| Empty result (`!!!`) | `s-N` family | `s-N` | `s-N` | | Symbols / footnote refs | excluded | excluded | excluded | -The deviations are deliberate: `' " ; :` are not valid in unescaped CSS identifiers, and non-ASCII fragments break when shared (see [Why ASCII](#why-ascii)). 
The leading-digit and empty-result behaviors fill in gaps the spec and reference handle inconsistently. A note proposing the spec clarify the non-ASCII question is tracked against [jgm/djot#391](https://github.com/jgm/djot/issues/391). +The default is spec-faithful; the leading-digit `s-` prefix and `s-N` fallback fill in cases #393 leaves to the implementation. The ASCII-folding column is only active when the extension is registered. --- From 23ef085b8095ae47a1ce24f260b7b5134c510991 Mon Sep 17 00:00:00 2001 From: Mark Scherer Date: Sat, 6 Jun 2026 19:49:08 +0200 Subject: [PATCH 6/6] Restore heading-id transformer Closure signature in a phpcbf-safe form The headingIdTransformer property lost its (string): string signature when phpcbf mangled the single-line annotation (6ea8855) into invalid syntax and the workaround dropped to a bare \Closure|null. Restore the signature with the two-line var + phpstan-var pattern already used by FrontmatterExtension: the plain var stays \Closure|null (phpcbf leaves it untouched) and the parenthesized phpstan-var carries the full (\Closure(string): string)|null. Verified phpcbf no longer mangles it; phpcs and phpstan are clean. --- src/Parser/BlockParser.php | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Parser/BlockParser.php b/src/Parser/BlockParser.php index 6f0e9e83..2e2040cb 100644 --- a/src/Parser/BlockParser.php +++ b/src/Parser/BlockParser.php @@ -200,6 +200,7 @@ class BlockParser * resolution) matches the rendered ids. Set by AsciiHeadingIdsExtension. * * @var \Closure|null + * @phpstan-var (\Closure(string): string)|null */ protected ?Closure $headingIdTransformer = null;