php-collective · dereuromark · May 20, 2026 · May 19, 2026
diff --git a/bin/gen-translit-map.php b/bin/gen-translit-map.php
@@ -0,0 +1,108 @@
+#!/usr/bin/env php
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Regenerates src/Renderer/ascii_translit_map.php from the ICU
+ * "Any-Latin; Latin-ASCII" transliterator.
+ *
+ * The baked map is the deterministic fallback used by AsciiTransliterator
+ * when ICU is not usable. Generating it from ICU guarantees that the output
+ * for every baked range is identical with and without intl. Ranges whose
+ * romanization is context-sensitive (e.g. Greek) are deliberately NOT
+ * baked; see the comment above the main loop. Requires ext-intl to run.
+ *
+ * Exit status: 0 on success, 1 when ext-intl is missing, the transliterator
+ * cannot be created, or the map file cannot be written.
+ *
+ * Usage: php bin/gen-translit-map.php
+ */
+
+if (!class_exists(Transliterator::class)) {
+    fwrite(STDERR, "ext-intl is required to regenerate the map.\n");
+    exit(1);
+}
+
+$tr = Transliterator::create('Any-Latin; Latin-ASCII');
+if ($tr === null) {
+    fwrite(STDERR, "Failed to create ICU transliterator.\n");
+    exit(1);
+}
+
+// Latin-1/Extended, IPA, combining marks, Greek, Cyrillic, Latin Extended
+// Additional, general punctuation, super/subscripts, currency, letterlike.
+$ranges = [
+    [0x00A0, 0x024F], [0x0250, 0x02AF], [0x0300, 0x036F], [0x0370, 0x03FF],
+    [0x0400, 0x04FF], [0x1E00, 0x1EFF], [0x2000, 0x206F], [0x2070, 0x209F],
+    [0x20A0, 0x20BF], [0x2100, 0x214F],
+];
+
+// A per-character map cannot reproduce ICU's context-sensitive rules (e.g.
+// Greek `αυ` → `au` but `υ` alone → `y`). Baking only the context-free
+// subset of such a script is *worse* than excluding it: `Αυγή` would give
+// `Ae` from the map vs `Auge` from ICU. So each range is all-or-nothing —
+// it is baked only if EVERY pure-ASCII code point in it is context-free.
+// Context-sensitive scripts (Greek, …) are excluded wholesale and degrade
+// to the generated `s-N` fallback when ext-intl is absent, exactly like
+// CJK; with ext-intl they are still romanized.
+$map = [];
+foreach ($ranges as [$start, $end]) {
+    $rangeEntries = [];
+    $rangeIsContextFree = true;
+
+    for ($cp = $start; $cp <= $end; $cp++) {
+        $char = IntlChar::chr($cp);
+        if ($char === null) {
+            continue;
+        }
+
+        // Skip code points ICU cannot convert, leaves unchanged, or does not
+        // reduce to pure ASCII: they are neither baked nor count against
+        // the range.
+        $ascii = $tr->transliterate($char);
+        if ($ascii === false || $ascii === $char || preg_match('/^[\x00-\x7F]*$/', $ascii) !== 1) {
+            continue;
+        }
+
+        // Context-free means the standalone result is reproduced when the
+        // character is doubled and when it sits between lower- and
+        // upper-case Latin letters.
+        $contextFree =
+            $tr->transliterate($char . $char) === $ascii . $ascii
+            && $tr->transliterate('a' . $char . 'a') === 'a' . $ascii . 'a'
+            && $tr->transliterate('Z' . $char . 'Z') === 'Z' . $ascii . 'Z';
+        if (!$contextFree) {
+            // One context-sensitive code point disqualifies the whole range.
+            $rangeIsContextFree = false;
+
+            break;
+        }
+
+        $rangeEntries[$char] = $ascii;
+    }
+
+    if ($rangeIsContextFree) {
+        $map += $rangeEntries;
+    }
+}
+ksort($map);
+
+$lines = [];
+foreach ($map as $from => $to) {
+    $lines[] = '    ' . var_export($from, true) . ' => ' . var_export($to, true) . ',';
+}
+
+$header = <<<'PHP'
+<?php
+
+declare(strict_types=1);
+
+/**
+ * Unicode -> ASCII transliteration fallback map.
+ *
+ * Generated from the ICU "Any-Latin; Latin-ASCII" transliterator over the
+ * Latin, IPA, combining-marks, Greek, Cyrillic, Latin-Extended-Additional,
+ * punctuation, super/subscript, currency and letterlike ranges. A range is
+ * baked only when every code point in it transliterates context-free, so
+ * context-sensitive scripts (e.g. Greek) are left out. Used by the
+ * AsciiTransliterator as the fallback when ICU is not usable, so output for
+ * the baked ranges is byte-identical with or without intl. Do not
+ * hand-edit; regenerate with `php bin/gen-translit-map.php`.
+ *
+ * @return array<string, string>
+ */
+return [
+PHP;
+
+$target = dirname(__DIR__) . '/src/Renderer/ascii_translit_map.php';
+
+// file_put_contents() returns false on failure; report it instead of
+// claiming success for a map that was never written.
+$written = file_put_contents($target, $header . "\n" . implode("\n", $lines) . "\n];\n");
+if ($written === false) {
+    fwrite(STDERR, "Failed to write {$target}.\n");
+    exit(1);
+}
+
+echo 'Wrote ' . count($map) . " entries to {$target}\n";
diff --git a/composer.json b/composer.json
@@ -14,6 +14,9 @@
     "require": {
         "php": "^8.2"
     },
+    "suggest": {
+        "ext-intl": "Recommended for heading-ID transliteration: enables ICU romanization of scripts the baked map does not cover (e.g. Greek, CJK, Arabic). Without it, a baked map covers Latin/Cyrillic/punctuation identically and other scripts fall back to generated `s-N` ids."
+    },
     "require-dev": {
         "nikic/php-fuzzer": "^0.0.11",
         "php-collective/code-sniffer": "dev-master",

diff --git a/docs/reference/enhancements.md b/docs/reference/enhancements.md
@@ -160,64 +160,55 @@ content is a symbol falls back to a generated `s-N` ID.
 
 **Status:** Implemented in djot-php
 
-Auto-generated heading IDs are normalized to be valid CSS selectors, ensuring compatibility with `querySelector()`, HTMX scroll restoration, and CSS attribute selectors.
+Auto-generated heading IDs are normalized to be valid CSS selectors **and ASCII-only**, so they work with `querySelector()` / HTMX scroll restoration *and* survive being copied around as URL fragments (see [Why ASCII](#why-ascii) below).
 
 ### Normalization Rules
 
-1. **Strip `#` characters** — Prevents invalid selectors
-2. **Trim whitespace** — Clean leading/trailing spaces
-3. **Whitespace to dashes** — Spaces become single `-`
-4. **Invalid characters to dashes** — Only Unicode letters (`\p{L}`), numbers (`\p{N}`), hyphens, and underscores are preserved
-5. **Collapse consecutive dashes** — `foo--bar` becomes `foo-bar`
-6. **Trim leading/trailing dashes** — `-foo-` becomes `foo`
-7. **Prefix digits** — IDs starting with a number get `h-` prefix (CSS requirement)
-8. **Fallback** — Empty results become `heading`
+1. **Transliterate to ASCII** — `Über`→`Uber`, `café`→`cafe`, `Привет`→`Privet`, smart quotes/dashes→`'"-` (then replaced)
+2. **Strip `#` characters** — Prevents invalid selectors
+3. **Trim whitespace**
+4. **Whitespace to dashes** — Spaces become single `-`
+5. **Invalid characters to dashes** — Anything other than letters, numbers, `-`, `_` becomes `-`
+6. **Collapse consecutive dashes** — `foo--bar` becomes `foo-bar`
+7. **Trim leading/trailing dashes**
+8. **Prefix digits** — IDs starting with a digit get an `h-` prefix (CSS requirement)
+9. **Fallback** — Empty results become `heading` (or a generated `s-N` for empty headings)
 
 ### Examples
 
 | Heading | Generated ID |
 |---------|--------------|
 | `# Hello World` | `Hello-World` |
 | `# Hello World!` | `Hello-World` |
-| `# 日本語の見出し` | `日本語の見出し` |
-| `# Привет мир` | `Привет-мир` |
+| `# Über uns` | `Uber-uns` |
+| `# café résumé` | `cafe-resume` |
+| `# Привет мир` | `Privet-mir` |
+| `# Bob’s Guide` (smart apostrophe) | `Bob-s-Guide` |
 | `# E=mc^2` | `E-mc-2` |
 | `# 123 Numbers First` | `h-123-Numbers-First` |
 | `# $this->method()` | `this-method` |
 | `# ###` | `heading` |
 
-### Unicode Preservation
+### Why ASCII {#why-ascii}
 
-International characters are preserved while special characters are normalized:
+Heading IDs end up as URL fragments (`…/page#Über-uns`) that get copied into chat, email and other documents, where **auto-linkers re-detect the URL heuristically**. Non-ASCII fragments are routinely:
 
-```djot
-# 日本語の見出し
-
-# Cześć świecie
-```
+- **truncated** — the link is cut at the first non-ASCII byte (`#Über` → `#`), producing a silent dead link;
+- **percent-encoded inconsistently** — `’`→`%E2%80%99`, bloating and sometimes breaking the link;
+- **re-normalized differently** by the receiving app (NFC/NFD), so the pasted fragment no longer matches the page's `id`.
 
-**Output:**
-```html
-<h1 id="日本語の見出し">日本語の見出し</h1>
-<h1 id="Cześć-świecie">Cześć świecie</h1>
-```
+Transliterating to ASCII keeps shared deep links robust. It's a deliberate deviation from both the djot.js reference and the [jgm/djot#393](https://github.com/jgm/djot/pull/393) spec prose (both preserve non-ASCII) — see [Spec Alignment](#spec-alignment).
 
-### Why This Matters
+### Transliteration engine & determinism
 
-Without CSS-safe normalization, headings with special characters would break:
+Two engines produce the ASCII form:
 
-```js
-// This would throw SyntaxError with unsafe IDs
-document.querySelector('#E=mc^2');  // Invalid selector
-htmx.scrollToElement('#$this->foo'); // Invalid selector
-```
+- **ICU `Transliterator`** (`Any-Latin; Latin-ASCII`) when `ext-intl` is installed — also romanizes scripts the map doesn't cover (Greek, CJK, Arabic, …);
+- a **baked Unicode→ASCII map** (`src/Renderer/ascii_translit_map.php`) otherwise.
 
-With normalization, these work correctly:
+The baked map is generated *from the same ICU transform*, and the generator bakes a script **only if every code point in it transliterates context-free** (verified standalone, doubled, and between Latin letters). For those scripts — Latin (so all of German, French, Spanish, Polish, Czech, Turkish, Vietnamese, …), Cyrillic, punctuation, smart quotes, dashes, currency — the output is **byte-identical with or without `ext-intl`**, so shared anchors stay stable across environments.
 
-```js
-document.querySelector('#E-mc-2');  // Works
-htmx.scrollToElement('#this-foo');  // Works
-```
+Scripts whose ICU romanization is context-sensitive (e.g. Greek: `αυ`→`au` but `υ`→`y`) are excluded *wholesale* — baking only their context-free letters would produce IDs that disagree with ICU, which is worse than not covering them. Those scripts, plus non-Latin scripts the map never covers (CJK, Arabic, …), behave one way: **with `ext-intl` they are romanized; without it they are dropped and the heading falls back to a generated `s-N` id**. `ext-intl` is therefore *recommended* (a `composer suggest`) but not required; the determinism guarantee above never depends on it.
 
 ### Explicit IDs
 
@@ -227,27 +218,27 @@ You can always override with an explicit ID attribute:
 # My Heading {#custom-id}
 ```
 
-Explicit IDs are used as-is without normalization.
+Explicit IDs are used as-is without normalization or transliteration.
 
-### Spec Alignment
+### Spec Alignment {#spec-alignment}
 
-The remove-vs-replace question raised in [jgm/djot#391](https://github.com/jgm/djot/issues/391) was settled by [jgm/djot#393](https://github.com/jgm/djot/pull/393), which reworded the spec to: *"replacing each maximal run of non-alphanumeric ASCII characters with `-`, removing any leading or trailing `-`"*. Note that #393 only changes the spec **prose** — the djot.js reference implementation is unchanged and (per djot's own changelog policy) remains the authoritative behavior. The new prose is actually broader than djot.js itself: it would also strip `_`, which djot.js keeps.
+The remove-vs-replace question raised in [jgm/djot#391](https://github.com/jgm/djot/issues/391) was settled by [jgm/djot#393](https://github.com/jgm/djot/pull/393), which reworded the spec to: *"replacing each maximal run of non-alphanumeric ASCII characters with `-`, removing any leading or trailing `-`"*. #393 changed only the spec **prose**; the djot.js reference implementation is unchanged.
 
-djot-php replaces (does not remove) mid-word punctuation — the direction #393 settled on — and tracks the djot.js **implementation** where the prose and implementation disagree, deliberately deviating only where required to produce valid CSS identifiers for `querySelector()` consumers.
+djot-php replaces (does not remove) mid-word punctuation — the direction #393 settled on — additionally replaces `' " ; :` so IDs are valid CSS identifiers, and **transliterates non-ASCII to ASCII** so IDs stay link-safe when shared. The last point is a deliberate deviation from *both* djot.js and the #393 prose, justified by the [Why ASCII](#why-ascii) failure mode.
 
 | Aspect | djot.js reference impl | #393 spec prose | djot-php |
 |--------|------------------------|-----------------|----------|
 | Mid-word punctuation (`A+B=C`) | `A-B-C` | `A-B-C` | `A-B-C` |
-| Non-ASCII letters (`Über uns`) | preserve → `Über-uns` | preserve → `Über-uns` | preserve → `Über-uns` |
 | Consecutive punctuation (`foo...bar`) | collapse → `foo-bar` | collapse → `foo-bar` | collapse → `foo-bar` |
-| Underscore (`foo_bar`) | keep → `foo_bar` | strip → `foo-bar` | keep → `foo_bar` (follows impl; CSS-valid) |
-| Apostrophe (`That's all`) | preserve → `That's-all` | replace → `That-s-all` | replace → `That-s-all` (CSS-safe) |
-| Double quote / `;` / `:` | preserve | replace | replace with `-` (CSS-safe) |
+| Underscore (`foo_bar`) | keep → `foo_bar` | strip → `foo-bar` | keep → `foo_bar` (CSS-valid, link-safe) |
+| Apostrophe / `"` / `;` / `:` | preserve | replace | replace → `-` (CSS-safe) |
+| Non-ASCII letters (`Über uns`) | preserve → `Über-uns` | preserve → `Über-uns` | **transliterate → `Uber-uns`** (link-safe) |
+| Non-ASCII / smart quotes (`Bob’s`) | preserve → `Bob’s` | preserve → `Bob’s` | **transliterate → `Bob-s`** (link-safe) |
 | Leading digit (`2024 recap`) | `2024-recap` | `2024-recap` | prefix → `h-2024-recap` (CSS requires non-digit start) |
 | Empty result (`!!!`) | `s-N` family | unspecified | fallback → `heading` |
 | Symbols / footnote refs | excluded | excluded | excluded |
 
-The apostrophe / quote / semicolon / colon deviation is deliberate: these characters are not valid in unescaped CSS identifiers, so preserving them per djot.js would force every JS consumer to round-trip through `CSS.escape()` before doing a selector lookup. The leading-digit and empty-result behaviors fill in gaps that the spec and implementation handle inconsistently.
+The deviations are deliberate: `' " ; :` are not valid in unescaped CSS identifiers, and non-ASCII fragments break when shared (see [Why ASCII](#why-ascii)). The leading-digit and empty-result behaviors fill in gaps the spec and reference handle inconsistently. A note proposing the spec clarify the non-ASCII question is tracked against [jgm/djot#391](https://github.com/jgm/djot/issues/391).
 
 ---
 

diff --git a/src/Renderer/AsciiTransliterator.php b/src/Renderer/AsciiTransliterator.php
@@ -0,0 +1,103 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Djot\Renderer;
+
+use Transliterator;
+
+/**
+ * Transliterates arbitrary Unicode text to ASCII for use in heading IDs.
+ *
+ * Heading IDs become URL fragments that get copied around and re-detected by
+ * auto-linkers in chat clients, mail and other docs. Non-ASCII fragments are
+ * routinely truncated or percent-encoded inconsistently there, producing
+ * broken deep links. Reducing IDs to ASCII keeps shared links robust.
+ *
+ * Two engines:
+ *  - ICU `Transliterator` ("Any-Latin; Latin-ASCII") when ext-intl is
+ *    available — also romanizes scripts the map does not cover (e.g. CJK);
+ *  - a baked Unicode->ASCII map generated *from that same ICU transform*
+ *    (see ascii_translit_map.php) as the fallback.
+ *
+ * Because the fallback is generated from ICU, output for the characters the
+ * map contains is byte-identical with or without intl, so shared anchors
+ * stay stable across environments. Only characters outside the baked map
+ * differ: intl romanizes them, the map path drops them (the caller's
+ * empty-result handling then yields a stable generated id).
+ */
+class AsciiTransliterator
+{
+    /**
+     * Whether creation of the shared ICU transliterator was already attempted.
+     */
+    protected static bool $icuResolved = false;
+
+    /**
+     * Shared ICU transliterator; null when ext-intl is absent or creation failed.
+     */
+    protected static ?Transliterator $icu = null;
+
+    /**
+     * Lazily loaded baked fallback map.
+     *
+     * @var array<string, string>|null
+     */
+    protected static ?array $map = null;
+
+    protected bool $useIntl;
+
+    /**
+     * @param bool|null $useIntl Force the engine; null auto-detects ext-intl.
+     */
+    public function __construct(?bool $useIntl = null)
+    {
+        $this->useIntl = $useIntl ?? class_exists(Transliterator::class);
+    }
+
+    /**
+     * Reduces the given text to ASCII.
+     *
+     * @param string $text Text to transliterate.
+     *
+     * @return string ASCII-only text; may be empty when nothing could be
+     *   romanized.
+     */
+    public function transliterate(string $text): string
+    {
+        if ($text === '') {
+            return '';
+        }
+
+        $icu = $this->useIntl ? static::icu() : null;
+        $converted = $icu !== null ? $icu->transliterate($text) : false;
+        if ($converted === false) {
+            // No usable ICU result (intl absent, Transliterator::create()
+            // returned null on a broken build, or ICU failed on this
+            // input) — use the deterministic baked map rather than
+            // stripping covered characters, so the output matches what an
+            // environment without intl produces for the same text.
+            $converted = strtr($text, static::map());
+        }
+        $text = $converted;
+
+        // Anything still non-ASCII is something neither ICU nor the map
+        // resolved. Turn separators / punctuation / symbols into a space
+        // first so word boundaries (e.g. the ideographic space U+3000 or
+        // comma U+3001 between ASCII words) survive as `-` instead of
+        // merging tokens; then drop the rest (letters of unromanizable
+        // scripts) so the caller falls back to a stable generated id.
+        $text = (string)preg_replace_callback(
+            '/[^\x00-\x7F]+/',
+            static fn (array $m): string => (string)preg_replace('/[\p{Z}\p{P}\p{S}]/u', ' ', $m[0]),
+            $text,
+        );
+
+        return (string)preg_replace('/[^\x00-\x7F]+/', '', $text);
+    }
+
+    /**
+     * Returns the shared ICU transliterator, creating it on first use.
+     *
+     * The creation attempt happens once; a null outcome is remembered as well.
+     */
+    protected static function icu(): ?Transliterator
+    {
+        if (!static::$icuResolved) {
+            static::$icuResolved = true;
+            static::$icu = class_exists(Transliterator::class)
+                ? Transliterator::create('Any-Latin; Latin-ASCII')
+                : null;
+        }
+
+        return static::$icu;
+    }
+
+    /**
+     * Returns the baked fallback map, loading it from disk on first use.
+     *
+     * @return array<string, string>
+     */
+    protected static function map(): array
+    {
+        return static::$map ??= require __DIR__ . '/ascii_translit_map.php';
+    }
+}