diff --git a/bin/gen-translit-map.php b/bin/gen-translit-map.php new file mode 100644 index 00000000..b0becb8e --- /dev/null +++ b/bin/gen-translit-map.php @@ -0,0 +1,108 @@ +#!/usr/bin/env php +transliterate($char); + if ($ascii === false || $ascii === $char || preg_match('/^[\x00-\x7F]*$/', $ascii) !== 1) { + continue; + } + + $contextFree = + $tr->transliterate($char . $char) === $ascii . $ascii + && $tr->transliterate('a' . $char . 'a') === 'a' . $ascii . 'a' + && $tr->transliterate('Z' . $char . 'Z') === 'Z' . $ascii . 'Z'; + if (!$contextFree) { + $rangeIsContextFree = false; + + break; + } + + $rangeEntries[$char] = $ascii; + } + + if ($rangeIsContextFree) { + $map += $rangeEntries; + } +} +ksort($map); + +$lines = []; +foreach ($map as $from => $to) { + $lines[] = ' ' . var_export($from, true) . ' => ' . var_export($to, true) . ','; +} + +$header = <<<'PHP' + ASCII transliteration fallback map. + * + * Generated from the ICU "Any-Latin; Latin-ASCII" transliterator over the + * Latin, IPA, combining-marks, Greek, Cyrillic, Latin-Extended-Additional, + * punctuation, super/subscript, currency and letterlike ranges. Used by + * the AsciiTransliterator only when ext-intl is unavailable, so the common + * (European/Cyrillic/Greek/punctuation) output is byte-identical with or + * without intl. Do not hand-edit; regenerate with `php bin/gen-translit-map.php`. + * + * @return array + */ +return [ +PHP; + +$target = dirname(__DIR__) . '/src/Renderer/ascii_translit_map.php'; +file_put_contents($target, $header . "\n" . implode("\n", $lines) . "\n];\n"); + +echo 'Wrote ' . count($map) . " entries to {$target}\n"; diff --git a/composer.json b/composer.json index 9f26c944..1bb403ae 100644 --- a/composer.json +++ b/composer.json @@ -14,6 +14,9 @@ "require": { "php": "^8.2" }, + "suggest": { + "ext-intl": "Recommended for heading-ID transliteration: enables ICU romanization of non-Latin scripts (e.g. CJK, Arabic). 
Without it, a baked map covers Latin/Cyrillic/punctuation identically; Greek and other scripts fall back to generated `s-N` ids." + }, "require-dev": { "nikic/php-fuzzer": "^0.0.11", "php-collective/code-sniffer": "dev-master", diff --git a/docs/reference/enhancements.md b/docs/reference/enhancements.md index 24c301f2..7d749e83 100644 --- a/docs/reference/enhancements.md +++ b/docs/reference/enhancements.md @@ -160,18 +160,19 @@ content is a symbol falls back to a generated `s-N` ID. **Status:** Implemented in djot-php -Auto-generated heading IDs are normalized to be valid CSS selectors, ensuring compatibility with `querySelector()`, HTMX scroll restoration, and CSS attribute selectors. +Auto-generated heading IDs are normalized to be valid CSS selectors **and ASCII-only**, so they work with `querySelector()` / HTMX scroll restoration *and* survive being copied around as URL fragments (see [Why ASCII](#why-ascii) below). ### Normalization Rules -1. **Strip `#` characters** — Prevents invalid selectors -2. **Trim whitespace** — Clean leading/trailing spaces -3. **Whitespace to dashes** — Spaces become single `-` -4. **Invalid characters to dashes** — Only Unicode letters (`\p{L}`), numbers (`\p{N}`), hyphens, and underscores are preserved -5. **Collapse consecutive dashes** — `foo--bar` becomes `foo-bar` -6. **Trim leading/trailing dashes** — `-foo-` becomes `foo` -7. **Prefix digits** — IDs starting with a number get `h-` prefix (CSS requirement) -8. **Fallback** — Empty results become `heading` +1. **Transliterate to ASCII** — `Über`→`Uber`, `café`→`cafe`, `Привет`→`Privet`, smart quotes/dashes→`'"-` (then replaced) +2. **Strip `#` characters** — Prevents invalid selectors +3. **Trim whitespace** +4. **Whitespace to dashes** — Spaces become single `-` +5. **Invalid characters to dashes** — Anything other than letters, numbers, `-`, `_` becomes `-` +6. **Collapse consecutive dashes** — `foo--bar` becomes `foo-bar` +7. **Trim leading/trailing dashes** +8. 
**Prefix digits** — IDs starting with a digit get an `h-` prefix (CSS requirement) +9. **Fallback** — Empty results (an empty heading, all-punctuation text, or a script that cannot be reduced to ASCII) fall back to a generated `s-N` ID ### Examples @@ -179,45 +180,35 @@ Auto-generated heading IDs are normalized to be valid CSS selectors, ensuring co |---------|--------------| | `# Hello World` | `Hello-World` | | `# Hello World!` | `Hello-World` | -| `# 日本語の見出し` | `日本語の見出し` | -| `# Привет мир` | `Привет-мир` | +| `# Über uns` | `Uber-uns` | +| `# café résumé` | `cafe-resume` | +| `# Привет мир` | `Privet-mir` | +| `# Bob's Guide` (smart quotes) | `Bob-s-Guide` | | `# E=mc^2` | `E-mc-2` | | `# 123 Numbers First` | `h-123-Numbers-First` | | `# $this->method()` | `this-method` | -| `# ###` | `heading` | +| `# ###` | `s-N` (generated fallback) | -### Unicode Preservation +### Why ASCII {#why-ascii} -International characters are preserved while special characters are normalized: +Heading IDs end up as URL fragments (`…/page#Über-uns`) that get copied into chat, email and other documents, where **auto-linkers re-detect the URL heuristically**. Non-ASCII fragments are routinely: -```djot -# 日本語の見出し - -# Cześć świecie -``` +- **truncated** — the link is cut at the first non-ASCII byte (`#Über` → `#`), producing a silent dead link; +- **percent-encoded inconsistently** — `’`→`%E2%80%99`, bloating and sometimes breaking the link; +- **re-normalized differently** by the receiving app (NFC/NFD), so the pasted fragment no longer matches the page's `id`. -**Output:** -```html

日本語の見出し

-

Cześć świecie

-``` +Transliterating to ASCII keeps shared deep links robust. It's a deliberate deviation from both the djot.js reference and the [jgm/djot#393](https://github.com/jgm/djot/pull/393) spec prose (both preserve non-ASCII) — see [Spec Alignment](#spec-alignment). -### Why This Matters +### Transliteration engine & determinism -Without CSS-safe normalization, headings with special characters would break: +Two engines produce the ASCII form: -```js -// This would throw SyntaxError with unsafe IDs -document.querySelector('#E=mc^2'); // Invalid selector -htmx.scrollToElement('#$this->foo'); // Invalid selector -``` +- **ICU `Transliterator`** (`Any-Latin; Latin-ASCII`) when `ext-intl` is installed — also romanizes scripts the map doesn't cover (Greek, CJK, Arabic, …); +- a **baked Unicode→ASCII map** (`src/Renderer/ascii_translit_map.php`) otherwise. -With normalization, these work correctly: +The baked map is generated *from the same ICU transform*, and the generator bakes a script **only if every code point in it transliterates context-free** (verified standalone, doubled, and between Latin letters). For those scripts — Latin (so all of German, French, Spanish, Polish, Czech, Turkish, Vietnamese, …), Cyrillic, punctuation, smart quotes, dashes, currency — the output is **byte-identical with or without `ext-intl`**, so shared anchors stay stable across environments. -```js -document.querySelector('#E-mc-2'); // Works -htmx.scrollToElement('#this-foo'); // Works -``` +Scripts whose ICU romanization is context-sensitive (e.g. Greek: `αυ`→`au` but `υ`→`y`) are excluded *wholesale* — baking only their context-free letters would produce IDs that disagree with ICU, which is worse than not covering them. Those scripts, plus non-Latin scripts the map never covers (CJK, Arabic, …), behave one way: **with `ext-intl` they are romanized; without it they are dropped and the heading falls back to a generated `s-N` id**. 
`ext-intl` is therefore *recommended* (a `composer suggest`) but not required; the determinism guarantee above never depends on it. ### Explicit IDs @@ -227,27 +218,27 @@ You can always override with an explicit ID attribute: # My Heading {#custom-id} ``` -Explicit IDs are used as-is without normalization. +Explicit IDs are used as-is without normalization or transliteration. -### Spec Alignment +### Spec Alignment {#spec-alignment} -The remove-vs-replace question raised in [jgm/djot#391](https://github.com/jgm/djot/issues/391) was settled by [jgm/djot#393](https://github.com/jgm/djot/pull/393), which reworded the spec to: *"replacing each maximal run of non-alphanumeric ASCII characters with `-`, removing any leading or trailing `-`"*. Note that #393 only changes the spec **prose** — the djot.js reference implementation is unchanged and (per djot's own changelog policy) remains the authoritative behavior. The new prose is actually broader than djot.js itself: it would also strip `_`, which djot.js keeps. +The remove-vs-replace question raised in [jgm/djot#391](https://github.com/jgm/djot/issues/391) was settled by [jgm/djot#393](https://github.com/jgm/djot/pull/393), which reworded the spec to: *"replacing each maximal run of non-alphanumeric ASCII characters with `-`, removing any leading or trailing `-`"*. #393 changed only the spec **prose**; the djot.js reference implementation is unchanged. -djot-php replaces (does not remove) mid-word punctuation — the direction #393 settled on — and tracks the djot.js **implementation** where the prose and implementation disagree, deliberately deviating only where required to produce valid CSS identifiers for `querySelector()` consumers. +djot-php replaces (does not remove) mid-word punctuation — the direction #393 settled on — additionally replaces `' " ; :` so IDs are valid CSS identifiers, and **transliterates non-ASCII to ASCII** so IDs stay link-safe when shared. 
The last point is a deliberate deviation from *both* djot.js and the #393 prose, justified by the [Why ASCII](#why-ascii) failure mode. | Aspect | djot.js reference impl | #393 spec prose | djot-php | |--------|------------------------|-----------------|----------| | Mid-word punctuation (`A+B=C`) | `A-B-C` | `A-B-C` | `A-B-C` | -| Non-ASCII letters (`Über uns`) | preserve → `Über-uns` | preserve → `Über-uns` | preserve → `Über-uns` | | Consecutive punctuation (`foo...bar`) | collapse → `foo-bar` | collapse → `foo-bar` | collapse → `foo-bar` | -| Underscore (`foo_bar`) | keep → `foo_bar` | strip → `foo-bar` | keep → `foo_bar` (follows impl; CSS-valid) | -| Apostrophe (`That's all`) | preserve → `That's-all` | replace → `That-s-all` | replace → `That-s-all` (CSS-safe) | -| Double quote / `;` / `:` | preserve | replace | replace with `-` (CSS-safe) | +| Underscore (`foo_bar`) | keep → `foo_bar` | strip → `foo-bar` | keep → `foo_bar` (CSS-valid, link-safe) | +| Apostrophe / `"` / `;` / `:` | preserve | replace | replace → `-` (CSS-safe) | +| Non-ASCII letters (`Über uns`) | preserve → `Über-uns` | preserve → `Über-uns` | **transliterate → `Uber-uns`** (link-safe) | +| Non-ASCII / smart quotes (`Bob’s`) | preserve → `Bob’s` | preserve → `Bob’s` | **transliterate → `Bob-s`** (link-safe) | | Leading digit (`2024 recap`) | `2024-recap` | `2024-recap` | prefix → `h-2024-recap` (CSS requires non-digit start) | -| Empty result (`!!!`) | `s-N` family | unspecified | fallback → `heading` | +| Empty result (`!!!`) | `s-N` family | unspecified | fallback → generated `s-N` | | Symbols / footnote refs | excluded | excluded | excluded | -The apostrophe / quote / semicolon / colon deviation is deliberate: these characters are not valid in unescaped CSS identifiers, so preserving them per djot.js would force every JS consumer to round-trip through `CSS.escape()` before doing a selector lookup. The leading-digit and empty-result behaviors fill in gaps that the spec and implementation handle inconsistently. 
+The deviations are deliberate: `' " ; :` are not valid in unescaped CSS identifiers, and non-ASCII fragments break when shared (see [Why ASCII](#why-ascii)). The leading-digit and empty-result behaviors fill in gaps the spec and reference handle inconsistently. A note proposing the spec clarify the non-ASCII question is tracked against [jgm/djot#391](https://github.com/jgm/djot/issues/391). --- diff --git a/src/Renderer/AsciiTransliterator.php b/src/Renderer/AsciiTransliterator.php new file mode 100644 index 00000000..63512b98 --- /dev/null +++ b/src/Renderer/AsciiTransliterator.php @@ -0,0 +1,103 @@ +ASCII map generated *from that same ICU transform* + * (see ascii_translit_map.php) as the fallback. + * + * Because the fallback is generated from ICU, the common European / Cyrillic + * / Greek / punctuation output is byte-identical with or without intl, so + * shared anchors stay stable across environments. Only scripts outside the + * baked ranges differ: intl romanizes them, the map drops them (the caller's + * empty-result handling then yields a stable generated id). + */ +class AsciiTransliterator +{ + protected static bool $icuResolved = false; + + protected static ?Transliterator $icu = null; + + /** + * @var array|null + */ + protected static ?array $map = null; + + protected bool $useIntl; + + /** + * @param bool|null $useIntl Force the engine; null auto-detects ext-intl. + */ + public function __construct(?bool $useIntl = null) + { + $this->useIntl = $useIntl ?? class_exists(Transliterator::class); + } + + public function transliterate(string $text): string + { + if ($text === '') { + return ''; + } + + $icu = $this->useIntl ? static::icu() : null; + if ($icu !== null) { + $converted = $icu->transliterate($text); + if ($converted !== false) { + $text = $converted; + } + } else { + // No usable ICU (intl absent, or Transliterator::create() + // returned null on a broken build) — use the deterministic + // baked map rather than stripping covered characters. 
+ $text = strtr($text, static::map()); + } + + // Anything still non-ASCII is something neither ICU nor the map + // resolved. Turn separators / punctuation / symbols into a space + // first so word boundaries (e.g. the ideographic space U+3000 or + // comma U+3001 between ASCII words) survive as `-` instead of + // merging tokens; then drop the rest (letters of unromanizable + // scripts) so the caller falls back to a stable generated id. + $text = (string)preg_replace_callback( + '/[^\x00-\x7F]+/', + static fn (array $m): string => (string)preg_replace('/[\p{Z}\p{P}\p{S}]/u', ' ', $m[0]), + $text, + ); + + return (string)preg_replace('/[^\x00-\x7F]+/', '', $text); + } + + protected static function icu(): ?Transliterator + { + if (!static::$icuResolved) { + static::$icuResolved = true; + static::$icu = class_exists(Transliterator::class) + ? Transliterator::create('Any-Latin; Latin-ASCII') + : null; + } + + return static::$icu; + } + + /** + * @return array + */ + protected static function map(): array + { + return static::$map ??= require __DIR__ . '/ascii_translit_map.php'; + } +} diff --git a/src/Renderer/HeadingIdTracker.php b/src/Renderer/HeadingIdTracker.php index b8b95d6e..61bab9fb 100644 --- a/src/Renderer/HeadingIdTracker.php +++ b/src/Renderer/HeadingIdTracker.php @@ -55,6 +55,13 @@ class HeadingIdTracker */ protected array $resolvedTexts = []; + protected AsciiTransliterator $transliterator; + + public function __construct(?AsciiTransliterator $transliterator = null) + { + $this->transliterator = $transliterator ?? new AsciiTransliterator(); + } + /** * Get the unique ID for a heading node * @@ -108,18 +115,25 @@ public function reserveExplicitIds(Node $node): void } /** - * Normalize text into a valid CSS identifier string + * Normalize text into a valid, link-safe CSS identifier string * - * 1. Strip # characters entirely - * 2. Trim whitespace - * 3. Replace whitespace sequences (including Unicode spaces) with single dashes - * 4. 
Replace any remaining characters that are invalid in CSS identifiers - * (anything other than Unicode letters/numbers, hyphens, and underscores) + * 1. Transliterate to ASCII (Über → Uber, café → cafe, Привет → Privet), + * so the ID survives being shared as a URL fragment through + * auto-linkers that truncate or mangle non-ASCII + * 2. Strip # characters entirely + * 3. Trim whitespace + * 4. Replace whitespace sequences with single dashes + * 5. Replace any remaining characters invalid in CSS identifiers + * (anything other than letters, numbers, hyphens, and underscores) * with dashes - * 5. Collapse consecutive dashes and trim leading/trailing dashes - * 6. Prefix with 'h-' if the result starts with a digit, ensuring a valid + * 6. Collapse consecutive dashes and trim leading/trailing dashes + * 7. Prefix with 'h-' if the result starts with a digit, ensuring a valid * CSS ident start (digits are not allowed as the first character) * + * Returns '' when nothing usable remains (e.g. all-punctuation text, or a + * script the transliterator cannot reduce to ASCII); the caller then + * falls back to a generated `s-N` id. + * * Producing a valid CSS identifier ensures that consumers such as HTMX, * which call `querySelector` with the section ID for scroll-restoration, * do not throw a SyntaxError when headings contain inline code or special @@ -127,7 +141,8 @@ public function reserveExplicitIds(Node $node): void */ public function normalizeId(string $text): string { - $id = str_replace('#', '', $text); + $id = $this->transliterator->transliterate($text); + $id = str_replace('#', '', $id); $id = trim($id); $id = preg_replace('/\s+/u', '-', $id) ?? $id; $id = preg_replace('/[^\p{L}\p{N}_-]+/u', '-', $id) ?? $id; @@ -138,7 +153,7 @@ public function normalizeId(string $text): string $id = 'h-' . $id; } - return $id !== '' ? 
$id : 'heading'; + return $id; } /** @@ -236,14 +251,17 @@ protected function generateId(Heading $node): string // excluding non-textual elements such as symbols and footnote // references (jgm/djot#393). $idText = $this->extractPlainText($node, forId: true); + $baseId = $this->normalizeId($idText); - if ($idText === '') { - // Generate fallback ID, skipping any `s-N` already reserved - // (by `reserveExplicitIds`, an explicit `{#s-N}` heading, or a - // prior normalized heading), so the fallback never produces a - // duplicate. Both the renderer and the implicit-reference - // pass run `reserveExplicitIds` first, so this stays - // deterministic across passes. + if ($baseId === '') { + // No usable content: empty heading, all-punctuation text, or a + // script the transliterator could not reduce to ASCII. Fall + // back to a generated `s-N` id, skipping any `s-N` already + // reserved (by `reserveExplicitIds`, an explicit `{#s-N}`, or + // a prior heading) so the fallback never produces a duplicate. + // Parser/render parity is preserved by `BlockParser`'s + // post-parse rewrite (#184) which re-targets implicit refs to + // the renderer-visible deduped id. do { $this->sectionCounter++; $fallback = 's-' . $this->sectionCounter; @@ -254,8 +272,6 @@ protected function generateId(Heading $node): string return $fallback; } - $baseId = $this->normalizeId($idText); - // Track and deduplicate if (!isset($this->usedIds[$baseId])) { $this->usedIds[$baseId] = 0; diff --git a/src/Renderer/ascii_translit_map.php b/src/Renderer/ascii_translit_map.php new file mode 100644 index 00000000..7d2744ea --- /dev/null +++ b/src/Renderer/ascii_translit_map.php @@ -0,0 +1,921 @@ + ASCII transliteration fallback map. + * + * Generated from the ICU "Any-Latin; Latin-ASCII" transliterator over the + * Latin, IPA, combining-marks, Greek, Cyrillic, Latin-Extended-Additional, + * punctuation, super/subscript, currency and letterlike ranges. 
Used by + * the AsciiTransliterator only when ext-intl is unavailable, so the common + * (European/Cyrillic/Greek/punctuation) output is byte-identical with or + * without intl. Do not hand-edit; regenerate with `php bin/gen-translit-map.php`. + * + * @return array + */ +return [ + ' ' => ' ', + '¡' => '!', + '©' => '(C)', + '«' => '<<', + '­' => '-', + '®' => '(R)', + '±' => '+/-', + '»' => '>>', + '¼' => ' 1/4', + '½' => ' 1/2', + '¾' => ' 3/4', + '¿' => '?', + 'À' => 'A', + 'Á' => 'A', + 'Â' => 'A', + 'Ã' => 'A', + 'Ä' => 'A', + 'Å' => 'A', + 'Æ' => 'AE', + 'Ç' => 'C', + 'È' => 'E', + 'É' => 'E', + 'Ê' => 'E', + 'Ë' => 'E', + 'Ì' => 'I', + 'Í' => 'I', + 'Î' => 'I', + 'Ï' => 'I', + 'Ð' => 'D', + 'Ñ' => 'N', + 'Ò' => 'O', + 'Ó' => 'O', + 'Ô' => 'O', + 'Õ' => 'O', + 'Ö' => 'O', + '×' => '*', + 'Ø' => 'O', + 'Ù' => 'U', + 'Ú' => 'U', + 'Û' => 'U', + 'Ü' => 'U', + 'Ý' => 'Y', + 'Þ' => 'TH', + 'ß' => 'ss', + 'à' => 'a', + 'á' => 'a', + 'â' => 'a', + 'ã' => 'a', + 'ä' => 'a', + 'å' => 'a', + 'æ' => 'ae', + 'ç' => 'c', + 'è' => 'e', + 'é' => 'e', + 'ê' => 'e', + 'ë' => 'e', + 'ì' => 'i', + 'í' => 'i', + 'î' => 'i', + 'ï' => 'i', + 'ð' => 'd', + 'ñ' => 'n', + 'ò' => 'o', + 'ó' => 'o', + 'ô' => 'o', + 'õ' => 'o', + 'ö' => 'o', + '÷' => '/', + 'ø' => 'o', + 'ù' => 'u', + 'ú' => 'u', + 'û' => 'u', + 'ü' => 'u', + 'ý' => 'y', + 'þ' => 'th', + 'ÿ' => 'y', + 'Ā' => 'A', + 'ā' => 'a', + 'Ă' => 'A', + 'ă' => 'a', + 'Ą' => 'A', + 'ą' => 'a', + 'Ć' => 'C', + 'ć' => 'c', + 'Ĉ' => 'C', + 'ĉ' => 'c', + 'Ċ' => 'C', + 'ċ' => 'c', + 'Č' => 'C', + 'č' => 'c', + 'Ď' => 'D', + 'ď' => 'd', + 'Đ' => 'D', + 'đ' => 'd', + 'Ē' => 'E', + 'ē' => 'e', + 'Ĕ' => 'E', + 'ĕ' => 'e', + 'Ė' => 'E', + 'ė' => 'e', + 'Ę' => 'E', + 'ę' => 'e', + 'Ě' => 'E', + 'ě' => 'e', + 'Ĝ' => 'G', + 'ĝ' => 'g', + 'Ğ' => 'G', + 'ğ' => 'g', + 'Ġ' => 'G', + 'ġ' => 'g', + 'Ģ' => 'G', + 'ģ' => 'g', + 'Ĥ' => 'H', + 'ĥ' => 'h', + 'Ħ' => 'H', + 'ħ' => 'h', + 'Ĩ' => 'I', + 'ĩ' => 'i', + 'Ī' => 'I', + 'ī' => 'i', + 'Ĭ' => 'I', + 
'ĭ' => 'i', + 'Į' => 'I', + 'į' => 'i', + 'İ' => 'I', + 'ı' => 'i', + 'IJ' => 'IJ', + 'ij' => 'ij', + 'Ĵ' => 'J', + 'ĵ' => 'j', + 'Ķ' => 'K', + 'ķ' => 'k', + 'ĸ' => 'q', + 'Ĺ' => 'L', + 'ĺ' => 'l', + 'Ļ' => 'L', + 'ļ' => 'l', + 'Ľ' => 'L', + 'ľ' => 'l', + 'Ŀ' => 'L', + 'ŀ' => 'l', + 'Ł' => 'L', + 'ł' => 'l', + 'Ń' => 'N', + 'ń' => 'n', + 'Ņ' => 'N', + 'ņ' => 'n', + 'Ň' => 'N', + 'ň' => 'n', + 'ʼn' => '\'n', + 'Ŋ' => 'N', + 'ŋ' => 'n', + 'Ō' => 'O', + 'ō' => 'o', + 'Ŏ' => 'O', + 'ŏ' => 'o', + 'Ő' => 'O', + 'ő' => 'o', + 'Œ' => 'OE', + 'œ' => 'oe', + 'Ŕ' => 'R', + 'ŕ' => 'r', + 'Ŗ' => 'R', + 'ŗ' => 'r', + 'Ř' => 'R', + 'ř' => 'r', + 'Ś' => 'S', + 'ś' => 's', + 'Ŝ' => 'S', + 'ŝ' => 's', + 'Ş' => 'S', + 'ş' => 's', + 'Š' => 'S', + 'š' => 's', + 'Ţ' => 'T', + 'ţ' => 't', + 'Ť' => 'T', + 'ť' => 't', + 'Ŧ' => 'T', + 'ŧ' => 't', + 'Ũ' => 'U', + 'ũ' => 'u', + 'Ū' => 'U', + 'ū' => 'u', + 'Ŭ' => 'U', + 'ŭ' => 'u', + 'Ů' => 'U', + 'ů' => 'u', + 'Ű' => 'U', + 'ű' => 'u', + 'Ų' => 'U', + 'ų' => 'u', + 'Ŵ' => 'W', + 'ŵ' => 'w', + 'Ŷ' => 'Y', + 'ŷ' => 'y', + 'Ÿ' => 'Y', + 'Ź' => 'Z', + 'ź' => 'z', + 'Ż' => 'Z', + 'ż' => 'z', + 'Ž' => 'Z', + 'ž' => 'z', + 'ſ' => 's', + 'ƀ' => 'b', + 'Ɓ' => 'B', + 'Ƃ' => 'B', + 'ƃ' => 'b', + 'Ƈ' => 'C', + 'ƈ' => 'c', + 'Ɖ' => 'D', + 'Ɗ' => 'D', + 'Ƌ' => 'D', + 'ƌ' => 'd', + 'Ɛ' => 'E', + 'Ƒ' => 'F', + 'ƒ' => 'f', + 'Ɠ' => 'G', + 'ƕ' => 'hv', + 'Ɩ' => 'I', + 'Ɨ' => 'I', + 'Ƙ' => 'K', + 'ƙ' => 'k', + 'ƚ' => 'l', + 'Ɲ' => 'N', + 'ƞ' => 'n', + 'Ơ' => 'O', + 'ơ' => 'o', + 'Ƣ' => 'OI', + 'ƣ' => 'oi', + 'Ƥ' => 'P', + 'ƥ' => 'p', + 'ƫ' => 't', + 'Ƭ' => 'T', + 'ƭ' => 't', + 'Ʈ' => 'T', + 'Ư' => 'U', + 'ư' => 'u', + 'Ʋ' => 'V', + 'Ƴ' => 'Y', + 'ƴ' => 'y', + 'Ƶ' => 'Z', + 'ƶ' => 'z', + 'DŽ' => 'DZ', + 'Dž' => 'Dz', + 'dž' => 'dz', + 'LJ' => 'LJ', + 'Lj' => 'Lj', + 'lj' => 'lj', + 'NJ' => 'NJ', + 'Nj' => 'Nj', + 'nj' => 'nj', + 'Ǎ' => 'A', + 'ǎ' => 'a', + 'Ǐ' => 'I', + 'ǐ' => 'i', + 'Ǒ' => 'O', + 'ǒ' => 'o', + 'Ǔ' => 'U', + 'ǔ' => 'u', + 'Ǖ' => 'U', + 'ǖ' => 
'u', + 'Ǘ' => 'U', + 'ǘ' => 'u', + 'Ǚ' => 'U', + 'ǚ' => 'u', + 'Ǜ' => 'U', + 'ǜ' => 'u', + 'Ǟ' => 'A', + 'ǟ' => 'a', + 'Ǡ' => 'A', + 'ǡ' => 'a', + 'Ǣ' => 'AE', + 'ǣ' => 'ae', + 'Ǥ' => 'G', + 'ǥ' => 'g', + 'Ǧ' => 'G', + 'ǧ' => 'g', + 'Ǩ' => 'K', + 'ǩ' => 'k', + 'Ǫ' => 'O', + 'ǫ' => 'o', + 'Ǭ' => 'O', + 'ǭ' => 'o', + 'ǰ' => 'j', + 'DZ' => 'DZ', + 'Dz' => 'Dz', + 'dz' => 'dz', + 'Ǵ' => 'G', + 'ǵ' => 'g', + 'Ǹ' => 'N', + 'ǹ' => 'n', + 'Ǻ' => 'A', + 'ǻ' => 'a', + 'Ǽ' => 'AE', + 'ǽ' => 'ae', + 'Ǿ' => 'O', + 'ǿ' => 'o', + 'Ȁ' => 'A', + 'ȁ' => 'a', + 'Ȃ' => 'A', + 'ȃ' => 'a', + 'Ȅ' => 'E', + 'ȅ' => 'e', + 'Ȇ' => 'E', + 'ȇ' => 'e', + 'Ȉ' => 'I', + 'ȉ' => 'i', + 'Ȋ' => 'I', + 'ȋ' => 'i', + 'Ȍ' => 'O', + 'ȍ' => 'o', + 'Ȏ' => 'O', + 'ȏ' => 'o', + 'Ȑ' => 'R', + 'ȑ' => 'r', + 'Ȓ' => 'R', + 'ȓ' => 'r', + 'Ȕ' => 'U', + 'ȕ' => 'u', + 'Ȗ' => 'U', + 'ȗ' => 'u', + 'Ș' => 'S', + 'ș' => 's', + 'Ț' => 'T', + 'ț' => 't', + 'Ȟ' => 'H', + 'ȟ' => 'h', + 'ȡ' => 'd', + 'Ȥ' => 'Z', + 'ȥ' => 'z', + 'Ȧ' => 'A', + 'ȧ' => 'a', + 'Ȩ' => 'E', + 'ȩ' => 'e', + 'Ȫ' => 'O', + 'ȫ' => 'o', + 'Ȭ' => 'O', + 'ȭ' => 'o', + 'Ȯ' => 'O', + 'ȯ' => 'o', + 'Ȱ' => 'O', + 'ȱ' => 'o', + 'Ȳ' => 'Y', + 'ȳ' => 'y', + 'ȴ' => 'l', + 'ȵ' => 'n', + 'ȶ' => 't', + 'ȷ' => 'j', + 'ȸ' => 'db', + 'ȹ' => 'qp', + 'Ⱥ' => 'A', + 'Ȼ' => 'C', + 'ȼ' => 'c', + 'Ƚ' => 'L', + 'Ⱦ' => 'T', + 'ȿ' => 's', + 'ɀ' => 'z', + 'Ƀ' => 'B', + 'Ʉ' => 'U', + 'Ɇ' => 'E', + 'ɇ' => 'e', + 'Ɉ' => 'J', + 'ɉ' => 'j', + 'Ɍ' => 'R', + 'ɍ' => 'r', + 'Ɏ' => 'Y', + 'ɏ' => 'y', + 'ɓ' => 'b', + 'ɕ' => 'c', + 'ɖ' => 'd', + 'ɗ' => 'd', + 'ɛ' => 'e', + 'ɟ' => 'j', + 'ɠ' => 'g', + 'ɡ' => 'g', + 'ɢ' => 'G', + 'ɦ' => 'h', + 'ɧ' => 'h', + 'ɨ' => 'i', + 'ɪ' => 'I', + 'ɫ' => 'l', + 'ɬ' => 'l', + 'ɭ' => 'l', + 'ɱ' => 'm', + 'ɲ' => 'n', + 'ɳ' => 'n', + 'ɴ' => 'N', + 'ɶ' => 'OE', + 'ɼ' => 'r', + 'ɽ' => 'r', + 'ɾ' => 'r', + 'ʀ' => 'R', + 'ʂ' => 's', + 'ʈ' => 't', + 'ʉ' => 'u', + 'ʋ' => 'v', + 'ʏ' => 'Y', + 'ʐ' => 'z', + 'ʑ' => 'z', + 'ʙ' => 'B', + 'ʛ' => 'G', + 'ʜ' => 'H', + 'ʝ' 
=> 'j', + 'ʟ' => 'L', + 'ʠ' => 'q', + 'ʣ' => 'dz', + 'ʥ' => 'dz', + 'ʦ' => 'ts', + 'ʪ' => 'ls', + 'ʫ' => 'lz', + 'Ѐ' => 'E', + 'Ё' => 'E', + 'Ђ' => 'D', + 'Ѓ' => 'G', + 'Є' => 'E', + 'Ѕ' => 'Z', + 'І' => 'I', + 'Ї' => 'I', + 'Ј' => 'J', + 'Љ' => 'L', + 'Њ' => 'N', + 'Ћ' => 'C', + 'Ќ' => 'K', + 'Ѝ' => 'I', + 'Ў' => 'U', + 'Џ' => 'D', + 'А' => 'A', + 'Б' => 'B', + 'В' => 'V', + 'Г' => 'G', + 'Д' => 'D', + 'Е' => 'E', + 'Ж' => 'Z', + 'З' => 'Z', + 'И' => 'I', + 'Й' => 'J', + 'К' => 'K', + 'Л' => 'L', + 'М' => 'M', + 'Н' => 'N', + 'О' => 'O', + 'П' => 'P', + 'Р' => 'R', + 'С' => 'S', + 'Т' => 'T', + 'У' => 'U', + 'Ф' => 'F', + 'Х' => 'H', + 'Ц' => 'C', + 'Ч' => 'C', + 'Ш' => 'S', + 'Щ' => 'S', + 'Ы' => 'Y', + 'Э' => 'E', + 'Ю' => 'U', + 'Я' => 'A', + 'а' => 'a', + 'б' => 'b', + 'в' => 'v', + 'г' => 'g', + 'д' => 'd', + 'е' => 'e', + 'ж' => 'z', + 'з' => 'z', + 'и' => 'i', + 'й' => 'j', + 'к' => 'k', + 'л' => 'l', + 'м' => 'm', + 'н' => 'n', + 'о' => 'o', + 'п' => 'p', + 'р' => 'r', + 'с' => 's', + 'т' => 't', + 'у' => 'u', + 'ф' => 'f', + 'х' => 'h', + 'ц' => 'c', + 'ч' => 'c', + 'ш' => 's', + 'щ' => 's', + 'ъ' => '"', + 'ы' => 'y', + 'ь' => '\'', + 'э' => 'e', + 'ю' => 'u', + 'я' => 'a', + 'ѐ' => 'e', + 'ё' => 'e', + 'ђ' => 'd', + 'ѓ' => 'g', + 'є' => 'e', + 'ѕ' => 'z', + 'і' => 'i', + 'ї' => 'i', + 'ј' => 'j', + 'љ' => 'l', + 'њ' => 'n', + 'ћ' => 'c', + 'ќ' => 'k', + 'ѝ' => 'i', + 'ў' => 'u', + 'џ' => 'd', + 'Ґ' => 'G', + 'ґ' => 'g', + 'Ғ' => 'G', + 'ғ' => 'g', + 'Ҕ' => 'G', + 'ҕ' => 'g', + 'Ҙ' => 'Z', + 'ҙ' => 'z', + 'Ң' => 'N', + 'ң' => 'n', + 'Ү' => 'U', + 'ү' => 'u', + 'Ұ' => 'U', + 'ұ' => 'u', + 'Һ' => 'H', + 'һ' => 'h', + 'Ӂ' => 'Z', + 'ӂ' => 'z', + 'Ӑ' => 'A', + 'ӑ' => 'a', + 'Ӓ' => 'A', + 'ӓ' => 'a', + 'Ӕ' => 'AE', + 'ӕ' => 'ae', + 'Ӗ' => 'E', + 'ӗ' => 'e', + 'Ӝ' => 'Z', + 'ӝ' => 'z', + 'Ӟ' => 'Z', + 'ӟ' => 'z', + 'Ӣ' => 'I', + 'ӣ' => 'i', + 'Ӥ' => 'I', + 'ӥ' => 'i', + 'Ӧ' => 'O', + 'ӧ' => 'o', + 'Ө' => 'O', + 'ө' => 'o', + 'Ӭ' => 'E', + 'ӭ' => 'e', + 'Ӯ' => 
'U', + 'ӯ' => 'u', + 'Ӱ' => 'U', + 'ӱ' => 'u', + 'Ӳ' => 'U', + 'ӳ' => 'u', + 'Ӵ' => 'C', + 'ӵ' => 'c', + 'Ӹ' => 'Y', + 'ӹ' => 'y', + 'Ḁ' => 'A', + 'ḁ' => 'a', + 'Ḃ' => 'B', + 'ḃ' => 'b', + 'Ḅ' => 'B', + 'ḅ' => 'b', + 'Ḇ' => 'B', + 'ḇ' => 'b', + 'Ḉ' => 'C', + 'ḉ' => 'c', + 'Ḋ' => 'D', + 'ḋ' => 'd', + 'Ḍ' => 'D', + 'ḍ' => 'd', + 'Ḏ' => 'D', + 'ḏ' => 'd', + 'Ḑ' => 'D', + 'ḑ' => 'd', + 'Ḓ' => 'D', + 'ḓ' => 'd', + 'Ḕ' => 'E', + 'ḕ' => 'e', + 'Ḗ' => 'E', + 'ḗ' => 'e', + 'Ḙ' => 'E', + 'ḙ' => 'e', + 'Ḛ' => 'E', + 'ḛ' => 'e', + 'Ḝ' => 'E', + 'ḝ' => 'e', + 'Ḟ' => 'F', + 'ḟ' => 'f', + 'Ḡ' => 'G', + 'ḡ' => 'g', + 'Ḣ' => 'H', + 'ḣ' => 'h', + 'Ḥ' => 'H', + 'ḥ' => 'h', + 'Ḧ' => 'H', + 'ḧ' => 'h', + 'Ḩ' => 'H', + 'ḩ' => 'h', + 'Ḫ' => 'H', + 'ḫ' => 'h', + 'Ḭ' => 'I', + 'ḭ' => 'i', + 'Ḯ' => 'I', + 'ḯ' => 'i', + 'Ḱ' => 'K', + 'ḱ' => 'k', + 'Ḳ' => 'K', + 'ḳ' => 'k', + 'Ḵ' => 'K', + 'ḵ' => 'k', + 'Ḷ' => 'L', + 'ḷ' => 'l', + 'Ḹ' => 'L', + 'ḹ' => 'l', + 'Ḻ' => 'L', + 'ḻ' => 'l', + 'Ḽ' => 'L', + 'ḽ' => 'l', + 'Ḿ' => 'M', + 'ḿ' => 'm', + 'Ṁ' => 'M', + 'ṁ' => 'm', + 'Ṃ' => 'M', + 'ṃ' => 'm', + 'Ṅ' => 'N', + 'ṅ' => 'n', + 'Ṇ' => 'N', + 'ṇ' => 'n', + 'Ṉ' => 'N', + 'ṉ' => 'n', + 'Ṋ' => 'N', + 'ṋ' => 'n', + 'Ṍ' => 'O', + 'ṍ' => 'o', + 'Ṏ' => 'O', + 'ṏ' => 'o', + 'Ṑ' => 'O', + 'ṑ' => 'o', + 'Ṓ' => 'O', + 'ṓ' => 'o', + 'Ṕ' => 'P', + 'ṕ' => 'p', + 'Ṗ' => 'P', + 'ṗ' => 'p', + 'Ṙ' => 'R', + 'ṙ' => 'r', + 'Ṛ' => 'R', + 'ṛ' => 'r', + 'Ṝ' => 'R', + 'ṝ' => 'r', + 'Ṟ' => 'R', + 'ṟ' => 'r', + 'Ṡ' => 'S', + 'ṡ' => 's', + 'Ṣ' => 'S', + 'ṣ' => 's', + 'Ṥ' => 'S', + 'ṥ' => 's', + 'Ṧ' => 'S', + 'ṧ' => 's', + 'Ṩ' => 'S', + 'ṩ' => 's', + 'Ṫ' => 'T', + 'ṫ' => 't', + 'Ṭ' => 'T', + 'ṭ' => 't', + 'Ṯ' => 'T', + 'ṯ' => 't', + 'Ṱ' => 'T', + 'ṱ' => 't', + 'Ṳ' => 'U', + 'ṳ' => 'u', + 'Ṵ' => 'U', + 'ṵ' => 'u', + 'Ṷ' => 'U', + 'ṷ' => 'u', + 'Ṹ' => 'U', + 'ṹ' => 'u', + 'Ṻ' => 'U', + 'ṻ' => 'u', + 'Ṽ' => 'V', + 'ṽ' => 'v', + 'Ṿ' => 'V', + 'ṿ' => 'v', + 'Ẁ' => 'W', + 'ẁ' => 'w', + 'Ẃ' => 'W', + 'ẃ' => 'w', + 'Ẅ' => 'W', + 'ẅ' 
=> 'w', + 'Ẇ' => 'W', + 'ẇ' => 'w', + 'Ẉ' => 'W', + 'ẉ' => 'w', + 'Ẋ' => 'X', + 'ẋ' => 'x', + 'Ẍ' => 'X', + 'ẍ' => 'x', + 'Ẏ' => 'Y', + 'ẏ' => 'y', + 'Ẑ' => 'Z', + 'ẑ' => 'z', + 'Ẓ' => 'Z', + 'ẓ' => 'z', + 'Ẕ' => 'Z', + 'ẕ' => 'z', + 'ẖ' => 'h', + 'ẗ' => 't', + 'ẘ' => 'w', + 'ẙ' => 'y', + 'ẚ' => 'a', + 'ẛ' => 's', + 'ẜ' => 's', + 'ẝ' => 's', + 'ẞ' => 'SS', + 'Ạ' => 'A', + 'ạ' => 'a', + 'Ả' => 'A', + 'ả' => 'a', + 'Ấ' => 'A', + 'ấ' => 'a', + 'Ầ' => 'A', + 'ầ' => 'a', + 'Ẩ' => 'A', + 'ẩ' => 'a', + 'Ẫ' => 'A', + 'ẫ' => 'a', + 'Ậ' => 'A', + 'ậ' => 'a', + 'Ắ' => 'A', + 'ắ' => 'a', + 'Ằ' => 'A', + 'ằ' => 'a', + 'Ẳ' => 'A', + 'ẳ' => 'a', + 'Ẵ' => 'A', + 'ẵ' => 'a', + 'Ặ' => 'A', + 'ặ' => 'a', + 'Ẹ' => 'E', + 'ẹ' => 'e', + 'Ẻ' => 'E', + 'ẻ' => 'e', + 'Ẽ' => 'E', + 'ẽ' => 'e', + 'Ế' => 'E', + 'ế' => 'e', + 'Ề' => 'E', + 'ề' => 'e', + 'Ể' => 'E', + 'ể' => 'e', + 'Ễ' => 'E', + 'ễ' => 'e', + 'Ệ' => 'E', + 'ệ' => 'e', + 'Ỉ' => 'I', + 'ỉ' => 'i', + 'Ị' => 'I', + 'ị' => 'i', + 'Ọ' => 'O', + 'ọ' => 'o', + 'Ỏ' => 'O', + 'ỏ' => 'o', + 'Ố' => 'O', + 'ố' => 'o', + 'Ồ' => 'O', + 'ồ' => 'o', + 'Ổ' => 'O', + 'ổ' => 'o', + 'Ỗ' => 'O', + 'ỗ' => 'o', + 'Ộ' => 'O', + 'ộ' => 'o', + 'Ớ' => 'O', + 'ớ' => 'o', + 'Ờ' => 'O', + 'ờ' => 'o', + 'Ở' => 'O', + 'ở' => 'o', + 'Ỡ' => 'O', + 'ỡ' => 'o', + 'Ợ' => 'O', + 'ợ' => 'o', + 'Ụ' => 'U', + 'ụ' => 'u', + 'Ủ' => 'U', + 'ủ' => 'u', + 'Ứ' => 'U', + 'ứ' => 'u', + 'Ừ' => 'U', + 'ừ' => 'u', + 'Ử' => 'U', + 'ử' => 'u', + 'Ữ' => 'U', + 'ữ' => 'u', + 'Ự' => 'U', + 'ự' => 'u', + 'Ỳ' => 'Y', + 'ỳ' => 'y', + 'Ỵ' => 'Y', + 'ỵ' => 'y', + 'Ỷ' => 'Y', + 'ỷ' => 'y', + 'Ỹ' => 'Y', + 'ỹ' => 'y', + 'Ỻ' => 'LL', + 'ỻ' => 'll', + 'Ỽ' => 'V', + 'ỽ' => 'v', + 'Ỿ' => 'Y', + 'ỿ' => 'y', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + ' ' => ' ', + '‐' => '-', + '‑' => '-', + '‒' => '-', + '–' => '-', + '—' => '-', + '―' => '-', + '‖' => '||', + '‘' => '\'', + '’' => '\'', + '‚' => 
',', + '‛' => '\'', + '“' => '"', + '”' => '"', + '„' => ',,', + '‟' => '"', + '․' => '.', + '‥' => '..', + '…' => '...', + '′' => '\'', + '″' => '"', + '‹' => '<', + '›' => '>', + '‼' => '!!', + '⁄' => '/', + '⁅' => '[', + '⁆' => ']', + '⁇' => '??', + '⁈' => '?!', + '⁉' => '!?', + '⁎' => '*', + ' ' => ' ', + '₠' => 'CE', + '₢' => 'Cr', + '₣' => 'Fr.', + '₤' => 'L.', + '₧' => 'Pts', + '₹' => 'Rs', + '₺' => 'TL', + '℀' => 'a/c', + '℁' => 'a/s', + 'ℂ' => 'C', + '℅' => 'c/o', + '℆' => 'c/u', + 'ℊ' => 'g', + 'ℋ' => 'H', + 'ℌ' => 'x', + 'ℍ' => 'H', + 'ℎ' => 'h', + 'ℐ' => 'I', + 'ℑ' => 'I', + 'ℒ' => 'L', + 'ℓ' => 'l', + 'ℕ' => 'N', + '№' => 'No', + '℗' => '(P)', + '℘' => 'P', + 'ℙ' => 'P', + 'ℚ' => 'Q', + 'ℛ' => 'R', + 'ℜ' => 'R', + 'ℝ' => 'R', + '℞' => 'Rx', + '℡' => 'TEL', + 'ℤ' => 'Z', + 'Ω' => 'O', + 'ℨ' => 'Z', + 'K' => 'K', + 'Å' => 'A', + 'ℬ' => 'B', + 'ℭ' => 'C', + 'ℯ' => 'e', + 'ℰ' => 'E', + 'ℱ' => 'F', + 'ℳ' => 'M', + 'ℴ' => 'o', + 'ℹ' => 'i', + '℻' => 'FAX', + 'ⅅ' => 'D', + 'ⅆ' => 'd', + 'ⅇ' => 'e', + 'ⅈ' => 'i', + 'ⅉ' => 'j', +]; diff --git a/tests/TestCase/DjotConverterTest.php b/tests/TestCase/DjotConverterTest.php index 25d74eb1..0373d5e3 100644 --- a/tests/TestCase/DjotConverterTest.php +++ b/tests/TestCase/DjotConverterTest.php @@ -17,6 +17,7 @@ use LengthException; use PHPUnit\Framework\TestCase; use RuntimeException; +use Transliterator; class DjotConverterTest extends TestCase { @@ -1900,8 +1901,33 @@ public function testUnicodeInHeading(): void $djot = '# 日本語の見出し'; $result = $this->converter->convert($djot); - $this->assertStringContainsString('
', $result); + // The visible heading text is unchanged; only the ID is made + // ASCII-safe so it survives being shared as a URL fragment. $this->assertStringContainsString('

日本語の見出し

', $result); + $this->assertStringNotContainsString('id="日本語の見出し"', $result); + $this->assertMatchesRegularExpression('/
/', $result); + + if (class_exists(Transliterator::class)) { + // With ext-intl the CJK heading is romanized rather than dropped. + $this->assertStringContainsString('
', $result); + } + } + + /** + * The implicit-heading-reference pass (BlockParser, fresh tracker) and the + * renderer must compute the same `s-N` fallback id, even when an explicit + * non-heading id exists. Regression guard: a render-only dedup once made + * the heading `s-2` while the reference still pointed at `#s-1`. + */ + public function testGeneratedFallbackIdStaysConsistentWithImplicitReference(): void + { + $result = $this->converter->convert("{#s-1}\npara\n\n# !!!\n\n[!!!][]\n"); + + $this->assertSame( + 1, + preg_match('/
/', $result, $section), + ); + $this->assertStringContainsString('href="#' . $section[1] . '"', $result); } /** diff --git a/tests/TestCase/Renderer/AsciiTransliteratorTest.php b/tests/TestCase/Renderer/AsciiTransliteratorTest.php new file mode 100644 index 00000000..abdb8817 --- /dev/null +++ b/tests/TestCase/Renderer/AsciiTransliteratorTest.php @@ -0,0 +1,128 @@ +assertSame($expected, $withMap->transliterate($input), 'map fallback'); + + if (class_exists(Transliterator::class)) { + $withIntl = new AsciiTransliterator(useIntl: true); + $this->assertSame($expected, $withIntl->transliterate($input), 'intl engine'); + } + } + + /** + * @return array + */ + public static function deterministicCases(): array + { + return [ + 'plain ascii untouched' => ['Hello World', 'Hello World'], + 'german umlaut' => ['Über uns', 'Uber uns'], + 'sharp s' => ['Straße', 'Strasse'], + 'french accents' => ['café résumé', 'cafe resume'], + 'ligatures' => ['œuvre Æsir', 'oeuvre AEsir'], + 'smart quotes' => ['Bob’s “Guide”', "Bob's \"Guide\""], + 'dashes' => ['en–dash em—dash', 'en-dash em-dash'], + 'cyrillic' => ['Привет мир', 'Privet mir'], + 'nbsp' => ["a\u{00A0}b", 'a b'], + ]; + } + + public function testEmptyStringStaysEmpty(): void + { + $this->assertSame('', (new AsciiTransliterator(useIntl: false))->transliterate('')); + } + + public function testUnmappedScriptIsStrippedByMapFallback(): void + { + $map = new AsciiTransliterator(useIntl: false); + + // The baked map has no CJK entries (non-Latin script). + $this->assertSame('', $map->transliterate('日本語')); + + // Greek romanization is context-sensitive in ICU (`αυ`→`au` but + // `υ`→`y`), so the whole Greek block is excluded from the baked map + // rather than baked half-right. Without intl it degrades to the + // generated id, exactly like CJK; with intl it is still romanized. 
+ $this->assertSame('', $map->transliterate('Αυγή')); + $this->assertSame('', $map->transliterate('Ελλάδα')); + } + + /** + * Word boundaries must survive the map fallback: unmapped non-ASCII + * separators / punctuation (ideographic space, ideographic comma, …) + * become a space so they later normalize to `-` instead of merging + * adjacent ASCII words into one token. + */ + public function testUnmappedSeparatorsKeepWordBoundaries(): void + { + $translit = new AsciiTransliterator(useIntl: false); + + $this->assertSame('foo bar', $translit->transliterate("foo\u{3000}bar")); + $this->assertSame('foo bar', $translit->transliterate('foo、bar')); + $this->assertSame('a b', $translit->transliterate('a→b')); + } + + public function testIntlRomanizesUnmappedScript(): void + { + if (!class_exists(Transliterator::class)) { + self::markTestSkipped('ext-intl not available'); + } + + foreach (['日本語', 'Αυγή'] as $input) { + $result = (new AsciiTransliterator(useIntl: true))->transliterate($input); + + $this->assertNotSame('', $result, $input); + $this->assertMatchesRegularExpression('/^[\x00-\x7F]+$/', $result, $input); + } + } + + public function testResultIsAlwaysPureAscii(): void + { + foreach (['🎉 party', 'mixed Übér 日本語 �яtest'] as $input) { + foreach ([true, false] as $useIntl) { + if ($useIntl && !class_exists(Transliterator::class)) { + continue; + } + $out = (new AsciiTransliterator(useIntl: $useIntl))->transliterate($input); + $this->assertSame(1, preg_match('/^[\x00-\x7F]*$/', $out), $input); + } + } + } + + /** + * On an ext-intl build where ICU cannot create the transliterator, + * the deterministic baked map must still be used for covered ranges + * instead of silently stripping everything. 
+ */ + public function testFallsBackToMapWhenIcuUnavailable(): void + { + $translit = new class (useIntl: true) extends AsciiTransliterator { + protected static function icu(): ?Transliterator + { + return null; // simulate Transliterator::create() returning null + } + }; + + $this->assertSame('Uber-uns Privet', $translit->transliterate('Über-uns Привет')); + } +} diff --git a/tests/TestCase/Renderer/HeadingIdTrackerTest.php b/tests/TestCase/Renderer/HeadingIdTrackerTest.php index 4625f1a5..22a27ba0 100644 --- a/tests/TestCase/Renderer/HeadingIdTrackerTest.php +++ b/tests/TestCase/Renderer/HeadingIdTrackerTest.php @@ -11,6 +11,7 @@ use Djot\Node\Inline\Strong; use Djot\Node\Inline\Symbol; use Djot\Node\Inline\Text; +use Djot\Renderer\AsciiTransliterator; use Djot\Renderer\HeadingIdTracker; use PHPUnit\Framework\TestCase; @@ -174,30 +175,36 @@ public function testNormalizeId(): void $this->assertSame('Multiple-Spaces', $this->tracker->normalizeId('Multiple Spaces')); $this->assertSame('this-t-key-params-fallback', $this->tracker->normalizeId("\$this->t(\$key, \$params = [], \$fallback = '')")); $this->assertSame('My-title', $this->tracker->normalizeId('My --- title')); - $this->assertSame('日本語の見出し', $this->tracker->normalizeId('日本語の見出し')); - $this->assertSame('heading', $this->tracker->normalizeId('###')); + // Non-ASCII is transliterated to keep shared anchors link-safe; the + // Latin/Cyrillic output is deterministic with or without ext-intl. + $this->assertSame('Privet-mir', $this->tracker->normalizeId('Привет мир')); + $this->assertSame('', $this->tracker->normalizeId('###')); $this->assertSame('h-123-Things', $this->tracker->normalizeId('123 Things')); $this->assertSame('h-1-Introduction', $this->tracker->normalizeId('1. Introduction')); } /** - * Pins behaviour discussed in jgm/djot#391 (spec wording on auto-ID generation). + * Pins djot-php's heading-ID behaviour around jgm/djot#391. 
* - * djot-php sides with djot.js / djoths on remove-vs-replace (mid-word punctuation - * becomes `-`), and deliberately deviates on apostrophes / quotes / `;` / `:` by - * also replacing them, so generated IDs are valid CSS identifiers and safe to use - * with `querySelector()`. + * djot-php replaces (not removes) mid-word punctuation, additionally + * replaces apostrophes / quotes / `;` / `:` so IDs are valid CSS + * identifiers, and transliterates non-ASCII to ASCII so the IDs survive + * being shared as URL fragments through auto-linkers. All cases below + * are deterministic with or without ext-intl. */ public function testNormalizeIdSpecAlignmentEdgeCases(): void { $this->assertSame('A-B-C', $this->tracker->normalizeId('A+B=C')); $this->assertSame('Emphasis-strong', $this->tracker->normalizeId('Emphasis/strong')); $this->assertSame('That-s-all', $this->tracker->normalizeId("That's all")); + $this->assertSame('That-s-all', $this->tracker->normalizeId('That’s all')); $this->assertSame('foo-bar', $this->tracker->normalizeId('foo...bar')); $this->assertSame('Uber-uns', $this->tracker->normalizeId('Uber uns')); - $this->assertSame('Über-uns', $this->tracker->normalizeId('Über uns')); + $this->assertSame('Uber-uns', $this->tracker->normalizeId('Über uns')); + $this->assertSame('cafe-resume', $this->tracker->normalizeId('café résumé')); + $this->assertSame('Strasse', $this->tracker->normalizeId('Straße')); $this->assertSame('h-2024-recap', $this->tracker->normalizeId('2024 recap')); - $this->assertSame('heading', $this->tracker->normalizeId('!!!')); + $this->assertSame('', $this->tracker->normalizeId('!!!')); } public function testGetPlainText(): void @@ -376,4 +383,48 @@ public function testUnderscoreRetainedInId(): void $this->assertSame('foo_bar-baz', $id); } + + /** + * When transliteration removes the entire heading text (a script outside + * the baked map, no ext-intl), the heading must fall back to a stable + * generated `s-N` id — not the legacy `heading` 
sentinel. + */ + public function testHeadingThatTransliteratesToNothingGetsFallbackId(): void + { + $tracker = new HeadingIdTracker(new AsciiTransliterator(useIntl: false)); + + $cjk = new Heading(2); + $cjk->appendChild(new Text('日本語の見出し')); + + $next = new Heading(2); + $next->appendChild(new Text('عنوان عربي')); + + $this->assertSame('s-1', $tracker->getIdForHeading($cjk)); + $this->assertSame('s-2', $tracker->getIdForHeading($next)); + } + + public function testAllPunctuationHeadingGetsFallbackId(): void + { + $heading = new Heading(2); + $heading->appendChild(new Text('!!!')); + + $this->assertSame('s-1', $this->tracker->getIdForHeading($heading)); + } + + /** + * The `s-N` fallback dedupes against reserved IDs: an earlier explicit + * `{#s-1}` forces the next all-punct/empty heading to take `s-2`, + * skipping the taken slot. Parser/render parity is preserved by + * BlockParser's post-parse rewrite (see #184), so the do-while here is + * safe — both passes seed their tracker with `reserveExplicitIds`. + */ + public function testFallbackIdSkipsReservedSNCollision(): void + { + $this->tracker->trackId('s-1'); + + $heading = new Heading(2); + $heading->appendChild(new Text('###')); + + $this->assertSame('s-2', $this->tracker->getIdForHeading($heading)); + } }