Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions bin/gen-translit-map.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env php
<?php

declare(strict_types=1);

/**
* Regenerates src/Renderer/ascii_translit_map.php from the ICU
* "Any-Latin; Latin-ASCII" transliterator.
*
* The baked map is the deterministic fallback used by AsciiTransliterator
* when ext-intl is unavailable. Generating it from ICU guarantees that the
* output for every range that ends up baked is identical with and without
* intl. Ranges containing context-sensitive rules (e.g. Greek) are skipped
* entirely and are only romanized when intl is present. Requires ext-intl
* to run.
*
* Usage: php bin/gen-translit-map.php
*/

// Preflight: this generator cannot run without ext-intl. Transliterator and
// IntlChar (used further down) both ship with that extension, so checking for
// one of them is enough.
if (!class_exists(Transliterator::class)) {
    fwrite(STDERR, "ext-intl is required to regenerate the map.\n");
    exit(1);
}

// The same compound transform the runtime uses when intl is available:
// romanize to Latin first, then fold Latin to plain ASCII.
$tr = Transliterator::create('Any-Latin; Latin-ASCII');
if ($tr === null) {
    // Surface ICU's own reason (e.g. an unknown transform id in this ICU
    // build) instead of a bare failure notice.
    fwrite(STDERR, 'Failed to create ICU transliterator: ' . intl_get_error_message() . ".\n");
    exit(1);
}

// Candidate code point ranges, as inclusive [first, last] pairs. Each range
// is later baked or rejected as a whole.
$ranges = [
    [0x00A0, 0x024F], // Latin-1 / Latin Extended
    [0x0250, 0x02AF], // IPA
    [0x0300, 0x036F], // combining marks
    [0x0370, 0x03FF], // Greek
    [0x0400, 0x04FF], // Cyrillic
    [0x1E00, 0x1EFF], // Latin Extended Additional
    [0x2000, 0x206F], // general punctuation
    [0x2070, 0x209F], // super/subscripts
    [0x20A0, 0x20BF], // currency
    [0x2100, 0x214F], // letterlike
];

// Why ranges are all-or-nothing:
//
// A one-character-at-a-time map cannot express ICU rules that depend on
// neighbouring characters (e.g. Greek `αυ` → `au` while a lone `υ` → `y`).
// Baking just the context-free letters of such a script is *worse* than
// leaving the script out: `Αυγή` would come out of the map as `Ae`, whereas
// ICU produces `Auge`. Therefore a range is baked only when EVERY code point
// in it that transliterates to pure ASCII does so independently of context.
// Context-sensitive scripts (Greek, …) are dropped wholesale and, without
// ext-intl, degrade to the generated `s-N` fallback exactly like CJK; with
// ext-intl they are still romanized.
$map = [];
foreach ($ranges as [$first, $last]) {
    $candidates = [];
    $bakeRange = true;

    foreach (range($first, $last) as $codePoint) {
        $glyph = IntlChar::chr($codePoint);
        if ($glyph === null) {
            continue;
        }

        // Only keep code points ICU actually changes into pure ASCII
        // (possibly the empty string, e.g. for combining marks).
        $plain = $tr->transliterate($glyph);
        $isUsable = $plain !== false
            && $plain !== $glyph
            && preg_match('/^[\x00-\x7F]*$/', $plain) === 1;
        if (!$isUsable) {
            continue;
        }

        // Probe the character doubled and between lower/upper-case Latin
        // letters; any deviation from plain concatenation means the rule is
        // context-sensitive and disqualifies the whole range.
        $probes = [
            [$glyph . $glyph, $plain . $plain],
            ['a' . $glyph . 'a', 'a' . $plain . 'a'],
            ['Z' . $glyph . 'Z', 'Z' . $plain . 'Z'],
        ];
        foreach ($probes as [$input, $expected]) {
            if ($tr->transliterate($input) !== $expected) {
                $bakeRange = false;

                break 2;
            }
        }

        $candidates[$glyph] = $plain;
    }

    if ($bakeRange) {
        $map = $map + $candidates;
    }
}
ksort($map);

// Render one `'from' => 'to',` array entry per mapping; var_export() takes
// care of quoting and escaping both sides.
$lines = [];
foreach ($map as $from => $to) {
    $lines[] = ' ' . var_export($from, true) . ' => ' . var_export($to, true) . ',';
}

// File preamble of the generated map. It describes the ranges that were
// *scanned*; which of them are actually present depends on the all-or-nothing
// context-free check performed while building $map.
$header = <<<'PHP'
<?php

declare(strict_types=1);

/**
 * Unicode -> ASCII transliteration fallback map.
 *
 * Generated from the ICU "Any-Latin; Latin-ASCII" transliterator by scanning
 * the Latin, IPA, combining-marks, Greek, Cyrillic, Latin-Extended-Additional,
 * punctuation, super/subscript, currency and letterlike ranges. A range is
 * baked only if all of its ASCII-transliterable code points are context-free,
 * so context-sensitive scripts (e.g. Greek) are absent. Used by the
 * AsciiTransliterator only when ext-intl is unavailable, so output for the
 * baked ranges is byte-identical with or without intl. Do not hand-edit;
 * regenerate with `php bin/gen-translit-map.php`.
 *
 * @return array<string, string>
 */
return [
PHP;

$target = dirname(__DIR__) . '/src/Renderer/ascii_translit_map.php';
$contents = $header . "\n" . implode("\n", $lines) . "\n];\n";

// file_put_contents() returns false on failure (missing directory, read-only
// checkout, …). Report that instead of printing a success message and
// exiting 0 with a stale or missing map.
if (file_put_contents($target, $contents) === false) {
    fwrite(STDERR, "Failed to write {$target}.\n");
    exit(1);
}

echo 'Wrote ' . count($map) . " entries to {$target}\n";
3 changes: 3 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
"require": {
"php": "^8.2"
},
"suggest": {
"ext-intl": "Recommended for heading-ID transliteration: enables ICU romanization of scripts the baked map does not cover (e.g. Greek, CJK, Arabic). Without it, a baked map covers Latin/Cyrillic/punctuation identically and other scripts fall back to generated `s-N` ids."
},
"require-dev": {
"nikic/php-fuzzer": "^0.0.11",
"php-collective/code-sniffer": "dev-master",
Expand Down
79 changes: 35 additions & 44 deletions docs/reference/enhancements.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,64 +160,55 @@ content is a symbol falls back to a generated `s-N` ID.

**Status:** Implemented in djot-php

Auto-generated heading IDs are normalized to be valid CSS selectors, ensuring compatibility with `querySelector()`, HTMX scroll restoration, and CSS attribute selectors.
Auto-generated heading IDs are normalized to be valid CSS selectors **and ASCII-only**, so they work with `querySelector()` / HTMX scroll restoration *and* survive being copied around as URL fragments (see [Why ASCII](#why-ascii) below).

### Normalization Rules

1. **Strip `#` characters** — Prevents invalid selectors
2. **Trim whitespace** — Clean leading/trailing spaces
3. **Whitespace to dashes** — Spaces become single `-`
4. **Invalid characters to dashes** — Only Unicode letters (`\p{L}`), numbers (`\p{N}`), hyphens, and underscores are preserved
5. **Collapse consecutive dashes** — `foo--bar` becomes `foo-bar`
6. **Trim leading/trailing dashes** — `-foo-` becomes `foo`
7. **Prefix digits** — IDs starting with a number get `h-` prefix (CSS requirement)
8. **Fallback** — Empty results become `heading`
1. **Transliterate to ASCII** — `Über`→`Uber`, `café`→`cafe`, `Привет`→`Privet`, smart quotes/dashes→`'"-` (then replaced)
2. **Strip `#` characters** — Prevents invalid selectors
3. **Trim whitespace**
4. **Whitespace to dashes** — Spaces become single `-`
5. **Invalid characters to dashes** — Anything other than letters, numbers, `-`, `_` becomes `-`
6. **Collapse consecutive dashes** — `foo--bar` becomes `foo-bar`
7. **Trim leading/trailing dashes**
8. **Prefix digits** — IDs starting with a digit get an `h-` prefix (CSS requirement)
9. **Fallback** — Empty results become `heading` (or a generated `s-N` for empty headings)

### Examples

| Heading | Generated ID |
|---------|--------------|
| `# Hello World` | `Hello-World` |
| `# Hello World!` | `Hello-World` |
| `# 日本語の見出し` | `日本語の見出し` |
| `# Привет мир` | `Привет-мир` |
| `# Über uns` | `Uber-uns` |
| `# café résumé` | `cafe-resume` |
| `# Привет мир` | `Privet-mir` |
| `# Bob's Guide` (smart quotes) | `Bob-s-Guide` |
| `# E=mc^2` | `E-mc-2` |
| `# 123 Numbers First` | `h-123-Numbers-First` |
| `# $this->method()` | `this-method` |
| `# ###` | `heading` |

### Unicode Preservation
### Why ASCII {#why-ascii}

International characters are preserved while special characters are normalized:
Heading IDs end up as URL fragments (`…/page#Über-uns`) that get copied into chat, email and other documents, where **auto-linkers re-detect the URL heuristically**. Non-ASCII fragments are routinely:

```djot
# 日本語の見出し

# Cześć świecie
```
- **truncated** — the link is cut at the first non-ASCII byte (`#Über` → `#`), producing a silent dead link;
- **percent-encoded inconsistently** — `’`→`%E2%80%99`, bloating and sometimes breaking the link;
- **re-normalized differently** by the receiving app (NFC/NFD), so the pasted fragment no longer matches the page's `id`.

**Output:**
```html
<h1 id="日本語の見出し">日本語の見出し</h1>
<h1 id="Cześć-świecie">Cześć świecie</h1>
```
Transliterating to ASCII keeps shared deep links robust. It's a deliberate deviation from both the djot.js reference and the [jgm/djot#393](https://github.com/jgm/djot/pull/393) spec prose (both preserve non-ASCII) — see [Spec Alignment](#spec-alignment).

### Why This Matters
### Transliteration engine & determinism

Without CSS-safe normalization, headings with special characters would break:
Two engines produce the ASCII form:

```js
// This would throw SyntaxError with unsafe IDs
document.querySelector('#E=mc^2'); // Invalid selector
htmx.scrollToElement('#$this->foo'); // Invalid selector
```
- **ICU `Transliterator`** (`Any-Latin; Latin-ASCII`) when `ext-intl` is installed — also romanizes scripts the map doesn't cover (Greek, CJK, Arabic, …);
- a **baked Unicode→ASCII map** (`src/Renderer/ascii_translit_map.php`) otherwise.

With normalization, these work correctly:
The baked map is generated *from the same ICU transform*, and the generator bakes a script **only if every code point in it transliterates context-free** (verified standalone, doubled, and between Latin letters). For those scripts — Latin (so all of German, French, Spanish, Polish, Czech, Turkish, Vietnamese, …), Cyrillic, punctuation, smart quotes, dashes, currency — the output is **byte-identical with or without `ext-intl`**, so shared anchors stay stable across environments.

```js
document.querySelector('#E-mc-2'); // Works
htmx.scrollToElement('#this-foo'); // Works
```
Scripts whose ICU romanization is context-sensitive (e.g. Greek: `αυ`→`au` but `υ`→`y`) are excluded *wholesale* — baking only their context-free letters would produce IDs that disagree with ICU, which is worse than not covering them. Those scripts, plus non-Latin scripts the map never covers (CJK, Arabic, …), behave one way: **with `ext-intl` they are romanized; without it they are dropped and the heading falls back to a generated `s-N` id**. `ext-intl` is therefore *recommended* (a `composer suggest`) but not required; the determinism guarantee above never depends on it.

### Explicit IDs

Expand All @@ -227,27 +218,27 @@ You can always override with an explicit ID attribute:
# My Heading {#custom-id}
```

Explicit IDs are used as-is without normalization.
Explicit IDs are used as-is without normalization or transliteration.

### Spec Alignment
### Spec Alignment {#spec-alignment}

The remove-vs-replace question raised in [jgm/djot#391](https://github.com/jgm/djot/issues/391) was settled by [jgm/djot#393](https://github.com/jgm/djot/pull/393), which reworded the spec to: *"replacing each maximal run of non-alphanumeric ASCII characters with `-`, removing any leading or trailing `-`"*. Note that #393 only changes the spec **prose** — the djot.js reference implementation is unchanged and (per djot's own changelog policy) remains the authoritative behavior. The new prose is actually broader than djot.js itself: it would also strip `_`, which djot.js keeps.
The remove-vs-replace question raised in [jgm/djot#391](https://github.com/jgm/djot/issues/391) was settled by [jgm/djot#393](https://github.com/jgm/djot/pull/393), which reworded the spec to: *"replacing each maximal run of non-alphanumeric ASCII characters with `-`, removing any leading or trailing `-`"*. #393 changed only the spec **prose**; the djot.js reference implementation is unchanged.

djot-php replaces (does not remove) mid-word punctuation — the direction #393 settled on — and tracks the djot.js **implementation** where the prose and implementation disagree, deliberately deviating only where required to produce valid CSS identifiers for `querySelector()` consumers.
djot-php replaces (does not remove) mid-word punctuation — the direction #393 settled on — additionally replaces `' " ; :` so IDs are valid CSS identifiers, and **transliterates non-ASCII to ASCII** so IDs stay link-safe when shared. The last point is a deliberate deviation from *both* djot.js and the #393 prose, justified by the [Why ASCII](#why-ascii) failure mode.

| Aspect | djot.js reference impl | #393 spec prose | djot-php |
|--------|------------------------|-----------------|----------|
| Mid-word punctuation (`A+B=C`) | `A-B-C` | `A-B-C` | `A-B-C` |
| Non-ASCII letters (`Über uns`) | preserve → `Über-uns` | preserve → `Über-uns` | preserve → `Über-uns` |
| Consecutive punctuation (`foo...bar`) | collapse → `foo-bar` | collapse → `foo-bar` | collapse → `foo-bar` |
| Underscore (`foo_bar`) | keep → `foo_bar` | strip → `foo-bar` | keep → `foo_bar` (follows impl; CSS-valid) |
| Apostrophe (`That's all`) | preserve → `That's-all` | replace → `That-s-all` | replace → `That-s-all` (CSS-safe) |
| Double quote / `;` / `:` | preserve | replace | replace with `-` (CSS-safe) |
| Underscore (`foo_bar`) | keep → `foo_bar` | strip → `foo-bar` | keep → `foo_bar` (CSS-valid, link-safe) |
| Apostrophe / `"` / `;` / `:` | preserve | replace | replace → `-` (CSS-safe) |
| Non-ASCII letters (`Über uns`) | preserve → `Über-uns` | preserve → `Über-uns` | **transliterate → `Uber-uns`** (link-safe) |
| Non-ASCII / smart quotes (`Bob’s`) | preserve → `Bob’s` | preserve → `Bob’s` | **transliterate → `Bob-s`** (link-safe) |
| Leading digit (`2024 recap`) | `2024-recap` | `2024-recap` | prefix → `h-2024-recap` (CSS requires non-digit start) |
| Empty result (`!!!`) | `s-N` family | unspecified | fallback → `heading` |
| Symbols / footnote refs | excluded | excluded | excluded |

The apostrophe / quote / semicolon / colon deviation is deliberate: these characters are not valid in unescaped CSS identifiers, so preserving them per djot.js would force every JS consumer to round-trip through `CSS.escape()` before doing a selector lookup. The leading-digit and empty-result behaviors fill in gaps that the spec and implementation handle inconsistently.
The deviations are deliberate: `' " ; :` are not valid in unescaped CSS identifiers, and non-ASCII fragments break when shared (see [Why ASCII](#why-ascii)). The leading-digit and empty-result behaviors fill in gaps the spec and reference handle inconsistently. A note proposing the spec clarify the non-ASCII question is tracked against [jgm/djot#391](https://github.com/jgm/djot/issues/391).

---

Expand Down
103 changes: 103 additions & 0 deletions src/Renderer/AsciiTransliterator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
<?php

declare(strict_types=1);

namespace Djot\Renderer;

use Transliterator;

/**
 * Transliterates arbitrary Unicode text to ASCII for use in heading IDs.
 *
 * Heading IDs become URL fragments that get copied around and re-detected by
 * auto-linkers in chat clients, mail and other docs. Non-ASCII fragments are
 * routinely truncated or percent-encoded inconsistently there, producing
 * broken deep links. Reducing IDs to ASCII keeps shared links robust.
 *
 * Two engines:
 *  - ICU `Transliterator` ("Any-Latin; Latin-ASCII") when ext-intl is
 *    available — also romanizes scripts the map does not cover (e.g. CJK);
 *  - a baked Unicode->ASCII map generated *from that same ICU transform*
 *    (see ascii_translit_map.php) as the fallback.
 *
 * Because the fallback is generated from ICU, output for the characters the
 * map contains is byte-identical with or without intl, so shared anchors
 * stay stable across environments. Only characters outside the baked map
 * differ: intl romanizes them, the map path drops them (the caller's
 * empty-result handling then yields a stable generated id).
 */
class AsciiTransliterator
{
    /**
     * Whether icu() has already attempted to create the transliterator.
     * Kept separately from $icu because a failed attempt also leaves $icu null.
     */
    protected static bool $icuResolved = false;

    /**
     * Shared ICU transliterator, or null when it is unavailable.
     */
    protected static ?Transliterator $icu = null;

    /**
     * Lazily loaded baked fallback map (character => ASCII replacement).
     *
     * @var array<string, string>|null
     */
    protected static ?array $map = null;

    /**
     * Whether this instance should try the ICU engine first.
     */
    protected bool $useIntl;

    /**
     * @param bool|null $useIntl Force the engine; null auto-detects ext-intl.
     */
    public function __construct(?bool $useIntl = null)
    {
        $this->useIntl = $useIntl ?? class_exists(Transliterator::class);
    }

    /**
     * Reduces $text to a pure-ASCII string.
     *
     * @param string $text Arbitrary text.
     *
     * @return string ASCII-only text. Non-ASCII separators, punctuation and
     *   symbols that neither engine resolved become spaces; all other
     *   unresolved non-ASCII bytes are removed.
     */
    public function transliterate(string $text): string
    {
        if ($text === '') {
            return '';
        }

        $icu = $this->useIntl ? static::icu() : null;
        $converted = $icu !== null ? $icu->transliterate($text) : false;

        // Use the deterministic baked map whenever ICU did not produce a
        // result: intl absent, Transliterator::create() returned null, or
        // transliterate() itself reported failure by returning false. Without
        // the last case, characters the map covers would simply be stripped
        // below and the output would differ from the no-intl path.
        $text = $converted !== false ? $converted : strtr($text, static::map());

        // Anything still non-ASCII is something neither ICU nor the map
        // resolved. Turn separators / punctuation / symbols into a space
        // first so word boundaries (e.g. the ideographic space U+3000 or
        // comma U+3001 between ASCII words) survive as `-` instead of
        // merging tokens; then drop the rest (letters of unromanizable
        // scripts) so the caller falls back to a stable generated id.
        $text = (string)preg_replace_callback(
            '/[^\x00-\x7F]+/',
            static fn (array $m): string => (string)preg_replace('/[\p{Z}\p{P}\p{S}]/u', ' ', $m[0]),
            $text,
        );

        return (string)preg_replace('/[^\x00-\x7F]+/', '', $text);
    }

    /**
     * Returns the shared ICU transliterator, creating it on first use.
     *
     * @return \Transliterator|null Null when ext-intl is missing or ICU could
     *   not create the transform; the outcome is cached either way.
     */
    protected static function icu(): ?Transliterator
    {
        if (!static::$icuResolved) {
            static::$icuResolved = true;
            static::$icu = class_exists(Transliterator::class)
                ? Transliterator::create('Any-Latin; Latin-ASCII')
                : null;
        }

        return static::$icu;
    }

    /**
     * Returns the baked fallback map, loading the generated file on first use.
     *
     * @return array<string, string>
     */
    protected static function map(): array
    {
        return static::$map ??= require __DIR__ . '/ascii_translit_map.php';
    }
}
Loading
Loading