diff --git a/docs/extensions/index.md b/docs/extensions/index.md index 40214460..9df909ef 100644 --- a/docs/extensions/index.md +++ b/docs/extensions/index.md @@ -609,7 +609,7 @@ This follows the approach discussed in [djot issue #286](https://github.com/jgm/ Adds a fenced line block written as a `:::` div whose only class token is a pipe: `::: |`. It produces the same `line-block` div as the [`|`-prefixed form](/guide/syntax#line-blocks), but without prefixing every line - convenient for verse, addresses, lyrics, and signature blocks where each line would otherwise need a leading `|`. -Inside the fence, each soft line break becomes a hard break (`
<br>`), leading whitespace is preserved, and a blank line separates stanzas (each becomes its own paragraph). Inline djot (emphasis, links, ...) still parses normally. +Inside the fence, each soft line break becomes a hard break (`<br>
`), significant whitespace is preserved, and a blank line separates stanzas (each becomes its own paragraph). Inline djot (emphasis, links, ...) still parses normally. ```php use Djot\Extension\LineBlockDivExtension; @@ -617,7 +617,25 @@ use Djot\Extension\LineBlockDivExtension; $converter->addExtension(new LineBlockDivExtension()); ``` -Leading whitespace on each line is preserved as a non-breaking space, so the indentation survives without any CSS: ` ` in HTML, a real non-breaking space (`U+00A0`) in Markdown - which keeps it through a round-trip re-render and never trips Markdown's indented-code-block rule - and an ordinary space in the plain-text and ANSI renderers. Tabs expand to four-column stops. +Significant whitespace on each line is preserved as a non-breaking space, so the shape survives without any CSS: ` ` in HTML, a real non-breaking space (`U+00A0`) in Markdown - which keeps it through a round-trip re-render and never trips Markdown's indented-code-block rule - and an ordinary space in the plain-text and ANSI renderers. Tabs expand to four-column stops. + +Significant means: all **leading** indentation, plus any **medial or trailing** run of two or more columns - the gaps used for inline alignment, such as the caesura in Old English verse or the columns of an address. A lone space between words stays an ordinary, collapsible space, so a long line can still wrap. This follows Pandoc's line-block rule that every space the author writes is meaningful, and addresses the [medial-gap point raised on djot#29](https://github.com/jgm/djot/issues/29). + +**Input:** + +```djot +::: | +Hwaet! We Gardena in geardagum +theodcyninga thrym gefrunon +::: +``` + +```html +
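+<div class="line-block">
+<p>Hwaet! We Gardena&nbsp;&nbsp;&nbsp;&nbsp;in geardagum<br>
+theodcyninga&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;thrym gefrunon</p>
+</div>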
+``` **Input:** diff --git a/docs/guide/syntax.md b/docs/guide/syntax.md index 5f40b222..3bdf60f3 100644 --- a/docs/guide/syntax.md +++ b/docs/guide/syntax.md @@ -1277,7 +1277,7 @@ And so are you.

::: tip Fenced alternative -Prefer not to prefix every line? The [`LineBlockDivExtension`](/extensions/#lineblockdivextension) adds a fenced form, `::: |`, that produces the same `line-block` div without the per-line `|`. Leading whitespace is preserved and a blank line separates stanzas. +Prefer not to prefix every line? The [`LineBlockDivExtension`](/extensions/#lineblockdivextension) adds a fenced form, `::: |`, that produces the same `line-block` div without the per-line `|`. Leading indentation and medial alignment gaps are preserved, and a blank line separates stanzas. ::: ### Block Attributes diff --git a/src/Extension/LineBlockDivExtension.php b/src/Extension/LineBlockDivExtension.php index ac3157cb..0ed0cdac 100644 --- a/src/Extension/LineBlockDivExtension.php +++ b/src/Extension/LineBlockDivExtension.php @@ -12,6 +12,7 @@ use Djot\Node\Node; use Djot\Parser\Block\FencedBlockParser; use Djot\Parser\BlockParser; +use Djot\Parser\InlineParser; /** * Adds a fenced line-block div: `::: |`. @@ -198,8 +199,8 @@ protected function splitStanzas(array $innerLines): array } /** - * Build one stanza paragraph: each line is inline-parsed and joined by a - * hard break, so single newlines render as `
<br>`. + * Build one stanza paragraph: each line keeps its significant whitespace and + * is joined to the next by a hard break, so single newlines render as `<br>
`. * * @param \Djot\Parser\BlockParser $blockParser * @param array $stanza @@ -212,19 +213,7 @@ protected function buildStanza(BlockParser $blockParser, array $stanza, int $bas $last = count($stanza) - 1; $index = 0; foreach ($stanza as $line) { - // Emit leading whitespace via the internal non-breaking-space - // placeholder (U+E000, the same sentinel the parser uses for an - // escaped space). The HTML renderer turns it into a   entity so - // the indent survives whitespace collapsing; Markdown keeps a real - // non-breaking space (U+00A0); plain text and ANSI use a normal space. - // A private-use character is used so it never collides with a literal - // U+00A0 in the author's own text. Tabs expand to four-column stops. - [$columns, $content] = $this->splitLeadingWhitespace($line); - if ($columns > 0) { - $paragraph->appendChild(new Text(str_repeat("\u{E000}", $columns))); - } - - $inlineParser->parse($paragraph, $content, $baseLine + $index); + $this->appendLine($paragraph, $inlineParser, $line, $baseLine + $index); if ($index < $last) { $paragraph->appendChild(new HardBreak()); } @@ -235,27 +224,80 @@ protected function buildStanza(BlockParser $blockParser, array $stanza, int $bas } /** - * Split a line into its leading-whitespace width (in columns, tabs expanded - * to four-column stops) and the remaining content. + * Append one verse line, preserving its significant whitespace. + * + * In a line block every space the author typed is meaningful (Pandoc's + * definition), so a verse keeps not only its leading indent but also the + * medial gaps used for alignment - the caesura of Old English verse, columns + * in an address, chords aligned above lyrics. The rule: * - * @return array{0: int, 1: string} + * - **Leading** whitespace (indentation): always preserved, even one space. + * - **Inner / trailing** runs of **two or more** columns (medial gaps, + * inline alignment): preserved. 
+ * - A **single** inner space stays an ordinary, collapsible space, so a long + * line can still wrap between words. + * + * Each preserved column is emitted via the internal non-breaking-space + * placeholder (U+E000, the same sentinel the parser uses for an escaped + * space): the HTML renderer turns it into ` `, Markdown keeps a real + * U+00A0 (so the gap survives a round-trip and is never read as indented + * code), and plain text / ANSI use an ordinary space. A private-use character + * is used so it never collides with a literal U+00A0 in the author's text. + * Tabs expand to four-column stops. Text segments between gaps are + * inline-parsed, so emphasis, links, and the rest still work. + * + * @param \Djot\Node\Block\Paragraph $paragraph + * @param \Djot\Parser\InlineParser $inlineParser + * @param string $line + * @param int $lineNo */ - protected function splitLeadingWhitespace(string $line): array + protected function appendLine(Paragraph $paragraph, InlineParser $inlineParser, string $line, int $lineNo): void { - $column = 0; - $offset = 0; $length = strlen($line); + $offset = 0; + $column = 0; + $text = ''; + $seenContent = false; + while ($offset < $length) { - if ($line[$offset] === ' ') { + $char = $line[$offset]; + if ($char !== ' ' && $char !== "\t") { + $text .= $char; + $seenContent = true; $column++; - } elseif ($line[$offset] === "\t") { - $column += 4 - ($column % 4); - } else { - break; + $offset++; + + continue; + } + + $width = 0; + while ($offset < $length && ($line[$offset] === ' ' || $line[$offset] === "\t")) { + if ($line[$offset] === "\t") { + $width += 4 - (($column + $width) % 4); + } else { + $width++; + } + $offset++; } - $offset++; + $column += $width; + + // Leading indent is always significant; an inner or trailing gap only + // from two columns up. A lone inner space stays a normal, wrappable space. 
+ if (!$seenContent || $width >= 2) { + if ($text !== '') { + $inlineParser->parse($paragraph, $text, $lineNo); + $text = ''; + } + $paragraph->appendChild(new Text(str_repeat("\u{E000}", $width))); + + continue; + } + + $text .= ' '; } - return [$column, substr($line, $offset)]; + if ($text !== '') { + $inlineParser->parse($paragraph, $text, $lineNo); + } } } diff --git a/tests/TestCase/Extension/LineBlockDivExtensionTest.php b/tests/TestCase/Extension/LineBlockDivExtensionTest.php index dcb04d66..f6c12d3c 100644 --- a/tests/TestCase/Extension/LineBlockDivExtensionTest.php +++ b/tests/TestCase/Extension/LineBlockDivExtensionTest.php @@ -221,6 +221,71 @@ public function testWorksInsideBlockquotedList(): void $this->assertStringContainsString("alpha
\n  beta", $html); } + public function testMedialGapIsPreservedAsNonBreakingSpaces(): void + { + // A line block keeps medial alignment, not only the leading indent: the + // caesura of Old English verse is a run of spaces in the middle of a line. + $djot = "::: |\nHwaet! We Gardena in geardagum\n:::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('Hwaet! We Gardena' . str_repeat(' ', 4) . 'in geardagum', $html); + } + + public function testSingleInnerSpaceStaysOrdinary(): void + { + // Ordinary word spacing must stay a real, collapsible space so the line + // can still wrap; only runs of two or more columns are treated as a gap. + $djot = "::: |\nThe limerick packs laughs\n:::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('The limerick packs laughs', $html); + $this->assertStringNotContainsString(' ', $html); + } + + public function testMedialGapNonHtmlUsesRegularSpaces(): void + { + // Plain text keeps the gap visible as ordinary spaces, with no placeholder + // leaking through. + $document = $this->converter()->parse("::: |\nleft right\n:::"); + $text = (new PlainTextRenderer())->render($document); + + $this->assertStringContainsString('left right', $text); + $this->assertStringNotContainsString(self::NBSP_PLACEHOLDER, $text); + } + + public function testMedialGapMarkdownUsesNonBreakingSpaces(): void + { + // Markdown round-trips the gap as real U+00A0 so re-rendering keeps it. + $document = $this->converter()->parse("::: |\nleft right\n:::"); + $markdown = (new MarkdownRenderer())->render($document); + + $this->assertStringContainsString('left' . self::NBSP . self::NBSP . 'right', $markdown); + $this->assertStringNotContainsString(self::NBSP_PLACEHOLDER, $markdown); + } + + public function testMedialTabExpandsToColumnStop(): void + { + // A tab used as a medial gap expands to the next four-column stop. 
+ $djot = "::: |\nab\tcd\n:::"; + + $html = $this->converter()->convert($djot); + + // "ab" sits at columns 0-1; the tab fills columns 2-3 -> two nbsp. + $this->assertStringContainsString('ab' . str_repeat(' ', 2) . 'cd', $html); + } + + public function testMedialGapKeepsInlineMarkup(): void + { + // Inline markup on either side of a gap still parses. + $djot = "::: |\n_em_ [link](https://example.com)\n:::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('em' . str_repeat(' ', 4) . 'link', $html); + } + public function testPlainDivWithRealClassIsUntouched(): void { $djot = "::: warning\nHello\n:::";