From e0076bd1d024acb47643ec3afe76eeb0ba1dafc1 Mon Sep 17 00:00:00 2001
From: Mark Scherer
Date: Fri, 12 Jun 2026 17:44:50 +0200
Subject: [PATCH] Preserve medial gaps in `::: |` line blocks
LineBlockDivExtension preserved only leading indentation; medial and
trailing runs of two or more spaces were collapsed by normal inline
parsing. That dropped the alignment gaps verse relies on - the caesura of
Old English verse, aligned columns in an address - exactly the
inconsistency raised on djot#29.
Now preserve every leading run plus any inner or trailing run of two or
more columns as non-breaking spaces; a single inner space stays an
ordinary, collapsible space so long lines can still wrap. Tabs expand to
four-column stops. Each preserved column renders per output format:
`&nbsp;` (HTML), U+00A0 (Markdown), ordinary space (plain text / ANSI).
---
docs/extensions/index.md | 22 ++++-
docs/guide/syntax.md | 2 +-
src/Extension/LineBlockDivExtension.php | 98 +++++++++++++------
.../Extension/LineBlockDivExtensionTest.php | 65 ++++++++++++
4 files changed, 156 insertions(+), 31 deletions(-)
diff --git a/docs/extensions/index.md b/docs/extensions/index.md
index 40214460..9df909ef 100644
--- a/docs/extensions/index.md
+++ b/docs/extensions/index.md
@@ -609,7 +609,7 @@ This follows the approach discussed in [djot issue #286](https://github.com/jgm/
Adds a fenced line block written as a `:::` div whose only class token is a pipe: `::: |`. It produces the same `line-block` div as the [`|`-prefixed form](/guide/syntax#line-blocks), but without prefixing every line - convenient for verse, addresses, lyrics, and signature blocks where each line would otherwise need a leading `|`.
-Inside the fence, each soft line break becomes a hard break (`<br>`), leading whitespace is preserved, and a blank line separates stanzas (each becomes its own paragraph). Inline djot (emphasis, links, ...) still parses normally.
+Inside the fence, each soft line break becomes a hard break (`<br>`), significant whitespace is preserved, and a blank line separates stanzas (each becomes its own paragraph). Inline djot (emphasis, links, ...) still parses normally.
```php
use Djot\Extension\LineBlockDivExtension;
@@ -617,7 +617,25 @@ use Djot\Extension\LineBlockDivExtension;
$converter->addExtension(new LineBlockDivExtension());
```
-Leading whitespace on each line is preserved as a non-breaking space, so the indentation survives without any CSS: `&nbsp;` in HTML, a real non-breaking space (`U+00A0`) in Markdown - which keeps it through a round-trip re-render and never trips Markdown's indented-code-block rule - and an ordinary space in the plain-text and ANSI renderers. Tabs expand to four-column stops.
+Significant whitespace on each line is preserved as a non-breaking space, so the shape survives without any CSS: `&nbsp;` in HTML, a real non-breaking space (`U+00A0`) in Markdown - which keeps it through a round-trip re-render and never trips Markdown's indented-code-block rule - and an ordinary space in the plain-text and ANSI renderers. Tabs expand to four-column stops.
+
+Significant means: all **leading** indentation, plus any **medial or trailing** run of two or more columns - the gaps used for inline alignment, such as the caesura in Old English verse or the columns of an address. A lone space between words stays an ordinary, collapsible space, so a long line can still wrap. This follows Pandoc's line-block rule that every space the author writes is meaningful, and addresses the [medial-gap point raised on djot#29](https://github.com/jgm/djot/issues/29).
+
+**Input:**
+
+```djot
+::: |
+Hwaet! We Gardena    in geardagum
+theodcyninga    thrym gefrunon
+:::
+```
+
+```html
+<div class="line-block">
+<p>Hwaet! We Gardena&nbsp;&nbsp;&nbsp;&nbsp;in geardagum<br>
+theodcyninga&nbsp;&nbsp;&nbsp;&nbsp;thrym gefrunon</p>
+</div>
+```
**Input:**
diff --git a/docs/guide/syntax.md b/docs/guide/syntax.md
index 5f40b222..3bdf60f3 100644
--- a/docs/guide/syntax.md
+++ b/docs/guide/syntax.md
@@ -1277,7 +1277,7 @@ And so are you.
::: tip Fenced alternative
-Prefer not to prefix every line? The [`LineBlockDivExtension`](/extensions/#lineblockdivextension) adds a fenced form, `::: |`, that produces the same `line-block` div without the per-line `|`. Leading whitespace is preserved and a blank line separates stanzas.
+Prefer not to prefix every line? The [`LineBlockDivExtension`](/extensions/#lineblockdivextension) adds a fenced form, `::: |`, that produces the same `line-block` div without the per-line `|`. Leading indentation and medial alignment gaps are preserved, and a blank line separates stanzas.
:::
### Block Attributes
diff --git a/src/Extension/LineBlockDivExtension.php b/src/Extension/LineBlockDivExtension.php
index ac3157cb..0ed0cdac 100644
--- a/src/Extension/LineBlockDivExtension.php
+++ b/src/Extension/LineBlockDivExtension.php
@@ -12,6 +12,7 @@
use Djot\Node\Node;
use Djot\Parser\Block\FencedBlockParser;
use Djot\Parser\BlockParser;
+use Djot\Parser\InlineParser;
/**
* Adds a fenced line-block div: `::: |`.
@@ -198,8 +199,8 @@ protected function splitStanzas(array $innerLines): array
}
/**
- * Build one stanza paragraph: each line is inline-parsed and joined by a
- * hard break, so single newlines render as `<br>`.
+ * Build one stanza paragraph: each line keeps its significant whitespace and
+ * is joined to the next by a hard break, so single newlines render as `<br>`.
*
* @param \Djot\Parser\BlockParser $blockParser
* @param array $stanza
@@ -212,19 +213,7 @@ protected function buildStanza(BlockParser $blockParser, array $stanza, int $bas
$last = count($stanza) - 1;
$index = 0;
foreach ($stanza as $line) {
- // Emit leading whitespace via the internal non-breaking-space
- // placeholder (U+E000, the same sentinel the parser uses for an
- // escaped space). The HTML renderer turns it into a &nbsp; entity so
- // the indent survives whitespace collapsing; Markdown keeps a real
- // non-breaking space (U+00A0); plain text and ANSI use a normal space.
- // A private-use character is used so it never collides with a literal
- // U+00A0 in the author's own text. Tabs expand to four-column stops.
- [$columns, $content] = $this->splitLeadingWhitespace($line);
- if ($columns > 0) {
- $paragraph->appendChild(new Text(str_repeat("\u{E000}", $columns)));
- }
-
- $inlineParser->parse($paragraph, $content, $baseLine + $index);
+ $this->appendLine($paragraph, $inlineParser, $line, $baseLine + $index);
if ($index < $last) {
$paragraph->appendChild(new HardBreak());
}
@@ -235,27 +224,80 @@ protected function buildStanza(BlockParser $blockParser, array $stanza, int $bas
}
/**
- * Split a line into its leading-whitespace width (in columns, tabs expanded
- * to four-column stops) and the remaining content.
+ * Append one verse line, preserving its significant whitespace.
+ *
+ * In a line block every space the author typed is meaningful (Pandoc's
+ * definition), so a verse keeps not only its leading indent but also the
+ * medial gaps used for alignment - the caesura of Old English verse, columns
+ * in an address, chords aligned above lyrics. The rule:
*
- * @return array{0: int, 1: string}
+ * - **Leading** whitespace (indentation): always preserved, even one space.
+ * - **Inner / trailing** runs of **two or more** columns (medial gaps,
+ * inline alignment): preserved.
+ * - A **single** inner space stays an ordinary, collapsible space, so a long
+ * line can still wrap between words.
+ *
+ * Each preserved column is emitted via the internal non-breaking-space
+ * placeholder (U+E000, the same sentinel the parser uses for an escaped
+ * space): the HTML renderer turns it into `&nbsp;`, Markdown keeps a real
+ * U+00A0 (so the gap survives a round-trip and is never read as indented
+ * code), and plain text / ANSI use an ordinary space. A private-use character
+ * is used so it never collides with a literal U+00A0 in the author's text.
+ * Tabs expand to four-column stops. Text segments between gaps are
+ * inline-parsed, so emphasis, links, and the rest still work.
+ *
+ * @param \Djot\Node\Block\Paragraph $paragraph
+ * @param \Djot\Parser\InlineParser $inlineParser
+ * @param string $line
+ * @param int $lineNo
*/
- protected function splitLeadingWhitespace(string $line): array
+ protected function appendLine(Paragraph $paragraph, InlineParser $inlineParser, string $line, int $lineNo): void
{
- $column = 0;
- $offset = 0;
$length = strlen($line);
+ $offset = 0;
+ $column = 0;
+ $text = '';
+ $seenContent = false;
+
while ($offset < $length) {
- if ($line[$offset] === ' ') {
+ $char = $line[$offset];
+ if ($char !== ' ' && $char !== "\t") {
+ $text .= $char;
+ $seenContent = true;
- $column++;
+ $column += (ord($char) & 0xC0) === 0x80 ? 0 : 1; // UTF-8 continuation bytes do not start a new column
- } elseif ($line[$offset] === "\t") {
- $column += 4 - ($column % 4);
- } else {
- break;
+ $offset++;
+
+ continue;
+ }
+
+ $width = 0;
+ while ($offset < $length && ($line[$offset] === ' ' || $line[$offset] === "\t")) {
+ if ($line[$offset] === "\t") {
+ $width += 4 - (($column + $width) % 4);
+ } else {
+ $width++;
+ }
+ $offset++;
}
- $offset++;
+ $column += $width;
+
+ // Leading indent is always significant; an inner or trailing gap only
+ // from two columns up. A lone inner space stays a normal, wrappable space.
+ if (!$seenContent || $width >= 2) {
+ if ($text !== '') {
+ $inlineParser->parse($paragraph, $text, $lineNo);
+ $text = '';
+ }
+ $paragraph->appendChild(new Text(str_repeat("\u{E000}", $width)));
+
+ continue;
+ }
+
+ $text .= ' ';
}
- return [$column, substr($line, $offset)];
+ if ($text !== '') {
+ $inlineParser->parse($paragraph, $text, $lineNo);
+ }
}
}
diff --git a/tests/TestCase/Extension/LineBlockDivExtensionTest.php b/tests/TestCase/Extension/LineBlockDivExtensionTest.php
index dcb04d66..f6c12d3c 100644
--- a/tests/TestCase/Extension/LineBlockDivExtensionTest.php
+++ b/tests/TestCase/Extension/LineBlockDivExtensionTest.php
@@ -221,6 +221,71 @@ public function testWorksInsideBlockquotedList(): void
$this->assertStringContainsString("alpha
\n beta", $html);
}
+ public function testMedialGapIsPreservedAsNonBreakingSpaces(): void
+ {
+ // A line block keeps medial alignment, not only the leading indent: the
+ // caesura of Old English verse is a run of spaces (four here) mid-line.
+ $djot = "::: |\nHwaet! We Gardena    in geardagum\n:::";
+
+ $html = $this->converter()->convert($djot);
+
+ $this->assertStringContainsString('Hwaet! We Gardena' . str_repeat('&nbsp;', 4) . 'in geardagum', $html);
+ }
+
+ public function testSingleInnerSpaceStaysOrdinary(): void
+ {
+ // Ordinary word spacing must stay a real, collapsible space so the line
+ // can still wrap; only runs of two or more columns are treated as a gap.
+ $djot = "::: |\nThe limerick packs laughs\n:::";
+
+ $html = $this->converter()->convert($djot);
+
+ $this->assertStringContainsString('The limerick packs laughs', $html);
+ $this->assertStringNotContainsString('&nbsp;', $html);
+ }
+
+ public function testMedialGapNonHtmlUsesRegularSpaces(): void
+ {
+ // Plain text keeps the two-column gap visible as ordinary spaces, with no
+ // placeholder leaking through.
+ $document = $this->converter()->parse("::: |\nleft  right\n:::");
+ $text = (new PlainTextRenderer())->render($document);
+
+ $this->assertStringContainsString('left  right', $text);
+ $this->assertStringNotContainsString(self::NBSP_PLACEHOLDER, $text);
+ }
+
+ public function testMedialGapMarkdownUsesNonBreakingSpaces(): void
+ {
+ // Markdown round-trips the two-column gap as real U+00A0 so re-rendering keeps it.
+ $document = $this->converter()->parse("::: |\nleft  right\n:::");
+ $markdown = (new MarkdownRenderer())->render($document);
+
+ $this->assertStringContainsString('left' . self::NBSP . self::NBSP . 'right', $markdown);
+ $this->assertStringNotContainsString(self::NBSP_PLACEHOLDER, $markdown);
+ }
+
+ public function testMedialTabExpandsToColumnStop(): void
+ {
+ // A tab used as a medial gap expands to the next four-column stop.
+ $djot = "::: |\nab\tcd\n:::";
+
+ $html = $this->converter()->convert($djot);
+
+ // "ab" sits at columns 0-1; the tab fills columns 2-3 -> two nbsp.
+ $this->assertStringContainsString('ab' . str_repeat('&nbsp;', 2) . 'cd', $html);
+ }
+
+ public function testMedialGapKeepsInlineMarkup(): void
+ {
+ // Inline markup on either side of a four-column gap still parses.
+ $djot = "::: |\n_em_    [link](https://example.com)\n:::";
+
+ $html = $this->converter()->convert($djot);
+
+ $this->assertStringContainsString('<em>em</em>' . str_repeat('&nbsp;', 4) . '<a href="https://example.com">link</a>', $html);
+ }
+
public function testPlainDivWithRealClassIsUntouched(): void
{
$djot = "::: warning\nHello\n:::";