diff --git a/docs/extensions/index.md b/docs/extensions/index.md index 5c76e0c3..40214460 100644 --- a/docs/extensions/index.md +++ b/docs/extensions/index.md @@ -16,6 +16,7 @@ Extensions provide a clean way to bundle related customizations together. Each e | [HeadingReferenceExtension](#headingreferenceextension) | Resolves `[[Heading Text]]` links to headings in the current document | | [HeadingPermalinksExtension](#headingpermalinksextension) | Adds clickable anchor links to headings | | [InlineFootnotesExtension](#inlinefootnotesextension) | Converts `[content]{.fn}` spans to inline footnotes | +| [LineBlockDivExtension](#lineblockdivextension) | Adds a fenced `::: |` line block (verse/addresses) without prefixing every line | | [MentionsExtension](#mentionsextension) | Converts `@username` patterns to profile links | | [MermaidExtension](#mermaidextension) | Transforms mermaid code blocks into diagrams | | [SemanticSpanExtension](#semanticspanextension) | Converts span attributes to semantic HTML elements (``, ``, ``) | @@ -604,6 +605,47 @@ fallback such as parenthetical inline content. This follows the approach discussed in [djot issue #286](https://github.com/jgm/djot/issues/286). The `^[...]` syntax used by Pandoc conflicts with djot's superscript syntax (`^text^`), so the span-with-class approach provides inline footnotes without parser changes. +## LineBlockDivExtension + +Adds a fenced line block written as a `:::` div whose only class token is a pipe: `::: |`. It produces the same `line-block` div as the [`|`-prefixed form](/guide/syntax#line-blocks), but without prefixing every line - convenient for verse, addresses, lyrics, and signature blocks where each line would otherwise need a leading `|`. + +Inside the fence, each soft line break becomes a hard break (`
`), leading whitespace is preserved, and a blank line separates stanzas (each becomes its own paragraph). Inline djot (emphasis, links, ...) still parses normally. + +```php +use Djot\Extension\LineBlockDivExtension; + +$converter->addExtension(new LineBlockDivExtension()); +``` + +Leading whitespace on each line is preserved as a non-breaking space, so the indentation survives without any CSS: ` ` in HTML, a real non-breaking space (`U+00A0`) in Markdown - which keeps it through a round-trip re-render and never trips Markdown's indented-code-block rule - and an ordinary space in the plain-text and ANSI renderers. Tabs expand to four-column stops. + +**Input:** + +```djot +::: | +The limerick packs laughs anatomical + Into space that is quite economical. + +But the good ones I've seen + So seldom are clean +::: +``` + +```html +
<div class="line-block">
+<p>The limerick packs laughs anatomical<br>
+&nbsp;&nbsp;Into space that is quite economical.</p>
+<p>But the good ones I've seen<br>
+&nbsp;&nbsp;So seldom are clean</p>
+</div>
+``` + +The pipe is consumed as the marker, so the output is a `line-block` div, never a literal `class="|"`. Because `|` is not a meaningful class, intercepting it cannot collide with real usage - which is why this needs no core parser change. It composes with nesting: a `::: |` block works inside blockquotes and list items. + +### Why This Syntax? + +This follows the approach discussed in [djot issue #29](https://github.com/jgm/djot/issues/29). A leading `|` on every line (Pandoc-style line blocks) can be confused with pipe tables and is awkward to edit; an English keyword div class (`::: verse`) was undesirable. A language-neutral `|` marker on the div opener sidesteps both concerns. + ## MentionsExtension Converts `@username` patterns into user profile links. diff --git a/docs/guide/syntax.md b/docs/guide/syntax.md index 4409a499..5f40b222 100644 --- a/docs/guide/syntax.md +++ b/docs/guide/syntax.md @@ -1276,6 +1276,10 @@ And so are you.

+::: tip Fenced alternative +Prefer not to prefix every line? The [`LineBlockDivExtension`](/extensions/#lineblockdivextension) adds a fenced form, `::: |`, that produces the same `line-block` div without the per-line `|`. Leading whitespace is preserved and a blank line separates stanzas. +::: + ### Block Attributes For an overview of where attributes attach for each block construct diff --git a/src/Extension/LineBlockDivExtension.php b/src/Extension/LineBlockDivExtension.php new file mode 100644 index 00000000..ac3157cb --- /dev/null +++ b/src/Extension/LineBlockDivExtension.php @@ -0,0 +1,261 @@ +`) and + * leading whitespace is preserved, so verse, addresses, lyrics, and signature + * blocks keep their shape. A blank line separates stanzas (each becomes its own + * paragraph). Inline djot (emphasis, links, ...) still parses normally. + * + * Syntax: + * ``` + * ::: | + * The limerick packs laughs anatomical + * Into space that is quite economical. + * + * But the good ones I've seen + * So seldom are clean + * ::: + * ``` + * + * The pipe is consumed as the marker, so the output is a `line-block` div, not + * a literal `class="|"`. This is why no core change is needed: a `|` is not a + * meaningful class, so intercepting it cannot collide with real usage. + * + * Rationale (djot#29): sidesteps both objections to a `|`-prefix line block - + * no per-line pipe to confuse with pipe tables, and a language-neutral symbol + * rather than an English keyword div class. + * + * Example usage: + * ```php + * $converter = new DjotConverter(); + * $converter->addExtension(new LineBlockDivExtension()); + * $html = $converter->convert($djot); + * ``` + */ +class LineBlockDivExtension implements ExtensionInterface +{ + /** + * Opener: 3+ colons, then only a pipe (optional surrounding spaces/tabs). 
+     *
+     * @var string
+     */
+    protected const OPENER = '/^(:{3,})[ \t]*\|[ \t]*$/';
+
+    /**
+     * Register the `::: |` opener pattern with the converter's parser; matching
+     * lines are handed to {@see parseLineBlockDiv()}.
+     */
+    public function register(DjotConverter $converter): void
+    {
+        $parser = $converter->getParser();
+        $parser->addBlockPattern(self::OPENER, $this->parseLineBlockDiv(...));
+    }
+
+    /**
+     * Block handler for the `::: |` opener: turns the fenced lines into a
+     * LineBlock (one paragraph per stanza) and appends it to the parent node.
+     *
+     * @param array<string> $lines Lines handed over by the block parser.
+     * @param int $start Index of the opener line within $lines.
+     * @param \Djot\Node\Node $parent Node that receives the line block.
+     * @param \Djot\Parser\BlockParser $blockParser Supplies the inline parser and pending attributes.
+     *
+     * @return int|null Number of lines consumed (opener through closer), or null to decline the match.
+     */
+    protected function parseLineBlockDiv(array $lines, int $start, Node $parent, BlockParser $blockParser): ?int
+    {
+        $matched = preg_match(self::OPENER, $lines[$start], $openerParts);
+        if ($matched !== 1) {
+            return null; // @codeCoverageIgnore - pattern already matched
+        }
+
+        // Capture group 1 is the colon run; its length is the minimum closer length.
+        $body = $this->collectInnerLines($lines, $start, strlen($openerParts[1]), $consumed);
+        if ($body === null) {
+            // Unclosed fence: leave it for the core parser to report.
+            return null;
+        }
+
+        $block = new LineBlock();
+        $firstInnerLine = $start + 1;
+        foreach ($this->splitStanzas($body) as $stanzaOffset => $stanzaLines) {
+            $paragraph = $this->buildStanza($blockParser, $stanzaLines, $firstInnerLine + $stanzaOffset);
+            $block->appendChild($paragraph);
+        }
+
+        // Attributes written before the opener (e.g. `{#poem .verse}`) attach to the block.
+        $pending = $blockParser->consumePendingAttributes();
+        if ($pending !== []) {
+            $block->setAttributes($pending);
+        }
+        $parent->appendChild($block);
+
+        return $consumed;
+    }
+
+    /**
+     * Collect the lines between the opener and its matching closing fence.
+     *
+     * Uses the core {@see FencedBlockParser} detectors so code-fence and
+     * div-closer recognition stay identical to the built-in div parser: a `:::`
+     * inside a fenced code block is not treated as the closer, and an indented or
+     * info-string code fence is recognized the same way. A nested div uses a
+     * longer fence (djot semantics), so the closer is the first bare `:::` run of
+     * at least the opener length. Returns null when no closer is found, so the
+     * caller can decline the match.
+     *
+     * @param array<string> $lines Lines handed over by the block parser.
+     * @param int $start Index of the opener line within $lines.
+     * @param int $fenceLength Minimum fence length passed to the div-closer detector.
+     * @param int|null $consumed Set to the number of lines consumed (opener..closer);
+     *   reset to null when no closer is found.
+     *
+     * @return array<string>|null Lines between opener and closer, or null when the fence is never closed.
+     */
+    protected function collectInnerLines(array $lines, int $start, int $fenceLength, ?int &$consumed): ?array
+    {
+        // Reset the out-parameter first so a stale caller value cannot survive
+        // the "no closer found" path.
+        $consumed = null;
+
+        $fences = new FencedBlockParser();
+        $inner = [];
+        // Opener record (keys 'char' and 'length') of the code fence currently
+        // open, or null while outside any code fence.
+        $openFence = null;
+        $total = count($lines);
+
+        for ($i = $start + 1; $i < $total; $i++) {
+            $line = $lines[$i];
+
+            if ($openFence !== null) {
+                // Inside a code fence every line is content, including a `:::` run.
+                if ($fences->isCodeFenceCloser($line, $openFence['char'], $openFence['length'])) {
+                    $openFence = null;
+                }
+                $inner[] = $line;
+
+                continue;
+            }
+
+            $openFence = $fences->parseCodeFenceOpener($line);
+            if ($openFence !== null) {
+                // The opener line itself belongs to the content.
+                $inner[] = $line;
+
+                continue;
+            }
+
+            if ($fences->isDivFenceCloser($line, $fenceLength)) {
+                // Opener and closer both count as consumed.
+                $consumed = $i + 1 - $start;
+
+                return $inner;
+            }
+
+            $inner[] = $line;
+        }
+
+        return null;
+    }
+
+    /**
+     * Split inner lines into stanzas on blank lines.
+     *
+     * @param array<string> $innerLines
+     *
+     * @return array<int, array<string>> Stanza index (line offset of its
+     *   first line) => its lines, leading whitespace preserved.
+     */
+    protected function splitStanzas(array $innerLines): array
+    {
+        $stanzas = [];
+        // Key of the stanza being collected; null while between stanzas.
+        $openKey = null;
+
+        foreach ($innerLines as $position => $text) {
+            if (trim($text) === '') {
+                // A blank (whitespace-only) line ends the current stanza.
+                $openKey = null;
+
+                continue;
+            }
+            if ($openKey === null) {
+                $openKey = $position;
+                $stanzas[$openKey] = [];
+            }
+            $stanzas[$openKey][] = $text;
+        }
+
+        return $stanzas;
+    }
+
+    /**
+     * Build one stanza paragraph: each line is inline-parsed and joined by a
+     * hard break, so single newlines render as `<br>
`.
+     *
+     * @param \Djot\Parser\BlockParser $blockParser Source of the inline parser.
+     * @param array<string> $stanza Lines of one stanza, leading whitespace intact.
+     * @param int $baseLine Line number passed to the inline parser for the first line; incremented per line.
+     */
+    protected function buildStanza(BlockParser $blockParser, array $stanza, int $baseLine): Paragraph
+    {
+        $inlineParser = $blockParser->getInlineParser();
+        $paragraph = new Paragraph();
+
+        $position = 0;
+        foreach ($stanza as $line) {
+            if ($position > 0) {
+                // Separate this line from the previous one with a hard break.
+                $paragraph->appendChild(new HardBreak());
+            }
+
+            // Leading whitespace is emitted as the internal non-breaking-space
+            // placeholder (U+E000, the same sentinel the parser uses for an
+            // escaped space). The HTML renderer turns it into a &nbsp; entity so
+            // the indent survives whitespace collapsing; Markdown keeps a real
+            // non-breaking space (U+00A0); plain text and ANSI use a normal space.
+            // A private-use character is used so it never collides with a literal
+            // U+00A0 in the author's own text. Tabs expand to four-column stops.
+            [$indent, $text] = $this->splitLeadingWhitespace($line);
+            if ($indent > 0) {
+                $paragraph->appendChild(new Text(str_repeat("\u{E000}", $indent)));
+            }
+
+            $inlineParser->parse($paragraph, $text, $baseLine + $position);
+            $position++;
+        }
+
+        return $paragraph;
+    }
+
+    /**
+     * Split a line into its leading-whitespace width (in columns, tabs expanded
+     * to four-column stops) and the remaining content.
+     *
+     * @return array{0: int, 1: string} Column width of the leading whitespace, then the line without it.
+     */
+    protected function splitLeadingWhitespace(string $line): array
+    {
+        // Byte length of the leading run of spaces and tabs.
+        $prefixLength = strspn($line, " \t");
+
+        $width = 0;
+        for ($i = 0; $i < $prefixLength; $i++) {
+            // A tab advances to the next multiple of four; a space advances one column.
+            $width += $line[$i] === "\t" ? 4 - ($width % 4) : 1;
+        }
+
+        return [$width, substr($line, $prefixLength)];
+    }
+}
diff --git a/src/Renderer/AnsiRenderer.php b/src/Renderer/AnsiRenderer.php index 0013a98f..4c51b80c 100644 --- a/src/Renderer/AnsiRenderer.php +++ b/src/Renderer/AnsiRenderer.php @@ -371,7 +371,12 @@ public function render(Document $document): string // Normalize multiple blank lines $output = preg_replace("/\n{3,}/", "\n\n", $output) ?? $output; - return trim($output) . "\n"; + $output = trim($output) . "\n"; + + // The internal non-breaking-space placeholder (U+E000) collapses to an + // ordinary space in terminal output. Done as the final step, after + // trimming, so placeholder-derived leading indentation survives. + return str_replace("\u{E000}", ' ', $output); } protected function renderNode(Node $node): string diff --git a/src/Renderer/HtmlRenderer.php b/src/Renderer/HtmlRenderer.php index 5524374e..81d6e5cf 100644 --- a/src/Renderer/HtmlRenderer.php +++ b/src/Renderer/HtmlRenderer.php @@ -809,6 +809,8 @@ protected function renderLineBlock(LineBlock $node): string $attrs = $this->getRenderableAttributes($node); $attrs = $this->mergeAttribute($attrs, 'class', 'line-block'); + // Leading-indent placeholders (U+E000) are turned into &nbsp; entities by + // the shared text escaper, so no extra handling is needed here. return 'renderAttributeArray($attrs) . ">\n" . $this->renderChildren($node) . 
"\n"; } diff --git a/src/Renderer/MarkdownRenderer.php b/src/Renderer/MarkdownRenderer.php index 9640dc01..8c7cd940 100644 --- a/src/Renderer/MarkdownRenderer.php +++ b/src/Renderer/MarkdownRenderer.php @@ -97,7 +97,16 @@ public function render(Document $document): string // Normalize multiple blank lines $markdown = preg_replace("/\n{3,}/", "\n\n", $markdown) ?? $markdown; - return trim($markdown) . "\n"; + $markdown = trim($markdown) . "\n"; + + // The internal non-breaking-space placeholder (U+E000) becomes a literal + // non-breaking space (U+00A0). Markdown is a re-parseable round-trip + // format, so unlike the display renderers it keeps the real nbsp: that + // survives a re-render as ` ` and is never mistaken for a Markdown + // indented-code-block prefix the way ordinary leading spaces would be. + // Done as the final step, after trimming, so placeholder-derived leading + // indentation (e.g. in a line block) survives. + return str_replace("\u{E000}", "\u{00A0}", $markdown); } protected function renderNode(Node $node): string diff --git a/src/Renderer/PlainTextRenderer.php b/src/Renderer/PlainTextRenderer.php index f891baa6..450cfb74 100644 --- a/src/Renderer/PlainTextRenderer.php +++ b/src/Renderer/PlainTextRenderer.php @@ -92,7 +92,12 @@ public function render(Document $document): string // Normalize multiple blank lines to single $text = preg_replace("/\n{3,}/", "\n\n", $text) ?? $text; - return trim($text) . "\n"; + $text = trim($text) . "\n"; + + // The internal non-breaking-space placeholder (U+E000) collapses to an + // ordinary space in plain text. Done as the final step, after trimming, so + // placeholder-derived leading indentation (e.g. in a line block) survives. 
+ return str_replace("\u{E000}", ' ', $text); } protected function renderNode(Node $node): string diff --git a/tests/TestCase/Extension/LineBlockDivExtensionTest.php b/tests/TestCase/Extension/LineBlockDivExtensionTest.php new file mode 100644 index 00000000..dcb04d66 --- /dev/null +++ b/tests/TestCase/Extension/LineBlockDivExtensionTest.php @@ -0,0 +1,233 @@ +addExtension(new LineBlockDivExtension()); + + return $converter; + } + + public function testPipeMarkerBecomesLineBlockDiv(): void + { + $djot = "::: |\nLine one\nLine two\n:::"; + + $html = $this->converter()->convert($djot); + + // The pipe marker is consumed: a `line-block` div, never `class="|"`. + $this->assertStringContainsString('
', $html); + $this->assertStringNotContainsString('class="|"', $html); + } + + public function testSoftBreaksBecomeHardBreaks(): void + { + $djot = "::: |\nLine one\nLine two\n:::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString("Line one
\nLine two", $html); + } + + public function testLeadingWhitespaceIsPreservedAsNonBreakingSpaces(): void + { + $djot = "::: |\nFlush left\n Indented two\n:::"; + + $html = $this->converter()->convert($djot); + + // Leading spaces become non-breaking spaces so the indent survives the + // browser's whitespace collapsing. + $this->assertStringContainsString("Flush left
\n  Indented two", $html); + } + + public function testNonHtmlOutputUsesRegularSpaces(): void + { + // The nbsp placeholder is an HTML-only concern; plain text gets ordinary + // spaces, so the indentation is still visible without an invisible + // placeholder leaking into the output. + $document = $this->converter()->parse("::: |\nLine one\n Indented two\n:::"); + $text = (new PlainTextRenderer())->render($document); + + $this->assertStringContainsString("Line one\n Indented two", $text); + $this->assertStringNotContainsString(self::NBSP_PLACEHOLDER, $text); + } + + public function testMarkdownPreservesIndentedFirstLineAsNonBreakingSpaces(): void + { + // Markdown is a re-parseable round-trip format, so the indent is kept as + // real non-breaking spaces (U+00A0): they survive trimming, survive a + // re-render as  , and are never mistaken for an indented code block. + $document = $this->converter()->parse("::: |\n first\n second\n:::"); + $markdown = (new MarkdownRenderer())->render($document); + $firstLine = explode("\n", $markdown)[0]; + + $this->assertSame(self::NBSP . self::NBSP . 'first', rtrim($firstLine)); + $this->assertStringNotContainsString(self::NBSP_PLACEHOLDER, $markdown); + } + + public function testLiteralNonBreakingSpaceInContentIsPreserved(): void + { + // A real U+00A0 the author typed in the verse must survive in plain text + // (the placeholder uses a private-use char, so it is not clobbered). + $document = $this->converter()->parse("::: |\nice" . self::NBSP . "cream\n:::"); + $text = (new PlainTextRenderer())->render($document); + + $this->assertStringContainsString('ice' . self::NBSP . 'cream', $text); + } + + public function testTabIndentExpandsToFourColumns(): void + { + $djot = "::: |\nflush\n\ttabbed\n:::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString("flush
\n" . str_repeat(' ', 4) . 'tabbed', $html); + } + + public function testBlankLineSeparatesStanzas(): void + { + $djot = "::: |\nStanza one a\nStanza one b\n\nStanza two a\nStanza two b\n:::"; + + $html = $this->converter()->convert($djot); + + // Two paragraphs inside a single line-block div. + $this->assertSame(2, substr_count($html, '

')); + $this->assertStringContainsString("Stanza one a
\nStanza one b", $html); + $this->assertStringContainsString("Stanza two a
\nStanza two b", $html); + } + + public function testInlineMarkupStillParses(): void + { + $djot = "::: |\nA _em_ and a [link](https://example.com)\nplain\n:::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('em', $html); + $this->assertStringContainsString('link', $html); + } + + public function testPendingAttributesAttachToTheDiv(): void + { + $djot = "{#poem .verse}\n::: |\nLine one\nLine two\n:::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('id="poem"', $html); + $this->assertStringContainsString('verse', $html); + $this->assertStringContainsString('line-block', $html); + } + + public function testFencedCodeInsideIsNotTreatedAsClosingFence(): void + { + $djot = "::: |\nbefore\n```\n:::\nstill code\n```\nafter\n:::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('

', $html); + $this->assertStringContainsString('after', $html); + // The ::: inside the code fence did not close the line block. + $this->assertStringContainsString('still code', $html); + } + + public function testInfoStringCodeFenceInsideIsNotAClosingFence(): void + { + // An info-string code fence (``` djot) opens a code block; the `:::` + // inside it must not close the line block (matches the core div parser). + $djot = "::: |\nbefore\n``` djot\n:::\n```\nafter\n:::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('
', $html); + $this->assertStringContainsString('after', $html); + } + + public function testLongerOpenerFenceRequiresAtLeastAsLongCloser(): void + { + $djot = ":::: |\nLine one\n:::\nstill inside\n::::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('
', $html); + // The shorter ::: does not close a :::: opener. + $this->assertStringContainsString('still inside', $html); + } + + public function testUnclosedFenceFallsThroughToCore(): void + { + $djot = "::: |\nLine one\nLine two"; + + $html = $this->converter()->convert($djot); + + // No closer: not a line block. Core handles it as an ordinary div, so the + // extension must not have produced a line-block div. + $this->assertStringNotContainsString('class="line-block"', $html); + } + + public function testWorksInsideBlockquote(): void + { + $djot = "> ::: |\n> Roses are red\n> Violets are blue\n> :::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('
', $html); + $this->assertStringContainsString('
', $html); + // Indentation relative to the line block survives the blockquote dedent. + $this->assertStringContainsString("Roses are red
\n  Violets are blue", $html); + } + + public function testWorksInsideListItem(): void + { + $djot = "- item\n\n ::: |\n Line one\n Indented two\n :::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('
  • ', $html); + $this->assertStringContainsString('
    ', $html); + $this->assertStringContainsString("Line one
    \n  Indented two", $html); + } + + public function testWorksInsideBlockquotedList(): void + { + $djot = "> - x\n>\n> ::: |\n> alpha\n> beta\n> :::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('
    ', $html); + $this->assertStringContainsString('
  • ', $html); + $this->assertStringContainsString('
    ', $html); + $this->assertStringContainsString("alpha
    \n  beta", $html); + } + + public function testPlainDivWithRealClassIsUntouched(): void + { + $djot = "::: warning\nHello\n:::"; + + $html = $this->converter()->convert($djot); + + $this->assertStringContainsString('
    ', $html); + $this->assertStringNotContainsString('line-block', $html); + } +}