From 8e8413f601f8bbd2f940c7478543b92bf9657bed Mon Sep 17 00:00:00 2001 From: Mark Scherer Date: Tue, 16 Jun 2026 13:51:05 +0200 Subject: [PATCH] Tables: tolerate trailing whitespace on rows; reject empty delimiter cells Two table delimiter-row divergences (carve-js and carve-rs agreed; the PHP impl was the outlier), ported from carve-php commit 28d4c10: 1. Trailing whitespace after a row's closing pipe broke recognition. A line like `| a |` followed by spaces, or a separator `|---|` followed by spaces, was not treated as a table row, so a table with a trailing-whitespace separator split into separate blocks and the separator rendered as a paragraph. Trailing spaces/tabs after the closing pipe are now stripped before the structural checks in isTableRow, isSeparatorRow, and the cell parsers, and the row-matching pattern in the BlockParser row loop now permits them. 2. A delimiter row with an empty cell (`|---||`) was accepted as a header separator. isSeparatorRow used a single character class that included `|`, so the pattern matched across cell boundaries and never validated individual cells. It now splits the row into cells and requires each to be a delimiter cell (optional whitespace, optional leading `:`, one or more `-`, optional trailing `:`, optional whitespace); an empty cell or any other content disqualifies the row, which then stays an ordinary data row. Behavior now matches the JS and Rust implementations on these delimiter-row edge cases. 
--- src/Parser/Block/TableParser.php | 34 +++++++++++++- src/Parser/BlockParser.php | 4 +- tests/TestCase/TableDelimiterRowTest.php | 56 ++++++++++++++++++++++++ 3 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 tests/TestCase/TableDelimiterRowTest.php diff --git a/src/Parser/Block/TableParser.php b/src/Parser/Block/TableParser.php index e7b37692..f3a56f73 100644 --- a/src/Parser/Block/TableParser.php +++ b/src/Parser/Block/TableParser.php @@ -34,6 +34,10 @@ public function isTableRow(string $line): bool return false; } + // Trailing whitespace after the closing pipe is insignificant (parity + // with carve-js / carve-rs); strip it before the structural checks. + $line = rtrim($line, " \t"); + // Strip row attributes if present (|...|{.class}) $lineWithoutRowAttrs = $this->stripRowAttributes($line); @@ -88,7 +92,29 @@ public function extractRowAttributes(string $line): array */ public function isSeparatorRow(string $line): bool { - return preg_match('/^\|[\s:|-]+\|$/', $line) === 1 && str_contains($line, '-'); + // Trailing whitespace after the closing pipe is insignificant. + $line = rtrim($line, " \t"); + + $len = strlen($line); + if ($len < 2 || $line[0] !== '|' || $line[$len - 1] !== '|') { + return false; + } + + // Every cell must be a delimiter cell: optional whitespace, an optional + // leading ':', one or more '-', an optional trailing ':', optional + // whitespace. An EMPTY cell (`|---||`) or any other content disqualifies + // the row -- it is then an ordinary data row (matches carve-js/carve-rs). + $cells = $this->parseTableCells($line); + if ($cells === []) { + return false; + } + foreach ($cells as $cell) { + if (preg_match('/^\s*:?-+:?\s*$/', $cell) !== 1) { + return false; + } + } + + return true; } /** @@ -152,6 +178,9 @@ public function parseTableCells(string $line): array // Strip row attributes first $line = $this->stripRowAttributes($line); + // Trailing whitespace after the closing pipe is insignificant. 
+ $line = rtrim($line, " \t"); + // Remove leading and trailing | $line = substr($line, 1, -1); @@ -357,6 +386,9 @@ public function parseTableCellsRaw(string $line): array // Strip row attributes first $line = $this->stripRowAttributes($line); + // Trailing whitespace after the closing pipe is insignificant. + $line = rtrim($line, " \t"); + // Must start with | to be a potential table row if (!str_starts_with($line, '|')) { return []; diff --git a/src/Parser/BlockParser.php b/src/Parser/BlockParser.php index 05538b50..95773c74 100644 --- a/src/Parser/BlockParser.php +++ b/src/Parser/BlockParser.php @@ -2750,7 +2750,9 @@ protected function tryParseTable(Node $parent, array $lines, int $start): ?int // Strip row attributes for validation (|...|{.class} → |...|) $lineWithoutRowAttrs = $this->tableParser->stripRowAttributes($currentLine); - if (!preg_match('/^\|.*\|$/', $lineWithoutRowAttrs)) { + // Trailing whitespace after the closing pipe is insignificant + // (parity with carve-js / carve-rs). + if (!preg_match('/^\|.*\|[ \t]*$/', $lineWithoutRowAttrs)) { break; } diff --git a/tests/TestCase/TableDelimiterRowTest.php b/tests/TestCase/TableDelimiterRowTest.php new file mode 100644 index 00000000..3e446b5e --- /dev/null +++ b/tests/TestCase/TableDelimiterRowTest.php @@ -0,0 +1,56 @@ +converter = new DjotConverter(); + } + + public function testSeparatorRowWithTrailingWhitespaceStillPromotesHeader(): void + { + // Trailing whitespace after the closing pipe is insignificant; the + // separator must still promote the first row to a header. + $result = $this->converter->convert("| H | G |\n|---| \n| a | b |"); + + $this->assertStringContainsString('', $result); + $this->assertStringContainsString('', $result); + $this->assertStringContainsString('', $result); + $this->assertStringNotContainsString('

', $result); + } + + public function testDataRowWithTrailingWhitespaceStillParsesAsTable(): void + { + $result = $this->converter->convert('| a | '); + + $this->assertStringContainsString('

', $result); + $this->assertStringNotContainsString('

', $result); + } + + public function testSeparatorRowWithEmptyCellIsNotASeparator(): void + { + // `|---||` has an empty second cell, so it is NOT a delimiter row: the + // first row must not be promoted to a header. + $result = $this->converter->convert("| H | G |\n|---||\n| a | b |"); + + $this->assertStringNotContainsString('

', $result); + } +}
HGa', $result); + $this->assertStringContainsString('H