Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
fc9b52d
Fix decoded HTML attribute prefix matching
sirreal Jun 10, 2026
2a69f27
Add WP_HTML_Decoder fuzzer
sirreal Jun 10, 2026
0c628b9
Bound HTML decoder fuzzer retention
sirreal Jun 10, 2026
934482b
Run decoder fuzzer cases in both contexts
sirreal Jun 11, 2026
56de5c8
Add byte-space decoder fuzzer lane
sirreal Jun 11, 2026
eb22b41
Add reference-at-EOF decoder fuzzing
sirreal Jun 11, 2026
3ba9e2d
Add attribute prefix monotonicity checks
sirreal Jun 11, 2026
2a99ed2
Exercise multi-code-point attribute prefixes
sirreal Jun 11, 2026
14f85a7
Generate numeric references from code point ranges
sirreal Jun 11, 2026
bafb908
Add deterministic name sweep mode
sirreal Jun 11, 2026
2bccc8f
Generate edit-distance name lookalikes
sirreal Jun 11, 2026
e49d526
Sweep legacy reference follower bytes
sirreal Jun 11, 2026
0dff210
Add prefix-family sweep mode
sirreal Jun 11, 2026
a836508
Add numeric boundary sweep mode
sirreal Jun 11, 2026
446b54d
Compose decoder fuzz strategies
sirreal Jun 11, 2026
bd70f06
Add corpus mutation fuzz mode
sirreal Jun 11, 2026
ff102ac
Assert reader reference compositionality
sirreal Jun 11, 2026
55ae440
Add case-mangled entity fuzzing
sirreal Jun 11, 2026
f3d4004
Preserve null reader match length
sirreal Jun 11, 2026
3d5b327
Probe non-amp reader offsets
sirreal Jun 11, 2026
e7bcffc
Assert attribute no-amp identity
sirreal Jun 11, 2026
bc0064a
Add whitespace to generator alphabet
sirreal Jun 11, 2026
10c2503
Assert gapless reader reconstruction
sirreal Jun 11, 2026
a559e8a
Assert invalid numeric replacements
sirreal Jun 11, 2026
8659842
Assert C1 numeric remapping
sirreal Jun 11, 2026
9dabbdb
Add secondary text entity oracle
sirreal Jun 11, 2026
a9d190d
Add token-map structure sweep mode
sirreal Jun 11, 2026
6ed4685
Add coverage-guided fuzz lane
sirreal Jun 11, 2026
249d9d6
Assert single-level entity decoding
sirreal Jun 11, 2026
a312e3e
Stabilize decoder fuzzer name ordering
sirreal Jun 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
279 changes: 279 additions & 0 deletions progress-handoff-91xXCG.md

Large diffs are not rendered by default.

26 changes: 17 additions & 9 deletions src/wp-includes/html-api/class-wp-html-decoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -60,17 +60,23 @@ public static function attribute_starts_with( $haystack, $search_text, $case_sen
continue;
}

// If there is a character reference, then the decoded value must exactly match what follows in the search string.
if ( 0 !== substr_compare( $search_text, $next_chunk, $search_at, strlen( $next_chunk ), $loose_case ) ) {
/*
* If there is a character reference, then the decoded value must
* match what follows in the search string. The search string may
* end within a multi-code-point replacement, such as `&nvlt;`
* decoding to `<⃒`, and still be a prefix match.
*/
$match_length = min( strlen( $next_chunk ), $search_length - $search_at );
if ( 0 !== substr_compare( $search_text, $next_chunk, $search_at, $match_length, $loose_case ) ) {
return false;
}

// The character reference matched, so continue checking.
$haystack_at += $token_length;
$search_at += strlen( $next_chunk );
$search_at += $match_length;
}

return true;
return $search_at === $search_length;
}

/**
Expand Down Expand Up @@ -361,7 +367,7 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat

$name_length = 0;
$replacement = $html5_named_character_references->read_token( $text, $name_at, $name_length );
if ( false === $replacement ) {
if ( null === $replacement ) {
return null;
}

Expand All @@ -378,12 +384,14 @@ public static function read_character_reference( $context, $text, $at = 0, &$mat
* character reference table but the match doesn't end in `;`.
* It may be allowed if it's followed by something unambiguous.
*/
$follower_byte = $after_name < $length ? ord( $text[ $after_name ] ) : null;
$ambiguous_follower = (
$after_name < $length &&
$name_at < $length &&
null !== $follower_byte &&
(
ctype_alnum( $text[ $after_name ] ) ||
'=' === $text[ $after_name ]
( $follower_byte >= 0x30 && $follower_byte <= 0x39 ) ||
( $follower_byte >= 0x41 && $follower_byte <= 0x5A ) ||
( $follower_byte >= 0x61 && $follower_byte <= 0x7A ) ||
0x3D === $follower_byte
)
);

Expand Down
125 changes: 125 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlDecoder.php
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,126 @@
$this->assertSame( "&\x00b", $decoded, 'Should have decoded the text without changing it.' );
}

/**
* Ensures semicolonless legacy references decode before non-ASCII UTF-8 bytes in attributes.
*/
public function test_semicolonless_legacy_reference_before_multibyte_attribute_follower() {
	// `&Aacute` with no semicolon, immediately followed by the two-byte UTF-8 sequence C2 80.
	$input = "&Aacute\xC2\x80";

	// Whole-attribute decoding: the reference is replaced and the follower bytes are kept.
	$decoded_attribute = WP_HTML_Decoder::decode_attribute( $input );
	$this->assertSame(
		"\xC3\x81\xC2\x80",
		$decoded_attribute,
		'Should have decoded the semicolonless legacy reference before a multibyte follower.'
	);

	// Single-reference read at offset 0: only the reference itself should be consumed.
	$consumed_bytes = null;
	$replacement    = WP_HTML_Decoder::read_character_reference( 'attribute', $input, 0, $consumed_bytes );
	$this->assertSame(
		"\xC3\x81",
		$replacement,
		'Should have matched the semicolonless legacy reference before a multibyte follower.'
	);
	$this->assertSame( strlen( '&Aacute' ), $consumed_bytes );
}

/**
* Ensures semicolonless legacy references remain ambiguous before ASCII alnum or equals.
*
* @dataProvider data_ambiguous_ascii_attribute_followers
*
* @param string $raw_attribute Raw attribute value with an ambiguous legacy reference follower.
*/
public function test_semicolonless_legacy_reference_before_ascii_attribute_follower_is_ambiguous( $raw_attribute ) {
	// An ambiguous reference must pass through attribute decoding untouched.
	$decoded_attribute = WP_HTML_Decoder::decode_attribute( $raw_attribute );
	$this->assertSame(
		$raw_attribute,
		$decoded_attribute,
		'Should not have decoded an ambiguous semicolonless legacy reference.'
	);

	// Seed the by-reference argument with a marker so that any write to it is detectable.
	$reported_length = 'sentinel';
	$replacement     = WP_HTML_Decoder::read_character_reference( 'attribute', $raw_attribute, 0, $reported_length );

	$this->assertNull(
		$replacement,
		'Should not have matched an ambiguous semicolonless legacy reference.'
	);
	$this->assertSame( 'sentinel', $reported_length );
}

/**
* Data provider.
*
* @return array[].
*/
public static function data_ambiguous_ascii_attribute_followers() {
	// Each dataset is `&Aacute` (no semicolon) followed by one ASCII alphanumeric or `=` byte.
	// Double arrows are aligned to the longest key, as the coding-standards check requires.
	return array(
		'ASCII digit'           => array( '&Aacute0' ),
		'ASCII uppercase alpha' => array( '&AacuteA' ),
		'ASCII lowercase alpha' => array( '&Aacutea' ),
		'equals'                => array( '&Aacute=' ),
	);
}

/**
* Ensures unmatched named character references leave the by-ref match length unchanged.
*
* @dataProvider data_unmatched_named_character_references
*
* @param string $context Decoder context.
* @param string $raw_text_node Raw text containing an unmatched named character reference.
*/
public function test_unmatched_named_character_reference_does_not_set_match_byte_length( $context, $raw_text_node ) {
	// Seed the by-reference argument with a marker so that any write to it is detectable.
	$reported_length = 'sentinel';
	$replacement     = WP_HTML_Decoder::read_character_reference( $context, $raw_text_node, 0, $reported_length );

	$this->assertNull(
		$replacement,
		'Should not have matched an unmatched named character reference.'
	);
	$this->assertSame( 'sentinel', $reported_length );
}

/**
* Data provider.
*
* @return array[].
*/
public static function data_unmatched_named_character_references() {
	// Each dataset is ( decoder context, raw text starting with a name that is not a character reference ).
	// Double arrows are aligned to the longest key, as the coding-standards check requires.
	return array(
		'text invalid name'                      => array( 'data', '&bogus;' ),
		'text invalid short-name candidate'      => array( 'data', '&Fv=q' ),
		'attribute invalid name'                 => array( 'attribute', '&bogus;' ),
		'attribute invalid short-name candidate' => array( 'attribute', '&Fv=q' ),
	);
}

/**
* Ensures non-ampersand offsets never match character references.
*
* @dataProvider data_non_ampersand_character_reference_offsets
*
* @param string $context Decoder context.
* @param string $raw_text_node Raw text containing a character reference away from offset.
* @param int $offset Offset that does not point at an ampersand.
*/
public function test_non_ampersand_offset_does_not_set_match_byte_length( $context, $raw_text_node, $offset ) {
	// Seed the by-reference argument with a marker so that any write to it is detectable.
	$reported_length = 'sentinel';
	$replacement     = WP_HTML_Decoder::read_character_reference( $context, $raw_text_node, $offset, $reported_length );

	$this->assertNull(
		$replacement,
		'Should not have matched a character reference away from an ampersand.'
	);
	$this->assertSame( 'sentinel', $reported_length );
}

/**
* Data provider.
*
* @return array[].
*/
public static function data_non_ampersand_character_reference_offsets() {
	// Each dataset is ( decoder context, raw text, byte offset that does not point at the `&` ).
	// Double arrows are aligned to the longest key, as the coding-standards check requires.
	return array(
		'text before reference'           => array( 'data', 'a&amp;b', 0 ),
		'text inside reference name'      => array( 'data', 'a&amp;b', 2 ),
		'attribute before reference'      => array( 'attribute', 'a&amp;b', 0 ),
		'attribute inside reference name' => array( 'attribute', 'a&amp;b', 2 ),
	);
}

/**
* Ensures proper detection of attribute prefixes ignoring ASCII case.
*
Expand Down Expand Up @@ -161,6 +281,11 @@
array( 'http://wordpress.org', 'Http', 'ascii-case-insensitive', true ),
array( 'http://wordpress.org', 'https', 'case-sensitive', false ),
array( 'http://wordpress.org', 'https', 'ascii-case-insensitive', false ),
array( '', 'http', 'case-sensitive', false ),
array( 'jav', 'javascript:', 'case-sensitive', false ),
array( 'jav', 'javascript:', 'ascii-case-insensitive', false ),
array( '&nvlt;script', '<', 'case-sensitive', true ),
array( '&nvgt;script', '>', 'case-sensitive', true ),
);
}
}
Loading
Loading