From 05cfed923c3d43c9b8eda70625f004d28440ac5f Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 23:19:37 +0200 Subject: [PATCH 1/6] Fix WP_Token_Map array export key length --- src/wp-includes/class-wp-token-map.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php index fc223b187f8c5..108f28475241a 100644 --- a/src/wp-includes/class-wp-token-map.php +++ b/src/wp-includes/class-wp-token-map.php @@ -662,7 +662,7 @@ public function to_array(): array { } foreach ( $this->large_words as $index => $group ) { - $prefix = substr( $this->groups, $index * ( $this->key_length + 1 ), 2 ); + $prefix = substr( $this->groups, $index * ( $this->key_length + 1 ), $this->key_length ); $group_length = strlen( $group ); $at = 0; while ( $at < $group_length ) { From d875beeb79a145a8ec52325a397e8ae0b6279ac6 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 23:30:41 +0200 Subject: [PATCH 2/6] Fix WP_Token_Map read_token bounds --- src/wp-includes/class-wp-token-map.php | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php index 108f28475241a..da038556146fe 100644 --- a/src/wp-includes/class-wp-token-map.php +++ b/src/wp-includes/class-wp-token-map.php @@ -536,14 +536,16 @@ public function read_token( string $text, int $offset = 0, &$matched_token_byte_ $text_length = strlen( $text ); // Search for a long word first, if the text is long enough, and if that fails, a short one. - if ( $text_length > $this->key_length ) { + if ( $text_length - $offset > $this->key_length ) { /* * Keys cannot contain null bytes, which is taken care of for the full words, * but here it’s required to reject group keys with null bytes so that the * lookup doesn’t get off track when scanning the group string. 
*/ if ( strcspn( $text, "\x00", $offset, $this->key_length ) < $this->key_length ) { - return null; + return strlen( $this->small_words ) > 0 + ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity ) + : null; } $group_key = substr( $text, $offset, $this->key_length ); @@ -596,6 +598,10 @@ private function read_small_token( string $text, int $offset = 0, &$matched_toke $ignore_case = 'ascii-case-insensitive' === $case_sensitivity; $small_length = strlen( $this->small_words ); $search_text = substr( $text, $offset, $this->key_length ); + if ( '' === $search_text ) { + return null; + } + if ( $ignore_case ) { $search_text = strtoupper( $search_text ); } @@ -617,6 +623,11 @@ private function read_small_token( string $text, int $offset = 0, &$matched_toke return $this->small_mappings[ $at / ( $this->key_length + 1 ) ]; } + if ( ! isset( $search_text[ $adjust ] ) ) { + $at += $this->key_length + 1; + continue 2; + } + if ( $search_text[ $adjust ] !== $this->small_words[ $at + $adjust ] && ( ! $ignore_case || strtoupper( $this->small_words[ $at + $adjust ] !== $search_text[ $adjust ] ) ) From 2110e539bdf64e2ce7b5d4121d13a1885f35e161 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 23:31:53 +0200 Subject: [PATCH 3/6] Fix WP_Token_Map ASCII matching --- src/wp-includes/class-wp-token-map.php | 74 +++++++++++++++++++++----- 1 file changed, 62 insertions(+), 12 deletions(-) diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php index da038556146fe..3409d5519b13c 100644 --- a/src/wp-includes/class-wp-token-map.php +++ b/src/wp-includes/class-wp-token-map.php @@ -451,13 +451,20 @@ public function contains( string $word, string $case_sensitivity = 'case-sensiti return false; } - $term = str_pad( $word, $this->key_length + 1, "\x00", STR_PAD_RIGHT ); - $word_at = $ignore_case ? 
stripos( $this->small_words, $term ) : strpos( $this->small_words, $term ); - if ( false === $word_at ) { - return false; + $term = str_pad( $word, $this->key_length + 1, "\x00", STR_PAD_RIGHT ); + if ( ! $ignore_case ) { + return false !== strpos( $this->small_words, $term ); + } + + $small_length = strlen( $this->small_words ); + $record_length = $this->key_length + 1; + for ( $at = 0; $at < $small_length; $at += $record_length ) { + if ( self::matches_at( $this->small_words, $term, $at, $record_length, $ignore_case ) ) { + return true; + } } - return true; + return false; } $group_key = substr( $word, 0, $this->key_length ); @@ -478,7 +485,7 @@ public function contains( string $word, string $case_sensitivity = 'case-sensiti $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; $mapping_at = $at; - if ( $token_length === $length && 0 === substr_compare( $group, $slug, $token_at, $token_length, $ignore_case ) ) { + if ( $token_length === $length && self::matches_at( $group, $slug, $token_at, $token_length, $ignore_case ) ) { return true; } @@ -567,7 +574,7 @@ public function read_token( string $text, int $offset = 0, &$matched_token_byte_ $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; $mapping_at = $at; - if ( 0 === substr_compare( $text, $token, $offset + $this->key_length, $token_length, $ignore_case ) ) { + if ( self::matches_at( $text, $token, $offset + $this->key_length, $token_length, $ignore_case ) ) { $matched_token_byte_length = $this->key_length + $token_length; return substr( $group, $mapping_at, $mapping_length ); } @@ -603,15 +610,18 @@ private function read_small_token( string $text, int $offset = 0, &$matched_toke } if ( $ignore_case ) { - $search_text = strtoupper( $search_text ); + $search_text = self::ascii_lowercase( $search_text ); } $starting_char = $search_text[0]; $at = 0; while ( $at < $small_length ) { + $stored_starting_char = $ignore_case + ? 
self::ascii_lowercase( $this->small_words[ $at ] ) + : $this->small_words[ $at ]; + if ( - $starting_char !== $this->small_words[ $at ] && - ( ! $ignore_case || strtoupper( $this->small_words[ $at ] ) !== $starting_char ) + $starting_char !== $stored_starting_char ) { $at += $this->key_length + 1; continue; @@ -628,9 +638,12 @@ private function read_small_token( string $text, int $offset = 0, &$matched_toke continue 2; } + $stored_char = $ignore_case + ? self::ascii_lowercase( $this->small_words[ $at + $adjust ] ) + : $this->small_words[ $at + $adjust ]; + if ( - $search_text[ $adjust ] !== $this->small_words[ $at + $adjust ] && - ( ! $ignore_case || strtoupper( $this->small_words[ $at + $adjust ] !== $search_text[ $adjust ] ) ) + $search_text[ $adjust ] !== $stored_char ) { $at += $this->key_length + 1; continue 2; @@ -840,4 +853,41 @@ private static function longest_first_then_alphabetical( string $a, string $b ): return strcmp( $a, $b ); } + + /** + * Checks whether a substring matches at a given offset. + * + * @since 6.6.0 + * + * @param string $haystack String to search within. + * @param string $needle String to match. + * @param int $offset Offset into the haystack. + * @param int $length Number of bytes to compare. + * @param bool $ignore_case Whether to fold ASCII case while matching. + * @return bool Whether the substring matched. + */ + private static function matches_at( string $haystack, string $needle, int $offset, int $length, bool $ignore_case ): bool { + $candidate = substr( $haystack, $offset, $length ); + if ( strlen( $candidate ) !== $length ) { + return false; + } + + if ( ! $ignore_case ) { + return $candidate === $needle; + } + + return self::ascii_lowercase( $candidate ) === self::ascii_lowercase( $needle ); + } + + /** + * Lowercases ASCII bytes only. + * + * @since 6.6.0 + * + * @param string $text Text to lowercase. + * @return string Text with only ASCII uppercase bytes folded to lowercase. 
+ */ + private static function ascii_lowercase( string $text ): string { + return strtr( $text, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz' ); + } } From fc0d5febcc328b93362ca597cf86863b376911b0 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 23:33:13 +0200 Subject: [PATCH 4/6] Handle WP_Token_Map folded group keys --- src/wp-includes/class-wp-token-map.php | 142 +++++++++++++++++++------ 1 file changed, 108 insertions(+), 34 deletions(-) diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php index 3409d5519b13c..6e38fd9c05774 100644 --- a/src/wp-includes/class-wp-token-map.php +++ b/src/wp-includes/class-wp-token-map.php @@ -467,29 +467,33 @@ public function contains( string $word, string $case_sensitivity = 'case-sensiti return false; } - $group_key = substr( $word, 0, $this->key_length ); - $group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key ); - if ( false === $group_at ) { + $group_key = substr( $word, 0, $this->key_length ); + $group_indexes = $this->find_group_indexes( $group_key, $ignore_case ); + if ( empty( $group_indexes ) ) { return false; } - $group = $this->large_words[ $group_at / ( $this->key_length + 1 ) ]; - $group_length = strlen( $group ); - $slug = substr( $word, $this->key_length ); - $length = strlen( $slug ); - $at = 0; - while ( $at < $group_length ) { - $token_length = unpack( 'C', $group[ $at++ ] )[1]; - $token_at = $at; - $at += $token_length; - $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; - $mapping_at = $at; + $slug = substr( $word, $this->key_length ); + $length = strlen( $slug ); - if ( $token_length === $length && self::matches_at( $group, $slug, $token_at, $token_length, $ignore_case ) ) { - return true; - } + foreach ( $group_indexes as $group_index ) { + $group = $this->large_words[ $group_index ]; + $group_length = strlen( $group ); + $at = 0; + + while ( $at < $group_length ) { + $token_length = unpack( 
'C', $group[ $at++ ] )[1]; + $token_at = $at; + $at += $token_length; + $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; + $mapping_at = $at; + + if ( $token_length === $length && self::matches_at( $group, $slug, $token_at, $token_length, $ignore_case ) ) { + return true; + } - $at = $mapping_at + $mapping_length; + $at = $mapping_at + $mapping_length; + } } return false; @@ -555,31 +559,67 @@ public function read_token( string $text, int $offset = 0, &$matched_token_byte_ : null; } - $group_key = substr( $text, $offset, $this->key_length ); - $group_at = $ignore_case ? stripos( $this->groups, $group_key ) : strpos( $this->groups, $group_key ); - if ( false === $group_at ) { + $group_key = substr( $text, $offset, $this->key_length ); + $group_indexes = $this->find_group_indexes( $group_key, $ignore_case ); + if ( empty( $group_indexes ) ) { // Perhaps a short word then. return strlen( $this->small_words ) > 0 ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity ) : null; } - $group = $this->large_words[ $group_at / ( $this->key_length + 1 ) ]; - $group_length = strlen( $group ); - $at = 0; - while ( $at < $group_length ) { - $token_length = unpack( 'C', $group[ $at++ ] )[1]; - $token = substr( $group, $at, $token_length ); - $at += $token_length; - $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; - $mapping_at = $at; + if ( ! 
$ignore_case ) { + $group = $this->large_words[ $group_indexes[0] ]; + $group_length = strlen( $group ); + $at = 0; + while ( $at < $group_length ) { + $token_length = unpack( 'C', $group[ $at++ ] )[1]; + $token = substr( $group, $at, $token_length ); + $at += $token_length; + $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; + $mapping_at = $at; + + if ( 0 === substr_compare( $text, $token, $offset + $this->key_length, $token_length ) ) { + $matched_token_byte_length = $this->key_length + $token_length; + return substr( $group, $mapping_at, $mapping_length ); + } + + $at = $mapping_at + $mapping_length; + } - if ( self::matches_at( $text, $token, $offset + $this->key_length, $token_length, $ignore_case ) ) { - $matched_token_byte_length = $this->key_length + $token_length; - return substr( $group, $mapping_at, $mapping_length ); + return strlen( $this->small_words ) > 0 + ? $this->read_small_token( $text, $offset, $matched_token_byte_length, $case_sensitivity ) + : null; + } + + $best_match_length = null; + $best_mapping = null; + foreach ( $group_indexes as $group_index ) { + $group = $this->large_words[ $group_index ]; + $group_length = strlen( $group ); + $at = 0; + while ( $at < $group_length ) { + $token_length = unpack( 'C', $group[ $at++ ] )[1]; + $token = substr( $group, $at, $token_length ); + $at += $token_length; + $mapping_length = unpack( 'C', $group[ $at++ ] )[1]; + $mapping_at = $at; + + if ( self::matches_at( $text, $token, $offset + $this->key_length, $token_length, $ignore_case ) ) { + $match_length = $this->key_length + $token_length; + if ( null === $best_match_length || $match_length > $best_match_length ) { + $best_match_length = $match_length; + $best_mapping = substr( $group, $mapping_at, $mapping_length ); + } + } + + $at = $mapping_at + $mapping_length; } + } - $at = $mapping_at + $mapping_length; + if ( null !== $best_match_length ) { + $matched_token_byte_length = $best_match_length; + return $best_mapping; } } @@ -854,6 +894,40 @@ 
private static function longest_first_then_alphabetical( string $a, string $b ): return strcmp( $a, $b ); } + /** + * Finds group indexes that match a lookup key. + * + * @since 6.6.0 + * + * @param string $group_key Group key to find. + * @param bool $ignore_case Whether to fold ASCII case while searching. + * @return int[] Matching group indexes. + */ + private function find_group_indexes( string $group_key, bool $ignore_case ): array { + if ( ! $ignore_case ) { + $group_at = strpos( $this->groups, $group_key ); + + return false === $group_at + ? array() + : array( $group_at / ( $this->key_length + 1 ) ); + } + + $group_indexes = array(); + $record_length = $this->key_length + 1; + $groups_length = strlen( $this->groups ); + $group_index = 0; + + for ( $at = 0; $at < $groups_length; $at += $record_length ) { + if ( self::matches_at( $this->groups, $group_key, $at, $this->key_length, $ignore_case ) ) { + $group_indexes[] = $group_index; + } + + ++$group_index; + } + + return $group_indexes; + } + /** * Checks whether a substring matches at a given offset. 
* From 505e46b56074c56754ff908718dfa8655b7a7e97 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 23:33:30 +0200 Subject: [PATCH 5/6] Escape WP_Token_Map precomputed source --- src/wp-includes/class-wp-token-map.php | 96 ++++++++++++++++++-------- 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/src/wp-includes/class-wp-token-map.php b/src/wp-includes/class-wp-token-map.php index 6e38fd9c05774..1818c3d700b11 100644 --- a/src/wp-includes/class-wp-token-map.php +++ b/src/wp-includes/class-wp-token-map.php @@ -785,7 +785,7 @@ public function precomputed_php_source_table( string $indent = "\t" ): string { $output .= "{$i2}\"storage_version\" => \"{$class_version}\",\n"; $output .= "{$i2}\"key_length\" => {$this->key_length},\n"; - $group_line = str_replace( "\x00", "\\x00", $this->groups ); + $group_line = self::escape_precomputed_php_string( $this->groups ); $output .= "{$i2}\"groups\" => \"{$group_line}\",\n"; $output .= "{$i2}\"large_words\" => array(\n"; @@ -798,7 +798,7 @@ public function precomputed_php_source_table( string $indent = "\t" ): string { $group = $this->large_words[ $index ]; $group_length = strlen( $group ); $comment_line = "{$i3}//"; - $data_line = "{$i3}\""; + $group_data = ''; $at = 0; while ( $at < $group_length ) { $token_length = unpack( 'C', $group[ $at++ ] )[1]; @@ -808,32 +808,11 @@ public function precomputed_php_source_table( string $indent = "\t" ): string { $mapping = substr( $group, $at, $mapping_length ); $at += $mapping_length; - $token_digits = str_pad( dechex( $token_length ), 2, '0', STR_PAD_LEFT ); - $mapping_digits = str_pad( dechex( $mapping_length ), 2, '0', STR_PAD_LEFT ); - - $mapping = preg_replace_callback( - "~[\\x00-\\x1f\\x22\\x5c]~", - static function ( $match_result ) { - switch ( $match_result[0] ) { - case '"': - return '\\"'; - - case '\\': - return '\\\\'; - - default: - $hex = dechex( ord( $match_result[0] ) ); - return "\\x{$hex}"; - } - }, - $mapping - ); - - $comment_line .= " 
{$prefix}{$token}[{$mapping}]"; - $data_line .= "\\x{$token_digits}{$token}\\x{$mapping_digits}{$mapping}"; + $group_data .= pack( 'C', $token_length ) . $token . pack( 'C', $mapping_length ) . $mapping; + $comment_line .= ' ' . self::escape_precomputed_php_comment( "{$prefix}{$token}" ) . '[' . self::escape_precomputed_php_comment( $mapping ) . ']'; } $comment_line .= ".\n"; - $data_line .= "\",\n"; + $data_line = "{$i3}\"" . self::escape_precomputed_php_string( $group_data ) . "\",\n"; $output .= $comment_line; $output .= $data_line; @@ -849,12 +828,12 @@ static function ( $match_result ) { $at += $this->key_length + 1; } - $small_text = str_replace( "\x00", '\x00', implode( '', $small_words ) ); + $small_text = self::escape_precomputed_php_string( implode( '', $small_words ) ); $output .= "{$i2}\"small_words\" => \"{$small_text}\",\n"; $output .= "{$i2}\"small_mappings\" => array(\n"; foreach ( $this->small_mappings as $mapping ) { - $output .= "{$i3}\"{$mapping}\",\n"; + $output .= "{$i3}\"" . self::escape_precomputed_php_string( $mapping ) . "\",\n"; } $output .= "{$i2})\n"; $output .= "{$i1})\n"; @@ -964,4 +943,65 @@ private static function matches_at( string $haystack, string $needle, int $offse private static function ascii_lowercase( string $text ): string { return strtr( $text, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz' ); } + + /** + * Escapes text for use inside a double-quoted PHP string literal. + * + * @since 6.6.0 + * + * @param string $text Text to escape. + * @return string Escaped string literal body. + */ + private static function escape_precomputed_php_string( string $text ): string { + $escaped = ''; + $length = strlen( $text ); + + for ( $i = 0; $i < $length; $i++ ) { + $byte = ord( $text[ $i ] ); + switch ( $text[ $i ] ) { + case '"': + $escaped .= '\\"'; + break; + + case '\\': + $escaped .= '\\\\'; + break; + + case '$': + $escaped .= '\\$'; + break; + + default: + $escaped .= ( $byte < 0x20 || $byte >= 0x7f ) + ? 
sprintf( '\\x%02x', $byte ) + : $text[ $i ]; + } + } + + return $escaped; + } + + /** + * Escapes text for use inside generated PHP comments. + * + * @since 6.6.0 + * + * @param string $text Text to escape. + * @return string Escaped comment text. + */ + private static function escape_precomputed_php_comment( string $text ): string { + $escaped = ''; + $length = strlen( $text ); + + for ( $i = 0; $i < $length; $i++ ) { + $byte = ord( $text[ $i ] ); + $char = $text[ $i ]; + + $escaped .= ( $byte < 0x20 || $byte >= 0x7f || '?' === $char || '\\' === $char ) + ? sprintf( '\\x%02x', $byte ) + : $char; + } + + return $escaped; + } } From 3572d0d09ef3aab7bf6517ccb12ba1dfe6ffa614 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 10 Jun 2026 23:33:47 +0200 Subject: [PATCH 6/6] Add WP_Token_Map property tests --- .../wp-token-map/wpTokenMapProperties.php | 876 ++++++++++++++++++ 1 file changed, 876 insertions(+) create mode 100644 tests/phpunit/tests/wp-token-map/wpTokenMapProperties.php diff --git a/tests/phpunit/tests/wp-token-map/wpTokenMapProperties.php b/tests/phpunit/tests/wp-token-map/wpTokenMapProperties.php new file mode 100644 index 0000000000000..a2db89db04fce --- /dev/null +++ b/tests/phpunit/tests/wp-token-map/wpTokenMapProperties.php @@ -0,0 +1,876 @@ +assertInstanceOf( WP_Token_Map::class, $map ); + + foreach ( self::contains_probes( $mappings, $seed ) as $probe ) { + foreach ( self::case_sensitivities() as $case_sensitivity ) { + $this->assert_contains_matches_reference( $map, $mappings, $probe, $key_length, $seed, $case_sensitivity, 'contains' ); + } + } + } + + /** + * Ensure generated read_token() probes agree with a naive reference lookup. + * + * @ticket 60698 + * + * @dataProvider data_generated_token_sets + * + * @param array $mappings Generated token mappings. + * @param int $key_length Group key length for the generated map. + * @param int $seed Seed used to generate the token set. 
+ */ + public function test_read_token_matches_reference_for_generated_documents( $mappings, $key_length, $seed ) { + $map = WP_Token_Map::from_array( $mappings, $key_length ); + $this->assertInstanceOf( WP_Token_Map::class, $map ); + + foreach ( self::generated_documents( $mappings, $seed ) as $document_index => $document ) { + $document_length = strlen( $document ); + + for ( $offset = 0; $offset <= $document_length; $offset++ ) { + foreach ( self::case_sensitivities() as $case_sensitivity ) { + $this->assert_read_token_matches_reference( + $map, + $mappings, + $document, + $offset, + $key_length, + $seed, + $case_sensitivity, + "read_token document {$document_index}" + ); + } + } + } + } + + /** + * Ensure generated nested-prefix families match greedily. + * + * @ticket 60698 + * + * @dataProvider data_key_lengths + * + * @param int $key_length Group key length for the generated map. + */ + public function test_generated_nested_prefix_families_match_longest_token( $key_length ) { + $mappings = array(); + $token = ''; + foreach ( array( 'a', 'b', 'c', 'D', ';', "\x80", 'e', 'f' ) as $chunk ) { + $token .= $chunk; + $mappings[ $token ] = 'value-' . strlen( $token ); + } + + $map = WP_Token_Map::from_array( $mappings, $key_length ); + $this->assertInstanceOf( WP_Token_Map::class, $map ); + + $document = "{$token} suffix"; + $length = null; + $this->assertSame( $mappings[ $token ], $map->read_token( $document, 0, $length ) ); + $this->assertSame( strlen( $token ), $length ); + } + + /** + * Ensure generated maps preserve behavior after to_array()/from_array(). + * + * @ticket 60698 + * + * @dataProvider data_generated_token_sets + * + * @param array $mappings Generated token mappings. + * @param int $key_length Group key length for the generated map. + * @param int $seed Seed used to generate the token set. 
+ */ + public function test_generated_maps_round_trip_through_array_export( $mappings, $key_length, $seed ) { + $map = WP_Token_Map::from_array( $mappings, $key_length ); + $this->assertInstanceOf( WP_Token_Map::class, $map ); + + $round_tripped = WP_Token_Map::from_array( $map->to_array(), $key_length ); + $this->assertInstanceOf( WP_Token_Map::class, $round_tripped ); + + $this->assert_map_behavior_matches_reference( $round_tripped, $mappings, $key_length, $seed, 'to_array round-trip' ); + } + + /** + * Ensure generated maps preserve behavior after precomputed table export. + * + * @ticket 60698 + * + * @dataProvider data_generated_token_sets + * + * @param array $mappings Generated token mappings. + * @param int $key_length Group key length for the generated map. + * @param int $seed Seed used to generate the token set. + */ + public function test_generated_maps_round_trip_through_precomputed_source_table( $mappings, $key_length, $seed ) { + $map = WP_Token_Map::from_array( $mappings, $key_length ); + $this->assertInstanceOf( WP_Token_Map::class, $map ); + + $source_table = $map->precomputed_php_source_table(); + // phpcs:ignore Squiz.PHP.Eval.Discouraged -- This verifies generated source round-trips. + $round_tripped = eval( "return {$source_table};" ); + $this->assertInstanceOf( WP_Token_Map::class, $round_tripped ); + + $this->assert_map_behavior_matches_reference( $round_tripped, $mappings, $key_length, $seed, 'precomputed table round-trip' ); + } + + /** + * Ensure ASCII-insensitive matching leaves non-ASCII bytes literal. 
+ * + * @ticket 60698 + */ + public function test_ascii_case_insensitive_matching_keeps_non_ascii_bytes_literal() { + $mappings = array( + "alpha\xE9" => 'latin-1-lower', + "bravo\xC3\xA9" => 'utf-8-lower', + "charlie\x80Z" => 'raw-byte', + ); + $map = WP_Token_Map::from_array( $mappings, 2 ); + + $this->assertTrue( $map->contains( "ALPHA\xE9", 'ascii-case-insensitive' ) ); + $this->assertFalse( $map->contains( "ALPHA\xC9", 'ascii-case-insensitive' ) ); + $this->assertTrue( $map->contains( "BRAVO\xC3\xA9", 'ascii-case-insensitive' ) ); + $this->assertFalse( $map->contains( "BRAVO\xC3\x89", 'ascii-case-insensitive' ) ); + + $length = null; + $this->assertSame( 'raw-byte', $map->read_token( "CHARLIE\x80z", 0, $length, 'ascii-case-insensitive' ) ); + $this->assertSame( strlen( "charlie\x80Z" ), $length ); + + $length = null; + $this->assertNull( $map->read_token( "CHARLIE\x81z", 0, $length, 'ascii-case-insensitive' ) ); + $this->assertNull( $length ); + } + + /** + * Ensure array export preserves one-byte group keys. + * + * This is the minimized regression for generated key_length=1 maps. + * + * @ticket 60698 + */ + public function test_array_export_preserves_single_byte_group_keys() { + $mappings = array( + 'a' => 'short', + 'ab' => 'long', + 'ac' => 'sibling', + ); + $map = WP_Token_Map::from_array( $mappings, 1 ); + + $expected = $mappings; + $actual = $map->to_array(); + ksort( $expected ); + ksort( $actual ); + + $this->assertSame( $expected, $actual ); + } + + /** + * Ensure ASCII-insensitive reads work for short tokens. + * + * This is the minimized regression for generated case-insensitive short + * token probes. 
+ * + * @ticket 60698 + */ + public function test_ascii_case_insensitive_reads_short_tokens() { + $map = WP_Token_Map::from_array( array( 'ab' => 'short-token' ), 2 ); + $length = null; + + $this->assertSame( 'short-token', $map->read_token( 'AB', 0, $length, 'ascii-case-insensitive' ) ); + $this->assertSame( 2, $length ); + } + + /** + * Ensure ASCII-insensitive reads check every folded-equivalent group key. + * + * @ticket 60698 + * + * @dataProvider data_ascii_case_insensitive_group_key_collisions + * + * @param array $mappings Token mappings with folded-equivalent group keys. + * @param int $key_length Group key length for the generated map. + * @param string $probe Probe text. + * @param string $expected Expected mapping. + */ + public function test_ascii_case_insensitive_reads_folded_group_key_collisions( $mappings, $key_length, $probe, $expected ) { + $map = WP_Token_Map::from_array( $mappings, $key_length ); + $length = null; + + $this->assertTrue( $map->contains( $probe, 'ascii-case-insensitive' ) ); + $this->assertSame( $expected, $map->read_token( $probe, 0, $length, 'ascii-case-insensitive' ) ); + $this->assertSame( strlen( $probe ), $length ); + } + + /** + * Ensure generated PHP source escapes tokens and mappings safely. + * + * @ticket 60698 + */ + public function test_precomputed_source_table_escapes_php_string_and_comment_bytes() { + $mappings = array( + 'quote"token' => 'quote"value', + 'slash\\token' => 'slash\\value', + 'dollar$token' => 'dollar$value', + "control\ntoken" => "control\nvalue", + 'close?>tag' => 'close?>value', + "high\x80\xFFtoken" => "high\x80\xFFvalue", + ); + $map = WP_Token_Map::from_array( $mappings, 2 ); + + $source_table = $map->precomputed_php_source_table(); + // phpcs:ignore Squiz.PHP.Eval.Discouraged -- This verifies generated source round-trips. 
+ $round_tripped = eval( "return {$source_table};" ); + + $this->assertInstanceOf( WP_Token_Map::class, $round_tripped ); + $this->assertSame( $map->to_array(), $round_tripped->to_array() ); + } + + /** + * Ensure short-token reads do not consume missing bytes. + * + * @ticket 60698 + */ + public function test_short_token_reads_ignore_text_shorter_than_token() { + $map = WP_Token_Map::from_array( array( 'ab' => 'short-token' ), 2 ); + $length = null; + + $this->assertNull( $map->read_token( 'a', 0, $length ) ); + $this->assertNull( $length ); + + $length = null; + $this->assertNull( $map->read_token( '', 0, $length ) ); + $this->assertNull( $length ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public static function data_generated_token_sets() { + $cases = array( + 'seed 539231511 key_length 1' => array( 539231511, 1, 70 ), + 'seed 539231512 key_length 2' => array( 539231512, 2, 90 ), + 'seed 867530901 key_length 1' => array( 867530901, 1, 60 ), + 'seed 867530902 key_length 2' => array( 867530902, 2, 80 ), + ); + + foreach ( $cases as $name => $case ) { + list( $seed, $key_length, $target_count ) = $case; + yield $name => array( self::generate_token_set( $seed, $key_length, $target_count ), $key_length, $seed ); + } + } + + /** + * Data provider. + * + * @return array[]. + */ + public static function data_key_lengths() { + return array( + 'key length 1' => array( 1 ), + 'key length 2' => array( 2 ), + ); + } + + /** + * Data provider. + * + * @return array[]. + */ + public static function data_ascii_case_insensitive_group_key_collisions() { + return array( + 'key length 1' => array( + array( + 'Ab' => 'upper-group', + 'aa' => 'lower-group', + ), + 1, + 'aa', + 'lower-group', + ), + 'key length 2' => array( + array( + 'Abc' => 'mixed-group-one', + 'aBd' => 'mixed-group-two', + ), + 2, + 'abd', + 'mixed-group-two', + ), + ); + } + + /** + * Assert that a token map behaves like the reference implementation. 
+ * + * @param WP_Token_Map $map Token map under test. + * @param array $mappings Generated token mappings. + * @param int $key_length Group key length for the generated map. + * @param int $seed Seed used to generate the token set. + * @param string $label Describes the map under test. + */ + private function assert_map_behavior_matches_reference( $map, $mappings, $key_length, $seed, $label ) { + foreach ( self::contains_probes( $mappings, $seed ) as $probe ) { + foreach ( self::case_sensitivities() as $case_sensitivity ) { + $this->assert_contains_matches_reference( $map, $mappings, $probe, $key_length, $seed, $case_sensitivity, "{$label} contains" ); + } + } + + foreach ( self::generated_documents( $mappings, $seed ) as $document_index => $document ) { + $document_length = strlen( $document ); + for ( $offset = 0; $offset <= $document_length; $offset++ ) { + foreach ( self::case_sensitivities() as $case_sensitivity ) { + $this->assert_read_token_matches_reference( + $map, + $mappings, + $document, + $offset, + $key_length, + $seed, + $case_sensitivity, + "{$label} read_token document {$document_index}" + ); + } + } + } + } + + /** + * Assert contains() behavior against the reference implementation. + * + * @param WP_Token_Map $map Token map under test. + * @param array $mappings Generated token mappings. + * @param string $probe Probe word. + * @param int $key_length Group key length for the generated map. + * @param int $seed Seed used to generate the token set. + * @param string $case_sensitivity Case sensitivity mode. + * @param string $operation Operation being tested. 
+ */ + private function assert_contains_matches_reference( $map, $mappings, $probe, $key_length, $seed, $case_sensitivity, $operation ) { + $expected = self::reference_contains( $mappings, $probe, $case_sensitivity ); + $actual = $map->contains( $probe, $case_sensitivity ); + + if ( $expected !== $actual ) { + $this->assertSame( + $expected, + $actual, + self::failure_context( $mappings, $key_length, $seed, $case_sensitivity, $operation, $probe ) + ); + } + } + + /** + * Assert read_token() behavior against the reference implementation. + * + * @param WP_Token_Map $map Token map under test. + * @param array $mappings Generated token mappings. + * @param string $document Document to probe. + * @param int $offset Offset at which to probe. + * @param int $key_length Group key length for the generated map. + * @param int $seed Seed used to generate the token set. + * @param string $case_sensitivity Case sensitivity mode. + * @param string $operation Operation being tested. + */ + private function assert_read_token_matches_reference( $map, $mappings, $document, $offset, $key_length, $seed, $case_sensitivity, $operation ) { + $expected = self::reference_read_token( $mappings, $document, $offset, $case_sensitivity ); + $actual_length = null; + $actual_response = $map->read_token( $document, $offset, $actual_length, $case_sensitivity ); + + if ( $expected['value'] !== $actual_response ) { + $this->assertSame( + $expected['value'], + $actual_response, + self::failure_context( $mappings, $key_length, $seed, $case_sensitivity, $operation, $document, $offset ) . '; response' + ); + } + + if ( $expected['length'] !== $actual_length ) { + $this->assertSame( + $expected['length'], + $actual_length, + self::failure_context( $mappings, $key_length, $seed, $case_sensitivity, $operation, $document, $offset ) . '; matched length' + ); + } + } + + /** + * Return case-sensitivity modes used by the public API. + * + * @return string[] Case-sensitivity modes. 
+	 */
+	private static function case_sensitivities() {
+		$modes = array( 'case-sensitive', 'ascii-case-insensitive' );
+
+		return $modes;
+	}
+
+	/**
+	 * Build a deterministic set of token mappings for one seed and key length.
+	 *
+	 * Tokens never contain NUL: add_token() rejects them, because the
+	 * implementation treats lookup words containing NUL as invalid. Probe
+	 * words and documents do include NUL so failed lookups still exercise
+	 * that byte.
+	 *
+	 * Tokens are added in a fixed order (hand-picked tokens, nested prefixes,
+	 * one shared-prefix group, then random fill), which determines the value
+	 * assigned to each token.
+	 *
+	 * @param int $seed         Seed used to generate the token set.
+	 * @param int $key_length   Group key length for the generated map.
+	 * @param int $target_count Number of generated tokens to target.
+	 * @return array Generated token mappings.
+	 */
+	private static function generate_token_set( $seed, $key_length, $target_count ) {
+		$state    = $seed;
+		$mappings = array();
+
+		// Hand-picked tokens: short words, a word exactly one key long, a 255-byte word, and words with non-ASCII bytes.
+		$fixed_tokens = array( 'a', 'B' );
+		if ( $key_length > 1 ) {
+			$fixed_tokens[] = 'c';
+		}
+		$fixed_tokens[] = str_repeat( 'k', $key_length );
+		$fixed_tokens[] = str_repeat( 'L', 255 );
+		$fixed_tokens[] = "hi\x80A;";
+		$fixed_tokens[] = "jo\xFFb;";
+		$fixed_tokens[] = "utf\xC3\xA9;";
+		$fixed_tokens[] = "euro\xE2\x82\xAC;";
+		if ( 1 === $key_length ) {
+			$fixed_tokens[] = 'Ab';
+			$fixed_tokens[] = 'aa';
+		} else {
+			$fixed_tokens[] = 'Abc';
+			$fixed_tokens[] = 'aBd';
+		}
+
+		// Every non-empty byte prefix of this string becomes a token, shortest first.
+		$nested_source = "preFix;\x80z";
+		$nested_length = strlen( $nested_source );
+		for ( $length = 1; $length <= $nested_length; ++$length ) {
+			$fixed_tokens[] = substr( $nested_source, 0, $length );
+		}
+
+		foreach ( $fixed_tokens as $fixed_token ) {
+			self::add_token( $mappings, $fixed_token, $seed );
+		}
+
+		// A run of tokens sharing one leading key, with random suffixes of 2 to 8 bytes.
+		$group_key = 1 === $key_length ? 'g' : 'gy';
+		for ( $i = 0; $i < 24; $i++ ) {
+			$suffix = self::random_token_suffix( $state, 2 + ( $i % 7 ) );
+			self::add_token( $mappings, $group_key . $suffix, $seed );
+		}
+
+		// Random fill, bounded so rejected candidates cannot keep the loop running forever.
+		$attempt_limit = $target_count * 40;
+		for ( $attempts = 0; count( $mappings ) < $target_count && $attempts < $attempt_limit; ++$attempts ) {
+			$candidate = self::random_token( $state, $key_length, $attempts );
+			self::add_token( $mappings, $candidate, $seed );
+		}
+
+		return $mappings;
+	}
+
+	/**
+	 * Add a token to the generated map unless it is invalid or ambiguous.
+	 *
+	 * A token is skipped when it is empty, contains a NUL byte, is at least
+	 * WP_Token_Map::MAX_LENGTH bytes long, or equals an existing token after
+	 * ASCII case folding. Accepted tokens map to "value-{seed}-{index}", where
+	 * index is the number of tokens already in the map.
+	 *
+	 * @param array  $mappings Generated token mappings.
+	 * @param string $token    Token to add.
+	 * @param int    $seed     Seed used to generate the token set.
+	 */
+	private static function add_token( &$mappings, $token, $seed ) {
+		if ( '' === $token ) {
+			return;
+		}
+
+		if ( false !== strpos( $token, "\x00" ) ) {
+			return;
+		}
+
+		if ( strlen( $token ) >= WP_Token_Map::MAX_LENGTH ) {
+			return;
+		}
+
+		$folded_token = self::ascii_lowercase( $token );
+		foreach ( array_keys( $mappings ) as $existing_token ) {
+			if ( self::ascii_lowercase( $existing_token ) === $folded_token ) {
+				return;
+			}
+		}
+
+		$next_index         = count( $mappings );
+		$mappings[ $token ] = "value-{$seed}-{$next_index}";
+	}
+
+	/**
+	 * Generate a token from the allowed byte classes.
+	 *
+	 * The length is drawn from four bands: shorter than the key, exactly the
+	 * key length, up to 24 bytes, or 48 to 96 bytes. The first byte is a
+	 * letter from 'm' onward chosen by the token index.
+	 *
+	 * @param int $state      Pseudo-random generator state.
+	 * @param int $key_length Group key length for the generated map.
+	 * @param int $index      Token index.
+	 * @return string Generated token.
+	 */
+	private static function random_token( &$state, $key_length, $index ) {
+		$choice = self::random_int( $state, 0, 9 );
+
+		if ( $choice >= 9 ) {
+			$target_length = self::random_int( $state, 48, 96 );
+		} elseif ( $choice >= 6 ) {
+			$target_length = self::random_int( $state, $key_length + 1, 24 );
+		} elseif ( $choice >= 3 || $key_length <= 1 ) {
+			// No token can be shorter than a one-byte key, so that band falls back to the key length.
+			$target_length = $key_length;
+		} else {
+			$target_length = self::random_int( $state, 1, $key_length - 1 );
+		}
+
+		$first_byte = chr( ord( 'm' ) + ( $index % 10 ) );
+		for ( $token = $first_byte; strlen( $token ) < $target_length; ) {
+			$token .= self::random_token_chunk( $state );
+		}
+
+		// Chunks can be multi-byte, so trim any overshoot.
+		return substr( $token, 0, $target_length );
+	}
+
+	/**
+	 * Generate a random suffix.
+	 *
+	 * @param int $state         Pseudo-random generator state.
+	 * @param int $target_length Target byte length.
+	 * @return string Generated suffix.
+	 */
+	private static function random_token_suffix( &$state, $target_length ) {
+		for ( $suffix = ''; strlen( $suffix ) < $target_length; ) {
+			$suffix .= self::random_token_chunk( $state );
+		}
+
+		// Chunks can be multi-byte, so trim any overshoot.
+		return substr( $suffix, 0, $target_length );
+	}
+
+	/**
+	 * Pick one chunk for building tokens.
+	 *
+	 * The pool holds ASCII letters, digits, a semicolon, two lone high bytes
+	 * and two multi-byte sequences. It has no NUL byte.
+	 *
+	 * @param int $state Pseudo-random generator state.
+	 * @return string Generated chunk.
+	 */
+	private static function random_token_chunk( &$state ) {
+		$chunks = array(
+			'a',
+			'b',
+			'C',
+			'D',
+			'0',
+			'9',
+			';',
+			"\x80",
+			"\xFF",
+			"\xC2\xA9",
+			"\xE2\x82\xAC",
+		);
+
+		$last_index = count( $chunks ) - 1;
+		$picked     = self::random_int( $state, 0, $last_index );
+
+		return $chunks[ $picked ];
+	}
+
+	/**
+	 * Build the list of words used to probe contains().
+	 *
+	 * For each token this adds the token itself, its case-swapped form, the
+	 * token with one extra byte, the token with one byte changed, and every
+	 * proper prefix. 400 random words follow. Duplicates are removed.
+	 *
+	 * @param array $mappings Generated token mappings.
+	 * @param int   $seed     Seed used to generate the token set.
+	 * @return string[] Probe words.
+	 */
+	private static function contains_probes( $mappings, $seed ) {
+		$state  = $seed ^ 0x5A5A5A5A;
+		$probes = array( '', "\x00", "a\x00", "z\x00z" );
+
+		foreach ( array_keys( $mappings ) as $token ) {
+			$token_length = strlen( $token );
+
+			// The extended probe draws from the generator before the mutated probe does.
+			$extended = $token . self::random_probe_byte( $state );
+			$mutated  = self::mutate_one_byte( $token, $state );
+
+			$probes[] = $token;
+			$probes[] = self::swap_ascii_case( $token );
+			$probes[] = $extended;
+			$probes[] = $mutated;
+
+			if ( $token_length > 1 ) {
+				$probes[] = substr( $token, 0, -1 );
+			}
+
+			for ( $length = 1; $length < $token_length; $length++ ) {
+				$probes[] = substr( $token, 0, $length );
+			}
+		}
+
+		for ( $i = 0; $i < 400; $i++ ) {
+			$word_length = self::random_int( $state, 0, 32 );
+			$probes[]    = self::random_probe_word( $state, $word_length );
+		}
+
+		$unique_probes = array_unique( $probes, SORT_STRING );
+
+		return array_values( $unique_probes );
+	}
+
+	/**
+	 * Generate documents for read_token() probes.
+	 *
+	 * @param array $mappings Generated token mappings.
+	 * @param int   $seed     Seed used to generate the token set.
+	 * @return string[] Generated documents.
+	 */
+	private static function generated_documents( $mappings, $seed ) {
+		$state  = $seed ^ 0x13572468;
+		$tokens = array_keys( $mappings );
+		usort( $tokens, array( __CLASS__, 'longest_first_then_alphabetical' ) );
+
+		$last_token_index = count( $tokens ) - 1;
+		$first_token      = $tokens[0];
+
+		// Fixed documents: empty, first sorted token mid-document, and two tokens separated by NUL.
+		$documents = array(
+			'',
+			'prefix' . $first_token . 'suffix',
+			self::swap_ascii_case( $first_token ) . "\x00" . $tokens[ $last_token_index ],
+		);
+
+		// Ten random documents, each made of 32 pieces.
+		for ( $i = 0; $i < 10; $i++ ) {
+			$document = '';
+
+			for ( $j = 0; $j < 32; $j++ ) {
+				// A token is always drawn first, even when the chosen piece kind ignores it.
+				$token = $tokens[ self::random_int( $state, 0, $last_token_index ) ];
+				$kind  = self::random_int( $state, 0, 5 );
+
+				if ( 0 === $kind ) {
+					$piece = $token;
+				} elseif ( 1 === $kind ) {
+					$piece = self::swap_ascii_case( $token );
+				} elseif ( 2 === $kind ) {
+					$prefix_length = self::random_int( $state, 0, strlen( $token ) );
+					$piece         = substr( $token, 0, $prefix_length );
+				} elseif ( 3 === $kind ) {
+					$piece = self::mutate_one_byte( $token, $state );
+				} elseif ( 4 === $kind ) {
+					$tail_length = self::random_int( $state, 1, 4 );
+					$piece       = $token . self::random_probe_word( $state, $tail_length );
+				} else {
+					$noise_length = self::random_int( $state, 1, 8 );
+					$piece        = self::random_probe_word( $state, $noise_length );
+				}
+
+				$document .= $piece;
+			}
+
+			$documents[] = $document;
+		}
+
+		return $documents;
+	}
+
+	/**
+	 * Reference membership check used to validate contains().
+	 *
+	 * In 'case-sensitive' mode this is an exact key lookup. In any other mode
+	 * the word and each token are compared after ASCII case folding.
+	 *
+	 * @param array  $mappings         Generated token mappings.
+	 * @param string $word             Probe word.
+	 * @param string $case_sensitivity Case sensitivity mode.
+	 * @return bool Whether the generated set contains the probe word.
+	 */
+	private static function reference_contains( $mappings, $word, $case_sensitivity ) {
+		if ( 'case-sensitive' === $case_sensitivity ) {
+			return array_key_exists( $word, $mappings );
+		}
+
+		$folded_word = self::ascii_lowercase( $word );
+		$found       = false;
+
+		foreach ( array_keys( $mappings ) as $token ) {
+			if ( self::ascii_lowercase( $token ) === $folded_word ) {
+				$found = true;
+				break;
+			}
+		}
+
+		return $found;
+	}
+
+	/**
+	 * Reference implementation for read_token().
+	 *
+	 * @param array  $mappings         Generated token mappings.
+	 * @param string $document         Document to probe.
+	 * @param int    $offset           Offset at which to probe.
+	 * @param string $case_sensitivity Case sensitivity mode.
+	 * @return array Expected response and matched token length.
+	 */
+	private static function reference_read_token( $mappings, $document, $offset, $case_sensitivity ) {
+		$ignore_case = 'ascii-case-insensitive' === $case_sensitivity;
+		$available   = strlen( $document ) - $offset;
+
+		// Longest tokens are tried first, so the first hit is the longest match at this offset.
+		$tokens = array_keys( $mappings );
+		usort( $tokens, array( __CLASS__, 'longest_first_then_alphabetical' ) );
+
+		foreach ( $tokens as $token ) {
+			$token_length = strlen( $token );
+
+			// Skip tokens that would run past the end of the document.
+			if ( $token_length > $available ) {
+				continue;
+			}
+
+			$candidate = substr( $document, $offset, $token_length );
+			if ( $ignore_case ) {
+				$matches = self::ascii_lowercase( $candidate ) === self::ascii_lowercase( $token );
+			} else {
+				$matches = $candidate === $token;
+			}
+
+			if ( ! $matches ) {
+				continue;
+			}
+
+			return array(
+				'value'  => $mappings[ $token ],
+				'length' => $token_length,
+			);
+		}
+
+		return array(
+			'value'  => null,
+			'length' => null,
+		);
+	}
+
+	/**
+	 * Comparison callback: longer strings sort first, ties are broken by strcmp().
+	 *
+	 * @param string $a First string to compare.
+	 * @param string $b Second string to compare.
+	 * @return int Sort order.
+	 */
+	private static function longest_first_then_alphabetical( $a, $b ) {
+		if ( $a === $b ) {
+			return 0;
+		}
+
+		$length_difference = strlen( $b ) - strlen( $a );
+
+		return 0 !== $length_difference ? $length_difference : strcmp( $a, $b );
+	}
+
+	/**
+	 * Mutate one byte in a token.
+	 *
+	 * @param string $token Token to mutate.
+	 * @param int    $state Pseudo-random generator state.
+	 * @return string Mutated token.
+	 */
+	private static function mutate_one_byte( $token, &$state ) {
+		if ( '' === $token ) {
+			return self::random_probe_byte( $state );
+		}
+
+		$offset = self::random_int( $state, 0, strlen( $token ) - 1 );
+
+		// Keep drawing until the replacement differs from the byte it replaces.
+		do {
+			$replacement = self::random_probe_byte( $state );
+		} while ( $replacement === $token[ $offset ] );
+
+		return substr_replace( $token, $replacement, $offset, 1 );
+	}
+
+	/**
+	 * Invert the case of ASCII letters, leaving every other byte untouched.
+	 *
+	 * @param string $text Text whose ASCII case should be swapped.
+	 * @return string Text with ASCII case swapped.
+	 */
+	private static function swap_ascii_case( $text ) {
+		$upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+		$lower = 'abcdefghijklmnopqrstuvwxyz';
+
+		// Byte-for-byte translation: A-Z become a-z and a-z become A-Z.
+		return strtr( $text, $upper . $lower, $lower . $upper );
+	}
+
+	/**
+	 * Fold ASCII uppercase letters to lowercase, byte by byte.
+	 *
+	 * @param string $text Text to lowercase.
+	 * @return string Text with only ASCII uppercase bytes folded to lowercase.
+	 */
+	private static function ascii_lowercase( $text ) {
+		$upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+		$lower = 'abcdefghijklmnopqrstuvwxyz';
+
+		return strtr( $text, $upper, $lower );
+	}
+
+	/**
+	 * Build a word of the requested byte length from random probe bytes.
+	 *
+	 * @param int $state  Pseudo-random generator state.
+	 * @param int $length Target byte length.
+	 * @return string Generated word.
+	 */
+	private static function random_probe_word( &$state, $length ) {
+		for ( $word = ''; strlen( $word ) < $length; ) {
+			$word .= self::random_probe_byte( $state );
+		}
+
+		return substr( $word, 0, $length );
+	}
+
+	/**
+	 * Generate one random probe byte.
+	 *
+	 * @param int $state Pseudo-random generator state.
+	 * @return string Generated byte.
+	 */
+	private static function random_probe_byte( &$state ) {
+		// Includes NUL and lone UTF-8 lead and continuation bytes, so probes can be invalid as tokens.
+		$bytes = array(
+			"\x00",
+			'a',
+			'Z',
+			'4',
+			';',
+			'_',
+			"\x80",
+			"\xFF",
+			"\xC3",
+			"\xA9",
+			"\xE2",
+			"\x82",
+			"\xAC",
+		);
+
+		return $bytes[ self::random_int( $state, 0, count( $bytes ) - 1 ) ];
+	}
+
+	/**
+	 * Deterministic pseudo-random integer.
+	 *
+	 * Advances the state as ( 1103515245 * state + 12345 ) mod 2^31 and maps
+	 * the new state into the inclusive range by modulo.
+	 *
+	 * The state is masked to 31 bits before and after the step. Without the
+	 * mask, a negative state makes `%` return a negative result, which would
+	 * yield out-of-range values (negative array indexes in the callers). A
+	 * state of 2^31 or more could also push the multiplication past the
+	 * integer range. For any state already in [0, 2^31) the result is the
+	 * same as the unmasked arithmetic.
+	 *
+	 * NOTE(review): assumes a 64-bit PHP build, where the product of two
+	 * 31-bit values stays an integer.
+	 *
+	 * @param int $state Pseudo-random generator state.
+	 * @param int $min   Minimum value.
+	 * @param int $max   Maximum value.
+	 * @return int Generated integer.
+	 */
+	private static function random_int( &$state, $min, $max ) {
+		$state = ( ( 1103515245 * ( $state & 0x7FFFFFFF ) ) + 12345 ) & 0x7FFFFFFF;
+
+		return $min + ( $state % ( $max - $min + 1 ) );
+	}
+
+	/**
+	 * Build an actionable assertion failure message.
+	 *
+	 * The probe is hex-encoded and the token set is serialized and
+	 * base64-encoded, so the message can carry arbitrary bytes.
+	 *
+	 * @param array    $mappings         Generated token mappings.
+	 * @param int      $key_length       Group key length for the generated map.
+	 * @param int      $seed             Seed used to generate the token set.
+	 * @param string   $case_sensitivity Case sensitivity mode.
+	 * @param string   $operation        Operation being tested.
+	 * @param string   $probe            Probe word or document.
+	 * @param int|null $offset           Optional offset into the probe.
+	 * @return string Assertion failure context.
+	 */
+	private static function failure_context( $mappings, $key_length, $seed, $case_sensitivity, $operation, $probe, $offset = null ) {
+		$context = "Seed {$seed}; key_length {$key_length}; {$operation}; case {$case_sensitivity}; probe " . bin2hex( $probe );
+		if ( null !== $offset ) {
+			$context .= "; offset {$offset}";
+		}
+
+		return $context . '; token_set ' . base64_encode( serialize( $mappings ) );
+	}
+}