From bfb6efbf075f3d5dec1db724b7f12f3e68e6dc8b Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 13:24:52 +0200 Subject: [PATCH 001/486] Add native HTML API Rust extension prototype --- ext/html-api-rust/.gitignore | 31 + ext/html-api-rust/Cargo.lock | 7 + ext/html-api-rust/Cargo.toml | 14 + ext/html-api-rust/README.md | 22 + ext/html-api-rust/build.sh | 9 + ext/html-api-rust/config.m4 | 13 + ext/html-api-rust/configure.ac | 200 ++ ext/html-api-rust/php_wp_html_api_rust.h | 9 + ext/html-api-rust/src/lib.rs | 2655 +++++++++++++++++ ext/html-api-rust/wp_html_api_rust.c | 2561 ++++++++++++++++ .../html-api/class-wp-html-processor.php | 8 +- .../html-api/class-wp-html-tag-processor.php | 18 + 12 files changed, 5545 insertions(+), 2 deletions(-) create mode 100644 ext/html-api-rust/.gitignore create mode 100644 ext/html-api-rust/Cargo.lock create mode 100644 ext/html-api-rust/Cargo.toml create mode 100644 ext/html-api-rust/README.md create mode 100644 ext/html-api-rust/build.sh create mode 100644 ext/html-api-rust/config.m4 create mode 100644 ext/html-api-rust/configure.ac create mode 100644 ext/html-api-rust/php_wp_html_api_rust.h create mode 100644 ext/html-api-rust/src/lib.rs create mode 100644 ext/html-api-rust/wp_html_api_rust.c diff --git a/ext/html-api-rust/.gitignore b/ext/html-api-rust/.gitignore new file mode 100644 index 0000000000000..ffade7ab1e7b2 --- /dev/null +++ b/ext/html-api-rust/.gitignore @@ -0,0 +1,31 @@ +/target/ +/autom4te.cache/ +/build/ +/modules/ +/.libs/ +/Makefile +/Makefile.fragments +/Makefile.objects +/Makefile.global +/acinclude.m4 +/aclocal.m4 +/config.cache +/config.guess +/config.h +/config.h.in +/config.log +/config.nice +/config.status +/config.sub +/configure +/configure~ +/install-sh +/libtool +/ltmain.sh +/missing +/mkinstalldirs +/run-tests.php +/tmp-php.ini +/*.dep +/*.la +/*.lo diff --git a/ext/html-api-rust/Cargo.lock b/ext/html-api-rust/Cargo.lock new file mode 100644 index 0000000000000..13b8d48675c6a --- 
/dev/null +++ b/ext/html-api-rust/Cargo.lock @@ -0,0 +1,7 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "wp-html-api-rust-core" +version = "0.1.0" diff --git a/ext/html-api-rust/Cargo.toml b/ext/html-api-rust/Cargo.toml new file mode 100644 index 0000000000000..24cb7d69a3d45 --- /dev/null +++ b/ext/html-api-rust/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "wp-html-api-rust-core" +version = "0.1.0" +edition = "2021" +publish = false + +[lib] +name = "wp_html_api_rust_core" +crate-type = ["staticlib"] + +[profile.release] +lto = true +codegen-units = 1 +panic = "abort" diff --git a/ext/html-api-rust/README.md b/ext/html-api-rust/README.md new file mode 100644 index 0000000000000..2e047085ee2c3 --- /dev/null +++ b/ext/html-api-rust/README.md @@ -0,0 +1,22 @@ +# WordPress HTML API Rust Extension + +This directory contains the native Rust core and PHP extension shim for the +incremental Rust implementation of the WordPress HTML API. + +Build locally with: + +```sh +cd ext/html-api-rust +sh build.sh +``` + +Smoke-test the built extension with: + +```sh +php -d extension="$(pwd)/modules/wp_html_api_rust.so" \ + -r 'var_dump(wp_html_api_rust_version(), wp_html_api_rust_scan_next_tag("

<p>Hi</p>

"));' +``` + +The first Rust surface is a tokenizer slice for locating the next tag opener. +It is intentionally narrow and will be expanded into the `WP_HTML_Tag_Processor` +state machine before replacing the PHP interface in WordPress bootstrap. diff --git a/ext/html-api-rust/build.sh b/ext/html-api-rust/build.sh new file mode 100644 index 0000000000000..4de3764dabfe8 --- /dev/null +++ b/ext/html-api-rust/build.sh @@ -0,0 +1,9 @@ +#!/bin/sh +set -eu + +PHP_CONFIG_BIN="${PHP_CONFIG:-php-config}" + +cargo build --release +phpize +./configure --enable-wp-html-api-rust --with-php-config="${PHP_CONFIG_BIN}" +make diff --git a/ext/html-api-rust/config.m4 b/ext/html-api-rust/config.m4 new file mode 100644 index 0000000000000..911511934b22c --- /dev/null +++ b/ext/html-api-rust/config.m4 @@ -0,0 +1,13 @@ +PHP_ARG_ENABLE( + [wp-html-api-rust], + [whether to enable the WordPress HTML API Rust extension], + [AS_HELP_STRING([--enable-wp-html-api-rust], [Enable WordPress HTML API Rust extension])], + [no] +) + +if test "$PHP_WP_HTML_API_RUST" != "no"; then + PHP_SUBST(WP_HTML_API_RUST_SHARED_LIBADD) + PHP_ADD_LIBRARY_WITH_PATH(wp_html_api_rust_core, $abs_srcdir/target/release, WP_HTML_API_RUST_SHARED_LIBADD) + + PHP_NEW_EXTENSION([wp_html_api_rust], [wp_html_api_rust.c], [$ext_shared]) +fi diff --git a/ext/html-api-rust/configure.ac b/ext/html-api-rust/configure.ac new file mode 100644 index 0000000000000..be7065b786d96 --- /dev/null +++ b/ext/html-api-rust/configure.ac @@ -0,0 +1,200 @@ +dnl This file becomes configure.ac for self-contained extensions. + +dnl Include external macro definitions before the AC_INIT to also remove +dnl comments starting with # and empty newlines from the included files. 
+m4_include([build/ax_check_compile_flag.m4]) +m4_include([build/ax_gcc_func_attribute.m4]) +m4_include([build/libtool.m4]) +m4_include([build/php_cxx_compile_stdcxx.m4]) +m4_include([build/php.m4]) +m4_include([build/pkg.m4]) + +AC_PREREQ([2.68]) +AC_INIT +AC_CONFIG_SRCDIR([config.m4]) +AC_CONFIG_AUX_DIR([build]) +AC_PRESERVE_HELP_ORDER + +PHP_CONFIG_NICE([config.nice]) + +AC_DEFUN([PHP_EXT_BUILDDIR],[.])dnl +AC_DEFUN([PHP_EXT_DIR],[""])dnl +AC_DEFUN([PHP_EXT_SRCDIR],[$abs_srcdir])dnl +AC_DEFUN([PHP_ALWAYS_SHARED],[ + ext_output="yes, shared" + ext_shared=yes + test "[$]$1" = "no" && $1=yes +])dnl + +PHP_INIT_BUILD_SYSTEM + +PKG_PROG_PKG_CONFIG +AC_PROG_CC([cc gcc]) +PHP_DETECT_ICC +PHP_DETECT_SUNCC + +dnl Support systems with system libraries in e.g. /usr/lib64. +PHP_ARG_WITH([libdir], + [for system library directory], + [AS_HELP_STRING([--with-libdir=NAME], + [Look for libraries in .../NAME rather than .../lib])], + [lib], + [no]) + +PHP_RUNPATH_SWITCH +PHP_SHLIB_SUFFIX_NAMES + +dnl Find php-config script. +PHP_ARG_WITH([php-config],, + [AS_HELP_STRING([--with-php-config=PATH], + [Path to php-config [php-config]])], + [php-config], + [no]) + +dnl For BC. +PHP_CONFIG=$PHP_PHP_CONFIG +prefix=$($PHP_CONFIG --prefix 2>/dev/null) +phpincludedir=$($PHP_CONFIG --include-dir 2>/dev/null) +INCLUDES=$($PHP_CONFIG --includes 2>/dev/null) +EXTENSION_DIR=$($PHP_CONFIG --extension-dir 2>/dev/null) +PHP_EXECUTABLE=$($PHP_CONFIG --php-binary 2>/dev/null) + +AS_VAR_IF([prefix],, + [AC_MSG_ERROR([Cannot find php-config. Please use --with-php-config=PATH])]) + +AC_MSG_CHECKING([for PHP prefix]) +AC_MSG_RESULT([$prefix]) +AC_MSG_CHECKING([for PHP includes]) +AC_MSG_RESULT([$INCLUDES]) +AC_MSG_CHECKING([for PHP extension directory]) +AC_MSG_RESULT([$EXTENSION_DIR]) +AC_MSG_CHECKING([for PHP installed headers prefix]) +AC_MSG_RESULT([$phpincludedir]) + +dnl Checks for PHP_DEBUG / ZEND_DEBUG / ZTS. 
+AC_MSG_CHECKING([if debugging is enabled]) +old_CPPFLAGS=$CPPFLAGS +CPPFLAGS="-I$phpincludedir" +AC_EGREP_CPP([php_debug_is_enabled], [ +#include <main/php_config.h>
+#if ZEND_DEBUG +php_debug_is_enabled +#endif +], + [PHP_DEBUG=yes], + [PHP_DEBUG=no]) +CPPFLAGS=$old_CPPFLAGS +AC_MSG_RESULT([$PHP_DEBUG]) + +AC_MSG_CHECKING([if PHP is built with thread safety (ZTS)]) +old_CPPFLAGS=$CPPFLAGS +CPPFLAGS="-I$phpincludedir" +AC_EGREP_CPP([php_zts_is_enabled], [ +#include <main/php_config.h>
+#ifdef ZTS +php_zts_is_enabled +#endif +], + [PHP_THREAD_SAFETY=yes], + [PHP_THREAD_SAFETY=no]) +CPPFLAGS=$old_CPPFLAGS +AC_MSG_RESULT([$PHP_THREAD_SAFETY]) + +dnl Discard optimization flags when debugging is enabled. +AS_VAR_IF([PHP_DEBUG], [yes], [ + PHP_DEBUG=1 + ZEND_DEBUG=yes + PHP_REMOVE_OPTIMIZATION_FLAGS + dnl Add -O0 only if GCC or ICC is used. + if test "$GCC" = "yes" || test "$ICC" = "yes"; then + CFLAGS="$CFLAGS -O0" + CXXFLAGS="$CXXFLAGS -g -O0" + fi + if test "$SUNCC" = "yes"; then + if test -n "$auto_cflags"; then + CFLAGS="-g" + CXXFLAGS="-g" + else + CFLAGS="$CFLAGS -g" + CXXFLAGS="$CFLAGS -g" + fi + fi +], [ + PHP_DEBUG=0 + ZEND_DEBUG=no +]) + +dnl Always shared. +PHP_BUILD_SHARED + +PHP_HELP_SEPARATOR([Extension:]) +PHP_CONFIGURE_PART([Configuring extension]) + +sinclude(config.m4) + +enable_static=no +enable_shared=yes + +PHP_HELP_SEPARATOR([Libtool:]) +PHP_CONFIGURE_PART([Configuring libtool]) + +dnl Only allow AC_PROG_CXX and AC_PROG_CXXCPP if they are explicitly called (by +dnl PHP_REQUIRE_CXX). Otherwise AC_PROG_LIBTOOL fails if there is no working C++ +dnl compiler. 
+AC_PROVIDE_IFELSE([PHP_REQUIRE_CXX], [], [ + undefine([AC_PROG_CXX]) + AC_DEFUN([AC_PROG_CXX], []) + undefine([AC_PROG_CXXCPP]) + AC_DEFUN([AC_PROG_CXXCPP], [php_prog_cxxcpp=disabled]) +]) +AC_PROG_LIBTOOL + +all_targets='$(PHP_MODULES) $(PHP_ZEND_EX)' +install_targets="install-modules install-headers" +CPPFLAGS="$CPPFLAGS -DHAVE_CONFIG_H" +CFLAGS_CLEAN='$(CFLAGS) -D_GNU_SOURCE' +CXXFLAGS_CLEAN='$(CXXFLAGS)' + +AS_VAR_IF([prefix], [NONE], [prefix=/usr/local]) +AS_VAR_IF([exec_prefix], [NONE], [exec_prefix='$(prefix)']) + +AS_VAR_IF([cross_compiling], [yes], + [AC_CHECK_PROGS([BUILD_CC], [gcc clang c99 c89 cc cl], [none]) + AC_MSG_CHECKING([for native build C compiler]) + AC_MSG_RESULT([$BUILD_CC])], + [BUILD_CC=$CC]) + +PHP_SUBST([PHP_MODULES]) +PHP_SUBST([PHP_ZEND_EX]) +PHP_SUBST([all_targets]) +PHP_SUBST([install_targets]) +PHP_SUBST([prefix]) +PHP_SUBST([exec_prefix]) +PHP_SUBST([libdir]) +PHP_SUBST([phpincludedir]) +PHP_SUBST([CC]) +PHP_SUBST([CFLAGS]) +PHP_SUBST([CFLAGS_CLEAN]) +PHP_SUBST([CPP]) +PHP_SUBST([CPPFLAGS]) +PHP_SUBST([CXX]) +PHP_SUBST([CXXFLAGS]) +PHP_SUBST([CXXFLAGS_CLEAN]) +PHP_SUBST([EXTENSION_DIR]) +PHP_SUBST([PHP_EXECUTABLE]) +PHP_SUBST([EXTRA_LDFLAGS]) +PHP_SUBST([EXTRA_LIBS]) +PHP_SUBST([INCLUDES]) +PHP_SUBST([LDFLAGS]) +PHP_SUBST([LIBTOOL]) +PHP_SUBST([SHELL]) +PHP_SUBST([INSTALL_HEADERS]) +PHP_SUBST([BUILD_CC]) + +PHP_CONFIGURE_PART([Generating files]) + +AC_CONFIG_HEADERS([config.h]) + +AC_CONFIG_COMMANDS_PRE([PHP_PATCH_CONFIG_HEADERS([config.h.in])]) + +AC_OUTPUT diff --git a/ext/html-api-rust/php_wp_html_api_rust.h b/ext/html-api-rust/php_wp_html_api_rust.h new file mode 100644 index 0000000000000..bd56b4b28fe4f --- /dev/null +++ b/ext/html-api-rust/php_wp_html_api_rust.h @@ -0,0 +1,9 @@ +#ifndef PHP_WP_HTML_API_RUST_H +#define PHP_WP_HTML_API_RUST_H + +extern zend_module_entry wp_html_api_rust_module_entry; +#define phpext_wp_html_api_rust_ptr &wp_html_api_rust_module_entry + +#define PHP_WP_HTML_API_RUST_VERSION "0.1.0" + +#endif 
diff --git a/ext/html-api-rust/src/lib.rs b/ext/html-api-rust/src/lib.rs new file mode 100644 index 0000000000000..515f8775bef20 --- /dev/null +++ b/ext/html-api-rust/src/lib.rs @@ -0,0 +1,2655 @@ +use std::ffi::c_char; +use std::ptr; +use std::slice; + +static VERSION: &[u8] = b"0.1.0\0"; + +const TOKEN_TYPE_TAG: u8 = 1; +const TOKEN_TYPE_TEXT: u8 = 2; +const TOKEN_TYPE_COMMENT: u8 = 3; +const TOKEN_TYPE_DOCTYPE: u8 = 4; +const TOKEN_TYPE_CDATA: u8 = 5; +const TOKEN_TYPE_PRESUMPTUOUS_TAG: u8 = 6; +const TOKEN_TYPE_FUNKY_COMMENT: u8 = 7; + +const NAMESPACE_HTML: u8 = 0; +const NAMESPACE_FOREIGN: u8 = 1; + +const COMMENT_TYPE_NONE: u8 = 0; +const COMMENT_TYPE_ABRUPTLY_CLOSED: u8 = 1; +const COMMENT_TYPE_CDATA_LOOKALIKE: u8 = 2; +const COMMENT_TYPE_HTML: u8 = 3; +const COMMENT_TYPE_PI_LOOKALIKE: u8 = 4; +const COMMENT_TYPE_INVALID: u8 = 5; + +#[repr(C)] +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub struct TagScan { + pub tag_start: usize, + pub tag_end: usize, + pub name_start: usize, + pub name_len: usize, + pub is_closing: bool, + pub has_self_closing_flag: bool, + pub token_end: usize, + pub token_type: u8, +} + +#[repr(C)] +#[derive(Clone, Copy, Debug, Default)] +pub struct ByteSlice { + pub ptr: *const u8, + pub len: usize, +} + +pub struct TagProcessor { + html: Vec, + offset: usize, + current: Option, + scratch: Vec, + paused_at_incomplete: bool, + inserted_attributes: Vec>, + parsing_namespace: u8, +} + +#[derive(Clone, Copy, Debug)] +struct AttributeSpan { + name_start: usize, + full_end: usize, + value: Option<(usize, usize)>, +} + +#[no_mangle] +pub extern "C" fn wp_html_api_rust_core_version() -> *const c_char { + VERSION.as_ptr().cast() +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_scan_next_tag( + html: *const u8, + len: usize, + offset: usize, + out: *mut TagScan, +) -> bool { + if html.is_null() || out.is_null() { + return false; + } + + let html = slice::from_raw_parts(html, len); + + match scan_next_tag(html, offset) { 
+ Some(scan) => { + ptr::write(out, scan); + true + } + None => false, + } +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_new( + html: *const u8, + len: usize, +) -> *mut TagProcessor { + if html.is_null() && len > 0 { + return ptr::null_mut(); + } + + let html = if len == 0 { + Vec::new() + } else { + slice::from_raw_parts(html, len).to_vec() + }; + + Box::into_raw(Box::new(TagProcessor { + html, + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + })) +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_free(processor: *mut TagProcessor) { + if !processor.is_null() { + drop(Box::from_raw(processor)); + } +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_next_tag( + processor: *mut TagProcessor, + query: *const u8, + query_len: usize, + visit_closers: bool, +) -> bool { + let Some(processor) = processor.as_mut() else { + return false; + }; + + let query = if query.is_null() { + None + } else { + Some(slice::from_raw_parts(query, query_len)) + }; + + processor.paused_at_incomplete = false; + processor.inserted_attributes.clear(); + + loop { + let scan = match scan_next_token_in_namespace( + &processor.html, + processor.offset, + processor.parsing_namespace, + ) { + ScanResult::Token(scan) => scan, + ScanResult::Incomplete => { + processor.paused_at_incomplete = true; + return false; + } + ScanResult::None => { + return false; + } + }; + + processor.offset = scan.token_end; + + if scan.token_type != TOKEN_TYPE_TAG { + continue; + } + + if scan.is_closing && !visit_closers { + continue; + } + + if let Some(query) = query { + let tag_name = &processor.html[scan.name_start..scan.name_start + scan.name_len]; + if tag_name.len() != query.len() || !eq_ignore_ascii_case(tag_name, query) { + continue; + } + } + + processor.current = Some(scan); + return true; + } +} + +#[no_mangle] +pub unsafe 
extern "C" fn wp_html_api_rust_tag_processor_next_token( + processor: *mut TagProcessor, +) -> bool { + let Some(processor) = processor.as_mut() else { + return false; + }; + + processor.paused_at_incomplete = false; + processor.inserted_attributes.clear(); + + match scan_next_token_in_namespace( + &processor.html, + processor.offset, + processor.parsing_namespace, + ) { + ScanResult::Token(scan) => { + processor.offset = scan.token_end; + processor.current = Some(scan); + true + } + ScanResult::Incomplete => { + processor.paused_at_incomplete = true; + false + } + ScanResult::None => false, + } +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_seek( + processor: *mut TagProcessor, + offset: usize, +) { + let Some(processor) = processor.as_mut() else { + return; + }; + + processor.offset = offset.min(processor.html.len()); + processor.current = None; + processor.paused_at_incomplete = false; + processor.inserted_attributes.clear(); +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_set_namespace( + processor: *mut TagProcessor, + namespace: u8, +) { + let Some(processor) = processor.as_mut() else { + return; + }; + + processor.parsing_namespace = if namespace == NAMESPACE_FOREIGN { + NAMESPACE_FOREIGN + } else { + NAMESPACE_HTML + }; +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_apply_lexical_update( + processor: *mut TagProcessor, + start: usize, + length: usize, + replacement: *const u8, + replacement_len: usize, +) -> bool { + let Some(processor) = processor.as_mut() else { + return false; + }; + + if replacement.is_null() && replacement_len > 0 { + return false; + } + + let Some(end) = start.checked_add(length) else { + return false; + }; + + if end > processor.html.len() { + return false; + } + + let replacement = if replacement_len == 0 { + &[] + } else { + slice::from_raw_parts(replacement, replacement_len) + }; + + 
processor.replace_range_preserving_cursor_at_inserted_start(start, end, replacement); + true +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_current_span( + processor: *const TagProcessor, + start: *mut usize, + length: *mut usize, +) -> bool { + let Some(processor) = processor.as_ref() else { + return false; + }; + + let Some(scan) = processor.current else { + return false; + }; + + if start.is_null() || length.is_null() { + return false; + } + + ptr::write(start, scan.tag_start); + ptr::write(length, scan.token_end - scan.tag_start); + true +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_current_token_type( + processor: *const TagProcessor, +) -> u8 { + let Some(processor) = processor.as_ref() else { + return 0; + }; + + processor.current.map(|scan| scan.token_type).unwrap_or(0) +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_paused_at_incomplete( + processor: *const TagProcessor, +) -> bool { + let Some(processor) = processor.as_ref() else { + return false; + }; + + processor.paused_at_incomplete +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_get_modifiable_text( + processor: *mut TagProcessor, + out: *mut ByteSlice, +) -> bool { + let Some(processor) = processor.as_mut() else { + return false; + }; + + let Some(scan) = processor.current else { + return false; + }; + + if out.is_null() { + return false; + } + + let Some(text) = processor.current_modifiable_text(scan) else { + return false; + }; + + processor.scratch = text; + ptr::write( + out, + ByteSlice { + ptr: processor.scratch.as_ptr(), + len: processor.scratch.len(), + }, + ); + true +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_set_modifiable_text( + processor: *mut TagProcessor, + text: *const u8, + text_len: usize, +) -> bool { + let Some(processor) = processor.as_mut() else { + return false; + }; + + if text.is_null() && text_len > 0 { + return false; + } + + 
let replacement = if text_len == 0 { + &[] + } else { + slice::from_raw_parts(text, text_len) + }; + + processor.set_modifiable_text(replacement) +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_current_comment_type( + processor: *const TagProcessor, +) -> u8 { + let Some(processor) = processor.as_ref() else { + return COMMENT_TYPE_NONE; + }; + + let Some(scan) = processor.current else { + return COMMENT_TYPE_NONE; + }; + + processor.comment_type(scan) +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_script_content_type( + processor: *const TagProcessor, +) -> u8 { + let Some(processor) = processor.as_ref() else { + return 0; + }; + + let Some(scan) = processor.current else { + return 0; + }; + + if processor.parsing_namespace != NAMESPACE_HTML || scan.token_type != TOKEN_TYPE_TAG || scan.is_closing { + return 0; + } + + let tag_name = &processor.html[scan.name_start..scan.name_start + scan.name_len]; + if !eq_ignore_ascii_case(tag_name, b"SCRIPT") { + return 0; + } + + match processor.script_content_type(scan) { + ScriptContentType::JavaScript => 1, + ScriptContentType::Json => 2, + ScriptContentType::Other => 0, + } +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_get_tag( + processor: *const TagProcessor, + out: *mut ByteSlice, +) -> bool { + let Some(processor) = processor.as_ref() else { + return false; + }; + + let Some(scan) = processor.current else { + return false; + }; + + if scan.token_type == TOKEN_TYPE_COMMENT { + let Some((target_start, target_end)) = pi_target_span(&processor.html, scan) else { + return false; + }; + + ptr::write( + out, + ByteSlice { + ptr: processor.html.as_ptr().add(target_start), + len: target_end - target_start, + }, + ); + return true; + } + + if scan.token_type != TOKEN_TYPE_TAG { + return false; + } + + ptr::write( + out, + ByteSlice { + ptr: processor.html.as_ptr().add(scan.name_start), + len: scan.name_len, + }, + ); + + true +} + +#[no_mangle] 
+pub unsafe extern "C" fn wp_html_api_rust_tag_processor_is_tag_closer( + processor: *const TagProcessor, +) -> bool { + let Some(processor) = processor.as_ref() else { + return false; + }; + + let Some(scan) = processor.current else { + return false; + }; + + if scan.token_type != TOKEN_TYPE_TAG { + return false; + } + + scan.is_closing + && !eq_ignore_ascii_case(&processor.html[scan.name_start..scan.name_start + scan.name_len], b"BR") +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_has_self_closing_flag( + processor: *const TagProcessor, +) -> bool { + let Some(processor) = processor.as_ref() else { + return false; + }; + + processor + .current + .filter(|scan| scan.token_type == TOKEN_TYPE_TAG) + .map(|scan| scan.has_self_closing_flag) + .unwrap_or(false) +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_get_attribute( + processor: *mut TagProcessor, + name: *const u8, + name_len: usize, + out: *mut ByteSlice, +) -> u8 { + let Some(processor) = processor.as_mut() else { + return 0; + }; + + if name.is_null() || out.is_null() { + return 0; + } + + let name = slice::from_raw_parts(name, name_len); + match processor.get_attribute(name) { + AttributeValue::Missing => 0, + AttributeValue::Boolean => 1, + AttributeValue::String => { + ptr::write( + out, + ByteSlice { + ptr: processor.scratch.as_ptr(), + len: processor.scratch.len(), + }, + ); + 2 + } + } +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_get_attribute_names_with_prefix( + processor: *mut TagProcessor, + prefix: *const u8, + prefix_len: usize, + out: *mut ByteSlice, +) -> u8 { + let Some(processor) = processor.as_mut() else { + return 0; + }; + + if prefix.is_null() || out.is_null() { + return 0; + } + + let prefix = slice::from_raw_parts(prefix, prefix_len); + if !processor.get_attribute_names_with_prefix(prefix) { + return 0; + } + + ptr::write( + out, + ByteSlice { + ptr: processor.scratch.as_ptr(), + len: 
processor.scratch.len(), + }, + ); + 1 +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_set_attribute( + processor: *mut TagProcessor, + name: *const u8, + name_len: usize, + value: *const u8, + value_len: usize, + value_kind: u8, +) -> bool { + let Some(processor) = processor.as_mut() else { + return false; + }; + + if name.is_null() { + return false; + } + + let name = slice::from_raw_parts(name, name_len); + let value = if value.is_null() { + &[][..] + } else { + slice::from_raw_parts(value, value_len) + }; + + processor.set_attribute(name, value, value_kind) +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_remove_attribute( + processor: *mut TagProcessor, + name: *const u8, + name_len: usize, +) -> bool { + let Some(processor) = processor.as_mut() else { + return false; + }; + + if name.is_null() { + return false; + } + + let name = slice::from_raw_parts(name, name_len); + processor.remove_attribute(name) +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_add_class( + processor: *mut TagProcessor, + class_name: *const u8, + class_name_len: usize, +) -> bool { + let Some(processor) = processor.as_mut() else { + return false; + }; + + if class_name.is_null() { + return false; + } + + let class_name = slice::from_raw_parts(class_name, class_name_len); + processor.add_class(class_name) +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_remove_class( + processor: *mut TagProcessor, + class_name: *const u8, + class_name_len: usize, +) -> bool { + let Some(processor) = processor.as_mut() else { + return false; + }; + + if class_name.is_null() { + return false; + } + + let class_name = slice::from_raw_parts(class_name, class_name_len); + processor.remove_class(class_name) +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_has_class( + processor: *mut TagProcessor, + class_name: *const u8, + class_name_len: usize, +) -> u8 { + let 
Some(processor) = processor.as_mut() else { + return 0; + }; + + if class_name.is_null() { + return 0; + } + + let class_name = slice::from_raw_parts(class_name, class_name_len); + processor.has_class(class_name) +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_class_list( + processor: *mut TagProcessor, + out: *mut ByteSlice, +) -> u8 { + let Some(processor) = processor.as_mut() else { + return 0; + }; + + if out.is_null() || !processor.class_list() { + return 0; + } + + ptr::write( + out, + ByteSlice { + ptr: processor.scratch.as_ptr(), + len: processor.scratch.len(), + }, + ); + 1 +} + +#[no_mangle] +pub unsafe extern "C" fn wp_html_api_rust_tag_processor_get_html( + processor: *const TagProcessor, + out: *mut ByteSlice, +) -> bool { + let Some(processor) = processor.as_ref() else { + return false; + }; + + ptr::write( + out, + ByteSlice { + ptr: processor.html.as_ptr(), + len: processor.html.len(), + }, + ); + + true +} + +enum AttributeValue { + Missing, + Boolean, + String, +} + +impl TagProcessor { + fn current_modifiable_text(&self, scan: TagScan) -> Option> { + match scan.token_type { + TOKEN_TYPE_TEXT => { + let mut raw = &self.html[scan.tag_start..scan.token_end]; + if self.text_follows_pre_or_listing(scan.tag_start) { + raw = strip_initial_newline(raw); + } + Some(transform_text(raw, true, NullTransform::Remove)) + } + TOKEN_TYPE_CDATA if scan.token_end >= scan.tag_start + 12 => Some(transform_text( + &self.html[scan.tag_start + 9..scan.token_end - 3], + false, + NullTransform::Replace, + )), + TOKEN_TYPE_DOCTYPE if scan.token_end > scan.tag_start + 9 => { + Some(self.html[scan.tag_start + 9..scan.token_end - 1].to_vec()) + } + TOKEN_TYPE_COMMENT => self.comment_modifiable_text(scan), + TOKEN_TYPE_FUNKY_COMMENT => Some(transform_text( + &self.html[scan.tag_start + 2..scan.token_end.saturating_sub(1)], + false, + NullTransform::Replace, + )), + TOKEN_TYPE_TAG if scan.token_end > scan.tag_end => { + let inner = 
&self.html[scan.tag_end..scan.token_end]; + let relative = find_last_subslice(inner, b" None, + } + } + + fn comment_modifiable_text(&self, scan: TagScan) -> Option> { + let token = &self.html[scan.tag_start..scan.token_end]; + + if token.starts_with(b"") { + scan.token_end.saturating_sub(4) + } else if token.ends_with(b"-->") { + scan.token_end.saturating_sub(3) + } else { + scan.token_end.saturating_sub(1) + }; + if end < body_start { + end = body_start; + } + return Some(transform_text( + &self.html[body_start..end], + false, + NullTransform::Replace, + )); + } + + if starts_with_ignore_ascii_case(token, b"") { + return Some(transform_text( + &self.html[scan.tag_start + 9..scan.token_end - 3], + false, + NullTransform::Replace, + )); + } + } + + if token.starts_with(b"") { + scan.token_end.saturating_sub(2) + } else { + scan.token_end.saturating_sub(1) + }; + return Some(transform_text( + &self.html[target_end..text_end], + false, + NullTransform::Replace, + )); + } + } + + Some(transform_text( + &self.html[scan.tag_start + 2..scan.token_end.saturating_sub(1)], + false, + NullTransform::Replace, + )) + } + + fn comment_type(&self, scan: TagScan) -> u8 { + if scan.token_type == TOKEN_TYPE_FUNKY_COMMENT { + return COMMENT_TYPE_INVALID; + } + + if scan.token_type != TOKEN_TYPE_COMMENT { + return COMMENT_TYPE_NONE; + } + + let token = &self.html[scan.tag_start..scan.token_end]; + if token.starts_with(b"") || token.ends_with(b"") || token.ends_with(b"") { + return COMMENT_TYPE_ABRUPTLY_CLOSED; + } + return COMMENT_TYPE_HTML; + } + + if starts_with_ignore_ascii_case(token, b"") { + return COMMENT_TYPE_CDATA_LOOKALIKE; + } + return COMMENT_TYPE_INVALID; + } + + if token.starts_with(b"") { + return COMMENT_TYPE_PI_LOOKALIKE; + } + return COMMENT_TYPE_INVALID; + } + + COMMENT_TYPE_INVALID + } + + fn text_follows_pre_or_listing(&self, text_start: usize) -> bool { + if text_start == 0 || self.html[text_start - 1] != b'>' { + return false; + } + + let Some(tag_start) = 
self.html[..text_start].iter().rposition(|&byte| byte == b'<') else { + return false; + }; + + let ScanResult::Token(scan) = scan_next_token_in_namespace(&self.html, tag_start, self.parsing_namespace) else { + return false; + }; + + if scan.token_type != TOKEN_TYPE_TAG || scan.is_closing || scan.tag_end != text_start { + return false; + } + + matches_ignore_ascii_case( + &self.html[scan.name_start..scan.name_start + scan.name_len], + &[&b"PRE"[..], &b"LISTING"[..]], + ) + } + + fn set_modifiable_text(&mut self, plaintext: &[u8]) -> bool { + let Some(scan) = self.current else { + return false; + }; + + match scan.token_type { + TOKEN_TYPE_TEXT => { + if self.parsing_namespace != NAMESPACE_HTML { + return false; + } + let replacement = escape_html_text(plaintext); + self.replace_range(scan.tag_start, scan.token_end, &replacement); + true + } + TOKEN_TYPE_COMMENT => { + if self.comment_type(scan) != COMMENT_TYPE_HTML { + return false; + } + if find_subslice(plaintext, b"-->").is_some() + || find_subslice(plaintext, b"--!>").is_some() + { + return false; + } + let Some((start, end)) = self.comment_body_span(scan) else { + return false; + }; + self.replace_range(start, end, plaintext); + true + } + TOKEN_TYPE_TAG => self.set_atomic_modifiable_text(scan, plaintext), + _ => false, + } + } + + fn set_atomic_modifiable_text(&mut self, scan: TagScan, plaintext: &[u8]) -> bool { + if self.parsing_namespace != NAMESPACE_HTML || scan.is_closing || scan.token_end <= scan.tag_end { + return false; + } + + let Some((start, end)) = self.atomic_text_span(scan) else { + return false; + }; + + let tag_name = &self.html[scan.name_start..scan.name_start + scan.name_len]; + let replacement = if eq_ignore_ascii_case(tag_name, b"SCRIPT") { + let script_type = self.script_content_type(scan); + match script_type { + ScriptContentType::JavaScript | ScriptContentType::Json => { + escape_script_text(plaintext) + } + ScriptContentType::Other => { + if 
find_case_insensitive_script_tag(plaintext).is_some() { + return false; + } + plaintext.to_vec() + } + } + } else if eq_ignore_ascii_case(tag_name, b"STYLE") { + escape_rawtext_closer(plaintext, b"style", b"\\3c\\2f") + } else if eq_ignore_ascii_case(tag_name, b"TEXTAREA") { + let mut escaped = escape_rcdata_closer(plaintext, b"textarea"); + if matches!(escaped.first(), Some(b'\n' | b'\r')) { + let mut with_extra_newline = Vec::with_capacity(escaped.len() + 1); + with_extra_newline.push(b'\n'); + with_extra_newline.extend_from_slice(&escaped); + escaped = with_extra_newline; + } + escaped + } else if eq_ignore_ascii_case(tag_name, b"TITLE") { + escape_rcdata_closer(plaintext, b"title") + } else { + return false; + }; + + self.replace_atomic_text_range(start, end, &replacement); + true + } + + fn atomic_text_span(&self, scan: TagScan) -> Option<(usize, usize)> { + let inner = &self.html[scan.tag_end..scan.token_end]; + find_last_subslice(inner, b" Option<(usize, usize)> { + if scan.token_type != TOKEN_TYPE_COMMENT { + return None; + } + + let token = &self.html[scan.tag_start..scan.token_end]; + if !token.starts_with(b"") { + scan.token_end.saturating_sub(4) + } else if token.ends_with(b"-->") { + scan.token_end.saturating_sub(3) + } else { + scan.token_end.saturating_sub(1) + }; + + if body_end < body_start { + body_end = body_start; + } + + Some((body_start, body_end)) + } + + fn script_content_type(&self, scan: TagScan) -> ScriptContentType { + let type_attr = self.find_attribute(scan, b"type"); + let language_attr = self.find_attribute(scan, b"language"); + + if let Some(attribute) = type_attr { + let type_string = match attribute.value { + None => return ScriptContentType::JavaScript, + Some((start, end)) => { + let decoded = decode_html_attribute(&self.html[start..end]); + let trimmed = trim_ascii_whitespace(&decoded); + if trimmed.is_empty() { + return ScriptContentType::JavaScript; + } + ascii_lowercase_vec(trimmed) + } + }; + + return 
classify_script_type_string(&type_string); + } + + let Some(attribute) = language_attr else { + return ScriptContentType::JavaScript; + }; + + let language = match attribute.value { + None => return ScriptContentType::JavaScript, + Some((start, end)) => decode_html_attribute(&self.html[start..end]), + }; + + if language.is_empty() { + return ScriptContentType::JavaScript; + } + + let mut type_string = Vec::with_capacity(b"text/".len() + language.len()); + type_string.extend_from_slice(b"text/"); + type_string.extend(ascii_lowercase_vec(&language)); + classify_script_type_string(&type_string) + } + + fn raw_attribute_value(&self, scan: TagScan, name: &[u8]) -> Option<&[u8]> { + let attribute = self.find_attribute(scan, name)?; + let (start, end) = attribute.value?; + Some(&self.html[start..end]) + } + + fn get_attribute(&mut self, wanted_name: &[u8]) -> AttributeValue { + let Some(scan) = self.current else { + return AttributeValue::Missing; + }; + + if scan.token_type != TOKEN_TYPE_TAG || scan.is_closing { + return AttributeValue::Missing; + } + + let Some(attribute) = self.find_attribute(scan, wanted_name) else { + return AttributeValue::Missing; + }; + + let Some((value_start, value_end)) = attribute.value else { + return AttributeValue::Boolean; + }; + + self.scratch = decode_html_attribute(&self.html[value_start..value_end]); + AttributeValue::String + } + + fn set_attribute(&mut self, name: &[u8], value: &[u8], value_kind: u8) -> bool { + let Some(scan) = self.current else { + return false; + }; + + if scan.token_type != TOKEN_TYPE_TAG || scan.is_closing || !is_valid_attribute_name(name) { + return false; + } + + if value_kind == 0 { + return self.remove_attribute(name); + } + + let comparable_name = ascii_lowercase_vec(name); + let replacement = serialize_attribute(name, value, value_kind); + + if let Some(attribute) = self.find_attribute(scan, name) { + self.replace_range(attribute.name_start, attribute.full_end, &replacement); + return true; + } + + let mut 
inserted = Vec::with_capacity(replacement.len() + 1); + inserted.push(b' '); + inserted.extend_from_slice(&replacement); + + let insertion_point = scan.name_start + scan.name_len; + self.replace_range(insertion_point, insertion_point, &inserted); + if !self + .inserted_attributes + .iter() + .any(|inserted_name| inserted_name == &comparable_name) + { + self.inserted_attributes.push(comparable_name); + } + true + } + + fn remove_attribute(&mut self, name: &[u8]) -> bool { + let Some(scan) = self.current else { + return false; + }; + + if scan.token_type != TOKEN_TYPE_TAG || scan.is_closing { + return false; + } + + let comparable_name = ascii_lowercase_vec(name); + let remove_inserted_space = self + .inserted_attributes + .iter() + .any(|inserted_name| inserted_name == &comparable_name); + let mut removed = false; + while let Some(attribute) = self.current.and_then(|current| self.find_attribute(current, name)) { + let removal_start = if remove_inserted_space + && attribute.name_start > 0 + && is_html_whitespace(self.html[attribute.name_start - 1]) + { + attribute.name_start - 1 + } else { + attribute.name_start + }; + self.replace_range(removal_start, attribute.full_end, &[]); + removed = true; + } + + if removed { + self.inserted_attributes + .retain(|inserted_name| inserted_name != &comparable_name); + } + + removed + } + + fn add_class(&mut self, class_name: &[u8]) -> bool { + let Some(scan) = self.current else { + return false; + }; + + if scan.token_type != TOKEN_TYPE_TAG || scan.is_closing { + return false; + } + + let normalized_class_name = normalize_class_bytes(class_name); + let mut classes = self.current_classes(); + if !classes.iter().any(|class| class.as_slice() == normalized_class_name.as_slice()) { + match self.get_attribute(b"class") { + AttributeValue::String => { + let mut value = self.scratch.clone(); + trim_html_whitespace_in_place(&mut value); + if !value.is_empty() { + value.push(b' '); + } + value.extend_from_slice(&normalized_class_name); + 
return self.set_attribute(b"class", &value, 2); + } + AttributeValue::Boolean | AttributeValue::Missing => { + classes.push(normalized_class_name); + } + } + } + + let value = join_classes(&classes); + self.set_attribute(b"class", &value, 2) + } + + fn remove_class(&mut self, class_name: &[u8]) -> bool { + let Some(scan) = self.current else { + return false; + }; + + if scan.token_type != TOKEN_TYPE_TAG || scan.is_closing { + return false; + } + + let normalized_class_name = normalize_class_bytes(class_name); + let classes: Vec> = self + .current_classes() + .into_iter() + .filter(|class| class.as_slice() != normalized_class_name.as_slice()) + .collect(); + + if classes.is_empty() { + let _ = self.remove_attribute(b"class"); + return true; + } + + let value = join_classes(&classes); + self.set_attribute(b"class", &value, 2) + } + + fn has_class(&mut self, class_name: &[u8]) -> u8 { + let Some(scan) = self.current else { + return 0; + }; + + if scan.token_type != TOKEN_TYPE_TAG || scan.is_closing { + return 0; + } + + let normalized_class_name = normalize_class_bytes(class_name); + + if self + .current_classes() + .iter() + .any(|class| class.as_slice() == normalized_class_name.as_slice()) + { + 2 + } else { + 1 + } + } + + fn class_list(&mut self) -> bool { + let Some(scan) = self.current else { + return false; + }; + + if scan.token_type != TOKEN_TYPE_TAG || scan.is_closing { + return false; + } + + let classes = self.current_classes(); + self.scratch.clear(); + for class in classes { + if !self.scratch.is_empty() { + self.scratch.push(0x1f); + } + self.scratch.extend_from_slice(&class); + } + + true + } + + fn current_classes(&mut self) -> Vec> { + let value = match self.get_attribute(b"class") { + AttributeValue::String => self.scratch.clone(), + AttributeValue::Boolean | AttributeValue::Missing => Vec::new(), + }; + + let mut classes = Vec::new(); + for class in value.split(|byte| is_html_whitespace(*byte)) { + if class.is_empty() || classes.iter().any(|seen: 
&Vec| seen.as_slice() == class) { + continue; + } + classes.push(normalize_class_bytes(class)); + } + + classes + } + + fn get_attribute_names_with_prefix(&mut self, prefix: &[u8]) -> bool { + let Some(scan) = self.current else { + return false; + }; + + if scan.token_type != TOKEN_TYPE_TAG || scan.is_closing { + return false; + } + + self.scratch.clear(); + let mut at = scan.name_start + scan.name_len; + let mut end = scan.tag_end.saturating_sub(1); + + if scan.has_self_closing_flag { + end = end.saturating_sub(1); + } + + while at < end { + while at < end && (is_html_whitespace(self.html[at]) || self.html[at] == b'/') { + at += 1; + } + + if at >= end { + break; + } + + let name_start = at; + while at < end && !is_attribute_name_delimiter(self.html[at]) { + at += 1; + } + + if name_start == at { + at += 1; + continue; + } + + let name_end = at; + if starts_with_ignore_ascii_case(&self.html[name_start..name_end], prefix) { + if !self.scratch.is_empty() { + self.scratch.push(0); + } + self.scratch.extend( + self.html[name_start..name_end] + .iter() + .map(u8::to_ascii_lowercase), + ); + } + + while at < end && is_html_whitespace(self.html[at]) { + at += 1; + } + + if at < end && self.html[at] == b'=' { + at += 1; + while at < end && is_html_whitespace(self.html[at]) { + at += 1; + } + + if at < end && (self.html[at] == b'\'' || self.html[at] == b'"') { + let quote = self.html[at]; + at += 1; + while at < end && self.html[at] != quote { + at += 1; + } + if at < end { + at += 1; + } + } else { + while at < end && !is_html_whitespace(self.html[at]) && self.html[at] != b'/' { + at += 1; + } + } + } + } + + true + } + + fn find_attribute(&self, scan: TagScan, wanted_name: &[u8]) -> Option { + let mut at = scan.name_start + scan.name_len; + let mut end = scan.tag_end.saturating_sub(1); + + if scan.has_self_closing_flag { + end = end.saturating_sub(1); + } + + while at < end { + while at < end && (is_html_whitespace(self.html[at]) || self.html[at] == b'/') { + at += 1; + 
} + + if at >= end { + break; + } + + let name_start = at; + while at < end && !is_attribute_name_delimiter(self.html[at]) { + at += 1; + } + + if name_start == at { + at += 1; + continue; + } + + let name_end = at; + let mut full_end = name_end; + while at < end && is_html_whitespace(self.html[at]) { + at += 1; + } + + let mut value = None; + if at < end && self.html[at] == b'=' { + at += 1; + while at < end && is_html_whitespace(self.html[at]) { + at += 1; + } + + if at < end && (self.html[at] == b'\'' || self.html[at] == b'"') { + let quote = self.html[at]; + at += 1; + let value_start = at; + while at < end && self.html[at] != quote { + at += 1; + } + value = Some((value_start, at)); + if at < end { + at += 1; + } + } else { + let value_start = at; + while at < end && !is_html_whitespace(self.html[at]) && self.html[at] != b'/' { + at += 1; + } + value = Some((value_start, at)); + } + full_end = at; + } + + if eq_ignore_ascii_case(&self.html[name_start..name_end], wanted_name) { + return Some(AttributeSpan { + name_start, + full_end, + value, + }); + } + } + + None + } + + fn replace_range(&mut self, start: usize, end: usize, replacement: &[u8]) { + self.replace_range_internal(start, end, replacement, true); + } + + fn replace_range_preserving_cursor_at_inserted_start( + &mut self, + start: usize, + end: usize, + replacement: &[u8], + ) { + self.replace_range_internal(start, end, replacement, false); + } + + fn replace_atomic_text_range(&mut self, start: usize, end: usize, replacement: &[u8]) { + let old_len = end - start; + let new_len = replacement.len(); + self.html.splice(start..end, replacement.iter().copied()); + + let delta = new_len as isize - old_len as isize; + if delta == 0 { + return; + } + + if let Some(scan) = self.current.as_mut() { + if should_shift_point(scan.token_end, end, old_len, true) { + scan.token_end = scan.token_end.saturating_add_signed(delta); + } + scan.has_self_closing_flag = scan.tag_end >= 2 && self.html[scan.tag_end - 2] == b'/'; 
+ } + + if should_shift_point(self.offset, end, old_len, true) { + self.offset = self.offset.saturating_add_signed(delta); + } + } + + fn replace_range_internal( + &mut self, + start: usize, + end: usize, + replacement: &[u8], + shift_points_at_zero_width_end: bool, + ) { + let old_len = end - start; + let new_len = replacement.len(); + self.html.splice(start..end, replacement.iter().copied()); + + let delta = new_len as isize - old_len as isize; + if delta == 0 { + return; + } + + if let Some(scan) = self.current.as_mut() { + if should_shift_point(scan.tag_end, end, old_len, shift_points_at_zero_width_end) { + scan.tag_end = scan.tag_end.saturating_add_signed(delta); + } + if should_shift_point(scan.token_end, end, old_len, shift_points_at_zero_width_end) { + scan.token_end = scan.token_end.saturating_add_signed(delta); + } + scan.has_self_closing_flag = scan.tag_end >= 2 && self.html[scan.tag_end - 2] == b'/'; + } + + if should_shift_point(self.offset, end, old_len, shift_points_at_zero_width_end) { + self.offset = self.offset.saturating_add_signed(delta); + } + } +} + +fn should_shift_point( + point: usize, + edit_end: usize, + old_len: usize, + shift_points_at_zero_width_end: bool, +) -> bool { + point > edit_end || (point == edit_end && (old_len > 0 || shift_points_at_zero_width_end)) +} + +fn trim_html_whitespace_in_place(value: &mut Vec) { + let start = value + .iter() + .position(|&byte| !is_html_whitespace(byte)) + .unwrap_or(value.len()); + let end = value + .iter() + .rposition(|&byte| !is_html_whitespace(byte)) + .map(|index| index + 1) + .unwrap_or(start); + + if start > 0 || end < value.len() { + value.copy_within(start..end, 0); + value.truncate(end - start); + } +} + +#[derive(Clone, Copy)] +enum NullTransform { + Remove, + Replace, +} + +#[derive(Clone, Copy)] +enum ScriptContentType { + JavaScript, + Json, + Other, +} + +fn classify_script_type_string(type_string: &[u8]) -> ScriptContentType { + match type_string { + b"application/ecmascript" + | 
b"application/javascript" + | b"application/x-ecmascript" + | b"application/x-javascript" + | b"text/ecmascript" + | b"text/javascript" + | b"text/javascript1.0" + | b"text/javascript1.1" + | b"text/javascript1.2" + | b"text/javascript1.3" + | b"text/javascript1.4" + | b"text/javascript1.5" + | b"text/jscript" + | b"text/livescript" + | b"text/x-ecmascript" + | b"text/x-javascript" + | b"module" => ScriptContentType::JavaScript, + b"importmap" | b"speculationrules" | b"application/json" | b"text/json" => { + ScriptContentType::Json + } + _ => ScriptContentType::Other, + } +} + +fn transform_text(input: &[u8], decode_entities: bool, null_transform: NullTransform) -> Vec { + let mut output = Vec::with_capacity(input.len()); + let mut at = 0; + + while at < input.len() { + if input[at] == 0 { + match null_transform { + NullTransform::Remove => {} + NullTransform::Replace => output.extend_from_slice("\u{FFFD}".as_bytes()), + } + at += 1; + continue; + } + + if decode_entities && input[at] == b'&' { + if let Some((decoded, consumed)) = decode_character_reference(&input[at..]) { + if decoded == '\0' { + match null_transform { + NullTransform::Remove => {} + NullTransform::Replace => output.extend_from_slice("\u{FFFD}".as_bytes()), + } + } else { + let mut buffer = [0; 4]; + output.extend_from_slice(decoded.encode_utf8(&mut buffer).as_bytes()); + } + at += consumed; + continue; + } + } + + output.push(input[at]); + at += 1; + } + + output +} + +fn strip_initial_newline(input: &[u8]) -> &[u8] { + if input.starts_with(b"\r\n") { + return &input[2..]; + } + + if input.starts_with(b"\n") { + return &input[1..]; + } + + if let Some(('\n', consumed)) = decode_character_reference(input) { + return &input[consumed..]; + } + + input +} + +fn pi_target_span(html: &[u8], scan: TagScan) -> Option<(usize, usize)> { + if scan.token_type != TOKEN_TYPE_COMMENT { + return None; + } + + if scan.tag_start + 3 > scan.token_end || !html[scan.tag_start..scan.token_end].starts_with(b"= 
scan.token_end || !is_html_whitespace(html[target_end]) { + return None; + } + + Some((target_start, target_end)) +} + +fn is_pi_target_char(byte: u8) -> bool { + byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'_' | b':' | b'.') +} + +fn escape_html_text(input: &[u8]) -> Vec { + let mut output = Vec::with_capacity(input.len()); + for &byte in input { + match byte { + b'<' => output.extend_from_slice(b"<"), + b'>' => output.extend_from_slice(b">"), + b'&' => output.extend_from_slice(b"&"), + b'"' => output.extend_from_slice(b"""), + b'\'' => output.extend_from_slice(b"'"), + _ => output.push(byte), + } + } + output +} + +fn escape_script_text(input: &[u8]) -> Vec { + let mut output = Vec::with_capacity(input.len()); + let mut at = 0; + + while at < input.len() { + if let Some((script_start, escape_at)) = script_tag_match_at(input, at) { + output.extend_from_slice(&input[at..escape_at]); + let escaped = if input[escape_at].is_ascii_uppercase() { + b"\\u0053" + } else { + b"\\u0073" + }; + output.extend_from_slice(escaped); + at = escape_at + 1; + if script_start == at { + at += 1; + } + continue; + } + + output.push(input[at]); + at += 1; + } + + output +} + +fn find_case_insensitive_script_tag(input: &[u8]) -> Option { + let mut at = 0; + while at < input.len() { + if script_tag_match_at(input, at).is_some() { + return Some(at); + } + at += 1; + } + None +} + +fn script_tag_match_at(input: &[u8], at: usize) -> Option<(usize, usize)> { + if at >= input.len() || input[at] != b'<' { + return None; + } + + let (name_start, escape_at) = if at + 1 < input.len() && input[at + 1] == b'/' { + (at + 2, at + 2) + } else { + (at + 1, at + 1) + }; + + let name_end = name_start + b"script".len(); + if name_end > input.len() || !eq_ignore_ascii_case(&input[name_start..name_end], b"script") { + return None; + } + + if name_end < input.len() && !is_tag_name_delimiter(input[name_end]) { + return None; + } + + Some((at, escape_at)) +} + +fn escape_rawtext_closer(input: &[u8], 
tag_name: &[u8], prefix: &[u8]) -> Vec { + let mut output = Vec::with_capacity(input.len()); + let mut at = 0; + + while at < input.len() { + if starts_with_rawtext_closer(input, at, tag_name) { + output.extend_from_slice(prefix); + output.extend_from_slice(&input[at + 2..at + 2 + tag_name.len()]); + at += 2 + tag_name.len(); + continue; + } + + output.push(input[at]); + at += 1; + } + + output +} + +fn escape_rcdata_closer(input: &[u8], tag_name: &[u8]) -> Vec { + escape_rawtext_closer(input, tag_name, b"</") +} + +fn starts_with_rawtext_closer(input: &[u8], at: usize, tag_name: &[u8]) -> bool { + if at + 2 + tag_name.len() > input.len() || input[at] != b'<' || input[at + 1] != b'/' { + return false; + } + + let name_start = at + 2; + let name_end = name_start + tag_name.len(); + if !eq_ignore_ascii_case(&input[name_start..name_end], tag_name) { + return false; + } + + name_end == input.len() || is_tag_name_delimiter(input[name_end]) +} + +fn trim_ascii_whitespace(value: &[u8]) -> &[u8] { + let start = value + .iter() + .position(|&byte| !is_html_whitespace(byte)) + .unwrap_or(value.len()); + let end = value + .iter() + .rposition(|&byte| !is_html_whitespace(byte)) + .map(|index| index + 1) + .unwrap_or(start); + &value[start..end] +} + +fn scan_next_tag(html: &[u8], offset: usize) -> Option { + let mut at = offset.min(html.len()); + + loop { + match scan_next_token(html, at) { + ScanResult::Token(scan) if scan.token_type == TOKEN_TYPE_TAG => return Some(scan), + ScanResult::Token(scan) => at = scan.token_end, + ScanResult::Incomplete | ScanResult::None => return None, + } + } +} + +enum ScanResult { + Token(TagScan), + Incomplete, + None, +} + +fn scan_next_token(html: &[u8], offset: usize) -> ScanResult { + scan_next_token_in_namespace(html, offset, NAMESPACE_HTML) +} + +fn scan_next_token_in_namespace(html: &[u8], offset: usize, namespace: u8) -> ScanResult { + let len = html.len(); + let at = offset.min(len); + + if at >= len { + return ScanResult::None; + } + 
+ let Some(tag_start) = find_next_token_start(html, at) else { + return ScanResult::Token(text_scan(at, len)); + }; + + if tag_start > at { + return ScanResult::Token(text_scan(at, tag_start)); + } + + if tag_start + 1 >= len { + return ScanResult::Incomplete; + } + + if starts_with_ignore_ascii_case(&html[tag_start..], b"") { + return ScanResult::Token(non_tag_scan(tag_start, tag_start + 5, TOKEN_TYPE_COMMENT)); + } + + if tag_start + 5 < html.len() && html[tag_start..].starts_with(b"") { + return ScanResult::Token(non_tag_scan(tag_start, tag_start + 6, TOKEN_TYPE_COMMENT)); + } + + let Some(token_end) = find_comment_end(html, tag_start + 4) else { + return ScanResult::Incomplete; + }; + + ScanResult::Token(non_tag_scan(tag_start, token_end, TOKEN_TYPE_COMMENT)) +} + +fn find_comment_end(html: &[u8], offset: usize) -> Option { + let mut at = offset; + + while at + 2 < html.len() { + let relative = find_subslice(&html[at..], b"--")?; + let dash_start = at + relative; + let after_dashes = dash_start + 2; + + if after_dashes < html.len() && html[after_dashes] == b'>' { + return Some(after_dashes + 1); + } + + if after_dashes + 1 < html.len() + && html[after_dashes] == b'!' 
+ && html[after_dashes + 1] == b'>' + { + return Some(after_dashes + 2); + } + + at = dash_start + 1; + } + + None +} + +fn scan_cdata(html: &[u8], tag_start: usize, namespace: u8) -> ScanResult { + if namespace == NAMESPACE_HTML { + return scan_markup_declaration(html, tag_start, TOKEN_TYPE_COMMENT); + } + + let Some(relative_end) = find_subslice(&html[tag_start + 9..], b"]]>") else { + return scan_markup_declaration(html, tag_start, TOKEN_TYPE_COMMENT); + }; + + let token_end = tag_start + 9 + relative_end + 3; + ScanResult::Token(non_tag_scan(tag_start, token_end, TOKEN_TYPE_CDATA)) +} + +fn scan_markup_declaration(html: &[u8], tag_start: usize, token_type: u8) -> ScanResult { + let Some(relative_end) = html[tag_start + 2..].iter().position(|&byte| byte == b'>') else { + return ScanResult::Incomplete; + }; + + let token_end = tag_start + 2 + relative_end + 1; + ScanResult::Token(non_tag_scan(tag_start, token_end, token_type)) +} + +fn non_tag_scan(start: usize, end: usize, token_type: u8) -> TagScan { + TagScan { + tag_start: start, + tag_end: end, + name_start: start, + name_len: 0, + is_closing: false, + has_self_closing_flag: false, + token_end: end, + token_type, + } +} + +fn is_special_atomic_tag(tag_name: &[u8]) -> bool { + matches_ignore_ascii_case( + tag_name, + &[ + &b"IFRAME"[..], + &b"NOEMBED"[..], + &b"NOFRAMES"[..], + &b"SCRIPT"[..], + &b"STYLE"[..], + &b"TEXTAREA"[..], + &b"TITLE"[..], + &b"XMP"[..], + ], + ) +} + +fn find_special_closer(html: &[u8], offset: usize, tag_name: &[u8]) -> Option { + if eq_ignore_ascii_case(tag_name, b"SCRIPT") { + return find_script_closer(html, offset); + } + + let mut at = offset; + + while at + 3 + tag_name.len() <= html.len() { + let relative = find_subslice(&html[at..], b" Option { + let mut at = offset; + let mut escaped = false; + let mut double_escaped = false; + + while at < html.len() { + if html[at..].starts_with(b"") { + at += 5; + continue; + } + + if html[at..].starts_with(b"") || 
html[at..].starts_with(b"--!>")) + { + escaped = false; + double_escaped = false; + at += if html[at..].starts_with(b"-->") { 3 } else { 4 }; + continue; + } + + if starts_with_ignore_ascii_case(&html[at..], b" Option { + if needle.is_empty() { + return Some(0); + } + + haystack + .windows(needle.len()) + .position(|candidate| candidate == needle) +} + +fn find_last_subslice(haystack: &[u8], needle: &[u8]) -> Option { + if needle.is_empty() { + return Some(haystack.len()); + } + + haystack + .windows(needle.len()) + .rposition(|candidate| candidate == needle) +} + +fn matches_ignore_ascii_case(value: &[u8], candidates: &[&[u8]]) -> bool { + candidates + .iter() + .any(|candidate| value.len() == candidate.len() && eq_ignore_ascii_case(value, candidate)) +} + +fn is_tag_name_delimiter(byte: u8) -> bool { + matches!(byte, b' ' | b'\t' | b'\n' | b'\x0c' | b'\r' | b'/' | b'>') +} + +fn is_attribute_name_delimiter(byte: u8) -> bool { + matches!(byte, b' ' | b'\t' | b'\n' | b'\x0c' | b'\r' | b'/' | b'>' | b'=') +} + +fn is_html_whitespace(byte: u8) -> bool { + matches!(byte, b' ' | b'\t' | b'\n' | b'\x0c' | b'\r') +} + +fn is_valid_attribute_name(name: &[u8]) -> bool { + if name.is_empty() { + return false; + } + + name.iter().all(|&byte| { + byte > 0x1f + && !matches!( + byte, + b' ' | b'\t' | b'\n' | b'\x0c' | b'\r' | b'"' | b'\'' | b'>' | b'&' | b'<' | b'/' | b'=' + ) + }) +} + +fn serialize_attribute(name: &[u8], value: &[u8], value_kind: u8) -> Vec { + let mut output = Vec::new(); + output.extend_from_slice(name); + + if value_kind == 1 { + return output; + } + + output.extend_from_slice(b"=\""); + output.extend_from_slice(&encode_html_attribute(value)); + output.push(b'"'); + output +} + +fn encode_html_attribute(input: &[u8]) -> Vec { + let mut output = Vec::with_capacity(input.len()); + + for &byte in input { + match byte { + b'&' => output.extend_from_slice(b"&"), + b'"' => output.extend_from_slice(b"""), + b'\'' => output.extend_from_slice(b"'"), + b'<' => 
output.extend_from_slice(b"<"), + b'>' => output.extend_from_slice(b">"), + _ => output.push(byte), + } + } + + output +} + +fn join_classes(classes: &[Vec]) -> Vec { + let mut output = Vec::new(); + + for class in classes { + if !output.is_empty() { + output.push(b' '); + } + output.extend_from_slice(class); + } + + output +} + +fn normalize_class_bytes(class_name: &[u8]) -> Vec { + let mut output = Vec::with_capacity(class_name.len()); + + for &byte in class_name { + if byte == 0 { + output.extend_from_slice("\u{fffd}".as_bytes()); + } else { + output.push(byte); + } + } + + output +} + +fn ascii_lowercase_vec(value: &[u8]) -> Vec { + value.iter().map(u8::to_ascii_lowercase).collect() +} + +fn find_tag_end(html: &[u8], offset: usize) -> Option { + let mut at = offset; + let mut quote = None; + let mut after_equals = false; + + while at < html.len() { + let byte = html[at]; + + match quote { + Some(quote_byte) if byte == quote_byte => quote = None, + Some(_) => {} + None if after_equals && (byte == b'\'' || byte == b'"') => { + quote = Some(byte); + after_equals = false; + } + None if byte == b'=' => after_equals = true, + None if is_html_whitespace(byte) => {} + None if byte == b'/' => {} + None if byte == b'>' => return Some(at + 1), + None => after_equals = false, + } + + at += 1; + } + + None +} + +fn eq_ignore_ascii_case(left: &[u8], right: &[u8]) -> bool { + left.iter() + .zip(right.iter()) + .all(|(&left, &right)| left.eq_ignore_ascii_case(&right)) +} + +fn starts_with_ignore_ascii_case(value: &[u8], prefix: &[u8]) -> bool { + value.len() >= prefix.len() && eq_ignore_ascii_case(&value[..prefix.len()], prefix) +} + +fn decode_html_attribute(input: &[u8]) -> Vec { + let mut output = Vec::with_capacity(input.len()); + let mut at = 0; + + while at < input.len() { + if input[at] != b'&' { + output.push(input[at]); + at += 1; + continue; + } + + let Some((decoded, consumed)) = decode_character_reference(&input[at..]) else { + output.push(input[at]); + at += 1; + 
continue; + }; + + let mut buffer = [0; 4]; + output.extend_from_slice(decoded.encode_utf8(&mut buffer).as_bytes()); + at += consumed; + } + + output +} + +fn decode_character_reference(input: &[u8]) -> Option<(char, usize)> { + if input.len() < 3 || input[0] != b'&' { + return None; + } + + if input[1] == b'#' { + let mut at = 2; + let radix = if at < input.len() && (input[at] == b'x' || input[at] == b'X') { + at += 1; + 16 + } else { + 10 + }; + + let digits_start = at; + while at < input.len() && input[at].is_ascii_hexdigit() { + if radix == 10 && !input[at].is_ascii_digit() { + break; + } + at += 1; + } + + if at == digits_start { + return None; + } + + let digits = std::str::from_utf8(&input[digits_start..at]).ok()?; + let value = u32::from_str_radix(digits, radix).ok()?; + let consumed = if at < input.len() && input[at] == b';' { at + 1 } else { at }; + return char::from_u32(value).map(|decoded| (decoded, consumed)); + } + + let mut at = 1; + while at < input.len() && input[at].is_ascii_alphanumeric() { + at += 1; + } + + if at == 1 { + return None; + } + + let has_semicolon = at < input.len() && input[at] == b';'; + let name = std::str::from_utf8(&input[1..at]).ok()?; + let decoded = match name { + "amp" => '&', + "apos" => '\'', + "dagger" => '†', + "gt" => '>', + "hellip" => '…', + "lt" => '<', + "nbsp" => '\u{00a0}', + "notin" => '∉', + "quot" => '"', + _ => return None, + }; + + Some((decoded, at + usize::from(has_semicolon))) +} + +#[cfg(test)] +mod tests { + use super::{scan_next_tag, AttributeValue, TagProcessor, TagScan, NAMESPACE_HTML, TOKEN_TYPE_TAG}; + use std::ptr; + + #[test] + fn scans_basic_start_tag() { + assert_eq!( + scan_next_tag(b"one
two", 0).unwrap(), + TagScan { + tag_start: 4, + tag_end: 19, + name_start: 5, + name_len: 3, + is_closing: false, + has_self_closing_flag: false, + token_end: 19, + token_type: TOKEN_TYPE_TAG, + } + ); + } + + #[test] + fn scans_basic_closing_tag() { + assert_eq!( + scan_next_tag(b"

text

", 3).unwrap(), + TagScan { + tag_start: 7, + tag_end: 11, + name_start: 9, + name_len: 1, + is_closing: true, + has_self_closing_flag: false, + token_end: 11, + token_type: TOKEN_TYPE_TAG, + } + ); + } + + #[test] + fn skips_non_tag_less_than_sequences() { + let scan = scan_next_tag(b"1 < 2 ", 0).unwrap(); + + assert_eq!(scan.tag_start, 23); + assert_eq!(scan.name_start, 24); + assert_eq!(scan.name_len, 4); + } + + #[test] + fn ignores_gt_inside_quoted_attributes() { + let scan = scan_next_tag(br#"
ok
"#, 0).unwrap(); + + assert_eq!(scan.tag_end, 19); + } + + #[test] + fn reports_incomplete_tag_as_not_found() { + assert!(scan_next_tag(br#"
-->"; + let before = scan_next_tag(html, 0).unwrap(); + assert_eq!(&html[before.name_start..before.name_start + before.name_len], b"img"); + + let after = scan_next_tag(html, before.token_end).unwrap(); + assert_eq!(&html[after.name_start..after.name_start + after.name_len], b"img"); + assert!(html[after.tag_start..after.token_end].starts_with(b"
"; + let first = scan_next_tag(html, 0).unwrap(); + let after = scan_next_tag(html, first.token_end).unwrap(); + assert!(html[after.tag_start..after.token_end].starts_with(b"
a HTML Tag]]>"; + let tag = scan_next_tag(html, 0).unwrap(); + assert_eq!(&html[tag.name_start..tag.name_start + tag.name_len], b"strong"); + } + + #[test] + fn tag_processor_finds_start_tags_matching_query() { + let mut processor = TagProcessor { + html: b"

".to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, b"p".as_ptr(), 1, false) + }); + + assert_eq!( + processor.current.unwrap(), + TagScan { + tag_start: 18, + tag_end: 21, + name_start: 19, + name_len: 1, + is_closing: false, + has_self_closing_flag: false, + token_end: 21, + token_type: TOKEN_TYPE_TAG, + } + ); + } + + #[test] + fn tag_processor_retains_current_token_after_eof() { + let mut processor = TagProcessor { + html: b"".to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, b"body".as_ptr(), 4, false) + }); + let body = processor.current.unwrap(); + + assert!(!unsafe { + super::wp_html_api_rust_tag_processor_next_tag( + &mut processor, + b"missing".as_ptr(), + b"missing".len(), + false, + ) + }); + assert_eq!(processor.current.unwrap(), body); + } + + #[test] + fn tag_processor_can_visit_closing_tags() { + let mut processor = TagProcessor { + html: b"

".to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, b"div".as_ptr(), 3, true) + }); + assert!(!unsafe { super::wp_html_api_rust_tag_processor_is_tag_closer(&processor) }); + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, b"div".as_ptr(), 3, true) + }); + assert!(unsafe { super::wp_html_api_rust_tag_processor_is_tag_closer(&processor) }); + } + + #[test] + fn tag_processor_reports_self_closing_flag() { + let mut processor = TagProcessor { + html: b"
".to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, ptr::null(), 0, false) + }); + assert!(!unsafe { super::wp_html_api_rust_tag_processor_has_self_closing_flag(&processor) }); + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, ptr::null(), 0, false) + }); + assert!(unsafe { super::wp_html_api_rust_tag_processor_has_self_closing_flag(&processor) }); + } + + #[test] + fn tag_processor_reads_attributes() { + let mut processor = TagProcessor { + html: br#"
"#.to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, ptr::null(), 0, false) + }); + + assert!(matches!(processor.get_attribute(b"enabled"), AttributeValue::Boolean)); + assert!(matches!(processor.get_attribute(b"data-ID"), AttributeValue::String)); + assert_eq!(processor.scratch, b"the \"one\""); + assert!(matches!(processor.get_attribute(b"a"), AttributeValue::Boolean)); + assert!(matches!(processor.get_attribute(b"b"), AttributeValue::Boolean)); + assert!(matches!(processor.get_attribute(b"c"), AttributeValue::String)); + assert_eq!(processor.scratch, b"test"); + } + + #[test] + fn tag_processor_sets_and_removes_attributes() { + let mut processor = TagProcessor { + html: br#"
Text
"#.to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, ptr::null(), 0, false) + }); + + assert!(processor.set_attribute(b"data-enabled", b"abc", 2)); + assert_eq!( + std::str::from_utf8(&processor.html).unwrap(), + r#"
Text
"# + ); + + assert!(processor.set_attribute(b"test", br#""<&"#, 2)); + assert_eq!( + std::str::from_utf8(&processor.html).unwrap(), + r#"
Text
"# + ); + + assert!(processor.remove_attribute(b"data-enabled")); + assert_eq!( + std::str::from_utf8(&processor.html).unwrap(), + r#"
Text
"# + ); + } + + #[test] + fn tag_processor_lists_attribute_names_with_prefix() { + let mut processor = TagProcessor { + html: br#"
"#.to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, ptr::null(), 0, false) + }); + + assert!(processor.get_attribute_names_with_prefix(b"data-")); + assert_eq!(processor.scratch, b"data-enabled\0data-test-id"); + + assert!(processor.get_attribute_names_with_prefix(b"aria-")); + assert!(processor.scratch.is_empty()); + } + + #[test] + fn tag_processor_updates_classes() { + let mut processor = TagProcessor { + html: br#"
"#.to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, ptr::null(), 0, false) + }); + + assert_eq!(processor.has_class(b"one"), 2); + assert_eq!(processor.has_class(b"three"), 1); + assert!(processor.add_class(b"three")); + assert!(processor.remove_class(b"two")); + assert!(processor.class_list()); + assert_eq!(processor.scratch, b"one\x1fthree"); + assert_eq!( + std::str::from_utf8(&processor.html).unwrap(), + r#"
"# + ); + } + + #[test] + fn tag_processor_trims_class_edges_when_adding_class() { + let mut processor = TagProcessor { + html: br#"
"#.to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, ptr::null(), 0, false) + }); + + assert!(processor.add_class(b"foo-class")); + assert_eq!( + std::str::from_utf8(&processor.html).unwrap(), + r#"
"# + ); + } +} diff --git a/ext/html-api-rust/wp_html_api_rust.c b/ext/html-api-rust/wp_html_api_rust.c new file mode 100644 index 0000000000000..63d276cb84aa0 --- /dev/null +++ b/ext/html-api-rust/wp_html_api_rust.c @@ -0,0 +1,2561 @@ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "php.h" +#include "ext/standard/info.h" +#include +#include +#include + +#include "php_wp_html_api_rust.h" + +typedef struct _wp_html_api_rust_tag_scan { + size_t tag_start; + size_t tag_end; + size_t name_start; + size_t name_len; + bool is_closing; + bool has_self_closing_flag; + size_t token_end; + unsigned char token_type; +} wp_html_api_rust_tag_scan; + +typedef struct _wp_html_api_rust_byte_slice { + const unsigned char *ptr; + size_t len; +} wp_html_api_rust_byte_slice; + +typedef struct _wp_html_tag_processor_object { + void *native; + zend_long seek_count; + zend_object std; +} wp_html_tag_processor_object; + +typedef struct _wp_html_api_rust_text_replacement { + size_t start; + size_t length; + zend_string *text; +} wp_html_api_rust_text_replacement; + +static zend_class_entry *wp_html_tag_processor_ce; +static zend_class_entry *wp_html_processor_ce; +static zend_object_handlers wp_html_tag_processor_handlers; + +extern const char *wp_html_api_rust_core_version(void); +extern bool wp_html_api_rust_scan_next_tag( + const unsigned char *html, + size_t html_len, + size_t offset, + wp_html_api_rust_tag_scan *out +); +extern void *wp_html_api_rust_tag_processor_new(const unsigned char *html, size_t html_len); +extern void wp_html_api_rust_tag_processor_free(void *processor); +extern bool wp_html_api_rust_tag_processor_next_tag( + void *processor, + const unsigned char *query, + size_t query_len, + bool visit_closers +); +extern bool wp_html_api_rust_tag_processor_next_token(void *processor); +extern void wp_html_api_rust_tag_processor_seek(void *processor, size_t offset); +extern void wp_html_api_rust_tag_processor_set_namespace(void *processor, unsigned char 
namespace_id); +extern bool wp_html_api_rust_tag_processor_apply_lexical_update( + void *processor, + size_t start, + size_t length, + const unsigned char *replacement, + size_t replacement_len +); +extern bool wp_html_api_rust_tag_processor_current_span( + const void *processor, + size_t *start, + size_t *length +); +extern unsigned char wp_html_api_rust_tag_processor_current_token_type(const void *processor); +extern bool wp_html_api_rust_tag_processor_paused_at_incomplete(const void *processor); +extern bool wp_html_api_rust_tag_processor_get_modifiable_text( + void *processor, + wp_html_api_rust_byte_slice *out +); +extern bool wp_html_api_rust_tag_processor_set_modifiable_text( + void *processor, + const unsigned char *text, + size_t text_len +); +extern unsigned char wp_html_api_rust_tag_processor_current_comment_type(const void *processor); +extern unsigned char wp_html_api_rust_tag_processor_script_content_type(const void *processor); +extern bool wp_html_api_rust_tag_processor_get_tag( + const void *processor, + wp_html_api_rust_byte_slice *out +); +extern bool wp_html_api_rust_tag_processor_is_tag_closer(const void *processor); +extern bool wp_html_api_rust_tag_processor_has_self_closing_flag(const void *processor); +extern unsigned char wp_html_api_rust_tag_processor_get_attribute( + void *processor, + const unsigned char *name, + size_t name_len, + wp_html_api_rust_byte_slice *out +); +extern unsigned char wp_html_api_rust_tag_processor_get_attribute_names_with_prefix( + void *processor, + const unsigned char *prefix, + size_t prefix_len, + wp_html_api_rust_byte_slice *out +); +extern bool wp_html_api_rust_tag_processor_set_attribute( + void *processor, + const unsigned char *name, + size_t name_len, + const unsigned char *value, + size_t value_len, + unsigned char value_kind +); +extern bool wp_html_api_rust_tag_processor_remove_attribute( + void *processor, + const unsigned char *name, + size_t name_len +); +extern bool 
wp_html_api_rust_tag_processor_add_class( + void *processor, + const unsigned char *class_name, + size_t class_name_len +); +extern bool wp_html_api_rust_tag_processor_remove_class( + void *processor, + const unsigned char *class_name, + size_t class_name_len +); +extern unsigned char wp_html_api_rust_tag_processor_has_class( + void *processor, + const unsigned char *class_name, + size_t class_name_len +); +extern unsigned char wp_html_api_rust_tag_processor_class_list( + void *processor, + wp_html_api_rust_byte_slice *out +); +extern bool wp_html_api_rust_tag_processor_get_html( + const void *processor, + wp_html_api_rust_byte_slice *out +); + +PHP_INI_BEGIN() + PHP_INI_ENTRY("wp_html_api_rust.replace_html_api", "0", PHP_INI_SYSTEM, NULL) +PHP_INI_END() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_api_rust_version, 0, 0, IS_STRING, 0) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_wp_html_api_rust_scan_next_tag, 0, 1, MAY_BE_ARRAY | MAY_BE_FALSE) + ZEND_ARG_TYPE_INFO(0, html, IS_STRING, 0) + ZEND_ARG_TYPE_INFO_WITH_DEFAULT_VALUE(0, offset, IS_LONG, 0, "0") +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_INFO_EX(arginfo_wp_html_tag_processor_construct, 0, 0, 1) + ZEND_ARG_INFO(0, html) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_next_tag, 0, 0, _IS_BOOL, 0) + ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, query, "null") +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_get_tag, 0, 0, IS_STRING, 1) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_INFO_EX(arginfo_wp_html_tag_processor_get_attribute, 0, 0, 1) + ZEND_ARG_INFO(0, name) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_get_attribute_names_with_prefix, 0, 1, IS_ARRAY, 1) + ZEND_ARG_INFO(0, prefix) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_set_attribute, 0, 2, _IS_BOOL, 0) + ZEND_ARG_INFO(0, name) + 
ZEND_ARG_INFO(0, value) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_remove_attribute, 0, 1, _IS_BOOL, 0) + ZEND_ARG_INFO(0, name) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_class_mutation, 0, 1, _IS_BOOL, 0) + ZEND_ARG_INFO(0, class_name) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_has_class, 0, 1, _IS_BOOL, 1) + ZEND_ARG_INFO(0, wanted_class) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_INFO_EX(arginfo_wp_html_tag_processor_class_list, 0, 0, 0) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_bool, 0, 0, _IS_BOOL, 0) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_nullable_string, 0, 0, IS_STRING, 1) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_set_modifiable_text, 0, 1, _IS_BOOL, 0) + ZEND_ARG_TYPE_INFO(0, plaintext_content, IS_STRING, 0) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_INFO_EX(arginfo_wp_html_tag_processor_nullable_mixed, 0, 0, 0) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_bookmark, 0, 1, _IS_BOOL, 0) + ZEND_ARG_INFO(0, bookmark_name) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_change_namespace, 0, 1, _IS_BOOL, 0) + ZEND_ARG_TYPE_INFO(0, new_namespace, IS_STRING, 0) +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_INFO_EX(arginfo_wp_html_processor_construct, 0, 0, 1) + ZEND_ARG_INFO(0, html) + ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, use_the_static_create_methods_instead, "null") +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_INFO_EX(arginfo_wp_html_processor_create_fragment, 0, 0, 1) + ZEND_ARG_INFO(0, html) + ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, context, "\"\"") + ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, encoding, "\"UTF-8\"") +ZEND_END_ARG_INFO() + 
+ZEND_BEGIN_ARG_INFO_EX(arginfo_wp_html_processor_create_full_parser, 0, 0, 1) + ZEND_ARG_INFO(0, html) + ZEND_ARG_INFO_WITH_DEFAULT_VALUE(0, encoding, "\"UTF-8\"") +ZEND_END_ARG_INFO() + +ZEND_BEGIN_ARG_WITH_RETURN_TYPE_INFO_EX(arginfo_wp_html_tag_processor_get_html, 0, 0, IS_STRING, 0) +ZEND_END_ARG_INFO() + +static inline wp_html_tag_processor_object *wp_html_tag_processor_from_obj(zend_object *obj) +{ + return (wp_html_tag_processor_object *) ((char *) obj - XtOffsetOf(wp_html_tag_processor_object, std)); +} + +#define Z_WP_HTML_TAG_PROCESSOR_P(zv) wp_html_tag_processor_from_obj(Z_OBJ_P((zv))) + +static zend_string *wp_html_api_rust_uppercase_ascii_slice(const unsigned char *ptr, size_t len) +{ + zend_string *string = zend_string_alloc(len, 0); + size_t i; + + for (i = 0; i < len; i++) { + unsigned char byte = ptr[i]; + ZSTR_VAL(string)[i] = (byte >= 'a' && byte <= 'z') ? (char) (byte - 32) : (char) byte; + } + + ZSTR_VAL(string)[len] = '\0'; + return string; +} + +static bool wp_html_api_rust_ascii_eq_ci(const unsigned char *left, size_t left_len, const char *right) +{ + size_t i; + size_t right_len = strlen(right); + + if (left_len != right_len) { + return false; + } + + for (i = 0; i < left_len; i++) { + unsigned char left_byte = left[i]; + unsigned char right_byte = (unsigned char) right[i]; + + if (left_byte >= 'a' && left_byte <= 'z') { + left_byte = (unsigned char) (left_byte - 32); + } + + if (right_byte >= 'a' && right_byte <= 'z') { + right_byte = (unsigned char) (right_byte - 32); + } + + if (left_byte != right_byte) { + return false; + } + } + + return true; +} + +static void wp_html_api_rust_doing_it_wrong(const char *function_name, const char *message, const char *version) +{ + zval callable; + zval retval; + zval params[3]; + zend_fcall_info fci; + zend_fcall_info_cache fcc; + + if (!zend_hash_str_exists(CG(function_table), "_doing_it_wrong", sizeof("_doing_it_wrong") - 1)) { + return; + } + + ZVAL_STRING(&callable, "_doing_it_wrong"); + 
ZVAL_STRING(¶ms[0], function_name); + ZVAL_STRING(¶ms[1], message); + ZVAL_STRING(¶ms[2], version); + + memset(&fci, 0, sizeof(fci)); + memset(&fcc, 0, sizeof(fcc)); + + fci.size = sizeof(fci); + fci.function_name = callable; + fci.retval = &retval; + fci.params = params; + fci.param_count = 3; + + if (SUCCESS == zend_call_function(&fci, &fcc)) { + zval_ptr_dtor(&retval); + } + + zval_ptr_dtor(¶ms[2]); + zval_ptr_dtor(¶ms[1]); + zval_ptr_dtor(¶ms[0]); + zval_ptr_dtor(&callable); +} + +static bool wp_html_api_rust_is_valid_attribute_name(const char *name, size_t name_len) +{ + size_t i; + uint32_t codepoint; + unsigned char byte; + + if (0 == name_len) { + return false; + } + + for (i = 0; i < name_len; i++) { + byte = (unsigned char) name[i]; + + if ( + byte <= 0x1f || + '"' == byte || + '\'' == byte || + '>' == byte || + '&' == byte || + '<' == byte || + '/' == byte || + ' ' == byte || + '=' == byte + ) { + return false; + } + + if (byte < 0x80) { + continue; + } + + codepoint = 0; + if ((byte & 0xe0) == 0xc0 && i + 1 < name_len) { + codepoint = ((uint32_t) (byte & 0x1f) << 6) | + ((uint32_t) ((unsigned char) name[i + 1] & 0x3f)); + i += 1; + } else if ((byte & 0xf0) == 0xe0 && i + 2 < name_len) { + codepoint = ((uint32_t) (byte & 0x0f) << 12) | + ((uint32_t) ((unsigned char) name[i + 1] & 0x3f) << 6) | + ((uint32_t) ((unsigned char) name[i + 2] & 0x3f)); + i += 2; + } else if ((byte & 0xf8) == 0xf0 && i + 3 < name_len) { + codepoint = ((uint32_t) (byte & 0x07) << 18) | + ((uint32_t) ((unsigned char) name[i + 1] & 0x3f) << 12) | + ((uint32_t) ((unsigned char) name[i + 2] & 0x3f) << 6) | + ((uint32_t) ((unsigned char) name[i + 3] & 0x3f)); + i += 3; + } + + if ( + (codepoint >= 0xfdd0 && codepoint <= 0xfdef) || + (codepoint <= 0x10ffff && 0xfffe == (codepoint & 0xfffe)) + ) { + return false; + } + } + + return true; +} + +static void wp_html_tag_processor_update_parser_state(zval *object, const char *state) +{ + zend_update_property_string( + 
wp_html_tag_processor_ce, + Z_OBJ_P(object), + "parser_state", + sizeof("parser_state") - 1, + state + ); +} + +static void wp_html_tag_processor_update_parser_state_from_native(zval *object, void *native) +{ + const char *state = "STATE_READY"; + + switch (wp_html_api_rust_tag_processor_current_token_type(native)) { + case 1: + state = "STATE_MATCHED_TAG"; + break; + case 2: + state = "STATE_TEXT_NODE"; + break; + case 3: + state = "STATE_COMMENT"; + break; + case 4: + state = "STATE_DOCTYPE"; + break; + case 5: + state = "STATE_CDATA_NODE"; + break; + case 6: + state = "STATE_PRESUMPTUOUS_TAG"; + break; + case 7: + state = "STATE_FUNKY_COMMENT"; + break; + } + + wp_html_tag_processor_update_parser_state(object, state); +} + +static void wp_html_tag_processor_sync_html_property(zval *object, void *native) +{ + wp_html_api_rust_byte_slice html; + + if (NULL == native || !wp_html_api_rust_tag_processor_get_html(native, &html)) { + zend_update_property_string( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "html", + sizeof("html") - 1, + "" + ); + return; + } + + zend_update_property_stringl( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "html", + sizeof("html") - 1, + (const char *) html.ptr, + html.len + ); +} + +static zval *wp_html_tag_processor_read_bookmarks(zval *object, zval *rv) +{ + zval *bookmarks = zend_read_property( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "bookmarks", + sizeof("bookmarks") - 1, + 0, + rv + ); + + if (IS_ARRAY != Z_TYPE_P(bookmarks)) { + zval empty_bookmarks; + + array_init(&empty_bookmarks); + zend_update_property( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "bookmarks", + sizeof("bookmarks") - 1, + &empty_bookmarks + ); + zval_ptr_dtor(&empty_bookmarks); + + bookmarks = zend_read_property( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "bookmarks", + sizeof("bookmarks") - 1, + 0, + rv + ); + } + + return bookmarks; +} + +static bool wp_html_tag_processor_read_long_property(zval *object, const char *name, size_t 
name_len, zend_long *out) +{ + zval rv; + zval *value; + + if (IS_OBJECT != Z_TYPE_P(object)) { + return false; + } + + value = zend_read_property(Z_OBJCE_P(object), Z_OBJ_P(object), name, name_len, 1, &rv); + if (IS_LONG == Z_TYPE_P(value)) { + *out = Z_LVAL_P(value); + return true; + } + + if (IS_DOUBLE == Z_TYPE_P(value)) { + *out = (zend_long) Z_DVAL_P(value); + return true; + } + + return false; +} + +static bool wp_html_tag_processor_read_string_property(zval *object, const char *name, size_t name_len, zend_string **out) +{ + zval rv; + zval *value; + + if (IS_OBJECT != Z_TYPE_P(object)) { + return false; + } + + value = zend_read_property(Z_OBJCE_P(object), Z_OBJ_P(object), name, name_len, 1, &rv); + if (IS_STRING != Z_TYPE_P(value)) { + return false; + } + + *out = Z_STR_P(value); + return true; +} + +static bool wp_html_api_rust_is_html_whitespace(unsigned char byte) +{ + return ' ' == byte || '\t' == byte || '\f' == byte || '\r' == byte || '\n' == byte; +} + +static bool wp_html_tag_processor_parser_state_is(zval *object, const char *state, size_t state_len) +{ + zval rv; + zval *parser_state; + + parser_state = zend_read_property( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "parser_state", + sizeof("parser_state") - 1, + 1, + &rv + ); + + return ( + IS_STRING == Z_TYPE_P(parser_state) && + state_len == Z_STRLEN_P(parser_state) && + 0 == memcmp(Z_STRVAL_P(parser_state), state, state_len) + ); +} + +static bool wp_html_tag_processor_parser_state_is_terminal(zval *object) +{ + return ( + wp_html_tag_processor_parser_state_is(object, "STATE_COMPLETE", sizeof("STATE_COMPLETE") - 1) || + wp_html_tag_processor_parser_state_is(object, "STATE_INCOMPLETE_INPUT", sizeof("STATE_INCOMPLETE_INPUT") - 1) + ); +} + +static int wp_html_api_rust_compare_text_replacements(const void *left_ptr, const void *right_ptr) +{ + const wp_html_api_rust_text_replacement *left = (const wp_html_api_rust_text_replacement *) left_ptr; + const wp_html_api_rust_text_replacement 
*right = (const wp_html_api_rust_text_replacement *) right_ptr; + int by_text; + + if (left->start < right->start) { + return -1; + } + + if (left->start > right->start) { + return 1; + } + + by_text = zend_binary_strcmp( + ZSTR_VAL(left->text), + ZSTR_LEN(left->text), + ZSTR_VAL(right->text), + ZSTR_LEN(right->text) + ); + if (0 != by_text) { + return by_text; + } + + if (left->length < right->length) { + return -1; + } + + if (left->length > right->length) { + return 1; + } + + return 0; +} + +static bool wp_html_tag_processor_apply_lexical_updates(zval *object, void *native) +{ + zval rv; + zval *updates; + zval *update; + zval empty_updates; + wp_html_api_rust_text_replacement *replacements; + uint32_t replacement_count = 0; + uint32_t replacement_index = 0; + zend_long accumulated_shift = 0; + bool applied = true; + + if (NULL == native) { + return false; + } + + updates = zend_read_property( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "lexical_updates", + sizeof("lexical_updates") - 1, + 1, + &rv + ); + + if (IS_ARRAY != Z_TYPE_P(updates) || 0 == zend_hash_num_elements(Z_ARRVAL_P(updates))) { + return true; + } + + replacements = safe_emalloc(zend_hash_num_elements(Z_ARRVAL_P(updates)), sizeof(wp_html_api_rust_text_replacement), 0); + + ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(updates), update) { + zend_long start; + zend_long length; + zend_string *text; + + if ( + IS_OBJECT != Z_TYPE_P(update) || + !wp_html_tag_processor_read_long_property(update, "start", sizeof("start") - 1, &start) || + !wp_html_tag_processor_read_long_property(update, "length", sizeof("length") - 1, &length) || + !wp_html_tag_processor_read_string_property(update, "text", sizeof("text") - 1, &text) || + start < 0 || + length < 0 + ) { + applied = false; + break; + } + + replacements[replacement_count].start = (size_t) start; + replacements[replacement_count].length = (size_t) length; + replacements[replacement_count].text = zend_string_copy(text); + ++replacement_count; + } 
ZEND_HASH_FOREACH_END(); + + if (applied && replacement_count > 1) { + qsort( + replacements, + replacement_count, + sizeof(wp_html_api_rust_text_replacement), + wp_html_api_rust_compare_text_replacements + ); + } + + for (replacement_index = 0; applied && replacement_index < replacement_count; ++replacement_index) { + zend_long adjusted_start = (zend_long) replacements[replacement_index].start + accumulated_shift; + zend_long shift = (zend_long) ZSTR_LEN(replacements[replacement_index].text) - (zend_long) replacements[replacement_index].length; + + if ( + adjusted_start < 0 || + !wp_html_api_rust_tag_processor_apply_lexical_update( + native, + (size_t) adjusted_start, + replacements[replacement_index].length, + (const unsigned char *) ZSTR_VAL(replacements[replacement_index].text), + ZSTR_LEN(replacements[replacement_index].text) + ) + ) { + applied = false; + break; + } + + accumulated_shift += shift; + } + + for (replacement_index = 0; replacement_index < replacement_count; ++replacement_index) { + zend_string_release(replacements[replacement_index].text); + } + efree(replacements); + + if (!applied) { + return false; + } + + array_init(&empty_updates); + zend_update_property( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "lexical_updates", + sizeof("lexical_updates") - 1, + &empty_updates + ); + zval_ptr_dtor(&empty_updates); + + wp_html_tag_processor_sync_html_property(object, native); + return true; +} + +static void wp_html_tag_processor_create_span(zval *span, zend_long start, zend_long length) +{ + zend_string *span_class_name; + zend_class_entry *span_ce; + + span_class_name = zend_string_init("WP_HTML_Span", sizeof("WP_HTML_Span") - 1, 0); + span_ce = zend_lookup_class(span_class_name); + zend_string_release(span_class_name); + + if (NULL != span_ce) { + object_init_ex(span, span_ce); + zend_update_property_long(span_ce, Z_OBJ_P(span), "start", sizeof("start") - 1, start); + zend_update_property_long(span_ce, Z_OBJ_P(span), "length", sizeof("length") 
- 1, length); + return; + } + + object_init(span); + add_property_long(span, "start", start); + add_property_long(span, "length", length); +} + +static void wp_html_tag_processor_adjust_bookmarks_after_current_token_update( + zval *object, + zend_long old_start, + zend_long old_length, + zend_long new_start, + zend_long new_length +) { + zval rv; + zval *bookmarks; + zval *bookmark; + zend_long delta = new_length - old_length; + + if (old_start != new_start && 0 == delta) { + delta = new_start - old_start; + } + + if (0 == delta && old_length == new_length) { + return; + } + + bookmarks = wp_html_tag_processor_read_bookmarks(object, &rv); + ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(bookmarks), bookmark) { + zend_long start; + zend_long length; + + if ( + IS_OBJECT != Z_TYPE_P(bookmark) || + !wp_html_tag_processor_read_long_property(bookmark, "start", sizeof("start") - 1, &start) || + !wp_html_tag_processor_read_long_property(bookmark, "length", sizeof("length") - 1, &length) + ) { + continue; + } + + if (start == old_start) { + zend_update_property_long( + Z_OBJCE_P(bookmark), + Z_OBJ_P(bookmark), + "length", + sizeof("length") - 1, + new_length + ); + } else if (start > old_start) { + zend_update_property_long( + Z_OBJCE_P(bookmark), + Z_OBJ_P(bookmark), + "start", + sizeof("start") - 1, + start + delta + ); + } + } ZEND_HASH_FOREACH_END(); +} + +static bool wp_html_tag_processor_initialize(zval *object, const char *html, size_t html_len) +{ + wp_html_tag_processor_object *intern = Z_WP_HTML_TAG_PROCESSOR_P(object); + zval bookmarks; + zval lexical_updates; + + if (NULL != intern->native) { + wp_html_api_rust_tag_processor_free(intern->native); + } + + intern->native = wp_html_api_rust_tag_processor_new((const unsigned char *) html, html_len); + if (NULL == intern->native) { + zend_throw_error(NULL, "Failed to initialize WP_HTML_Tag_Processor native state"); + return false; + } + + intern->seek_count = 0; + wp_html_tag_processor_sync_html_property(object, intern->native); 
+ wp_html_tag_processor_update_parser_state(object, "STATE_READY"); + zend_update_property_string( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "parsing_namespace", + sizeof("parsing_namespace") - 1, + "html" + ); + + array_init(&bookmarks); + zend_update_property( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "bookmarks", + sizeof("bookmarks") - 1, + &bookmarks + ); + zval_ptr_dtor(&bookmarks); + + array_init(&lexical_updates); + zend_update_property( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "lexical_updates", + sizeof("lexical_updates") - 1, + &lexical_updates + ); + zval_ptr_dtor(&lexical_updates); + + return true; +} + +static zend_object *wp_html_tag_processor_create_object(zend_class_entry *class_type) +{ + wp_html_tag_processor_object *intern = zend_object_alloc(sizeof(wp_html_tag_processor_object), class_type); + + zend_object_std_init(&intern->std, class_type); + object_properties_init(&intern->std, class_type); + + intern->native = NULL; + intern->seek_count = 0; + intern->std.handlers = &wp_html_tag_processor_handlers; + + return &intern->std; +} + +static void wp_html_tag_processor_free_obj(zend_object *object) +{ + wp_html_tag_processor_object *intern = wp_html_tag_processor_from_obj(object); + + if (NULL != intern->native) { + wp_html_api_rust_tag_processor_free(intern->native); + intern->native = NULL; + } + + zend_object_std_dtor(&intern->std); +} + +PHP_FUNCTION(wp_html_api_rust_version) +{ + ZEND_PARSE_PARAMETERS_NONE(); + + RETURN_STRING(wp_html_api_rust_core_version()); +} + +PHP_FUNCTION(wp_html_api_rust_scan_next_tag) +{ + char *html; + size_t html_len; + zend_long offset = 0; + wp_html_api_rust_tag_scan scan; + zend_string *tag_name; + + ZEND_PARSE_PARAMETERS_START(1, 2) + Z_PARAM_STRING(html, html_len) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(offset) + ZEND_PARSE_PARAMETERS_END(); + + if (offset < 0) { + offset = 0; + } + + if (!wp_html_api_rust_scan_next_tag((const unsigned char *) html, html_len, (size_t) offset, &scan)) { + 
RETURN_FALSE; + } + + tag_name = wp_html_api_rust_uppercase_ascii_slice((const unsigned char *) html + scan.name_start, scan.name_len); + + array_init(return_value); + add_assoc_str(return_value, "tag_name", tag_name); + add_assoc_long(return_value, "tag_start", (zend_long) scan.tag_start); + add_assoc_long(return_value, "tag_end", (zend_long) scan.tag_end); + add_assoc_long(return_value, "name_start", (zend_long) scan.name_start); + add_assoc_long(return_value, "name_length", (zend_long) scan.name_len); + add_assoc_bool(return_value, "is_closing", scan.is_closing); +} + +PHP_METHOD(WP_HTML_Tag_Processor, __construct) +{ + zval *html_param; + const char *html = ""; + size_t html_len = 0; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_ZVAL(html_param) + ZEND_PARSE_PARAMETERS_END(); + + if (IS_STRING == Z_TYPE_P(html_param)) { + html = Z_STRVAL_P(html_param); + html_len = Z_STRLEN_P(html_param); + } else { + wp_html_api_rust_doing_it_wrong( + "WP_HTML_Tag_Processor::__construct", + "The HTML parameter must be a string.", + "6.9.0" + ); + } + + if (!wp_html_tag_processor_initialize(ZEND_THIS, html, html_len)) { + RETURN_THROWS(); + } +} + +PHP_METHOD(WP_HTML_Tag_Processor, next_tag) +{ + zval *query = NULL; + zend_string *query_tag_name = NULL; + zend_string *query_class_name = NULL; + zend_long match_offset = 1; + zend_long found_matches = 0; + bool visit_closers = false; + wp_html_tag_processor_object *intern; + + ZEND_PARSE_PARAMETERS_START(0, 1) + Z_PARAM_OPTIONAL + Z_PARAM_ZVAL(query) + ZEND_PARSE_PARAMETERS_END(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + if (!wp_html_tag_processor_apply_lexical_updates(ZEND_THIS, intern->native)) { + RETURN_FALSE; + } + + if (NULL != query) { + if (IS_STRING == Z_TYPE_P(query)) { + query_tag_name = Z_STR_P(query); + } else if (IS_ARRAY == Z_TYPE_P(query)) { + zval *tag_name = 
zend_hash_str_find(Z_ARRVAL_P(query), "tag_name", sizeof("tag_name") - 1); + zval *class_name = zend_hash_str_find(Z_ARRVAL_P(query), "class_name", sizeof("class_name") - 1); + zval *query_match_offset = zend_hash_str_find(Z_ARRVAL_P(query), "match_offset", sizeof("match_offset") - 1); + zval *tag_closers = zend_hash_str_find(Z_ARRVAL_P(query), "tag_closers", sizeof("tag_closers") - 1); + + if (NULL != tag_name && IS_STRING == Z_TYPE_P(tag_name)) { + query_tag_name = Z_STR_P(tag_name); + } + + if (NULL != class_name && IS_STRING == Z_TYPE_P(class_name)) { + query_class_name = Z_STR_P(class_name); + } + + if (NULL != query_match_offset && IS_LONG == Z_TYPE_P(query_match_offset) && Z_LVAL_P(query_match_offset) > 0) { + match_offset = Z_LVAL_P(query_match_offset); + } + + if ( + NULL != tag_closers && + IS_STRING == Z_TYPE_P(tag_closers) && + sizeof("visit") - 1 == Z_STRLEN_P(tag_closers) && + 0 == memcmp(Z_STRVAL_P(tag_closers), "visit", sizeof("visit") - 1) + ) { + visit_closers = true; + } + } + } + + while (wp_html_api_rust_tag_processor_next_tag( + intern->native, + NULL == query_tag_name ? NULL : (const unsigned char *) ZSTR_VAL(query_tag_name), + NULL == query_tag_name ? 0 : ZSTR_LEN(query_tag_name), + visit_closers + )) { + if ( + NULL != query_class_name && + 2 != wp_html_api_rust_tag_processor_has_class( + intern->native, + (const unsigned char *) ZSTR_VAL(query_class_name), + ZSTR_LEN(query_class_name) + ) + ) { + continue; + } + + if (++found_matches < match_offset) { + continue; + } + + wp_html_tag_processor_update_parser_state_from_native(ZEND_THIS, intern->native); + RETURN_TRUE; + } + + wp_html_tag_processor_update_parser_state( + ZEND_THIS, + wp_html_api_rust_tag_processor_paused_at_incomplete(intern->native) ? 
"STATE_INCOMPLETE_INPUT" : "STATE_COMPLETE" + ); + RETURN_FALSE; +} + +PHP_METHOD(WP_HTML_Tag_Processor, next_token) +{ + wp_html_tag_processor_object *intern; + + ZEND_PARSE_PARAMETERS_NONE(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + if (!wp_html_tag_processor_apply_lexical_updates(ZEND_THIS, intern->native)) { + RETURN_FALSE; + } + + if (wp_html_api_rust_tag_processor_next_token(intern->native)) { + wp_html_tag_processor_update_parser_state_from_native(ZEND_THIS, intern->native); + RETURN_TRUE; + } + + wp_html_tag_processor_update_parser_state( + ZEND_THIS, + wp_html_api_rust_tag_processor_paused_at_incomplete(intern->native) ? "STATE_INCOMPLETE_INPUT" : "STATE_COMPLETE" + ); + RETURN_FALSE; +} + +PHP_METHOD(WP_HTML_Tag_Processor, get_tag) +{ + wp_html_tag_processor_object *intern; + wp_html_api_rust_byte_slice tag_name; + + ZEND_PARSE_PARAMETERS_NONE(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + if (!wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1)) { + if ( + !wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_COMMENT", sizeof("STATE_COMMENT") - 1) || + 4 != wp_html_api_rust_tag_processor_current_comment_type(intern->native) + ) { + RETURN_NULL(); + } + } + + if (!wp_html_api_rust_tag_processor_get_tag(intern->native, &tag_name)) { + RETURN_NULL(); + } + + if (4 == wp_html_api_rust_tag_processor_current_comment_type(intern->native)) { + RETURN_STRINGL((const char *) tag_name.ptr, tag_name.len); + } + + RETURN_STR(wp_html_api_rust_uppercase_ascii_slice(tag_name.ptr, tag_name.len)); +} + +PHP_METHOD(WP_HTML_Tag_Processor, get_attribute) +{ + char *name; + size_t name_len; + wp_html_tag_processor_object *intern; + 
wp_html_api_rust_byte_slice value; + unsigned char result; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STRING(name, name_len) + ZEND_PARSE_PARAMETERS_END(); + + if (!wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1)) { + RETURN_NULL(); + } + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + result = wp_html_api_rust_tag_processor_get_attribute( + intern->native, + (const unsigned char *) name, + name_len, + &value + ); + + switch (result) { + case 1: + RETURN_TRUE; + case 2: + RETURN_STRINGL((const char *) value.ptr, value.len); + default: + RETURN_NULL(); + } +}
+
+/*
+ * WP_HTML_Tag_Processor::get_attribute_names_with_prefix( string $prefix )
+ *
+ * Returns NULL unless the parser state is STATE_MATCHED_TAG and the native
+ * lookup succeeds; throws when the native processor is missing. Otherwise
+ * returns a PHP list built by splitting the native byte slice on NUL bytes
+ * (an empty slice yields an empty list).
+ */
+PHP_METHOD(WP_HTML_Tag_Processor, get_attribute_names_with_prefix)
+{
+    char *prefix;
+    size_t prefix_len;
+    wp_html_tag_processor_object *intern;
+    wp_html_api_rust_byte_slice packed;
+    size_t name_begin = 0;
+    size_t at;
+
+    ZEND_PARSE_PARAMETERS_START(1, 1)
+        Z_PARAM_STRING(prefix, prefix_len)
+    ZEND_PARSE_PARAMETERS_END();
+
+    if (!wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1)) {
+        RETURN_NULL();
+    }
+
+    intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS);
+    if (NULL == intern->native) {
+        zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized");
+        RETURN_THROWS();
+    }
+
+    if (!wp_html_api_rust_tag_processor_get_attribute_names_with_prefix(
+        intern->native,
+        (const unsigned char *) prefix,
+        prefix_len,
+        &packed
+    )) {
+        RETURN_NULL();
+    }
+
+    array_init(return_value);
+    if (0 == packed.len) {
+        return;
+    }
+
+    /* Emit one entry per NUL separator found... */
+    for (at = 0; at < packed.len; at++) {
+        if (0 != packed.ptr[at]) {
+            continue;
+        }
+        add_next_index_stringl(return_value, (const char *) packed.ptr + name_begin, at - name_begin);
+        name_begin = at + 1;
+    }
+
+    /* ...then the final segment, which has no trailing separator. */
+    add_next_index_stringl(return_value, (const char *) packed.ptr + name_begin, packed.len - name_begin);
+}
+
+PHP_METHOD(WP_HTML_Tag_Processor, set_attribute) +{ + char *name; + size_t name_len; + zval *value; + zend_string *value_string = NULL; + const unsigned char
*value_ptr = NULL; + size_t value_len = 0; + unsigned char value_kind = 2; + bool result; + bool had_span; + size_t old_token_start = 0; + size_t old_token_length = 0; + size_t new_token_start = 0; + size_t new_token_length = 0; + wp_html_tag_processor_object *intern; + + ZEND_PARSE_PARAMETERS_START(2, 2) + Z_PARAM_STRING(name, name_len) + Z_PARAM_ZVAL(value) + ZEND_PARSE_PARAMETERS_END(); + + if (!wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1)) { + RETURN_FALSE; + } + + if (!wp_html_api_rust_is_valid_attribute_name(name, name_len)) { + wp_html_api_rust_doing_it_wrong( + "WP_HTML_Tag_Processor::set_attribute", + "Invalid attribute name.", + "6.2.0" + ); + RETURN_FALSE; + } + + if (IS_FALSE == Z_TYPE_P(value)) { + value_kind = 0; + } else if (IS_TRUE == Z_TYPE_P(value)) { + value_kind = 1; + } else { + value_string = zval_get_string(value); + value_ptr = (const unsigned char *) ZSTR_VAL(value_string); + value_len = ZSTR_LEN(value_string); + } + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + if (NULL != value_string) { + zend_string_release(value_string); + } + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + had_span = wp_html_api_rust_tag_processor_current_span(intern->native, &old_token_start, &old_token_length); + result = wp_html_api_rust_tag_processor_set_attribute( + intern->native, + (const unsigned char *) name, + name_len, + value_ptr, + value_len, + value_kind + ); + + if (NULL != value_string) { + zend_string_release(value_string); + } + + if (result) { + if ( + had_span && + wp_html_api_rust_tag_processor_current_span(intern->native, &new_token_start, &new_token_length) + ) { + wp_html_tag_processor_adjust_bookmarks_after_current_token_update( + ZEND_THIS, + (zend_long) old_token_start, + (zend_long) old_token_length, + (zend_long) new_token_start, + (zend_long) new_token_length + ); + } + 
wp_html_tag_processor_sync_html_property(ZEND_THIS, intern->native); + } + + RETURN_BOOL(result); +}
+
+/*
+ * WP_HTML_Tag_Processor::remove_attribute( string $name ): bool
+ *
+ * Returns false unless the parser state is STATE_MATCHED_TAG; throws when the
+ * native processor is missing. On a successful native removal the token span
+ * is re-read so bookmarks can be adjusted, and the PHP-side html property is
+ * re-synced from the native processor.
+ */
+PHP_METHOD(WP_HTML_Tag_Processor, remove_attribute)
+{
+    char *name;
+    size_t name_len;
+    wp_html_tag_processor_object *intern;
+    bool span_known;
+    bool removed;
+    size_t span_start_before = 0;
+    size_t span_length_before = 0;
+    size_t span_start_after = 0;
+    size_t span_length_after = 0;
+
+    ZEND_PARSE_PARAMETERS_START(1, 1)
+        Z_PARAM_STRING(name, name_len)
+    ZEND_PARSE_PARAMETERS_END();
+
+    if (!wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1)) {
+        RETURN_FALSE;
+    }
+
+    intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS);
+    if (NULL == intern->native) {
+        zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized");
+        RETURN_THROWS();
+    }
+
+    /* Capture the span before mutating so the shift can be computed afterwards. */
+    span_known = wp_html_api_rust_tag_processor_current_span(intern->native, &span_start_before, &span_length_before);
+    removed = wp_html_api_rust_tag_processor_remove_attribute(
+        intern->native,
+        (const unsigned char *) name,
+        name_len
+    );
+
+    if (!removed) {
+        RETURN_FALSE;
+    }
+
+    if (
+        span_known &&
+        wp_html_api_rust_tag_processor_current_span(intern->native, &span_start_after, &span_length_after)
+    ) {
+        wp_html_tag_processor_adjust_bookmarks_after_current_token_update(
+            ZEND_THIS,
+            (zend_long) span_start_before,
+            (zend_long) span_length_before,
+            (zend_long) span_start_after,
+            (zend_long) span_length_after
+        );
+    }
+    wp_html_tag_processor_sync_html_property(ZEND_THIS, intern->native);
+
+    RETURN_TRUE;
+}
+
+PHP_METHOD(WP_HTML_Tag_Processor, add_class) +{ + char *class_name; + size_t class_name_len; + wp_html_tag_processor_object *intern; + bool had_span; + size_t old_token_start = 0; + size_t old_token_length = 0; + size_t new_token_start = 0; + size_t new_token_length = 0; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STRING(class_name, class_name_len) + ZEND_PARSE_PARAMETERS_END(); + + if (!wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") -
1)) { + RETURN_FALSE; + } + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + { + had_span = wp_html_api_rust_tag_processor_current_span(intern->native, &old_token_start, &old_token_length); + bool result = wp_html_api_rust_tag_processor_add_class( + intern->native, + (const unsigned char *) class_name, + class_name_len + ); + + if (result) { + if ( + had_span && + wp_html_api_rust_tag_processor_current_span(intern->native, &new_token_start, &new_token_length) + ) { + wp_html_tag_processor_adjust_bookmarks_after_current_token_update( + ZEND_THIS, + (zend_long) old_token_start, + (zend_long) old_token_length, + (zend_long) new_token_start, + (zend_long) new_token_length + ); + } + wp_html_tag_processor_sync_html_property(ZEND_THIS, intern->native); + } + + RETURN_BOOL(result); + } +} + +PHP_METHOD(WP_HTML_Tag_Processor, remove_class) +{ + char *class_name; + size_t class_name_len; + wp_html_tag_processor_object *intern; + bool had_span; + size_t old_token_start = 0; + size_t old_token_length = 0; + size_t new_token_start = 0; + size_t new_token_length = 0; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STRING(class_name, class_name_len) + ZEND_PARSE_PARAMETERS_END(); + + if (!wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1)) { + RETURN_FALSE; + } + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + { + had_span = wp_html_api_rust_tag_processor_current_span(intern->native, &old_token_start, &old_token_length); + bool result = wp_html_api_rust_tag_processor_remove_class( + intern->native, + (const unsigned char *) class_name, + class_name_len + ); + + if (result) { + if ( + had_span && + wp_html_api_rust_tag_processor_current_span(intern->native, &new_token_start, 
&new_token_length) + ) { + wp_html_tag_processor_adjust_bookmarks_after_current_token_update( + ZEND_THIS, + (zend_long) old_token_start, + (zend_long) old_token_length, + (zend_long) new_token_start, + (zend_long) new_token_length + ); + } + wp_html_tag_processor_sync_html_property(ZEND_THIS, intern->native); + } + + RETURN_BOOL(result); + } +}
+
+/*
+ * WP_HTML_Tag_Processor::has_class( string $wanted_class ): ?bool
+ *
+ * Returns NULL outside STATE_MATCHED_TAG or when the native call reports 0.
+ * A native result of 2 maps to true; any other non-zero result maps to false.
+ */
+PHP_METHOD(WP_HTML_Tag_Processor, has_class)
+{
+    char *class_name;
+    size_t class_name_len;
+    wp_html_tag_processor_object *intern;
+
+    ZEND_PARSE_PARAMETERS_START(1, 1)
+        Z_PARAM_STRING(class_name, class_name_len)
+    ZEND_PARSE_PARAMETERS_END();
+
+    if (!wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1)) {
+        RETURN_NULL();
+    }
+
+    intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS);
+    if (NULL == intern->native) {
+        zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized");
+        RETURN_THROWS();
+    }
+
+    switch (wp_html_api_rust_tag_processor_has_class(
+        intern->native,
+        (const unsigned char *) class_name,
+        class_name_len
+    )) {
+        case 0:
+            RETURN_NULL();
+        case 2:
+            RETURN_TRUE;
+        default:
+            RETURN_FALSE;
+    }
+}
+
+/*
+ * WP_HTML_Tag_Processor::class_list()
+ *
+ * Returns NULL outside STATE_MATCHED_TAG or when the native call fails.
+ * Otherwise returns a PHP list built by splitting the native byte slice on
+ * 0x1f bytes (an empty slice yields an empty list).
+ */
+PHP_METHOD(WP_HTML_Tag_Processor, class_list)
+{
+    wp_html_tag_processor_object *intern;
+    wp_html_api_rust_byte_slice packed;
+    size_t class_begin = 0;
+    size_t at;
+
+    ZEND_PARSE_PARAMETERS_NONE();
+
+    if (!wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1)) {
+        RETURN_NULL();
+    }
+
+    intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS);
+    if (NULL == intern->native) {
+        zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized");
+        RETURN_THROWS();
+    }
+
+    if (!wp_html_api_rust_tag_processor_class_list(intern->native, &packed)) {
+        RETURN_NULL();
+    }
+
+    array_init(return_value);
+    if (0 == packed.len) {
+        return;
+    }
+
+    /* One entry per 0x1f separator, then the trailing segment. */
+    for (at = 0; at < packed.len; at++) {
+        if (0x1f != packed.ptr[at]) {
+            continue;
+        }
+        add_next_index_stringl(return_value, (const char *) packed.ptr + class_begin, at - class_begin);
+        class_begin = at + 1;
+    }
+    add_next_index_stringl(return_value, (const char *) packed.ptr + class_begin, packed.len - class_begin);
+}
+
+/*
+ * WP_HTML_Tag_Processor::is_tag_closer(): bool
+ *
+ * Throws when the native processor is missing; false outside
+ * STATE_MATCHED_TAG; otherwise forwards the native answer.
+ */
+PHP_METHOD(WP_HTML_Tag_Processor, is_tag_closer)
+{
+    wp_html_tag_processor_object *intern;
+
+    ZEND_PARSE_PARAMETERS_NONE();
+
+    intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS);
+    if (NULL == intern->native) {
+        zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized");
+        RETURN_THROWS();
+    }
+
+    /* && short-circuits, so the native call only happens on a matched tag. */
+    RETURN_BOOL(
+        wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1) &&
+        wp_html_api_rust_tag_processor_is_tag_closer(intern->native)
+    );
+}
+
+/*
+ * WP_HTML_Tag_Processor::has_self_closing_flag(): bool
+ *
+ * Throws when the native processor is missing; false outside
+ * STATE_MATCHED_TAG; otherwise forwards the native answer.
+ */
+PHP_METHOD(WP_HTML_Tag_Processor, has_self_closing_flag)
+{
+    wp_html_tag_processor_object *intern;
+
+    ZEND_PARSE_PARAMETERS_NONE();
+
+    intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS);
+    if (NULL == intern->native) {
+        zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized");
+        RETURN_THROWS();
+    }
+
+    RETURN_BOOL(
+        wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1) &&
+        wp_html_api_rust_tag_processor_has_self_closing_flag(intern->native)
+    );
+}
+
+PHP_METHOD(WP_HTML_Tag_Processor, get_token_name) +{ + wp_html_tag_processor_object *intern; + wp_html_api_rust_byte_slice tag_name; + + ZEND_PARSE_PARAMETERS_NONE(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_TEXT_NODE", sizeof("STATE_TEXT_NODE") - 1)) { + RETURN_STRING("#text"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_COMMENT", sizeof("STATE_COMMENT") - 1)) { + RETURN_STRING("#comment"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_DOCTYPE", sizeof("STATE_DOCTYPE") - 1)) { + RETURN_STRING("html"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_CDATA_NODE", sizeof("STATE_CDATA_NODE") - 1)) { +
RETURN_STRING("#cdata-section"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_PRESUMPTUOUS_TAG", sizeof("STATE_PRESUMPTUOUS_TAG") - 1)) { + RETURN_STRING("#presumptuous-tag"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_FUNKY_COMMENT", sizeof("STATE_FUNKY_COMMENT") - 1)) { + RETURN_STRING("#funky-comment"); + } + + if (!wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1)) { + RETURN_NULL(); + } + + if (!wp_html_api_rust_tag_processor_get_tag(intern->native, &tag_name)) { + RETURN_NULL(); + } + + RETURN_STR(wp_html_api_rust_uppercase_ascii_slice(tag_name.ptr, tag_name.len)); +} + +PHP_METHOD(WP_HTML_Tag_Processor, get_token_type) +{ + wp_html_tag_processor_object *intern; + + ZEND_PARSE_PARAMETERS_NONE(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1)) { + RETURN_STRING("#tag"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_DOCTYPE", sizeof("STATE_DOCTYPE") - 1)) { + RETURN_STRING("#doctype"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_TEXT_NODE", sizeof("STATE_TEXT_NODE") - 1)) { + RETURN_STRING("#text"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_COMMENT", sizeof("STATE_COMMENT") - 1)) { + RETURN_STRING("#comment"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_CDATA_NODE", sizeof("STATE_CDATA_NODE") - 1)) { + RETURN_STRING("#cdata-section"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_PRESUMPTUOUS_TAG", sizeof("STATE_PRESUMPTUOUS_TAG") - 1)) { + RETURN_STRING("#presumptuous-tag"); + } + + if (wp_html_tag_processor_parser_state_is(ZEND_THIS, "STATE_FUNKY_COMMENT", sizeof("STATE_FUNKY_COMMENT") - 1)) { + 
RETURN_STRING("#funky-comment"); + } + + RETURN_NULL(); +}
+
+/*
+ * WP_HTML_Tag_Processor::paused_at_incomplete_token(): bool
+ *
+ * True when the native processor reports an incomplete pause, or when the
+ * PHP-side parser_state property is the string "STATE_INCOMPLETE_INPUT".
+ * Unlike most methods here, a missing native processor does not throw.
+ */
+PHP_METHOD(WP_HTML_Tag_Processor, paused_at_incomplete_token)
+{
+    wp_html_tag_processor_object *intern;
+    zval rv;
+    zval *state;
+
+    ZEND_PARSE_PARAMETERS_NONE();
+
+    intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS);
+    if (NULL != intern->native && wp_html_api_rust_tag_processor_paused_at_incomplete(intern->native)) {
+        RETURN_TRUE;
+    }
+
+    state = zend_read_property(
+        wp_html_tag_processor_ce,
+        Z_OBJ_P(ZEND_THIS),
+        "parser_state",
+        sizeof("parser_state") - 1,
+        1,
+        &rv
+    );
+
+    if (IS_STRING != Z_TYPE_P(state)) {
+        RETURN_FALSE;
+    }
+
+    RETURN_BOOL(zend_string_equals_literal(Z_STR_P(state), "STATE_INCOMPLETE_INPUT"));
+}
+
+/*
+ * WP_HTML_Tag_Processor::subdivide_text_appropriately(): bool
+ *
+ * Only acts in STATE_TEXT_NODE. Resets text_node_classification to
+ * "TEXT_IS_GENERIC", then returns true (and sets "TEXT_IS_WHITESPACE") only
+ * when the current modifiable text is non-empty and consists entirely of
+ * bytes accepted by wp_html_api_rust_is_html_whitespace().
+ */
+PHP_METHOD(WP_HTML_Tag_Processor, subdivide_text_appropriately)
+{
+    wp_html_tag_processor_object *intern;
+    wp_html_api_rust_byte_slice text;
+    zval rv;
+    zval *parser_state;
+    size_t at = 0;
+
+    ZEND_PARSE_PARAMETERS_NONE();
+
+    parser_state = zend_read_property(
+        wp_html_tag_processor_ce,
+        Z_OBJ_P(ZEND_THIS),
+        "parser_state",
+        sizeof("parser_state") - 1,
+        1,
+        &rv
+    );
+    if (
+        IS_STRING != Z_TYPE_P(parser_state) ||
+        !zend_string_equals_literal(Z_STR_P(parser_state), "STATE_TEXT_NODE")
+    ) {
+        RETURN_FALSE;
+    }
+
+    /* The classification is reset before the native processor is even checked. */
+    zend_update_property_string(
+        wp_html_tag_processor_ce,
+        Z_OBJ_P(ZEND_THIS),
+        "text_node_classification",
+        sizeof("text_node_classification") - 1,
+        "TEXT_IS_GENERIC"
+    );
+
+    intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS);
+    if (NULL == intern->native) {
+        zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized");
+        RETURN_THROWS();
+    }
+
+    if (!wp_html_api_rust_tag_processor_get_modifiable_text(intern->native, &text) || 0 == text.len) {
+        RETURN_FALSE;
+    }
+
+    /* Advance over leading whitespace; stopping early means a non-whitespace byte exists. */
+    while (at < text.len && wp_html_api_rust_is_html_whitespace(text.ptr[at])) {
+        ++at;
+    }
+    if (at < text.len) {
+        RETURN_FALSE;
+    }
+
+    zend_update_property_string(
+        wp_html_tag_processor_ce,
+        Z_OBJ_P(ZEND_THIS),
+        "text_node_classification",
+        sizeof("text_node_classification") - 1,
+        "TEXT_IS_WHITESPACE"
+    );
+    RETURN_TRUE;
+}
+
+/*
+ * WP_HTML_Tag_Processor::get_modifiable_text(): string
+ *
+ * Copies the native modifiable-text slice into a PHP string; returns the
+ * empty string when the native call reports failure.
+ */
+PHP_METHOD(WP_HTML_Tag_Processor, get_modifiable_text)
+{
+    wp_html_tag_processor_object *intern;
+    wp_html_api_rust_byte_slice text;
+
+    ZEND_PARSE_PARAMETERS_NONE();
+
+    intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS);
+    if (NULL == intern->native) {
+        zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized");
+        RETURN_THROWS();
+    }
+
+    if (wp_html_api_rust_tag_processor_get_modifiable_text(intern->native, &text)) {
+        RETURN_STRINGL((const char *) text.ptr, text.len);
+    }
+
+    RETURN_EMPTY_STRING();
+}
+
+/*
+ * WP_HTML_Tag_Processor::native_get_script_content_type(): ?string
+ *
+ * Maps the native code 1 to "javascript" and 2 to "json"; anything else is NULL.
+ */
+PHP_METHOD(WP_HTML_Tag_Processor, native_get_script_content_type)
+{
+    wp_html_tag_processor_object *intern;
+    unsigned char content_type;
+
+    ZEND_PARSE_PARAMETERS_NONE();
+
+    intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS);
+    if (NULL == intern->native) {
+        zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized");
+        RETURN_THROWS();
+    }
+
+    content_type = wp_html_api_rust_tag_processor_script_content_type(intern->native);
+    if (1 == content_type) {
+        RETURN_STRING("javascript");
+    }
+    if (2 == content_type) {
+        RETURN_STRING("json");
+    }
+
+    RETURN_NULL();
+}
+
+PHP_METHOD(WP_HTML_Tag_Processor, set_modifiable_text) +{ + char *text; + size_t text_len; + wp_html_tag_processor_object *intern; + zval rv; + zval *parser_state; + bool had_span; + size_t old_token_start = 0; + size_t old_token_length = 0; + size_t new_token_start = 0; + size_t new_token_length = 0; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STRING(text, text_len) + ZEND_PARSE_PARAMETERS_END(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + parser_state = zend_read_property( +
wp_html_tag_processor_ce, + Z_OBJ_P(ZEND_THIS), + "parser_state", + sizeof("parser_state") - 1, + 1, + &rv + ); + if ( + IS_STRING == Z_TYPE_P(parser_state) && + ( + ( + sizeof("STATE_COMPLETE") - 1 == Z_STRLEN_P(parser_state) && + 0 == memcmp(Z_STRVAL_P(parser_state), "STATE_COMPLETE", sizeof("STATE_COMPLETE") - 1) + ) || + ( + sizeof("STATE_INCOMPLETE_INPUT") - 1 == Z_STRLEN_P(parser_state) && + 0 == memcmp(Z_STRVAL_P(parser_state), "STATE_INCOMPLETE_INPUT", sizeof("STATE_INCOMPLETE_INPUT") - 1) + ) + ) + ) { + RETURN_FALSE; + } + + had_span = wp_html_api_rust_tag_processor_current_span(intern->native, &old_token_start, &old_token_length); + if ( + wp_html_api_rust_tag_processor_set_modifiable_text( + intern->native, + (const unsigned char *) text, + text_len + ) + ) { + if ( + had_span && + wp_html_api_rust_tag_processor_current_span(intern->native, &new_token_start, &new_token_length) + ) { + wp_html_tag_processor_adjust_bookmarks_after_current_token_update( + ZEND_THIS, + (zend_long) old_token_start, + (zend_long) old_token_length, + (zend_long) new_token_start, + (zend_long) new_token_length + ); + } + wp_html_tag_processor_sync_html_property(ZEND_THIS, intern->native); + RETURN_TRUE; + } + + RETURN_FALSE; +} + +PHP_METHOD(WP_HTML_Tag_Processor, get_comment_type) +{ + wp_html_tag_processor_object *intern; + unsigned char comment_type; + + ZEND_PARSE_PARAMETERS_NONE(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + comment_type = wp_html_api_rust_tag_processor_current_comment_type(intern->native); + switch (comment_type) { + case 1: + RETURN_STRING("COMMENT_AS_ABRUPTLY_CLOSED_COMMENT"); + case 2: + RETURN_STRING("COMMENT_AS_CDATA_LOOKALIKE"); + case 3: + RETURN_STRING("COMMENT_AS_HTML_COMMENT"); + case 4: + RETURN_STRING("COMMENT_AS_PI_NODE_LOOKALIKE"); + case 5: + RETURN_STRING("COMMENT_AS_INVALID_HTML"); + } + + RETURN_NULL(); 
+} + +PHP_METHOD(WP_HTML_Tag_Processor, get_doctype_info) +{ + wp_html_tag_processor_object *intern; + zend_string *doctype_class_name; + zend_class_entry *doctype_ce; + + ZEND_PARSE_PARAMETERS_NONE(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + if (4 != wp_html_api_rust_tag_processor_current_token_type(intern->native)) { + RETURN_NULL(); + } + + doctype_class_name = zend_string_init("WP_HTML_Doctype_Info", sizeof("WP_HTML_Doctype_Info") - 1, 0); + doctype_ce = zend_lookup_class(doctype_class_name); + zend_string_release(doctype_class_name); + + if (NULL != doctype_ce) { + object_init_ex(return_value, doctype_ce); + } else { + object_init(return_value); + } + + add_property_string(return_value, "name", "html"); + add_property_null(return_value, "public_identifier"); + add_property_null(return_value, "system_identifier"); + add_property_string(return_value, "indicated_compatibility_mode", "no-quirks"); +} + +PHP_METHOD(WP_HTML_Tag_Processor, set_bookmark) +{ + zend_string *bookmark_name; + wp_html_tag_processor_object *intern; + zval rv; + zval *bookmarks; + zval span; + size_t token_start; + size_t token_length; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STR(bookmark_name) + ZEND_PARSE_PARAMETERS_END(); + + if (wp_html_tag_processor_parser_state_is_terminal(ZEND_THIS)) { + RETURN_FALSE; + } + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + if (!wp_html_api_rust_tag_processor_current_span(intern->native, &token_start, &token_length)) { + RETURN_FALSE; + } + + bookmarks = wp_html_tag_processor_read_bookmarks(ZEND_THIS, &rv); + if ( + NULL == zend_symtable_find(Z_ARRVAL_P(bookmarks), bookmark_name) && + zend_hash_num_elements(Z_ARRVAL_P(bookmarks)) >= 10 + ) { + wp_html_api_rust_doing_it_wrong( + 
"WP_HTML_Tag_Processor::set_bookmark", + "Too many bookmarks: cannot create any more.", + "6.2.0" + ); + RETURN_FALSE; + } + + wp_html_tag_processor_create_span(&span, (zend_long) token_start, (zend_long) token_length); + zend_symtable_update(Z_ARRVAL_P(bookmarks), bookmark_name, &span); + + RETURN_TRUE; +} + +PHP_METHOD(WP_HTML_Tag_Processor, release_bookmark) +{ + zend_string *bookmark_name; + zval rv; + zval *bookmarks; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STR(bookmark_name) + ZEND_PARSE_PARAMETERS_END(); + + bookmarks = wp_html_tag_processor_read_bookmarks(ZEND_THIS, &rv); + RETURN_BOOL(SUCCESS == zend_symtable_del(Z_ARRVAL_P(bookmarks), bookmark_name)); +} + +PHP_METHOD(WP_HTML_Tag_Processor, has_bookmark) +{ + zend_string *bookmark_name; + zval rv; + zval *bookmarks; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STR(bookmark_name) + ZEND_PARSE_PARAMETERS_END(); + + bookmarks = wp_html_tag_processor_read_bookmarks(ZEND_THIS, &rv); + RETURN_BOOL(NULL != zend_symtable_find(Z_ARRVAL_P(bookmarks), bookmark_name)); +} + +PHP_METHOD(WP_HTML_Tag_Processor, seek) +{ + zend_string *bookmark_name; + wp_html_tag_processor_object *intern; + zval rv; + zval *bookmarks; + zval *bookmark; + zend_long bookmark_start; + zend_long bookmark_length; + size_t token_start; + size_t token_length; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STR(bookmark_name) + ZEND_PARSE_PARAMETERS_END(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + if (!wp_html_tag_processor_apply_lexical_updates(ZEND_THIS, intern->native)) { + RETURN_FALSE; + } + + bookmarks = wp_html_tag_processor_read_bookmarks(ZEND_THIS, &rv); + bookmark = zend_symtable_find(Z_ARRVAL_P(bookmarks), bookmark_name); + if (NULL == bookmark) { + wp_html_api_rust_doing_it_wrong( + "WP_HTML_Tag_Processor::seek", + "Unknown bookmark name.", + "6.2.0" + ); + RETURN_FALSE; + } + + if ( + 
!wp_html_tag_processor_read_long_property(bookmark, "start", sizeof("start") - 1, &bookmark_start) || + !wp_html_tag_processor_read_long_property(bookmark, "length", sizeof("length") - 1, &bookmark_length) + ) { + RETURN_FALSE; + } + + if ( + wp_html_api_rust_tag_processor_current_span(intern->native, &token_start, &token_length) && + token_start == (size_t) bookmark_start && + token_length == (size_t) bookmark_length + ) { + RETURN_TRUE; + } + + if (++intern->seek_count > 1000) { + wp_html_api_rust_doing_it_wrong( + "WP_HTML_Tag_Processor::seek", + "Too many calls to seek() - this can lead to performance issues.", + "6.2.0" + ); + RETURN_FALSE; + } + + if (0 == bookmark_length) { + wp_html_api_rust_tag_processor_seek(intern->native, (size_t) bookmark_start); + wp_html_tag_processor_update_parser_state(ZEND_THIS, "STATE_READY"); + RETURN_TRUE; + } + + wp_html_api_rust_tag_processor_seek(intern->native, (size_t) bookmark_start); + if (wp_html_api_rust_tag_processor_next_token(intern->native)) { + wp_html_tag_processor_update_parser_state_from_native(ZEND_THIS, intern->native); + RETURN_TRUE; + } + + wp_html_tag_processor_update_parser_state( + ZEND_THIS, + wp_html_api_rust_tag_processor_paused_at_incomplete(intern->native) ? 
"STATE_INCOMPLETE_INPUT" : "STATE_COMPLETE" + ); + RETURN_FALSE; +} + +PHP_METHOD(WP_HTML_Tag_Processor, change_parsing_namespace) +{ + char *new_namespace; + size_t new_namespace_len; + wp_html_tag_processor_object *intern; + unsigned char namespace_id = 0; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STRING(new_namespace, new_namespace_len) + ZEND_PARSE_PARAMETERS_END(); + + if ( + !( + sizeof("html") - 1 == new_namespace_len && + 0 == memcmp(new_namespace, "html", sizeof("html") - 1) + ) && + !( + sizeof("math") - 1 == new_namespace_len && + 0 == memcmp(new_namespace, "math", sizeof("math") - 1) + ) && + !( + sizeof("svg") - 1 == new_namespace_len && + 0 == memcmp(new_namespace, "svg", sizeof("svg") - 1) + ) + ) { + RETURN_FALSE; + } + + zend_update_property_stringl( + wp_html_tag_processor_ce, + Z_OBJ_P(ZEND_THIS), + "parsing_namespace", + sizeof("parsing_namespace") - 1, + new_namespace, + new_namespace_len + ); + + if (!(sizeof("html") - 1 == new_namespace_len && 0 == memcmp(new_namespace, "html", sizeof("html") - 1))) { + namespace_id = 1; + } + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL != intern->native) { + wp_html_api_rust_tag_processor_set_namespace(intern->native, namespace_id); + } + + RETURN_TRUE; +} + +PHP_METHOD(WP_HTML_Tag_Processor, get_namespace) +{ + zval rv; + zval *namespace_value; + + ZEND_PARSE_PARAMETERS_NONE(); + + namespace_value = zend_read_property( + wp_html_tag_processor_ce, + Z_OBJ_P(ZEND_THIS), + "parsing_namespace", + sizeof("parsing_namespace") - 1, + 1, + &rv + ); + + if (IS_STRING == Z_TYPE_P(namespace_value)) { + RETURN_STR_COPY(Z_STR_P(namespace_value)); + } + + RETURN_STRING("html"); +} + +PHP_METHOD(WP_HTML_Tag_Processor, get_qualified_tag_name) +{ + wp_html_tag_processor_object *intern; + wp_html_api_rust_byte_slice tag_name; + + ZEND_PARSE_PARAMETERS_NONE(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not 
initialized"); + RETURN_THROWS(); + } + + if (!wp_html_api_rust_tag_processor_get_tag(intern->native, &tag_name)) { + RETURN_NULL(); + } + + RETURN_STR(wp_html_api_rust_uppercase_ascii_slice(tag_name.ptr, tag_name.len)); +} + +PHP_METHOD(WP_HTML_Tag_Processor, get_qualified_attribute_name) +{ + char *attribute_name; + size_t attribute_name_len; + + ZEND_PARSE_PARAMETERS_START(1, 1) + Z_PARAM_STRING(attribute_name, attribute_name_len) + ZEND_PARSE_PARAMETERS_END(); + + RETURN_STRINGL(attribute_name, attribute_name_len); +} + +PHP_METHOD(WP_HTML_Tag_Processor, get_full_comment_text) +{ + ZEND_PARSE_PARAMETERS_NONE(); + + RETURN_NULL(); +} + +PHP_METHOD(WP_HTML_Tag_Processor, get_updated_html) +{ + wp_html_tag_processor_object *intern; + wp_html_api_rust_byte_slice html; + + ZEND_PARSE_PARAMETERS_NONE(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + if (!wp_html_tag_processor_apply_lexical_updates(ZEND_THIS, intern->native)) { + RETURN_EMPTY_STRING(); + } + + if (!wp_html_api_rust_tag_processor_get_html(intern->native, &html)) { + RETURN_EMPTY_STRING(); + } + + wp_html_tag_processor_sync_html_property(ZEND_THIS, intern->native); + RETURN_STRINGL((const char *) html.ptr, html.len); +} + +PHP_METHOD(WP_HTML_Tag_Processor, __toString) +{ + wp_html_tag_processor_object *intern; + wp_html_api_rust_byte_slice html; + + ZEND_PARSE_PARAMETERS_NONE(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native) { + zend_throw_error(NULL, "WP_HTML_Tag_Processor is not initialized"); + RETURN_THROWS(); + } + + if (!wp_html_tag_processor_apply_lexical_updates(ZEND_THIS, intern->native)) { + RETURN_EMPTY_STRING(); + } + + if (!wp_html_api_rust_tag_processor_get_html(intern->native, &html)) { + RETURN_EMPTY_STRING(); + } + + RETURN_STRINGL((const char *) html.ptr, html.len); +} + +PHP_METHOD(WP_HTML_Processor, __construct) +{ + 
zval *html_param; + zval *unlock_param = NULL; + const char *html = ""; + size_t html_len = 0; + + ZEND_PARSE_PARAMETERS_START(1, 2) + Z_PARAM_ZVAL(html_param) + Z_PARAM_OPTIONAL + Z_PARAM_ZVAL(unlock_param) + ZEND_PARSE_PARAMETERS_END(); + + if (IS_STRING == Z_TYPE_P(html_param)) { + html = Z_STRVAL_P(html_param); + html_len = Z_STRLEN_P(html_param); + } else { + wp_html_api_rust_doing_it_wrong( + "WP_HTML_Processor::__construct", + "The HTML parameter must be a string.", + "6.9.0" + ); + } + + if (!wp_html_tag_processor_initialize(ZEND_THIS, html, html_len)) { + RETURN_THROWS(); + } +} + +static void wp_html_processor_create_initialized(INTERNAL_FUNCTION_PARAMETERS, const char *html, size_t html_len) +{ + zend_class_entry *called_scope = zend_get_called_scope(execute_data); + + if (NULL == called_scope || !instanceof_function(called_scope, wp_html_processor_ce)) { + called_scope = wp_html_processor_ce; + } + + object_init_ex(return_value, called_scope); + if (!wp_html_tag_processor_initialize(return_value, html, html_len)) { + zval_ptr_dtor(return_value); + RETURN_THROWS(); + } +} + +PHP_METHOD(WP_HTML_Processor, create_fragment) +{ + zval *html_param; + zval *context_param = NULL; + zval *encoding_param = NULL; + const char *html; + size_t html_len; + + ZEND_PARSE_PARAMETERS_START(1, 3) + Z_PARAM_ZVAL(html_param) + Z_PARAM_OPTIONAL + Z_PARAM_ZVAL(context_param) + Z_PARAM_ZVAL(encoding_param) + ZEND_PARSE_PARAMETERS_END(); + + if (IS_STRING != Z_TYPE_P(html_param)) { + wp_html_api_rust_doing_it_wrong( + "WP_HTML_Processor::create_fragment", + "The HTML parameter must be a string.", + "6.9.0" + ); + RETURN_NULL(); + } + + if ( + NULL != context_param && + !( + IS_STRING == Z_TYPE_P(context_param) && + sizeof("") - 1 == Z_STRLEN_P(context_param) && + 0 == memcmp(Z_STRVAL_P(context_param), "", sizeof("") - 1) + ) + ) { + RETURN_NULL(); + } + + if ( + NULL != encoding_param && + !( + IS_STRING == Z_TYPE_P(encoding_param) && + sizeof("UTF-8") - 1 == 
Z_STRLEN_P(encoding_param) && + 0 == memcmp(Z_STRVAL_P(encoding_param), "UTF-8", sizeof("UTF-8") - 1) + ) + ) { + RETURN_NULL(); + } + + html = Z_STRVAL_P(html_param); + html_len = Z_STRLEN_P(html_param); + wp_html_processor_create_initialized(INTERNAL_FUNCTION_PARAM_PASSTHRU, html, html_len); +}
+
+/*
+ * WP_HTML_Processor::create_full_parser( $html, $known_definite_encoding = 'UTF-8' )
+ *
+ * Returns NULL for a non-string $html (after a doing-it-wrong notice) or for
+ * any explicit encoding other than the string "UTF-8"; otherwise delegates
+ * to wp_html_processor_create_initialized().
+ */
+PHP_METHOD(WP_HTML_Processor, create_full_parser)
+{
+    zval *html_param;
+    zval *encoding_param = NULL;
+    bool encoding_supported;
+
+    ZEND_PARSE_PARAMETERS_START(1, 2)
+        Z_PARAM_ZVAL(html_param)
+        Z_PARAM_OPTIONAL
+        Z_PARAM_ZVAL(encoding_param)
+    ZEND_PARSE_PARAMETERS_END();
+
+    if (IS_STRING != Z_TYPE_P(html_param)) {
+        wp_html_api_rust_doing_it_wrong(
+            "WP_HTML_Processor::create_full_parser",
+            "The HTML parameter must be a string.",
+            "6.9.0"
+        );
+        RETURN_NULL();
+    }
+
+    /* An omitted encoding is fine; an explicit one must be exactly "UTF-8". */
+    encoding_supported =
+        NULL == encoding_param ||
+        (
+            IS_STRING == Z_TYPE_P(encoding_param) &&
+            zend_string_equals_literal(Z_STR_P(encoding_param), "UTF-8")
+        );
+    if (!encoding_supported) {
+        RETURN_NULL();
+    }
+
+    wp_html_processor_create_initialized(
+        INTERNAL_FUNCTION_PARAM_PASSTHRU,
+        Z_STRVAL_P(html_param),
+        Z_STRLEN_P(html_param)
+    );
+}
+
+/* WP_HTML_Processor::get_last_error() - stub: always NULL. */
+PHP_METHOD(WP_HTML_Processor, get_last_error)
+{
+    ZEND_PARSE_PARAMETERS_NONE();
+
+    RETURN_NULL();
+}
+
+/* WP_HTML_Processor::get_unsupported_exception() - stub: always NULL. */
+PHP_METHOD(WP_HTML_Processor, get_unsupported_exception)
+{
+    ZEND_PARSE_PARAMETERS_NONE();
+
+    RETURN_NULL();
+}
+
+/* WP_HTML_Processor::is_virtual() - stub: always false. */
+PHP_METHOD(WP_HTML_Processor, is_virtual)
+{
+    ZEND_PARSE_PARAMETERS_NONE();
+
+    RETURN_FALSE;
+}
+
+PHP_METHOD(WP_HTML_Processor, expects_closer) +{ + wp_html_tag_processor_object *intern; + wp_html_api_rust_byte_slice tag_name; + zval *node = NULL; + + ZEND_PARSE_PARAMETERS_START(0, 1) + Z_PARAM_OPTIONAL + Z_PARAM_ZVAL(node) + ZEND_PARSE_PARAMETERS_END(); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL == intern->native || !wp_html_api_rust_tag_processor_get_tag(intern->native, &tag_name)) { + RETURN_NULL(); + } + + if
(wp_html_api_rust_tag_processor_is_tag_closer(intern->native)) { + RETURN_FALSE; + } + + if ( + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "AREA") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "BASE") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "BR") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "COL") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "EMBED") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "HR") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "IMG") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "INPUT") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "LINK") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "META") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "PARAM") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "SOURCE") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "TRACK") || + wp_html_api_rust_ascii_eq_ci(tag_name.ptr, tag_name.len, "WBR") + ) { + RETURN_FALSE; + } + + RETURN_TRUE; +} + +PHP_METHOD(WP_HTML_Processor, get_breadcrumbs) +{ + wp_html_tag_processor_object *intern; + wp_html_api_rust_byte_slice tag_name; + + ZEND_PARSE_PARAMETERS_NONE(); + + array_init(return_value); + add_next_index_string(return_value, "HTML"); + add_next_index_string(return_value, "BODY"); + + intern = Z_WP_HTML_TAG_PROCESSOR_P(ZEND_THIS); + if (NULL != intern->native && wp_html_api_rust_tag_processor_get_tag(intern->native, &tag_name)) { + zend_string *upper = wp_html_api_rust_uppercase_ascii_slice(tag_name.ptr, tag_name.len); + add_next_index_str(return_value, upper); + } +} + +static const zend_function_entry wp_html_api_rust_functions[] = { + PHP_FE(wp_html_api_rust_version, arginfo_wp_html_api_rust_version) + PHP_FE(wp_html_api_rust_scan_next_tag, arginfo_wp_html_api_rust_scan_next_tag) + PHP_FE_END +}; + +static const zend_function_entry 
wp_html_tag_processor_methods[] = { + PHP_ME(WP_HTML_Tag_Processor, __construct, arginfo_wp_html_tag_processor_construct, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, next_tag, arginfo_wp_html_tag_processor_next_tag, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, next_token, arginfo_wp_html_tag_processor_bool, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_tag, arginfo_wp_html_tag_processor_get_tag, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_attribute, arginfo_wp_html_tag_processor_get_attribute, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_attribute_names_with_prefix, arginfo_wp_html_tag_processor_get_attribute_names_with_prefix, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, set_attribute, arginfo_wp_html_tag_processor_set_attribute, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, remove_attribute, arginfo_wp_html_tag_processor_remove_attribute, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, add_class, arginfo_wp_html_tag_processor_class_mutation, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, remove_class, arginfo_wp_html_tag_processor_class_mutation, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, has_class, arginfo_wp_html_tag_processor_has_class, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, class_list, arginfo_wp_html_tag_processor_class_list, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, is_tag_closer, arginfo_wp_html_tag_processor_bool, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, has_self_closing_flag, arginfo_wp_html_tag_processor_bool, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_token_name, arginfo_wp_html_tag_processor_nullable_string, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_token_type, arginfo_wp_html_tag_processor_nullable_string, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, paused_at_incomplete_token, arginfo_wp_html_tag_processor_bool, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, subdivide_text_appropriately, arginfo_wp_html_tag_processor_bool, ZEND_ACC_PUBLIC) + 
PHP_ME(WP_HTML_Tag_Processor, get_modifiable_text, arginfo_wp_html_tag_processor_nullable_string, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, native_get_script_content_type, arginfo_wp_html_tag_processor_nullable_string, ZEND_ACC_PROTECTED) + PHP_ME(WP_HTML_Tag_Processor, set_modifiable_text, arginfo_wp_html_tag_processor_set_modifiable_text, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_comment_type, arginfo_wp_html_tag_processor_nullable_string, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_doctype_info, arginfo_wp_html_tag_processor_nullable_mixed, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, set_bookmark, arginfo_wp_html_tag_processor_bookmark, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, release_bookmark, arginfo_wp_html_tag_processor_bookmark, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, has_bookmark, arginfo_wp_html_tag_processor_bookmark, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, seek, arginfo_wp_html_tag_processor_bookmark, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, change_parsing_namespace, arginfo_wp_html_tag_processor_change_namespace, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_namespace, arginfo_wp_html_tag_processor_nullable_string, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_qualified_tag_name, arginfo_wp_html_tag_processor_nullable_string, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_qualified_attribute_name, arginfo_wp_html_tag_processor_get_attribute, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_full_comment_text, arginfo_wp_html_tag_processor_nullable_string, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, get_updated_html, arginfo_wp_html_tag_processor_get_html, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Tag_Processor, __toString, arginfo_wp_html_tag_processor_get_html, ZEND_ACC_PUBLIC) + PHP_FE_END +}; + +static const zend_function_entry wp_html_processor_methods[] = { + PHP_ME(WP_HTML_Processor, __construct, arginfo_wp_html_processor_construct, ZEND_ACC_PUBLIC) + 
PHP_ME(WP_HTML_Processor, create_fragment, arginfo_wp_html_processor_create_fragment, ZEND_ACC_PUBLIC | ZEND_ACC_STATIC) + PHP_ME(WP_HTML_Processor, create_full_parser, arginfo_wp_html_processor_create_full_parser, ZEND_ACC_PUBLIC | ZEND_ACC_STATIC) + PHP_ME(WP_HTML_Processor, get_last_error, arginfo_wp_html_tag_processor_nullable_string, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Processor, get_unsupported_exception, arginfo_wp_html_tag_processor_nullable_mixed, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Processor, is_virtual, arginfo_wp_html_tag_processor_bool, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Processor, expects_closer, arginfo_wp_html_tag_processor_nullable_mixed, ZEND_ACC_PUBLIC) + PHP_ME(WP_HTML_Processor, get_breadcrumbs, arginfo_wp_html_tag_processor_class_list, ZEND_ACC_PUBLIC) + PHP_FE_END +}; + +static void wp_html_api_rust_register_tag_processor_class(void) +{ + zend_class_entry ce; + + INIT_CLASS_ENTRY(ce, "WP_HTML_Tag_Processor_Native", wp_html_tag_processor_methods); + wp_html_tag_processor_ce = zend_register_internal_class(&ce); + wp_html_tag_processor_ce->create_object = wp_html_tag_processor_create_object; + + zend_declare_class_constant_long(wp_html_tag_processor_ce, "MAX_BOOKMARKS", sizeof("MAX_BOOKMARKS") - 1, 10); + zend_declare_class_constant_long(wp_html_tag_processor_ce, "MAX_SEEK_OPS", sizeof("MAX_SEEK_OPS") - 1, 1000); + zend_declare_class_constant_bool(wp_html_tag_processor_ce, "ADD_CLASS", sizeof("ADD_CLASS") - 1, true); + zend_declare_class_constant_bool(wp_html_tag_processor_ce, "REMOVE_CLASS", sizeof("REMOVE_CLASS") - 1, false); + zend_declare_class_constant_null(wp_html_tag_processor_ce, "SKIP_CLASS", sizeof("SKIP_CLASS") - 1); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "STATE_READY", sizeof("STATE_READY") - 1, "STATE_READY"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "STATE_COMPLETE", sizeof("STATE_COMPLETE") - 1, "STATE_COMPLETE"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, 
"STATE_INCOMPLETE_INPUT", sizeof("STATE_INCOMPLETE_INPUT") - 1, "STATE_INCOMPLETE_INPUT"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "STATE_MATCHED_TAG", sizeof("STATE_MATCHED_TAG") - 1, "STATE_MATCHED_TAG"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "STATE_TEXT_NODE", sizeof("STATE_TEXT_NODE") - 1, "STATE_TEXT_NODE"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "STATE_CDATA_NODE", sizeof("STATE_CDATA_NODE") - 1, "STATE_CDATA_NODE"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "STATE_COMMENT", sizeof("STATE_COMMENT") - 1, "STATE_COMMENT"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "STATE_DOCTYPE", sizeof("STATE_DOCTYPE") - 1, "STATE_DOCTYPE"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "STATE_PRESUMPTUOUS_TAG", sizeof("STATE_PRESUMPTUOUS_TAG") - 1, "STATE_PRESUMPTUOUS_TAG"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "STATE_FUNKY_COMMENT", sizeof("STATE_FUNKY_COMMENT") - 1, "STATE_WP_FUNKY"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "COMMENT_AS_ABRUPTLY_CLOSED_COMMENT", sizeof("COMMENT_AS_ABRUPTLY_CLOSED_COMMENT") - 1, "COMMENT_AS_ABRUPTLY_CLOSED_COMMENT"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "COMMENT_AS_CDATA_LOOKALIKE", sizeof("COMMENT_AS_CDATA_LOOKALIKE") - 1, "COMMENT_AS_CDATA_LOOKALIKE"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "COMMENT_AS_HTML_COMMENT", sizeof("COMMENT_AS_HTML_COMMENT") - 1, "COMMENT_AS_HTML_COMMENT"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "COMMENT_AS_PI_NODE_LOOKALIKE", sizeof("COMMENT_AS_PI_NODE_LOOKALIKE") - 1, "COMMENT_AS_PI_NODE_LOOKALIKE"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "COMMENT_AS_INVALID_HTML", sizeof("COMMENT_AS_INVALID_HTML") - 1, "COMMENT_AS_INVALID_HTML"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "NO_QUIRKS_MODE", sizeof("NO_QUIRKS_MODE") - 1, 
"no-quirks-mode"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "QUIRKS_MODE", sizeof("QUIRKS_MODE") - 1, "quirks-mode"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "TEXT_IS_GENERIC", sizeof("TEXT_IS_GENERIC") - 1, "TEXT_IS_GENERIC"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "TEXT_IS_NULL_SEQUENCE", sizeof("TEXT_IS_NULL_SEQUENCE") - 1, "TEXT_IS_NULL_SEQUENCE"); + zend_declare_class_constant_string(wp_html_tag_processor_ce, "TEXT_IS_WHITESPACE", sizeof("TEXT_IS_WHITESPACE") - 1, "TEXT_IS_WHITESPACE"); + + zend_declare_property_null(wp_html_tag_processor_ce, "html", sizeof("html") - 1, ZEND_ACC_PROTECTED); + zend_declare_property_string(wp_html_tag_processor_ce, "parser_state", sizeof("parser_state") - 1, "STATE_READY", ZEND_ACC_PROTECTED); + zend_declare_property_string(wp_html_tag_processor_ce, "compat_mode", sizeof("compat_mode") - 1, "no-quirks-mode", ZEND_ACC_PROTECTED); + zend_declare_property_string(wp_html_tag_processor_ce, "parsing_namespace", sizeof("parsing_namespace") - 1, "html", ZEND_ACC_PRIVATE); + zend_declare_property_null(wp_html_tag_processor_ce, "comment_type", sizeof("comment_type") - 1, ZEND_ACC_PROTECTED); + zend_declare_property_string(wp_html_tag_processor_ce, "text_node_classification", sizeof("text_node_classification") - 1, "TEXT_IS_GENERIC", ZEND_ACC_PROTECTED); + zend_declare_property_null(wp_html_tag_processor_ce, "bookmarks", sizeof("bookmarks") - 1, ZEND_ACC_PROTECTED); + zend_declare_property_null(wp_html_tag_processor_ce, "lexical_updates", sizeof("lexical_updates") - 1, ZEND_ACC_PROTECTED); + + memcpy(&wp_html_tag_processor_handlers, zend_get_std_object_handlers(), sizeof(zend_object_handlers)); + wp_html_tag_processor_handlers.offset = XtOffsetOf(wp_html_tag_processor_object, std); + wp_html_tag_processor_handlers.free_obj = wp_html_tag_processor_free_obj; +} + +static void wp_html_api_rust_register_processor_class(void) +{ + zend_class_entry ce; + + INIT_CLASS_ENTRY(ce, 
"WP_HTML_Processor", wp_html_processor_methods); + wp_html_processor_ce = zend_register_internal_class_ex(&ce, wp_html_tag_processor_ce); + wp_html_processor_ce->create_object = wp_html_tag_processor_create_object; +} + +PHP_MINIT_FUNCTION(wp_html_api_rust) +{ + REGISTER_INI_ENTRIES(); + + if (INI_BOOL("wp_html_api_rust.replace_html_api")) { + wp_html_api_rust_register_tag_processor_class(); + } + + return SUCCESS; +} + +PHP_MSHUTDOWN_FUNCTION(wp_html_api_rust) +{ + UNREGISTER_INI_ENTRIES(); + + return SUCCESS; +} + +PHP_MINFO_FUNCTION(wp_html_api_rust) +{ + php_info_print_table_start(); + php_info_print_table_header(2, "wp_html_api_rust support", "enabled"); + php_info_print_table_row(2, "extension version", PHP_WP_HTML_API_RUST_VERSION); + php_info_print_table_row(2, "rust core version", wp_html_api_rust_core_version()); + php_info_print_table_row(2, "replace HTML API classes", INI_BOOL("wp_html_api_rust.replace_html_api") ? "enabled" : "disabled"); + php_info_print_table_end(); +} + +zend_module_entry wp_html_api_rust_module_entry = { + STANDARD_MODULE_HEADER, + "wp_html_api_rust", + wp_html_api_rust_functions, + PHP_MINIT(wp_html_api_rust), + PHP_MSHUTDOWN(wp_html_api_rust), + NULL, + NULL, + PHP_MINFO(wp_html_api_rust), + PHP_WP_HTML_API_RUST_VERSION, + STANDARD_MODULE_PROPERTIES +}; + +#ifdef COMPILE_DL_WP_HTML_API_RUST +# ifdef ZTS +ZEND_TSRMLS_CACHE_DEFINE() +# endif +ZEND_GET_MODULE(wp_html_api_rust) +#endif diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 35d91fad3129c..36b936ac8d576 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -7,6 +7,10 @@ * @since 6.4.0 */ +if ( class_exists( 'WP_HTML_Processor', false ) ) { + return; +} + /** * Core class used to safely parse and modify an HTML document. 
* @@ -5617,7 +5621,7 @@ public function seek( $bookmark_name ): bool { $actual_bookmark_name = "_{$bookmark_name}"; $processor_started_at = $this->state->current_token ? $this->bookmarks[ $this->state->current_token->bookmark_name ]->start - : 0; + : ( WP_HTML_Tag_Processor::STATE_COMPLETE === $this->parser_state ? strlen( $this->html ) : 0 ); $bookmark_starts_at = $this->bookmarks[ $actual_bookmark_name ]->start; $direction = $bookmark_starts_at > $processor_started_at ? 'forward' : 'backward'; @@ -5730,7 +5734,7 @@ public function seek( $bookmark_name ): bool { * The processor will stop on virtual tokens, but bookmarks may not be set on them. * They should not be matched when seeking a bookmark, skip them. */ - if ( $this->is_virtual() ) { + if ( ! isset( $this->state->current_token ) || $this->is_virtual() ) { continue; } if ( $bookmark_starts_at === $this->bookmarks[ $this->state->current_token->bookmark_name ]->start ) { diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 77c1a471db5b1..e78d87264a78e 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -21,6 +21,24 @@ * @since 6.2.0 */ +if ( class_exists( 'WP_HTML_Tag_Processor_Native', false ) ) { + if ( class_exists( 'WP_HTML_Tag_Processor', false ) ) { + return; + } + + class WP_HTML_Tag_Processor extends WP_HTML_Tag_Processor_Native { + private function get_script_content_type(): ?string { + return $this->native_get_script_content_type(); + } + } + + return; +} + +if ( class_exists( 'WP_HTML_Tag_Processor', false ) ) { + return; +} + /** * Core class used to modify attributes in an HTML document for tags matching a query. 
* From a02af216454b41a940aaefd87e3287fab80e1721 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 13:26:06 +0200 Subject: [PATCH 002/486] Normalize TEXTAREA modifiable text newlines --- ext/html-api-rust/src/lib.rs | 63 ++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/ext/html-api-rust/src/lib.rs b/ext/html-api-rust/src/lib.rs index 515f8775bef20..068a5feb54728 100644 --- a/ext/html-api-rust/src/lib.rs +++ b/ext/html-api-rust/src/lib.rs @@ -918,8 +918,9 @@ impl TagProcessor { } else if eq_ignore_ascii_case(tag_name, b"STYLE") { escape_rawtext_closer(plaintext, b"style", b"\\3c\\2f") } else if eq_ignore_ascii_case(tag_name, b"TEXTAREA") { - let mut escaped = escape_rcdata_closer(plaintext, b"textarea"); - if matches!(escaped.first(), Some(b'\n' | b'\r')) { + let normalized = normalize_newlines(plaintext); + let mut escaped = escape_rcdata_closer(&normalized, b"textarea"); + if matches!(escaped.first(), Some(b'\n')) { let mut with_extra_newline = Vec::with_capacity(escaped.len() + 1); with_extra_newline.push(b'\n'); with_extra_newline.extend_from_slice(&escaped); @@ -1559,6 +1560,24 @@ fn strip_initial_newline(input: &[u8]) -> &[u8] { input } +fn normalize_newlines(input: &[u8]) -> Vec { + let mut normalized = Vec::with_capacity(input.len()); + let mut at = 0; + + while at < input.len() { + if input[at] == b'\r' { + normalized.push(b'\n'); + at += if input.get(at + 1) == Some(&b'\n') { 2 } else { 1 }; + continue; + } + + normalized.push(input[at]); + at += 1; + } + + normalized +} + fn pi_target_span(html: &[u8], scan: TagScan) -> Option<(usize, usize)> { if scan.token_type != TOKEN_TYPE_COMMENT { return None; @@ -2388,6 +2407,46 @@ mod tests { ); } + #[test] + fn set_modifiable_text_normalizes_textarea_newlines() { + for (plaintext, expected_html, expected_text) in [ + ( + &b"\rCR"[..], + &b""[..], + &b"\nCR"[..], + ), + ( + &b"\r\nCR-N"[..], + &b""[..], + &b"\nCR-N"[..], + ), + ] { + let mut 
processor = TagProcessor { + html: b"".to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag( + &mut processor, + b"textarea".as_ptr(), + b"textarea".len(), + false, + ) + }); + assert!(processor.set_modifiable_text(plaintext)); + + let scan = processor.current.unwrap(); + assert_eq!(&processor.html, expected_html); + assert_eq!(processor.current_modifiable_text(scan).unwrap(), expected_text); + } + } + #[test] fn scanner_closes_incorrectly_closed_comments() { let html = b"-->"; From e5763cd678a9facc6598cee6e0b995bf9b54110c Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 13:27:19 +0200 Subject: [PATCH 003/486] Guard processor modifiable text namespace --- .../html-api/class-wp-html-processor.php | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 36b936ac8d576..3e66d7ba1c01d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5302,6 +5302,25 @@ public function has_self_closing_flag(): bool { return $this->is_virtual() ? false : parent::has_self_closing_flag(); } + /** + * Sets the modifiable text for the matched token, if matched. + * + * @since 6.9.0 Subclassed for the HTML Processor. + * + * @param string $plaintext_content New text content to represent in the matched token. + * @return bool Whether the text was able to update. + */ + public function set_modifiable_text( string $plaintext_content ): bool { + if ( + self::STATE_MATCHED_TAG === $this->parser_state && + 'html' !== $this->get_namespace() + ) { + return false; + } + + return parent::set_modifiable_text( $plaintext_content ); + } + /** * Returns the node name represented by the token. 
* From 1018acb8ab60b937fd0f36015cb2ae9bb03e0967 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 13:28:23 +0200 Subject: [PATCH 004/486] Restore parser state when seeking retained token --- ext/html-api-rust/wp_html_api_rust.c | 1 + 1 file changed, 1 insertion(+) diff --git a/ext/html-api-rust/wp_html_api_rust.c b/ext/html-api-rust/wp_html_api_rust.c index 63d276cb84aa0..8c81c18d95ff0 100644 --- a/ext/html-api-rust/wp_html_api_rust.c +++ b/ext/html-api-rust/wp_html_api_rust.c @@ -1994,6 +1994,7 @@ PHP_METHOD(WP_HTML_Tag_Processor, seek) token_start == (size_t) bookmark_start && token_length == (size_t) bookmark_length ) { + wp_html_tag_processor_update_parser_state_from_native(ZEND_THIS, intern->native); RETURN_TRUE; } From 89a241e2b6941449c854aa22ea15e25e5ec24777 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 13:32:20 +0200 Subject: [PATCH 005/486] Honor quirks mode in native class helpers --- ext/html-api-rust/src/lib.rs | 169 ++++++++++++++++++++------- ext/html-api-rust/wp_html_api_rust.c | 51 ++++++-- 2 files changed, 168 insertions(+), 52 deletions(-) diff --git a/ext/html-api-rust/src/lib.rs b/ext/html-api-rust/src/lib.rs index 068a5feb54728..1ec9cf7db0820 100644 --- a/ext/html-api-rust/src/lib.rs +++ b/ext/html-api-rust/src/lib.rs @@ -604,6 +604,7 @@ pub unsafe extern "C" fn wp_html_api_rust_tag_processor_add_class( processor: *mut TagProcessor, class_name: *const u8, class_name_len: usize, + quirks_mode: bool, ) -> bool { let Some(processor) = processor.as_mut() else { return false; @@ -614,7 +615,7 @@ pub unsafe extern "C" fn wp_html_api_rust_tag_processor_add_class( } let class_name = slice::from_raw_parts(class_name, class_name_len); - processor.add_class(class_name) + processor.add_class(class_name, quirks_mode) } #[no_mangle] @@ -622,6 +623,7 @@ pub unsafe extern "C" fn wp_html_api_rust_tag_processor_remove_class( processor: *mut TagProcessor, class_name: *const u8, class_name_len: usize, + quirks_mode: bool, ) -> 
bool { let Some(processor) = processor.as_mut() else { return false; @@ -632,7 +634,7 @@ pub unsafe extern "C" fn wp_html_api_rust_tag_processor_remove_class( } let class_name = slice::from_raw_parts(class_name, class_name_len); - processor.remove_class(class_name) + processor.remove_class(class_name, quirks_mode) } #[no_mangle] @@ -640,6 +642,7 @@ pub unsafe extern "C" fn wp_html_api_rust_tag_processor_has_class( processor: *mut TagProcessor, class_name: *const u8, class_name_len: usize, + quirks_mode: bool, ) -> u8 { let Some(processor) = processor.as_mut() else { return 0; @@ -650,19 +653,20 @@ pub unsafe extern "C" fn wp_html_api_rust_tag_processor_has_class( } let class_name = slice::from_raw_parts(class_name, class_name_len); - processor.has_class(class_name) + processor.has_class(class_name, quirks_mode) } #[no_mangle] pub unsafe extern "C" fn wp_html_api_rust_tag_processor_class_list( processor: *mut TagProcessor, out: *mut ByteSlice, + quirks_mode: bool, ) -> u8 { let Some(processor) = processor.as_mut() else { return 0; }; - if out.is_null() || !processor.class_list() { + if out.is_null() || !processor.class_list(quirks_mode) { return 0; } @@ -702,6 +706,11 @@ enum AttributeValue { String, } +struct ClassEntry { + name: Vec, + comparable: Vec, +} + impl TagProcessor { fn current_modifiable_text(&self, scan: TagScan) -> Option> { match scan.token_type { @@ -1107,7 +1116,7 @@ impl TagProcessor { removed } - fn add_class(&mut self, class_name: &[u8]) -> bool { + fn add_class(&mut self, class_name: &[u8], quirks_mode: bool) -> bool { let Some(scan) = self.current else { return false; }; @@ -1117,29 +1126,32 @@ impl TagProcessor { } let normalized_class_name = normalize_class_bytes(class_name); - let mut classes = self.current_classes(); - if !classes.iter().any(|class| class.as_slice() == normalized_class_name.as_slice()) { - match self.get_attribute(b"class") { - AttributeValue::String => { - let mut value = self.scratch.clone(); - 
trim_html_whitespace_in_place(&mut value); - if !value.is_empty() { - value.push(b' '); - } - value.extend_from_slice(&normalized_class_name); - return self.set_attribute(b"class", &value, 2); - } - AttributeValue::Boolean | AttributeValue::Missing => { - classes.push(normalized_class_name); + let comparable_class_name = comparable_class_bytes(&normalized_class_name, quirks_mode); + if self + .current_class_entries(quirks_mode) + .iter() + .any(|class| class.comparable.as_slice() == comparable_class_name.as_slice()) + { + return true; + } + + match self.get_attribute(b"class") { + AttributeValue::String => { + let mut value = self.scratch.clone(); + trim_html_whitespace_in_place(&mut value); + if !value.is_empty() { + value.push(b' '); } + value.extend_from_slice(&normalized_class_name); + self.set_attribute(b"class", &value, 2) + } + AttributeValue::Boolean | AttributeValue::Missing => { + self.set_attribute(b"class", &normalized_class_name, 2) } } - - let value = join_classes(&classes); - self.set_attribute(b"class", &value, 2) } - fn remove_class(&mut self, class_name: &[u8]) -> bool { + fn remove_class(&mut self, class_name: &[u8], quirks_mode: bool) -> bool { let Some(scan) = self.current else { return false; }; @@ -1149,12 +1161,18 @@ impl TagProcessor { } let normalized_class_name = normalize_class_bytes(class_name); - let classes: Vec> = self - .current_classes() - .into_iter() - .filter(|class| class.as_slice() != normalized_class_name.as_slice()) + let comparable_class_name = comparable_class_bytes(&normalized_class_name, quirks_mode); + let entries = self.current_class_entries(quirks_mode); + let classes: Vec> = entries + .iter() + .filter(|class| class.comparable.as_slice() != comparable_class_name.as_slice()) + .map(|class| class.name.clone()) .collect(); + if classes.len() == entries.len() { + return true; + } + if classes.is_empty() { let _ = self.remove_attribute(b"class"); return true; @@ -1164,7 +1182,7 @@ impl TagProcessor { 
self.set_attribute(b"class", &value, 2) } - fn has_class(&mut self, class_name: &[u8]) -> u8 { + fn has_class(&mut self, class_name: &[u8], quirks_mode: bool) -> u8 { let Some(scan) = self.current else { return 0; }; @@ -1174,11 +1192,12 @@ impl TagProcessor { } let normalized_class_name = normalize_class_bytes(class_name); + let comparable_class_name = comparable_class_bytes(&normalized_class_name, quirks_mode); if self - .current_classes() - .iter() - .any(|class| class.as_slice() == normalized_class_name.as_slice()) + .current_class_entries(quirks_mode) + .into_iter() + .any(|class| class.comparable.as_slice() == comparable_class_name.as_slice()) { 2 } else { @@ -1186,7 +1205,7 @@ impl TagProcessor { } } - fn class_list(&mut self) -> bool { + fn class_list(&mut self, quirks_mode: bool) -> bool { let Some(scan) = self.current else { return false; }; @@ -1195,19 +1214,23 @@ impl TagProcessor { return false; } - let classes = self.current_classes(); + let classes = self.current_class_entries(quirks_mode); self.scratch.clear(); for class in classes { if !self.scratch.is_empty() { self.scratch.push(0x1f); } - self.scratch.extend_from_slice(&class); + self.scratch.extend_from_slice(if quirks_mode { + &class.comparable + } else { + &class.name + }); } true } - fn current_classes(&mut self) -> Vec> { + fn current_class_entries(&mut self, quirks_mode: bool) -> Vec { let value = match self.get_attribute(b"class") { AttributeValue::String => self.scratch.clone(), AttributeValue::Boolean | AttributeValue::Missing => Vec::new(), @@ -1215,10 +1238,19 @@ impl TagProcessor { let mut classes = Vec::new(); for class in value.split(|byte| is_html_whitespace(*byte)) { - if class.is_empty() || classes.iter().any(|seen: &Vec| seen.as_slice() == class) { + if class.is_empty() { continue; } - classes.push(normalize_class_bytes(class)); + + let name = normalize_class_bytes(class); + let comparable = comparable_class_bytes(&name, quirks_mode); + if classes + .iter() + .any(|seen: 
&ClassEntry| seen.comparable.as_slice() == comparable.as_slice()) + { + continue; + } + classes.push(ClassEntry { name, comparable }); } classes @@ -2181,6 +2213,14 @@ fn normalize_class_bytes(class_name: &[u8]) -> Vec { output } +fn comparable_class_bytes(class_name: &[u8], quirks_mode: bool) -> Vec { + if quirks_mode { + ascii_lowercase_vec(class_name) + } else { + class_name.to_vec() + } +} + fn ascii_lowercase_vec(value: &[u8]) -> Vec { value.iter().map(u8::to_ascii_lowercase).collect() } @@ -2677,11 +2717,11 @@ mod tests { super::wp_html_api_rust_tag_processor_next_tag(&mut processor, ptr::null(), 0, false) }); - assert_eq!(processor.has_class(b"one"), 2); - assert_eq!(processor.has_class(b"three"), 1); - assert!(processor.add_class(b"three")); - assert!(processor.remove_class(b"two")); - assert!(processor.class_list()); + assert_eq!(processor.has_class(b"one", false), 2); + assert_eq!(processor.has_class(b"three", false), 1); + assert!(processor.add_class(b"three", false)); + assert!(processor.remove_class(b"two", false)); + assert!(processor.class_list(false)); assert_eq!(processor.scratch, b"one\x1fthree"); assert_eq!( std::str::from_utf8(&processor.html).unwrap(), @@ -2689,6 +2729,49 @@ mod tests { ); } + #[test] + fn tag_processor_matches_classes_case_insensitively_in_quirks_mode() { + let mut processor = TagProcessor { + html: "" + .as_bytes() + .to_vec(), + offset: 0, + current: None, + scratch: Vec::new(), + paused_at_incomplete: false, + inserted_attributes: Vec::new(), + parsing_namespace: NAMESPACE_HTML, + }; + + assert!(unsafe { + super::wp_html_api_rust_tag_processor_next_tag(&mut processor, ptr::null(), 0, false) + }); + + assert_eq!(processor.has_class(b"upper", true), 2); + assert!(processor.add_class(b"upper", true)); + assert_eq!( + std::str::from_utf8(&processor.html).unwrap(), + "" + ); + + assert!(processor.add_class(b"ANOTHER-UPPER", true)); + assert!(processor.class_list(true)); + assert_eq!( + processor.scratch, + 
b"upper\x1fa\x1fe\xcc\x81\x1fanother-upper" + ); + assert_eq!( + std::str::from_utf8(&processor.html).unwrap(), + "" + ); + + assert!(processor.remove_class(b"upper", true)); + assert_eq!( + std::str::from_utf8(&processor.html).unwrap(), + "" + ); + } + #[test] fn tag_processor_trims_class_edges_when_adding_class() { let mut processor = TagProcessor { @@ -2705,7 +2788,7 @@ mod tests { super::wp_html_api_rust_tag_processor_next_tag(&mut processor, ptr::null(), 0, false) }); - assert!(processor.add_class(b"foo-class")); + assert!(processor.add_class(b"foo-class", false)); assert_eq!( std::str::from_utf8(&processor.html).unwrap(), r#"
"# diff --git a/ext/html-api-rust/wp_html_api_rust.c b/ext/html-api-rust/wp_html_api_rust.c index 8c81c18d95ff0..69b0682bb46d0 100644 --- a/ext/html-api-rust/wp_html_api_rust.c +++ b/ext/html-api-rust/wp_html_api_rust.c @@ -119,21 +119,25 @@ extern bool wp_html_api_rust_tag_processor_remove_attribute( extern bool wp_html_api_rust_tag_processor_add_class( void *processor, const unsigned char *class_name, - size_t class_name_len + size_t class_name_len, + bool quirks_mode ); extern bool wp_html_api_rust_tag_processor_remove_class( void *processor, const unsigned char *class_name, - size_t class_name_len + size_t class_name_len, + bool quirks_mode ); extern unsigned char wp_html_api_rust_tag_processor_has_class( void *processor, const unsigned char *class_name, - size_t class_name_len + size_t class_name_len, + bool quirks_mode ); extern unsigned char wp_html_api_rust_tag_processor_class_list( void *processor, - wp_html_api_rust_byte_slice *out + wp_html_api_rust_byte_slice *out, + bool quirks_mode ); extern bool wp_html_api_rust_tag_processor_get_html( const void *processor, @@ -556,6 +560,27 @@ static bool wp_html_tag_processor_parser_state_is_terminal(zval *object) ); } +static bool wp_html_tag_processor_is_quirks_mode(zval *object) +{ + zval rv; + zval *compat_mode; + + compat_mode = zend_read_property( + wp_html_tag_processor_ce, + Z_OBJ_P(object), + "compat_mode", + sizeof("compat_mode") - 1, + 1, + &rv + ); + + return ( + IS_STRING == Z_TYPE_P(compat_mode) && + sizeof("quirks-mode") - 1 == Z_STRLEN_P(compat_mode) && + 0 == memcmp(Z_STRVAL_P(compat_mode), "quirks-mode", sizeof("quirks-mode") - 1) + ); +} + static int wp_html_api_rust_compare_text_replacements(const void *left_ptr, const void *right_ptr) { const wp_html_api_rust_text_replacement *left = (const wp_html_api_rust_text_replacement *) left_ptr; @@ -982,7 +1007,8 @@ PHP_METHOD(WP_HTML_Tag_Processor, next_tag) 2 != wp_html_api_rust_tag_processor_has_class( intern->native, (const unsigned char *) 
ZSTR_VAL(query_class_name), - ZSTR_LEN(query_class_name) + ZSTR_LEN(query_class_name), + wp_html_tag_processor_is_quirks_mode(ZEND_THIS) ) ) { continue; @@ -1318,7 +1344,8 @@ PHP_METHOD(WP_HTML_Tag_Processor, add_class) bool result = wp_html_api_rust_tag_processor_add_class( intern->native, (const unsigned char *) class_name, - class_name_len + class_name_len, + wp_html_tag_processor_is_quirks_mode(ZEND_THIS) ); if (result) { @@ -1371,7 +1398,8 @@ PHP_METHOD(WP_HTML_Tag_Processor, remove_class) bool result = wp_html_api_rust_tag_processor_remove_class( intern->native, (const unsigned char *) class_name, - class_name_len + class_name_len, + wp_html_tag_processor_is_quirks_mode(ZEND_THIS) ); if (result) { @@ -1418,7 +1446,8 @@ PHP_METHOD(WP_HTML_Tag_Processor, has_class) result = wp_html_api_rust_tag_processor_has_class( intern->native, (const unsigned char *) class_name, - class_name_len + class_name_len, + wp_html_tag_processor_is_quirks_mode(ZEND_THIS) ); if (0 == result) { @@ -1447,7 +1476,11 @@ PHP_METHOD(WP_HTML_Tag_Processor, class_list) RETURN_THROWS(); } - if (!wp_html_api_rust_tag_processor_class_list(intern->native, &classes)) { + if (!wp_html_api_rust_tag_processor_class_list( + intern->native, + &classes, + wp_html_tag_processor_is_quirks_mode(ZEND_THIS) + )) { RETURN_NULL(); } From df60ae00808112eed93cb8f5adb11f7b437452dc Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 13:33:34 +0200 Subject: [PATCH 006/486] Use adjusted tag names for processor tokens --- src/wp-includes/html-api/class-wp-html-processor.php | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-processor.php b/src/wp-includes/html-api/class-wp-html-processor.php index 3e66d7ba1c01d..d4e62f6238b5d 100644 --- a/src/wp-includes/html-api/class-wp-html-processor.php +++ b/src/wp-includes/html-api/class-wp-html-processor.php @@ -5342,8 +5342,12 @@ public function set_modifiable_text( string $plaintext_content ): bool 
{ * @return string|null Name of the matched token. */ public function get_token_name(): ?string { - return $this->is_virtual() - ? $this->current_element->token->node_name + if ( $this->is_virtual() ) { + return $this->current_element->token->node_name; + } + + return '#tag' === parent::get_token_type() + ? $this->get_tag() : parent::get_token_name(); } From e8e2ea1c3219f52ca8c3a4d0c6da057fb4f49c02 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 13:35:40 +0200 Subject: [PATCH 007/486] Limit atomic text scanning to HTML namespace --- ext/html-api-rust/src/lib.rs | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/ext/html-api-rust/src/lib.rs b/ext/html-api-rust/src/lib.rs index 1ec9cf7db0820..4eb1f0bd967a5 100644 --- a/ext/html-api-rust/src/lib.rs +++ b/ext/html-api-rust/src/lib.rs @@ -1884,7 +1884,10 @@ fn scan_next_token_in_namespace(html: &[u8], offset: usize, namespace: u8) -> Sc token_type: TOKEN_TYPE_TAG, }; - if !is_closing && is_special_atomic_tag(&html[name_start..name_end]) { + if namespace == NAMESPACE_HTML + && !is_closing + && is_special_atomic_tag(&html[name_start..name_end]) + { let Some(closer_end) = find_special_closer(html, tag_end, &html[name_start..name_end]) else { return ScanResult::Incomplete; }; @@ -2349,7 +2352,10 @@ fn decode_character_reference(input: &[u8]) -> Option<(char, usize)> { #[cfg(test)] mod tests { - use super::{scan_next_tag, AttributeValue, TagProcessor, TagScan, NAMESPACE_HTML, TOKEN_TYPE_TAG}; + use super::{ + scan_next_tag, scan_next_token_in_namespace, AttributeValue, ScanResult, TagProcessor, + TagScan, NAMESPACE_FOREIGN, NAMESPACE_HTML, TOKEN_TYPE_TAG, + }; use std::ptr; #[test] @@ -2417,6 +2423,27 @@ mod tests { assert_eq!(&html[div.name_start..div.name_start + div.name_len], b"div"); } + #[test] + fn scanner_does_not_extend_foreign_script_to_closer() { + let html = b"
'), + '
', +); +assert.equal( + WP_HTML_Processor.normalize("
"), + "
", +); + const selectOptionProcessor = WP_HTML_Processor.create_fragment(""); assert.equal(selectOptionProcessor.next_tag({ breadcrumbs: ["SELECT", "OPTION"], match_offset: 2 }), true); assert.deepEqual(selectOptionProcessor.get_breadcrumbs(), ["HTML", "BODY", "SELECT", "OPTION"]); diff --git a/ext/html-api-rust/wasm/wp-html-api-rust.js b/ext/html-api-rust/wasm/wp-html-api-rust.js index 3174dafd51593..e230e9019518a 100644 --- a/ext/html-api-rust/wasm/wp-html-api-rust.js +++ b/ext/html-api-rust/wasm/wp-html-api-rust.js @@ -217,6 +217,17 @@ const TABLE_ROW_MODE_IGNORED_END_TAGS = new Set([ "TH", ]); const TABLE_CELL_ELEMENTS = new Set(["TD", "TH"]); +const FORM_TABLE_DESCENDANT_ELEMENTS = new Set([ + "CAPTION", + "COLGROUP", + "TABLE", + "TBODY", + "TD", + "TFOOT", + "TH", + "THEAD", + "TR", +]); const TABLE_CELL_BOUNDARY_START_TAGS = new Set([ "CAPTION", "COL", @@ -1849,6 +1860,20 @@ export function createHtmlApi(wasm) { existingIndex = this.#findOpenElementBeforeBoundary("LI", LIST_ITEM_SCOPE_BOUNDARIES); } + if ( + allowVirtualPreclosures && + tagName === "FORM" && + closingNamespace === "html" && + existingIndex !== -1 && + this.#hasOnlyTableElementsAfter(existingIndex) + ) { + this.current_token_namespace = this.current_namespace; + this.breadcrumbs = [...this.open_elements]; + this.#queueVirtualPopsFrom(existingIndex + 1); + this.skip_current_token = true; + return; + } + if (this.#shouldBailUnsupportedFormCloser(tagName, closingNamespace, existingIndex)) { this.#bailUnsupported("Cannot close a FORM when other elements remain open as this would throw off the breadcrumbs for the following tokens."); return; @@ -1954,6 +1979,18 @@ export function createHtmlApi(wasm) { return; } + if ( + this.current_namespace === "html" && + tagName === "FORM" && + !this.#hasOpenHtmlElement("TEMPLATE") && + this.#hasOpenHtmlElement("FORM") + ) { + this.current_token_namespace = this.current_namespace; + this.breadcrumbs = [...this.open_elements]; + this.skip_current_token = true; 
+ return; + } + if ( this.is_full_parser && this.encoding_confidence === "tentative" && @@ -3403,6 +3440,23 @@ export function createHtmlApi(wasm) { return false; } + #hasOnlyTableElementsAfter(index) { + if (index >= this.open_elements.length - 1) { + return false; + } + + for (let i = index + 1; i < this.open_elements.length; i += 1) { + if ( + this.open_element_namespaces[i] !== "html" || + !FORM_TABLE_DESCENDANT_ELEMENTS.has(this.open_elements[i]) + ) { + return false; + } + } + + return true; + } + #shouldBailUnsupportedAdoptionAgency(tagName, namespaceName, formattingElementIndex) { return ( namespaceName === "html" && From f236ac5a0cc397beb4d61706dea313a221b9aeee Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:46:18 +0200 Subject: [PATCH 102/486] Limit WASM active formatting reconstruction --- ext/html-api-rust/wasm/smoke-test.mjs | 16 +++++++ ext/html-api-rust/wasm/wp-html-api-rust.js | 50 +++++++++++++++++++++- 2 files changed, 65 insertions(+), 1 deletion(-) diff --git a/ext/html-api-rust/wasm/smoke-test.mjs b/ext/html-api-rust/wasm/smoke-test.mjs index f7d8239ac9a23..3e7069b349f1e 100644 --- a/ext/html-api-rust/wasm/smoke-test.mjs +++ b/ext/html-api-rust/wasm/smoke-test.mjs @@ -348,6 +348,22 @@ assert.equal( '

One

Two

', ); +const repeatedFormattingProcessor = WP_HTML_Processor.create_full_parser("

x"); +while (repeatedFormattingProcessor.next_token()) { + if ( + repeatedFormattingProcessor.get_token_type() === "#text" && + repeatedFormattingProcessor.get_modifiable_text() === "x" + ) { + break; + } +} +assert.deepEqual(repeatedFormattingProcessor.get_breadcrumbs(), ["HTML", "BODY", "P", "B", "B", "B", "#text"]); +repeatedFormattingProcessor.destroy(); +assert.equal( + WP_HTML_Processor.normalize("

x"), + "

x

", +); + const closedFormattingProcessor = WP_HTML_Processor.create_fragment("one

two"); assert.equal(closedFormattingProcessor.next_tag("b"), true); assert.equal(closedFormattingProcessor.next_tag({ tag_name: "b", tag_closers: "visit" }), true); diff --git a/ext/html-api-rust/wasm/wp-html-api-rust.js b/ext/html-api-rust/wasm/wp-html-api-rust.js index e230e9019518a..77dc97fd10e8f 100644 --- a/ext/html-api-rust/wasm/wp-html-api-rust.js +++ b/ext/html-api-rust/wasm/wp-html-api-rust.js @@ -2023,7 +2023,7 @@ export function createHtmlApi(wasm) { this.open_elements.push(tagName); this.open_element_namespaces.push(this.current_token_namespace); if (this.current_token_namespace === "html" && FORMATTING_ELEMENTS.has(tagName)) { - this.active_formatting_elements.push(this.#createActiveFormattingElement(tagName)); + this.#insertActiveFormattingElement(this.#createActiveFormattingElement(tagName)); } this.breadcrumbs = [...this.open_elements]; @@ -2195,6 +2195,54 @@ export function createHtmlApi(wasm) { }; } + #insertActiveFormattingElement(entry) { + let equivalentEntries = 0; + for (let i = this.active_formatting_elements.length - 1; i >= 0; i -= 1) { + if (!this.#activeFormattingElementsAreEquivalent(entry, this.active_formatting_elements[i])) { + continue; + } + + equivalentEntries += 1; + if (equivalentEntries === 3) { + this.active_formatting_elements.splice(i, 1); + break; + } + } + + this.active_formatting_elements.push(entry); + } + + #activeFormattingElementsAreEquivalent(left, right) { + if ( + left.tagName !== right.tagName || + left.namespaceName !== right.namespaceName || + left.attributes.length !== right.attributes.length + ) { + return false; + } + + const rightAttributes = new Map(); + for (const attribute of right.attributes) { + rightAttributes.set( + this.#activeFormattingAttributeName(right, attribute), + attribute.value, + ); + } + + for (const attribute of left.attributes) { + const attributeName = this.#activeFormattingAttributeName(left, attribute); + if (!rightAttributes.has(attributeName) || rightAttributes.get(attributeName) 
!== attribute.value) { + return false; + } + } + + return true; + } + + #activeFormattingAttributeName(entry, attribute) { + return entry.namespaceName === "html" ? asciiLower(attribute.name) : attribute.name; + } + #cloneActiveFormattingElement(entry) { return { tagName: entry.tagName, From 1a6554309d6f5aafd1e9ce87aae70aa0753134ff Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:47:48 +0200 Subject: [PATCH 103/486] Close paragraphs before WASM listing tags --- ext/html-api-rust/wasm/smoke-test.mjs | 7 +++++++ ext/html-api-rust/wasm/wp-html-api-rust.js | 1 + 2 files changed, 8 insertions(+) diff --git a/ext/html-api-rust/wasm/smoke-test.mjs b/ext/html-api-rust/wasm/smoke-test.mjs index 3e7069b349f1e..e29cf7694ecaf 100644 --- a/ext/html-api-rust/wasm/smoke-test.mjs +++ b/ext/html-api-rust/wasm/smoke-test.mjs @@ -680,6 +680,13 @@ assert.deepEqual(paragraphProcessor.get_breadcrumbs(), ["HTML", "BODY", "P"]); assert.equal(paragraphProcessor.get_attribute("target"), true); paragraphProcessor.destroy(); +const listingClosesParagraphProcessor = WP_HTML_Processor.create_full_parser("

foo

bar

baz"); +assert.equal(listingClosesParagraphProcessor.next_tag("listing"), true); +assert.deepEqual(listingClosesParagraphProcessor.get_breadcrumbs(), ["HTML", "BODY", "LISTING"]); +assert.equal(listingClosesParagraphProcessor.next_tag("p"), true); +assert.deepEqual(listingClosesParagraphProcessor.get_breadcrumbs(), ["HTML", "BODY", "LISTING", "P"]); +listingClosesParagraphProcessor.destroy(); + const articleProcessor = WP_HTML_Processor.create_fragment("

"); assert.equal(articleProcessor.next_tag("article"), true); assert.deepEqual(articleProcessor.get_breadcrumbs(), ["HTML", "BODY", "ARTICLE"]); diff --git a/ext/html-api-rust/wasm/wp-html-api-rust.js b/ext/html-api-rust/wasm/wp-html-api-rust.js index 77dc97fd10e8f..b00f31cd74115 100644 --- a/ext/html-api-rust/wasm/wp-html-api-rust.js +++ b/ext/html-api-rust/wasm/wp-html-api-rust.js @@ -324,6 +324,7 @@ const P_CLOSING_START_TAGS = new Set([ "HGROUP", "HR", "LI", + "LISTING", "MAIN", "MENU", "NAV", From 376e5e5f49da84e5f99a2287e3fcb7c483a403de Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:50:54 +0200 Subject: [PATCH 104/486] Handle WASM menuitem parser quirks --- ext/html-api-rust/wasm/smoke-test.mjs | 11 +++++++++++ ext/html-api-rust/wasm/wp-html-api-rust.js | 19 ++++++++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/ext/html-api-rust/wasm/smoke-test.mjs b/ext/html-api-rust/wasm/smoke-test.mjs index e29cf7694ecaf..d0654aa3f085f 100644 --- a/ext/html-api-rust/wasm/smoke-test.mjs +++ b/ext/html-api-rust/wasm/smoke-test.mjs @@ -364,6 +364,11 @@ assert.equal( "

x

", ); +const menuitemReconstructsFormattingProcessor = WP_HTML_Processor.create_full_parser("

"); +assert.equal(menuitemReconstructsFormattingProcessor.next_tag("menuitem"), true); +assert.deepEqual(menuitemReconstructsFormattingProcessor.get_breadcrumbs(), ["HTML", "BODY", "B", "MENUITEM"]); +menuitemReconstructsFormattingProcessor.destroy(); + const closedFormattingProcessor = WP_HTML_Processor.create_fragment("one

two"); assert.equal(closedFormattingProcessor.next_tag("b"), true); assert.equal(closedFormattingProcessor.next_tag({ tag_name: "b", tag_closers: "visit" }), true); @@ -705,6 +710,12 @@ assert.deepEqual(buttonProcessor.get_breadcrumbs(), ["HTML", "BODY", "BUTTON"]); assert.equal(buttonProcessor.get_attribute("three"), true); buttonProcessor.destroy(); +const selectMenuitemProcessor = WP_HTML_Processor.create_full_parser(""); +assert.equal(selectMenuitemProcessor.next_tag("select"), true); +assert.equal(selectMenuitemProcessor.next_tag("menuitem"), false); +assert.equal(selectMenuitemProcessor.get_last_error(), null); +selectMenuitemProcessor.destroy(); + const listBoundaryProcessor = WP_HTML_Processor.create_fragment("

  • "); assert.equal(listBoundaryProcessor.next_tag({ breadcrumbs: ["LI"], match_offset: 3 }), true); assert.deepEqual(listBoundaryProcessor.get_breadcrumbs(), ["HTML", "BODY", "LI", "BLOCKQUOTE", "LI"]); diff --git a/ext/html-api-rust/wasm/wp-html-api-rust.js b/ext/html-api-rust/wasm/wp-html-api-rust.js index b00f31cd74115..011db3b61299e 100644 --- a/ext/html-api-rust/wasm/wp-html-api-rust.js +++ b/ext/html-api-rust/wasm/wp-html-api-rust.js @@ -157,6 +157,9 @@ const ADOPTION_AGENCY_END_TAGS = new Set([ ...FORMATTING_ELEMENTS, "NOBR", ]); +const ACTIVE_FORMATTING_RECONSTRUCTING_START_TAGS = new Set([ + "MENUITEM", +]); const TABLE_SECTION_ELEMENTS = new Set(["TBODY", "TFOOT", "THEAD"]); const TABLE_TEXT_CURRENT_NODE_ELEMENTS = new Set([ "TABLE", @@ -1992,6 +1995,17 @@ export function createHtmlApi(wasm) { return; } + if ( + this.current_namespace === "html" && + tagName === "MENUITEM" && + this.#hasOpenHtmlElement("SELECT") + ) { + this.current_token_namespace = this.current_namespace; + this.breadcrumbs = [...this.open_elements]; + this.skip_current_token = true; + return; + } + if ( this.is_full_parser && this.encoding_confidence === "tentative" && @@ -2013,7 +2027,10 @@ export function createHtmlApi(wasm) { this.#applySimpleHtmlSemanticClosures(tagName); if ( allowVirtualPreclosures && - FORMATTING_ELEMENTS.has(tagName) && + ( + FORMATTING_ELEMENTS.has(tagName) || + ACTIVE_FORMATTING_RECONSTRUCTING_START_TAGS.has(tagName) + ) && this.#queueReconstructActiveFormattingElements() ) { this.pending_real_token = true; From 547ca06d3db37eb02f534a683dc98fa4619130aa Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:52:19 +0200 Subject: [PATCH 105/486] Guard WASM colgroup foster parenting --- ext/html-api-rust/wasm/smoke-test.mjs | 8 ++++++++ ext/html-api-rust/wasm/wp-html-api-rust.js | 1 + 2 files changed, 9 insertions(+) diff --git a/ext/html-api-rust/wasm/smoke-test.mjs b/ext/html-api-rust/wasm/smoke-test.mjs index d0654aa3f085f..079a3d496be60 100644 
--- a/ext/html-api-rust/wasm/smoke-test.mjs +++ b/ext/html-api-rust/wasm/smoke-test.mjs @@ -937,6 +937,14 @@ assert.equal(tableTextProcessor.get_unsupported_exception().message, "Foster par tableTextProcessor.destroy(); assert.equal(WP_HTML_Processor.normalize("text
    cell"), null); +const colgroupTextProcessor = WP_HTML_Processor.create_fragment("foo
    "); +while (colgroupTextProcessor.next_token()) { +} +assert.equal(colgroupTextProcessor.get_last_error(), WP_HTML_Processor.ERROR_UNSUPPORTED); +assert.equal(colgroupTextProcessor.get_unsupported_exception().message, "Foster parenting is not supported."); +colgroupTextProcessor.destroy(); +assert.equal(WP_HTML_Processor.normalize("foo
    "), null); + const tableWhitespaceProcessor = WP_HTML_Processor.create_fragment(" \n
    cell"); assert.equal(tableWhitespaceProcessor.next_token(), true); assert.equal(tableWhitespaceProcessor.get_tag(), "TABLE"); diff --git a/ext/html-api-rust/wasm/wp-html-api-rust.js b/ext/html-api-rust/wasm/wp-html-api-rust.js index 011db3b61299e..6b4d501ae188c 100644 --- a/ext/html-api-rust/wasm/wp-html-api-rust.js +++ b/ext/html-api-rust/wasm/wp-html-api-rust.js @@ -162,6 +162,7 @@ const ACTIVE_FORMATTING_RECONSTRUCTING_START_TAGS = new Set([ ]); const TABLE_SECTION_ELEMENTS = new Set(["TBODY", "TFOOT", "THEAD"]); const TABLE_TEXT_CURRENT_NODE_ELEMENTS = new Set([ + "COLGROUP", "TABLE", "TBODY", "TEMPLATE", From 1ce350c31336db2366d51288c31928eb6f4b3380 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 11 Jun 2026 18:55:15 +0200 Subject: [PATCH 106/486] Handle WASM head noscript mode --- ext/html-api-rust/wasm/smoke-test.mjs | 20 +++++++++ ext/html-api-rust/wasm/wp-html-api-rust.js | 51 ++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/ext/html-api-rust/wasm/smoke-test.mjs b/ext/html-api-rust/wasm/smoke-test.mjs index 079a3d496be60..485c1d53c4bb5 100644 --- a/ext/html-api-rust/wasm/smoke-test.mjs +++ b/ext/html-api-rust/wasm/smoke-test.mjs @@ -450,6 +450,26 @@ assert.equal(fullParserDoctype.get_tag(), "P"); assert.deepEqual(fullParserDoctype.get_breadcrumbs(), ["HTML", "BODY", "P"]); fullParserDoctype.destroy(); +const fullParserHeadNoscriptBreakout = WP_HTML_Processor.create_full_parser(""); +assert.equal(fullParserHeadNoscriptBreakout.next_tag("noscript"), true); +assert.deepEqual(fullParserHeadNoscriptBreakout.get_breadcrumbs(), ["HTML", "HEAD", "NOSCRIPT"]); +assert.equal(fullParserHeadNoscriptBreakout.next_tag("br"), true); +assert.deepEqual(fullParserHeadNoscriptBreakout.get_breadcrumbs(), ["HTML", "BODY", "BR"]); +assert.equal(fullParserHeadNoscriptBreakout.next_token(), true); +assert.equal(fullParserHeadNoscriptBreakout.get_token_type(), "#comment"); +assert.deepEqual(fullParserHeadNoscriptBreakout.get_breadcrumbs(), ["HTML", 
"BODY", "#comment"]); +fullParserHeadNoscriptBreakout.destroy(); + +const fullParserNestedHeadNoscript = WP_HTML_Processor.create_full_parser('