From dd18b993daa5ecd1549a3e6e9f6307091b7b32c0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 2 Jun 2026 16:44:12 +0000 Subject: [PATCH 1/3] Initial plan From 81c4ffbec61ee1af0665627ff5aeab20b0169f04 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 2 Jun 2026 16:48:11 +0000 Subject: [PATCH 2/3] fix: default read_html() encoding to UTF-8 to prevent double-encoding on Windows --- NEWS.md | 3 +++ R/xml_parse.R | 10 ++++++---- tests/testthat/test-xml_parse.R | 9 +++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/NEWS.md b/NEWS.md index b3a25b5..d159410 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # xml2 1.5.2 +* `read_html()` now defaults to `encoding = "UTF-8"` to prevent double-encoding + of UTF-8 content on Windows with codepage 65001 (#490). + * Enable the myExternalEntityLoader also on libxml 2.14.4 for MacOS # xml2 1.5.1 diff --git a/R/xml_parse.R b/R/xml_parse.R index 01910d3..00169c3 100644 --- a/R/xml_parse.R +++ b/R/xml_parse.R @@ -30,7 +30,9 @@ #' @param encoding Specify a default encoding for the document. Unless #' otherwise specified XML documents are assumed to be in UTF-8 or #' UTF-16. If the document is not UTF-8/16, and lacks an explicit -#' encoding directive, this allows you to supply a default. +#' encoding directive, this allows you to supply a default. For +#' `read_html()` the default is `"UTF-8"` since the vast majority of +#' modern web pages are UTF-8 encoded. #' @param ... Additional arguments passed on to methods. #' @param as_html Optionally parse an xml file as if it's html. #' @param base_url When loading from a connection, raw vector or literal @@ -74,7 +76,7 @@ read_xml <- function( #' @rdname read_xml read_html <- function( x, - encoding = "", + encoding = "UTF-8", ..., options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE") ) { @@ -84,7 +86,7 @@ read_html <- function( #' @export read_html.default <- function( x, - encoding = "", + encoding = "UTF-8", ..., options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE") ) { @@ -102,7 +104,7 @@ read_html.default <- function( #' @export read_html.response <- function( x, - encoding = "", + encoding = "UTF-8", options = c("RECOVER", "NOERROR", "NOBLANKS"), ... ) { diff --git a/tests/testthat/test-xml_parse.R b/tests/testthat/test-xml_parse.R index e226bf1..29feae9 100644 --- a/tests/testthat/test-xml_parse.R +++ b/tests/testthat/test-xml_parse.R @@ -115,6 +115,15 @@ test_that("read_html works with non-ASCII encodings", { ) }) +test_that("read_html defaults to UTF-8 encoding for raw bytes", { + # Regression test: UTF-8 HTML passed as raw bytes should not be double-encoded + # (previously broken on Windows codepage 65001 when encoding defaulted to "") + html_utf8 <- charToRaw("\u00c4pfel") + res <- read_html(html_utf8) + expect_equal(xml_text(xml_find_first(res, "//body")), "\u00c4pfel") +}) + + test_that("read_xml and read_html fail with > 1 input", { expect_snapshot(error = TRUE, { read_xml(c("foo", "bar")) From b945ab3bcbf1c8cb3914724b6934666391d81330 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 2 Jun 2026 16:48:44 +0000 Subject: [PATCH 3/3] style: remove extra blank line in test file --- tests/testthat/test-xml_parse.R | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/testthat/test-xml_parse.R b/tests/testthat/test-xml_parse.R index 29feae9..57cc29f 100644 --- a/tests/testthat/test-xml_parse.R +++ b/tests/testthat/test-xml_parse.R @@ -123,7 +123,6 @@ test_that("read_html defaults to UTF-8 encoding for raw bytes", { expect_equal(xml_text(xml_find_first(res, "//body")), "\u00c4pfel") }) - test_that("read_xml and read_html fail with > 1 input", { expect_snapshot(error = TRUE, { read_xml(c("foo", "bar"))