From dd18b993daa5ecd1549a3e6e9f6307091b7b32c0 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 2 Jun 2026 16:44:12 +0000
Subject: [PATCH 1/3] Initial plan


From 81c4ffbec61ee1af0665627ff5aeab20b0169f04 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 2 Jun 2026 16:48:11 +0000
Subject: [PATCH 2/3] fix: default read_html() encoding to UTF-8 to prevent
 double-encoding on Windows

---
 NEWS.md                         |  3 +++
 R/xml_parse.R                   | 10 ++++++----
 tests/testthat/test-xml_parse.R |  9 +++++++++
 3 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/NEWS.md b/NEWS.md
index b3a25b5..d159410 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,8 @@
 # xml2 1.5.2
 
+* `read_html()` now defaults to `encoding = "UTF-8"` to prevent double-encoding
+  of UTF-8 content on Windows with codepage 65001 (#490).
+
 * Enable the myExternalEntityLoader also on libxml 2.14.4 for MacOS
 
 # xml2 1.5.1
diff --git a/R/xml_parse.R b/R/xml_parse.R
index 01910d3..00169c3 100644
--- a/R/xml_parse.R
+++ b/R/xml_parse.R
@@ -30,7 +30,9 @@
 #' @param encoding Specify a default encoding for the document. Unless
 #'   otherwise specified XML documents are assumed to be in UTF-8 or
 #'   UTF-16. If the document is not UTF-8/16, and lacks an explicit
-#'   encoding directive, this allows you to supply a default.
+#'   encoding directive, this allows you to supply a default. For
+#'   `read_html()` the default is `"UTF-8"`, as most modern web pages use it;
+#'   pass `encoding` explicitly for documents in any other encoding.
 #' @param ... Additional arguments passed on to methods.
 #' @param as_html Optionally parse an xml file as if it's html.
 #' @param base_url When loading from a connection, raw vector or literal
@@ -74,7 +76,7 @@ read_xml <- function(
 #' @rdname read_xml
 read_html <- function(
   x,
-  encoding = "",
+  encoding = "UTF-8",
   ...,
   options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE")
 ) {
@@ -84,7 +86,7 @@ read_html <- function(
 #' @export
 read_html.default <- function(
   x,
-  encoding = "",
+  encoding = "UTF-8",
   ...,
   options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE")
 ) {
@@ -102,7 +104,7 @@ read_html.default <- function(
 #' @export
 read_html.response <- function(
   x,
-  encoding = "",
+  encoding = "UTF-8",
   options = c("RECOVER", "NOERROR", "NOBLANKS"),
   ...
 ) {
diff --git a/tests/testthat/test-xml_parse.R b/tests/testthat/test-xml_parse.R
index e226bf1..29feae9 100644
--- a/tests/testthat/test-xml_parse.R
+++ b/tests/testthat/test-xml_parse.R
@@ -115,6 +115,15 @@ test_that("read_html works with non-ASCII encodings", {
   )
 })
 
+test_that("read_html defaults to UTF-8 encoding for raw bytes", {
+  # Regression test: UTF-8 HTML passed as raw bytes should not be double-encoded
+  # (previously broken on Windows codepage 65001 when encoding defaulted to "")
+  html_utf8 <- charToRaw("<html><body>\u00c4pfel</body></html>")
+  res <- read_html(html_utf8)
+  expect_equal(xml_text(xml_find_first(res, "//body")), "\u00c4pfel")
+})
+
+
 test_that("read_xml and read_html fail with > 1 input", {
   expect_snapshot(error = TRUE, {
     read_xml(c("foo", "bar"))

From b945ab3bcbf1c8cb3914724b6934666391d81330 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 2 Jun 2026 16:48:44 +0000
Subject: [PATCH 3/3] style: remove extra blank line in test file

---
 tests/testthat/test-xml_parse.R | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/testthat/test-xml_parse.R b/tests/testthat/test-xml_parse.R
index 29feae9..57cc29f 100644
--- a/tests/testthat/test-xml_parse.R
+++ b/tests/testthat/test-xml_parse.R
@@ -123,7 +123,6 @@ test_that("read_html defaults to UTF-8 encoding for raw bytes", {
   expect_equal(xml_text(xml_find_first(res, "//body")), "\u00c4pfel")
 })
 
-
 test_that("read_xml and read_html fail with > 1 input", {
   expect_snapshot(error = TRUE, {
     read_xml(c("foo", "bar"))