diff --git a/NEWS.md b/NEWS.md index b3a25b5..d159410 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # xml2 1.5.2 +* `read_html()` now defaults to `encoding = "UTF-8"` to prevent double-encoding + of UTF-8 content on Windows with codepage 65001 (#490). + * Enable the myExternalEntityLoader also on libxml 2.14.4 for MacOS # xml2 1.5.1 diff --git a/R/xml_parse.R b/R/xml_parse.R index 01910d3..00169c3 100644 --- a/R/xml_parse.R +++ b/R/xml_parse.R @@ -30,7 +30,9 @@ #' @param encoding Specify a default encoding for the document. Unless #' otherwise specified XML documents are assumed to be in UTF-8 or #' UTF-16. If the document is not UTF-8/16, and lacks an explicit -#' encoding directive, this allows you to supply a default. +#' encoding directive, this allows you to supply a default. For +#' `read_html()` the default is `"UTF-8"` since the vast majority of +#' modern web pages are UTF-8 encoded. #' @param ... Additional arguments passed on to methods. #' @param as_html Optionally parse an xml file as if it's html. #' @param base_url When loading from a connection, raw vector or literal @@ -74,7 +76,7 @@ read_xml <- function( #' @rdname read_xml read_html <- function( x, - encoding = "", + encoding = "UTF-8", ..., options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE") ) { @@ -84,7 +86,7 @@ read_html <- function( #' @export read_html.default <- function( x, - encoding = "", + encoding = "UTF-8", ..., options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE") ) { @@ -102,7 +104,7 @@ read_html.default <- function( #' @export read_html.response <- function( x, - encoding = "", + encoding = "UTF-8", options = c("RECOVER", "NOERROR", "NOBLANKS"), ... ) { diff --git a/tests/testthat/test-xml_parse.R b/tests/testthat/test-xml_parse.R index e226bf1..57cc29f 100644 --- a/tests/testthat/test-xml_parse.R +++ b/tests/testthat/test-xml_parse.R @@ -115,6 +115,14 @@ test_that("read_html works with non-ASCII encodings", { ) }) +test_that("read_html defaults to UTF-8 encoding for raw bytes", { + # Regression test: UTF-8 HTML passed as raw bytes should not be double-encoded + # (previously broken on Windows codepage 65001 when encoding defaulted to "") + html_utf8 <- charToRaw("\u00c4pfel") + res <- read_html(html_utf8) + expect_equal(xml_text(xml_find_first(res, "//body")), "\u00c4pfel") +}) + test_that("read_xml and read_html fail with > 1 input", { expect_snapshot(error = TRUE, { read_xml(c("foo", "bar"))