Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# xml2 1.5.2

* `read_html()` now defaults to `encoding = "UTF-8"` to prevent double-encoding
of UTF-8 content on Windows with codepage 65001 (#490).

* Enable `myExternalEntityLoader` on libxml2 2.14.4 for macOS as well.

# xml2 1.5.1
Expand Down
10 changes: 6 additions & 4 deletions R/xml_parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@
#' @param encoding Specify a default encoding for the document. Unless
#' otherwise specified XML documents are assumed to be in UTF-8 or
#' UTF-16. If the document is not UTF-8/16, and lacks an explicit
#' encoding directive, this allows you to supply a default.
#' encoding directive, this allows you to supply a default. For
#' `read_html()` the default is `"UTF-8"` since the vast majority of
#' modern web pages are UTF-8 encoded.
#' @param ... Additional arguments passed on to methods.
#' @param as_html Optionally parse an xml file as if it's html.
#' @param base_url When loading from a connection, raw vector or literal
Expand Down Expand Up @@ -74,7 +76,7 @@ read_xml <- function(
#' @rdname read_xml
read_html <- function(
x,
encoding = "",
encoding = "UTF-8",
...,
options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE")
) {
Expand All @@ -84,7 +86,7 @@ read_html <- function(
#' @export
read_html.default <- function(
x,
encoding = "",
encoding = "UTF-8",
...,
options = c("RECOVER", "NOERROR", "NOBLANKS", "HUGE")
) {
Expand All @@ -102,7 +104,7 @@ read_html.default <- function(
#' @export
read_html.response <- function(
x,
encoding = "",
encoding = "UTF-8",
options = c("RECOVER", "NOERROR", "NOBLANKS"),
...
) {
Expand Down
8 changes: 8 additions & 0 deletions tests/testthat/test-xml_parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,14 @@ test_that("read_html works with non-ASCII encodings", {
)
})

test_that("read_html defaults to UTF-8 encoding for raw bytes", {
  # Regression guard: when UTF-8 encoded HTML is supplied as a raw vector and
  # no `encoding` is given, the text must come back unchanged rather than
  # double-encoded (this used to fail on Windows with codepage 65001 while the
  # default encoding was "").
  expected_text <- "\u00c4pfel"
  raw_doc <- charToRaw("<html><body>\u00c4pfel</body></html>")

  # Parse with every argument left at its default, then inspect <body>.
  body_node <- xml_find_first(read_html(raw_doc), "//body")

  expect_equal(xml_text(body_node), expected_text)
})

test_that("read_xml and read_html fail with > 1 input", {
expect_snapshot(error = TRUE, {
read_xml(c("foo", "bar"))
Expand Down
Loading