Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# xml2 1.5.2

* `xml_path()` is faster for large node sets with shared ancestor paths, and
`xml_find_all()` materializes large node-set results more efficiently.

* `read_html()` now defaults to `encoding = "UTF-8"` to prevent double-encoding
of UTF-8 content on Windows with codepage 65001 (#490).

Expand Down
2 changes: 1 addition & 1 deletion R/xml_find.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ xml_find_all.xml_missing <- function(x, xpath, ns = xml_ns(x), ...) {
#' @export
xml_find_all.xml_node <- function(x, xpath, ns = xml_ns(x), ...) {
nodes <- .Call(xpath_search, x$node, x$doc, xpath, ns, Inf)
xml_nodeset(nodes)
xml_nodeset(nodes, deduplicate = FALSE)
}

#' @param flatten A logical indicating whether to return a single, flattened
Expand Down
237 changes: 234 additions & 3 deletions src/xml2_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include <vector>
#include <string>
#include <set>
#include <map>
#include <cstdio>

#include "xml2_types.h"
#include "xml2_utils.h"
Expand Down Expand Up @@ -777,6 +779,227 @@ SEXP node_path_impl(SEXP x) {
return out;
}

static bool path_same_namespace_name(const xmlNode* cur, const xmlNode* other, bool generic) {
if (other->type != XML_ELEMENT_NODE) {
return false;
}

if (generic) {
return true;
}

if (!xmlStrEqual(cur->name, other->name)) {
return false;
}

return (other->ns == cur->ns) ||
(other->ns != NULL && cur->ns != NULL &&
xmlStrEqual(other->ns->prefix, cur->ns->prefix));
}

static int path_node_occurrence(const xmlNode* cur) {
const xmlNode* tmp;
int occur = 0;

switch (cur->type) {
case XML_ELEMENT_NODE: {
bool generic = cur->ns && cur->ns->prefix == NULL;

for (tmp = cur->prev; tmp != NULL; tmp = tmp->prev) {
if (path_same_namespace_name(cur, tmp, generic)) {
occur++;
}
}

if (occur == 0) {
for (tmp = cur->next; tmp != NULL && occur == 0; tmp = tmp->next) {
if (path_same_namespace_name(cur, tmp, generic)) {
occur++;
}
}
if (occur != 0) {
occur = 1;
}
} else {
occur++;
}
break;
}
case XML_COMMENT_NODE:
for (tmp = cur->prev; tmp != NULL; tmp = tmp->prev) {
if (tmp->type == XML_COMMENT_NODE) {
occur++;
}
}

if (occur == 0) {
for (tmp = cur->next; tmp != NULL && occur == 0; tmp = tmp->next) {
if (tmp->type == XML_COMMENT_NODE) {
occur++;
}
}
if (occur != 0) {
occur = 1;
}
} else {
occur++;
}
break;
case XML_TEXT_NODE:
case XML_CDATA_SECTION_NODE:
for (tmp = cur->prev; tmp != NULL; tmp = tmp->prev) {
if (tmp->type == XML_TEXT_NODE || tmp->type == XML_CDATA_SECTION_NODE) {
occur++;
}
}

if (occur == 0) {
for (tmp = cur->next; tmp != NULL; tmp = tmp->next) {
if (tmp->type == XML_TEXT_NODE || tmp->type == XML_CDATA_SECTION_NODE) {
occur = 1;
break;
}
}
} else {
occur++;
}
break;
case XML_PI_NODE:
for (tmp = cur->prev; tmp != NULL; tmp = tmp->prev) {
if (tmp->type == XML_PI_NODE && xmlStrEqual(cur->name, tmp->name)) {
occur++;
}
}

if (occur == 0) {
for (tmp = cur->next; tmp != NULL && occur == 0; tmp = tmp->next) {
if (tmp->type == XML_PI_NODE && xmlStrEqual(cur->name, tmp->name)) {
occur++;
}
}
if (occur != 0) {
occur = 1;
}
} else {
occur++;
}
break;
default:
break;
}

return occur;
}

static bool path_node_segment(const xmlNode* cur, std::string* segment) {
switch (cur->type) {
case XML_ELEMENT_NODE:
*segment += "/";
if (cur->ns) {
if (cur->ns->prefix != NULL) {
*segment += Xml2String(cur->ns->prefix).asStdString();
*segment += ":";
*segment += Xml2String(cur->name).asStdString();
} else {
*segment += "*";
}
} else {
*segment += Xml2String(cur->name).asStdString();
}
break;
case XML_COMMENT_NODE:
*segment += "/comment()";
break;
case XML_TEXT_NODE:
case XML_CDATA_SECTION_NODE:
*segment += "/text()";
break;
case XML_PI_NODE:
*segment += "/processing-instruction('";
*segment += Xml2String(cur->name).asStdString();
*segment += "')";
break;
case XML_ATTRIBUTE_NODE:
*segment += "/@";
if (cur->ns && cur->ns->prefix != NULL) {
*segment += Xml2String(cur->ns->prefix).asStdString();
*segment += ":";
}
*segment += Xml2String(cur->name).asStdString();
break;
default:
return false;
}

int occur = path_node_occurrence(cur);
if (occur > 0) {
char tmpbuf[30];
snprintf(tmpbuf, sizeof(tmpbuf), "[%d]", occur);
*segment += tmpbuf;
}

return true;
}

typedef std::map<const xmlNode*, std::string> NodePathCache;

static bool path_cacheable_node(const xmlNode* node) {
return node->type == XML_DOCUMENT_NODE ||
node->type == XML_HTML_DOCUMENT_NODE ||
node->type == XML_ELEMENT_NODE;
}

static bool node_path_cached(const xmlNode* node, NodePathCache* cache, std::string* out) {
if (node == NULL || node->type == XML_NAMESPACE_DECL) {
return false;
}

bool cacheable = path_cacheable_node(node);
if (cacheable) {
NodePathCache::const_iterator cached = cache->find(node);
if (cached != cache->end()) {
*out = cached->second;
return true;
}
}

if (node->type == XML_DOCUMENT_NODE || node->type == XML_HTML_DOCUMENT_NODE) {
*out = "/";
(*cache)[node] = *out;
return true;
}

std::string parent_path;
const xmlNode* parent = node->parent;
if (parent != NULL &&
parent->type != XML_DOCUMENT_NODE &&
parent->type != XML_HTML_DOCUMENT_NODE) {
if (!node_path_cached(parent, cache, &parent_path)) {
return false;
}
}

std::string segment;
if (!path_node_segment(node, &segment)) {
return false;
}

*out = parent_path + segment;
if (cacheable) {
(*cache)[node] = *out;
}
return true;
}

static SEXP node_path_string(const xmlNode* node, NodePathCache* cache) {
std::string path;
if (node_path_cached(node, cache, &path)) {
return Rf_mkCharCE(path.c_str(), CE_UTF8);
}

return Xml2String(xmlGetNodePath(node)).asRString();
}

// [[export]]
extern "C" SEXP node_path(SEXP x) {
BEGIN_CPP
Expand All @@ -789,13 +1012,21 @@ extern "C" SEXP node_path(SEXP x) {
return Rf_ScalarString(node_path_impl(x));
break;
case NodeType::nodeset: {
int n = Rf_xlength(x);
R_xlen_t n = Rf_xlength(x);

SEXP out = PROTECT(Rf_allocVector(STRSXP, n));
NodePathCache cache;

for (int i = 0; i < n; ++i) {
for (R_xlen_t i = 0; i < n; ++i) {
SEXP x_i = VECTOR_ELT(x, i);
SEXP name_i = node_path_impl(x_i);
SEXP name_i;
if (getNodeType(x_i) == NodeType::missing) {
name_i = NA_STRING;
} else {
SEXP node_sxp = VECTOR_ELT(x_i, 0);
XPtrNode node(node_sxp);
name_i = node_path_string(node.checked_get(), &cache);
}
SET_STRING_ELT(out, i, name_i);
}

Expand Down
8 changes: 5 additions & 3 deletions src/xml2_xpath.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,21 +62,23 @@ class XmlSeeker {
SET_STRING_ELT(names, 0, Rf_mkChar("node"));
SET_STRING_ELT(names, 1, Rf_mkChar("doc"));

SEXP xml_node_class = PROTECT(Rf_mkString("xml_node"));

for (int i = 0; i < n; i++) {
SEXP ret = PROTECT(Rf_allocVector(VECSXP, 2));

SET_VECTOR_ELT(ret, 0, XPtrNode(nodes->nodeTab[i]));
SET_VECTOR_ELT(ret, 0, R_MakeExternalPtr((void*) nodes->nodeTab[i], R_NilValue, R_NilValue));
SET_VECTOR_ELT(ret, 1, doc_);

Rf_setAttrib(ret, R_NamesSymbol, names);
Rf_setAttrib(ret, R_ClassSymbol, Rf_mkString("xml_node"));
Rf_setAttrib(ret, R_ClassSymbol, xml_node_class);

SET_VECTOR_ELT(out, i, ret);

UNPROTECT(1);
}

UNPROTECT(2);
UNPROTECT(3);
return out;
}
case XPATH_NUMBER: { return Rf_ScalarReal(result_->floatval); }
Expand Down
15 changes: 15 additions & 0 deletions tests/testthat/test-xml_find.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,21 @@ test_that("xml_find_all returns nodeset or list of nodesets based on flatten", {
expect_s3_class(z[[1L]], "xml_nodeset")
})

test_that("xml_find_all on a single node returns unique xpath node sets", {
x <- read_xml("<root><a/><a/><b><a/></b></root>")

nodes <- xml_find_all(x, "//* | //a | //a")

expect_false(any(.Call(nodes_duplicated, unclass(nodes))))
expect_identical(xml_path(nodes), c(
"/root",
"/root/a[1]",
"/root/a[2]",
"/root/b",
"/root/b/a"
))
})

# Find num ---------------------------------------------------------------------
test_that("xml_find_num errors with non numeric results", {
x <- read_xml("<x><y/><y/></x>")
Expand Down
18 changes: 18 additions & 0 deletions tests/testthat/test-xml_path.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
test_that("xml_path nodeset output matches individual node paths", {
x <- read_xml(
paste0(
"<?pi top?>",
"<root xmlns='urn:default' xmlns:p='urn:p'>",
"<a plain='1' p:attr='2'/>",
"<a>one<![CDATA[two]]>three</a>",
"<!-- first --><!-- second -->",
"<?pi one?><?pi two?>",
"</root>"
)
)

nodes <- xml_find_all(x, "//*|//@*|//text()|//comment()|//processing-instruction()")
individual <- vapply(seq_along(nodes), function(i) xml_path(nodes[[i]]), character(1))

expect_identical(xml_path(nodes), individual)
})
Loading