diff --git a/NEWS.md b/NEWS.md index d159410..d825681 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # xml2 1.5.2 +* `xml_path()` is faster for large node sets with shared ancestor paths, and + `xml_find_all()` materializes large node-set results more efficiently. + * `read_html()` now defaults to `encoding = "UTF-8"` to prevent double-encoding of UTF-8 content on Windows with codepage 65001 (#490). diff --git a/R/xml_find.R b/R/xml_find.R index 50d4dc6..e0c5af7 100644 --- a/R/xml_find.R +++ b/R/xml_find.R @@ -86,7 +86,7 @@ xml_find_all.xml_missing <- function(x, xpath, ns = xml_ns(x), ...) { #' @export xml_find_all.xml_node <- function(x, xpath, ns = xml_ns(x), ...) { nodes <- .Call(xpath_search, x$node, x$doc, xpath, ns, Inf) - xml_nodeset(nodes) + xml_nodeset(nodes, deduplicate = FALSE) } #' @param flatten A logical indicating whether to return a single, flattened diff --git a/src/xml2_node.cpp b/src/xml2_node.cpp index 6fcf59f..4680acf 100644 --- a/src/xml2_node.cpp +++ b/src/xml2_node.cpp @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include "xml2_types.h" #include "xml2_utils.h" @@ -777,6 +779,227 @@ SEXP node_path_impl(SEXP x) { return out; } +static bool path_same_namespace_name(const xmlNode* cur, const xmlNode* other, bool generic) { + if (other->type != XML_ELEMENT_NODE) { + return false; + } + + if (generic) { + return true; + } + + if (!xmlStrEqual(cur->name, other->name)) { + return false; + } + + return (other->ns == cur->ns) || + (other->ns != NULL && cur->ns != NULL && + xmlStrEqual(other->ns->prefix, cur->ns->prefix)); +} + +static int path_node_occurrence(const xmlNode* cur) { + const xmlNode* tmp; + int occur = 0; + + switch (cur->type) { + case XML_ELEMENT_NODE: { + bool generic = cur->ns && cur->ns->prefix == NULL; + + for (tmp = cur->prev; tmp != NULL; tmp = tmp->prev) { + if (path_same_namespace_name(cur, tmp, generic)) { + occur++; + } + } + + if (occur == 0) { + for (tmp = cur->next; tmp != NULL && occur == 0; tmp = tmp->next) { + if (path_same_namespace_name(cur, tmp, generic)) { + occur++; + } + } + if (occur != 0) { + occur = 1; + } + } else { + occur++; + } + break; + } + case XML_COMMENT_NODE: + for (tmp = cur->prev; tmp != NULL; tmp = tmp->prev) { + if (tmp->type == XML_COMMENT_NODE) { + occur++; + } + } + + if (occur == 0) { + for (tmp = cur->next; tmp != NULL && occur == 0; tmp = tmp->next) { + if (tmp->type == XML_COMMENT_NODE) { + occur++; + } + } + if (occur != 0) { + occur = 1; + } + } else { + occur++; + } + break; + case XML_TEXT_NODE: + case XML_CDATA_SECTION_NODE: + for (tmp = cur->prev; tmp != NULL; tmp = tmp->prev) { + if (tmp->type == XML_TEXT_NODE || tmp->type == XML_CDATA_SECTION_NODE) { + occur++; + } + } + + if (occur == 0) { + for (tmp = cur->next; tmp != NULL; tmp = tmp->next) { + if (tmp->type == XML_TEXT_NODE || tmp->type == XML_CDATA_SECTION_NODE) { + occur = 1; + break; + } + } + } else { + occur++; + } + break; + case XML_PI_NODE: + for (tmp = cur->prev; tmp != NULL; tmp = tmp->prev) { + if (tmp->type == XML_PI_NODE && xmlStrEqual(cur->name, tmp->name)) { + occur++; + } + } + + if (occur == 0) { + for (tmp = cur->next; tmp != NULL && occur == 0; tmp = tmp->next) { + if (tmp->type == XML_PI_NODE && xmlStrEqual(cur->name, tmp->name)) { + occur++; + } + } + if (occur != 0) { + occur = 1; + } + } else { + occur++; + } + break; + default: + break; + } + + return occur; +} + +static bool path_node_segment(const xmlNode* cur, std::string* segment) { + switch (cur->type) { + case XML_ELEMENT_NODE: + *segment += "/"; + if (cur->ns) { + if (cur->ns->prefix != NULL) { + *segment += Xml2String(cur->ns->prefix).asStdString(); + *segment += ":"; + *segment += Xml2String(cur->name).asStdString(); + } else { + *segment += "*"; + } + } else { + *segment += Xml2String(cur->name).asStdString(); + } + break; + case XML_COMMENT_NODE: + *segment += "/comment()"; + break; + case XML_TEXT_NODE: + case XML_CDATA_SECTION_NODE: + *segment += "/text()"; + break; + case XML_PI_NODE: + *segment += "/processing-instruction('"; + *segment += Xml2String(cur->name).asStdString(); + *segment += "')"; + break; + case XML_ATTRIBUTE_NODE: + *segment += "/@"; + if (cur->ns && cur->ns->prefix != NULL) { + *segment += Xml2String(cur->ns->prefix).asStdString(); + *segment += ":"; + } + *segment += Xml2String(cur->name).asStdString(); + break; + default: + return false; + } + + int occur = path_node_occurrence(cur); + if (occur > 0) { + char tmpbuf[30]; + snprintf(tmpbuf, sizeof(tmpbuf), "[%d]", occur); + *segment += tmpbuf; + } + + return true; +} + +typedef std::map NodePathCache; + +static bool path_cacheable_node(const xmlNode* node) { + return node->type == XML_DOCUMENT_NODE || + node->type == XML_HTML_DOCUMENT_NODE || + node->type == XML_ELEMENT_NODE; +} + +static bool node_path_cached(const xmlNode* node, NodePathCache* cache, std::string* out) { + if (node == NULL || node->type == XML_NAMESPACE_DECL) { + return false; + } + + bool cacheable = path_cacheable_node(node); + if (cacheable) { + NodePathCache::const_iterator cached = cache->find(node); + if (cached != cache->end()) { + *out = cached->second; + return true; + } + } + + if (node->type == XML_DOCUMENT_NODE || node->type == XML_HTML_DOCUMENT_NODE) { + *out = "/"; + (*cache)[node] = *out; + return true; + } + + std::string parent_path; + const xmlNode* parent = node->parent; + if (parent != NULL && + parent->type != XML_DOCUMENT_NODE && + parent->type != XML_HTML_DOCUMENT_NODE) { + if (!node_path_cached(parent, cache, &parent_path)) { + return false; + } + } + + std::string segment; + if (!path_node_segment(node, &segment)) { + return false; + } + + *out = parent_path + segment; + if (cacheable) { + (*cache)[node] = *out; + } + return true; +} + +static SEXP node_path_string(const xmlNode* node, NodePathCache* cache) { + std::string path; + if (node_path_cached(node, cache, &path)) { + return Rf_mkCharCE(path.c_str(), CE_UTF8); + } + + return Xml2String(xmlGetNodePath(node)).asRString(); +} + // [[export]] extern "C" SEXP node_path(SEXP x) { BEGIN_CPP @@ -789,13 +1012,21 @@ extern "C" SEXP node_path(SEXP x) { return Rf_ScalarString(node_path_impl(x)); break; case NodeType::nodeset: { - int n = Rf_xlength(x); + R_xlen_t n = Rf_xlength(x); SEXP out = PROTECT(Rf_allocVector(STRSXP, n)); + NodePathCache cache; - for (int i = 0; i < n; ++i) { + for (R_xlen_t i = 0; i < n; ++i) { SEXP x_i = VECTOR_ELT(x, i); - SEXP name_i = node_path_impl(x_i); + SEXP name_i; + if (getNodeType(x_i) == NodeType::missing) { + name_i = NA_STRING; + } else { + SEXP node_sxp = VECTOR_ELT(x_i, 0); + XPtrNode node(node_sxp); + name_i = node_path_string(node.checked_get(), &cache); + } SET_STRING_ELT(out, i, name_i); } diff --git a/src/xml2_xpath.cpp b/src/xml2_xpath.cpp index 004f343..9341b1f 100644 --- a/src/xml2_xpath.cpp +++ b/src/xml2_xpath.cpp @@ -62,21 +62,23 @@ class XmlSeeker { SET_STRING_ELT(names, 0, Rf_mkChar("node")); SET_STRING_ELT(names, 1, Rf_mkChar("doc")); + SEXP xml_node_class = PROTECT(Rf_mkString("xml_node")); + for (int i = 0; i < n; i++) { SEXP ret = PROTECT(Rf_allocVector(VECSXP, 2)); - SET_VECTOR_ELT(ret, 0, XPtrNode(nodes->nodeTab[i])); + SET_VECTOR_ELT(ret, 0, R_MakeExternalPtr((void*) nodes->nodeTab[i], R_NilValue, R_NilValue)); SET_VECTOR_ELT(ret, 1, doc_); Rf_setAttrib(ret, R_NamesSymbol, names); - Rf_setAttrib(ret, R_ClassSymbol, Rf_mkString("xml_node")); + Rf_setAttrib(ret, R_ClassSymbol, xml_node_class); SET_VECTOR_ELT(out, i, ret); UNPROTECT(1); } - UNPROTECT(2); + UNPROTECT(3); return out; } case XPATH_NUMBER: { return Rf_ScalarReal(result_->floatval); } diff --git a/tests/testthat/test-xml_find.R b/tests/testthat/test-xml_find.R index 2809706..f93bc5f 100644 --- a/tests/testthat/test-xml_find.R +++ b/tests/testthat/test-xml_find.R @@ -61,6 +61,21 @@ test_that("xml_find_all returns nodeset or list of nodesets based on flatten", { expect_s3_class(z[[1L]], "xml_nodeset") }) +test_that("xml_find_all on a single node returns unique xpath node sets", { + x <- read_xml("") + + nodes <- xml_find_all(x, "//* | //a | //a") + + expect_false(any(.Call(nodes_duplicated, unclass(nodes)))) + expect_identical(xml_path(nodes), c( + "/root", + "/root/a[1]", + "/root/a[2]", + "/root/b", + "/root/b/a" + )) +}) + # Find num --------------------------------------------------------------------- test_that("xml_find_num errors with non numeric results", { x <- read_xml("") diff --git a/tests/testthat/test-xml_path.R b/tests/testthat/test-xml_path.R new file mode 100644 index 0000000..e2ead32 --- /dev/null +++ b/tests/testthat/test-xml_path.R @@ -0,0 +1,18 @@ +test_that("xml_path nodeset output matches individual node paths", { + x <- read_xml( + paste0( + "", + "", + "", + "onethree", + "", + "", + "" + ) + ) + + nodes <- xml_find_all(x, "//*|//@*|//text()|//comment()|//processing-instruction()") + individual <- vapply(seq_along(nodes), function(i) xml_path(nodes[[i]]), character(1)) + + expect_identical(xml_path(nodes), individual) +})