pseudo-tags must round-trip as
- # visible text without losing attribute values.
- html = ''
- rendered = render_html_tree(get_html_tree(html))
- assert rendered == 'x<addr@domain foo="bar">yz
'
+@pytest.mark.parametrize(
+ ("html", "expected"),
+ [
+ # '@' in tag name — unescaped email-style pseudo-tag
+ (
+ '',
+ 'x<addr@domain foo="bar">yz
',
+ ),
+ # ':' and '"' in tag name — lxml parses this way
+ (
+ '',
+ 'x<ahref="https: example.com"="">clickz
',
+ ),
+ # ':' and '=' in tag name — e.g. a tag named a:b=c
+ (
+ "",
+ "x<a:b=c>clickz
",
+ ),
+ ],
+)
+def test_get_html_tree_flattens_malformed_tags(html, expected):
+ # Tags whose names contain XPath-special or invalid characters
+ # must be rendered as escaped visible text rather than roundtripped as real
+ # tags, which would raise ValueError in lxml.
+ assert render_html_tree(get_html_tree(html)) == expected
+
+
+def test_get_html_tree_outlook_tag_roundtrip():
+ # Outlook uses a special tag (presumably <o:p>; the tag name is missing from this comment — TODO confirm) for paragraph padding. The tag must survive the
+ # get_html_tree → render_html_tree roundtrip unchanged.
+ html = "foobar
"
+ assert (
+ render_html_tree(get_html_tree(html)) == "foobar
"
+ )
+
+
+def test_render_html_tree_suppresses_space_in_stored_tag_name():
+ # If a tag name containing ':' and ' ' (here "o:p style") somehow ends up in the
+ # __tag_name attribute, render_html_tree must not raise ValueError, and __tag_name must not appear in its output.
+ tree = lxml.html.fragment_fromstring("text
")
+ span = tree.find("span")
+ span.attrib["__tag_name"] = "o:p style"
+ result = render_html_tree(tree)
+ assert "text" in result
+ assert "__tag_name" not in result