diff --git a/quotequail/_html.py b/quotequail/_html.py index 3bcb0cd..e58c8ae 100644 --- a/quotequail/_html.py +++ b/quotequail/_html.py @@ -1,4 +1,5 @@ # HTML utils +import contextlib import html from collections.abc import Iterator from typing import TYPE_CHECKING, TypeAlias @@ -243,15 +244,18 @@ def get_html_tree(html_str: str) -> Element: el.attrib["__tag_name"] = f"{prefix}:{el.tag}" el.tag = "span" - elif ":" in el.tag: + elif ":" in el.tag and not any(c in el.tag for c in ('"', "=", " ")): # Outlook padding: same treatment, round-tripped. + # Only applies to genuine namespace tags (e.g. o:p, v:shape). + # Tags that contain '"', '=', or spaces alongside ':' are garbage + # from malformed HTML (e.g. ) and must be + # flattened instead, since restoring them would raise ValueError. el.attrib["__tag_name"] = el.tag el.tag = "span" - elif "@" in el.tag or "=" in el.tag: - # Mail client forgot to escape or used other - # XPath-special chars (like =) in tag names. Flatten back to - # visible text so the address actually renders. + elif ":" in el.tag or "@" in el.tag or "=" in el.tag: + # Malformed tag whose name contains XPath-special or otherwise + # invalid characters. Flatten back to visible text. attrs = "".join( f' {k}="{html.escape(v, quote=True)}"' for k, v in el.attrib.items() @@ -283,7 +287,11 @@ def render_html_tree(tree: Element) -> str: for el in tree.iter(): if "__tag_name" in el.attrib: actual_tag_name = el.attrib.pop("__tag_name") - el.tag = actual_tag_name + # If lxml rejects restoring the tag name (malformed input), + # leave the element as a span, __tag_name is already + # popped so it won't appear in the output. 
+ with contextlib.suppress(ValueError): + el.tag = actual_tag_name html_str = lxml.html.tostring(tree, encoding="utf8").decode("utf8") diff --git a/tests/test_html.py b/tests/test_html.py index e084d62..ed0e21c 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -1,3 +1,6 @@ +import lxml.html +import pytest + from quotequail._html import ( Position, get_html_tree, @@ -106,9 +109,48 @@ def test_trim_before(): assert render_html_tree(tree) == "
E
" -def test_get_html_tree_flattens_at_pseudo_tag_with_attributes(): - # Unescaped pseudo-tags must round-trip as - # visible text without losing attribute values. - html = '
xyz
' - rendered = render_html_tree(get_html_tree(html)) - assert rendered == '
x<addr@domain foo="bar">yz
' +@pytest.mark.parametrize( + ("html", "expected"), + [ + # '@' in tag name — unescaped email-style pseudo-tag + ( + '
xyz
', + '
x<addr@domain foo="bar">yz
', + ), + # ':' and '"' in tag name — lxml reads the tag name as ahref="https: + ( + '
xclickz
', + '
x<ahref="https: example.com"="">clickz
', + ), + # ':' and '=' in tag name — e.g. <a:b=c> + ( + "
xclickz
", + "
x<a:b=c>clickz
", + ), + ], +) +def test_get_html_tree_flattens_malformed_tags(html, expected): + # Tags whose names contain XPath-special or invalid characters + # must be rendered as escaped visible text rather than roundtripped as real + # tags,which would raise ValueError in lxml + assert render_html_tree(get_html_tree(html)) == expected + + +def test_get_html_tree_outlook_tag_roundtrip(): + # Outlook uses for paragraph padding. The tag must survive the + # get_html_tree → render_html_tree roundtrip unchanged. + html = "
foobar
" + assert ( + render_html_tree(get_html_tree(html)) == "
foobar
" + ) + + +def test_render_html_tree_suppresses_space_in_stored_tag_name(): + # Verify that if a tag name containing ':' and ' ' somehow ends up in + # __tag_name, render_html_tree must not raise ValueError. + tree = lxml.html.fragment_fromstring("
text
") + span = tree.find("span") + span.attrib["__tag_name"] = "o:p style" + result = render_html_tree(tree) + assert "text" in result + assert "__tag_name" not in result