From 6b13b62a0edfb57beac4cce8966adcca8f87ea33 Mon Sep 17 00:00:00 2001 From: Marcos Prieto Date: Tue, 2 Jun 2026 15:10:24 +0200 Subject: [PATCH 1/4] Adding failing test for invalid tags --- tests/test_html.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_html.py b/tests/test_html.py index e084d62..2f33e78 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -112,3 +112,15 @@ def test_get_html_tree_flattens_at_pseudo_tag_with_attributes(): html = '
xyz
' rendered = render_html_tree(get_html_tree(html)) assert rendered == '
x<addr@domain foo="bar">yz
' + + +def test_get_html_tree_flattens_malformed_tag_with_colon_and_equals(): + # is parsed by lxml as a tag whose name contains + # both ':' and '"'/'='. The ':' alone would trigger the Outlook span + # roundtrip, but restoring that name raises ValueError in lxml 6.x. + # It must be flattened to visible text instead. + html = '
xclickz
' + + rendered = render_html_tree(get_html_tree(html)) + + assert rendered == '
xclickz
' From 3ed2c4e554bf982e6b06b620d4711e6ecb090819 Mon Sep 17 00:00:00 2001 From: Marcos Prieto Date: Tue, 2 Jun 2026 15:12:49 +0200 Subject: [PATCH 2/4] Ignore ValueError for invalid tags --- quotequail/_html.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/quotequail/_html.py b/quotequail/_html.py index 3bcb0cd..74bf100 100644 --- a/quotequail/_html.py +++ b/quotequail/_html.py @@ -1,4 +1,5 @@ # HTML utils +import contextlib import html from collections.abc import Iterator from typing import TYPE_CHECKING, TypeAlias @@ -283,7 +284,9 @@ def render_html_tree(tree: Element) -> str: for el in tree.iter(): if "__tag_name" in el.attrib: actual_tag_name = el.attrib.pop("__tag_name") - el.tag = actual_tag_name + # Stored tag name is invalid, leave it + with contextlib.suppress(ValueError): + el.tag = actual_tag_name html_str = lxml.html.tostring(tree, encoding="utf8").decode("utf8") From ca75ec1285e1db1ecfbf1bb38a93c6e012600c8b Mon Sep 17 00:00:00 2001 From: Marcos Prieto Date: Tue, 2 Jun 2026 15:28:42 +0200 Subject: [PATCH 3/4] Add test for rendering outlook : tags and malformed ones --- tests/test_html.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/test_html.py b/tests/test_html.py index 2f33e78..3240af3 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -114,13 +114,19 @@ def test_get_html_tree_flattens_at_pseudo_tag_with_attributes(): assert rendered == '
x<addr@domain foo="bar">yz
' +def test_get_html_tree_outlook_tag_roundtrip(): + # Outlook uses for paragraph padding. The tag must survive the + # get_html_tree → render_html_tree roundtrip unchanged. + html = "
foobar
" + assert render_html_tree(get_html_tree(html)) == "
foobar
" + + def test_get_html_tree_flattens_malformed_tag_with_colon_and_equals(): # is parsed by lxml as a tag whose name contains # both ':' and '"'/'='. The ':' alone would trigger the Outlook span # roundtrip, but restoring that name raises ValueError in lxml 6.x. # It must be flattened to visible text instead. html = '
xclickz
' - - rendered = render_html_tree(get_html_tree(html)) - - assert rendered == '
xclickz
' + assert render_html_tree(get_html_tree(html)) == ( + '
x<ahref="https: example.com"="">clickz
' + ) From b7e95ba2b035831fbdb0d5b6e5cb66d960444ded Mon Sep 17 00:00:00 2001 From: Marcos Prieto Date: Wed, 3 Jun 2026 12:04:03 +0200 Subject: [PATCH 4/4] Explicitly handle more chars in tags --- quotequail/_html.py | 17 +++++++++----- tests/test_html.py | 56 ++++++++++++++++++++++++++++++++------------- 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/quotequail/_html.py b/quotequail/_html.py index 74bf100..e58c8ae 100644 --- a/quotequail/_html.py +++ b/quotequail/_html.py @@ -244,15 +244,18 @@ def get_html_tree(html_str: str) -> Element: el.attrib["__tag_name"] = f"{prefix}:{el.tag}" el.tag = "span" - elif ":" in el.tag: + elif ":" in el.tag and not any(c in el.tag for c in ('"', "=", " ")): # Outlook padding: same treatment, round-tripped. + # Only applies to genuine namespace tags (e.g. o:p, v:shape). + # Tags that contain '"', '=', or spaces alongside ':' are garbage + # from malformed HTML (e.g. <ahref="https://example.com">) and must be + # flattened instead, since restoring them would raise ValueError. el.attrib["__tag_name"] = el.tag el.tag = "span" - elif "@" in el.tag or "=" in el.tag: - # Mail client forgot to escape or used other - # XPath-special chars (like =) in tag names. Flatten back to - # visible text so the address actually renders. + elif ":" in el.tag or "@" in el.tag or "=" in el.tag: + # Malformed tag whose name contains XPath-special or otherwise + # invalid characters. Flatten back to visible text. attrs = "".join( f' {k}="{html.escape(v, quote=True)}"' for k, v in el.attrib.items() @@ -284,7 +287,9 @@ def render_html_tree(tree: Element) -> str: for el in tree.iter(): if "__tag_name" in el.attrib: actual_tag_name = el.attrib.pop("__tag_name") - # Stored tag name is invalid, leave it + # If lxml rejects restoring the tag name (malformed input), + # leave the element as a span; __tag_name is already + # popped so it won't appear in the output. 
with contextlib.suppress(ValueError): el.tag = actual_tag_name diff --git a/tests/test_html.py b/tests/test_html.py index 3240af3..ed0e21c 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -1,3 +1,6 @@ +import lxml.html +import pytest + from quotequail._html import ( Position, get_html_tree, @@ -106,27 +109,48 @@ def test_trim_before(): assert render_html_tree(tree) == "
E
" -def test_get_html_tree_flattens_at_pseudo_tag_with_attributes(): - # Unescaped pseudo-tags must round-trip as - # visible text without losing attribute values. - html = '
xyz
' - rendered = render_html_tree(get_html_tree(html)) - assert rendered == '
x<addr@domain foo="bar">yz
' +@pytest.mark.parametrize( + ("html", "expected"), + [ + # '@' in tag name — unescaped email-style pseudo-tag + ( + '
xyz
', + '
x<addr@domain foo="bar">yz
', + ), + # ':' and '"' in tag name — lxml parses <ahref="https://example.com"> this way + ( + '
xclickz
', + '
x<ahref="https: example.com"="">clickz
', + ), + # ':' and '=' in tag name — e.g. <a:b=c> + ( + "
xclickz
", + "
x<a:b=c>clickz
", + ), + ], +) +def test_get_html_tree_flattens_malformed_tags(html, expected): + # Tags whose names contain XPath-special or invalid characters + # must be rendered as escaped visible text rather than roundtripped as real + # tags, which would raise ValueError in lxml. + assert render_html_tree(get_html_tree(html)) == expected def test_get_html_tree_outlook_tag_roundtrip(): # Outlook uses for paragraph padding. The tag must survive the # get_html_tree → render_html_tree roundtrip unchanged. html = "
foobar
" - assert render_html_tree(get_html_tree(html)) == "
foobar
" + assert ( + render_html_tree(get_html_tree(html)) == "
foobar
" + ) -def test_get_html_tree_flattens_malformed_tag_with_colon_and_equals(): - # is parsed by lxml as a tag whose name contains - # both ':' and '"'/'='. The ':' alone would trigger the Outlook span - # roundtrip, but restoring that name raises ValueError in lxml 6.x. - # It must be flattened to visible text instead. - html = '
xclickz
' - assert render_html_tree(get_html_tree(html)) == ( - '
x<ahref="https: example.com"="">clickz
' - ) +def test_render_html_tree_suppresses_space_in_stored_tag_name(): + # Verify that if a tag name containing ':' and ' ' somehow ends up in + # __tag_name, render_html_tree must not raise ValueError. + tree = lxml.html.fragment_fromstring("
text
") + span = tree.find("span") + span.attrib["__tag_name"] = "o:p style" + result = render_html_tree(tree) + assert "text" in result + assert "__tag_name" not in result