Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions quotequail/_html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# HTML utils
import contextlib
import html
from collections.abc import Iterator
from typing import TYPE_CHECKING, TypeAlias
Expand Down Expand Up @@ -243,15 +244,18 @@ def get_html_tree(html_str: str) -> Element:
el.attrib["__tag_name"] = f"{prefix}:{el.tag}"
el.tag = "span"

elif ":" in el.tag:
elif ":" in el.tag and not any(c in el.tag for c in ('"', "=", " ")):
# Outlook <o:p> padding: same treatment, round-tripped.
# Only applies to genuine namespace tags (e.g. o:p, v:shape).
# Tags that contain '"', '=', or spaces alongside ':' are garbage
# from malformed HTML (e.g. <ahref="https:...>) and must be
# flattened instead, since restoring them would raise ValueError.
el.attrib["__tag_name"] = el.tag
el.tag = "span"

elif "@" in el.tag or "=" in el.tag:
# Mail client forgot to escape <addr@domain> or used other
# XPath-special chars (like =) in tag names. Flatten back to
# visible text so the address actually renders.
elif ":" in el.tag or "@" in el.tag or "=" in el.tag:
# Malformed tag whose name contains XPath-special or otherwise
# invalid characters. Flatten back to visible text.
attrs = "".join(
f' {k}="{html.escape(v, quote=True)}"'
for k, v in el.attrib.items()
Expand Down Expand Up @@ -283,7 +287,11 @@ def render_html_tree(tree: Element) -> str:
for el in tree.iter():
if "__tag_name" in el.attrib:
actual_tag_name = el.attrib.pop("__tag_name")
el.tag = actual_tag_name
# If lxml rejects restoring the tag name (malformed input),
# leave the element as a span; __tag_name has already been
# popped, so it won't appear in the output.
with contextlib.suppress(ValueError):
el.tag = actual_tag_name

html_str = lxml.html.tostring(tree, encoding="utf8").decode("utf8")

Expand Down
54 changes: 48 additions & 6 deletions tests/test_html.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import lxml.html
import pytest

from quotequail._html import (
Position,
get_html_tree,
Expand Down Expand Up @@ -106,9 +109,48 @@ def test_trim_before():
assert render_html_tree(tree) == "<div>E</div>"


def test_get_html_tree_flattens_at_pseudo_tag_with_attributes():
    """An unescaped ``<addr@domain attr="...">`` pseudo-tag renders as text.

    After a ``get_html_tree`` / ``render_html_tree`` round trip the
    pseudo-tag must come back as escaped visible text with its attribute
    values intact.
    """
    source = '<div>x<addr@domain foo="bar">y</addr@domain>z</div>'
    expected = '<div>x&lt;addr@domain foo="bar"&gt;yz</div>'

    tree = get_html_tree(source)

    assert render_html_tree(tree) == expected
@pytest.mark.parametrize(
    ("html", "expected"),
    [
        # '@' in the tag name: an unescaped, email-style pseudo-tag.
        (
            '<div>x<addr@domain foo="bar">y</addr@domain>z</div>',
            '<div>x&lt;addr@domain foo="bar"&gt;yz</div>',
        ),
        # ':' and '"' in the tag name: this is how lxml parses
        # <ahref="https://...">.
        (
            '<div>x<ahref="https://example.com">click</ahref>z</div>',
            '<div>x&lt;ahref="https: example.com"=""&gt;clickz</div>',
        ),
        # ':' and '=' in the tag name, e.g. <a:b=c>.
        (
            "<div>x<a:b=c>click</a:b>z</div>",
            "<div>x&lt;a:b=c&gt;clickz</div>",
        ),
    ],
)
def test_get_html_tree_flattens_malformed_tags(html, expected):
    """Malformed tags come back as escaped visible text.

    Tags whose names contain XPath-special or otherwise invalid characters
    must be rendered as escaped text rather than round-tripped as real
    tags, which would raise ``ValueError`` in lxml.
    """
    tree = get_html_tree(html)
    rendered = render_html_tree(tree)

    assert rendered == expected


def test_get_html_tree_outlook_tag_roundtrip():
    """Outlook's ``<o:p>`` padding tag survives the round trip unchanged.

    Parsing with ``get_html_tree`` and rendering with ``render_html_tree``
    must reproduce the input markup exactly.
    """
    source = "<div>foo<o:p></o:p>bar</div>"

    rendered = render_html_tree(get_html_tree(source))

    assert rendered == "<div>foo<o:p></o:p>bar</div>"


def test_render_html_tree_suppresses_space_in_stored_tag_name():
    """``render_html_tree`` must not raise on an unrestorable stored tag name.

    A ``__tag_name`` value containing both ':' and ' ' is planted directly on
    a parsed element; rendering must still succeed, keep the element's text,
    and not leak the ``__tag_name`` attribute into the output.
    """
    root = lxml.html.fragment_fromstring("<div><span>text</span></div>")
    root.find("span").attrib["__tag_name"] = "o:p style"

    rendered = render_html_tree(root)

    assert "text" in rendered
    assert "__tag_name" not in rendered
Loading