From 6b13b62a0edfb57beac4cce8966adcca8f87ea33 Mon Sep 17 00:00:00 2001
From: Marcos Prieto <marcos.prieto@close.com>
Date: Tue, 2 Jun 2026 15:10:24 +0200
Subject: [PATCH 1/4] Adding failing test for invalid tags

---
 tests/test_html.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/tests/test_html.py b/tests/test_html.py
index e084d62..2f33e78 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -112,3 +112,15 @@ def test_get_html_tree_flattens_at_pseudo_tag_with_attributes():
     html = '<div>x<addr@domain foo="bar">y</addr@domain>z</div>'
     rendered = render_html_tree(get_html_tree(html))
     assert rendered == '<div>x&lt;addr@domain foo="bar"&gt;yz</div>'
+
+
+def test_get_html_tree_flattens_malformed_tag_with_colon_and_equals():
+    # <ahref="https://..."> is parsed by lxml as a tag whose name contains
+    # both ':' and '"'/'='. The ':' alone would trigger the Outlook span
+    # roundtrip, but restoring that name raises ValueError in lxml 6.x.
+    # It must be flattened to visible text instead.
+    html = '<div>x<ahref="https://example.com">click</ahref>z</div>'
+
+    rendered = render_html_tree(get_html_tree(html))
+
+    assert rendered == '<div>x<span example.com">clickz</span></div>'

From 3ed2c4e554bf982e6b06b620d4711e6ecb090819 Mon Sep 17 00:00:00 2001
From: Marcos Prieto <marcos.prieto@close.com>
Date: Tue, 2 Jun 2026 15:12:49 +0200
Subject: [PATCH 2/4] Ignore ValueError for invalid tags

---
 quotequail/_html.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/quotequail/_html.py b/quotequail/_html.py
index 3bcb0cd..74bf100 100644
--- a/quotequail/_html.py
+++ b/quotequail/_html.py
@@ -1,4 +1,5 @@
 # HTML utils
+import contextlib
 import html
 from collections.abc import Iterator
 from typing import TYPE_CHECKING, TypeAlias
@@ -283,7 +284,9 @@ def render_html_tree(tree: Element) -> str:
     for el in tree.iter():
         if "__tag_name" in el.attrib:
             actual_tag_name = el.attrib.pop("__tag_name")
-            el.tag = actual_tag_name
+            # Stored tag name is invalid, leave it
+            with contextlib.suppress(ValueError):
+                el.tag = actual_tag_name
 
     html_str = lxml.html.tostring(tree, encoding="utf8").decode("utf8")
 

From ca75ec1285e1db1ecfbf1bb38a93c6e012600c8b Mon Sep 17 00:00:00 2001
From: Marcos Prieto <marcos.prieto@close.com>
Date: Tue, 2 Jun 2026 15:28:42 +0200
Subject: [PATCH 3/4] Add test for rendering Outlook ':' tags and malformed ones

---
 tests/test_html.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tests/test_html.py b/tests/test_html.py
index 2f33e78..3240af3 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -114,13 +114,19 @@ def test_get_html_tree_flattens_at_pseudo_tag_with_attributes():
     assert rendered == '<div>x&lt;addr@domain foo="bar"&gt;yz</div>'
 
 
+def test_get_html_tree_outlook_tag_roundtrip():
+    # Outlook uses <o:p> for paragraph padding. The tag must survive the
+    # get_html_tree → render_html_tree roundtrip unchanged.
+    html = "<div>foo<o:p></o:p>bar</div>"
+    assert render_html_tree(get_html_tree(html)) == "<div>foo<o:p></o:p>bar</div>"
+
+
 def test_get_html_tree_flattens_malformed_tag_with_colon_and_equals():
     # <ahref="https://..."> is parsed by lxml as a tag whose name contains
     # both ':' and '"'/'='. The ':' alone would trigger the Outlook span
     # roundtrip, but restoring that name raises ValueError in lxml 6.x.
     # It must be flattened to visible text instead.
     html = '<div>x<ahref="https://example.com">click</ahref>z</div>'
-
-    rendered = render_html_tree(get_html_tree(html))
-
-    assert rendered == '<div>x<span example.com">clickz</span></div>'
+    assert render_html_tree(get_html_tree(html)) == (
+        '<div>x&lt;ahref="https: example.com"=""&gt;clickz</div>'
+    )

From b7e95ba2b035831fbdb0d5b6e5cb66d960444ded Mon Sep 17 00:00:00 2001
From: Marcos Prieto <marcos.prieto@close.com>
Date: Wed, 3 Jun 2026 12:04:03 +0200
Subject: [PATCH 4/4] Explicitly handle more chars in tags

---
 quotequail/_html.py | 17 +++++++++-----
 tests/test_html.py  | 56 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 51 insertions(+), 22 deletions(-)

diff --git a/quotequail/_html.py b/quotequail/_html.py
index 74bf100..e58c8ae 100644
--- a/quotequail/_html.py
+++ b/quotequail/_html.py
@@ -244,15 +244,18 @@ def get_html_tree(html_str: str) -> Element:
             el.attrib["__tag_name"] = f"{prefix}:{el.tag}"
             el.tag = "span"
 
-        elif ":" in el.tag:
+        elif ":" in el.tag and not any(c in el.tag for c in ('"', "=", " ")):
             # Outlook <o:p> padding: same treatment, round-tripped.
+            # Only applies to genuine namespace tags (e.g. o:p, v:shape).
+            # Tags that contain '"', '=', or spaces alongside ':' are garbage
+            # from malformed HTML (e.g. <ahref="https:...>) and must be
+            # flattened instead, since restoring them would raise ValueError.
             el.attrib["__tag_name"] = el.tag
             el.tag = "span"
 
-        elif "@" in el.tag or "=" in el.tag:
-            # Mail client forgot to escape <addr@domain> or used other
-            # XPath-special chars (like =) in tag names. Flatten back to
-            # visible text so the address actually renders.
+        elif ":" in el.tag or "@" in el.tag or "=" in el.tag:
+            # Malformed tag whose name contains XPath-special or otherwise
+            # invalid characters. Flatten back to visible text.
             attrs = "".join(
                 f' {k}="{html.escape(v, quote=True)}"'
                 for k, v in el.attrib.items()
@@ -284,7 +287,9 @@ def render_html_tree(tree: Element) -> str:
     for el in tree.iter():
         if "__tag_name" in el.attrib:
             actual_tag_name = el.attrib.pop("__tag_name")
-            # Stored tag name is invalid, leave it
+            # If lxml rejects restoring the tag name (malformed input),
+            # leave the element as a span; __tag_name has already been
+            # popped, so it won't appear in the output.
             with contextlib.suppress(ValueError):
                 el.tag = actual_tag_name
 
diff --git a/tests/test_html.py b/tests/test_html.py
index 3240af3..ed0e21c 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,3 +1,6 @@
+import lxml.html
+import pytest
+
 from quotequail._html import (
     Position,
     get_html_tree,
@@ -106,27 +109,48 @@ def test_trim_before():
     assert render_html_tree(tree) == "<div>E</div>"
 
 
-def test_get_html_tree_flattens_at_pseudo_tag_with_attributes():
-    # Unescaped <addr@domain attr="..."> pseudo-tags must round-trip as
-    # visible text without losing attribute values.
-    html = '<div>x<addr@domain foo="bar">y</addr@domain>z</div>'
-    rendered = render_html_tree(get_html_tree(html))
-    assert rendered == '<div>x&lt;addr@domain foo="bar"&gt;yz</div>'
+@pytest.mark.parametrize(
+    ("html", "expected"),
+    [
+        # '@' in tag name — unescaped email-style pseudo-tag
+        (
+            '<div>x<addr@domain foo="bar">y</addr@domain>z</div>',
+            '<div>x&lt;addr@domain foo="bar"&gt;yz</div>',
+        ),
+        # ':' and '"' in tag name — lxml parses <ahref="https://..."> this way
+        (
+            '<div>x<ahref="https://example.com">click</ahref>z</div>',
+            '<div>x&lt;ahref="https: example.com"=""&gt;clickz</div>',
+        ),
+        # ':' and '=' in tag name — e.g. <a:b=c>
+        (
+            "<div>x<a:b=c>click</a:b>z</div>",
+            "<div>x&lt;a:b=c&gt;clickz</div>",
+        ),
+    ],
+)
+def test_get_html_tree_flattens_malformed_tags(html, expected):
+    # Tags whose names contain XPath-special or invalid characters
+    # must be rendered as escaped visible text rather than roundtripped as real
+    # tags, which would raise ValueError in lxml.
+    assert render_html_tree(get_html_tree(html)) == expected
 
 
 def test_get_html_tree_outlook_tag_roundtrip():
     # Outlook uses <o:p> for paragraph padding. The tag must survive the
     # get_html_tree → render_html_tree roundtrip unchanged.
     html = "<div>foo<o:p></o:p>bar</div>"
-    assert render_html_tree(get_html_tree(html)) == "<div>foo<o:p></o:p>bar</div>"
+    assert (
+        render_html_tree(get_html_tree(html)) == "<div>foo<o:p></o:p>bar</div>"
+    )
 
 
-def test_get_html_tree_flattens_malformed_tag_with_colon_and_equals():
-    # <ahref="https://..."> is parsed by lxml as a tag whose name contains
-    # both ':' and '"'/'='. The ':' alone would trigger the Outlook span
-    # roundtrip, but restoring that name raises ValueError in lxml 6.x.
-    # It must be flattened to visible text instead.
-    html = '<div>x<ahref="https://example.com">click</ahref>z</div>'
-    assert render_html_tree(get_html_tree(html)) == (
-        '<div>x&lt;ahref="https: example.com"=""&gt;clickz</div>'
-    )
+def test_render_html_tree_suppresses_space_in_stored_tag_name():
+    # If a tag name containing ':' and ' ' somehow ends up in
+    # __tag_name, render_html_tree must not raise ValueError.
+    tree = lxml.html.fragment_fromstring("<div><span>text</span></div>")
+    span = tree.find("span")
+    span.attrib["__tag_name"] = "o:p style"
+    result = render_html_tree(tree)
+    assert "text" in result
+    assert "__tag_name" not in result