From 36467ccef523f0f572c570bab0e6479c5e9c9042 Mon Sep 17 00:00:00 2001 From: Ville Laitila Date: Fri, 24 Apr 2026 12:36:56 +0300 Subject: [PATCH 1/2] fix(to_xml): strip XML-invalid C0 control chars from attribute values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Attribute values containing C0 control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F) produced XML that parsers reject outright — XML 1.0 forbids these characters even as numeric entities. enc_xml_a_v now strips them before escaping. TAB, LF, CR remain supported (LF is escaped as ). Add a roundtrip test that builds an element with every forbidden C0 char plus <&"'> and DEL, serialises, and re-parses to confirm the output is valid. --- src/sgraph/sgraph.py | 7 +++++++ tests/sgraph_test.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/sgraph/sgraph.py b/src/sgraph/sgraph.py index af5f293..443053a 100644 --- a/src/sgraph/sgraph.py +++ b/src/sgraph/sgraph.py @@ -15,6 +15,7 @@ from collections.abc import Sequence import io import os +import re import sys import uuid import xml.sax.handler @@ -155,6 +156,11 @@ def traverse(self, traverser: Callable[[SElement], None]): for e in self.rootNode.children: traverser(e) + # C0 control characters that XML 1.0 forbids in any content (see + # https://www.w3.org/TR/xml/#charsets). TAB (0x09), LF (0x0A) and CR (0x0D) + # are allowed and handled explicitly during escaping. + _XML_INVALID_CONTROL_RE = re.compile('[\x00-\x08\x0b\x0c\x0e-\x1f]') + def to_xml(self, fname: str | None, stdout: bool = True) -> str | None: rootNode = self.rootNode counter = Counter() @@ -218,6 +224,7 @@ def enc_xml_a_v(v: int | float | set[str] | dict[object, object] | list[str] | s # Forbidden chars are: naked ampersand, left angle bracket, double quote # single quote is fine as we are using double quotes in XML for attributes v = v.encode('utf-8', 'replace').decode() + v = SGraph._XML_INVALID_CONTROL_RE.sub('', v) return v.replace('&', '&').replace('<', '<').replace('>', '>').replace( '\n', '&' + '#' + '10;').replace('"', '"') return '' diff --git a/tests/sgraph_test.py b/tests/sgraph_test.py index 51f330b..af7a82e 100644 --- a/tests/sgraph_test.py +++ b/tests/sgraph_test.py @@ -1,4 +1,5 @@ import copy +import io import os from typing import Any @@ -121,3 +122,32 @@ def test_repr_multi_root_omits_count_when_small(): text = repr(graph) assert 'nginx' in text and 'bar' in text assert 'count=' not in text + + +def test_to_xml_strips_invalid_control_chars_and_roundtrips(): + """Attribute values containing C0 control chars forbidden by XML 1.0 must + be sanitised so that the serialised model parses back cleanly.""" + from sgraph.selement import SElement + + graph = SGraph() + repo = SElement(graph.rootNode, 'repo') + elem = SElement(repo, 'file.py') + + c0_controls = ''.join(chr(i) for i in range(0x20) if i not in (0x09, 0x0A, 0x0D)) + nasty = f'before{c0_controls}after<&"\'>{chr(0x7F)} tab\there' + elem.attrs['description'] = nasty + + xml = graph.to_xml(None, stdout=False) + assert xml is not None + parsed = SGraph.parse_xml_file_or_stream(io.StringIO(xml)) + + repo_node = parsed.rootNode.children[0] + file_node = next(c for c in repo_node.children if c.name == 'file.py') + got = file_node.attrs['description'] + + # All C0 chars except TAB/LF/CR must be stripped. + for ch in c0_controls: + assert ch not in got, f'control char {ch!r} leaked into parsed attribute' + # The visible content must survive intact. + assert got.startswith('before') + assert 'afterxxx'.replace('xxx', '<&"\'>') in got or 'after<&"\'>' in got From a497a1bf2d9c8c24b0c8b8f3d1641968db80c0f0 Mon Sep 17 00:00:00 2001 From: Ville Laitila Date: Wed, 29 Apr 2026 11:22:43 +0300 Subject: [PATCH 2/2] test(to_xml): simplify redundant assertion in C0 control char test Both branches of the previous `or` collapsed to the same expression (`'after<&"\'>' in got`), so the assertion only ever exercised one predicate. Reduce to the single equivalent check for readability. --- tests/sgraph_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/sgraph_test.py b/tests/sgraph_test.py index af7a82e..82680a9 100644 --- a/tests/sgraph_test.py +++ b/tests/sgraph_test.py @@ -150,4 +150,4 @@ def test_to_xml_strips_invalid_control_chars_and_roundtrips(): assert ch not in got, f'control char {ch!r} leaked into parsed attribute' # The visible content must survive intact. assert got.startswith('before') - assert 'afterxxx'.replace('xxx', '<&"\'>') in got or 'after<&"\'>' in got + assert 'after<&"\'>' in got