From 23347bfda37b0781732e7025607172109724b214 Mon Sep 17 00:00:00 2001 From: Aravinda Sharma <7734009+assinchu@users.noreply.github.com> Date: Tue, 23 Jun 2026 18:09:16 -0700 Subject: [PATCH] fix(report): strip ANSI/control bytes from report output Scanned skill content (and LLM output quoting it) can carry ANSI escape sequences and control bytes (NUL, ESC, ...) into finding text. Emitted verbatim, these make a report register as binary: GitLab/editors offer 'download' instead of rendering the Markdown, and terminals print garbled output. Sanitize every finding's free-text fields once in the report node (the single scoring/formatting point), so terminal, JSON, Markdown, and SARIF output all stay clean UTF-8. Tabs, newlines, and multibyte UTF-8 (e.g. emoji severity markers) are preserved. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Aravinda Sharma <7734009+assinchu@users.noreply.github.com> --- src/skillspector/nodes/report.py | 38 ++++++++++++++ tests/nodes/test_report_sanitizer.py | 76 ++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 tests/nodes/test_report_sanitizer.py diff --git a/src/skillspector/nodes/report.py b/src/skillspector/nodes/report.py index 48e15d30..16e50cb5 100644 --- a/src/skillspector/nodes/report.py +++ b/src/skillspector/nodes/report.py @@ -22,6 +22,8 @@ from __future__ import annotations import json +import re +from dataclasses import replace from datetime import UTC, datetime from io import StringIO from typing import Literal @@ -65,6 +67,38 @@ } +# Scanned skill content (and LLM output that quotes it) can carry ANSI escape +# sequences and control bytes (e.g. NUL, ESC) into finding text. Left in, they +# make a rendered report register as binary — GitLab/editors show "download" +# instead of the Markdown, and terminals emit garbled output. Strip them from +# every finding text field so all report formats stay clean UTF-8. +_ANSI_RE = re.compile(r"\x1b\[[0-9;?]*[ -/]*[@-~]") +_CONTROL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]") + +# Finding free-text fields that may carry scanned/LLM content. +_SANITIZED_FIELDS = ( + "message", + "explanation", + "remediation", + "finding", + "context", + "matched_text", + "code_snippet", +) + + +def _clean_text(value: str | None) -> str | None: + """Strip ANSI escape sequences and disallowed control chars (keep tab/newline).""" + if not isinstance(value, str): + return value + return _CONTROL_RE.sub("", _ANSI_RE.sub("", value)) + + +def _sanitize_finding(finding: Finding) -> Finding: + """Return a copy of *finding* with control/ANSI bytes stripped from text fields.""" + return replace(finding, **{f: _clean_text(getattr(finding, f)) for f in _SANITIZED_FIELDS}) + + def _severity_to_sarif_level(severity: str) -> Literal["error", "warning", "note"]: """Map Finding.severity to SARIF result level.""" return { @@ -533,6 +567,10 @@ def report(state: SkillspectorState) -> dict[str, object]: """ raw_findings = state.get("findings", []) filtered_findings = state.get("filtered_findings", raw_findings) + # Strip ANSI/control bytes once here so every downstream format (terminal, + # json, markdown, sarif) and the returned findings stay clean UTF-8. Applied + # before partition/dedup so active and suppressed findings are both clean. + filtered_findings = [_sanitize_finding(f) for f in filtered_findings] component_metadata = state.get("component_metadata") or [] components = state.get("components") or [] file_cache = state.get("file_cache") or {} diff --git a/tests/nodes/test_report_sanitizer.py b/tests/nodes/test_report_sanitizer.py new file mode 100644 index 00000000..0f2b5ba1 --- /dev/null +++ b/tests/nodes/test_report_sanitizer.py @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for report-output sanitization (ANSI / control-byte stripping).""" + +from __future__ import annotations + +import pytest + +from skillspector.models import Finding +from skillspector.nodes.report import _clean_text, _sanitize_finding, report +from skillspector.state import SkillspectorState + + +def _dirty_finding() -> Finding: + return Finding( + rule_id="E2", + message="creds \x1b[31mleak\x1b[0m here\x00", + severity="HIGH", + confidence=0.9, + file="a/SKILL.md", + start_line=5, + remediation="redact \x1b[1mnow\x1b[0m", + context="line with \x07 bell and \x1b[0m reset", + ) + + +def test_clean_text_strips_ansi_and_control_keeps_readable() -> None: + assert _clean_text("a\x1b[31mb\x1b[0mc\x00d") == "abcd" + # Tabs and newlines are preserved. + assert _clean_text("a\tb\nc") == "a\tb\nc" + # Emoji / multibyte UTF-8 is untouched. + assert _clean_text("🔴 HIGH") == "🔴 HIGH" + # Non-strings pass through. + assert _clean_text(None) is None + + +def test_sanitize_finding_cleans_text_fields_only() -> None: + cleaned = _sanitize_finding(_dirty_finding()) + assert "\x1b" not in cleaned.message and "\x00" not in cleaned.message + assert "leak" in cleaned.message and "here" in cleaned.message + assert "\x1b" not in (cleaned.remediation or "") + assert "\x07" not in (cleaned.context or "") + # Non-text fields are unchanged. + assert cleaned.rule_id == "E2" + assert cleaned.start_line == 5 + + +@pytest.mark.parametrize("fmt", ["markdown", "json", "sarif", "terminal"]) +def test_report_emits_clean_utf8_for_all_formats(fmt: str) -> None: + """No ANSI/control bytes leak into any report format.""" + state: SkillspectorState = { + "filtered_findings": [_dirty_finding()], + "component_metadata": [], + "has_executable_scripts": False, + "manifest": {}, + "skill_path": None, + "output_format": fmt, + } + body = report(state)["report_body"] + assert "\x00" not in body, f"NUL leaked into {fmt}" + assert "\x1b" not in body, f"ESC leaked into {fmt}" + # The readable content survives the sanitization. + assert "leak" in body and "here" in body