diff --git a/src/skillspector/nodes/report.py b/src/skillspector/nodes/report.py index 48e15d30..16e50cb5 100644 --- a/src/skillspector/nodes/report.py +++ b/src/skillspector/nodes/report.py @@ -22,6 +22,8 @@ from __future__ import annotations import json +import re +from dataclasses import replace from datetime import UTC, datetime from io import StringIO from typing import Literal @@ -65,6 +67,38 @@ } +# Scanned skill content (and LLM output that quotes it) can carry ANSI escape +# sequences and control bytes (e.g. NUL, ESC) into finding text. Left in, they +# make a rendered report register as binary — GitLab/editors show "download" +# instead of the Markdown, and terminals emit garbled output. Strip them from +# every finding text field so all report formats stay clean UTF-8. +_ANSI_RE = re.compile(r"\x1b\[[0-9;?]*[ -/]*[@-~]") +_CONTROL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]") + +# Finding free-text fields that may carry scanned/LLM content. +_SANITIZED_FIELDS = ( + "message", + "explanation", + "remediation", + "finding", + "context", + "matched_text", + "code_snippet", +) + + +def _clean_text(value: str | None) -> str | None: + """Strip ANSI escape sequences and disallowed control chars (keep tab/newline).""" + if not isinstance(value, str): + return value + return _CONTROL_RE.sub("", _ANSI_RE.sub("", value)) + + +def _sanitize_finding(finding: Finding) -> Finding: + """Return a copy of *finding* with control/ANSI bytes stripped from text fields.""" + return replace(finding, **{f: _clean_text(getattr(finding, f)) for f in _SANITIZED_FIELDS}) + + def _severity_to_sarif_level(severity: str) -> Literal["error", "warning", "note"]: """Map Finding.severity to SARIF result level.""" return { @@ -533,6 +567,10 @@ def report(state: SkillspectorState) -> dict[str, object]: """ raw_findings = state.get("findings", []) filtered_findings = state.get("filtered_findings", raw_findings) + # Strip ANSI/control bytes once here so every downstream format (terminal, + # json, markdown, sarif) and the returned findings stay clean UTF-8. Applied + # before partition/dedup so active and suppressed findings are both clean. + filtered_findings = [_sanitize_finding(f) for f in filtered_findings] component_metadata = state.get("component_metadata") or [] components = state.get("components") or [] file_cache = state.get("file_cache") or {} diff --git a/tests/nodes/test_report_sanitizer.py b/tests/nodes/test_report_sanitizer.py new file mode 100644 index 00000000..0f2b5ba1 --- /dev/null +++ b/tests/nodes/test_report_sanitizer.py @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for report-output sanitization (ANSI / control-byte stripping).""" + +from __future__ import annotations + +import pytest + +from skillspector.models import Finding +from skillspector.nodes.report import _clean_text, _sanitize_finding, report +from skillspector.state import SkillspectorState + + +def _dirty_finding() -> Finding: + return Finding( + rule_id="E2", + message="creds \x1b[31mleak\x1b[0m here\x00", + severity="HIGH", + confidence=0.9, + file="a/SKILL.md", + start_line=5, + remediation="redact \x1b[1mnow\x1b[0m", + context="line with \x07 bell and \x1b[0m reset", + ) + + +def test_clean_text_strips_ansi_and_control_keeps_readable() -> None: + assert _clean_text("a\x1b[31mb\x1b[0mc\x00d") == "abcd" + # Tabs and newlines are preserved. + assert _clean_text("a\tb\nc") == "a\tb\nc" + # Emoji / multibyte UTF-8 is untouched. + assert _clean_text("🔴 HIGH") == "🔴 HIGH" + # Non-strings pass through. + assert _clean_text(None) is None + + +def test_sanitize_finding_cleans_text_fields_only() -> None: + cleaned = _sanitize_finding(_dirty_finding()) + assert "\x1b" not in cleaned.message and "\x00" not in cleaned.message + assert "leak" in cleaned.message and "here" in cleaned.message + assert "\x1b" not in (cleaned.remediation or "") + assert "\x07" not in (cleaned.context or "") + # Non-text fields are unchanged. + assert cleaned.rule_id == "E2" + assert cleaned.start_line == 5 + + +@pytest.mark.parametrize("fmt", ["markdown", "json", "sarif", "terminal"]) +def test_report_emits_clean_utf8_for_all_formats(fmt: str) -> None: + """No ANSI/control bytes leak into any report format.""" + state: SkillspectorState = { + "filtered_findings": [_dirty_finding()], + "component_metadata": [], + "has_executable_scripts": False, + "manifest": {}, + "skill_path": None, + "output_format": fmt, + } + body = report(state)["report_body"] + assert "\x00" not in body, f"NUL leaked into {fmt}" + assert "\x1b" not in body, f"ESC leaked into {fmt}" + # The readable content survives the sanitization. + assert "leak" in body and "here" in body