From c97c1aef6b18a43fbea4897a166f043a0401ac1a Mon Sep 17 00:00:00 2001 From: Meftun Akarsu Date: Sun, 21 Jun 2026 15:10:07 +0300 Subject: [PATCH] fix(viewer): skip unreadable/non-UTF-8 files instead of aborting the whole bundle _walk_concepts caught only OKFDocumentError, so a single file with invalid UTF-8 bytes, a read error, or a bare ValueError from PyYAML (e.g. an out-of-range timestamp) aborted the entire visualize run. Broaden the handler to (ValueError, OSError) and skip the offending file, matching the tolerance bundle.index._load_doc already applies. Adds tests for the invalid-UTF-8 and bad-timestamp cases. --- okf/src/reference_agent/viewer/generator.py | 11 +++++-- okf/tests/test_viewer.py | 34 +++++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/okf/src/reference_agent/viewer/generator.py b/okf/src/reference_agent/viewer/generator.py index 86aae8b..62197f2 100644 --- a/okf/src/reference_agent/viewer/generator.py +++ b/okf/src/reference_agent/viewer/generator.py @@ -6,7 +6,7 @@ from pathlib import Path from typing import Any -from reference_agent.bundle.document import OKFDocument, OKFDocumentError +from reference_agent.bundle.document import OKFDocument _INDEX_NAME = "index.md" _LINK_RE = re.compile(r"\]\(([^)\s]+\.md)(?:#[A-Za-z0-9_\-]*)?\)") @@ -75,7 +75,14 @@ def _walk_concepts(bundle_root: Path) -> list[Concept]: concept_id = "/".join(rel.parts) try: doc = OKFDocument.parse(md_path.read_text(encoding="utf-8")) - except OKFDocumentError: + except (ValueError, OSError): + # Skip a single malformed/unreadable file rather than aborting the + # whole visualization. ValueError covers OKFDocument parse failures + # (OKFDocumentError) and invalid UTF-8 (UnicodeDecodeError) — both + # ValueError subclasses — plus PyYAML's *bare* ValueError for an + # out-of-range frontmatter value such as a bad timestamp; OSError + # covers read errors. bundle.index._load_doc tolerates the same + # files via a broader `except Exception`. continue fm = doc.frontmatter or {} tags = fm.get("tags") or [] diff --git a/okf/tests/test_viewer.py b/okf/tests/test_viewer.py index cdb0b18..9fa626e 100644 --- a/okf/tests/test_viewer.py +++ b/okf/tests/test_viewer.py @@ -160,6 +160,40 @@ def test_node_colors_match_palette(tmp_path: Path): assert by_id["references/metrics/dau"]["color"] == "#10b981" +def test_unreadable_concept_file_is_skipped(tmp_path: Path): + # A single file with invalid UTF-8 bytes must not abort the whole + # visualization — it should be skipped like a malformed-frontmatter file. + bundle = tmp_path / "bundle" + _make_bundle(bundle) + corrupt = bundle / "tables" / "corrupt.md" + corrupt.parent.mkdir(parents=True, exist_ok=True) + corrupt.write_bytes(b"---\ntype: BigQuery Table\n\xff\xfe not utf-8\n---\n") + out = tmp_path / "viz.html" + stats = generate_visualization(bundle, out) + assert out.exists() + # The 4 well-formed concepts still render; the corrupt file is skipped. + assert stats["concepts"] == 4 + + +def test_malformed_timestamp_concept_file_is_skipped(tmp_path: Path): + # A valid-UTF-8 file whose frontmatter has an out-of-range timestamp makes + # PyYAML raise a *bare* ValueError (not an OKFDocumentError); it must be + # skipped like any other malformed file, not abort the whole viz. + bundle = tmp_path / "bundle" + _make_bundle(bundle) + bad = bundle / "tables" / "badts.md" + bad.parent.mkdir(parents=True, exist_ok=True) + bad.write_text( + "---\ntype: BigQuery Table\ntimestamp: 2026-13-45\n---\nbody\n", + encoding="utf-8", + ) + out = tmp_path / "viz.html" + stats = generate_visualization(bundle, out) + assert out.exists() + # The 4 well-formed concepts still render; the bad-timestamp file is skipped. + assert stats["concepts"] == 4 + + def test_raises_when_bundle_missing(tmp_path: Path): with pytest.raises(FileNotFoundError): generate_visualization(tmp_path / "nope", tmp_path / "viz.html")