OWASP · PRAteek-singHWY · Jun 9, 2026 · Jun 9, 2026
diff --git a/application/tests/librarian/__init__.py b/application/tests/librarian/__init__.py
diff --git a/application/tests/librarian/config_loader_test.py b/application/tests/librarian/config_loader_test.py
@@ -0,0 +1,99 @@
+"""Unit tests for ``application.utils.librarian.config_loader``.
+
+Every test swaps ``os.environ`` for a controlled mapping (``clear=True``) so the
+outcome never depends on ``CRE_LIBRARIAN_*`` variables in the caller's shell.
+"""
+
+import os
+import unittest
+from unittest import mock
+
+from application.utils.librarian.config_loader import LibrarianConfig, load_config
+
+
+class TestConfigLoaderDefaults(unittest.TestCase):
+    """Behaviour of ``load_config`` when no overrides are set."""
+
+    def test_defaults_when_env_unset(self):
+        """An empty environment yields the expected default for every field."""
+        with mock.patch.dict(os.environ, {}, clear=True):
+            cfg = load_config()
+        self.assertEqual(cfg.crossencoder_model, "cross-encoder/ms-marco-MiniLM-L-6-v2")
+        self.assertEqual(cfg.top_k_retrieval, 20)
+        self.assertEqual(cfg.top_k_rerank, 5)
+        self.assertEqual(cfg.link_threshold, 0.8)
+        self.assertEqual(cfg.batch_size, 32)
+        self.assertEqual(cfg.ece_target, 0.10)
+        self.assertEqual(cfg.conformal_alpha, 0.10)
+
+    def test_config_is_frozen(self):
+        """Assigning to a field raises and leaves the stored value untouched."""
+        # Isolate from the ambient environment like every other test here;
+        # otherwise a stray CRE_LIBRARIAN_* variable could make load_config()
+        # fail before the immutability check is ever reached.
+        with mock.patch.dict(os.environ, {}, clear=True):
+            cfg = load_config()
+        # NOTE(review): the exception type depends on how LibrarianConfig
+        # implements immutability, which is not visible from this module, so
+        # the broad ``Exception`` is kept as written.
+        with self.assertRaises(Exception):
+            cfg.link_threshold = 0.5  # type: ignore[misc]
+        self.assertEqual(cfg.link_threshold, 0.8)
+
+
+class TestConfigLoaderOverrides(unittest.TestCase):
+    """Behaviour of ``load_config`` when ``CRE_LIBRARIAN_*`` variables are set."""
+
+    # One valid, non-default value per setting.
+    OVERRIDES = {
+        "CRE_LIBRARIAN_CROSSENCODER_MODEL": "cross-encoder/other",
+        "CRE_LIBRARIAN_TOP_K_RETRIEVAL": "50",
+        "CRE_LIBRARIAN_TOP_K_RERANK": "10",
+        "CRE_LIBRARIAN_LINK_THRESHOLD": "0.7",
+        "CRE_LIBRARIAN_BATCH_SIZE": "64",
+        "CRE_LIBRARIAN_ECE_TARGET": "0.05",
+        "CRE_LIBRARIAN_CONFORMAL_ALPHA": "0.20",
+    }
+
+    def _assert_load_rejects(self, env):
+        """Assert ``load_config`` raises ``ValueError`` when only *env* is set."""
+        with mock.patch.dict(os.environ, env, clear=True):
+            with self.assertRaises(ValueError):
+                load_config()
+
+    def test_env_overrides_apply(self):
+        """Each override is parsed to its field's type and stored."""
+        with mock.patch.dict(os.environ, self.OVERRIDES, clear=True):
+            cfg = load_config()
+        self.assertEqual(cfg.crossencoder_model, "cross-encoder/other")
+        self.assertEqual(cfg.top_k_retrieval, 50)
+        self.assertEqual(cfg.top_k_rerank, 10)
+        self.assertAlmostEqual(cfg.link_threshold, 0.7)
+        self.assertEqual(cfg.batch_size, 64)
+        self.assertAlmostEqual(cfg.ece_target, 0.05)
+        self.assertAlmostEqual(cfg.conformal_alpha, 0.20)
+
+    def test_bad_int_env_raises(self):
+        """A non-integer string for an integer setting is rejected."""
+        self._assert_load_rejects({"CRE_LIBRARIAN_TOP_K_RETRIEVAL": "not-an-int"})
+
+    def test_link_threshold_above_one_raises(self):
+        """A link threshold greater than 1 is rejected."""
+        self._assert_load_rejects({"CRE_LIBRARIAN_LINK_THRESHOLD": "1.2"})
+
+    def test_negative_top_k_retrieval_raises(self):
+        """A negative ``top_k_retrieval`` is rejected."""
+        self._assert_load_rejects({"CRE_LIBRARIAN_TOP_K_RETRIEVAL": "-1"})
+
+    def test_rerank_greater_than_retrieval_raises(self):
+        """``top_k_rerank`` larger than ``top_k_retrieval`` is rejected."""
+        self._assert_load_rejects(
+            {
+                "CRE_LIBRARIAN_TOP_K_RETRIEVAL": "3",
+                "CRE_LIBRARIAN_TOP_K_RERANK": "5",
+            }
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/application/tests/librarian/dataset_test.py b/application/tests/librarian/dataset_test.py
@@ -0,0 +1,142 @@
+"""Sanity tests for the populated golden dataset.
+
+These tests run in CI against the committed JSON. They do NOT require the DB —
+the DB-driven derivation is covered by ``scripts/build_golden_dataset.py``'s
+own ``--check`` mode, which the determinism test invokes when the DB is present.
+"""
+
+import json
+import os
+import subprocess
+import sys
+import unittest
+from collections import Counter
+
+import jsonschema
+from pydantic import ValidationError
+
+from application.utils.librarian.schemas import GoldenDatasetRow
+
+# Every path below is derived from this file's location.
+_HERE = os.path.dirname(__file__)
+_REPO_ROOT = os.path.abspath(os.path.join(_HERE, "..", "..", ".."))
+_DATASET = os.path.join(_HERE, "fixtures", "golden_dataset.json")
+_JSON_SCHEMA = os.path.join(_HERE, "fixtures", "golden_dataset.schema.json")
+_BUILD_SCRIPT = os.path.join(_REPO_ROOT, "scripts", "build_golden_dataset.py")
+_DB = os.path.join(_REPO_ROOT, "standards_cache.sqlite")
+
+
+def _load(path):
+    """Return the parsed JSON document stored at *path* (read as UTF-8)."""
+    with open(path, encoding="utf-8") as fh:
+        return json.load(fh)
+
+
+class TestGoldenDatasetShape(unittest.TestCase):
+    """Structural checks on the committed golden dataset fixture."""
+
+    @classmethod
+    def setUpClass(cls):
+        """Parse the dataset and its JSON Schema once for the whole class."""
+        cls.rows = _load(_DATASET)
+        cls.json_schema = _load(_JSON_SCHEMA)
+
+    def test_has_at_least_the_asvs_core(self):
+        # Master guide §9.3 + golden-dataset-plan §3: at least 277 ASVS-core rows.
+        positive_asvs = [
+            r
+            for r in self.rows
+            if r["slice"] == "positive"
+            and r["provenance"]["ground_truth_source"].startswith(
+                "OpenCRE DB mapping (cre_node_links)"
+            )
+        ]
+        self.assertGreaterEqual(len(positive_asvs), 277)
+
+    def test_all_five_slices_present(self):
+        slices = Counter(r["slice"] for r in self.rows)
+        self.assertEqual(
+            set(slices),
+            {"explicit", "positive", "hard_negative", "update", "ambiguous"},
+        )
+        # Every slice has at least a few rows so the harness can stratify.
+        for s, n in slices.items():
+            self.assertGreaterEqual(n, 5, msg=f"slice {s} only has {n} rows")
+
+    def test_multilink_positive_rows_exist(self):
+        # The Q-D scoring rule has nothing to exercise unless multi-link
+        # ground truth is actually present somewhere.
+        multi = [
+            r
+            for r in self.rows
+            if r["slice"] == "positive"
+            and len(r.get("expected", {}).get("cre_ids") or []) > 1
+        ]
+        self.assertGreater(len(multi), 0, "no multi-link positive rows present")
+
+    def test_every_row_validates_against_pydantic_model(self):
+        """Every row is accepted by ``GoldenDatasetRow.model_validate``."""
+        errors = []
+        for r in self.rows:
+            try:
+                GoldenDatasetRow.model_validate(r)
+            except ValidationError as e:
+                errors.append((r.get("id"), str(e)))
+        self.assertEqual(errors, [], msg=f"first failure: {errors[:1]}")
+
+    def test_every_row_validates_against_json_schema(self):
+        """Every row satisfies the JSON Schema; at most three bad rows are reported."""
+        validator = jsonschema.Draft202012Validator(self.json_schema)
+        first_errors = []
+        for r in self.rows:
+            errs = sorted(validator.iter_errors(r), key=str)
+            if errs:
+                first_errors.append((r.get("id"), [str(e) for e in errs[:1]]))
+                if len(first_errors) >= 3:
+                    break
+        self.assertEqual(first_errors, [])
+
+    def test_provenance_is_recorded_per_row(self):
+        """Every row names a non-empty ``ground_truth_source``."""
+        for r in self.rows:
+            # Treat a missing or null ``provenance`` like a missing source so the
+            # failure names the offending row instead of raising KeyError.
+            provenance = r.get("provenance") or {}
+            self.assertTrue(
+                provenance.get("ground_truth_source"),
+                msg=f"row {r.get('id')} missing ground_truth_source",
+            )
+
+    def test_ids_are_unique(self):
+        """No two rows share an ``id``; repeated ids are listed on failure."""
+        id_counts = Counter(r["id"] for r in self.rows)
+        duplicates = [row_id for row_id, n in id_counts.items() if n > 1]
+        self.assertEqual(duplicates, [], msg=f"duplicate ids: {duplicates}")
+
+
+class TestDatasetDeterminism(unittest.TestCase):
+    """The committed JSON must re-derive identically from the DB."""
+
+    def test_build_check_matches_committed_dataset(self):
+        """Run the build script in ``--check`` mode and require exit status 0."""
+        if not os.path.exists(_DB):
+            self.skipTest("standards_cache.sqlite not present")
+        result = subprocess.run(
+            [sys.executable, _BUILD_SCRIPT, "--check"],
+            capture_output=True,
+            text=True,
+            # Run from the repository root (where _DB lives) so any cwd-relative
+            # path in the script resolves the same wherever the runner started.
+            cwd=_REPO_ROOT,
+            # Exit status is asserted below, with captured output in the message.
+            check=False,
+        )
+        self.assertEqual(
+            result.returncode,
+            0,
+            msg=f"stdout={result.stdout}\nstderr={result.stderr}",
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()