OpenDIKW · helebest · Jun 28, 2026 · Jun 25, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,65 @@
+name: CI
+
+# Deterministic floor for dikw-data: lint, type-check, unit tests, and a $0
+# dataset shape-gate. Mirrors dikw-core's reusable-ci lint-type-test job, minus
+# the engine-specific legs (Postgres / wheel / e2e). NO provider keys are used —
+# this workflow never calls `dikw client eval`, so it makes no API requests.
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+jobs:
+  lint-type-test:
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.12", "3.13"]
+    env:
+      # Pin every uv command in this job to the matrix interpreter. Installing a
+      # version with `uv python install` does not select it: without UV_PYTHON,
+      # `uv sync` / `uv run` pick the interpreter themselves (e.g. from a
+      # .python-version file), so both matrix legs could test the same Python.
+      UV_PYTHON: ${{ matrix.python-version }}
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b  # v8.1.0
+        with:
+          enable-cache: true
+
+      - name: Set up Python ${{ matrix.python-version }}
+        run: uv python install ${{ matrix.python-version }}
+
+      - name: Sync dependencies
+        run: uv sync
+
+      - name: Ruff
+        run: uv run ruff check .
+
+      - name: Mypy
+        run: uv run mypy src
+
+      - name: Pytest
+        run: uv run pytest
+
+      - name: Validate datasets (shape gate, no API)
+        # scripts/validate_dataset.py exits non-zero on the first invalid dataset,
+        # so under `bash -e` a bad dataset fails the job. Costs nothing — pure
+        # file-shape checks (required files, corpus refs, target relationships).
+        run: |
+          for d in datasets/*/; do
+            echo "== validating $d =="
+            uv run python scripts/validate_dataset.py "$d"
+          done
diff --git a/.github/workflows/eval-gate.yml b/.github/workflows/eval-gate.yml
@@ -0,0 +1,50 @@
+name: Eval gate
+
+# A dataset change (corpus / queries / thresholds) shifts the engine's retrieval
+# numbers, so it must land with a baseline entry recording the real-vector
+# outcome. This is the dikw-data analog of dikw-core's eval-gate: a *content*
+# check (parse the added reports/BASELINES.md lines, assert a new dated entry that
+# names a retrieval metric), not a presence check — a blank-line edit won't pass.
+# Re-running the eval to verify the numbers is separate (needs provider keys).
+#
+# Override: label the PR `no-baseline-needed` for a dataset edit that genuinely
+# shifts no numbers (a corpus typo fix, a rename).
+
+on:
+  pull_request:
+    # Default types are opened/synchronize/reopened; add the label events so
+    # applying or removing `no-baseline-needed` re-evaluates the gate instead of
+    # leaving the previous result in place until the next push.
+    types: [opened, synchronize, reopened, labeled, unlabeled]
+    paths:
+      - 'datasets/**'
+
+permissions:
+  contents: read
+
+jobs:
+  baseline-must-update:
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
+        with:
+          fetch-depth: 0
+
+      # Override via a native expression (no shell): a label containing a quote
+      # can't break detection the way an inline grep on the label could.
+      - name: Skip note (no-baseline-needed)
+        if: ${{ contains(github.event.pull_request.labels.*.name, 'no-baseline-needed') }}
+        run: |
+          echo "::notice::PR labeled 'no-baseline-needed' — skipping baseline content check."
+          echo "Reviewer is expected to confirm the dataset change shifts no numbers."
+
+      # SHAs come in via env (not inline ${{ }} in the script) — the safe pattern
+      # for workflow inputs. check_baselines.py is stdlib-only, so no uv/setup-python.
+      - name: Content-check reports/BASELINES.md
+        if: ${{ !contains(github.event.pull_request.labels.*.name, 'no-baseline-needed') }}
+        env:
+          BASE_SHA: ${{ github.event.pull_request.base.sha }}
+          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
+        run: |
+          python3 tools/check_baselines.py --base-sha "$BASE_SHA" --head-sha "$HEAD_SHA"
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,12 @@ __pycache__/
 .pytest_cache/
 .ruff_cache/
 generated/
-reports/
+# Eval run artifacts are disposable, but the human-readable baseline LOG is the
+# tracked source of truth (the eval-gate workflow asserts a new entry on dataset
+# changes). Ignore everything under reports/ EXCEPT that log — a file inside a
+# fully-ignored dir cannot be re-included, so ignore the contents, not the dir.
+reports/*
+!reports/BASELINES.md
 bases/
 datasets/markdown-books/
+.impeccable/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,21 @@
+# Pre-commit hooks for dikw-data. Install once: `uv run pre-commit install`.
+#
+# Local hooks shell out to `uv run` so they use the same ruff / mypy as the
+# project environment — no version drift between the hook and CI (.github/workflows/ci.yml).
+# These are the cheap deterministic stages; the full floor (incl. pytest +
+# dataset validation) runs in CI.
+repos:
+  - repo: local
+    hooks:
+      - id: ruff
+        name: ruff check
+        entry: uv run ruff check --force-exclude
+        language: system
+        types_or: [python, pyi]
+        require_serial: true
+      - id: mypy
+        name: mypy (strict, src)
+        entry: uv run mypy src
+        language: system
+        pass_filenames: false
+        types_or: [python, pyi]  # same trigger set as the ruff hook, so stub-only commits are type-checked
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,8 +19,57 @@ dependencies = [
 dev = [
     "pytest>=8.0",
     "pytest-asyncio>=0.23",
+    "ruff>=0.15",
+    "mypy>=1.20",
+    "types-PyYAML",
+    # Local git pre-commit hooks (ruff + mypy) mirroring the CI floor. Wire
+    # them once with `uv run pre-commit install`; config in .pre-commit-config.yaml.
+    "pre-commit>=4.0",
 ]
 
+[tool.ruff]
+line-length = 100
+target-version = "py312"
+# Import roots for first-party detection (isort "I" rules): runnable code also lives in scripts/ and web/.
+src = ["src", "scripts", "web", "tests"]
+
+[tool.ruff.lint]
+select = ["E", "F", "W", "I", "UP", "B", "SIM", "C4", "RUF"]
+ignore = [
+    "E501",  # line length is handled by formatter
+    # This is a bilingual zh/en data factory: prompts, sample corpora, and
+    # query strings embed Chinese text and full-width punctuation directly in
+    # .py files. RUF001/002/003 (ambiguous-unicode) then fire on every CJK
+    # character — pure false positives here, so silence them repo-wide.
+    "RUF001",
+    "RUF002",
+    "RUF003",
+]
+
+[tool.ruff.lint.per-file-ignores]
+"tests/*" = ["SIM117"]
+# FastAPI's idiomatic ``Query(default=...)`` / ``Depends(...)`` in parameter
+# defaults is exactly the B008 anti-pattern, but here it's the framework's
+# contract — silence the rule scope-locally rather than refactoring endpoints.
+"web/*.py" = ["B008"]
+# This generator draws pictograms procedurally: each icon is one line of
+# grouped canvas draw-ops (``c.rect(...); c.line(...); wheel(...)``). The
+# semicolon grouping is deliberate — one line == one icon — so splitting it
+# per E702 would bloat and obscure the file. Scope the allowance to this file.
+"scripts/generate_additional_multimodal_datasets.py" = ["E702"]
+
+[tool.mypy]
+python_version = "3.12"
+strict = true
+packages = ["dikw_data"]
+mypy_path = "src"
+explicit_package_bases = true
+
+[[tool.mypy.overrides]]
+# Third-party modules without type info in this install; yaml is omitted because dev deps add types-PyYAML.
+module = ["anthropic.*"]
+ignore_missing_imports = true
+
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 addopts = "-ra -q"

diff --git a/reports/BASELINES.md b/reports/BASELINES.md
@@ -0,0 +1,30 @@
+# dikw-data eval baselines
+
+Dated log of real-vector eval runs against the `dikw-core` engine — the tracked
+source of truth that mirrors `dikw-core/evals/BASELINES.md`. Everything else under
+`reports/` (per-run NDJSON + `summary.json`) is disposable and gitignored; this
+file is kept under version control via the `!reports/BASELINES.md` exception in
+`.gitignore`.
+
+The `eval-gate` workflow (`.github/workflows/eval-gate.yml` +
+`tools/check_baselines.py`) requires a **new** entry here whenever a PR changes
+`datasets/**`: the entry must add a new dated header and name at least one
+retrieval metric. That keeps a dataset change from shifting the engine's numbers
+without a recorded, reviewable outcome.
+
+## Entry template
+
+```
+## <YYYY-MM-DD> — <short title>
+
+- dikw-core: <version>   provider: <llm>+<embedder>   retrieval: <hybrid|all>   cache: <mode>
+- <dataset>: ndcg_at_10 <v>, hit_at_3 <v>, hit_at_10 <v>, mrr <v>, recall_at_100 <v>
+- notes: <anchor delta / saturation / per-language split / std across reruns>
+```
+
+## Entries
+
+_None yet._ The first real entries come from the Phase 0→1 public-anchor
+calibration (`scifact` + `cmteb-t2-subset`); see `docs/dikw-eval-plan.md` §2.3 and
+`docs/phase0-smoke-results.md`. Phase 0 set **no gates** — the synthetic sets
+saturate at 1.0, so thresholds wait for non-saturated, anchored data.
diff --git a/scripts/add_multi_image_chunks.py b/scripts/add_multi_image_chunks.py
@@ -6,7 +6,6 @@
 
 import yaml
 
-
 ROOT = Path(__file__).resolve().parents[1]
 DATASET = "synthetic-multimodal-datasets-v1"
 DATASET_DIR = ROOT / "datasets" / DATASET

diff --git a/scripts/apply_imagegen_multimodal_sheets.py b/scripts/apply_imagegen_multimodal_sheets.py
@@ -5,7 +5,6 @@
 
 from PIL import Image
 
-
 ROOT = Path(__file__).resolve().parents[1]
 DATASET = "synthetic-multimodal-datasets-v1"
 SHEET_DIR = Path.home() / ".codex" / "generated_images" / "019dca28-eaa6-77f0-8a22-0e9235befec2"

diff --git a/scripts/audit_corpus_quality.py b/scripts/audit_corpus_quality.py
@@ -6,7 +6,6 @@
 import sys
 from pathlib import Path
 
-
 BAD_PATTERNS = [
     "The user wants",
     "We need",

diff --git a/scripts/augment_multimodal_dataset.py b/scripts/augment_multimodal_dataset.py
@@ -7,6 +7,8 @@
 
 from generate_multimodal_asset_chunk_dataset import (
     CATEGORIES as BASE_CATEGORIES,
+)
+from generate_multimodal_asset_chunk_dataset import (
     CORPUS_DIR,
     DATASET,
     DATASET_DIR,
@@ -15,7 +17,6 @@
     yaml_scalar,
 )
 
-
 SHEET_DIR = Path.home() / ".codex" / "generated_images" / "019dca28-eaa6-77f0-8a22-0e9235befec2"
 
 

diff --git a/scripts/clean_corpus.py b/scripts/clean_corpus.py
@@ -3,7 +3,6 @@
 import argparse
 import json
 import shutil
-import subprocess
 import sys
 from datetime import datetime
 from pathlib import Path

diff --git a/scripts/generate_additional_multimodal_datasets.py b/scripts/generate_additional_multimodal_datasets.py
@@ -3,10 +3,8 @@
 import math
 import struct
 import zlib
-from collections.abc import Callable
 from pathlib import Path
 
-
 ROOT = Path(__file__).resolve().parents[1]
 W = 256
 H = 256
@@ -253,7 +251,7 @@ def landmark_icon(slug: str) -> list[Color]:
     elif slug == "forbidden_city":
         c.rect(56, 136, 200, 190, (170, 64, 46)); c.polygon([(45, 136), (211, 136), (188, 106), (68, 106)], (198, 146, 55)); c.polygon([(74, 106), (182, 106), (164, 82), (92, 82)], (198, 146, 55)); c.rect(117, 153, 139, 190, (94, 55, 37))
     elif slug == "colosseum":
-        c.ellipse(128, 140, 82, 55, (173, 143, 101)); c.rect(49, 140, 207, 188, (173, 143, 101)); 
+        c.ellipse(128, 140, 82, 55, (173, 143, 101)); c.rect(49, 140, 207, 188, (173, 143, 101))
         for x in [72, 102, 132, 162]:
             c.ellipse(x, 158, 10, 22, (88, 75, 62))
     elif slug == "tower_bridge":

diff --git a/scripts/generate_animals_multimodal_dataset.py b/scripts/generate_animals_multimodal_dataset.py
@@ -5,7 +5,6 @@
 import zlib
 from pathlib import Path
 
-
 ROOT = Path(__file__).resolve().parents[1]
 DATASET_DIR = ROOT / "datasets" / "synthetic-animals-multimodal-v1"
 CORPUS_DIR = DATASET_DIR / "corpus"
@@ -69,7 +68,7 @@ def draw_icon(kind: str) -> list[tuple[int, int, int]]:
                     p = blend(p, (30, 35, 30), 0.96)
                 if ellipse(x, y, 128, 145, 9, 6):
                     p = blend(p, (93, 48, 45), 0.9)
-                if abs(y - 150) < 2 and 88 < x < 115 or abs(y - 150) < 2 and 141 < x < 168:
+                if (abs(y - 150) < 2 and 88 < x < 115) or (abs(y - 150) < 2 and 141 < x < 168):
                     p = blend(p, (80, 58, 47), 0.55)
 
             elif kind == "dog":

diff --git a/scripts/generate_bilingual_corpus.py b/scripts/generate_bilingual_corpus.py
@@ -6,14 +6,12 @@
 import sys
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any
 
 from dikw_data.audit import AuditRecord, AuditStore
 from dikw_data.llm_client import RetryingMiniMaxClient, TaskResult
 from dikw_data.pipeline import add_provider_args, load_config_from_args
 from dikw_data.tasks import LLMTask, hash_text
 
-
 PROMPT_VERSION = "v1"
 MISSING_PROMPT_VERSION = "v5-clean"
 

diff --git a/scripts/generate_diverse_dataset.py b/scripts/generate_diverse_dataset.py
@@ -4,7 +4,6 @@
 import sys
 from pathlib import Path
 
-
 TOPICS = [
     {
         "stem": "chinese-history-tang-founding",

diff --git a/scripts/generate_fruits_multimodal_dataset.py b/scripts/generate_fruits_multimodal_dataset.py
@@ -5,7 +5,6 @@
 import zlib
 from pathlib import Path
 
-
 ROOT = Path(__file__).resolve().parents[1]
 DATASET_DIR = ROOT / "datasets" / "synthetic-fruits-multimodal-v1"
 CORPUS_DIR = DATASET_DIR / "corpus"

diff --git a/scripts/generate_multimodal_asset_chunk_dataset.py b/scripts/generate_multimodal_asset_chunk_dataset.py
@@ -4,7 +4,6 @@
 from dataclasses import dataclass
 from pathlib import Path
 
-
 ROOT = Path(__file__).resolve().parents[1]
 DATASET = "synthetic-multimodal-datasets-v1"
 DATASET_DIR = ROOT / "datasets" / DATASET

diff --git a/scripts/generate_queries_local.py b/scripts/generate_queries_local.py
@@ -1,11 +1,9 @@
 from __future__ import annotations
 
 import argparse
-import re
 import sys
 from pathlib import Path
 
-
 NEGATIVE_QUERIES = [
     "What's the weather in Shanghai tomorrow?",
     "How do I tune a PostgreSQL vacuum schedule for a write-heavy table?",
@@ -53,8 +51,6 @@ def build_positive_queries(docs: list[Path], limit: int) -> list[dict[str, objec
 
     queries: list[dict[str, object]] = []
     for path in selected:
-        text = path.read_text(encoding="utf-8")
-        title = extract_title(text, path.stem)
         marker = "dikwdoc" + path.stem.replace("-", "")
         if path.name.startswith("zh-"):
             queries.append(
@@ -90,16 +86,6 @@ def spread(items: list[Path], count: int) -> list[Path]:
     return picked
 
 
-def extract_title(text: str, fallback: str) -> str:
-    match = re.search(r"^title:\s*(.+)$", text, flags=re.MULTILINE)
-    if match:
-        return match.group(1).strip().strip('"')
-    match = re.search(r"^#\s+(.+)$", text, flags=re.MULTILINE)
-    if match:
-        return match.group(1).strip()
-    return fallback
-
-
 def write_dataset_yaml(dataset_dir: Path) -> None:
     name = dataset_dir.name
     content = f"""name: {name}