Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
name: CI

# Deterministic floor for dikw-data: lint, type-check, unit tests, and a $0
# dataset shape-gate. Mirrors dikw-core's reusable-ci lint-type-test job, minus
# the engine-specific legs (Postgres / wheel / e2e). NO provider keys are used —
# this workflow never calls `dikw client eval`, so it makes no API requests.

on:
  push:
    branches: [main]
  pull_request:

# One live run per workflow + ref: a newer push cancels the run it supersedes.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  lint-type-test:
    runs-on: ubuntu-latest
    timeout-minutes: 15
    strategy:
      # Let every Python leg finish so one failure doesn't hide another.
      fail-fast: false
      matrix:
        python-version: ["3.12", "3.13"]
    env:
      # Pin every later `uv sync` / `uv run` to this leg's interpreter.
      # Installing a version does not by itself select it: without an explicit
      # request uv falls back to its normal discovery (e.g. a `.python-version`
      # pin), which is not guaranteed to be the matrix version.
      UV_PYTHON: ${{ matrix.python-version }}
    steps:
      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3

      - name: Install uv
        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
        with:
          enable-cache: true

      - name: Set up Python ${{ matrix.python-version }}
        run: uv python install ${{ matrix.python-version }}

      - name: Sync dependencies
        run: uv sync

      - name: Ruff
        run: uv run ruff check .

      - name: Mypy
        run: uv run mypy src

      - name: Pytest
        run: uv run pytest

      - name: Validate datasets (shape gate, no API)
        # scripts/validate_dataset.py exits non-zero on the first invalid dataset,
        # so under `bash -e` a bad dataset fails the job. Costs nothing — pure
        # file-shape checks (required files, corpus refs, target relationships).
        run: |
          for d in datasets/*/; do
            echo "== validating $d =="
            uv run python scripts/validate_dataset.py "$d"
          done
46 changes: 46 additions & 0 deletions .github/workflows/eval-gate.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
name: Eval gate

# A dataset change (corpus / queries / thresholds) shifts the engine's retrieval
# numbers, so it must land with a baseline entry recording the real-vector
# outcome. This is the dikw-data analog of dikw-core's eval-gate: a *content*
# check (parse the added reports/BASELINES.md lines, assert a new dated entry that
# names a retrieval metric), not a presence check — a blank-line edit won't pass.
# Re-running the eval to verify the numbers is separate (needs provider keys).
#
# Override: label the PR `no-baseline-needed` for a dataset edit that genuinely
# shifts no numbers (a corpus typo fix, a rename).

on:
  pull_request:
    # `labeled` / `unlabeled` are NOT among the default activity types (opened,
    # synchronize, reopened). Without them, adding or removing the
    # `no-baseline-needed` label would not start a new run, so the override
    # would only take effect on the next push to the PR.
    types: [opened, synchronize, reopened, labeled, unlabeled]
    paths:
      - 'datasets/**'

permissions:
  contents: read

jobs:
  baseline-must-update:
    runs-on: ubuntu-latest
    timeout-minutes: 5
    steps:
      # Full history so both the base and head commits are available locally.
      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
        with:
          fetch-depth: 0

      # Override via a native expression (no shell): a label containing a quote
      # can't break detection the way an inline grep on the label could.
      - name: Skip note (no-baseline-needed)
        if: ${{ contains(github.event.pull_request.labels.*.name, 'no-baseline-needed') }}
        run: |
          echo "::notice::PR labeled 'no-baseline-needed' — skipping baseline content check."
          echo "Reviewer is expected to confirm the dataset change shifts no numbers."

      # SHAs come in via env (not inline ${{ }} in the script) — the safe pattern
      # for workflow inputs. check_baselines.py is stdlib-only, so no uv/setup-python.
      - name: Content-check reports/BASELINES.md
        if: ${{ !contains(github.event.pull_request.labels.*.name, 'no-baseline-needed') }}
        env:
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          python3 tools/check_baselines.py --base-sha "$BASE_SHA" --head-sha "$HEAD_SHA"
8 changes: 7 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ __pycache__/
.pytest_cache/
.ruff_cache/
generated/
reports/
# Eval run artifacts are disposable, but the human-readable baseline LOG is the
# tracked source of truth (the eval-gate workflow asserts a new entry on dataset
# changes). Ignore everything under reports/ EXCEPT that log — a file inside a
# fully-ignored dir cannot be re-included, so ignore the contents, not the dir.
reports/*
!reports/BASELINES.md
bases/
datasets/markdown-books/
.impeccable/
21 changes: 21 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Git pre-commit hooks for dikw-data.
# One-time setup: `uv run pre-commit install`.
#
# Both hooks are `local` and invoke the tools through `uv run`, so they run the
# same ruff / mypy that pyproject pins — the hook and CI
# (.github/workflows/ci.yml) cannot drift apart on tool versions.
# Only the cheap deterministic stages live here; pytest and dataset validation
# are left to CI.
repos:
  - repo: local
    hooks:
      # Lint the staged Python sources / stubs.
      - id: ruff
        name: ruff check
        language: system
        entry: uv run ruff check --force-exclude
        require_serial: true
        types_or:
          - python
          - pyi

      # Type-check the whole `src` tree whenever a Python file is staged; the
      # staged file names are not forwarded to mypy.
      - id: mypy
        name: mypy (strict, src)
        language: system
        entry: uv run mypy src
        pass_filenames: false
        types:
          - python
49 changes: 49 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,57 @@ dependencies = [
dev = [
"pytest>=8.0",
"pytest-asyncio>=0.23",
"ruff>=0.15",
"mypy>=1.20",
"types-PyYAML",
# Local git pre-commit hooks (ruff + mypy) mirroring the CI floor. Wire
# them once with `uv run pre-commit install`; config in .pre-commit-config.yaml.
"pre-commit>=4.0",
]

[tool.ruff]
line-length = 100
target-version = "py312"
# This repo keeps runnable code outside src/ too (scripts/, web/), so lint the lot.
src = ["src", "scripts", "web", "tests"]

[tool.ruff.lint]
select = ["E", "F", "W", "I", "UP", "B", "SIM", "C4", "RUF"]
ignore = [
"E501", # line length is handled by formatter
# This is a bilingual zh/en data factory: prompts, sample corpora, and
# query strings embed Chinese text and full-width punctuation directly in
# .py files. RUF001/002/003 (ambiguous-unicode) then fire on every CJK
# character — pure false positives here, so silence them repo-wide.
"RUF001",
"RUF002",
"RUF003",
]

[tool.ruff.lint.per-file-ignores]
"tests/*" = ["SIM117"]
# FastAPI's idiomatic ``Query(default=...)`` / ``Depends(...)`` in parameter
# defaults is exactly the B008 anti-pattern, but here it's the framework's
# contract — silence the rule scope-locally rather than refactoring endpoints.
"web/*.py" = ["B008"]
# This generator draws pictograms procedurally: each icon is one line of
# grouped canvas draw-ops (``c.rect(...); c.line(...); wheel(...)``). The
# semicolon grouping is deliberate — one line == one icon — so splitting it
# per E702 would bloat and obscure the file. Scope the allowance to this file.
"scripts/generate_additional_multimodal_datasets.py" = ["E702"]

[tool.mypy]
python_version = "3.12"
strict = true
packages = ["dikw_data"]
mypy_path = "src"
explicit_package_bases = true

[[tool.mypy.overrides]]
# Third-party modules without bundled type stubs in this repo's install.
module = ["anthropic.*", "yaml.*"]
ignore_missing_imports = true

[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "-ra -q"
Expand Down
30 changes: 30 additions & 0 deletions reports/BASELINES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# dikw-data eval baselines

Dated log of real-vector eval runs against the `dikw-core` engine — the tracked
source of truth that mirrors `dikw-core/evals/BASELINES.md`. Everything else under
`reports/` (per-run NDJSON + `summary.json`) is disposable and gitignored; this
file is kept under version control via the `!reports/BASELINES.md` exception in
`.gitignore`.

The `eval-gate` workflow (`.github/workflows/eval-gate.yml` +
`tools/check_baselines.py`) requires a **new** entry here whenever a PR changes
`datasets/**`, unless the PR carries the `no-baseline-needed` label. The entry
must add a new dated header and name at least one retrieval metric. That keeps a
dataset change from shifting the engine's numbers without a recorded, reviewable
outcome.

## Entry template

```
## <YYYY-MM-DD> — <short title>

- dikw-core: <version> provider: <llm>+<embedder> retrieval: <hybrid|all> cache: <mode>
- <dataset>: ndcg_at_10 <v>, hit_at_3 <v>, hit_at_10 <v>, mrr <v>, recall_at_100 <v>
- notes: <anchor delta / saturation / per-language split / std across reruns>
```

## Entries

_None yet._ The first real entries come from the Phase 0→1 public-anchor
calibration (`scifact` + `cmteb-t2-subset`); see `docs/dikw-eval-plan.md` §2.3 and
`docs/phase0-smoke-results.md`. Phase 0 set **no gates** — the synthetic sets
saturate at 1.0, so thresholds wait for non-saturated, anchored data.
1 change: 0 additions & 1 deletion scripts/add_multi_image_chunks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@

import yaml


ROOT = Path(__file__).resolve().parents[1]
DATASET = "synthetic-multimodal-datasets-v1"
DATASET_DIR = ROOT / "datasets" / DATASET
Expand Down
1 change: 0 additions & 1 deletion scripts/apply_imagegen_multimodal_sheets.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from PIL import Image


ROOT = Path(__file__).resolve().parents[1]
DATASET = "synthetic-multimodal-datasets-v1"
SHEET_DIR = Path.home() / ".codex" / "generated_images" / "019dca28-eaa6-77f0-8a22-0e9235befec2"
Expand Down
1 change: 0 additions & 1 deletion scripts/audit_corpus_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import sys
from pathlib import Path


BAD_PATTERNS = [
"The user wants",
"We need",
Expand Down
3 changes: 2 additions & 1 deletion scripts/augment_multimodal_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

from generate_multimodal_asset_chunk_dataset import (
CATEGORIES as BASE_CATEGORIES,
)
from generate_multimodal_asset_chunk_dataset import (
CORPUS_DIR,
DATASET,
DATASET_DIR,
Expand All @@ -15,7 +17,6 @@
yaml_scalar,
)


SHEET_DIR = Path.home() / ".codex" / "generated_images" / "019dca28-eaa6-77f0-8a22-0e9235befec2"


Expand Down
1 change: 0 additions & 1 deletion scripts/clean_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import argparse
import json
import shutil
import subprocess
import sys
from datetime import datetime
from pathlib import Path
Expand Down
4 changes: 1 addition & 3 deletions scripts/generate_additional_multimodal_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,8 @@
import math
import struct
import zlib
from collections.abc import Callable
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
W = 256
H = 256
Expand Down Expand Up @@ -253,7 +251,7 @@ def landmark_icon(slug: str) -> list[Color]:
elif slug == "forbidden_city":
c.rect(56, 136, 200, 190, (170, 64, 46)); c.polygon([(45, 136), (211, 136), (188, 106), (68, 106)], (198, 146, 55)); c.polygon([(74, 106), (182, 106), (164, 82), (92, 82)], (198, 146, 55)); c.rect(117, 153, 139, 190, (94, 55, 37))
elif slug == "colosseum":
c.ellipse(128, 140, 82, 55, (173, 143, 101)); c.rect(49, 140, 207, 188, (173, 143, 101));
c.ellipse(128, 140, 82, 55, (173, 143, 101)); c.rect(49, 140, 207, 188, (173, 143, 101))
for x in [72, 102, 132, 162]:
c.ellipse(x, 158, 10, 22, (88, 75, 62))
elif slug == "tower_bridge":
Expand Down
3 changes: 1 addition & 2 deletions scripts/generate_animals_multimodal_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import zlib
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
DATASET_DIR = ROOT / "datasets" / "synthetic-animals-multimodal-v1"
CORPUS_DIR = DATASET_DIR / "corpus"
Expand Down Expand Up @@ -69,7 +68,7 @@ def draw_icon(kind: str) -> list[tuple[int, int, int]]:
p = blend(p, (30, 35, 30), 0.96)
if ellipse(x, y, 128, 145, 9, 6):
p = blend(p, (93, 48, 45), 0.9)
if abs(y - 150) < 2 and 88 < x < 115 or abs(y - 150) < 2 and 141 < x < 168:
if (abs(y - 150) < 2 and 88 < x < 115) or (abs(y - 150) < 2 and 141 < x < 168):
p = blend(p, (80, 58, 47), 0.55)

elif kind == "dog":
Expand Down
2 changes: 0 additions & 2 deletions scripts/generate_bilingual_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,12 @@
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any

from dikw_data.audit import AuditRecord, AuditStore
from dikw_data.llm_client import RetryingMiniMaxClient, TaskResult
from dikw_data.pipeline import add_provider_args, load_config_from_args
from dikw_data.tasks import LLMTask, hash_text


PROMPT_VERSION = "v1"
MISSING_PROMPT_VERSION = "v5-clean"

Expand Down
1 change: 0 additions & 1 deletion scripts/generate_diverse_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import sys
from pathlib import Path


TOPICS = [
{
"stem": "chinese-history-tang-founding",
Expand Down
1 change: 0 additions & 1 deletion scripts/generate_fruits_multimodal_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import zlib
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
DATASET_DIR = ROOT / "datasets" / "synthetic-fruits-multimodal-v1"
CORPUS_DIR = DATASET_DIR / "corpus"
Expand Down
1 change: 0 additions & 1 deletion scripts/generate_multimodal_asset_chunk_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from dataclasses import dataclass
from pathlib import Path


ROOT = Path(__file__).resolve().parents[1]
DATASET = "synthetic-multimodal-datasets-v1"
DATASET_DIR = ROOT / "datasets" / DATASET
Expand Down
14 changes: 0 additions & 14 deletions scripts/generate_queries_local.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path


NEGATIVE_QUERIES = [
"What's the weather in Shanghai tomorrow?",
"How do I tune a PostgreSQL vacuum schedule for a write-heavy table?",
Expand Down Expand Up @@ -53,8 +51,6 @@ def build_positive_queries(docs: list[Path], limit: int) -> list[dict[str, objec

queries: list[dict[str, object]] = []
for path in selected:
text = path.read_text(encoding="utf-8")
title = extract_title(text, path.stem)
marker = "dikwdoc" + path.stem.replace("-", "")
if path.name.startswith("zh-"):
queries.append(
Expand Down Expand Up @@ -90,16 +86,6 @@ def spread(items: list[Path], count: int) -> list[Path]:
return picked


def extract_title(text: str, fallback: str) -> str:
match = re.search(r"^title:\s*(.+)$", text, flags=re.MULTILINE)
if match:
return match.group(1).strip().strip('"')
match = re.search(r"^#\s+(.+)$", text, flags=re.MULTILINE)
if match:
return match.group(1).strip()
return fallback


def write_dataset_yaml(dataset_dir: Path) -> None:
name = dataset_dir.name
content = f"""name: {name}
Expand Down
Loading
Loading