diff --git a/app/src/explore/fasta-prep-client.ts b/app/src/explore/fasta-prep-client.ts index 86bc593b..f9387908 100644 --- a/app/src/explore/fasta-prep-client.ts +++ b/app/src/explore/fasta-prep-client.ts @@ -83,7 +83,13 @@ function describeBundleFailure(status: number): string { } /** @public */ -export type FastaPrepStage = 'queued' | 'embedding' | 'projecting' | 'annotating' | 'bundling'; +export type FastaPrepStage = + | 'queued' + | 'embedding' + | 'projecting' + | 'annotating' + | 'bundling' + | 'computing_statistics'; /** @public */ export interface FastaPrepOptions { diff --git a/app/src/explore/runtime.ts b/app/src/explore/runtime.ts index 68b07fb6..5171ec0b 100644 --- a/app/src/explore/runtime.ts +++ b/app/src/explore/runtime.ts @@ -191,8 +191,18 @@ export async function initializeExploreRuntime(): Promise { lastProgress = 70; overlayController.update(true, lastProgress, 'Preparing FASTA…', 'Projecting…'); } else if (stage === 'bundling') { + stopCreep(); lastProgress = 90; overlayController.update(true, lastProgress, 'Preparing FASTA…', 'Bundling…'); + } else if (stage === 'computing_statistics') { + stopCreep(); + lastProgress = 95; + overlayController.update( + true, + lastProgress, + 'Preparing FASTA…', + 'Computing statistics…', + ); } }, }); diff --git a/openspec/changes/add-projection-statistics/.openspec.yaml b/openspec/changes/add-projection-statistics/.openspec.yaml new file mode 100644 index 00000000..a4ac4d78 --- /dev/null +++ b/openspec/changes/add-projection-statistics/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-23 diff --git a/openspec/changes/add-projection-statistics/design.md b/openspec/changes/add-projection-statistics/design.md new file mode 100644 index 00000000..e85da105 --- /dev/null +++ b/openspec/changes/add-projection-statistics/design.md @@ -0,0 +1,266 @@ +## Context + +ProtSpace prepares data with the `protspace` Python package (`embed → project → annotate → bundle`) +and renders the resulting `.parquetbundle` in the `protspace_web` frontend. The bundle is Apache +Parquet tables concatenated with a `---PARQUET_DELIMITER---` separator: + +1. `selected_annotations` (in-memory Arrow key `protein_annotations`) — identifier + annotations +2. `projections_metadata` — `projection_name` (e.g. `UMAP_2`), `dimensions`, `info_json` (reducer + params incl. the distance `metric`) +3. `projections_data` — long table of `(projection_name, identifier, x, y, z)` +4. `settings` (optional) — annotation styles / display config + +The bundle **does not retain the high-dimensional embeddings** — only reduced coordinates. Bundle +I/O lives in `data/io/bundle.py`: `write_bundle(tables, path, settings=None)`, +`read_bundle → (parts[:3], settings)` (a **2-tuple**, unpacked as `_, settings = read_bundle(...)` +at two sites in `utils/add_annotation_style.py`), `extract_bundle_to_dir`, and +`replace_settings_in_bundle` (used by `protspace style`, rebuilds `parts[:3] + settings`). All hard- +fail outside 3–4 parts. The frontend reader +(`packages/core/src/components/data-loader/utils/bundle.ts`) throws unless 2 or 3 delimiters and +keys off **delimiter count** (not part contents); `cli/app.py:_register_commands()` registers +commands from a **hardcoded list** (no auto-discovery). + +Issue #219 (from #216) asks for statistics to interpret a projection. Owner's metric list: elbow +(optimal cluster count), silhouette, Davies–Bouldin, Calinski–Harabasz. A prototype, +`ProtSpaceExtractor` (`~/Downloads/ProtSpaceExtractor_v1.7.4_mod 1.py`), performs proximity/ +neighborhood mining; its distance-to-chord knee algorithm is reused (re-implemented, not imported). + +### Decisions taken during brainstorming + two adversarial review rounds + +- **Deliverable:** the MVP metric set now, expandable architecture. +- **Compute home:** the `protspace` core package (shared by CLI + web bundle). +- **Timing:** baked at prep time; interactivity deferred. +- **Scope (trimmed):** **per-projection only** — `cluster_validity` (unsupervised KMeans+elbow) + + `faithfulness` (kNN-overlap, trustworthiness/continuity vs the embedding). Embedding-space + cluster-validity and annotation-feature label sources are deferred (non-breaking expansions). +- **Metric set:** owner's four **plus** projection-faithfulness (#216's "competitive" framing). +- **Carriage:** a dedicated fifth parquet part. +- **Sequencing:** infra-only — baked, **not rendered** (committed follow-up). +- **Spec home:** this OpenSpec change; engine internals built in the `protspace` PR. + +## Goals / Non-Goals + +**Goals** + +- Per projection: cluster-validity (elbow K + silhouette/DB/CH on the coords) and faithfulness + (kNN-overlap, trustworthiness/continuity of the projection vs its source embedding), baked in. +- A registry where a new **scalar** statistic, label source, or space is a small unit plus a + registry entry — no driver rewrite, no bundle schema migration. +- Backward-compatible carriage across **all** readers/writers, including `protspace style`. +- Statistics are **secondary**: computing them never fails a prep job and never costs the bundle. +- Trustworthy, reproducible numbers (seeded; bounded cost; degenerate cases guarded; provenance + recorded). + +**Non-Goals** + +- Embedding-space cluster-validity; annotation-feature label sources; interactive recompute; the + full `ProtSpaceExtractor` workflow; frontend rendering; other metric sets. Seams left for the + scalar expansions; the rest are explicit future work. + +## Decisions + +### D1. New `protspace.stats` package, generalized `Statistic` contract, lazy registry + +A scalar `(X, labels) -> float` metric is too narrow — the elbow emits a vector + chosen `K`, +faithfulness needs **two** point sets and **no** labels, and statistics take parameters. So: + +```python +# stats/base.py +@dataclass +class StatContext: + space_kind: str # "projection" (MVP); "embedding" later + space_name: str # == projections_metadata.projection_name (joinable) + coords: np.ndarray # (n, d) projection coordinates + embedding: np.ndarray | None # (n, D) source embedding, id-joined to coords + embedding_name: str | None # which embedding set produced this projection + high_dim_metric: str # reducer's own metric (from info_json); cosine fallback + ids: list[str] + rng_seed: int + params: dict # k, kmax, sample_size, sample_threshold, ... + +class Statistic(Protocol): + family: str # "cluster_validity" | "faithfulness" + requires_embedding: bool + def compute(self, ctx: StatContext) -> list[StatRow]: ... +``` + +`stats/__init__.py` holds a lazy `STATISTICS` registry (mirroring `REDUCERS`) and +`compute_statistics(...)`. **All sklearn imports inside `stats/` are function-local** to preserve the +package's ~50 ms CLI startup (the registry and `cli/stats.py` registration must not import compute +modules at module load). + +**Honest scope of the registry:** it is the seam for **scalar** statistics, label sources, and +spaces (new rows, no schema change). The `ProtSpaceExtractor`'s pair/edge/set/graph outputs do +**not** fit this row model and are **not** claimed to; they get their own future typed bundle parts. + +`cluster_validity` is split internally into a `Clusterer` (KMeans+elbow → labels + diagnostics) and +the sklearn validity scorers that consume those labels, so a future `annotation_feature` grouping or +`hdbscan` clusterer slots in without touching the scorers. + +### D2. Output is one tidy long-format table with split, joinable keys + +`statistics.parquet` — **eight** columns, one statistic value per row: + +| column | type | example | +| ------------- | ------ | ----------------------------------------------------------------------------------------------------------------- | +| `space_kind` | string | `projection` | +| `space_name` | string | `UMAP_2` (== `projections_metadata.projection_name`) | +| `stat_family` | string | `cluster_validity`, `faithfulness` | +| `label_kind` | string | `kmeans_elbow`, `none` | +| `metric` | string | `silhouette`, `davies_bouldin`, `calinski_harabasz`, `n_clusters`, `knn_overlap`, `trustworthiness`, `continuity` | +| `metric_kind` | string | `validity`, `meta`, `faithfulness` | +| `value` | double | `0.42` | +| `extra_json` | string | `{"k":15,"metric":"cosine","seed":42,"sampled":false,"knee_confidence":"high","embedding":"prot_t5"}` | + +`metric_kind` is a **column** (not buried in JSON) so consumers can aggregate validity scores without +folding in `n_clusters` (`metric_kind="meta"`). New scalar statistics add **rows**; a future +annotation label source carries its column name **inside `extra_json`** (`{"label_name":"family"}`), +not as a new column — the eight columns stay frozen. `space_name` equals +`projections_metadata.projection_name` so the table is joinable without string-parsing. Cluster rows +carry `label_kind="kmeans_elbow"`; faithfulness rows carry `label_kind="none"`. + +### D3. Scope = projections only; both statistic families per projection + +For each **projection** space: + +- **cluster_validity** — `coords` only: KMeans sweep → elbow `K` → silhouette/DB/CH at `K`. +- **faithfulness** — `coords` + its **source** `embedding`: neighbourhood preservation. + +`ReductionPipeline.run` holds `embedding_sets` (high-dim) and `all_reductions` (coords, each knowing +its `emb_set.name`) together before `save_output`; the driver runs there. **Annotations are not +needed**, removing the `identifier`/`protein_id`/`sp|…` join hazards. Deferred expansions stay +non-breaking: `space_kind:"embedding"` and `label_kind:"annotation"` are new **rows**. + +### D4. Elbow — distance-to-chord knee, argmax **index** → K, confidence-guarded + +Sweep `K ∈ [2, K_max]` (`K_max = min(round(sqrt(n)), 50)`), record inertia, compute the perpendicular +deviation of each point from the first→last chord of the (normalised) inertia curve, take +`k_idx = argmax(deviation)`, select `K = k_range[k_idx]`. **Implement the deviation directly** — the +algorithm originates in `ProtSpaceExtractor._knee_distance_to_chord`, but that function returns the +curve _y-value_ (a distance cutoff), so we reuse only the argmax-index logic, not its return. The +prototype's `_knee_median_jump` ensemble half is intentionally **not** used (tuned for sorted- +distance distributions, not inertia). KMeans runs on **2-D/3-D coords** where the knee is reliable. +Guards: if normalised max deviation is small (≈ linear), record `knee_confidence:"low"`; also report +the silhouette-optimal K over the sweep so consumers can cross-check. KMeans uses +`random_state=rng_seed`, `n_init ≥ 10`. Emit `n_clusters` with `metric_kind="meta"`. + +### D5. Faithfulness — neighbourhood preservation, embedding vs projection (corrected) + +- **kNN-overlap@k**: mean neighbour-set overlap (embedding vs projection) per point; default + `k = 15` (recorded). Built with a neighbour query, `O(n·k)` after the index. +- **trustworthiness** and **continuity**: `sklearn.manifold.trustworthiness` computes + trustworthiness; **continuity is the same function with arguments swapped** (there is no + `continuity` in sklearn). `trustworthiness(A, B, metric=m)` applies `m` to `A`. So the high-dim + metric must attach to whichever call has the **embedding** as the first arg: + - trustworthiness = `trustworthiness(embedding, coords, metric=high_dim_metric)` + - continuity = `trustworthiness(coords, embedding, metric=projection_metric)` (Euclidean) + The metric is recorded **per row** (trustworthiness and continuity may differ). + +**High-dim metric default = the reducer's own metric**, read from +`projections_metadata.info_json` / `ReducerParams.metric` (Euclidean by default; PCA/MDS are +Euclidean), falling back to cosine only when unknown — so faithfulness scores the same neighbourhood +graph the reducer optimised, and cross-method comparison stays valid. Recorded in `extra_json`. + +**Cost guard (critical).** `sklearn.manifold.trustworthiness` densifies a full `n×n` distance matrix +over the high-dim vectors — there is **no** ANN path. Above a threshold (default 5000, shared with +silhouette), trustworthiness/continuity/kNN-overlap run on a **fixed-seed subsample** derived from +`(rng_seed, sorted ids)` so **all projections of the same embedding use the identical subset** +(apples-to-apples); record `sampled`/`sample_size`. Beyond a hard ceiling, skip with a recorded +`skipped:"n_too_large"` row. This applies on the uncapped CLI `prepare --stats` path, not just the +1500-capped web path. + +**Projection→embedding mapping.** Multi-embedding runs put projections from several embeddings into +one `projections_data.parquet`, named per embedding (`ProtT5 — UMAP 2`). Each projection's +faithfulness must use **its own** source embedding. In-process this comes from the reduction's +`emb_set.name`; the discrete path records the source embedding in provenance and the standalone +`stats` command accepts multiple `-i` and matches by embedding name, **skipping** projections whose +embedding isn't supplied (recorded). Embedding rows are aligned to coords by an **id intersection +join** (not a positional zip — `load_h5` drops NaN rows and `cli/project.py` writes all coords +against `embedding_sets[0].headers`), asserting/ recording any id-set mismatch. + +### D6. Bundle carriage — dedicated fifth part, ALL readers/writers, no arity breaks + +Layout `core(3) + settings? + statistics?`. "Empty settings slot" = **zero bytes** (matches the +existing `if len(parts)==4 and parts[3]` truthiness check). Readers/writers branch on the **fourth +part's emptiness**, not raw count. + +- `data/io/bundle.py`: `write_bundle(tables, path, settings=None, statistics=None)`. **Keep + `read_bundle`'s 2-tuple signature** (two `add_annotation_style.py` callers unpack it) and add a + separate `read_statistics_from_bundle(path) -> bytes | None`; `extract_bundle_to_dir` writes + `STATISTICS_FILENAME` when present. Accept 3–5 parts. +- **`replace_settings_in_bundle`** (the `protspace style` path) — rewrite with explicit part-count + cases so it never drops stats: `len==3 → core+settings`; `len==4 → settings at [3], no stats, +replace [3]`; `len==5 → stats at [4], emit core + new_settings + parts[4]`, using a zero-byte + settings slot when inserting settings ahead of stats. Verify `protspace style` round-trips both a + settings+stats and a stats-only (empty-settings) 5-part bundle. +- `base_processor.save_output` / `create_output`: thread an optional statistics table. +- `cli/bundle.py`: optional `-s/--statistics`. +- Frontend `bundle.ts`: accept **4 delimiters** (5 parts); **branch on part-4 byte length** — empty + ⇒ settings absent, part 5 = stats; non-empty ⇒ part 4 = settings, part 5 = stats. Guard + `extractSettings` to return `null` on a zero-byte buffer **before** `assertValidParquetMagic`. + Skip part 5. **Invert** `bundle.test.ts`'s "should reject bundle with 4 delimiters (5 parts)" + test; widen the `"Expected 2 or 3 delimiters"` guard; update the layout comment in + `packages/utils/src/parquet/bundle-writer.ts`. The frontend _writer_ (`createParquetBundle`) is + unchanged and drops stats on re-export — a documented MVP limitation. + +**Why a dedicated part, not `settings` JSON:** statistics are results, not display state; `settings` +is rewritten by `style`/publish and would stale or drop them. + +### D7. Wiring + +- **`prepare` path (engine, CLI users):** `ReductionPipeline.run` calls `compute_statistics( +embedding_sets, all_reductions, config)` when enabled; the table is threaded through + `create_output`/`save_output` into `write_bundle` in one pass. `cli/prepare.py` adds + `--stats/--no-stats` (default on); the faithfulness cost guard (D5) bounds the work. +- **Discrete path (engine):** `protspace stats -i emb.h5 [-i emb2.h5 ...] -p project_dir +-o statistics.parquet` — loads the H5 embeddings + a **new projections-parquet loader** that turns + `projections_data.parquet` into `{projection_name: (coords ndarray, ids)}`, id-joins each + projection to its source embedding (by name + id intersection), runs the driver. **No `-a`.** + Register `stats` in `cli/app.py:_register_commands()` (hardcoded list — explicit edit). +- **Prep service (web):** to make stats truly non-fatal against the single shared + `pipeline_timeout_seconds`, **produce the core bundle first** (within the parent budget), then run + `protspace stats` as a best-effort step under its **own** `asyncio.timeout(PREP_STATS_TIMEOUT_ +SECONDS)`, catching every failure/timeout **locally** (never reaching the parent handler) and, on + success, re-running `bundle` with `-s` to fold in the fifth part. A stats failure/timeout leaves + the already-shipped stats-less bundle. Gated by `PREP_STATS` (default on) **and** a one-time + `protspace stats --help` feature probe (stale install ⇒ skip). Emits an SSE `computing_statistics` + stage and adds it to the frontend `FastaPrepStage` union. The `protspace` floor is raised to the + release that introduces `protspace stats` (filled in once the engine PR cuts the version). + +### D8. Robustness, determinism, guards + +- **Failure isolation:** the driver catches per-statistic exceptions and omits that row; total + failure returns an empty report and never raises. +- **Determinism:** one `rng_seed` (default 42) threads into KMeans (`random_state`, `n_init ≥ 10`) + and all subsampling; recorded per row. Faithfulness subsamples a shared id subset across a + projection group (D5). Projection-space stats are only as reproducible as the (already-seeded) + reducer. +- **Cost:** silhouette and trustworthiness/continuity are `O(n²)`; both honour the sampling + threshold/ceiling (D5). The 1500 web cap keeps the web path cheap; the CLI path relies on the + guard. +- **Degenerate guards:** silhouette needs `2 ≤ K ≤ n−1`; DB/CH require ≥ 2 members per cluster (else + omit); `n < 3` → no rows. + +## Risks / Trade-offs + +- **Faithfulness cost.** The densified `n×n` is the dominant risk; mitigated by the sampling + threshold + hard ceiling + skip row, the web cap, and the prep step's own timeout. `--stats` + default-on is safe only because of the guard. +- **Cross-repo coordination.** Engine releases before web consumes it; the frontend reader is + backward compatible and independent, and the prep step is version-floored + feature-probed. +- **Bundle positional fragility.** The zero-byte-settings convention is encoded in `write_bundle`, + enforced across all writers (incl. `replace_settings_in_bundle` by part-count case), with both + readers branching on part-4 emptiness; covered by round-trip tests in both repos. +- **No user-visible value yet.** Infra-only; rendering is a committed follow-up; re-export drops the + part until then (documented). +- **Faithfulness metric choices.** `k` and the high-dim metric drive the numbers; both are recorded, + and the metric defaults to the reducer's own for validity of cross-method comparison. + +## Open Questions + +- Default `k` (15), `K_max` (`min(round(sqrt(n)), 50)`), sampling threshold (5000) and hard ceiling — + confirm against representative datasets. +- Engine release version for the `protspace` floor — filled once the engine PR's semantic-release + version is cut. +- Whether to expose parsed statistics on the frontend `BundleExtractionResult` now (forward-compat) + or skip part 5 entirely — default skip-but-tolerate. diff --git a/openspec/changes/add-projection-statistics/proposal.md b/openspec/changes/add-projection-statistics/proposal.md new file mode 100644 index 00000000..92ec1560 --- /dev/null +++ b/openspec/changes/add-projection-statistics/proposal.md @@ -0,0 +1,103 @@ +## Why + +ProtSpace renders dimensionality-reduction projections but gives users **no quantitative way to +judge them**: _Is this projection meaningful? How many clusters are there? Can I trust the geometry, +or did the reduction distort it?_ Issue #216 catalogued the statistics ProtSpace needs to be +competitive; this change (issue #219) implements the MVP. + +Today the preparation pipeline (`embed → project → annotate → bundle`) ships coordinates and +annotations with **zero quality metrics**, so interpretation is purely visual. This change adds a +**projection-statistics** capability computed at prep time and baked into the `.parquetbundle`, +covering two complementary questions per projection: + +- **Cluster structure** — KMeans with an **elbow** estimate of the optimal cluster count, scored by + **silhouette**, **Davies–Bouldin**, and **Calinski–Harabasz** (issue #219's metric set). +- **Projection faithfulness** — **kNN-overlap** and **trustworthiness / continuity** between the + original embedding and the projection, i.e. how much the reduction preserved or distorted the + neighbourhood structure. These are the metrics that most directly tell a user whether to trust the + map (#216's "competitive" framing). + +It is built as an **expandable subsystem** so further statistics can be added without rework. The +honest boundary of that claim (see `design.md`): the registry + tidy long-format table make new +**scalar** statistics, label sources, and spaces cheap to add (new rows, no schema change); the +richer pairwise/neighborhood analyses prototyped in the standalone `ProtSpaceExtractor` script +(query↔reference proximity, cross-method consensus, Top-N mining) are pair/edge/set-shaped and will +ride **their own future typed bundle parts**, reusing the same registry pattern but not this table. + +## What Changes + +Two PRs: the engine (`protspace`) first, then this repo (`protspace_web`) consumes it. + +- **Engine — `protspace` package (separate PR):** a new `protspace.stats` module with a generalized + `Statistic` contract (each statistic declares the inputs it needs — projection coords, embeddings, + and/or labels — and returns one or more result rows) and a light registry. MVP statistics, all + **per projection**: + - `cluster_validity`: KMeans sweep + elbow (knee on the inertia curve) selecting `K`; then + silhouette, Davies–Bouldin, Calinski–Harabasz on the projection coordinates at that `K`. + - `faithfulness`: kNN-overlap and trustworthiness / continuity between the embedding and the + projection. + Wired into `ReductionPipeline.run` — the one stage holding embeddings **and** projections — + behind a `--stats/--no-stats` flag, and exposed as `protspace stats -i emb.h5 -p project_dir +-o statistics.parquet` (no annotations needed for the MVP). +- **Bundle format:** an optional **fifth part** `statistics.parquet`. Layout + `core(3) + settings? + statistics?`; when statistics is present without settings, a **zero-byte + settings slot** keeps the fifth position unambiguous. **All** bundle readers/writers are updated: + `write_bundle`, `read_bundle`, `extract_bundle_to_dir`, **and `replace_settings_in_bundle`** (the + `protspace style` path, which today would silently drop a fifth part). +- **Prep service (`services/protspace-prep`):** the core bundle is **produced first**; a `stats` step + then runs `protspace stats` best-effort under its **own nested timeout**, caught locally so it can + never reach the parent handler, re-bundling with `-s` on success. On stats failure or timeout the + already-shipped stats-less bundle stands (the job never fails for a secondary artifact). The + `protspace` dependency floor is raised to the stats-bearing release and the step feature-probes the + subcommand before use. +- **Frontend reader (`@protspace/utils` + core data-loader):** accept a five-part bundle **without + error** (the existing test asserting five-part bundles are _rejected_ is inverted). The statistics + part is parsed-but-unused; rendering is a committed follow-up, out of scope here. + +**Scope (MVP):** per-projection `cluster_validity` (unsupervised/elbow) + `faithfulness`, baked at +prep time, carried in the bundle, not yet rendered. + +**Non-goals (explicit, non-breaking expansions):** embedding-space cluster-validity; annotation- +feature label sources; interactive / on-demand recompute; the broader `ProtSpaceExtractor` analyses +(future typed bundle parts); frontend rendering of the statistics. The registries and long-format +table leave seams for the scalar expansions; the others are acknowledged as new parts/work. + +## Capabilities + +### New Capabilities + +- `projection-statistics`: per-projection cluster-validity and faithfulness statistics computed at + preparation time and carried in the `.parquetbundle` as an optional statistics part. Covers the + bundle-boundary data contract (a stable tidy long-format table), production by the prep pipeline, + the backward-compatible fifth-part layout (including the `protspace style` round-trip), + reproducibility and robustness guards, and reader tolerance. + +## Impact + +- **Upstream (`protspace` repo, separate PR):** + - New `src/protspace/stats/` package (generalized `Statistic` contract + registry, cluster-validity + statistics, faithfulness statistics, driver). + - `data/io/bundle.py`: `write_bundle` / `read_bundle` / `extract_bundle_to_dir` extended to a fifth + statistics part (`STATISTICS_FILENAME`), **and `replace_settings_in_bundle` updated to preserve a + trailing statistics part** so `protspace style` is non-lossy. + - `utils/add_annotation_style.py` (the `style` command) verified/tested against five-part bundles. + - `data/processors/base_processor.py` (`create_output`/`save_output`) and `pipeline.py` + (`ReductionPipeline.run`) thread an optional statistics table; embeddings (already in memory) feed + faithfulness. + - `cli/prepare.py` gains `--stats/--no-stats`; new `cli/stats.py`; `cli/bundle.py` gains + `-s/--statistics`. + - Tests with **known-answer numeric fixtures** (blob separation; faithfulness on identity vs random + projections) and a label-permutation alignment test. + - No new dependency: scikit-learn (KMeans, silhouette/DB/CH, `manifold.trustworthiness`) is already a + core dep; the elbow knee is ported from `ProtSpaceExtractor` (distance-to-chord, **argmax index → + K**). +- **This repo (`protspace_web`):** + - `services/protspace-prep/src/protspace_prep/pipeline.py` (+ `config.py`): the `stats` step with its + own timeout, failure isolation, version probe, and an SSE `computing_statistics` stage; + `services/protspace-prep/tests/`. + - `packages/core/src/components/data-loader/utils/bundle.ts` (+ `packages/utils/src/parquet/*`): + accept the optional fifth part; **invert** `bundle.test.ts`'s five-part-rejection test; document + that frontend re-export (`createParquetBundle`) currently drops the part. +- **Data-format change:** additive, backward compatible. Existing three- and four-part bundles read + and write unchanged. +- **API / dependencies:** no HTTP API change; no new dependencies. diff --git a/openspec/changes/add-projection-statistics/specs/projection-statistics/spec.md b/openspec/changes/add-projection-statistics/specs/projection-statistics/spec.md new file mode 100644 index 00000000..b4873996 --- /dev/null +++ b/openspec/changes/add-projection-statistics/specs/projection-statistics/spec.md @@ -0,0 +1,178 @@ +## ADDED Requirements + +### Requirement: Per-projection statistics are computed at preparation time + +The preparation pipeline SHALL compute, for each projection, both **cluster-validity** and +**projection-faithfulness** statistics during data preparation and embed them in the produced +`.parquetbundle` when statistics are enabled (the default). Cluster-validity SHALL include the +silhouette score, the Davies–Bouldin index, the Calinski–Harabasz index, and the elbow-estimated +optimal cluster count, and its rows SHALL carry `label_kind` `kmeans_elbow`. Faithfulness SHALL +include kNN-overlap and trustworthiness/continuity of the projection relative to its source +embedding. + +#### Scenario: Statistics are produced by default + +- **WHEN** a bundle is prepared from embeddings with statistics enabled +- **THEN** the bundle contains a statistics part with, for every projection, cluster-validity rows + (silhouette, davies_bouldin, calinski_harabasz, n_clusters) and faithfulness rows (knn_overlap, + trustworthiness, continuity) + +#### Scenario: Statistics can be disabled + +- **WHEN** preparation is run with statistics disabled (`--no-stats` / prep flag off) +- **THEN** no statistics part is produced and the bundle remains a valid three- or four-part bundle + +### Requirement: The elbow estimates the optimal cluster count from the inertia knee + +The unsupervised cluster source SHALL sweep KMeans across a bounded range of cluster counts on the +projection coordinates and select the optimal count using the **index** of the maximum +perpendicular deviation of the inertia curve from its first-to-last chord (not a distance value). It +SHALL report the selected count as `n_clusters` with `metric_kind` `meta`, label points accordingly +so the validity metrics score that clustering, and record a knee-confidence indicator and the +silhouette-optimal count for cross-checking. + +#### Scenario: Elbow recovers a known cluster count + +- **WHEN** the data forms `k` well-separated clusters +- **THEN** the elbow-estimated `n_clusters` is within one of `k` and the silhouette of the selected + clustering is high relative to an overlapping-cluster baseline + +#### Scenario: No clear knee is flagged, not faked + +- **WHEN** the inertia curve is approximately linear (no distinct knee) +- **THEN** the selected count is still emitted but marked with low knee confidence in `extra_json` + +### Requirement: Faithfulness compares embedding and projection neighbourhoods + +Faithfulness statistics SHALL take a projection and **its source embedding** as input and measure how +well the projection preserves each point's neighbourhood. Continuity SHALL be computed as +trustworthiness with the embedding and projection arguments swapped, and the high-dimensional +distance metric SHALL be applied to whichever computation has the embedding as its primary input. +The high-dimensional metric SHALL default to the projection's own reducer metric (Euclidean by +default), falling back to cosine only when unknown, and SHALL be recorded per row. These statistics +carry `metric_kind` `faithfulness` and `label_kind` `none`, and SHALL bound their cost on large +inputs (sampling a shared subset above a threshold, skipping with a recorded marker beyond a hard +ceiling) since trustworthiness materialises a full pairwise distance matrix. + +#### Scenario: A faithful projection scores high + +- **WHEN** the projection preserves neighbourhoods (e.g. a near-identity reduction) +- **THEN** kNN-overlap and trustworthiness are near their maximum + +#### Scenario: A distorting projection scores lower + +- **WHEN** the projection scrambles neighbourhoods (e.g. a random projection) +- **THEN** kNN-overlap and trustworthiness are markedly lower, with the neighbourhood size and the + per-row distance metric recorded in `extra_json` + +#### Scenario: Each projection uses its own embedding + +- **WHEN** a run produces projections from more than one embedding +- **THEN** each projection's faithfulness is computed against the embedding that produced it (matched + by name and id-intersection join), and any projection whose embedding is unavailable is skipped + with a recorded marker rather than scored against the wrong embedding + +#### Scenario: Large inputs are bounded + +- **WHEN** the number of points exceeds the sampling threshold +- **THEN** faithfulness (and silhouette) are computed on a fixed-seed shared subsample with the + sample size recorded, and beyond a hard ceiling the statistic is skipped with a recorded marker + rather than exhausting memory + +### Requirement: Statistics are a stable tidy long-format table with joinable keys + +The statistics part SHALL be a tidy long-format table with a fixed eight-column schema: `space_kind` +(string), `space_name` (string), `stat_family` (string), `label_kind` (string), `metric` (string), +`metric_kind` (string), `value` (double), and `extra_json` (string), with one statistic value per +row. `metric_kind` (`validity` | `meta` | `faithfulness`) SHALL be a column so consumers can +aggregate validity scores without folding in meta rows such as `n_clusters`. `space_name` SHALL equal +the corresponding `projections_metadata.projection_name` so the table is joinable without string- +parsing. Adding a new scalar statistic, label source, or space SHALL add **rows** and SHALL NOT +change the column schema; any per-source attribute (e.g. an annotation column name) SHALL be carried +inside `extra_json`, not as a new column. + +#### Scenario: Each row is self-describing and joinable + +- **WHEN** a consumer reads the statistics part +- **THEN** every row identifies its space (kind + name), family, label kind, metric, and metric kind, + carries a numeric value, and its `space_name` matches a `projection_name` in the projections metadata + +#### Scenario: Meta rows are separable from validity rows + +- **WHEN** a consumer aggregates cluster-validity scores +- **THEN** it can exclude `n_clusters` by its `metric_kind` `meta` column without parsing `extra_json` + +#### Scenario: New statistics do not change the schema + +- **WHEN** a later expansion adds an embedding space or an annotation-feature label source +- **THEN** it appears as additional rows (e.g. `space_kind` `embedding`, or `label_kind` `annotation` + with the source column in `extra_json`) under the same eight-column schema + +### Requirement: Statistics ride in the bundle as an optional fifth part, backward compatibly + +The `.parquetbundle` SHALL carry statistics as an optional fifth parquet part in the positional order +`core(3) + settings? + statistics?`. When statistics are present but settings are absent, a zero-byte +settings part SHALL occupy the fourth slot, and all readers and writers SHALL distinguish settings- +present from settings-absent by the **fourth part's emptiness**, not by raw part count. All bundle +readers and writers — including `read_bundle` (whose existing return shape is preserved so its callers +do not break), `extract_bundle_to_dir`, and `replace_settings_in_bundle` (the styling path) — SHALL +handle the fifth part; three- and four-part bundles SHALL retain their exact current meaning. + +#### Scenario: Legacy bundles are unaffected + +- **WHEN** a three-part (core only) or four-part (core + settings) bundle is read or rewritten +- **THEN** it behaves exactly as before, with no statistics part + +#### Scenario: Five-part bundle round-trips + +- **WHEN** a five-part bundle (core + settings + statistics) is read +- **THEN** projection, annotation, and settings data load normally and the statistics part is + recovered + +#### Scenario: Styling preserves statistics for both shapes + +- **WHEN** an annotation-styling / settings-replacement operation rewrites a statistics-bearing + bundle, whether it has display settings or only a zero-byte settings slot +- **THEN** the statistics part is preserved (not dropped), with a valid settings part written and the + statistics part kept as the fifth + +### Requirement: The frontend reader tolerates the statistics part + +The frontend bundle reader SHALL load a five-part bundle without error in both the settings+statistics +and the zero-byte-settings (statistics-only) shapes, distinguishing them by the fourth part's +emptiness, and treating the statistics part as optional and parsed-but-unused. Its presence or absence +SHALL NOT affect rendering of projections, annotations, or settings. + +#### Scenario: App loads a statistics-bearing bundle + +- **WHEN** the app loads a five-part bundle produced by the prep pipeline (with or without settings) +- **THEN** the projection renders normally with no error and the statistics part is ignored for now + +### Requirement: Statistics computation is non-fatal, reproducible, and guarded + +Statistics are secondary to the core bundle. In the prep service the **core bundle SHALL be produced +first**, and computing statistics — under its own bounded timeout caught locally — SHALL NOT fail the +job, lose the bundle, or consume the budget the bundle needs; a stats failure or timeout SHALL leave +the already-produced stats-less bundle in place. Computation SHALL be deterministic under a recorded +seed (KMeans, silhouette sampling, and faithfulness subsampling), and SHALL guard expensive and +degenerate cases: silhouette and faithfulness SHALL use a bounded shared sample above a size +threshold and skip beyond a hard ceiling; clusters with fewer than two members SHALL be excluded from +Davies–Bouldin / Calinski–Harabasz; and inputs too small to score (fewer than three points, a single +cluster) SHALL yield no row rather than raising. Provenance (seed, neighbourhood size, distance +metric, sampling, knee confidence, source embedding) SHALL be recorded in `extra_json`. + +#### Scenario: A computation failure does not fail the job or lose the bundle + +- **WHEN** statistics computation fails or times out in the prep service +- **THEN** the job still succeeds and the core bundle (produced before the stats step) is delivered + without a statistics part + +#### Scenario: A stale engine degrades to skip + +- **WHEN** the installed `protspace` does not provide the `stats` subcommand +- **THEN** the prep service detects this via a feature probe and skips statistics rather than failing + +#### Scenario: Results are reproducible + +- **WHEN** the same input is prepared twice with the same seed and projection +- **THEN** the statistics values are identical, and each row records the seed used diff --git a/openspec/changes/add-projection-statistics/tasks.md b/openspec/changes/add-projection-statistics/tasks.md new file mode 100644 index 00000000..275f1f12 --- /dev/null +++ b/openspec/changes/add-projection-statistics/tasks.md @@ -0,0 +1,138 @@ +Two PRs: **A — upstream `protspace` engine** (sections 1–7), then **B — this repo `protspace_web`** +(sections 8–10). TDD; every statistic ships a **known-answer numeric fixture**, not a "rows exist" +check. All sklearn imports under `stats/` are **function-local** (preserve ~50 ms CLI startup). + +## 1. Engine — `protspace.stats` scaffolding & contract + +- [ ] 1.1 `src/protspace/stats/base.py`: `StatContext` (space_kind, space_name, coords, embedding, + embedding_name, high_dim_metric, ids, rng_seed, params); `Statistic` Protocol (`family`, + `requires_embedding`, `compute(ctx) -> list[StatRow]`); `StatRow` (space_kind, space_name, + stat_family, label_kind, metric, **metric_kind**, value, extra: dict); `StatsReport.to_arrow()` + → the **8-column** tidy table (`extra` → `extra_json`). +- [ ] 1.2 `stats/__init__.py`: lazy `STATISTICS` registry (mirroring `REDUCERS`) + + `compute_statistics(embedding_sets, reductions, config)` iterating registered statistics per + projection, isolating per-statistic failures (catch → omit row). No compute-module imports at + module load. +- [ ] 1.3 Tests: registry lookup; `to_arrow()` 8-column schema; empty-report round-trip; partial + report when one statistic raises. + +## 2. Engine — cluster_validity (elbow + silhouette/DB/CH) + +- [ ] 2.1 `stats/cluster/kmeans_elbow.py`: KMeans sweep `K ∈ [2, min(round(sqrt(n)), 50)]` + (`random_state=ctx.rng_seed`, `n_init>=10`); record inertia; select `K` via the **argmax index** + of perpendicular deviation from the inertia curve's first→last chord (implement directly; + algorithm from `ProtSpaceExtractor._knee_distance_to_chord` but reuse the index, **not** its + y-value return; do **not** port `_knee_median_jump`). Return labels at `K` + diagnostics + (inertia curve, `knee_confidence`, silhouette-optimal K) in `extra`. Emit `n_clusters` with + `metric_kind="meta"`. Cluster rows carry `label_kind="kmeans_elbow"`. +- [ ] 2.2 `stats/metrics/validity.py`: silhouette (fixed-seed sample above threshold; record + `sampled`/`sample_size`), davies_bouldin, calinski_harabasz on the clusterer's labels; + `metric_kind="validity"`. Guard `2 ≤ K ≤ n−1`; drop singleton clusters for DB/CH. +- [ ] 2.3 Register `cluster_validity` in `STATISTICS`. +- [ ] 2.4 Known-answer tests: `k`-blob → elbow `K ∈ {k−1,k,k+1}`; silhouette `>0.6` (separated) vs + `<0.2` (overlapping); n<3 / single cluster → no row; near-linear inertia → `knee_confidence:"low"`. + +## 3. Engine — faithfulness (kNN-overlap, trustworthiness/continuity) + +- [ ] 3.1 `stats/metrics/faithfulness.py`: kNN-overlap@k (default `k=15`); **trustworthiness = + `trustworthiness(embedding, coords, metric=ctx.high_dim_metric)`** and **continuity = + `trustworthiness(coords, embedding, metric=)`** (no `continuity` in + sklearn; args swapped, metric attaches to the embedding-first call). `metric_kind="faithfulness"`, + `label_kind="none"`, `requires_embedding=True`. Record the metric **per row** in `extra`. +- [ ] 3.2 **Cost guard:** above `sample_threshold` (default 5000) subsample a **shared** id subset + derived from `(rng_seed, sorted ids)` for all projections of an embedding (record + `sampled`/`sample_size`); beyond a hard ceiling emit a `skipped:"n_too_large"` row. (Trustworthiness + densifies an `n×n` matrix — no ANN path.) +- [ ] 3.3 Register in `STATISTICS`. +- [ ] 3.4 Known-answer tests: near-identity projection → faithfulness ≈ 1.0; random projection → + markedly lower; `k`/metric recorded; large-n path samples (and the very-large path skips). + +## 4. Engine — driver: mapping, alignment, determinism + +- [ ] 4.1 `stats/driver.py`: per projection, build a `StatContext` — select **its source embedding** + by `emb_set.name`; set `high_dim_metric` from the reduction's `info_json` (reducer metric; + cosine fallback); **id-intersection join** embedding↔coords (not positional zip; `load_h5` drops + NaN rows and `cli/project.py` writes coords against `embedding_sets[0].headers`); record id-set + mismatches; skip `requires_embedding` statistics when no matching embedding. +- [ ] 4.2 Tests: label/id-permutation yields identical scores; an H5 with an extra and a dropped id + vs `projections_data` is handled by intersection (not mis-paired); multi-embedding run maps each + projection to the correct embedding. + +## 5. Engine — bundle format (fifth part, ALL readers/writers, no arity break) + +- [ ] 5.1 `data/io/bundle.py`: `STATISTICS_FILENAME`; `write_bundle(tables, path, settings=None, +statistics=None)` emitting `core(3) + settings? + statistics?` with a **zero-byte** settings slot + when statistics is given without settings. +- [ ] 5.2 **Keep `read_bundle`'s 2-tuple** (two `add_annotation_style.py` callers unpack it); add + `read_statistics_from_bundle(path) -> bytes | None`; `extract_bundle_to_dir` writes + `STATISTICS_FILENAME` when present; accept 3–5 parts (branch on part-4 emptiness). Update the + "Expected 3 or 4 parts" message. +- [ ] 5.3 **Rewrite `replace_settings_in_bundle`** by part-count case (`len==3/4/5`) to preserve a + trailing statistics part (zero-byte settings slot when inserting settings ahead of stats); verify + `utils/add_annotation_style.py` / `protspace style`. +- [ ] 5.4 Round-trip tests: 3-, 4-, 5- (settings+stats), 5-empty-settings (stats-only) read back + correctly; **`protspace style` preserves stats on BOTH settings+stats and stats-only inputs**; + legacy bundles unchanged. + +## 6. Engine — pipeline & CLI wiring + +- [ ] 6.1 `data/processors/base_processor.py`: `create_output` accepts an optional statistics table; + `save_output` passes it to `write_bundle`. +- [ ] 6.2 `data/processors/pipeline.py`: `stats` flag on `PipelineConfig`; in `ReductionPipeline.run`, + call `compute_statistics(embedding_sets, all_reductions, ...)` before `create_output` when + enabled; thread the table through. +- [ ] 6.3 `cli/prepare.py`: add `--stats/--no-stats` (default on, same on/off convention as the + existing `--scores`); log it. +- [ ] 6.4 `cli/stats.py`: `protspace stats -i emb.h5 [-i ...] -p project_dir -o statistics.parquet` + (no `-a`). Build a **projections-parquet loader** (`projections_data.parquet` → + `{projection_name: (coords ndarray, ids)}`) and id-join each projection to its source embedding. + **Add `stats` to `cli/app.py:_register_commands()`** (hardcoded list). +- [ ] 6.5 `cli/bundle.py`: optional `-s/--statistics`; pass to `write_bundle`. (Hatchling src-layout + auto-includes `src/protspace/stats/`; no `[tool.hatch]` change needed.) + +## 7. Engine — verification (PR A) + +- [ ] 7.1 `uv run pytest tests/ -m "not slow"` green; `uv run ruff check src/ tests/` clean; confirm + CLI startup not regressed (lazy imports). +- [ ] 7.2 End-to-end on a **real, committed fixture** (`data/sizes/phosphatase.h5` does **not** exist; + use an existing test H5 or add a small generated fixture): `prepare ... --stats` → 5-part bundle + with cluster_validity + faithfulness rows per projection; `protspace stats` standalone matches; + `--no-stats` → 3-part bundle; `protspace style` on the 5-part bundle keeps stats. +- [ ] 7.3 CHANGELOG/version bump (semantic-release); record the released version for the web floor; + open the draft PR. + +## 8. This repo — prep service stats step (bundle-first, non-fatal, probed) + +- [ ] 8.1 `services/protspace-prep/.../config.py`: add `PREP_STATS` (default on) and + `PREP_STATS_TIMEOUT_SECONDS` (default 120); raise the `protspace` floor (pyproject + lock) to the + release from task 7.3. +- [ ] 8.2 `services/protspace-prep/.../pipeline.py`: **produce the core bundle first** (within the + parent `pipeline_timeout_seconds`), then run `protspace stats -i -p + -o statistics.parquet` under a **nested** `asyncio.timeout(PREP_STATS_TIMEOUT_ +SECONDS)`, catching failure/timeout **locally** so it never reaches the outer handler; on success + re-run `bundle` with `-s`. A one-time `protspace stats --help` probe → skip if absent. Emit an + SSE `computing_statistics` stage. +- [ ] 8.3 Tests (`tests/test_pipeline.py`): stats invoked with the resolved H5 + project dir and folded + via `-s`; **a stats failure/timeout still yields the core bundle and a successful job** (and does + not consume the parent budget such that bundle is lost); missing-subcommand probe skips; SSE event + emitted. + +## 9. This repo — frontend reader tolerance + +- [ ] 9.1 `packages/core/.../data-loader/utils/bundle.ts`: accept **4 delimiters** (5 parts); branch on + **part-4 byte length** (empty ⇒ settings absent, part 5 = stats; non-empty ⇒ part 4 = settings, + part 5 = stats); guard `extractSettings` to return null on a zero-byte buffer **before** + `assertValidParquetMagic`; skip part 5. Keep 2/3-delimiter behaviour identical. +- [ ] 9.2 **Invert** `bundle.test.ts`'s "should reject bundle with 4 delimiters (5 parts)" test; update + the layout comment in `packages/utils/src/parquet/bundle-writer.ts`; add a `FastaPrepStage` union + entry `'computing_statistics'` (and any exhaustive consumer); note `createParquetBundle` re-export + drops the stats part (documented limitation). +- [ ] 9.3 Tests: 5-part (settings+stats) AND 5-part (empty-settings, stats-only) load projections + + annotations + settings with no error and ignore statistics; 3/4-part bundles unchanged. + +## 10. This repo — verification (PR B) + +- [ ] 10.1 `pnpm precommit` green; prep-service `uv run pytest -q` green. +- [ ] 10.2 Load a real engine-produced 5-part bundle (both settings+stats and stats-only) in the app — + renders normally, no console errors, statistics ignored. +- [ ] 10.3 Open the draft PR; reference issue #219 and the upstream engine PR. diff --git a/openspec/changes/route-projection-statistics/.openspec.yaml b/openspec/changes/route-projection-statistics/.openspec.yaml new file mode 100644 index 00000000..fab62b41 --- /dev/null +++ b/openspec/changes/route-projection-statistics/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-06-24 diff --git a/openspec/changes/route-projection-statistics/design.md b/openspec/changes/route-projection-statistics/design.md new file mode 100644 index 00000000..b611a6a8 --- /dev/null +++ b/openspec/changes/route-projection-statistics/design.md @@ -0,0 +1,194 @@ +## Context + +`add-projection-statistics` shipped two statistic families, computed per projection at prep time and +baked into the `.parquetbundle` as a single fifth part (`statistics.parquet`, a tidy 8-column +long-format table). The frontend reader was made to _tolerate_ that part but not render it — +"frontend rendering" was an explicit committed follow-up. + +The `.parquetbundle` is concatenated Parquet tables separated by `---PARQUET_DELIMITER---`: + +1. `protein_annotations` — `identifier` + arbitrary annotation columns (categorical or numeric); + the frontend's **color-by** control enumerates these columns. +2. `projections_metadata` — `projection_name`, `dimensions`, `info_json` (reducer params incl. the + distance `metric`), `source`. +3. `projections_data` — long `(projection_name, identifier, x, y, z)`. +4. `settings` (optional) — annotation **styles** / display config (palettes, pinned values). +5. `statistics` (optional, from `add-projection-statistics`) — the 8-column stats table. + +In his PR #61 review, tsenoner asked us to route each statistic to the part whose **existing +frontend consumer** matches its **granularity**: + +| Statistic | Granularity | Destination part | Existing consumer it rides | +| ------------------------------------------------------------------- | -------------- | ------------------------------------ | ------------------------------------- | +| KMeans cluster label @ elbow `K` | per protein | `protein_annotations` (1) | color-by (categorical) | +| per-sample silhouette @ `K` | per protein | `protein_annotations` (1) | color-by (continuous) | +| kNN-overlap / trustworthiness / continuity | per projection | `projections_metadata.info_json` (2) | projection method/info surface | +| silhouette mean / Davies–Bouldin / Calinski–Harabasz / `n_clusters` | per projection | `statistics.parquet` (5) | (dedicated quality table — new, thin) | + +The payoff: the two per-protein columns and the per-method scalars render through machinery that +already exists, shrinking "frontend rendering" to (at most) a small aggregate-validity table. + +## Goals / Non-Goals + +**Goals** + +- Route each already-computed statistic to its consumer-aligned bundle part (the table above). +- Surface per-protein outputs through the existing color-by control (membership pre-styled), and + faithfulness through the existing projection-info surface, with minimal new frontend code. +- Preserve every guarantee from `add-projection-statistics`: statistics are **secondary** (the core + bundle ships even if they fail), deterministic under a seed, cost-bounded, and backward compatible. + +**Non-Goals** + +- Changing **which** statistics are computed, the elbow algorithm, or the cost guards — unchanged. +- A rich statistics dashboard. The aggregate-validity table is minimal and may be a thin follow-up. +- External/label-based validation (ARI/NMI), embedding-space validity, interactive recompute — still + future work, unchanged from the prior change's non-goals. + +## Decisions + +### D1 — A statistic output declares a destination + +Today a statistic returns `StatRow`s that all land in one table. Introduce a **destination** on each +output: `annotation` (per-protein, carries `identifier`+value), `projection_metadata` (per-method +scalar), or `statistics_part` (the long table). The carriage layer partitions outputs by destination +and fans them to the matching bundle part. This keeps the registry/driver shape and lets future +statistics pick a destination without a new mechanism. + +### D2 — Per-protein outputs (cluster membership + per-sample silhouette) → annotations + +`cluster_validity` already computes KMeans `labels` at the elbow `K` and discards them; surface them +as a per-protein **categorical** annotation, plus **per-sample silhouette** (`silhouette_samples`, +new but cheap, scikit-learn) as a per-protein **numeric** annotation. Both are **per projection** — +one column pair per reduction, since each reduction has its own elbow `K`. + +- **Naming (default):** `cluster_` and `silhouette_` (e.g. + `cluster_umap2`). The elbow `K` is recorded in annotation metadata / display label, not baked into + the machine column name (keeps names stable if `K` shifts on re-run). **Open for tsenoner** — he + wrote "Kx with x variable"; if he wants `K` in the visible name, it goes in the display label. +- **Auto-style (default yes):** generate a categorical palette for each membership column into the + `settings` part so clusters are colored on load; the numeric silhouette column uses the default + continuous ramp. This is the concrete "frontend for free" win. +- **Provenance:** membership/silhouette columns are marked computed (not retrieved biological + annotation) so the UI/consumers can distinguish them; the source projection + `K` + seed live in + the column metadata. + +### D3 — Faithfulness → `projections_metadata.info_json.quality` + +kNN-overlap, trustworthiness, continuity are single scalars per projection. Fold them into the +projection's existing `info_json` under a `quality` object (e.g. +`info_json.quality = {knn_overlap, trustworthiness, continuity, k, metric, sampled, ...}`). Additive +(no new metadata column); existing `info_json` consumers ignore unknown keys. This is also the +**natural compute site**: faithfulness needs the embedding _and_ the projection together, which the +`prepare`/`project` stage has in hand — so it is written into metadata there, and the standalone +`stats` command recomputes + merges into `projections_metadata.parquet` for existing projects. + +### D4 — Aggregate cluster-validity stays in `statistics.parquet` + +silhouette mean, Davies–Bouldin, Calinski–Harabasz, and the `n_clusters` meta row remain the +8-column tidy table — now the **only** thing in the fifth part. Its schema and the "new scalar = new +row" property are unchanged; it simply no longer carries per-protein or faithfulness data. + +### D5 — Robustness & decoupling preserved via core-first re-enrichment + +`add-projection-statistics` guarantees the prep job never fails for statistics and never loses the +bundle. Under routing, per-protein outputs live in `protein_annotations` (a **core** part), so +enrichment now rewrites parts 1/2/4/5, not just appends part 5. Preserve the guarantee by keeping the +**existing shape**: build the core (un-enriched) bundle first within the parent timeout; then a +best-effort step computes statistics and **re-bundles to a temp file with the routed parts merged +in**, atomically `os.replace`-ing it over the core bundle on success. On any stats failure/timeout +the un-enriched core bundle stands. No regression in the safety property; the diff is _which_ parts +the enrichment rewrites. + +### D6 — Backward compatibility + +- Annotations: extra computed columns; readers already accept arbitrary annotation columns. +- `info_json`: an extra `quality` key; readers parse JSON and ignore unknown keys. +- `statistics.parquet`: same 8-column schema, fewer row families (aggregate only). Readers that + tolerate the fifth part are unaffected; consumers that previously expected faithfulness/labels in + the part must now read them from metadata/annotations — but the only such consumer is our own + not-yet-shipped frontend, so there is no external break. Bundles produced before this change still + read; bundles without statistics are unchanged. + +## Risks / trade-offs + +- **Annotation-table growth.** One membership + one silhouette column **per projection** (6 DR + methods → 12 computed columns). Mitigate with the `cluster_*`/`silhouette_*` prefix + computed- + provenance flag so the UI can group/collapse them; consider a default of styling only the membership + columns. Reviewers should sanity-check color-by UX with ~12 extra columns. +- **Loss of single-format generality.** The prior design's "all statistics are rows in one tidy + table" is deliberately broken up. Accepted: consumer-fit beats storage uniformity here, and it is + the maintainer's call. +- **Tighter coupling of stats into the core bundle.** Mitigated by D5 (core-first re-enrichment); + must be covered by a test asserting a stats failure still yields the core bundle. +- **Frontend assumptions unverified.** This design assumes (a) color-by enumerates all annotation + columns including computed ones, (b) there is a projection info/method surface that can show + `info_json.quality`, and (c) styles in the `settings` part drive initial coloring. These are + **assumptions to verify against the actual frontend** (a review lens below targets exactly this); + if (b) does not exist, faithfulness display becomes a small new surface (still cheaper than a full + stats panel). + +## Migration / sequencing + +Engine PR first (routing + per-point outputs + metadata/annotation writers), then the web PR (prep +re-enrichment + frontend surfacing). Both layer on top of the still-open `add-projection-statistics` +PRs (#61 / #295); cleanest is to **fold this routing into those PRs before merge** rather than ship +the opaque fifth part and immediately refactor it — to be confirmed with tsenoner (amend #61/#295 vs +stacked follow-up PRs). + +## Open questions for tsenoner (defaulted, non-blocking) + +1. Column naming + whether `K` shows in the visible name; elbow-`K`-only vs a small `K` sweep. +2. Amend #61/#295 in place vs land them and stack this refactor on top. +3. Should the aggregate-validity fifth part get a minimal UI now, or stay carried-but-unrendered + until a later dashboard change. + +## Review outcomes (4-lens fan-out) — revised decisions + +A parallel review (frontend-feasibility, engine-feasibility, spec-quality, adversarial) grounded in +the real code corrected several first-pass assumptions. Revised decisions: + +- **D-route (NEW, supersedes part of D3/D5): fan out at bundle-assembly time, not in `protspace +stats`.** `cli/stats.py` reads only `projections_data`/`projections_metadata` and writes one + `statistics.parquet` ("No annotations are needed"); it has no annotations handle and the prep + service calls it directly. So `stats` stays a **pure aggregate-only producer**; all routing happens + where annotations + metadata + coords coexist — the `prepare`/pipeline path (`pipeline.run`'s + `metadata` frame is the join target) and the prep re-bundle. This removes the showstopper of + "rewrite the `stats` command into a project rewriter." + +- **D-phase (NEW): phase per-protein behind faithfulness+aggregate, and stack on #61/#295.** See + proposal "Sequencing". Faithfulness→metadata + aggregate-only fifth part (Phase 1) carry almost no + core-bundle risk; the per-protein annotations (Phase 2) are where the typing/UX/coupling cost lives. + +- **D2 corrected — typing is not automatic.** `base_processor._create_protein_annotations_table` + does `df.fillna("").astype(str)` on the whole frame, and the frontend infers type from **content** + (`conversion.ts:inferAnnotationType`). So integer cluster labels would infer as _numeric_ (wrong); + membership MUST be written as non-numeric strings (`cluster 0`). Per-point silhouette must survive + as clean numeric strings (no `;`/`|`, empty for missing) to infer continuous. `silhouette_samples` + is **not** "cheap" — unlike the aggregate mean it has no sampling path; it needs its own hard-ceiling + skip guard. + +- **D-style corrected — `settings` is a full envelope, not a palette; "colored on load" is false.** + The settings part stores `LegendPersistedSettings` per annotation (`maxVisibleValues`, `shapeSize`, + `sortMode`, `hiddenValues`, `enableDuplicateStackUI`, `selectedPaletteId`, `categories:{value: +{color,shape,zOrder}}`), and `sanitizeLegendSettingsEntry` **drops** any incomplete entry. The + engine must emit the whole envelope, categories keyed by the exact label strings. Initial color-by + is `annotations[0]` and file settings apply only to the _selected_ annotation, so the guarantee is + "colored **when selected**"; making a membership column the initial selection is separate frontend + work (initial-view / `publishState`). + +- **D3 corrected — faithfulness display is small new code, not free.** `projection-metadata.ts` + flattens `info_json` one level; a nested `quality` object renders as a raw `JSON.stringify` blob, + so the component must be extended to expand `quality` into per-metric rows. + +- **Carriage hazard — filename mismatch.** `ArrowReader._load_data` reads + `selected_annotations.parquet` but `ArrowReader.save_data` writes `protein_annotations.parquet`; + a router that reuses `save_data` would silently lose annotations. Build the augmented annotations + table directly and re-bundle via `write_bundle` (which already accepts `statistics=`); + `replace_settings_in_bundle` preserves core parts byte-for-byte and **cannot add columns**. + +- **Determinism — elbow `K` must be stable**, else membership labels re-bucket between runs (added to + the reproducibility scenario). + +- **`metric_kind` enum — `faithfulness` value is retired from the fifth part** (now only `validity` + / `meta` rows there); the column may still admit it but it is not produced. diff --git a/openspec/changes/route-projection-statistics/proposal.md b/openspec/changes/route-projection-statistics/proposal.md new file mode 100644 index 00000000..1e5e4e2d --- /dev/null +++ b/openspec/changes/route-projection-statistics/proposal.md @@ -0,0 +1,134 @@ +## Why + +The `add-projection-statistics` change (issue #219, engine PR tsenoner/protspace#61 + web #295) +computes the right numbers but delivers **all** of them into a single opaque fifth part +(`statistics.parquet`) that the frontend parses-but-ignores — so nothing is visible to a user, and +"render the statistics" was left as an open follow-up. + +In his review of PR #61, the maintainer (tsenoner) asked us to **place each statistic where the +frontend already knows how to consume that shape of data**, rather than in one catch-all part: + +> - Cluster Elbow Kx → save as an annotation +> - Cluster Silhouette Kx → save as annotation (x variable) +> - kNN-overlap + trustworthiness + continuity → metadata per dimensionality-reduction method +> - silhouette, Davies–Bouldin, Calinski–Harabasz → separate parquet file (as currently) + +The insight: the statistics differ in **granularity and consumer**. Per-protein quantities (which +cluster a point lands in; how well it fits that cluster) are _annotations_ — the color-by machinery +already renders them. Per-method scalars (how faithful a projection is) are _reducer metadata_ — +they belong with the projection's other info. Only the per-method **aggregate** validity scalars +need a dedicated part. Routing this way makes most of the statistics render through existing UI +surfaces with little or no new frontend code — turning the deferred "frontend rendering" follow-up +into mostly-free reuse. + +This change does **not** alter which statistics are computed (that shipped in +`add-projection-statistics`); it changes **where they are carried in the bundle and how they reach +the screen**. + +## What Changes + +The work spans the engine (`protspace`) and this repo (`protspace_web`), engine first. + +- **Engine — `cluster_validity` gains per-point outputs.** The KMeans labels at the elbow `K` are + already computed and currently discarded; surface them as a **per-protein cluster-membership** + output. Add **per-sample silhouette** (`sklearn.metrics.silhouette_samples`) at that `K` as a + second per-protein output. Both are produced **per projection** (each reduction has its own elbow + `K`). +- **Engine — routing at carriage time.** A statistic now declares a **destination** for each output: + `annotation` (per-protein, joined on identifier), `projection_metadata` (per-method scalar), or + `statistics_part` (the long-format table). The bundle writer fans each output to the matching part: + - cluster membership + per-point silhouette → **`protein_annotations`** columns (one pair per + projection), with an auto-generated categorical color **style** for the membership column so it + is colorable on load; + - faithfulness (kNN-overlap, trustworthiness, continuity) → **`projections_metadata`**, folded into + each projection's `info_json` under a `quality` key; + - aggregate cluster-validity (silhouette mean, Davies–Bouldin, Calinski–Harabasz, `n_clusters`) → + **`statistics.parquet`** (the existing fifth part, now carrying only these). +- **Fan-out happens at bundle-assembly time, not in `protspace stats`.** The standalone `stats` + command today reads only projections and writes a single `statistics.parquet` (its docstring: "No + annotations are needed") — it has **no annotations handle**, so it stays a pure aggregate-only + producer. Routing is done where annotations + metadata + coordinates are already in hand: the + `prepare`/pipeline path writes the routed outputs inline, and the prep service's re-bundle step + merges them. (Review finding — see `design.md` D-route.) +- **Prep service (`services/protspace-prep`):** the same **core-bundle-first** shape, but the + best-effort enrichment step now rewrites parts **1/2/4/5** (annotations, metadata, settings, + statistics), not just appends part 5 — so it re-bundles the full set to a temp file and atomically + swaps it in on success. This is a **larger correctness surface** than the prior append-only step + (it touches user-visible color-by data), so it is not "free": it needs an explicit test that a + successful enrichment equals the core bundle plus exactly the computed columns (no row drops / + reordering / retyping). A stats failure still leaves the already-shipped, un-enriched core bundle + in place. +- **Frontend (`@protspace/core` + `@protspace/utils`):** the routed statistics surface through + existing consumers — cluster-membership and per-point-silhouette appear in the **color-by** + control automatically (membership pre-styled); faithfulness is shown in the projection's + **method/info** surface. The aggregate `statistics.parquet` part may render as a small per- + projection quality table (the only genuinely new UI; may itself be a thin follow-up). + +## Capabilities + +### Modified Capabilities + +- `projection-statistics`: change the **carriage and delivery** of the already-computed statistics + from "one opaque fifth part, parsed-but-unused" to "routed to consumer-aligned bundle parts and + surfaced through existing frontend controls". The fifth-part contract narrows to aggregate + validity only; per-protein and per-method outputs move to the annotations and projection-metadata + parts; the prep robustness/decoupling guarantees are preserved under the new re-enrichment shape. + +## Impact + +- **Upstream (`protspace` repo, separate PR):** + - `stats/base.py`: each `StatRow`/output declares a **destination** (`annotation` | + `projection_metadata` | `statistics_part`); `StatsReport` can partition outputs by destination. + - `stats/metrics/validity.py`: emit per-protein **cluster membership** (the existing labels) and + **per-sample silhouette** (`silhouette_samples`) as `annotation`-destined outputs keyed by + identifier; keep the aggregate silhouette/DB/CH + `n_clusters` as `statistics_part` outputs. + - `stats/metrics/faithfulness.py`: mark its scalars `projection_metadata`-destined. + - `data/io/bundle.py` + `data/processors/base_processor.py`: a **router** that merges + annotation-destined outputs into `protein_annotations`, faithfulness into + `projections_metadata.info_json.quality`, and aggregate rows into `statistics.parquet`; generate + a categorical style for each cluster-membership column (`utils/add_annotation_style.py`). + - `cli/stats.py` / `cli/prepare.py`: write the routed outputs (annotations + metadata + statistics) + instead of a single statistics parquet. + - Tests: per-protein outputs join correctly on identifiers; faithfulness lands in `info_json`; + aggregate-only statistics part; styling round-trip. +- **This repo (`protspace_web`):** + - `services/protspace-prep/src/protspace_prep/pipeline.py`: re-enrichment step rewrites parts + 1/2/4/5 atomically, still core-bundle-first and non-fatal; SSE stage retained. + - `packages/core` data-loader + `packages/utils`: read faithfulness from `info_json.quality` and + surface it; ensure computed cluster/silhouette annotation columns flow into the color-by control; + optional aggregate-validity table. +- **Data-format change:** additive and backward compatible. Bundles without statistics are + unchanged; the annotations and `info_json` additions are extra columns/keys existing readers + tolerate; the statistics part keeps its eight-column schema (now aggregate rows only). +- **API / dependencies:** no HTTP API change; no new dependency (`silhouette_samples` is in the + already-present scikit-learn). + +## Sequencing & phasing (revised after fan-out review) + +The four-lens review converged on **stack, don't amend**: the open engine/web PRs (#61/#295) ship a +self-contained, green, backward-compatible opaque fifth part with **zero user-visible surface**. +Folding this correctness-heavy routing refactor into them holds the green infra hostage and makes any +routing bug block the whole feature. Because the fifth part is unconsumed, landing it and then +**removing** the per-protein/faithfulness rows in this follow-up is non-breaking. + +- **Phase 0 — land #61/#295 as-is** (opaque fifth part; reversible; no migration cost). +- **Phase 1 — low-risk routing:** faithfulness → `info_json.quality`; narrow the fifth part to + aggregate validity only. No core-part (annotations) rewrite, so the prep robustness story is barely + touched. Satisfies three of tsenoner's four bullets. +- **Phase 2 — per-protein annotations** (cluster membership + per-point silhouette + auto-styling + + the color-by/provenance UX): the part that pulls statistics into the **core** annotations table, + forces categorical/numeric typing through `base_processor`'s `.astype(str)` writer, generates full + legend-settings envelopes, and floods the color-by dropdown (one membership + one silhouette column + per projection; parameter sweeps / multi-embedding push this to 30–60 columns). Gate it behind a + flag and land it once the typing, provenance-grouping, and initial-selection questions are answered. + +## Open questions (defaulted now; confirm with tsenoner, do not block) + +1. **Annotation column naming / which K.** Default: one membership column `cluster_` and + one silhouette column `silhouette_` per projection, at the **elbow K only** (not a + sweep); the chosen `K` is recorded in annotation metadata and the display label. ("x variable" is + read as "the elbow K, which differs per projection".) +2. **Auto-styling.** Default: **yes** — generate a categorical palette for each membership column so + clusters are colored on load. +3. **Faithfulness placement.** Default: inside `projections_metadata.info_json` under a `quality` + sub-object (additive), not a new top-level metadata column. diff --git a/openspec/changes/route-projection-statistics/specs/projection-statistics/spec.md b/openspec/changes/route-projection-statistics/specs/projection-statistics/spec.md new file mode 100644 index 00000000..79b34578 --- /dev/null +++ b/openspec/changes/route-projection-statistics/specs/projection-statistics/spec.md @@ -0,0 +1,264 @@ +## ADDED Requirements + +### Requirement: Statistics are routed to consumer-aligned bundle parts + +Each computed statistic SHALL be carried in the bundle part that matches its **granularity and +consumer**, not in a single catch-all part. Specifically: **per-protein** outputs (cluster +membership, per-point silhouette) SHALL be carried as columns in `protein_annotations`; **per-method +faithfulness** scalars SHALL be carried in `projections_metadata` (within each projection's +`info_json`); and **per-method aggregate cluster-validity** scalars SHALL be carried in the +`statistics.parquet` part. The routing SHALL be additive and backward compatible: bundles without +statistics, and pre-routing bundles, SHALL still read; the annotation columns and `info_json` +additions SHALL be ignorable by existing readers. + +#### Scenario: A statistics-bearing bundle routes outputs to three parts + +- **WHEN** a bundle is prepared from embeddings with statistics enabled +- **THEN** for each projection the cluster membership and per-point silhouette appear as + `protein_annotations` columns, the faithfulness scalars appear in that projection's + `projections_metadata.info_json`, and the aggregate cluster-validity scalars appear in + `statistics.parquet` + +#### Scenario: A statistic declares its destination + +- **WHEN** a statistic produces an output +- **THEN** the output carries a destination of `annotation`, `projection_metadata`, or + `statistics_part`, and the carriage layer fans it to the matching bundle part without the statistic + knowing the bundle layout + +#### Scenario: Existing readers tolerate the additions + +- **WHEN** a reader that predates this change loads a routed bundle +- **THEN** the extra annotation columns and the `info_json.quality` key are ignored without error, + and projections/annotations/settings load normally + +#### Scenario: Statistics disabled produces an un-routed bundle + +- **WHEN** preparation is run with statistics disabled (`--no-stats` / prep flag off) +- **THEN** no `cluster_*`/`silhouette_*` annotation columns are added, no `info_json.quality` key is + written, no auto-style is generated, and no `statistics.parquet` part is emitted — the bundle is + the same shape as a pre-routing core bundle + +### Requirement: Cluster membership is delivered as a per-projection annotation + +For each projection, the KMeans labelling at the elbow-estimated `K` SHALL be delivered as a +**categorical per-protein annotation column**, joined to proteins by `identifier`. Membership values +SHALL be written as **non-numeric category labels** (e.g. `cluster 0`, not `0`) so the frontend's +content-based type inference (`conversion.ts:inferAnnotationType`) classifies the column as +categorical rather than as a continuous numeric ramp; the all-string annotation serialization in +`base_processor._create_protein_annotations_table` (which `.astype(str)`s every column) MUST NOT +cause membership to be mis-typed. There SHALL be one membership column per projection (each projection +has its own elbow `K`). The column SHALL be marked as a **computed** annotation (distinct from +retrieved biological annotations), and its source projection, elbow `K`, and seed SHALL be recorded +in the column's metadata. A protein absent from a projection SHALL have no membership value for that +projection rather than a wrong one. + +#### Scenario: Membership appears in the color-by control + +- **WHEN** the app loads a routed bundle +- **THEN** each projection's cluster-membership column is selectable in the color-by control and + colors points by cluster + +#### Scenario: Membership is per projection + +- **WHEN** a run produces more than one projection +- **THEN** each projection contributes its own membership column at its own elbow `K`, and selecting + one does not mislabel points of another + +### Requirement: Per-point silhouette is delivered as a per-projection annotation + +For each projection, the per-sample silhouette values at the elbow `K` (`silhouette_samples`, +computed over the **full** labelled point set — not the cost-bounded subsample the aggregate mean +silhouette uses) SHALL be delivered as a **numeric per-protein annotation column**, joined by +`identifier`, one column per projection, marked computed with source projection / `K` / seed in its +metadata. The column MUST round-trip as clean numeric strings (no `;`/`|` separators, empty for +missing) so the frontend infers it as a continuous field. Because `silhouette_samples` has no +sampling escape hatch and is O(n²)-class, its own cost guard (skip beyond the hard ceiling, recording +a marker) SHALL apply. Points excluded from silhouette scoring (degenerate `K`, beyond the ceiling, +or absent from the projection) SHALL carry no value rather than a fabricated one. + +#### Scenario: Per-point silhouette is colorable as a continuous field + +- **WHEN** the app loads a routed bundle +- **THEN** the per-point silhouette column is selectable in color-by and renders as a continuous + ramp, letting a user see which points fit their cluster well + +#### Scenario: Aggregate and per-point silhouette are distinct + +- **WHEN** both the per-point silhouette annotation and the aggregate silhouette statistic exist +- **THEN** the annotation carries one value per protein and the aggregate carries one value per + projection in `statistics.parquet`; neither is derived by the consumer from the other + +### Requirement: Faithfulness is delivered as per-projection reducer metadata + +The faithfulness scalars (kNN-overlap, trustworthiness, continuity) SHALL be carried **per +projection** inside `projections_metadata.info_json`, under a `quality` object, alongside the +reducer's other info. Each value SHALL record its neighbourhood size `k`, the high-dimensional +distance metric used, and any sampling/skip provenance. `info_json` SHALL remain valid JSON whose +unknown keys are ignorable by existing consumers. + +#### Scenario: Faithfulness rides with the projection's other metadata + +- **WHEN** a consumer reads a projection's metadata +- **THEN** `info_json.quality` exposes that projection's kNN-overlap, trustworthiness, and continuity + with their `k`, metric, and sampling provenance + +#### Scenario: Faithfulness is computed where the embedding is in hand + +- **WHEN** statistics are produced during `prepare`/`project` (embedding and projection both present) +- **THEN** faithfulness is computed there and written into `projections_metadata`; the standalone + `stats` path recomputes and merges it into an existing project's `projections_metadata` + +#### Scenario: A projection without an available embedding omits faithfulness + +- **WHEN** a projection's source embedding is unavailable +- **THEN** its `info_json.quality` omits faithfulness (no key) rather than recording a wrong value + +#### Scenario: Multi-embedding runs route to the correct projection's metadata + +- **WHEN** a run produces projections from more than one source embedding +- **THEN** each projection's `info_json.quality` is computed against the embedding that produced it + (matched by name + id-intersection join), never a neighbour's + +### Requirement: Cluster-membership annotations are auto-styled + +When statistics are enabled, the carriage layer SHALL generate a complete legend-settings entry for +each cluster-membership column and write it into the bundle's `settings` part so the membership is +colored **when selected**, without a manual styling step. The generated entry SHALL be a full +`LegendPersistedSettings` envelope as the frontend loader requires — `maxVisibleValues`, `shapeSize`, +`sortMode`, `hiddenValues`, `enableDuplicateStackUI`, `selectedPaletteId`, and `categories` keyed by +the exact membership label strings, each category carrying `color`, `shape`, and `zOrder` — because +the loader's `sanitizeLegendSettingsEntry` rejects (drops) any entry missing these fields. Because the +frontend's initial color-by selection is the first annotation column (not necessarily a membership +column) and file settings apply only to the _selected_ annotation, "colored when selected" is the +guarantee; making a membership column the **initial** selection is a separate frontend concern (see +the frontend requirement). Generating styles SHALL NOT remove or overwrite existing user/annotation +styles, and SHALL be skipped cleanly when statistics are disabled. + +#### Scenario: A selected membership column is colored without manual styling + +- **WHEN** the app loads a routed bundle with statistics enabled and the user selects a + cluster-membership column +- **THEN** that column already has a complete legend-settings entry (categories colored per label), + so clusters are colored with no manual styling step + +#### Scenario: Incomplete style envelopes are not emitted + +- **WHEN** the carriage layer writes a generated membership style +- **THEN** it writes the full `LegendPersistedSettings` envelope (not a bare color map), so the + loader's sanitizer does not discard the entry + +#### Scenario: Auto-styling preserves existing styles + +- **WHEN** the bundle already carries annotation styles in its settings part +- **THEN** the generated cluster styles are added without dropping or altering the pre-existing styles + +### Requirement: Settings rewrites preserve routed statistics and generated styles + +The `protspace style` / settings-replacement path (`replace_settings_in_bundle`) SHALL preserve a +routed bundle's statistics carriage across a settings rewrite: the `statistics.parquet` part SHALL be +retained, and the generated `cluster_*` membership styles SHALL NOT be dropped, when a user re-runs +styling on a routed bundle. (This replaces the prior change's "Styling preserves statistics for both +shapes" guarantee, extended to cover the auto-generated cluster styles.) + +#### Scenario: Styling a routed bundle keeps statistics and cluster styles + +- **WHEN** a settings-replacement / `protspace style` operation rewrites a routed, + statistics-bearing bundle +- **THEN** the statistics part is preserved and the auto-generated cluster-membership styles survive + the rewrite (merged with any user styling), rather than being clobbered + +## MODIFIED Requirements + +### Requirement: Statistics are a stable tidy long-format table with joinable keys + +The `statistics.parquet` part SHALL be a tidy long-format table with the fixed eight-column schema +(`space_kind`, `space_name`, `stat_family`, `label_kind`, `metric`, `metric_kind`, `value`, +`extra_json`), one statistic value per row, and SHALL now carry **only per-method aggregate +cluster-validity** — the silhouette mean, Davies–Bouldin, Calinski–Harabasz (`metric_kind` +`validity`) and the `n_clusters` row (`metric_kind` `meta`). Per-protein outputs and faithfulness +SHALL NOT appear in this part (they are routed to annotations and projection metadata respectively). +`space_name` SHALL still equal the corresponding `projections_metadata.projection_name` so the table +is joinable, and adding a new **aggregate scalar** statistic SHALL add rows, not columns. + +#### Scenario: The statistics part holds aggregate validity only + +- **WHEN** a consumer reads `statistics.parquet` +- **THEN** it finds per-projection silhouette/davies_bouldin/calinski_harabasz validity rows and the + `n_clusters` meta row, and finds no faithfulness rows and no per-protein rows + +#### Scenario: Aggregate rows remain joinable to projections + +- **WHEN** a consumer joins the statistics part to projection metadata +- **THEN** each row's `space_name` matches a `projection_name`, and `metric_kind` `meta` separates + `n_clusters` from the `validity` scores + +### Requirement: The frontend reader tolerates the statistics part + +The frontend SHALL **surface** the routed statistics through existing consumers rather than only +tolerating an opaque part: cluster-membership and per-point-silhouette annotation columns SHALL be +available in the color-by control (membership colored via its generated style; silhouette as a +continuous ramp). Each projection's `info_json.quality` faithfulness SHALL be displayed in the +projection's method/info surface (`projection-metadata.ts`); because that component currently flattens +`info_json` only one level (rendering a nested object as a raw JSON blob), it SHALL be extended to +render the `quality` sub-object as discrete per-metric rows. The aggregate `statistics.parquet` part +MAY be surfaced as a small per-projection quality table; if not yet surfaced, its presence or absence +SHALL NOT affect rendering. A bundle lacking any statistics SHALL render exactly as before. + +#### Scenario: Routed statistics render through existing controls + +- **WHEN** the app loads a routed bundle +- **THEN** cluster and per-point-silhouette columns appear in color-by, faithfulness is available + per projection for display, and the projection renders with no error + +#### Scenario: A statistics-less bundle is unaffected + +- **WHEN** the app loads a bundle prepared with statistics disabled +- **THEN** no statistics-derived columns, metadata quality, or table appear, and rendering is + identical to pre-change behaviour + +### Requirement: Statistics computation is non-fatal, reproducible, and guarded + +Statistics remain **secondary** to the core bundle under the new routing. In the prep service the +**core (un-enriched) bundle SHALL be produced first**; a best-effort enrichment step — under its own +bounded timeout caught locally — SHALL then recompute the routed statistics and **rewrite the +affected parts (annotations, projection metadata, settings, and the statistics part) into a temporary +bundle swapped in atomically on success**. A stats failure or timeout SHALL leave the already-produced +un-enriched bundle in place and SHALL NOT fail the job or lose the bundle. Computation SHALL remain +deterministic under a recorded seed and SHALL keep the existing cost and degeneracy guards (bounded +sampling above a threshold, skip beyond a hard ceiling, singleton-cluster exclusion for DB/CH, no row +for inputs too small to score). Provenance (seed, neighbourhood size, distance metric, sampling, knee +confidence, source embedding, elbow `K`) SHALL be recorded with each output. + +#### Scenario: Enrichment failure preserves the core bundle + +- **WHEN** the enrichment step fails or times out +- **THEN** the job still succeeds and the un-enriched core bundle (produced first) is delivered, with + no statistics columns, metadata quality, or statistics part + +#### Scenario: Enrichment is atomic + +- **WHEN** the enrichment step is interrupted mid-write +- **THEN** the original core bundle is never left partially overwritten — the enriched bundle is + swapped in only when fully written + +#### Scenario: Results are reproducible + +- **WHEN** the same input is prepared twice with the same seed and projection +- **THEN** the selected elbow `K` per projection is identical, and the routed statistics (membership, + per-point silhouette, faithfulness, aggregate validity) are identical, with the seed recorded — a + stable `K` being required so membership labels do not silently re-bucket between runs + +## REMOVED Requirements + +### Requirement: Statistics ride in the bundle as an optional fifth part, backward compatibly + +**Reason:** Replaced by "Statistics are routed to consumer-aligned bundle parts". Statistics are no +longer carried solely as a fifth part; the fifth part now holds aggregate validity only, while +per-protein and per-method outputs are routed to the annotations and projection-metadata parts. The +backward-compatibility guarantee is carried by the routing requirement; the `protspace style` +round-trip guarantee is **explicitly re-stated** in the new "Settings rewrites preserve routed +statistics and generated styles" requirement (not silently dropped). **Migration:** the only consumer +that read faithfulness/labels from the fifth part is our own not-yet-shipped frontend, so there is no +external break; landing #61/#295 (the opaque fifth part) first and removing those rows in this +follow-up is therefore non-breaking (see proposal "Sequencing"). diff --git a/openspec/changes/route-projection-statistics/tasks.md b/openspec/changes/route-projection-statistics/tasks.md new file mode 100644 index 00000000..f7074d49 --- /dev/null +++ b/openspec/changes/route-projection-statistics/tasks.md @@ -0,0 +1,107 @@ +Phased per the fan-out review (see `proposal.md` "Sequencing"). **Stack on #61/#295, don't amend.** +Routing fans out at **bundle-assembly time** (`prepare`/pipeline + prep re-bundle); `protspace stats` +stays a pure aggregate-only producer. TDD; every routed output ships a known-answer fixture, not a +"column exists" check. Engine PRs land before the matching web PRs. + +## Phase 0 — land the prior change (no work here) + +- [ ] 0.1 Merge engine #61 + web #295 as-is (opaque fifth part). They are green and reversible; this + change removes the per-protein/faithfulness rows from that part non-breakingly. Confirm with + tsenoner: stack (recommended) vs amend. + +## Phase 1 — low-risk routing (engine then web) + +Faithfulness → `info_json.quality`; narrow the fifth part to aggregate validity only. **No core +(annotations) rewrite** — the prep robustness story is barely touched. + +### 1A. Engine — destination on outputs + faithfulness to metadata + +- [ ] 1A.1 `stats/base.py`: add `destination: str = "statistics_part"` to `StatRow` (default keeps + every existing construction valid; `STATS_SCHEMA`/`to_record` unchanged — `destination` is not a + column). Add `StatsReport.partition() -> dict[str, list[StatRow]]`; `to_arrow()` serializes only + the `statistics_part` bucket. +- [ ] 1A.2 `stats/metrics/faithfulness.py`: mark its rows `destination="projection_metadata"`. +- [ ] 1A.3 Carriage: a router (in `data/processors/base_processor.py` / `pipeline.py`) that, **before** + `create_output` builds `projections_metadata`, indexes faithfulness rows by `space_name` and + injects a `quality` object into the matching `reduction["info"]` dict (so it serializes into + `info_json`). Keep `info_json` valid JSON; unknown keys ignorable. +- [ ] 1A.4 `cli/stats.py` stays aggregate-only (writes `statistics.parquet` with only + `validity`/`meta` rows). Confirm the prep `protspace stats … -o statistics.parquet` + + `protspace bundle -s` path still works (faithfulness now absent from the part). +- [ ] 1A.5 Tests: faithfulness lands in `info_json.quality` with `k`/metric/sampling provenance; + multi-embedding routes to the correct projection; statistics part has no faithfulness rows; + `--no-stats` writes none of it; determinism incl. stable elbow `K`. +- [ ] 1A.6 Version bump; record the release for the web floor; open engine PR (or fold into #61 if + amending). + +### 1B. Web — surface faithfulness + prep + +- [ ] 1B.1 `packages/core/.../scatter-plot/projection-metadata.ts`: expand `info_json.quality` into + discrete per-metric rows (it currently flattens one level → raw JSON blob). Add a test. +- [ ] 1B.2 `services/protspace-prep`: bump `protspace` floor; the existing core-first + atomic + re-bundle step is unchanged in shape (faithfulness rides in metadata via the engine). Verify the + SSE stage + non-fatal behavior still hold. +- [ ] 1B.3 Tests: a bundle with `info_json.quality` renders faithfulness; a stats-less bundle is + unchanged; existing readers tolerate the new key. + +## Phase 2 — per-protein annotations (flagged) + +Cluster membership + per-point silhouette as annotation columns, auto-styled. This is the +core-bundle-coupling, typing, and color-by-UX work. Gate behind a flag until the open questions land. + +### 2A. Engine — per-point outputs + +- [ ] 2A.1 `stats/metrics/validity.py`: emit per-protein **cluster membership** (existing elbow-`K` + `labels`, already computed) as `destination="annotation"`, keyed by `ctx.ids`, values as + **non-numeric strings** (`cluster 0`) so content-based inference is categorical. Emit per-point + **`silhouette_samples`** over the **full** labelled set (not the aggregate's subsample) as a + numeric `annotation` output with its own hard-ceiling skip guard. +- [ ] 2A.2 Carriage router: merge `annotation`-destined outputs into the `protein_annotations` Arrow + table **without** going through `ArrowReader.save_data` (it writes `protein_annotations.parquet` + while readers expect `selected_annotations.parquet` — silent-loss bug) and **outside** the + `.astype(str)` path for the numeric silhouette column (preserve float round-trip). One + membership + one silhouette column per projection; absent proteins → empty, not fabricated. +- [ ] 2A.3 Auto-style: generate a **full `LegendPersistedSettings` envelope** per membership column + (categories keyed by exact label strings, each with `color`/`shape`/`zOrder`, plus + `maxVisibleValues`/`shapeSize`/`sortMode`/`hiddenValues`/`enableDuplicateStackUI`/ + `selectedPaletteId`), merged into `settings` without clobbering existing styles. Style **only + membership** (not silhouette). +- [ ] 2A.4 `replace_settings_in_bundle` / `protspace style`: preserve the statistics part **and** the + generated cluster styles across a settings rewrite (restore + extend the prior "styling preserves + statistics" round-trip). +- [ ] 2A.5 Flag: `--stats-annotations/--no-stats-annotations` (or equivalent) gating Phase-2 output; + default per tsenoner. +- [ ] 2A.6 Known-answer tests: membership joins by identifier across a bundle where annotation + id-set ≠ projection id-set ≠ embedding id-set; membership infers **categorical**, silhouette + **numeric**, in the actual frontend type-inference rules; incomplete style envelope is rejected + by the sanitizer (negative test); style round-trip; flag off → no columns. + +### 2B. Web — color-by surfacing + provenance UX + +- [ ] 2B.1 Confirm `cluster_*`/`silhouette_*` columns appear in color-by (no allowlist hides them) and + silhouette renders as a continuous ramp. +- [ ] 2B.2 Provenance/grouping: register a "Computed/Statistics" annotation category (does not exist + today — `annotation-categories.ts` / `annotation-metadata.ts`) so ~12–60 computed columns don't + flood the flat "Other" group. Decide a default/initial selection (initial-view) if "colored on + load" is wanted. +- [ ] 2B.3 Prep re-bundle now rewrites parts 1/2/4/5: add a test that a successful enrichment equals + the core annotations **plus exactly** the computed columns (no row drops / reorder / retype), and + that a stats failure still ships the un-enriched core bundle. Note the ~2× bundle I/O cost. +- [ ] 2B.4 Tests: 5-part routed bundle (with computed annotations + styles) loads, colors clusters + when selected, ignores nothing erroneously; large-N / sweep bundle doesn't break color-by. + +## Verification (each PR) + +- [ ] V.1 Engine: `uv run pytest -m "not slow"` + `ruff check` + **`ruff format --check`** green + (the CI step that bit #61); CLI startup not regressed (lazy imports). +- [ ] V.2 Web: `pnpm quality:ci` (`format:check && lint && quality`) green — run `format:check`, not + just `quality`; prep `uv run pytest -q` green. +- [ ] V.3 Load a real engine-produced routed bundle in the app: faithfulness shows per-metric; + (Phase 2) clusters color when selected; no console errors. + +## Open decisions to resolve before Phase 2 (carry to tsenoner) + +- [ ] Column naming + whether `K` shows in the visible label; elbow-`K`-only vs a small sweep. +- [ ] Initial-selection: should a membership column be the default color-by, or colored-when-selected. +- [ ] Aggregate fifth-part UI now vs deferred. +- [ ] Amend #61/#295 vs stack (recommended: stack). diff --git a/packages/core/src/components/control-bar/annotation-categories.test.ts b/packages/core/src/components/control-bar/annotation-categories.test.ts new file mode 100644 index 00000000..81581e0b --- /dev/null +++ b/packages/core/src/components/control-bar/annotation-categories.test.ts @@ -0,0 +1,21 @@ +import { describe, it, expect } from 'vitest'; +import { groupAnnotations } from './annotation-categories'; + +describe('groupAnnotations', () => { + it('groups computed cluster_/silhouette_ columns under a Statistics section', () => { + const groups = groupAnnotations(['cluster_PCA_2', 'silhouette_PCA_2', 'cluster_UMAP_2']); + const stats = groups.find((g) => g.category === 'Statistics'); + expect(stats).toBeDefined(); + expect(stats?.annotations).toEqual(['cluster_PCA_2', 'cluster_UMAP_2', 'silhouette_PCA_2']); + // computed columns must not leak into the catch-all Other section + expect(groups.find((g) => g.category === 'Other')).toBeUndefined(); + }); + + it('keeps non-computed annotations out of the Statistics section', () => { + const groups = groupAnnotations(['cluster_PCA_2', 'my_random_label']); + const stats = groups.find((g) => g.category === 'Statistics'); + expect(stats?.annotations).toEqual(['cluster_PCA_2']); + const other = groups.find((g) => g.category === 'Other'); + expect(other?.annotations).toEqual(['my_random_label']); + }); +}); diff --git a/packages/core/src/components/control-bar/annotation-categories.ts b/packages/core/src/components/control-bar/annotation-categories.ts index 36d3e783..c1d26b74 100644 --- a/packages/core/src/components/control-bar/annotation-categories.ts +++ b/packages/core/src/components/control-bar/annotation-categories.ts @@ -1,13 +1,30 @@ import { annotationSource, compareTaxonomyRank, type AnnotationSource } from '@protspace/utils'; -/** Dropdown section names — one per annotation source (the predicted flag is shown per-row, not as a group). */ -export type CategoryName = 'Biocentral' | 'InterPro' | 'TED' | 'Taxonomy' | 'UniProt' | 'Other'; +/** Dropdown section names — one per annotation source, plus a Statistics section for + * computed columns (the predicted flag is shown per-row, not as a group). */ +export type CategoryName = + | 'Biocentral' + | 'InterPro' + | 'TED' + | 'Taxonomy' + | 'UniProt' + | 'Statistics' + | 'Other'; export interface GroupedAnnotation { category: CategoryName; annotations: string[]; } +/** Prefixes of computed per-protein statistic columns (route-projection-statistics): + * one `cluster_` + one `silhouette_` per projection. Grouping + * them keeps ~12 computed columns from flooding the catch-all "Other" section. */ +const COMPUTED_STAT_PREFIXES = ['cluster_', 'silhouette_']; + +function isComputedStatistic(annotation: string): boolean { + return COMPUTED_STAT_PREFIXES.some((prefix) => annotation.startsWith(prefix)); +} + /** Map an annotation's source to its dropdown section. */ function categoryForSource(source: AnnotationSource): CategoryName { switch (source) { @@ -26,13 +43,15 @@ function categoryForSource(source: AnnotationSource): CategoryName { } } -// Display order of the dropdown sections (predicted sources first, then the rest). +// Display order of the dropdown sections (predicted sources first, then the rest; +// computed Statistics grouped just before the catch-all Other). const CATEGORY_ORDER: CategoryName[] = [ 'Biocentral', 'InterPro', 'TED', 'Taxonomy', 'UniProt', + 'Statistics', 'Other', ]; @@ -52,11 +71,15 @@ export function groupAnnotations(annotations: string[]): GroupedAnnotation[] { TED: [], Taxonomy: [], UniProt: [], + Statistics: [], Other: [], }; for (const annotation of annotations) { - categorized[categoryForSource(annotationSource(annotation))].push(annotation); + const category = isComputedStatistic(annotation) + ? 'Statistics' + : categoryForSource(annotationSource(annotation)); + categorized[category].push(annotation); } for (const category of CATEGORY_ORDER) { diff --git a/packages/core/src/components/data-loader/utils/bundle-roundtrip.test.ts b/packages/core/src/components/data-loader/utils/bundle-roundtrip.test.ts index 666357b8..ffdaa717 100644 --- a/packages/core/src/components/data-loader/utils/bundle-roundtrip.test.ts +++ b/packages/core/src/components/data-loader/utils/bundle-roundtrip.test.ts @@ -3,7 +3,13 @@ import { readFileSync } from 'fs'; import { resolve } from 'path'; import { extractRowsFromParquetBundle } from './bundle'; import { convertParquetToVisualizationData } from './conversion'; -import { createParquetBundle, countBundleDelimiters, isParquetBundle } from '@protspace/utils'; +import { + createParquetBundle, + countBundleDelimiters, + isParquetBundle, + BUNDLE_DELIMITER_BYTES, + findBundleDelimiterPositions, +} from '@protspace/utils'; function loadArrayBuffer(filePath: string): ArrayBuffer { const buffer = readFileSync(filePath); @@ -333,6 +339,93 @@ describe('metadata preservation through round-trip', () => { }); }); +describe('5-part statistics bundle parsing', () => { + // The frontend writer (createParquetBundle) only emits 3/4 parts, so we splice + // real parquet parts into the 5-part shapes the backend produces. + const DELIM = BUNDLE_DELIMITER_BYTES; + + function splitParts(buf: ArrayBuffer): Uint8Array[] { + const u8 = new Uint8Array(buf); + const pos = findBundleDelimiterPositions(u8); + const out: Uint8Array[] = []; + for (let i = 0; i <= pos.length; i++) { + const start = i === 0 ? 0 : pos[i - 1] + DELIM.length; + const end = i < pos.length ? pos[i] : u8.length; + out.push(u8.subarray(start, end).slice()); + } + return out; + } + + function assemble(parts: Uint8Array[]): ArrayBuffer { + let total = 0; + parts.forEach((p, i) => { + total += p.length + (i < parts.length - 1 ? DELIM.length : 0); + }); + const out = new Uint8Array(total); + let off = 0; + parts.forEach((p, i) => { + out.set(p, off); + off += p.length; + if (i < parts.length - 1) { + out.set(DELIM, off); + off += DELIM.length; + } + }); + return out.buffer; + } + + const original = { + protein_ids: ['P1', 'P2', 'P3'], + projections: [{ name: 'UMAP', data: Float32Array.of(0, 0, 1, 1, 2, 2), dimension: 2 as const }], + annotations: { + family: { + kind: 'categorical' as const, + values: ['A', 'B'], + colors: ['#1F77B4', '#FF7F0E'], + shapes: ['circle', 'circle'], + }, + }, + annotation_data: { family: [[0], [1], [0]] }, + annotation_scores: {}, + annotation_evidence: {}, + }; + + it('parses 5 parts with a zero-byte settings slot (statistics-only) → settings null', async () => { + const core = splitParts(createParquetBundle(original)); // [c0, c1, c2] + expect(core.length).toBe(3); + const stats = core[0]; // part 5 is ignored; any valid parquet bytes + const fivePart = assemble([core[0], core[1], core[2], new Uint8Array(0), stats]); + expect(countBundleDelimiters(new Uint8Array(fivePart))).toBe(4); + const res = await extractRowsFromParquetBundle(fivePart); + expect(res.projections.length).toBeGreaterThan(0); + expect(res.settings).toBeNull(); + }); + + it('parses 5 parts with settings + statistics → settings parsed, statistics ignored', async () => { + const settings = { + legendSettings: { + family: { + maxVisibleValues: 10, + shapeSize: 24, + sortMode: 'size-desc' as const, + hiddenValues: [], + categories: {}, + enableDuplicateStackUI: false, + selectedPaletteId: 'kellys', + }, + }, + exportOptions: {}, + }; + const ps = splitParts(createParquetBundle(original, { includeSettings: true, settings })); + expect(ps.length).toBe(4); // [c0, c1, c2, settings] + const fivePart = assemble([ps[0], ps[1], ps[2], ps[3], ps[0]]); + expect(countBundleDelimiters(new Uint8Array(fivePart))).toBe(4); + const res = await extractRowsFromParquetBundle(fivePart); + expect(res.projections.length).toBeGreaterThan(0); + expect(res.settings).not.toBeNull(); + }); +}); + describe('numeric annotation round-trip', () => { it('preserves raw numeric annotations through bundle export and import', async () => { const original = { diff --git a/packages/core/src/components/data-loader/utils/bundle.test.ts b/packages/core/src/components/data-loader/utils/bundle.test.ts index efedb305..05bbfdb1 100644 --- a/packages/core/src/components/data-loader/utils/bundle.test.ts +++ b/packages/core/src/components/data-loader/utils/bundle.test.ts @@ -112,15 +112,29 @@ describe('bundle utilities', () => { const bundle = createMockBundle(2); await expect(extractRowsFromParquetBundle(bundle)).rejects.toThrow( - /Expected 2 or 3 delimiters/, + /Expected 2 to 4 delimiters/, ); }); - it('should reject bundle with 4 delimiters (5 parts)', async () => { + it('should accept a 5-part (core+settings+statistics) bundle structurally', async () => { + // 4 delimiters (5 parts) is now valid: core + settings + statistics. With + // mock (non-parquet) buffers it still fails later at parquet decode — but + // NOT at the delimiter-count guard, which is what we assert here. const bundle = createMockBundle(5); + let message = ''; + try { + await extractRowsFromParquetBundle(bundle); + } catch (e) { + message = (e as Error).message; + } + expect(message).not.toMatch(/Expected 2 to 4 delimiters/); + }); + + it('should reject bundle with 5 delimiters (6 parts)', async () => { + const bundle = createMockBundle(6); await expect(extractRowsFromParquetBundle(bundle)).rejects.toThrow( - /Expected 2 or 3 delimiters/, + /Expected 2 to 4 delimiters/, ); }); @@ -128,7 +142,7 @@ describe('bundle utilities', () => { const buffer = createMockParquetBuffer('no delimiter'); await expect(extractRowsFromParquetBundle(buffer)).rejects.toThrow( - /Expected 2 or 3 delimiters/, + /Expected 2 to 4 delimiters/, ); }); }); diff --git a/packages/core/src/components/data-loader/utils/bundle.ts b/packages/core/src/components/data-loader/utils/bundle.ts index 58ec5d03..9b7c1874 100644 --- a/packages/core/src/components/data-loader/utils/bundle.ts +++ b/packages/core/src/components/data-loader/utils/bundle.ts @@ -29,9 +29,11 @@ export interface BundleExtractionResult { /** * Extract rows and optional settings from a parquetbundle. * - * Supports two formats: + * Supports: * - 2 delimiters (3 parts): Original format without settings * - 3 delimiters (4 parts): Extended format with settings + * - 4 delimiters (5 parts): With statistics (optional 5th part, ignored here; + * the settings slot may be zero bytes when statistics are present without settings) */ export async function extractRowsFromParquetBundle( arrayBuffer: ArrayBuffer, @@ -39,36 +41,33 @@ export async function extractRowsFromParquetBundle( const uint8Array = new Uint8Array(arrayBuffer); const delimiterPositions = findBundleDelimiterPositions(uint8Array); - // Support both 2 delimiters (original) and 3 delimiters (with settings) - if (delimiterPositions.length !== 2 && delimiterPositions.length !== 3) { + // Supported shapes (by delimiter count): + // 2 → 3 parts (core only) + // 3 → 4 parts (core + settings) + // 4 → 5 parts (core + settings + statistics; the settings slot may be empty) + if (delimiterPositions.length < 2 || delimiterPositions.length > 4) { throw new Error( - `Expected 2 or 3 delimiters in parquetbundle, found ${delimiterPositions.length}`, + `Expected 2 to 4 delimiters in parquetbundle, found ${delimiterPositions.length}`, ); } - const hasSettingsPart = delimiterPositions.length === 3; - - // Extract the three required parts - let part1: ArrayBuffer | null = uint8Array.subarray(0, delimiterPositions[0]).slice().buffer; - let part2: ArrayBuffer | null = uint8Array - .subarray(delimiterPositions[0] + BUNDLE_DELIMITER_BYTES.length, delimiterPositions[1]) - .slice().buffer; - - let part3: ArrayBuffer | null; - let part4: ArrayBuffer | null = null; - - if (hasSettingsPart) { - part3 = uint8Array - .subarray(delimiterPositions[1] + BUNDLE_DELIMITER_BYTES.length, delimiterPositions[2]) - .slice().buffer; - part4 = uint8Array - .subarray(delimiterPositions[2] + BUNDLE_DELIMITER_BYTES.length) - .slice().buffer; - } else { - part3 = uint8Array - .subarray(delimiterPositions[1] + BUNDLE_DELIMITER_BYTES.length) - .slice().buffer; - } + const delimLen = BUNDLE_DELIMITER_BYTES.length; + const partStart = (i: number): number => (i === 0 ? 0 : delimiterPositions[i - 1] + delimLen); + const partEnd = (i: number): number => + i < delimiterPositions.length ? delimiterPositions[i] : uint8Array.length; + const slicePart = (i: number): ArrayBuffer => + uint8Array.subarray(partStart(i), partEnd(i)).slice().buffer; + + // Extract the three required core parts. + let part1: ArrayBuffer | null = slicePart(0); + let part2: ArrayBuffer | null = slicePart(1); + let part3: ArrayBuffer | null = slicePart(2); + + // Part 4 is settings (optional). It may be a zero-byte slot when a statistics + // part follows without settings — branch on emptiness, not the raw count. + // Part 5 is statistics, intentionally ignored for now (rendering is separate). + const part4: ArrayBuffer | null = + delimiterPositions.length >= 3 && partEnd(3) > partStart(3) ? slicePart(3) : null; // Validate parquet magic for each part before parsing assertValidParquetMagic(part1); @@ -142,6 +141,10 @@ export async function extractRowsFromParquetBundle( */ async function extractSettings(settingsBuffer: ArrayBuffer): Promise { try { + // A zero-byte settings slot (statistics-without-settings) is "no settings". + if (settingsBuffer.byteLength === 0) { + return null; + } // Validate parquet magic assertValidParquetMagic(settingsBuffer); diff --git a/packages/core/src/components/scatter-plot/projection-metadata-helpers.test.ts b/packages/core/src/components/scatter-plot/projection-metadata-helpers.test.ts new file mode 100644 index 00000000..5ee6542c --- /dev/null +++ b/packages/core/src/components/scatter-plot/projection-metadata-helpers.test.ts @@ -0,0 +1,57 @@ +import { describe, it, expect } from 'vitest'; +import { buildProjectionMetadataRows } from './projection-metadata-helpers'; + +describe('buildProjectionMetadataRows', () => { + it('returns [] for null or empty metadata', () => { + expect(buildProjectionMetadataRows(null)).toEqual([]); + expect(buildProjectionMetadataRows({})).toEqual([]); + }); + + it('skips internal dimension/name fields', () => { + const rows = buildProjectionMetadataRows({ name: 'PCA 2', dimensions: 2, foo: 'bar' }); + const keys = rows.map(([k]) => k); + expect(keys).not.toContain('Name'); + expect(keys).not.toContain('Dimensions'); + expect(rows).toContainEqual(['Foo', 'bar']); + }); + + it('flattens info_json one level (existing behavior)', () => { + const rows = buildProjectionMetadataRows({ + info_json: '{"metric":"cosine","n_neighbors":15}', + }); + expect(rows).toContainEqual(['Metric', 'cosine']); + expect(rows).toContainEqual(['N Neighbors', '15']); + }); + + it('expands info_json.quality into discrete per-metric rows', () => { + const info = JSON.stringify({ + metric: 'cosine', + quality: { + knn_overlap: { value: 0.83, k: 15, metric: 'cosine' }, + trustworthiness: { value: 0.952, k: 15, metric: 'cosine' }, + continuity: { value: 0.948, k: 15, metric: 'euclidean' }, + }, + }); + const rows = buildProjectionMetadataRows({ info_json: info }); + const map = new Map(rows); + // NOT a single "Quality" row holding a raw JSON blob + expect(map.has('Quality')).toBe(false); + expect(map.get('Knn Overlap')).toBe('0.830 (cosine, k=15)'); + expect(map.get('Trustworthiness')).toBe('0.952 (cosine, k=15)'); + expect(map.get('Continuity')).toBe('0.948 (euclidean, k=15)'); + }); + + it('renders a skipped faithfulness metric as N/A with the marker', () => { + const info = JSON.stringify({ + quality: { knn_overlap: { value: null, skipped: 'n_too_large', n: 30000 } }, + }); + const rows = buildProjectionMetadataRows({ info_json: info }); + expect(new Map(rows).get('Knn Overlap')).toBe('N/A (skipped: n_too_large)'); + }); + + it('tolerates a flat (scalar) quality value', () => { + const info = JSON.stringify({ quality: { knn_overlap: 0.83 } }); + const rows = buildProjectionMetadataRows({ info_json: info }); + expect(new Map(rows).get('Knn Overlap')).toBe('0.830'); + }); +}); diff --git a/packages/core/src/components/scatter-plot/projection-metadata-helpers.ts b/packages/core/src/components/scatter-plot/projection-metadata-helpers.ts new file mode 100644 index 00000000..b61cee0c --- /dev/null +++ b/packages/core/src/components/scatter-plot/projection-metadata-helpers.ts @@ -0,0 +1,132 @@ +import { NA_DISPLAY } from '@protspace/utils'; + +/** + * Build the display rows for a projection's metadata tooltip. + * + * Behaviour: + * - internal fields (`name`, `dimension(s)`) are skipped; + * - JSON-string fields (`info`, `info_json`, `*json*`) are parsed and flattened + * one level into individual rows; + * - the `quality` sub-object inside `info_json` (per-projection faithfulness from + * route-projection-statistics) is expanded into one row per metric — each + * showing its value plus compact provenance (distance metric, `k`) — instead of + * rendering as a single raw `JSON.stringify` blob. + */ +export function buildProjectionMetadataRows( + metadata: Record | null | undefined, +): Array<[string, string]> { + if (!metadata) return []; + + const rows: Array<[string, string]> = []; + + for (const [key, value] of Object.entries(metadata)) { + const lowerKey = key.toLowerCase(); + if (lowerKey === 'dimension' || lowerKey === 'dimensions' || lowerKey === 'name') { + continue; + } + + if (isJsonField(lowerKey) && typeof value === 'string') { + const parsed = tryParseJson(value); + if (isPlainObject(parsed)) { + for (const [innerKey, innerValue] of Object.entries(parsed)) { + if (innerKey.toLowerCase() === 'quality' && isPlainObject(innerValue)) { + rows.push(...expandQuality(innerValue)); + } else { + rows.push([formatMetadataKey(innerKey), formatMetadataValue(innerValue, innerKey)]); + } + } + continue; + } + } + + rows.push([formatMetadataKey(key), formatMetadataValue(value, key)]); + } + + return rows; +} + +/** Expand a faithfulness `quality` object into one display row per metric. */ +function expandQuality(quality: Record): Array<[string, string]> { + return Object.entries(quality).map(([metric, raw]) => [ + formatMetadataKey(metric), + formatQualityValue(raw), + ]); +} + +/** + * Format one faithfulness metric. Each is `{ value, k, metric, ... }` (engine + * Phase 1A); a missing value (the skip case) renders as N/A with its marker, and + * a bare scalar (older flat shape) is formatted directly. + */ +function formatQualityValue(raw: unknown): string { + if (isPlainObject(raw) && 'value' in raw) { + const value = raw.value; + if (value == null) { + const skipped = typeof raw.skipped === 'string' ? raw.skipped : null; + return skipped ? `${NA_DISPLAY} (skipped: ${skipped})` : NA_DISPLAY; + } + const valueStr = formatSingleValue(value, false); + const provenance: string[] = []; + if (typeof raw.metric === 'string') provenance.push(raw.metric); + if (raw.k != null) provenance.push(`k=${formatSingleValue(raw.k, false)}`); + return provenance.length > 0 ? `${valueStr} (${provenance.join(', ')})` : valueStr; + } + if (isPlainObject(raw)) { + return JSON.stringify(raw); + } + return formatSingleValue(raw, false); +} + +function isPlainObject(value: unknown): value is Record { + return typeof value === 'object' && value !== null && !Array.isArray(value); +} + +function isJsonField(key: string): boolean { + return key === 'info' || key === 'info_json' || key.includes('json'); +} + +function tryParseJson(value: string): unknown { + try { + return JSON.parse(value); + } catch { + return null; + } +} + +function formatMetadataKey(key: string): string { + return key + .replace(/_/g, ' ') + .replace(/([A-Z])/g, ' $1') + .split(' ') + .filter((word) => word.length > 0) + .map((word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()) + .join(' '); +} + +function formatMetadataValue(value: unknown, key: string): string { + if (value == null) return NA_DISPLAY; + + const lowerKey = key.toLowerCase(); + const isVarianceRatio = + lowerKey.includes('explained_variance') || lowerKey.includes('variance_ratio'); + + if (Array.isArray(value)) { + return value.map((item) => formatSingleValue(item, isVarianceRatio)).join(', '); + } + + return formatSingleValue(value, isVarianceRatio); +} + +function formatSingleValue(value: unknown, isVarianceRatio: boolean): string { + if (typeof value === 'number') { + if (Number.isInteger(value)) return value.toString(); + return value.toFixed(isVarianceRatio ? 2 : 3); + } + if (typeof value === 'boolean') { + return value ? 'Yes' : 'No'; + } + if (typeof value === 'object' && value !== null) { + return JSON.stringify(value); + } + return String(value); +} diff --git a/packages/core/src/components/scatter-plot/projection-metadata.ts b/packages/core/src/components/scatter-plot/projection-metadata.ts index 0b0f0339..d0db6b3b 100644 --- a/packages/core/src/components/scatter-plot/projection-metadata.ts +++ b/packages/core/src/components/scatter-plot/projection-metadata.ts @@ -2,8 +2,8 @@ import { LitElement, html } from 'lit'; import { property } from 'lit/decorators.js'; import { customElement } from '../../utils/safe-custom-element'; import type { Projection } from '@protspace/utils'; -import { NA_DISPLAY } from '@protspace/utils'; import { projectionMetadataStyles } from './projection-metadata.styles'; +import { buildProjectionMetadataRows } from './projection-metadata-helpers'; @customElement('protspace-projection-metadata') class ProtspaceProjectionMetadata extends LitElement { @@ -12,7 +12,7 @@ class ProtspaceProjectionMetadata extends LitElement { static styles = projectionMetadataStyles; render() { - const metadata = this._getProjectionMetadata(); + const metadata = buildProjectionMetadataRows(this.projection?.metadata); if (metadata.length === 0) { return html``; @@ -50,109 +50,6 @@ class ProtspaceProjectionMetadata extends LitElement { `; } - - /** - * Get formatted projection metadata for display - */ - private _getProjectionMetadata(): Array<[string, string]> { - if (!this.projection?.metadata) { - return []; - } - - const rawMetadata = this.projection.metadata; - const processedEntries: Array<[string, unknown]> = []; - - // Filter and process metadata entries - for (const [key, value] of Object.entries(rawMetadata)) { - // Skip internal fields - const lowerKey = key.toLowerCase(); - if (lowerKey === 'dimension' || lowerKey === 'dimensions' || lowerKey === 'name') { - continue; - } - - // Parse and flatten JSON fields - if (this._isJsonField(lowerKey) && typeof value === 'string') { - const parsed = this._tryParseJson(value); - if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) { - processedEntries.push(...Object.entries(parsed)); - continue; - } - } - - processedEntries.push([key, value]); - } - - // Format all entries - return processedEntries.map(([key, value]) => [ - this._formatMetadataKey(key), - this._formatMetadataValue(value, key), - ]); - } - - /** - * Check if a key indicates a JSON field - */ - private _isJsonField(key: string): boolean { - return key === 'info' || key === 'info_json' || key.includes('json'); - } - - /** - * Safely parse JSON string - */ - private _tryParseJson(value: string): unknown { - try { - return JSON.parse(value); - } catch { - return null; - } - } - - /** - * Format metadata key to Title Case - */ - private _formatMetadataKey(key: string): string { - return key - .replace(/_/g, ' ') - .replace(/([A-Z])/g, ' $1') - .split(' ') - .filter((word) => word.length > 0) - .map((word) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()) - .join(' '); - } - - /** - * Format metadata value with appropriate precision - */ - private _formatMetadataValue(value: unknown, key: string): string { - if (value == null) return NA_DISPLAY; - - const lowerKey = key.toLowerCase(); - const isVarianceRatio = - lowerKey.includes('explained_variance') || lowerKey.includes('variance_ratio'); - - if (Array.isArray(value)) { - return value.map((item) => this._formatSingleValue(item, isVarianceRatio)).join(', '); - } - - return this._formatSingleValue(value, isVarianceRatio); - } - - /** - * Format a single metadata value - */ - private _formatSingleValue(value: unknown, isVarianceRatio: boolean): string { - if (typeof value === 'number') { - if (Number.isInteger(value)) return value.toString(); - return value.toFixed(isVarianceRatio ? 2 : 3); - } - if (typeof value === 'boolean') { - return value ? 'Yes' : 'No'; - } - if (typeof value === 'object' && value !== null) { - return JSON.stringify(value); - } - return String(value); - } } declare global { diff --git a/packages/utils/src/parquet/bundle-writer.ts b/packages/utils/src/parquet/bundle-writer.ts index a372e7fd..c21e0ae6 100644 --- a/packages/utils/src/parquet/bundle-writer.ts +++ b/packages/utils/src/parquet/bundle-writer.ts @@ -9,6 +9,11 @@ * - Part 3: projections_data.parquet (projection_name, identifier, x, y, z) * - Delimiter: ---PARQUET_DELIMITER--- (optional, only if settings included) * - Part 4: settings.parquet (optional, settings_json column) + * - Part 5: statistics.parquet (optional, written by the backend prep service; + * this writer does not currently emit or preserve it on re-export) + * + * Readers accept 3–5 parts; when statistics are present without settings, the + * settings slot is written as zero bytes so statistics stays the fifth part. */ import { parquetWriteBuffer } from 'hyparquet-writer'; diff --git a/services/protspace-prep/src/protspace_prep/config.py b/services/protspace-prep/src/protspace_prep/config.py index e900d92d..a2113d38 100644 --- a/services/protspace-prep/src/protspace_prep/config.py +++ b/services/protspace-prep/src/protspace_prep/config.py @@ -20,6 +20,8 @@ class Settings: annotations: str sweep_interval_seconds: int pipeline_timeout_seconds: int + stats_enabled: bool + stats_timeout_seconds: int log_level: str log_json_format: bool cors_allowed_origins: tuple[str, ...] @@ -48,6 +50,8 @@ def load_settings() -> Settings: annotations=os.getenv("PREP_ANNOTATIONS", "default"), sweep_interval_seconds=int(os.getenv("PREP_SWEEP_INTERVAL_SECONDS", "300")), pipeline_timeout_seconds=int(os.getenv("PREP_PIPELINE_TIMEOUT_SECONDS", "420")), + stats_enabled=os.getenv("PREP_STATS", "true").lower() in {"1", "true", "yes"}, + stats_timeout_seconds=int(os.getenv("PREP_STATS_TIMEOUT_SECONDS", "120")), log_level=os.getenv("PREP_LOG_LEVEL", "INFO"), log_json_format=os.getenv("PREP_LOG_JSON_FORMAT", "false").lower() in {"1", "true", "yes"}, cors_allowed_origins=_parse_origins(os.getenv("CORS_ALLOWED_ORIGIN", "")), diff --git a/services/protspace-prep/src/protspace_prep/pipeline.py b/services/protspace-prep/src/protspace_prep/pipeline.py index 962c4c93..2bd20653 100644 --- a/services/protspace-prep/src/protspace_prep/pipeline.py +++ b/services/protspace-prep/src/protspace_prep/pipeline.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio import logging +import os from pathlib import Path from typing import Awaitable, Callable, Sequence @@ -179,9 +180,159 @@ async def run_protspace_prepare( raise PipelineFailure( "protspace bundle reported success but produced no .parquetbundle file." ) + + # Best-effort projection statistics — AFTER the core bundle exists and OUTSIDE + # the pipeline timeout budget, so a stats failure/timeout can never cost the + # job or lose the bundle. + if settings.stats_enabled: + await _maybe_add_statistics( + emit, + settings, + embed_dir=embed_dir, + project_dir=project_dir, + annotations_path=annotations_path, + bundle_path=bundle_path, + ) + return bundle_path +_STATS_CLI_AVAILABLE: bool | None = None +_STATS_CLI_PROBE_LOCK = asyncio.Lock() +_STATS_PROBE_TIMEOUT_SECONDS = 10.0 + + +async def _stats_cli_available() -> bool: + """Bounded one-time probe: is the installed protspace new enough to have ``stats``? + + Caches only DEFINITIVE answers — a clean exit (0 ⇒ present, non-zero / not on + PATH ⇒ absent). A transient spawn error or a hung probe returns unavailable for + THIS call but is NOT latched, so a later job can retry. The probe is bounded by a + hard timeout and kills a hung subprocess, so it can never stall the job or leak a + concurrency slot. A lock collapses the first concurrent wave to a single probe. + """ + global _STATS_CLI_AVAILABLE + if _STATS_CLI_AVAILABLE is not None: + return _STATS_CLI_AVAILABLE + async with _STATS_CLI_PROBE_LOCK: + if _STATS_CLI_AVAILABLE is not None: # re-check after acquiring the lock + return _STATS_CLI_AVAILABLE + try: + proc = await asyncio.create_subprocess_exec( + "protspace", + "stats", + "--help", + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + except FileNotFoundError: + _STATS_CLI_AVAILABLE = False # binary not on PATH: definitively absent + return _STATS_CLI_AVAILABLE + except Exception: # noqa: BLE001 - could not spawn; transient, do not latch + logger.warning("protspace stats probe could not spawn; will retry next job", exc_info=True) + return False + try: + rc = await asyncio.wait_for(proc.wait(), timeout=_STATS_PROBE_TIMEOUT_SECONDS) + _STATS_CLI_AVAILABLE = rc == 0 # definitive + return _STATS_CLI_AVAILABLE + except asyncio.TimeoutError: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=5) + except asyncio.TimeoutError: + pass + logger.warning("protspace stats probe timed out; will retry next job") + return False # transient hang, do not latch + + +async def _maybe_add_statistics( + emit: EmitFn, + settings: Settings, + *, + embed_dir: Path, + project_dir: Path, + annotations_path: Path, + bundle_path: Path, +) -> None: + """Compute projection statistics and fold them into the bundle (best-effort). + + Runs ``protspace stats`` then re-bundles with ``-s`` under a dedicated + timeout, swallowing every failure so the already-shipped bundle stands. + """ + try: + if not await _stats_cli_available(): + logger.info("protspace `stats` subcommand unavailable; skipping statistics") + return + h5_files = sorted(embed_dir.glob("*.h5")) + if not h5_files: + return + stats_path = bundle_path.parent / "statistics.parquet" + # Auto-generated cluster-membership legend styles, written by `stats` and + # folded into the re-bundle so clusters are colored when selected. + styles_path = bundle_path.parent / "cluster_styles.json" + # Re-bundle to a sibling temp file, then atomically rename — a stats timeout + # /kill mid-write must never corrupt the already-shipped core bundle. The + # ".parquetbundle" suffix is required because `protspace bundle` forces it. + tmp_bundle = bundle_path.with_name(bundle_path.stem + ".stats-tmp.parquetbundle") + await emit("computing_statistics", {}) + try: + async with asyncio.timeout(settings.stats_timeout_seconds): + await _run_step( + "stats", + [ + "protspace", + "stats", + "-i", + str(h5_files[0]), + "-p", + str(project_dir), + # Enrich the annotations file in place with per-protein + # cluster-membership + silhouette columns so the re-bundle + # below (`bundle -a`) carries them. Faithfulness is folded + # into projections_metadata by the same command. + "-a", + str(annotations_path), + "--settings-out", + str(styles_path), + "-o", + str(stats_path), + ], + ) + if not stats_path.exists(): + return + bundle_cmd = [ + "protspace", + "bundle", + "-p", + str(project_dir), + "-a", + str(annotations_path), + "-s", + str(stats_path), + "-o", + str(tmp_bundle), + ] + # Fold the auto-generated cluster styles into the bundle's settings + # part when `stats` produced them (older engines may not). + if styles_path.exists(): + bundle_cmd[-2:-2] = ["--settings", str(styles_path)] + await _run_step("bundle", bundle_cmd) + # Promote only a fully-written temp bundle (atomic, same filesystem). + if tmp_bundle.exists(): + os.replace(tmp_bundle, bundle_path) + finally: + tmp_bundle.unlink(missing_ok=True) # clear any partial temp on failure + except asyncio.TimeoutError: + logger.warning( + "statistics step timed out after %ss; bundle ships without stats", + settings.stats_timeout_seconds, + ) + except PipelineFailure as exc: + logger.warning("statistics step failed; bundle ships without stats: %s", exc) + except Exception as exc: # noqa: BLE001 - statistics are secondary + logger.warning("statistics step error; bundle ships without stats: %s", exc) + + async def _run_step(name: str, cmd: Sequence[str]) -> None: """Run one protspace subcommand. Raise PipelineFailure on non-zero exit. diff --git a/services/protspace-prep/tests/test_pipeline.py b/services/protspace-prep/tests/test_pipeline.py index ce5ecd2b..7658488c 100644 --- a/services/protspace-prep/tests/test_pipeline.py +++ b/services/protspace-prep/tests/test_pipeline.py @@ -350,3 +350,142 @@ def test_classify_failure_matches_503_service_unavailable(): result = _classify_failure(exc) assert result.code == "BIOCENTRAL_UNAVAILABLE" assert "Biocentral embedding service is unavailable" in str(result) + + +# --------------------------------------------------------------------------- # +# Projection statistics step (bundle-first, best-effort, probed) +# --------------------------------------------------------------------------- # + + +@pytest.fixture(autouse=True) +def _reset_stats_probe_cache(): + """Keep the module-global probe cache from leaking across tests.""" + import protspace_prep.pipeline as ppl + + ppl._STATS_CLI_AVAILABLE = None + yield + ppl._STATS_CLI_AVAILABLE = None + + +async def test_stats_probe_filenotfound_is_absent_and_latched(): + import protspace_prep.pipeline as ppl + + async def boom(*args, **kwargs): + raise FileNotFoundError + + with patch("asyncio.create_subprocess_exec", new=boom): + assert await ppl._stats_cli_available() is False + assert ppl._STATS_CLI_AVAILABLE is False # binary absent → definitive, cached + + +async def test_stats_probe_timeout_is_bounded_and_not_latched(monkeypatch): + import protspace_prep.pipeline as ppl + + monkeypatch.setattr(ppl, "_STATS_PROBE_TIMEOUT_SECONDS", 0.05) + proc = MagicMock() + proc.kill = MagicMock() + + async def _hang(): + await asyncio.sleep(0.2) + return 0 + + proc.wait = _hang + + async def fake(*args, **kwargs): + return proc + + with patch("asyncio.create_subprocess_exec", new=fake): + assert await ppl._stats_cli_available() is False # bounded, returns quickly + assert proc.kill.called # hung probe was killed + assert ppl._STATS_CLI_AVAILABLE is None # transient hang → NOT latched, retried later + + +def _stats_router(ctx: JobContext, *, stats_returncode: int = 0): + """Step router that also writes a statistics parquet on the `stats` step and + a distinct bundle on the stats re-bundle (`bundle ... -s`).""" + embed_dir = ctx.output_dir / "embed" + project_dir = ctx.output_dir / "project" + annotations_path = ctx.output_dir / "annotations.parquet" + bundle_path = ctx.output_dir / "data.parquetbundle" + stats_path = ctx.output_dir / "statistics.parquet" + + async def fake_create(*args, **kwargs): + cmd = list(args) + if not cmd or cmd[0] != "protspace": + return _mock_subprocess(0) + step = cmd[1] if len(cmd) > 1 else "" + if step == "stats" and "--help" in cmd: + return _mock_subprocess(0) # feature probe + if step == "embed": + embed_dir.mkdir(parents=True, exist_ok=True) + (embed_dir / "prot_t5.h5").write_bytes(b"H5") + elif step == "annotate": + annotations_path.write_bytes(b"P") + elif step == "project": + project_dir.mkdir(parents=True, exist_ok=True) + (project_dir / "projections_metadata.parquet").write_bytes(b"M") + (project_dir / "projections_data.parquet").write_bytes(b"D") + elif step == "stats": + if stats_returncode != 0: + return _mock_subprocess(stats_returncode, [b"ERROR stats boom\n"]) + stats_path.write_bytes(b"STATS") + elif step == "bundle": + # Honor the -o target: core bundle → bundle_path; stats re-bundle → temp. + out = Path(cmd[cmd.index("-o") + 1]) + out.write_bytes(b"BUNDLE+STATS" if "-s" in cmd else b"BUNDLE") + return _mock_subprocess(0) + + return fake_create + + +async def test_statistics_folds_into_bundle_and_emits_stage(ctx): + bundle_path = ctx.output_dir / "data.parquetbundle" + emitted = [] + + async def emit(stage, payload): + emitted.append(stage) + + settings = load_settings() + with patch("protspace_prep.pipeline._stats_cli_available", new=AsyncMock(return_value=True)): + with patch("asyncio.create_subprocess_exec", new=_stats_router(ctx)): + result = await run_protspace_prepare(ctx, emit, settings=settings) + assert result == bundle_path + assert bundle_path.read_bytes() == b"BUNDLE+STATS" + assert (ctx.output_dir / "statistics.parquet").exists() + assert "computing_statistics" in emitted + + +async def test_statistics_failure_keeps_core_bundle(ctx): + bundle_path = ctx.output_dir / "data.parquetbundle" + settings = load_settings() + with patch("protspace_prep.pipeline._stats_cli_available", new=AsyncMock(return_value=True)): + with patch("asyncio.create_subprocess_exec", new=_stats_router(ctx, stats_returncode=2)): + result = await run_protspace_prepare(ctx, AsyncMock(), settings=settings) + # Stats failed → job still succeeds with the core (stats-less) bundle. + assert result == bundle_path + assert bundle_path.read_bytes() == b"BUNDLE" + + +async def test_statistics_skipped_when_subcommand_unavailable(ctx): + bundle_path = ctx.output_dir / "data.parquetbundle" + settings = load_settings() + with patch("protspace_prep.pipeline._stats_cli_available", new=AsyncMock(return_value=False)): + with patch("asyncio.create_subprocess_exec", new=_stats_router(ctx)): + result = await run_protspace_prepare(ctx, AsyncMock(), settings=settings) + assert result == bundle_path + assert bundle_path.read_bytes() == b"BUNDLE" + assert not (ctx.output_dir / "statistics.parquet").exists() + + +async def test_statistics_disabled_by_settings(ctx): + bundle_path = ctx.output_dir / "data.parquetbundle" + base = load_settings() + fields = {k: getattr(base, k) for k in base.__slots__} + fields["stats_enabled"] = False + settings = type(base)(**fields) + probe = AsyncMock(return_value=True) + with patch("protspace_prep.pipeline._stats_cli_available", new=probe): + with patch("asyncio.create_subprocess_exec", new=_stats_router(ctx)): + await run_protspace_prepare(ctx, AsyncMock(), settings=settings) + assert bundle_path.read_bytes() == b"BUNDLE" + probe.assert_not_called()