tsenoner · jcoludar · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/app/src/explore/fasta-prep-client.ts b/app/src/explore/fasta-prep-client.ts
@@ -83,7 +83,13 @@ function describeBundleFailure(status: number): string {
 }
 
 /** @public */
-export type FastaPrepStage = 'queued' | 'embedding' | 'projecting' | 'annotating' | 'bundling';
+export type FastaPrepStage =
+  | 'queued'
+  | 'embedding'
+  | 'projecting'
+  | 'annotating'
+  | 'bundling'
+  | 'computing_statistics';
 
 /** @public */
 export interface FastaPrepOptions {

diff --git a/app/src/explore/runtime.ts b/app/src/explore/runtime.ts
@@ -191,8 +191,18 @@ export async function initializeExploreRuntime(): Promise<ExploreController> {
               lastProgress = 70;
               overlayController.update(true, lastProgress, 'Preparing FASTA…', 'Projecting…');
             } else if (stage === 'bundling') {
+              stopCreep();
               lastProgress = 90;
               overlayController.update(true, lastProgress, 'Preparing FASTA…', 'Bundling…');
+            } else if (stage === 'computing_statistics') {
+              stopCreep();
+              lastProgress = 95;
+              overlayController.update(
+                true,
+                lastProgress,
+                'Preparing FASTA…',
+                'Computing statistics…',
+              );
             }
           },
         });

diff --git a/openspec/changes/add-projection-statistics/.openspec.yaml b/openspec/changes/add-projection-statistics/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-06-23
diff --git a/openspec/changes/add-projection-statistics/design.md b/openspec/changes/add-projection-statistics/design.md
diff --git a/openspec/changes/add-projection-statistics/proposal.md b/openspec/changes/add-projection-statistics/proposal.md
@@ -0,0 +1,103 @@
+## Why
+
+ProtSpace renders dimensionality-reduction projections but gives users **no quantitative way to
+judge them**: _Is this projection meaningful? How many clusters are there? Can I trust the geometry,
+or did the reduction distort it?_ Issue #216 catalogued the statistics ProtSpace needs to be
+competitive; this change (issue #219) implements the MVP.
+
+Today the preparation pipeline (`embed → project → annotate → bundle`) ships coordinates and
+annotations with **zero quality metrics**, so interpretation is purely visual. This change adds a
+**projection-statistics** capability computed at prep time and baked into the `.parquetbundle`,
+covering two complementary questions per projection:
+
+- **Cluster structure** — KMeans with an **elbow** estimate of the optimal cluster count, scored by
+  **silhouette**, **Davies–Bouldin**, and **Calinski–Harabasz** (issue #219's metric set).
+- **Projection faithfulness** — **kNN-overlap** and **trustworthiness / continuity** between the
+  original embedding and the projection, i.e. how much the reduction preserved or distorted the
+  neighbourhood structure. These are the metrics that most directly tell a user whether to trust the
+  map (#216's "competitive" framing).
+
+It is built as an **expandable subsystem** so further statistics can be added without rework. The
+honest boundary of that claim (see `design.md`): the registry + tidy long-format table make new
+**scalar** statistics, label sources, and spaces cheap to add (new rows, no schema change); the
+richer pairwise/neighborhood analyses prototyped in the standalone `ProtSpaceExtractor` script
+(query↔reference proximity, cross-method consensus, Top-N mining) are pair/edge/set-shaped and will
+ride **their own future typed bundle parts**, reusing the same registry pattern but not this table.
+
+## What Changes
+
+Two PRs: the engine (`protspace`) first, then this repo (`protspace_web`) consumes it.
+
+- **Engine — `protspace` package (separate PR):** a new `protspace.stats` module with a generalized
+  `Statistic` contract (each statistic declares the inputs it needs — projection coords, embeddings,
+  and/or labels — and returns one or more result rows) and a light registry. MVP statistics, all
+  **per projection**:
+  - `cluster_validity`: KMeans sweep + elbow (knee on the inertia curve) selecting `K`; then
+    silhouette, Davies–Bouldin, Calinski–Harabasz on the projection coordinates at that `K`.
+  - `faithfulness`: kNN-overlap and trustworthiness / continuity between the embedding and the
+    projection.
+
+  Both statistics are wired into `ReductionPipeline.run` — the one stage holding embeddings **and**
+  projections — behind a `--stats/--no-stats` flag, and exposed as
+  `protspace stats -i emb.h5 -p project_dir -o statistics.parquet` (no annotations needed for the
+  MVP).
+- **Bundle format:** an optional **fifth part** `statistics.parquet`. Layout
+  `core(3) + settings? + statistics?`; when statistics is present without settings, a **zero-byte
+  settings slot** keeps the fifth position unambiguous. **All** bundle readers/writers are updated:
+  `write_bundle`, `read_bundle`, `extract_bundle_to_dir`, **and `replace_settings_in_bundle`** (the
+  `protspace style` path, which today would silently drop a fifth part).
+- **Prep service (`services/protspace-prep`):** the core bundle is **produced first**; a `stats` step
+  then runs `protspace stats` best-effort under its **own nested timeout**, caught locally so it can
+  never reach the parent handler, re-bundling with `-s` on success. On stats failure or timeout the
+  already-shipped stats-less bundle stands (the job never fails for a secondary artifact). The
+  `protspace` dependency floor is raised to the stats-bearing release and the step feature-probes the
+  subcommand before use.
+- **Frontend reader (`@protspace/utils` + core data-loader):** accept a five-part bundle **without
+  error** (the existing test asserting five-part bundles are _rejected_ is inverted). The statistics
+  part is parsed-but-unused; rendering is a committed follow-up, out of scope here.
+
+**Scope (MVP):** per-projection `cluster_validity` (unsupervised/elbow) + `faithfulness`, baked at
+prep time, carried in the bundle, not yet rendered.
+
+**Non-goals (explicit, non-breaking expansions):** embedding-space cluster-validity;
+annotation-feature label sources; interactive / on-demand recompute; the broader
+`ProtSpaceExtractor` analyses (future typed bundle parts); frontend rendering of the statistics. The
+registries and long-format table leave seams for the scalar expansions; the others are acknowledged
+as requiring new bundle parts or separate work.
+
+## Capabilities
+
+### New Capabilities
+
+- `projection-statistics`: per-projection cluster-validity and faithfulness statistics computed at
+  preparation time and carried in the `.parquetbundle` as an optional statistics part. Covers the
+  bundle-boundary data contract (a stable tidy long-format table), production by the prep pipeline,
+  the backward-compatible fifth-part layout (including the `protspace style` round-trip),
+  reproducibility and robustness guards, and reader tolerance.
+
+## Impact
+
+- **Upstream (`protspace` repo, separate PR):**
+  - New `src/protspace/stats/` package (generalized `Statistic` contract + registry, cluster-validity
+    statistics, faithfulness statistics, driver).
+  - `data/io/bundle.py`: `write_bundle` / `read_bundle` / `extract_bundle_to_dir` extended to a fifth
+    statistics part (`STATISTICS_FILENAME`), **and `replace_settings_in_bundle` updated to preserve a
+    trailing statistics part** so `protspace style` is non-lossy.
+  - `utils/add_annotation_style.py` (the `style` command) verified/tested against five-part bundles.
+  - `data/processors/base_processor.py` (`create_output`/`save_output`) and `pipeline.py`
+    (`ReductionPipeline.run`) thread an optional statistics table; embeddings (already in memory) feed
+    faithfulness.
+  - `cli/prepare.py` gains `--stats/--no-stats`; new `cli/stats.py`; `cli/bundle.py` gains
+    `-s/--statistics`.
+  - Tests with **known-answer numeric fixtures** (blob separation; faithfulness on identity vs random
+    projections) and a label-permutation alignment test.
+  - No new dependency: scikit-learn (KMeans, silhouette/DB/CH, `manifold.trustworthiness`) is already a
+    core dep; the elbow knee is ported from `ProtSpaceExtractor` (distance-to-chord, **argmax index →
+    K**).
+- **This repo (`protspace_web`):**
+  - `services/protspace-prep/src/protspace_prep/pipeline.py` (+ `config.py`): the `stats` step with its
+    own timeout, failure isolation, version probe, and an SSE `computing_statistics` stage;
+    `services/protspace-prep/tests/`.
+  - `packages/core/src/components/data-loader/utils/bundle.ts` (+ `packages/utils/src/parquet/*`):
+    accept the optional fifth part; **invert** `bundle.test.ts`'s five-part-rejection test; document
+    that frontend re-export (`createParquetBundle`) currently drops the part.
+- **Data-format change:** additive, backward compatible. Existing three- and four-part bundles read
+  and write unchanged.
+- **API / dependencies:** no HTTP API change; no new dependencies.
diff --git a/openspec/changes/add-projection-statistics/specs/projection-statistics/spec.md b/openspec/changes/add-projection-statistics/specs/projection-statistics/spec.md
@@ -0,0 +1,178 @@
+## ADDED Requirements
+
+### Requirement: Per-projection statistics are computed at preparation time
+
+The preparation pipeline SHALL compute, for each projection, both **cluster-validity** and
+**projection-faithfulness** statistics during data preparation and embed them in the produced
+`.parquetbundle` when statistics are enabled (the default). Cluster-validity SHALL include the
+silhouette score, the Davies–Bouldin index, the Calinski–Harabasz index, and the elbow-estimated
+optimal cluster count, and its rows SHALL carry `label_kind` `kmeans_elbow`. Faithfulness SHALL
+include kNN-overlap and trustworthiness/continuity of the projection relative to its source
+embedding.
+
+#### Scenario: Statistics are produced by default
+
+- **WHEN** a bundle is prepared from embeddings with statistics enabled
+- **THEN** the bundle contains a statistics part with, for every projection, cluster-validity rows
+  (silhouette, davies_bouldin, calinski_harabasz, n_clusters) and faithfulness rows (knn_overlap,
+  trustworthiness, continuity)
+
+#### Scenario: Statistics can be disabled
+
+- **WHEN** preparation is run with statistics disabled (`--no-stats` / prep flag off)
+- **THEN** no statistics part is produced and the bundle remains a valid three- or four-part bundle
+
+### Requirement: The elbow estimates the optimal cluster count from the inertia knee
+
+The unsupervised cluster source SHALL sweep KMeans across a bounded range of cluster counts on the
+projection coordinates and select the optimal count using the **index** of the maximum
+perpendicular deviation of the inertia curve from its first-to-last chord (not a distance value). It
+SHALL report the selected count as `n_clusters` with `metric_kind` `meta`, label points accordingly
+so the validity metrics score that clustering, and record a knee-confidence indicator and the
+silhouette-optimal count for cross-checking.
+
+#### Scenario: Elbow recovers a known cluster count
+
+- **WHEN** the data forms `k` well-separated clusters
+- **THEN** the elbow-estimated `n_clusters` is within one of `k` and the silhouette of the selected
+  clustering is high relative to an overlapping-cluster baseline
+
+#### Scenario: No clear knee is flagged, not faked
+
+- **WHEN** the inertia curve is approximately linear (no distinct knee)
+- **THEN** the selected count is still emitted but marked with low knee confidence in `extra_json`
+
+### Requirement: Faithfulness compares embedding and projection neighbourhoods
+
+Faithfulness statistics SHALL take a projection and **its source embedding** as input and measure how
+well the projection preserves each point's neighbourhood. Continuity SHALL be computed as
+trustworthiness with the embedding and projection arguments swapped, and the high-dimensional
+distance metric SHALL be applied to whichever computation has the embedding as its primary input.
+The high-dimensional metric SHALL default to the projection's own reducer metric (Euclidean by
+default), falling back to cosine only when unknown, and SHALL be recorded per row. These statistics
+carry `metric_kind` `faithfulness` and `label_kind` `none`, and SHALL bound their cost on large
+inputs (sampling a shared subset above a threshold, skipping with a recorded marker beyond a hard
+ceiling) since trustworthiness materialises a full pairwise distance matrix.
+
+#### Scenario: A faithful projection scores high
+
+- **WHEN** the projection preserves neighbourhoods (e.g. a near-identity reduction)
+- **THEN** kNN-overlap and trustworthiness are near their maximum
+
+#### Scenario: A distorting projection scores lower
+
+- **WHEN** the projection scrambles neighbourhoods (e.g. a random projection)
+- **THEN** kNN-overlap and trustworthiness are markedly lower, with the neighbourhood size and the
+  per-row distance metric recorded in `extra_json`
+
+#### Scenario: Each projection uses its own embedding
+
+- **WHEN** a run produces projections from more than one embedding
+- **THEN** each projection's faithfulness is computed against the embedding that produced it (matched
+  by name and id-intersection join), and any projection whose embedding is unavailable is skipped
+  with a recorded marker rather than scored against the wrong embedding
+
+#### Scenario: Large inputs are bounded
+
+- **WHEN** the number of points exceeds the sampling threshold
+- **THEN** faithfulness (and silhouette) are computed on a fixed-seed shared subsample with the
+  sample size recorded, and beyond a hard ceiling the statistic is skipped with a recorded marker
+  rather than exhausting memory
+
+### Requirement: Statistics are a stable tidy long-format table with joinable keys
+
+The statistics part SHALL be a tidy long-format table with a fixed eight-column schema: `space_kind`
+(string), `space_name` (string), `stat_family` (string), `label_kind` (string), `metric` (string),
+`metric_kind` (string), `value` (double), and `extra_json` (string), with one statistic value per
+row. `metric_kind` (`validity` | `meta` | `faithfulness`) SHALL be a column so consumers can
+aggregate validity scores without folding in meta rows such as `n_clusters`. `space_name` SHALL equal
+the corresponding `projections_metadata.projection_name` so the table is joinable without
+string-parsing. Adding a new scalar statistic, label source, or space SHALL add **rows** and SHALL NOT
+change the column schema; any per-source attribute (e.g. an annotation column name) SHALL be carried
+inside `extra_json`, not as a new column.
+
+#### Scenario: Each row is self-describing and joinable
+
+- **WHEN** a consumer reads the statistics part
+- **THEN** every row identifies its space (kind + name), family, label kind, metric, and metric kind,
+  carries a numeric value, and its `space_name` matches a `projection_name` in the projections metadata
+
+#### Scenario: Meta rows are separable from validity rows
+
+- **WHEN** a consumer aggregates cluster-validity scores
+- **THEN** it can exclude `n_clusters` by filtering on the `metric_kind` column (value `meta`) without parsing `extra_json`
+
+#### Scenario: New statistics do not change the schema
+
+- **WHEN** a later expansion adds an embedding space or an annotation-feature label source
+- **THEN** it appears as additional rows (e.g. `space_kind` `embedding`, or `label_kind` `annotation`
+  with the source column in `extra_json`) under the same eight-column schema
+
+### Requirement: Statistics ride in the bundle as an optional fifth part, backward compatibly
+
+The `.parquetbundle` SHALL carry statistics as an optional fifth parquet part in the positional order
+`core(3) + settings? + statistics?`. When statistics are present but settings are absent, a zero-byte
+settings part SHALL occupy the fourth slot, and all readers and writers SHALL distinguish
+settings-present from settings-absent by the **fourth part's emptiness**, not by raw part count. All
+bundle readers and writers — including `read_bundle` (whose existing return shape is preserved so
+its callers do not break), `extract_bundle_to_dir`, and `replace_settings_in_bundle` (the styling
+path) — SHALL handle the fifth part; three- and four-part bundles SHALL retain their exact current
+meaning.
+
+#### Scenario: Legacy bundles are unaffected
+
+- **WHEN** a three-part (core only) or four-part (core + settings) bundle is read or rewritten
+- **THEN** it behaves exactly as before, with no statistics part
+
+#### Scenario: Five-part bundle round-trips
+
+- **WHEN** a five-part bundle (core + settings + statistics) is read
+- **THEN** projection, annotation, and settings data load normally and the statistics part is
+  recovered
+
+#### Scenario: Styling preserves statistics for both shapes
+
+- **WHEN** an annotation-styling / settings-replacement operation rewrites a statistics-bearing
+  bundle, whether it has display settings or only a zero-byte settings slot
+- **THEN** the statistics part is preserved (not dropped), with a valid settings part written and the
+  statistics part kept as the fifth
+
+### Requirement: The frontend reader tolerates the statistics part
+
+The frontend bundle reader SHALL load a five-part bundle without error in both the settings+statistics
+and the zero-byte-settings (statistics-only) shapes, distinguishing them by the fourth part's
+emptiness, and treating the statistics part as optional and parsed-but-unused. Its presence or absence
+SHALL NOT affect rendering of projections, annotations, or settings.
+
+#### Scenario: App loads a statistics-bearing bundle
+
+- **WHEN** the app loads a five-part bundle produced by the prep pipeline (with or without settings)
+- **THEN** the projection renders normally with no error and the statistics part is ignored for now
+
+### Requirement: Statistics computation is non-fatal, reproducible, and guarded
+
+Statistics are secondary to the core bundle. In the prep service the **core bundle SHALL be produced
+first**, and computing statistics — under its own bounded timeout caught locally — SHALL NOT fail the
+job, lose the bundle, or consume the budget the bundle needs; a stats failure or timeout SHALL leave
+the already-produced stats-less bundle in place. Computation SHALL be deterministic under a recorded
+seed (KMeans, silhouette sampling, and faithfulness subsampling), and SHALL guard expensive and
+degenerate cases: silhouette and faithfulness SHALL use a bounded shared sample above a size
+threshold and skip beyond a hard ceiling; clusters with fewer than two members SHALL be excluded from
+Davies–Bouldin / Calinski–Harabasz; and inputs too small to score (fewer than three points, a single
+cluster) SHALL yield no row rather than raising. Provenance (seed, neighbourhood size, distance
+metric, sampling, knee confidence, source embedding) SHALL be recorded in `extra_json`.
+
+#### Scenario: A computation failure does not fail the job or lose the bundle
+
+- **WHEN** statistics computation fails or times out in the prep service
+- **THEN** the job still succeeds and the core bundle (produced before the stats step) is delivered
+  without a statistics part
+
+#### Scenario: A stale engine degrades to skip
+
+- **WHEN** the installed `protspace` does not provide the `stats` subcommand
+- **THEN** the prep service detects this via a feature probe and skips statistics rather than failing
+
+#### Scenario: Results are reproducible
+
+- **WHEN** the same input is prepared twice with the same seed and projection
+- **THEN** the statistics values are identical, and each row records the seed used