GrotjahnLab · bbarad · Jun 28, 2026 · Jun 27, 2026 · Jun 27, 2026 · Jun 27, 2026
diff --git a/surface_morphometrics/accept_refinement.py b/surface_morphometrics/accept_refinement.py
@@ -164,7 +164,16 @@ def accept_one(work_dir, basename, step, radius_hit, dry_run):
             continue
         _remove(path, dry_run)
 
-    # 4. Warn if the promoted surface still needs a curvature graph.
+    # 4. Record which iteration was accepted (the per-iteration files are gone now,
+    #    so this one-line marker is how `morphometrics status` reports the step).
+    if not dry_run:
+        with open(f"{work_dir}{basename}.accepted_iter", "w") as handle:
+            handle.write(f"{step}\n")
+    else:
+        print(f"  [dry-run] would record accepted iteration {step} "
+              f"in {basename}.accepted_iter")
+
+    # 5. Warn if the promoted surface still needs a curvature graph.
     if not has_avv:
         print(f"  WARNING: iteration {step} was a lightweight (xcorr) iteration with no "
               f"curvature graph.")

diff --git a/surface_morphometrics/cli.py b/surface_morphometrics/cli.py
@@ -24,6 +24,10 @@
 _LAZY_COMMANDS = {
     "fetch_example": ("surface_morphometrics.fetch_example:fetch_example_cli",
                       "Download example data for testing (small Zenodo set or full EMPIAR data)."),
+    "validate": ("surface_morphometrics.validate:validate_cli",
+                 "Check a config and its seg/tomo folders are consistent before running."),
+    "status": ("surface_morphometrics.status:status_cli",
+               "Summarize what has been computed for each segmentation in the dataset."),
     "make_meshes": ("surface_morphometrics.segmentation_to_meshes:make_meshes_cli",
                     "Convert segmentation MRCs into membrane meshes (step 1)."),
     "pycurv": ("surface_morphometrics.run_pycurv:run_pycurv_cli",
@@ -60,7 +64,7 @@
 # so the standard pipeline order is obvious. `new_config` is registered eagerly
 # but listed here for display.
 _SECTIONS = [
-    ("Setup", ["new_config", "fetch_example"]),
+    ("Setup", ["new_config", "validate", "fetch_example", "status"]),
     ("Pipeline (run in this order)",
      ["make_meshes", "pycurv", "distances_orientations", "sample_density", "measure_thickness"]),
     ("Optional mesh refinement (after pycurv, before distances)",
@@ -75,6 +79,7 @@
 _NEXT_HINTS = {
     "new_config": "Edit the generated config.yml, then run:  morphometrics make_meshes config.yml",
     "fetch_example": "Next:  cd into the example folder, then run:  morphometrics make_meshes config.yml",
+    "validate": "Next:  if validation passed, run:  morphometrics make_meshes config.yml",
     "make_meshes": "Next:  morphometrics pycurv config.yml <name>.surface.vtp   (run per-surface, in parallel on a cluster if you can)",
     "pycurv": "Next:  morphometrics distances_orientations config.yml   (or first refine: morphometrics refine_mesh config.yml)",
     "refine_mesh": "Next:  inspect the convergence plots, then commit one iteration:  morphometrics accept_refinement config.yml <step>",

diff --git a/surface_morphometrics/status.py b/surface_morphometrics/status.py
@@ -0,0 +1,141 @@
+#! /usr/bin/env python
+"""Summarize what has been computed for each segmentation in a dataset.
+
+``status config.yml`` scans the work_dir and reports, per segmentation and per
+membrane surface, which steps have run: meshing, curvature, refinement (and the
+accepted iteration), and which distance / orientation / thickness measurements
+exist. Everything is *derived* from the files on disk (and the per-surface CSV
+column headers), so the report always reflects the current state -- nothing is
+maintained separately and there is no state to get stale.
+
+What each item is read from:
+  * mesh        -- {surface}.surface.vtp exists
+  * curvature   -- {surface}.AVV_rh{rh}.gt exists
+  * refinement  -- the {surface}.accepted_iter marker (written by accept_refinement),
+                   or leftover _refined_iter* files (run but not yet accepted)
+  * self-dist / verticality / inter / thickness -- columns in {surface}.AVV_rh{rh}.csv
+
+Usage:
+  morphometrics status config.yml
+  morphometrics status config.yml --output status.txt
+"""
+
+__author__ = "Benjamin Barad"
+__email__ = "benjamin.barad@gmail.com"
+__license__ = "GPLv3"
+
+import datetime
+import os
+import re
+from glob import glob
+
+import click
+
+from .config_utils import load_config
+
+DONE = "✓"   # check mark
+NONE = "—"   # em dash
+
+
+def _csv_columns(path):
+    """Column names from a CSV header (reads only the first line)."""
+    try:
+        with open(path) as handle:
+            return set(handle.readline().rstrip("\n").split(","))
+    except OSError:
+        return set()
+
+
+def _refine_status(work_dir, surface, radius_hit):
+    """Human-readable refinement state for one surface."""
+    marker = f"{work_dir}{surface}.accepted_iter"
+    if os.path.exists(marker):
+        try:
+            return f"accepted@iter{open(marker).read().strip()}"
+        except OSError:
+            return "accepted"
+    iters = glob(f"{work_dir}{surface}_refined_iter*.surface.vtp")
+    if iters:
+        steps = sorted(int(m.group(1)) for m in
+                       (re.search(r"_refined_iter(\d+)\.surface\.vtp$", p) for p in iters) if m)
+        span = f"iter{steps[0]}-{steps[-1]}" if len(steps) > 1 else f"iter{steps[0]}"
+        return f"run ({span}, not accepted)"
+    if os.path.exists(f"{work_dir}{surface}.AVV_rh{radius_hit}.gt.orig.bak"):
+        return "accepted (step unknown)"
+    return NONE
+
+
+@click.command()
+@click.argument("configfile", type=click.Path(exists=True))
+@click.option("-o", "--output", "output_path", default=None, type=click.Path(),
+              help="Also write the summary to this text file.")
+def status_cli(configfile, output_path):
+    """Report what has been computed for each segmentation (derived from work_dir)."""
+    config = load_config(configfile, require=("seg_dir", "work_dir", "segmentation_values"))
+    seg_dir = config["seg_dir"]
+    work_dir = config["work_dir"]
+    tomo_dir = config.get("tomo_dir")
+    labels = list(config["segmentation_values"])
+    radius_hit = config["curvature_measurements"]["radius_hit"]
+
+    lines = []
+
+    def emit(text=""):
+        lines.append(text)
+
+    emit(f"Status {NONE} {work_dir}  (radius_hit {radius_hit})        "
+         f"{datetime.date.today().isoformat()}")
+    emit()
+
+    seg_files = sorted(glob(f"{seg_dir}*.mrc"))
+    tomo_bases = ([os.path.basename(t)[:-4] for t in sorted(glob(f"{tomo_dir}*.mrc"))]
+                  if tomo_dir and os.path.isdir(tomo_dir) else [])
+
+    if not seg_files:
+        emit(f"(no segmentations found in {seg_dir})")
+
+    for seg in seg_files:
+        seg_base = os.path.basename(seg)[:-4]
+        tnote = ""
+        if tomo_bases:
+            tnote = (f"   [tomogram {DONE}]" if any(seg_base.startswith(t) for t in tomo_bases)
+                     else "   [tomogram missing]")
+        emit(f"{seg_base}{tnote}")
+
+        meshed = []
+        for label in labels:
+            surface = f"{seg_base}_{label}"
+            if not os.path.exists(f"{work_dir}{surface}.surface.vtp"):
+                continue
+            meshed.append(label)
+            curv = os.path.exists(f"{work_dir}{surface}.AVV_rh{radius_hit}.gt")
+            cols = _csv_columns(f"{work_dir}{surface}.AVV_rh{radius_hit}.csv") if curv else set()
+            refine = _refine_status(work_dir, surface, radius_hit)
+            partners = [o for o in labels if o != label and f"{o}_dist" in cols]
+            emit(f"  {label:<8} mesh {DONE}  curv {DONE if curv else NONE}  "
+                 f"refine {refine}  self-dist {DONE if 'self_dist_min' in cols else NONE}  "
+                 f"vert {DONE if 'verticality' in cols else NONE}  "
+                 f"inter {', '.join(partners) if partners else NONE}  "
+                 f"thickness {DONE if 'thickness' in cols else NONE}")
+
+        if not meshed:
+            emit("  (no meshes yet)")
+        else:
+            missing = [label for label in labels if label not in meshed]
+            if missing:
+                emit(f"  (no mesh for: {', '.join(missing)})")
+        emit()
+
+    emit(f"legend:  {DONE} done {NONE} not done   inter: partner surfaces with "
+         "distance/orientation measured")
+
+    report = "\n".join(lines)
+    click.echo(report)
+    if output_path:
+        with open(output_path, "w") as handle:
+            handle.write(report + "\n")
+        click.echo(f"\nWrote {output_path}")
+
+
+if __name__ == "__main__":
+    status_cli()
diff --git a/surface_morphometrics/validate.py b/surface_morphometrics/validate.py
@@ -0,0 +1,189 @@
+#! /usr/bin/env python
+"""Validate that a config.yml and its input folders are set up consistently.
+
+Run this before the pipeline to catch the common setup mistakes up front:
+
+  * ``seg_dir`` exists and holds segmentation MRCs (an error if not),
+  * ``tomo_dir`` exists (a *warning* only -- tomograms are needed for density
+    sampling, thickness, and refinement, but not for the curvature-only pipeline),
+  * each segmentation has a matching tomogram under the prefix convention the rest
+    of the pipeline uses (the tomogram name is a prefix of the segmentation name),
+    so a name mismatch surfaces here instead of as a silently skipped tomogram,
+  * which of the configured ``segmentation_values`` labels actually appear in each
+    segmentation (flagging segmentations where none of them appear, or where a label
+    value is present that the config does not map).
+
+Usage:
+  morphometrics validate config.yml
+"""
+
+__author__ = "Benjamin Barad"
+__email__ = "benjamin.barad@gmail.com"
+__license__ = "GPLv3"
+
+import os
+import sys
+from glob import glob
+
+import click
+import numpy as np
+
+from .config_utils import load_config
+
+
+def _present_label_values(path):
+    """Set of integer label values present in a segmentation MRC (0/background dropped).
+
+    Reads slice-by-slice via mmap so a tomogram-sized volume does not have to be held
+    in memory at once. Returns None if the data could not be read.
+    """
+    import mrcfile
+
+    present = set()
+    try:
+        with mrcfile.mmap(path, permissive=True, mode="r") as mrc:
+            data = mrc.data
+            if data is None:
+                return None
+            for plane in (data if data.ndim == 3 else [data]):
+                for value in np.unique(plane):
+                    rounded = round(float(value))
+                    if rounded != 0 and abs(float(value) - rounded) < 1e-3:
+                        present.add(int(rounded))
+    except Exception:
+        return None
+    return present
+
+
+@click.command()
+@click.argument("configfile", type=click.Path(exists=True))
+def validate_cli(configfile):
+    """Check that CONFIGFILE and its seg/tomo folders are set up consistently.
+
+    Verifies the input directories exist, that every segmentation has a matching
+    tomogram, and reports which configured labels are present in each segmentation.
+    Exits non-zero if there are errors (warnings do not fail).
+    """
+    config = load_config(configfile, require=("seg_dir", "segmentation_values"))
+    seg_dir = config["seg_dir"]
+    tomo_dir = config.get("tomo_dir")
+    seg_values = config["segmentation_values"]
+    value_to_name = {int(v): name for name, v in seg_values.items()}
+
+    errors = 0
+    warnings = 0
+
+    def error(msg):
+        nonlocal errors
+        errors += 1
+        click.echo(f"  ERROR:   {msg}")
+
+    def warn(msg):
+        nonlocal warnings
+        warnings += 1
+        click.echo(f"  WARNING: {msg}")
+
+    def ok(msg):
+        click.echo(f"  OK:      {msg}")
+
+    click.echo(f"Validating {configfile}\n")
+
+    # --- input folders ---
+    click.echo("Folders:")
+    seg_dir_ok = os.path.isdir(seg_dir)
+    if seg_dir_ok:
+        ok(f"seg_dir exists: {seg_dir}")
+    else:
+        error(f"seg_dir does not exist: {seg_dir}")
+
+    tomo_note = "(needed only for density sampling, thickness, and refinement)"
+    tomo_present = bool(tomo_dir) and os.path.isdir(tomo_dir)
+    if not tomo_dir:
+        warn(f"tomo_dir is not set {tomo_note}")
+    elif tomo_present:
+        ok(f"tomo_dir exists: {tomo_dir}")
+    else:
+        warn(f"tomo_dir does not exist: {tomo_dir} {tomo_note}")
+
+    if not seg_dir_ok:
+        return _finish(errors, warnings)
+
+    seg_files = sorted(glob(seg_dir + "*.mrc"))
+    if not seg_files:
+        error(f"no .mrc segmentations found in seg_dir: {seg_dir}")
+        return _finish(errors, warnings)
+
+    tomo_bases = ([os.path.basename(t)[:-4] for t in sorted(glob(tomo_dir + "*.mrc"))]
+                  if tomo_present else [])
+
+    click.echo("\nConfigured labels: "
+               + ", ".join(f"{name}={value}" for name, value in seg_values.items()))
+    click.echo(f"\nSegmentations ({len(seg_files)} found):")
+    configured = set(value_to_name)
+
+    for seg in seg_files:
+        seg_base = os.path.basename(seg)[:-4]
+        click.echo(f"\n  {os.path.basename(seg)}")
+
+        # Matching tomogram: its basename is a prefix of the segmentation basename.
+        if tomo_present:
+            matches = sorted((t for t in tomo_bases if seg_base.startswith(t)),
+                             key=len, reverse=True)
+            if matches:
+                ok(f"matching tomogram: {matches[0]}.mrc")
+            else:
+                warn(f"no matching tomogram in {tomo_dir} (expected one whose name is a "
+                     f"prefix of '{seg_base}'); density/thickness/refinement will skip "
+                     "this segmentation -- check for a name mismatch")
+
+        # Which configured labels actually appear in the volume.
+        present = _present_label_values(seg)
+        if present is None:
+            warn("could not read MRC data to check label values")
+            continue
+        present_configured = sorted(configured & present)
+        if present_configured:
+            ok("labels present: "
+               + ", ".join(f"{value_to_name[v]}({v})" for v in present_configured))
+        else:
+            warn("none of the configured label values appear in this segmentation -- "
+                 "check segmentation_values")
+        absent = sorted(configured - present)
+        if absent:
+            click.echo("           configured but absent here: "
+                       + ", ".join(f"{value_to_name[v]}({v})" for v in absent))
+        extra = sorted(present - configured)
+        if extra:
+            warn(f"label value(s) present but not in segmentation_values: {extra}")
+
+    # The other direction: tomograms that match no segmentation. A tomogram is "used"
+    # if some segmentation name starts with it (the same prefix rule); an unmatched one
+    # is usually a name mismatch and will simply be skipped by the pipeline.
+    if tomo_present:
+        seg_bases = [os.path.basename(s)[:-4] for s in seg_files]
+        click.echo(f"\nTomograms ({len(tomo_bases)} found):")
+        orphans = [t for t in tomo_bases if not any(s.startswith(t) for s in seg_bases)]
+        if orphans:
+            for t in orphans:
+                warn(f"tomogram {t}.mrc matches no segmentation in {seg_dir} (no "
+                     "segmentation name starts with it); it will not be used -- check "
+                     "for a name mismatch")
+        else:
+            ok("every tomogram matches a segmentation")
+
+    return _finish(errors, warnings)
+
+
+def _finish(errors, warnings):
+    click.echo("\n" + "-" * 60)
+    if errors:
+        click.echo(f"FAILED: {errors} error(s), {warnings} warning(s).")
+        sys.exit(1)
+    elif warnings:
+        click.echo(f"OK with {warnings} warning(s).")
+    else:
+        click.echo("All checks passed.")
+
+
+if __name__ == "__main__":
+    validate_cli()