diff --git a/surface_morphometrics/accept_refinement.py b/surface_morphometrics/accept_refinement.py index 3cc6d04..c7acd96 100644 --- a/surface_morphometrics/accept_refinement.py +++ b/surface_morphometrics/accept_refinement.py @@ -164,7 +164,16 @@ def accept_one(work_dir, basename, step, radius_hit, dry_run): continue _remove(path, dry_run) - # 4. Warn if the promoted surface still needs a curvature graph. + # 4. Record which iteration was accepted (the per-iteration files are gone now, + # so this one-line marker is how `morphometrics status` reports the step). + if not dry_run: + with open(f"{work_dir}{basename}.accepted_iter", "w") as handle: + handle.write(f"{step}\n") + else: + print(f" [dry-run] would record accepted iteration {step} " + f"in {basename}.accepted_iter") + + # 5. Warn if the promoted surface still needs a curvature graph. if not has_avv: print(f" WARNING: iteration {step} was a lightweight (xcorr) iteration with no " f"curvature graph.") diff --git a/surface_morphometrics/cli.py b/surface_morphometrics/cli.py index 18cd47d..c52e137 100644 --- a/surface_morphometrics/cli.py +++ b/surface_morphometrics/cli.py @@ -24,6 +24,10 @@ _LAZY_COMMANDS = { "fetch_example": ("surface_morphometrics.fetch_example:fetch_example_cli", "Download example data for testing (small Zenodo set or full EMPIAR data)."), + "validate": ("surface_morphometrics.validate:validate_cli", + "Check a config and its seg/tomo folders are consistent before running."), + "status": ("surface_morphometrics.status:status_cli", + "Summarize what has been computed for each segmentation in the dataset."), "make_meshes": ("surface_morphometrics.segmentation_to_meshes:make_meshes_cli", "Convert segmentation MRCs into membrane meshes (step 1)."), "pycurv": ("surface_morphometrics.run_pycurv:run_pycurv_cli", @@ -60,7 +64,7 @@ # so the standard pipeline order is obvious. `new_config` is registered eagerly # but listed here for display. _SECTIONS = [ - ("Setup", ["new_config", "fetch_example"]), + ("Setup", ["new_config", "validate", "fetch_example", "status"]), ("Pipeline (run in this order)", ["make_meshes", "pycurv", "distances_orientations", "sample_density", "measure_thickness"]), ("Optional mesh refinement (after pycurv, before distances)", @@ -75,6 +79,7 @@ _NEXT_HINTS = { "new_config": "Edit the generated config.yml, then run: morphometrics make_meshes config.yml", "fetch_example": "Next: cd into the example folder, then run: morphometrics make_meshes config.yml", + "validate": "Next: if validation passed, run: morphometrics make_meshes config.yml", "make_meshes": "Next: morphometrics pycurv config.yml .surface.vtp (run per-surface, in parallel on a cluster if you can)", "pycurv": "Next: morphometrics distances_orientations config.yml (or first refine: morphometrics refine_mesh config.yml)", "refine_mesh": "Next: inspect the convergence plots, then commit one iteration: morphometrics accept_refinement config.yml ", diff --git a/surface_morphometrics/status.py b/surface_morphometrics/status.py new file mode 100644 index 0000000..c92dca2 --- /dev/null +++ b/surface_morphometrics/status.py @@ -0,0 +1,141 @@ +#! /usr/bin/env python +"""Summarize what has been computed for each segmentation in a dataset. + +``status config.yml`` scans the work_dir and reports, per segmentation and per +membrane surface, which steps have run: meshing, curvature, refinement (and the +accepted iteration), and which distance / orientation / thickness measurements +exist. Everything is *derived* from the files on disk (and the per-surface CSV +column headers), so the report always reflects the current state -- nothing is +maintained separately and there is no state to get stale. + +What each item is read from: + * mesh -- {surface}.surface.vtp exists + * curvature -- {surface}.AVV_rh{rh}.gt exists + * refinement -- the {surface}.accepted_iter marker (written by accept_refinement), + or leftover _refined_iter* files (run but not yet accepted) + * self-dist / verticality / inter / thickness -- columns in {surface}.AVV_rh{rh}.csv + +Usage: + morphometrics status config.yml + morphometrics status config.yml --output status.txt +""" + +__author__ = "Benjamin Barad" +__email__ = "benjamin.barad@gmail.com" +__license__ = "GPLv3" + +import datetime +import os +import re +from glob import glob + +import click + +from .config_utils import load_config + +DONE = "✓" # check mark +NONE = "—" # em dash + + +def _csv_columns(path): + """Column names from a CSV header (reads only the first line).""" + try: + with open(path) as handle: + return set(handle.readline().rstrip("\n").split(",")) + except OSError: + return set() + + +def _refine_status(work_dir, surface, radius_hit): + """Human-readable refinement state for one surface.""" + marker = f"{work_dir}{surface}.accepted_iter" + if os.path.exists(marker): + try: + return f"accepted@iter{open(marker).read().strip()}" + except OSError: + return "accepted" + iters = glob(f"{work_dir}{surface}_refined_iter*.surface.vtp") + if iters: + steps = sorted(int(m.group(1)) for m in + (re.search(r"_refined_iter(\d+)\.surface\.vtp$", p) for p in iters) if m) + span = f"iter{steps[0]}-{steps[-1]}" if len(steps) > 1 else f"iter{steps[0]}" + return f"run ({span}, not accepted)" + if os.path.exists(f"{work_dir}{surface}.AVV_rh{radius_hit}.gt.orig.bak"): + return "accepted (step unknown)" + return NONE + + +@click.command() +@click.argument("configfile", type=click.Path(exists=True)) +@click.option("-o", "--output", "output_path", default=None, type=click.Path(), + help="Also write the summary to this text file.") +def status_cli(configfile, output_path): + """Report what has been computed for each segmentation (derived from work_dir).""" + config = load_config(configfile, require=("seg_dir", "work_dir", "segmentation_values")) + seg_dir = config["seg_dir"] + work_dir = config["work_dir"] + tomo_dir = config.get("tomo_dir") + labels = list(config["segmentation_values"]) + radius_hit = config["curvature_measurements"]["radius_hit"] + + lines = [] + + def emit(text=""): + lines.append(text) + + emit(f"Status {NONE} {work_dir} (radius_hit {radius_hit}) " + f"{datetime.date.today().isoformat()}") + emit() + + seg_files = sorted(glob(f"{seg_dir}*.mrc")) + tomo_bases = ([os.path.basename(t)[:-4] for t in sorted(glob(f"{tomo_dir}*.mrc"))] + if tomo_dir and os.path.isdir(tomo_dir) else []) + + if not seg_files: + emit(f"(no segmentations found in {seg_dir})") + + for seg in seg_files: + seg_base = os.path.basename(seg)[:-4] + tnote = "" + if tomo_bases: + tnote = (f" [tomogram {DONE}]" if any(seg_base.startswith(t) for t in tomo_bases) + else " [tomogram missing]") + emit(f"{seg_base}{tnote}") + + meshed = [] + for label in labels: + surface = f"{seg_base}_{label}" + if not os.path.exists(f"{work_dir}{surface}.surface.vtp"): + continue + meshed.append(label) + curv = os.path.exists(f"{work_dir}{surface}.AVV_rh{radius_hit}.gt") + cols = _csv_columns(f"{work_dir}{surface}.AVV_rh{radius_hit}.csv") if curv else set() + refine = _refine_status(work_dir, surface, radius_hit) + partners = [o for o in labels if o != label and f"{o}_dist" in cols] + emit(f" {label:<8} mesh {DONE} curv {DONE if curv else NONE} " + f"refine {refine} self-dist {DONE if 'self_dist_min' in cols else NONE} " + f"vert {DONE if 'verticality' in cols else NONE} " + f"inter {', '.join(partners) if partners else NONE} " + f"thickness {DONE if 'thickness' in cols else NONE}") + + if not meshed: + emit(" (no meshes yet)") + else: + missing = [label for label in labels if label not in meshed] + if missing: + emit(f" (no mesh for: {', '.join(missing)})") + emit() + + emit(f"legend: {DONE} done {NONE} not done inter: partner surfaces with " + "distance/orientation measured") + + report = "\n".join(lines) + click.echo(report) + if output_path: + with open(output_path, "w") as handle: + handle.write(report + "\n") + click.echo(f"\nWrote {output_path}") + + +if __name__ == "__main__": + status_cli() diff --git a/surface_morphometrics/validate.py b/surface_morphometrics/validate.py new file mode 100644 index 0000000..61322f0 --- /dev/null +++ b/surface_morphometrics/validate.py @@ -0,0 +1,189 @@ +#! /usr/bin/env python +"""Validate that a config.yml and its input folders are set up consistently. + +Run this before the pipeline to catch the common setup mistakes up front: + + * ``seg_dir`` exists and holds segmentation MRCs (an error if not), + * ``tomo_dir`` exists (a *warning* only -- tomograms are needed for density + sampling, thickness, and refinement, but not for the curvature-only pipeline), + * each segmentation has a matching tomogram under the prefix convention the rest + of the pipeline uses (the tomogram name is a prefix of the segmentation name), + so a name mismatch surfaces here instead of as a silently skipped tomogram, + * which of the configured ``segmentation_values`` labels actually appear in each + segmentation (flagging segmentations where none of them appear, or where a label + value is present that the config does not map). + +Usage: + morphometrics validate config.yml +""" + +__author__ = "Benjamin Barad" +__email__ = "benjamin.barad@gmail.com" +__license__ = "GPLv3" + +import os +import sys +from glob import glob + +import click +import numpy as np + +from .config_utils import load_config + + +def _present_label_values(path): + """Set of integer label values present in a segmentation MRC (0/background dropped). + + Reads slice-by-slice via mmap so a tomogram-sized volume does not have to be held + in memory at once. Returns None if the data could not be read. + """ + import mrcfile + + present = set() + try: + with mrcfile.mmap(path, permissive=True, mode="r") as mrc: + data = mrc.data + if data is None: + return None + for plane in (data if data.ndim == 3 else [data]): + for value in np.unique(plane): + rounded = round(float(value)) + if rounded != 0 and abs(float(value) - rounded) < 1e-3: + present.add(int(rounded)) + except Exception: + return None + return present + + +@click.command() +@click.argument("configfile", type=click.Path(exists=True)) +def validate_cli(configfile): + """Check that CONFIGFILE and its seg/tomo folders are set up consistently. + + Verifies the input directories exist, that every segmentation has a matching + tomogram, and reports which configured labels are present in each segmentation. + Exits non-zero if there are errors (warnings do not fail). + """ + config = load_config(configfile, require=("seg_dir", "segmentation_values")) + seg_dir = config["seg_dir"] + tomo_dir = config.get("tomo_dir") + seg_values = config["segmentation_values"] + value_to_name = {int(v): name for name, v in seg_values.items()} + + errors = 0 + warnings = 0 + + def error(msg): + nonlocal errors + errors += 1 + click.echo(f" ERROR: {msg}") + + def warn(msg): + nonlocal warnings + warnings += 1 + click.echo(f" WARNING: {msg}") + + def ok(msg): + click.echo(f" OK: {msg}") + + click.echo(f"Validating {configfile}\n") + + # --- input folders --- + click.echo("Folders:") + seg_dir_ok = os.path.isdir(seg_dir) + if seg_dir_ok: + ok(f"seg_dir exists: {seg_dir}") + else: + error(f"seg_dir does not exist: {seg_dir}") + + tomo_note = "(needed only for density sampling, thickness, and refinement)" + tomo_present = bool(tomo_dir) and os.path.isdir(tomo_dir) + if not tomo_dir: + warn(f"tomo_dir is not set {tomo_note}") + elif tomo_present: + ok(f"tomo_dir exists: {tomo_dir}") + else: + warn(f"tomo_dir does not exist: {tomo_dir} {tomo_note}") + + if not seg_dir_ok: + return _finish(errors, warnings) + + seg_files = sorted(glob(seg_dir + "*.mrc")) + if not seg_files: + error(f"no .mrc segmentations found in seg_dir: {seg_dir}") + return _finish(errors, warnings) + + tomo_bases = ([os.path.basename(t)[:-4] for t in sorted(glob(tomo_dir + "*.mrc"))] + if tomo_present else []) + + click.echo("\nConfigured labels: " + + ", ".join(f"{name}={value}" for name, value in seg_values.items())) + click.echo(f"\nSegmentations ({len(seg_files)} found):") + configured = set(value_to_name) + + for seg in seg_files: + seg_base = os.path.basename(seg)[:-4] + click.echo(f"\n {os.path.basename(seg)}") + + # Matching tomogram: its basename is a prefix of the segmentation basename. + if tomo_present: + matches = sorted((t for t in tomo_bases if seg_base.startswith(t)), + key=len, reverse=True) + if matches: + ok(f"matching tomogram: {matches[0]}.mrc") + else: + warn(f"no matching tomogram in {tomo_dir} (expected one whose name is a " + f"prefix of '{seg_base}'); density/thickness/refinement will skip " + "this segmentation -- check for a name mismatch") + + # Which configured labels actually appear in the volume. + present = _present_label_values(seg) + if present is None: + warn("could not read MRC data to check label values") + continue + present_configured = sorted(configured & present) + if present_configured: + ok("labels present: " + + ", ".join(f"{value_to_name[v]}({v})" for v in present_configured)) + else: + warn("none of the configured label values appear in this segmentation -- " + "check segmentation_values") + absent = sorted(configured - present) + if absent: + click.echo(" configured but absent here: " + + ", ".join(f"{value_to_name[v]}({v})" for v in absent)) + extra = sorted(present - configured) + if extra: + warn(f"label value(s) present but not in segmentation_values: {extra}") + + # The other direction: tomograms that match no segmentation. A tomogram is "used" + # if some segmentation name starts with it (the same prefix rule); an unmatched one + # is usually a name mismatch and will simply be skipped by the pipeline. + if tomo_present: + seg_bases = [os.path.basename(s)[:-4] for s in seg_files] + click.echo(f"\nTomograms ({len(tomo_bases)} found):") + orphans = [t for t in tomo_bases if not any(s.startswith(t) for s in seg_bases)] + if orphans: + for t in orphans: + warn(f"tomogram {t}.mrc matches no segmentation in {seg_dir} (no " + "segmentation name starts with it); it will not be used -- check " + "for a name mismatch") + else: + ok("every tomogram matches a segmentation") + + return _finish(errors, warnings) + + +def _finish(errors, warnings): + click.echo("\n" + "-" * 60) + if errors: + click.echo(f"FAILED: {errors} error(s), {warnings} warning(s).") + sys.exit(1) + elif warnings: + click.echo(f"OK with {warnings} warning(s).") + else: + click.echo("All checks passed.") + + +if __name__ == "__main__": + validate_cli() diff --git a/tests/test_status.py b/tests/test_status.py new file mode 100644 index 0000000..4341656 --- /dev/null +++ b/tests/test_status.py @@ -0,0 +1,83 @@ +"""Tests for the `morphometrics status` command (derived from work_dir artifacts).""" +from click.testing import CliRunner + +from surface_morphometrics.status import status_cli, _refine_status + + +def _touch(path): + open(path, "w").close() + + +def _csv(path, columns): + with open(path, "w") as handle: + handle.write(",".join(columns) + "\n") + + +def _dataset(tmp_path): + """A work_dir with IMM fully processed (+accepted iter 4) and OMM meshed only.""" + seg, work = tmp_path / "segs", tmp_path / "work" + seg.mkdir(); work.mkdir() + _touch(seg / "TS1_labels.mrc") + base = "TS1_labels" + # IMM: full pipeline + _touch(work / f"{base}_IMM.surface.vtp") + _touch(work / f"{base}_IMM.AVV_rh9.gt") + _csv(work / f"{base}_IMM.AVV_rh9.csv", + ["index", "kappa_1", "verticality", "self_dist_min", + "OMM_dist", "OMM_orientation", "thickness", "bilayer_resolution"]) + (work / f"{base}_IMM.accepted_iter").write_text("4\n") + # OMM: meshed only (no curvature yet) + _touch(work / f"{base}_OMM.surface.vtp") + cfg = tmp_path / "c.yml" + cfg.write_text( + f"seg_dir: {seg}/\nwork_dir: {work}/\n" + "segmentation_values:\n IMM: 1\n OMM: 2\n ER: 3\n" + "curvature_measurements:\n radius_hit: 9\n") + return cfg + + +def test_status_reports_per_surface_state(tmp_path): + r = CliRunner().invoke(status_cli, [str(_dataset(tmp_path))]) + assert r.exit_code == 0 + out = r.output + # IMM: everything done, accepted iter shown, inter partner + thickness derived + assert "accepted@iter4" in out + assert "inter OMM" in out + assert "thickness ✓" in out + # OMM: meshed but no curvature -> no measurements + assert "OMM" in out and "curv —" in out + # configured-but-unmeshed label noted + assert "no mesh for: ER" in out + + +def test_status_writes_output_file(tmp_path): + out = tmp_path / "status.txt" + r = CliRunner().invoke(status_cli, [str(_dataset(tmp_path)), "-o", str(out)]) + assert r.exit_code == 0 and out.exists() + assert "accepted@iter4" in out.read_text() + + +def test_refine_status_run_but_not_accepted(tmp_path): + work = tmp_path / "w"; work.mkdir() + for n in (1, 2, 3): + _touch(work / f"S_IMM_refined_iter{n}.surface.vtp") + state = _refine_status(str(work) + "/", "S_IMM", 9) + assert "not accepted" in state and "iter1-3" in state + + +def test_accept_refinement_writes_marker(tmp_path): + from surface_morphometrics.accept_refinement import accept_one + work = str(tmp_path) + "/" + base = "TS1_labels_IMM" + _touch(tmp_path / f"{base}_refined_iter2.surface.vtp") # the iteration to accept + accept_one(work, base, 2, radius_hit=9, dry_run=False) + assert (tmp_path / f"{base}.accepted_iter").read_text().strip() == "2" + + +def test_accept_refinement_marker_dry_run(tmp_path): + from surface_morphometrics.accept_refinement import accept_one + work = str(tmp_path) + "/" + base = "TS1_labels_IMM" + _touch(tmp_path / f"{base}_refined_iter2.surface.vtp") + accept_one(work, base, 2, radius_hit=9, dry_run=True) + assert not (tmp_path / f"{base}.accepted_iter").exists() # dry-run writes nothing diff --git a/tests/test_validate.py b/tests/test_validate.py new file mode 100644 index 0000000..dcc4438 --- /dev/null +++ b/tests/test_validate.py @@ -0,0 +1,91 @@ +"""Tests for the `morphometrics validate` command.""" +import numpy as np +import pytest + +mrcfile = pytest.importorskip("mrcfile") +from click.testing import CliRunner + +from surface_morphometrics.validate import validate_cli, _present_label_values + + +def _write_seg(path, values): + """Write a tiny segmentation MRC containing exactly the given label values.""" + arr = np.zeros((4, 4, 4), dtype=np.int8) + for i, v in enumerate(values): + arr.flat[i] = v + with mrcfile.new(str(path), overwrite=True) as mrc: + mrc.set_data(arr) + + +def _config(path, seg_dir, segvals, tomo_dir=None): + lines = [f"seg_dir: {seg_dir}/"] + if tomo_dir is not None: + lines.append(f"tomo_dir: {tomo_dir}/") + lines.append("segmentation_values:") + lines += [f" {name}: {val}" for name, val in segvals.items()] + path.write_text("\n".join(lines) + "\n") + + +def test_present_label_values(tmp_path): + p = tmp_path / "s.mrc" + _write_seg(p, [1, 2, 2, 3]) + assert _present_label_values(str(p)) == {1, 2, 3} + + +def test_validate_matches_tomogram_and_reports_labels(tmp_path): + seg, tomo = tmp_path / "segs", tmp_path / "tomos" + seg.mkdir(); tomo.mkdir() + _write_seg(seg / "TS1_labels.mrc", [1, 2]) + _write_seg(tomo / "TS1.mrc", [0]) # tomogram content is irrelevant here + cfg = tmp_path / "c.yml" + _config(cfg, seg, {"IMM": 1, "OMM": 2}, tomo_dir=tomo) + r = CliRunner().invoke(validate_cli, [str(cfg)]) + assert r.exit_code == 0 + assert "matching tomogram: TS1.mrc" in r.output # prefix match + assert "IMM(1), OMM(2)" in r.output + assert "All checks passed." in r.output + + +def test_validate_warns_on_name_mismatch(tmp_path): + seg, tomo = tmp_path / "segs", tmp_path / "tomos" + seg.mkdir(); tomo.mkdir() + _write_seg(seg / "AAA_labels.mrc", [1]) + _write_seg(tomo / "BBB.mrc", [0]) # name does not prefix-match the seg + cfg = tmp_path / "c.yml" + _config(cfg, seg, {"IMM": 1}, tomo_dir=tomo) + r = CliRunner().invoke(validate_cli, [str(cfg)]) + assert r.exit_code == 0 # warnings do not fail + assert "no matching tomogram" in r.output + + +def test_validate_warns_on_orphan_tomogram(tmp_path): + seg, tomo = tmp_path / "segs", tmp_path / "tomos" + seg.mkdir(); tomo.mkdir() + _write_seg(seg / "TS1_labels.mrc", [1]) + _write_seg(tomo / "TS1.mrc", [0]) # matched by the segmentation + _write_seg(tomo / "EXTRA.mrc", [0]) # orphan: no segmentation starts with it + cfg = tmp_path / "c.yml" + _config(cfg, seg, {"IMM": 1}, tomo_dir=tomo) + r = CliRunner().invoke(validate_cli, [str(cfg)]) + assert r.exit_code == 0 + assert "EXTRA.mrc matches no segmentation" in r.output + assert "matching tomogram: TS1.mrc" in r.output # the matched one still reported + + +def test_validate_warns_on_unmapped_label_value(tmp_path): + seg = tmp_path / "segs" + seg.mkdir() + _write_seg(seg / "TS1_labels.mrc", [1, 2]) # value 2 present but not configured + cfg = tmp_path / "c.yml" + _config(cfg, seg, {"IMM": 1}) + r = CliRunner().invoke(validate_cli, [str(cfg)]) + assert r.exit_code == 0 + assert "present but not in segmentation_values" in r.output + + +def test_validate_errors_on_missing_seg_dir(tmp_path): + cfg = tmp_path / "c.yml" + cfg.write_text("seg_dir: /no/such/dir/\nsegmentation_values:\n IMM: 1\n") + r = CliRunner().invoke(validate_cli, [str(cfg)]) + assert r.exit_code == 1 + assert "seg_dir does not exist" in r.output