diff --git a/pyproject.toml b/pyproject.toml index 583b622..c64133b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "surface-morphometrics" -version = "2.0.0b1" +version = "2.0.0b2" description = "Quantification of membrane surfaces segmented from cryo-ET or other volumetric imaging." readme = "README.md" requires-python = ">=3.11" @@ -57,4 +57,4 @@ morphometrics = "surface_morphometrics.cli:main" include = ["surface_morphometrics*"] [tool.setuptools.package-data] -surface_morphometrics = ["config_template.yml"] +surface_morphometrics = ["config_template.yml", "config_template_simple.yml"] diff --git a/surface_morphometrics/_thickness_worker.py b/surface_morphometrics/_thickness_worker.py index ac5ee55..992867e 100644 --- a/surface_morphometrics/_thickness_worker.py +++ b/surface_morphometrics/_thickness_worker.py @@ -8,6 +8,26 @@ import numpy as np import scipy.optimize as opt +# Bilayer fit-quality thresholds, shared by the per-triangle and global fits. +R2_THRESHOLD = 0.6 +MIN_THICKNESS = 2.0 # nm (minimum accepted peak-to-peak leaflet separation) +MAX_THICKNESS = 7.0 # nm (maximum accepted peak-to-peak leaflet separation) +# A second peak only counts as a real leaflet if it rises at least this fraction +# of the strongest peak's height above the shared solvent baseline. Comparing +# height-above-base (not find_peaks prominence) keeps a merged bilayer's second +# leaflet -- which can have a near-zero saddle and so a tiny prominence -- while +# still rejecting small flanking shoulders (e.g. neighboring protein density). +MIN_LEAFLET_HEIGHT_RATIO = 0.5 + +# "Prior-recovery" tier: when the surface's global average resolves a clean bilayer, +# a triangle whose locally-averaged profile merges into a single peak can still hold a +# real (just blurred) bilayer. Seeded from the global fit, its dual fit is trusted +# only if it is non-degenerate and center-stable -- distinguishing a recovered merged +# bilayer from a genuine single / skewed peak: +MIN_LEAFLET_AMP_RATIO = 0.3 # both leaflets must have comparable amplitude +MAX_CENTER_DISAGREEMENT = 0.7 # nm; dual center must agree with a single-Gaussian fit +PINNED_THICKNESS_EPS = 0.15 # nm above MIN_THICKNESS; reject floor-pinned (fully merged) fits + def _monogaussian(x, h, c, w): """Single gaussian function.""" @@ -24,6 +44,189 @@ def _monogaussian_with_offset(x, h, c, w, o): return _monogaussian(x, h, c, w) + o +def _dual_gaussian_centered(x, h1, w1, h2, w2, center, half_sep, o): + """Dual Gaussian parameterized by the bilayer center and half-separation. + + The two leaflet peaks sit at ``center - half_sep`` and ``center + half_sep``, + so bounding ``half_sep`` to a physical range forces one Gaussian into each + leaflet -- the fit can never collapse both peaks into the same leaflet (the + failure mode of seeding both Gaussians at the global-max peak). ``center`` is + the membrane center (the refinement offset); ``2 * half_sep`` is the thickness. + """ + return (_monogaussian(x, h1, center - half_sep, w1) + + _monogaussian(x, h2, center + half_sep, w2) + o) + + +def _dual_gaussian_shared_width(x, h1, h2, w, center, half_sep, o): + """Dual Gaussian like :func:`_dual_gaussian_centered` but with one shared width. + + A bilayer's two leaflets have essentially the same width, so tying them to a + single ``w`` is more physical -- and, critically, removes a centering bias: + with independent widths the fit absorbs *asymmetric* outer density (a heavier + flank on one side, CTF ringing) by making one leaflet wider than the other, and + because both peaks share a single ``center`` that width-tilt is converted into a + systematic shift of the fitted center toward one side. Sharing the width leaves + the center free to sit at the true bilayer midpoint. + """ + return _dual_gaussian_centered(x, h1, w, h2, w, center, half_sep, o) + + +def _symmetric_fit_window(a, b, center_seed, half_seed, buffer=1.5, min_half_width=2.5): + """Window the profile symmetrically about ``center_seed`` for the dual fit. + + The window is scaled to the seeded bilayer (``half_seed + buffer``) and kept + symmetric about the seed: if the profile extends further on one side than the + other, the window is trimmed to the shorter side so the fit sees an equal x-range + each way. An asymmetric range would otherwise give the least-squares fit more + leverage on one side and pull the center off the midpoint. Falls back to the full + region if the symmetric window would be too small to fit. + """ + a = np.asarray(a) + b = np.asarray(b) + half_width = max(half_seed + buffer, min_half_width) + half_width = min(half_width, center_seed - float(a.min()), float(a.max()) - center_seed) + mask = (a >= center_seed - half_width) & (a <= center_seed + half_width) + if int(mask.sum()) >= 5: + return a[mask], b[mask] + return a, b + + +def _seed_bilayer_center(a, b, default_center=0.0): + """Estimate (center, half_sep, n_resolved) of a bilayer profile region. + + Enumerates every local maximum (no prominence floor), anchors on the tallest + peak -- the dominant membrane density -- and pairs it with a partner leaflet a + physical bilayer thickness (MIN..MAX nm) away. The partner is validated by its + height above the shared solvent baseline (``b.min()``), not by ``find_peaks`` + prominence. This matters for two reasons: + + * A merged/thin bilayer (e.g. an OMM at this resolution) has two real leaflets + separated by a near-flat saddle, so the shorter leaflet's *prominence* is tiny + and any prominence floor -- or comparing the two leaflets' prominences -- drops + it. Height above the common base is independent of saddle depth, so both + leaflets of a merged bilayer are kept. + * Comparing height-above-base still rejects small flanking shoulders (neighbouring + protein density), which sit well below ``MIN_LEAFLET_HEIGHT_RATIO`` of the + membrane peak, and the MIN..MAX window rejects distant confounders. + + Being permissive here is safe: the caller validates every dual fit by R^2 and + fitted thickness, so a spurious partner is discarded downstream, whereas a missed + leaflet would wrongly skip the dual fit entirely. ``n_resolved`` is the number of + real leaflets (2 -> fit a dual Gaussian; 1 -> a single central peak, fall back to + a single Gaussian; 0 -> nothing resolved). ``half_sep`` is clamped to the physical + thickness range. + """ + from scipy.signal import find_peaks + + a = np.asarray(a) + b = np.asarray(b) + # All strict local maxima. No prominence floor: a merged bilayer's second leaflet + # has near-zero prominence but is still a real leaflet; it is filtered below by + # height-above-base and separation instead. + peaks, _ = find_peaks(b) + if len(peaks) == 0: + return default_center, MIN_THICKNESS / 2.0, 0 + + positions = a[peaks] + heights = b[peaks] + base = float(b.min()) + # Anchor on the tallest peak (dominant membrane density), then look for the + # partner leaflet: a peak a physical bilayer thickness (MIN..MAX) away whose + # height above the shared solvent base is a real fraction of the anchor's. + primary = int(np.argmax(heights)) + primary_rise = float(heights[primary] - base) + partner = None + best_partner_rise = 0.0 + for j in range(len(peaks)): + if j == primary: + continue + sep = abs(positions[j] - positions[primary]) + if not (MIN_THICKNESS <= sep <= MAX_THICKNESS): + continue + partner_rise = float(heights[j] - base) + if primary_rise <= 0: + continue + if (partner_rise >= MIN_LEAFLET_HEIGHT_RATIO * primary_rise + and partner_rise > best_partner_rise): + partner = j + best_partner_rise = partner_rise + if partner is not None: + p_left, p_right = sorted([positions[primary], positions[partner]]) + center = 0.5 * (p_left + p_right) + half = float(np.clip(0.5 * (p_right - p_left), + MIN_THICKNESS / 2.0, MAX_THICKNESS / 2.0)) + return float(center), half, 2 + # One real peak (a single, possibly unresolved, membrane with weak shoulders). + return float(positions[primary]), MIN_THICKNESS / 2.0, 1 + + +def _leaflet_resolution(a, b): + """Continuous bilayer-resolution score in [0, 1] for a profile region. + + This is the continuous form of the resolution gate: the height (above the shared + solvent base) of the best second leaflet within a physical bilayer distance of the + dominant one, as a fraction of the dominant leaflet's height. A clearly resolved + bilayer scores near 1; a profile with only a weak shoulder scores low; a fully + merged single peak scores 0. The gate accepts a triangle as strictly resolved at + ``MIN_LEAFLET_HEIGHT_RATIO`` (0.5); below that a thickness is only obtained by + prior recovery and reads systematically thin, so the score doubles as a + per-triangle reliability flag for the reported thickness. + """ + from scipy.signal import find_peaks + + a = np.asarray(a) + b = np.asarray(b) + peaks, _ = find_peaks(b) + if len(peaks) < 2: + return 0.0 + positions = a[peaks] + heights = b[peaks] + base = float(b.min()) + primary = int(np.argmax(heights)) + primary_rise = float(heights[primary] - base) + if primary_rise <= 0: + return 0.0 + best = 0.0 + for j in range(len(peaks)): + if j == primary: + continue + if MIN_THICKNESS <= abs(positions[j] - positions[primary]) <= MAX_THICKNESS: + best = max(best, float(heights[j] - base) / primary_rise) + return float(min(best, 1.0)) + + +def _dual_recovery_ok(a, b, popt, r2, center_seed): + """Accept a global-prior-seeded dual fit on a single-peak (merged) triangle? + + Used only when the surface's global average resolves a bilayer but this triangle's + local profile shows one peak. The dual fit is trusted only if it is a real, + non-degenerate, centered bilayer rather than a forced split of a single peak: + + * a good fit with a physical thickness clear of the floor -- a floor-pinned fit + means the leaflets merged completely, so there is no real separation to report; + * both leaflets comparable in amplitude (neither Gaussian collapsed to nothing); + * the dual center agrees with an independent single-Gaussian center -- a genuine + skewed single peak splits asymmetrically and the two disagree. + + ``popt`` is the ``_dual_gaussian_shared_width`` result (h1, h2, w, center, half, o). + """ + thickness = 2.0 * popt[4] + if not (r2 > R2_THRESHOLD + and MIN_THICKNESS + PINNED_THICKNESS_EPS < thickness <= MAX_THICKNESS): + return False + h1, h2 = popt[0], popt[1] + if min(h1, h2) < MIN_LEAFLET_AMP_RATIO * max(h1, h2): + return False + try: + ps, _ = opt.curve_fit(_monogaussian_with_offset, a, b, + [0.03, center_seed, 2.5, 0], + bounds=([0.005, center_seed - 6, 1.0, -1], + [0.06, center_seed + 6, 5.0, 1])) + except Exception: + return False + return abs(popt[3] - ps[1]) < MAX_CENTER_DISAGREEMENT + + def _compute_r_squared(y_true, y_pred): """Compute R² (coefficient of determination).""" ss_res = np.sum((y_true - y_pred) ** 2) @@ -213,7 +416,6 @@ def compute_thickness_chunk(indices): dat = dat / (80/81 * dat_sum) try: - ipk = _lt_x_positions[np.argmax(dat)] mid = len(dat) // 2 left_min = np.argmin(dat[:mid]) right_min = np.argmin(dat[mid:]) + mid @@ -221,16 +423,26 @@ def compute_thickness_chunk(indices): a = _lt_x_positions[left_min+2:right_min-2] b = dat[left_min+2:right_min-2] - if len(a) >= 7: - p0 = [0.02, ipk-0.5, 1.5, 0.02, ipk+0.5, 1.5, 0] - bounds = ([0.005, ipk-6, 0.8, 0.005, ipk-1.5, 0.8, -1], - [0.04, ipk+1.5, 2.2, 0.04, ipk+6, 2.2, 1]) - popt, _ = opt.curve_fit(_dual_gaussian, a, b, p0, bounds=bounds) - thickness = np.abs(popt[4] - popt[1]) - if 2.0 <= thickness <= 7.0: - results.append(thickness) - else: - results.append(np.nan) + if len(a) < 7: + results.append(np.nan) + continue + + # Same quality gate as fit_triangle_chunk: require two resolved leaflets, + # then a centered + shared-width fit accepted only on R^2 and a physical + # thickness; otherwise NaN (no valid bilayer thickness). + center_seed, half_seed, n_resolved = _seed_bilayer_center(a, b) + if n_resolved < 2: + results.append(np.nan) + continue + af, bf = _symmetric_fit_window(a, b, center_seed, half_seed) + p0 = [0.02, 0.02, 1.5, center_seed, half_seed, 0.0] + bounds = ([0.005, 0.005, 0.8, center_seed - 3.0, MIN_THICKNESS / 2.0, -1], + [0.04, 0.04, 2.2, center_seed + 3.0, MAX_THICKNESS / 2.0, 1]) + popt, _ = opt.curve_fit(_dual_gaussian_shared_width, af, bf, p0, bounds=bounds) + r2 = _compute_r_squared(bf, _dual_gaussian_shared_width(af, *popt)) + thickness = 2.0 * popt[4] + if r2 > R2_THRESHOLD and MIN_THICKNESS <= thickness <= MAX_THICKNESS: + results.append(thickness) else: results.append(np.nan) except Exception: @@ -280,7 +492,10 @@ def fit_triangle_chunk(indices): Returns ------- list of tuples - [(thickness, offset), ...] for each triangle in chunk + [(thickness, offset, resolution), ...] for each triangle in chunk. + ``thickness`` is NaN where no valid bilayer was measured; ``resolution`` is + the per-triangle bilayer-resolution score in [0, 1] (NaN if there was no + profile to score) -- a reliability flag for the reported thickness. """ sample_spacing = _worker_x[1] - _worker_x[0] if len(_worker_x) > 1 else 1.0 max_shift_samples = max(1, int(round(2.0 / sample_spacing))) @@ -293,7 +508,7 @@ def fit_triangle_chunk(indices): neighbors = _worker_neighbor_indices[i][valid_mask] if len(neighbors) == 0: - results.append((np.nan, 0)) + results.append((np.nan, 0, np.nan)) continue weights = 1.0 / (1.0 + l) @@ -305,7 +520,7 @@ def fit_triangle_chunk(indices): dat = dat - dat.min() dat_sum = dat.sum() if dat_sum == 0: - results.append((np.nan, 0)) + results.append((np.nan, 0, np.nan)) continue dat = dat / (80 / 81 * dat_sum) else: @@ -316,7 +531,7 @@ def fit_triangle_chunk(indices): row_sums = indiv.sum(axis=1, keepdims=True) valid_rows = row_sums.flatten() > 0 if not np.any(valid_rows): - results.append((np.nan, 0)) + results.append((np.nan, 0, np.nan)) continue indiv[valid_rows] /= (80 / 81 * row_sums[valid_rows]) @@ -339,12 +554,6 @@ def fit_triangle_chunk(indices): else: dat = dat0 - # Find initial peak and set up fitting bounds - ipk = _worker_x[np.argmax(dat)] - p0 = [0.02, ipk-0.5, 1.5, 0.02, ipk+0.5, 1.5, 0] - bounds = ([0.005, ipk-6, 0.8, 0.005, ipk-1.5, 0.8, -1], - [0.04, ipk+1.5, 2.2, 0.04, ipk+6, 2.2, 1]) - # Find minima to constrain fitting region mid = len(dat) // 2 left_min = np.argmin(dat[:mid]) @@ -353,17 +562,58 @@ def fit_triangle_chunk(indices): a = _worker_x[left_min+2:right_min-2] b = dat[left_min+2:right_min-2] - if len(a) < 7: # Need enough points for 7-parameter fit - results.append((np.nan, 0)) + if len(a) < 7: # Need enough points for the fit + results.append((np.nan, 0, np.nan)) + continue + + # Per-triangle reliability flag reported alongside every thickness: how clearly + # the two leaflets are resolved (1 = cleanly separated, 0 = a single merged + # peak). Strict fits score >= MIN_LEAFLET_HEIGHT_RATIO; prior-recovered fits + # below it (and read systematically thin), so downstream can weight or filter. + resolution = _leaflet_resolution(a, b) + + # Quality gate, two tiers (mirrors the refinement path): + # * resolved (n_resolved >= 2): fit and accept on R^2 + physical thickness; + # * prior recovery (n_resolved < 2 but the global average resolved a bilayer): + # a locally-merged profile may still hold a real, blurred bilayer -- fit it + # seeded from the global prior and accept only a non-degenerate, + # center-stable result (_dual_recovery_ok). Without a prior, a single peak + # has no measurable thickness -> NaN. + center_seed, half_seed, n_resolved = _seed_bilayer_center(a, b) + have_prior = _worker_global_fit_params is not None + if have_prior: + c1_g, w1_g, c2_g, w2_g = _worker_global_fit_params + seed_c = 0.5 * (c1_g + c2_g) + seed_h = float(np.clip(0.5 * (c2_g - c1_g), + MIN_THICKNESS / 2.0, MAX_THICKNESS / 2.0)) + seed_w = w1_g + else: + seed_c, seed_h, seed_w = center_seed, half_seed, 1.5 + + recovery = n_resolved < 2 and have_prior + if n_resolved < 2 and not recovery: + results.append((np.nan, 0, resolution)) continue try: - p3, _ = opt.curve_fit(_dual_gaussian, a, b, p0, bounds=bounds) - thickness = np.abs(p3[4] - p3[1]) - offset = (p3[4] + p3[1]) / 2 - results.append((thickness, offset)) + af, bf = _symmetric_fit_window(a, b, seed_c, seed_h) + p0 = [0.02, 0.02, seed_w, seed_c, seed_h, 0.0] + bounds = ([0.005, 0.005, 0.8, seed_c - 3.0, MIN_THICKNESS / 2.0, -1], + [0.04, 0.04, 2.2, seed_c + 3.0, MAX_THICKNESS / 2.0, 1]) + p3, _ = opt.curve_fit(_dual_gaussian_shared_width, af, bf, p0, bounds=bounds) + r2 = _compute_r_squared(bf, _dual_gaussian_shared_width(af, *p3)) + thickness = 2.0 * p3[4] + offset = p3[3] + if recovery: + accept = _dual_recovery_ok(a, b, p3, r2, seed_c) + else: + accept = r2 > R2_THRESHOLD and MIN_THICKNESS <= thickness <= MAX_THICKNESS + if accept: + results.append((thickness, offset, resolution)) + else: + results.append((np.nan, 0, resolution)) except Exception: - results.append((np.nan, 0)) + results.append((np.nan, 0, resolution)) return results @@ -394,10 +644,8 @@ def fit_triangle_chunk_offsets(indices): [(offset, sigma1, sigma2, fit_method), ...] for each triangle in chunk. fit_method: 0=failed, 1=dual_gaussian, 2=single_gaussian, 3=centroid, 4=xcorr """ - # Fit quality thresholds - R2_THRESHOLD = 0.6 - MIN_THICKNESS = 2.0 # nm - MAX_THICKNESS = 7.0 # nm + # Fit-quality thresholds are module-level constants (R2_THRESHOLD, + # MIN_THICKNESS, MAX_THICKNESS). # Sample spacing — constant across all triangles, computed once per chunk sample_spacing = _worker_x[1] - _worker_x[0] if len(_worker_x) > 1 else 1.0 @@ -479,40 +727,74 @@ def fit_triangle_chunk_offsets(indices): best_sigma2 = np.nan best_method = 0 - # Step 1: Try dual Gaussian (unless monolayer mode) - if not _worker_monolayer: + # Step 1: Try dual Gaussian (unless monolayer mode). Parameterized by the + # bilayer center + half-separation so the two leaflet peaks are forced to + # opposite sides and cannot collapse into the same leaflet. + # + # The global average fit (when available) gives more reliable seeds for the + # center/half/width. The dual fit runs in two tiers: + # * resolved (n_resolved >= 2): a clear two-leaflet profile -> fit and accept + # on R^2 + physical thickness (high confidence); + # * prior recovery (n_resolved < 2 but the global average resolved a bilayer): + # a locally-merged profile may still hold a real, blurred bilayer. Seed it + # from the global fit but accept only a non-degenerate, center-stable result + # (_dual_recovery_ok), so a genuine single / skewed peak is still rejected + # and falls through to the single Gaussian rather than inventing a bilayer. + tri_center, tri_half, n_resolved = _seed_bilayer_center(a, b) + have_prior = _worker_global_fit_params is not None + if have_prior: + c1_g, w1_g, c2_g, w2_g = _worker_global_fit_params + center_seed = 0.5 * (c1_g + c2_g) + half_seed = float(np.clip(0.5 * (c2_g - c1_g), + MIN_THICKNESS / 2.0, MAX_THICKNESS / 2.0)) + w1_seed = w2_seed = w1_g + else: + center_seed, half_seed = tri_center, tri_half + w1_seed = w2_seed = 1.5 + + recovery = n_resolved < 2 and have_prior # tier 2 + if not _worker_monolayer and (n_resolved >= 2 or recovery): try: - if _worker_global_fit_params is not None: - # Use global fit parameters for tighter, better-informed initialization - c1_g, w1_g, c2_g, w2_g = _worker_global_fit_params - p0 = [0.02, c1_g, w1_g, 0.02, c2_g, w2_g, 0] - bounds = ([0.005, c1_g - 2, max(0.6, w1_g * 0.5), 0.005, c2_g - 2, max(0.6, w2_g * 0.5), -1], - [0.04, c1_g + 2, min(2.5, w1_g * 2.0), 0.04, c2_g + 2, min(2.5, w2_g * 2.0), 1]) + # Restrict the fit to a window symmetric about the seeded bilayer so + # distant density (adjacent membranes, CTF ringing) cannot pull the + # fit out and an asymmetric range cannot bias the center. + af, bf = _symmetric_fit_window(a, b, center_seed, half_seed) + + # params: h1, h2, shared_w, center, half_sep, offset. A single shared + # width keeps asymmetric flanks from tilting the fit and shifting the + # center (see _dual_gaussian_shared_width). + w_seed = 0.5 * (w1_seed + w2_seed) + p0 = [0.02, 0.02, w_seed, center_seed, half_seed, 0.0] + bounds = ([0.005, 0.005, 0.8, center_seed - 3.0, MIN_THICKNESS / 2.0, -1], + [0.04, 0.04, 2.2, center_seed + 3.0, MAX_THICKNESS / 2.0, 1]) + p3, _ = opt.curve_fit(_dual_gaussian_shared_width, af, bf, p0, bounds=bounds) + + # Validate fit (stricter, center-stable check for the recovery tier). + r2 = _compute_r_squared(bf, _dual_gaussian_shared_width(af, *p3)) + thickness = 2.0 * p3[4] + if recovery: + accept = _dual_recovery_ok(a, b, p3, r2, center_seed) else: - p0 = [0.02, ipk-0.5, 1.5, 0.02, ipk+0.5, 1.5, 0] - bounds = ([0.005, ipk-6, 0.8, 0.005, ipk-1.5, 0.8, -1], - [0.04, ipk+1.5, 2.2, 0.04, ipk+6, 2.2, 1]) - p3, _ = opt.curve_fit(_dual_gaussian, a, b, p0, bounds=bounds) - - # Validate fit - y_pred = _dual_gaussian(a, *p3) - r2 = _compute_r_squared(b, y_pred) - thickness = np.abs(p3[4] - p3[1]) + accept = r2 > R2_THRESHOLD and MIN_THICKNESS <= thickness <= MAX_THICKNESS - if r2 > R2_THRESHOLD and MIN_THICKNESS <= thickness <= MAX_THICKNESS: - best_offset = (p3[1] + p3[4]) / 2 + if accept: + best_offset = p3[3] # bilayer center best_sigma1 = p3[2] - best_sigma2 = p3[5] + best_sigma2 = p3[2] best_method = 1 except Exception: pass - # Step 2: Try single Gaussian if dual didn't work + # Step 2: Try single Gaussian if dual didn't work. For an unresolved triangle + # the global membrane center (when available) is a more robust seed than this + # triangle's noisy argmax, which can latch onto a spurious bump; bounds stay + # wide so a genuinely shifted membrane can still be found. if best_method == 0: try: - p0_mono = [0.03, ipk, 2.5, 0] - bounds_mono = ([0.005, ipk-6, 1.0, -1], - [0.06, ipk+6, 5.0, 1]) + mono_seed = center_seed if _worker_global_fit_params is not None else ipk + p0_mono = [0.03, mono_seed, 2.5, 0] + bounds_mono = ([0.005, mono_seed-6, 1.0, -1], + [0.06, mono_seed+6, 5.0, 1]) p_mono, _ = opt.curve_fit(_monogaussian_with_offset, a, b, p0_mono, bounds=bounds_mono) # Validate fit diff --git a/surface_morphometrics/accept_refinement.py b/surface_morphometrics/accept_refinement.py index f9a66b3..3cc6d04 100644 --- a/surface_morphometrics/accept_refinement.py +++ b/surface_morphometrics/accept_refinement.py @@ -25,12 +25,30 @@ __license__ = "GPLv3" import os +import re from glob import glob from pathlib import Path import click import yaml +from .config_utils import load_config + + +def _iterations_by_surface(work_dir): + """Map each surface basename to its sorted available refinement iterations. + + Refinement stops early once a surface converges, so different surfaces may + have different numbers of `_refined_iter*` files. + """ + pattern = re.compile(r"^(.*)_refined_iter(\d+)\.surface\.vtp$") + out = {} + for path in glob(f"{work_dir}*_refined_iter*.surface.vtp"): + match = pattern.match(os.path.basename(path)) + if match: + out.setdefault(match.group(1), []).append(int(match.group(2))) + return {base: sorted(iters) for base, iters in out.items()} + # The canonical surface files the downstream pipeline consumes. Only these are # backed up (from the original) and promoted (from the accepted iteration). @@ -185,8 +203,7 @@ def accept_refinement_cli(configfile, step, component_name, tomogram, dry_run): graph, the command warns that pycurv must be re-run on the promoted surface before any downstream analysis. """ - with open(configfile) as f: - config = yaml.safe_load(f) + config = load_config(configfile, require=("work_dir",)) work_dir = config.get("work_dir", config.get("seg_dir", "./")) if not work_dir.endswith("/"): @@ -200,11 +217,10 @@ def accept_refinement_cli(configfile, step, component_name, tomogram, dry_run): print("DRY RUN - no files will be changed") print("-" * 60) - # Discover surfaces that have the requested iteration by finding their - # accepted-step surface files, then strip the tag to recover the basename. - iter_tag = f"_refined_iter{step}.surface.vtp" - candidates = sorted(glob(f"{work_dir}*{iter_tag}")) - basenames = [os.path.basename(p)[:-len(iter_tag)] for p in candidates] + # Discover every refined surface and the iterations it actually has (a surface + # that converged early stops before `step`). + iters_by_surface = _iterations_by_surface(work_dir) + basenames = sorted(iters_by_surface) # Apply optional component / tomogram filters on the basename. if tomogram: @@ -213,14 +229,23 @@ def accept_refinement_cli(configfile, step, component_name, tomogram, dry_run): basenames = [b for b in basenames if b.endswith(f"_{component_name}")] if not basenames: - print(f"No surfaces with a refined iteration {step} found in {work_dir}" - + (f" matching the given filters." if (component_name or tomogram) else ".")) + print(f"No refined surfaces found in {work_dir}" + + (" matching the given filters." if (component_name or tomogram) else ".")) return accepted = 0 needs_pycurv = [] for basename in basenames: - result = accept_one(work_dir, basename, step, radius_hit, dry_run) + available = iters_by_surface[basename] + # Use the requested iteration, or the last available one if refinement + # converged before it (largest available iteration <= step). + usable = [i for i in available if i <= step] or available + effective_step = usable[-1] + if effective_step != step: + print(f" WARNING: {basename}: iteration {step} not available - refinement " + f"converged early at iteration {effective_step}; using iteration " + f"{effective_step}.") + result = accept_one(work_dir, basename, effective_step, radius_hit, dry_run) if result is not None: accepted += 1 if result is False: @@ -229,7 +254,8 @@ def accept_refinement_cli(configfile, step, component_name, tomogram, dry_run): print("-" * 60) verb = "Would accept" if dry_run else "Accepted" - print(f"{verb} iteration {step} for {accepted} surface(s).") + print(f"{verb} refinement for {accepted} surface(s) (requested iteration {step}; " + "early-converged surfaces used their last iteration).") if accepted: print("Originals backed up with a .orig.bak suffix; only canonical surfaces " "and refinement summaries were kept.") @@ -239,7 +265,7 @@ def accept_refinement_cli(configfile, step, component_name, tomogram, dry_run): print("iteration and have no curvature graph. Re-run pycurv before any downstream") print("step (distances, density sampling, thickness):") for basename in needs_pycurv: - print(f" python run_pycurv.py {configfile} {basename}.surface.vtp") + print(f" morphometrics pycurv {configfile} {basename}.surface.vtp") if __name__ == "__main__": diff --git a/surface_morphometrics/cli.py b/surface_morphometrics/cli.py index c4313b0..3bea3a8 100644 --- a/surface_morphometrics/cli.py +++ b/surface_morphometrics/cli.py @@ -173,20 +173,34 @@ def cli(): @click.option("-o", "--output", "output_path", default="config.yml", type=click.Path(), show_default=True, help="Where to write the config file.") +@click.option("--simple/--verbose", "simple", default=False, + help="Write a minimal config with only the commonly-adjusted settings " + "(--simple), or the fully annotated template exposing every option " + "(--verbose, the default).") @click.option("--force", is_flag=True, default=False, help="Overwrite the output file if it already exists.") -def new_config(output_path, force): - """Write a template config.yml into the current directory.""" +def new_config(output_path, simple, force): + """Write a template config.yml into the current directory. + + Omitted settings fall back to documented defaults, so a --simple config is fully + runnable; use --verbose to see and tune every available option. + """ from importlib.resources import files if os.path.exists(output_path) and not force: raise click.ClickException( f"{output_path} already exists; use --force to overwrite." ) - template = files("surface_morphometrics").joinpath("config_template.yml").read_text() + template_name = "config_template_simple.yml" if simple else "config_template.yml" + template = files("surface_morphometrics").joinpath(template_name).read_text() with open(output_path, "w") as handle: handle.write(template) - click.echo(f"Wrote {output_path}. Edit it to point at your data and set parameters.") + kind = "minimal" if simple else "full" + click.echo(f"Wrote {kind} config to {output_path}. Edit it to point at your data " + "and set parameters.") + if simple: + click.echo("Omitted settings use sensible defaults; run " + "`morphometrics new_config --verbose` to see every option.") cli.add_command(new_config) diff --git a/surface_morphometrics/config_template_simple.yml b/surface_morphometrics/config_template_simple.yml new file mode 100644 index 0000000..14f7525 --- /dev/null +++ b/surface_morphometrics/config_template_simple.yml @@ -0,0 +1,36 @@ +# Minimal surface_morphometrics configuration. +# +# Only the most commonly-adjusted settings are listed here; every other option uses +# a sensible default. Run `morphometrics new_config --verbose` for the fully +# annotated template that exposes every setting. + +# --- Directories (required) --- +seg_dir: "./segmentations/" # Directory of segmentation MRC files +tomo_dir: "./tomograms/" # Directory of tomogram MRC files (density sampling, thickness, refinement) +work_dir: "./morphometrics/" # Directory for output files + +# --- Features (required): map each feature name to its segmentation label value --- +segmentation_values: + OMM: 1 + IMM: 2 + ER: 3 + +cores: 6 # Number of parallel processes + +# --- Mesh generation --- +surface_generation: + angstroms: false # false rescales surfaces to nm; true keeps angstrom scale + isotropic_remesh: true # Remesh to near-equilateral triangles of ~target_area + target_area: 1.0 # Target triangle area in nm^2 (when isotropic_remesh is true) + simplify: false # Instead of remeshing, decimate to simplify_max_triangles + simplify_max_triangles: 300000 # Only used when simplify is true + +# --- Thickness measurement --- +thickness_measurements: + average_radius: 12 # Radius in nm for local averaging of density profiles + +# --- Mesh refinement (optional step) --- +mesh_refinement: + average_radius: 25 # Radius in nm for local averaging of density profiles + iterations: 6 # Number of refinement iterations + xcorr_iterations: [1, 2, 3] # Iterations that cross-correlate to sharpen the bilayer before fitting diff --git a/surface_morphometrics/config_utils.py b/surface_morphometrics/config_utils.py new file mode 100644 index 0000000..af2e8f7 --- /dev/null +++ b/surface_morphometrics/config_utils.py @@ -0,0 +1,214 @@ +#! /usr/bin/env python +"""Shared helpers for loading the pipeline config.yml.""" + +__author__ = "Benjamin Barad" +__email__ = "benjamin.barad@gmail.com" +__license__ = "GPLv3" + +import copy + +import click +import yaml + +# Directory settings that the pipeline joins with filenames by plain string +# concatenation (e.g. work_dir + "*.gt"), so they must end in a path separator. +DIR_KEYS = ("seg_dir", "tomo_dir", "work_dir") + +# Sensible defaults for every parameter section, so a stripped-down config that +# omits them still runs as documented. These mirror config_template.yml (kept in +# sync by tests/test_config_utils.py) and are the single source of truth for default +# values. User-data keys with no sensible default are intentionally excluded and left +# for the user / each command: top-level segmentation_values and exp_name, and within +# sections the membership/selection lists (distance intra/inter, thickness components, +# patch_analysis membrane_label). +DEFAULTS = { + "cores": 6, + "surface_generation": { + "angstroms": False, + "ultrafine": False, + "extrapolation_distance": 1.5, + "isotropic_remesh": True, + "target_area": 1.0, + "simplify": False, + "simplify_max_triangles": 300000, + "octree_depth": 9, + "point_weight": 0.7, + "neighbor_count": 400, + "smoothing_iterations": 1, + }, + "curvature_measurements": { + "radius_hit": 9, + "min_component": 30, + "exclude_borders": 1, + }, + "distance_and_orientation_measurements": { + "mindist": 3, + "maxdist": 400, + "tolerance": 0.1, + "verticality": True, + "relative_orientation": True, + # intra (which surfaces) and inter (which pairs) are left out on purpose so the + # command can tell "omitted" (-> default to all surfaces / all pairs) from + # "explicitly empty" (-> measure nothing). See measure_distances_orientations. + }, + "density_sampling": { + "sample_spacing": 0.25, + "scan_range": 10, + }, + "thickness_measurements": { + "average_radius": 12, + "fit_curve": True, + }, + "patch_analysis": { + "patch_radius": 12, + "particle_max_distance": 24, + "generate_random": True, + "random_min_distance": 12, + "random_seed": 0, + "star_dir": "./star/", + "star_pattern": "{tomo}.star", + "star_tomo_column": None, + "star_coord_columns": ["rlnCoordinateX", "rlnCoordinateY", "rlnCoordinateZ"], + "star_pixelsize_column": "rlnPixelSize", + "star_coords_in_pixels": True, + "annotate_star": True, + "min_component_size": 0, + "statistics_properties": ["curvedness_VV", "thickness"], + }, + "mesh_refinement": { + "iterations": 6, + "damping_factor": 0.9, + "monolayer": False, + "xcorr_iterations": [1, 2, 3], + "average_radius": 25, + "average_radius_decay": 1.0, + "average_radius_min": 12.0, + "max_total_offset": 8, + "smooth_offsets": True, + "offset_smoothing_radius": 25, + "laplacian_iterations": 5, + "laplacian_lambda": 0.5, + "lowpass_sigma": 0, + "convergence_threshold": 0.05, + }, +} + +# Human-readable purpose of each setting, used in the missing-required-key error. +_KEY_HELP = { + "seg_dir": "directory of segmentation MRC files", + "work_dir": "directory for pipeline output files", + "tomo_dir": "directory of tomogram MRC files (needed for density sampling, " + "thickness, and mesh refinement)", + "segmentation_values": "mapping of feature name to its segmentation label value " + "(e.g. OMM: 1)", +} + + +def normalize_dirs(config): + """Append a trailing "/" to any set directory path that lacks one. + + Mutates and returns ``config``. Only ``seg_dir``/``tomo_dir``/``work_dir`` that + are non-empty strings are touched; empty or unset values are left alone so each + command can apply its own defaulting. + """ + if isinstance(config, dict): + for key in DIR_KEYS: + value = config.get(key) + if isinstance(value, str) and value and not value.endswith("/"): + config[key] = value + "/" + return config + + +def merge_defaults(config): + """Fill in default parameter sections so partial configs run as documented. + + Each section in :data:`DEFAULTS` is merged shallowly into ``config``: any + parameter the user did not set takes its default, while user-set values (and any + extra keys, like a section's membership lists) are kept. Mutates and returns + ``config``. Defaults are deep-copied so the module-level :data:`DEFAULTS` is never + mutated by downstream code. + """ + if not isinstance(config, dict): + return config + # Back-compat: density-sampling settings used to live under thickness_measurements. + # Carry them over before defaults are applied so old configs keep their values. + if "density_sampling" not in config and isinstance(config.get("thickness_measurements"), dict): + legacy = {k: config["thickness_measurements"][k] + for k in ("sample_spacing", "scan_range") + if k in config["thickness_measurements"]} + if legacy: + config["density_sampling"] = legacy + for section, default in DEFAULTS.items(): + if isinstance(default, dict): + merged = copy.deepcopy(default) + user = config.get(section) + if isinstance(user, dict): + merged.update(user) + config[section] = merged + elif config.get(section) is None: + config[section] = copy.deepcopy(default) + return config + + +def resolve_distance_targets(dist_settings, labels): + """Resolve which surfaces/pairs to measure distances for, defaulting to all-vs-all. + + Omitting ``intra``/``inter`` (key absent) opts in to the all-vs-all default: every + surface measured against itself, and every unordered pair against each other. + Setting either explicitly -- including an empty ``[]`` / ``{}`` -- is honored as + given, so an explicit empty means "measure nothing". Returns + ``(intra, inter, intra_defaulted, inter_defaulted)`` where the booleans say whether + the all-vs-all default was applied (so the caller can announce it). + """ + labels = list(labels) + intra = dist_settings.get("intra") + intra_defaulted = intra is None + if intra_defaulted: + intra = labels + inter = dist_settings.get("inter") + inter_defaulted = inter is None + if inter_defaulted: + # Upper triangle only; callers write each pair bidirectionally. + inter = {labels[i]: labels[i + 1:] for i in range(len(labels) - 1) if labels[i + 1:]} + return intra, inter, intra_defaulted, inter_defaulted + + +def require_keys(config, required, configfile=""): + """Raise a clean CLI error if any ``required`` top-level key is unset. + + A key counts as unset if it is missing or maps to an empty/None value. All + missing keys are reported together so the user fixes the config in one pass. + ``required`` is per-command: each command demands only the settings it uses + (e.g. tomogram steps require ``tomo_dir``; analysis/export steps need only + ``work_dir``). + """ + missing = [key for key in required if not (isinstance(config, dict) and config.get(key))] + if missing: + target = configfile or "config.yml" + details = "\n".join( + f" - {key}" + (f" ({_KEY_HELP[key]})" if key in _KEY_HELP else "") + for key in missing + ) + raise click.ClickException( + f"{target} is missing required setting(s):\n{details}\n" + "Add them to your config.yml (start one with `morphometrics new_config`)." + ) + + +def load_config(configfile, require=()): + """Load a pipeline config YAML, normalizing directory paths and validating. + + A config written without trailing slashes (e.g. ``work_dir: ./morphometrics``) + behaves identically to one with them, since the pipeline builds paths by string + concatenation. Missing parameter sections are filled from :data:`DEFAULTS` + (:func:`merge_defaults`), so partial/stripped configs run as documented. + ``require`` is a tuple of top-level keys this command needs; if any is missing the + user gets one clean error (see :func:`require_keys`) instead of a ``KeyError`` + traceback partway through the run. + """ + with open(configfile) as handle: + config = yaml.safe_load(handle) or {} + normalize_dirs(config) + merge_defaults(config) + require_keys(config, require, configfile) + return config diff --git a/surface_morphometrics/export_obj.py b/surface_morphometrics/export_obj.py index 2ee2148..b502268 100644 --- a/surface_morphometrics/export_obj.py +++ b/surface_morphometrics/export_obj.py @@ -30,6 +30,8 @@ import numpy as np import yaml +from .config_utils import load_config + # --------------------------------------------------------------------------- # VTP reading (vtk) @@ -241,8 +243,7 @@ def export_obj_cli(configfile, vtp, feature, cmap, vmin, vmax, nan_color, patter CONFIGFILE: path to config.yml. VTP: a surface .vtp to export; if omitted, all matching surfaces in work_dir. """ - with open(configfile) as f: - config = yaml.safe_load(f) + config = load_config(configfile, require=("work_dir",)) work_dir = config.get("work_dir", config.get("seg_dir", "./")) if not work_dir.endswith("/"): work_dir += "/" diff --git a/surface_morphometrics/extract_patches.py b/surface_morphometrics/extract_patches.py index 5a33827..566d8a1 100644 --- a/surface_morphometrics/extract_patches.py +++ b/surface_morphometrics/extract_patches.py @@ -29,6 +29,8 @@ import numpy as np import yaml +from .config_utils import load_config + def mask_from_property(values, vmin=None, vmax=None): """Boolean mask of triangles whose property is within [vmin, vmax]. @@ -144,8 +146,7 @@ def extract_patches_cli(configfile, graph_file, label_col, prop, vmin, vmax, if (label_col is None) == (prop is None): raise click.UsageError("Specify exactly one of --by