From 63fb8e4f9bebb61dada174ac330258b43c0783bb Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sat, 16 May 2026 17:28:56 +0200 Subject: [PATCH 1/2] fix(summarize-insights): drop reader-engagement filler, reframe audience as downstream AI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'If you read nothing else, read this.' was a reader hook for humans, but the primary consumer of insights.md is downstream AI (another agent, a planning loop, a follow-on extractor) — token-density of useful signal matters more than engagement. The structural markers (## Bad news first, ### Likely deal-breakers, verdict labels) already do the work the prefix tried to do; restating it in prose burns tokens. Removed the prefix from render_bad_news_first. The substantive sentence ('Every item below is a signal the plan does not survive its own assumptions. Items are ordered by severity.') stays because it explains what's in the section. SKILL.md gains a new 'Audience and tone' section codifying the principle: no reader-engagement prefixes, no filler sentences whose only job is to motivate the next sentence, keep substantive explanations (signal, not filler). Replaced 'reader' with 'downstream consumer' in the no-sycophancy rule to reflect the audience reframe. Verified: all three reference plans regenerate insights.md cleanly with the prefix gone. Smoke 8/8, unittest 45/45. 
--- .../.claude/skills/summarize-insights/SKILL.md | 13 ++++++++++++- experiments/napkin_math/summarize_insights.py | 4 ++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/experiments/napkin_math/.claude/skills/summarize-insights/SKILL.md b/experiments/napkin_math/.claude/skills/summarize-insights/SKILL.md index e5bcf716..ac4f7efd 100644 --- a/experiments/napkin_math/.claude/skills/summarize-insights/SKILL.md +++ b/experiments/napkin_math/.claude/skills/summarize-insights/SKILL.md @@ -53,6 +53,17 @@ Any output classified DOOM or FRAGILE also gets a "bottom line" callout at the t The script does **not** invent thresholds for outputs the user did not declare. To get a verdict on an output, declare a threshold on it in the Monte Carlo settings file. +## Audience and tone + +The primary consumer of `insights.md` is **downstream AI** (another agent, a planning loop, a follow-on extractor), not a human reader skimming for 7 seconds. A project manager may read it secondarily, but the writing optimises for token-density of useful signal over engagement hooks. + +What that means concretely: + +- No reader-engagement prefixes ("If you read nothing else, read this", "Stop and pay attention", "Important:"). The structural markers (`## Bad news first`, `### Likely deal-breakers`, the verdict labels) already carry that weight; restating them in prose burns tokens. +- No filler sentences whose only job is to motivate the next sentence. Lead with the substantive claim. +- Keep substantive explanations (what a verdict label means, what a column shows, what makes an item belong in a section). Those are signal, not filler. +- Don't apologise for or hedge the bad news. State it. + ## Writing rules — apply to the script's output AND to anything you say back to the user about the insights These are not stylistic preferences. They are how this skill is meant to communicate. @@ -61,7 +72,7 @@ These are not stylistic preferences. 
They are how this skill is meant to communi 2. **No sugar-coating.** A 5% pass probability is "almost certainly fails", not "shows some challenges". A negative base-scenario number is "the plan is in trouble at its own central assumptions", not "may warrant further attention". Use the strongest accurate language; if the script's wording softens a result, fix the script. -3. **No sycophancy.** Never start a paragraph with "Great plan, but..." or "The team has done strong work; one concern is...". The reader has the plan in front of them. They do not need praise from the report. +3. **No sycophancy.** Never start a paragraph with "Great plan, but..." or "The team has done strong work; one concern is...". The downstream consumer has the plan available. It does not need praise from the report. 4. **No hedging phrases.** Banned in both the script's emitted text and in conversational reporting: - `the honest read is`, `frankly`, `to be fair`, `in fairness`, `candidly`, `let's be real`, `look, the truth is` diff --git a/experiments/napkin_math/summarize_insights.py b/experiments/napkin_math/summarize_insights.py index 9fc883d0..f15b6cc2 100644 --- a/experiments/napkin_math/summarize_insights.py +++ b/experiments/napkin_math/summarize_insights.py @@ -158,8 +158,8 @@ def render_bad_news_first(mc: dict | None, scenarios: dict | None, rows.append("## Bad news first") rows.append("") rows.append( - "If you read nothing else, read this. Every item below is a signal the plan does " - "not survive its own assumptions. Items are ordered by severity." + "Every item below is a signal the plan does not survive its own assumptions. " + "Items are ordered by severity." 
) rows.append("") From ee1a4c5f701e2fe4ab1ca83d4ddf19b8572d1bca Mon Sep 17 00:00:00 2001 From: Simon Strandgaard Date: Sat, 16 May 2026 19:38:38 +0200 Subject: [PATCH 2/2] feat(napkin-math): five quick wins from ChatGPT's v33 feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ChatGPT's review of v33 raised 15 items; this commit ships the five 'quick win' ones that fit on top of the existing per-run state. All five are runner-side analyses plus matching insights.md sections — no schema changes, no LLM prompt edits. §14 binding-gate frequency tracking. For every min() aggregate, the runner records which dependency provided the min value in each run, then aggregates conditional on the aggregate failing its threshold. Faraday demonstration: the weakest_program_gate fails in 9,826 of 10,000 runs; mil_std_cert_funding is the binder in 67% of those, cash_flow_trigger in 32%, inventory_overhang in 0.5%. That tells the reader which sub-gate to fix first; the previous output only knew it failed. §7 quartile pass-rates. For each threshold × driver, P(threshold passes | driver in bottom quartile) vs P(threshold passes | driver in top quartile). The delta in percentage points is much more actionable than Pearson r — 'P(coverage 99%) goes from 18% in worst-quartile satellite-failure runs to 74% in best-quartile' is a directly usable lever. §13 required-input thresholds. For each FAILING gate (P < 80%), find the input-bound restriction that would lift conditional pass rate to >= 80%. Empty list means no single-input restriction is enough — Faraday's weakest_program_gate gets an empty list, correctly diagnosing it as structurally unreachable. §8 missing-value priority. Rank missing_values entries by |delta_pp on worst gate| * (1 - pass_prob) * bound_width_ratio. The highest-scoring entries are the ones most worth replacing with real data instead of an assumed range. §10 model confidence grades. 
Per output, grade HIGH/MEDIUM/LOW based on the fraction of upstream input bounds anchored in 'data' vs 'assumption' and the average bound-width-to-base ratio. Cutoffs: data >= 70% AND width < 0.5 -> HIGH; data < 30% OR width > 1.5 -> LOW; else MEDIUM. The reasons array names the specific evidence. Five new render functions in summarize_insights.py emit these as separate sections after the existing verdict table. Five new unittest.TestCase methods (TestNewAnalysisBlocks) cover each block end-to-end against a small synthetic fixture. Smoke 8/8, unittest 50/50. Reference runs regenerated for Nuuk, Cross-Border Rail, Faraday, India Census. --- experiments/napkin_math/run_monte_carlo.py | 223 ++++++++++++++++++ experiments/napkin_math/summarize_insights.py | 117 +++++++++ .../napkin_math/tests/test_run_monte_carlo.py | 110 +++++++++ 3 files changed, 450 insertions(+) diff --git a/experiments/napkin_math/run_monte_carlo.py b/experiments/napkin_math/run_monte_carlo.py index 7257c470..c5925e85 100644 --- a/experiments/napkin_math/run_monte_carlo.py +++ b/experiments/napkin_math/run_monte_carlo.py @@ -248,6 +248,82 @@ def safe_pearson(x: np.ndarray, y: np.ndarray) -> float | None: return r +def is_min_aggregate(entry: dict) -> bool: + """Identify aggregates whose magnitude is not a usual surplus.""" + hint = (entry or {}).get("formula_hint") or "" + return "min(" in hint + + +def collect_min_aggregates(params: dict) -> dict[str, list[str]]: + """Map output_name -> ordered list of dependency output_names for min() aggregates.""" + out: dict[str, list[str]] = {} + for src in ("recommended_first_calculations", "derived_questions"): + for entry in params.get(src, []): + if is_min_aggregate(entry) and entry.get("output_name"): + out[entry["output_name"]] = list(entry.get("depends_on") or []) + return out + + +def threshold_passes(arr: np.ndarray, op: str, value: float) -> np.ndarray: + """Boolean mask of runs where the threshold passes, NaN runs counted as fails.""" + 
finite_mask = np.isfinite(arr) + pass_mask = np.zeros_like(arr, dtype=bool) + pass_mask[finite_mask] = THRESHOLD_OPS[op](arr[finite_mask], value) + return pass_mask + + +def quartile_pass_rates(input_arr: np.ndarray, threshold_pass: np.ndarray) -> dict | None: + """For one input × one threshold, return P(pass | input in bottom/top quartile).""" + if np.ptp(input_arr) == 0 or input_arr.size < 100: + return None + q1 = np.percentile(input_arr, 25) + q3 = np.percentile(input_arr, 75) + bottom_mask = input_arr <= q1 + top_mask = input_arr >= q3 + if bottom_mask.sum() < 20 or top_mask.sum() < 20: + return None + p_low = float(threshold_pass[bottom_mask].mean()) + p_high = float(threshold_pass[top_mask].mean()) + return {"p_pass_low_quartile": p_low, + "p_pass_high_quartile": p_high, + "delta_pp": round((p_high - p_low) * 100, 2)} + + +def required_input_percentile(input_arr: np.ndarray, threshold_pass: np.ndarray, + target_prob: float = 0.80) -> dict | None: + """For a failing threshold, find which percentile of the input is required + for conditional pass-probability to reach the target. Returns the direction + (the input has to stay below / above that percentile).""" + if np.ptp(input_arr) == 0 or input_arr.size < 100: + return None + # Try both directions. The actionable one is the one that gives the looser bound. 
+ best = None + for direction in ("above", "below"): + for pct in (5, 10, 15, 25, 33, 50, 67, 75, 85, 90, 95): + if direction == "above": + cutoff = float(np.percentile(input_arr, pct)) + mask = input_arr >= cutoff + else: + cutoff = float(np.percentile(input_arr, 100 - pct)) + mask = input_arr <= cutoff + if mask.sum() < 20: + continue + cond_p = float(threshold_pass[mask].mean()) + if cond_p >= target_prob: + # Looser bound = larger admitted fraction; either direction admits roughly (100 - pct)% of runs. NOTE(review): for "below" the cutoff is the (100 - pct)th percentile, yet pct itself is stored as input_percentile_cutoff - confirm consumers expect that. + admitted = mask.mean() + if best is None or admitted > best["admitted_fraction"]: + best = { + "direction": direction, + "input_percentile_cutoff": pct, + "cutoff_value": cutoff, + "conditional_pass_prob": round(cond_p, 4), + "admitted_fraction": round(float(admitted), 4), + } + break # take loosest pct in this direction + return best + + +def run(params_path: Path, bounds_path: Path, calc_path: Path, settings_path: Path | None, output_path: Path) -> dict: params = load_json(params_path) @@ -281,6 +357,8 @@ def run(params_path: Path, bounds_path: Path, calc_path: Path, name: list(inspect.signature(fn).parameters) for name, _, fn in plan } fn_lookup: dict[str, Any] = {name: fn for name, _, fn in plan} + min_aggregates = collect_min_aggregates(params) + binding_dep_arrays: dict[str, list[str | None]] = {name: [None] * n_runs for name in min_aggregates} for vid, spec in input_specs.items(): if "fixed" in spec: @@ -313,6 +391,11 @@ def run(params_path: Path, bounds_path: Path, calc_path: Path, continue output_arrays[name][i] = val pool[name] = float(val) + if name in min_aggregates: + for dep in min_aggregates[name]: + if dep in pool and pool[dep] == val: + binding_dep_arrays[name][i] = dep + break for name, counts in exception_counts.items(): for exc_name, count in counts.items(): @@ -394,6 +477,141 @@ def run(params_path: Path, bounds_path: Path, calc_path: Path, "top_inputs": [{"id": vid, "correlation": round(r, 4)} for vid, r in candidates[:5]] } + 
binding_gate_analysis: dict[str, dict] = {} + for agg_name, deps in min_aggregates.items(): + if agg_name not in thresholds_section: + continue + t = thresholds_section[agg_name] + agg_arr = output_arrays[agg_name] + fail_mask = ~threshold_passes(agg_arr, t["operator"], t["value"]) & np.isfinite(agg_arr) + if fail_mask.sum() == 0: + continue + counts: dict[str, int] = {dep: 0 for dep in deps} + for i in range(n_runs): + if not fail_mask[i]: + continue + d = binding_dep_arrays[agg_name][i] + if d is not None: + counts[d] = counts.get(d, 0) + 1 + total = sum(counts.values()) + if total == 0: + continue + binding_gate_analysis[agg_name] = { + "fail_count": int(fail_mask.sum()), + "binding_when_aggregate_fails": [ + {"dependency": d, "frequency": round(c / total, 4)} + for d, c in sorted(counts.items(), key=lambda x: -x[1]) if c > 0 + ], + } + + quartile_analysis: dict[str, list[dict]] = {} + required_input_thresholds: dict[str, list[dict]] = {} + for output_id, t in thresholds_section.items(): + if t.get("probability") is None: + continue + pass_arr = threshold_passes(output_arrays[output_id], t["operator"], t["value"]) + all_used = set(fn_signatures.get(output_id, [])) + changed = True + while changed: + changed = False + for other_name, other_args in fn_signatures.items(): + if other_name in all_used and not set(other_args).issubset(all_used): + all_used.update(other_args) + changed = True + rows = [] + req_rows = [] + for vid in input_arrays: + if vid not in all_used or vid not in input_specs or "bound" not in input_specs[vid]: + continue + q = quartile_pass_rates(input_arrays[vid], pass_arr) + if q is None: + continue + rows.append({"id": vid, **q}) + if t["probability"] is not None and t["probability"] < 0.80: + req = required_input_percentile(input_arrays[vid], pass_arr) + if req is not None: + req_rows.append({"id": vid, **req}) + if rows: + rows.sort(key=lambda x: -abs(x["delta_pp"])) + quartile_analysis[output_id] = rows[:5] + if req_rows: + 
req_rows.sort(key=lambda x: -x["admitted_fraction"]) + required_input_thresholds[output_id] = req_rows[:5] + + missing_value_priority: list[dict] = [] + missing_ids = [mv["id"] for mv in params.get("missing_values_to_estimate", [])] + for vid in missing_ids: + if vid not in input_arrays: + continue + bound = bounds.get(vid, {}) + low, high, base = bound.get("low", 0), bound.get("high", 0), bound.get("base", 0) + denom = max(abs(base), 1e-9) if base else max(abs(high - low), 1e-9) + bound_width_ratio = abs(high - low) / denom + score = 0.0 + worst_threshold: str | None = None + for output_id, t in thresholds_section.items(): + p = t.get("probability") + if p is None: + continue + rows = quartile_analysis.get(output_id, []) + entry = next((r for r in rows if r["id"] == vid), None) + if entry is None: + continue + impact = abs(entry["delta_pp"]) * (1.0 - p) * bound_width_ratio + if impact > score: + score = impact + worst_threshold = output_id + if worst_threshold: + missing_value_priority.append({ + "id": vid, + "score": round(score, 4), + "worst_gate": worst_threshold, + "bound_width_ratio": round(bound_width_ratio, 4), + "source": bound.get("source", "assumption"), + }) + missing_value_priority.sort(key=lambda x: -x["score"]) + + model_confidence: dict[str, dict] = {} + for output_id in outputs_section: + used: set[str] = set(fn_signatures.get(output_id, [])) + changed = True + while changed: + changed = False + for other_name, other_args in fn_signatures.items(): + if other_name in used and not set(other_args).issubset(used): + used.update(other_args) + changed = True + bound_inputs = [vid for vid in used if vid in input_specs and "bound" in input_specs[vid]] + if not bound_inputs: + model_confidence[output_id] = {"grade": "HIGH", + "reasons": ["all inputs are fixed values"]} + continue + data_n = sum(1 for vid in bound_inputs if bounds.get(vid, {}).get("source") == "data") + assumption_n = len(bound_inputs) - data_n + data_fraction = data_n / len(bound_inputs) + 
widths = [] + for vid in bound_inputs: + b = bounds.get(vid, {}) + base = b.get("base", 0) + denom = max(abs(base), 1e-9) if base else max(abs(b.get("high", 0) - b.get("low", 0)), 1e-9) + widths.append(abs(b.get("high", 0) - b.get("low", 0)) / denom) + avg_width = sum(widths) / len(widths) + reasons: list[str] = [] + if data_fraction >= 0.70 and avg_width < 0.5: + grade = "HIGH" + elif data_fraction < 0.30 or avg_width > 1.5: + grade = "LOW" + else: + grade = "MEDIUM" + reasons.append(f"{data_n}/{len(bound_inputs)} input bounds anchored in data; {assumption_n}/{len(bound_inputs)} are assumptions") + reasons.append(f"average bound-width-to-base ratio = {avg_width:.2f}") + model_confidence[output_id] = { + "grade": grade, + "data_source_fraction": round(data_fraction, 4), + "average_bound_width_ratio": round(avg_width, 4), + "reasons": reasons, + } + result = { "valid": True, "plan_summary": { @@ -408,6 +626,11 @@ def run(params_path: Path, bounds_path: Path, calc_path: Path, "outputs": outputs_section, "thresholds": thresholds_section, "sensitivity": sensitivity_section, + "quartile_analysis": quartile_analysis, + "binding_gate_analysis": binding_gate_analysis, + "required_input_thresholds": required_input_thresholds, + "missing_value_priority": missing_value_priority, + "model_confidence": model_confidence, "warnings": [ {"stage": "monte_carlo", "run": None, "calculation": None, "message": msg, "severity": "WARN"} diff --git a/experiments/napkin_math/summarize_insights.py b/experiments/napkin_math/summarize_insights.py index f15b6cc2..cf558367 100644 --- a/experiments/napkin_math/summarize_insights.py +++ b/experiments/napkin_math/summarize_insights.py @@ -283,6 +283,118 @@ def render_distributions(mc: dict | None) -> list[str]: return rows +def render_binding_gates(mc: dict | None) -> list[str]: + if not mc or not mc.get("binding_gate_analysis"): + return [] + rows = [ + "## Which sub-gate causes the aggregate to fail", + "", + "For each `min()` aggregate 
threshold that fails, this is the frequency with which each underlying gate was the binding constraint (the one with the smallest surplus in that run). Read as: \"if you only fix one gate, this is the one whose pass rate matters most.\"", + "", + ] + for agg_name, info in mc["binding_gate_analysis"].items(): + rows.append(f"**`{agg_name}`** (failed in {info['fail_count']:,} runs):") + rows.append("") + for entry in info["binding_when_aggregate_fails"]: + rows.append(f"- `{entry['dependency']}` was the binder in {entry['frequency'] * 100:.1f}% of failed runs") + rows.append("") + return rows + + +def render_quartile_analysis(mc: dict | None) -> list[str]: + if not mc or not mc.get("quartile_analysis"): + return [] + rows = [ + "## Marginal effect of each input — bottom vs top quartile", + "", + "For each threshold, this is how the pass-probability shifts when a single input is in the bottom quartile of its bound versus the top quartile. The `Δ-pp` column is percentage points: positive means moving the input up helps the gate pass; negative means moving the input up hurts. 
This complements the correlation-based driver list above by showing the actual lift in pass probability you would get from improving one input at a time.", + "", + ] + for output_id, entries in mc["quartile_analysis"].items(): + rows.append(f"**`{output_id}`**:") + rows.append("") + rows.append("| Driver | P(pass) bottom Q | P(pass) top Q | Δ-pp |") + rows.append("|---|---:|---:|---:|") + for e in entries: + rows.append( + f"| `{e['id']}` | {e['p_pass_low_quartile'] * 100:.1f}% | " + f"{e['p_pass_high_quartile'] * 100:.1f}% | {e['delta_pp']:+.1f} |" + ) + rows.append("") + return rows + + +def render_required_input_thresholds(mc: dict | None) -> list[str]: + if not mc or not mc.get("required_input_thresholds"): + return [] + rows = [ + "## What would need to be true for each failing gate to clear at 80%", + "", + "For each gate that fails the 80% confidence bar, this is the input-bound restriction that would lift the conditional pass-probability to at least 80%. \"Below the 25th percentile\" means: the simulation only passes 80% of the time when this input lands in the optimistic quarter of its bound. 
An empty list for a gate means no single input restriction is enough — multiple inputs need to move together, or the gate is structurally beyond the current bounds.", + "", + ] + for output_id, entries in mc["required_input_thresholds"].items(): + if not entries: + continue + rows.append(f"**`{output_id}`** — required input restrictions to reach 80% pass rate:") + rows.append("") + for e in entries: + direction = "above" if e["direction"] == "above" else "below" + pct = e["input_percentile_cutoff"] + cond_p = e["conditional_pass_prob"] * 100 + admitted = e["admitted_fraction"] * 100 + rows.append( + f"- `{e['id']}` must stay {direction} the {pct}th percentile " + f"(conditional pass rate {cond_p:.1f}%, admitting {admitted:.1f}% of runs)" + ) + rows.append("") + return rows + + +def render_missing_value_priority(mc: dict | None) -> list[str]: + if not mc or not mc.get("missing_value_priority"): + return [] + rows = [ + "## Missing inputs ranked by simulation impact", + "", + "The plan does not state these values; the model assumed bounds. Rank is by `|Δ-pp on the worst-affected gate| × (1 - that gate's pass rate) × bound-width-ratio` — the higher the score, the more value you would get by pinning this input down with real data instead of guessing.", + "", + "| Rank | Input | Worst-affected gate | Score | Bound width / base | Source |", + "|---:|---|---|---:|---:|---|", + ] + for i, e in enumerate(mc["missing_value_priority"], 1): + rows.append( + f"| {i} | `{e['id']}` | `{e['worst_gate']}` | {e['score']:.2f} | " + f"{e['bound_width_ratio']:.2f} | {e['source']} |" + ) + rows.append("") + return rows + + +def render_model_confidence(mc: dict | None) -> list[str]: + if not mc or not mc.get("model_confidence"): + return [] + rows = [ + "## Confidence in each output", + "", + "Grade per output, derived from how many of its input bounds are sourced from data versus assumption, and how wide those bounds are relative to the base value. 
LOW means the number should be read as directional, not precise.", + "", + "| Output | Confidence | Data-sourced inputs | Bound-width / base | Reasons |", + "|---|:---:|---:|---:|---|", + ] + for output_id, info in mc["model_confidence"].items(): + if "data_source_fraction" not in info: + continue + data_pct = info["data_source_fraction"] * 100 + width = info["average_bound_width_ratio"] + reasons = "; ".join(info.get("reasons", [])) + rows.append( + f"| `{output_id}` | **{info['grade']}** | {data_pct:.0f}% | {width:.2f} | {reasons} |" + ) + rows.append("") + return rows + + def render_sensitivity(mc: dict | None) -> list[str]: if not mc or not mc.get("sensitivity"): return [] @@ -393,8 +505,13 @@ def build_insights(params: dict | None, bounds: dict | None, render_plan_summary(params), render_bad_news_first(mc, scenarios, params, bounds), render_threshold_verdicts(mc, params), + render_binding_gates(mc), + render_quartile_analysis(mc), + render_required_input_thresholds(mc), render_distributions(mc), render_sensitivity(mc), + render_missing_value_priority(mc), + render_model_confidence(mc), render_scenarios(scenarios), render_missing_data(params, bounds), render_inputs_footer(params_path, bounds_path, scenarios_path, mc_path), diff --git a/experiments/napkin_math/tests/test_run_monte_carlo.py b/experiments/napkin_math/tests/test_run_monte_carlo.py index dc6c73c6..ed67a23b 100644 --- a/experiments/napkin_math/tests/test_run_monte_carlo.py +++ b/experiments/napkin_math/tests/test_run_monte_carlo.py @@ -732,5 +732,115 @@ def variance_for(distribution_default: str) -> float: self.assertGreater(uni_var, tri_var) +class TestNewAnalysisBlocks(unittest.TestCase): + """Smoke tests for the v34 quick-win analyses: binding gates, quartile pass + rates, required-input thresholds, missing-value priority, model confidence.""" + + def _faraday_like_fixture(self, tmpdir: Path) -> dict: + """Tiny min()-aggregate fixture with three EUR surplus gates.""" + calc = ( + "def gate_a(a: 
float, b: float) -> float:\n return a - b\n" + "def gate_b(c: float, d: float) -> float:\n return c - d\n" + "def gate_c(e: float, f: float) -> float:\n return e - f\n" + "def weakest_gate(gate_a: float, gate_b: float, gate_c: float) -> float:\n" + " return min(gate_a, gate_b, gate_c)\n" + ) + recommended = [ + {"id": "gate_a", "label": "x", + "formula_hint": "gate_a = a - b", + "output_name": "gate_a", "output_unit": "EUR", + "depends_on": ["a", "b"], "why_first": "x"}, + {"id": "gate_b", "label": "x", + "formula_hint": "gate_b = c - d", + "output_name": "gate_b", "output_unit": "EUR", + "depends_on": ["c", "d"], "why_first": "x"}, + {"id": "gate_c", "label": "x", + "formula_hint": "gate_c = e - f", + "output_name": "gate_c", "output_unit": "EUR", + "depends_on": ["e", "f"], "why_first": "x"}, + {"id": "weakest_gate", "label": "x", + "formula_hint": "weakest_gate = min(gate_a, gate_b, gate_c)", + "output_name": "weakest_gate", "output_unit": "EUR", + "depends_on": ["gate_a", "gate_b", "gate_c"], "why_first": "x"}, + ] + missing = [{"id": v, "label": "x", "unit": "EUR", + "why_needed": "x", "suggested_estimation_method": "x"} + for v in "abcdef"] + # gate_a is the dominant binder: a vs b overlap, gate_a swings near zero; + # gate_b borderline; gate_c almost always positive. + bnds = { + "a": make_bound(unit="EUR", low=80, base=100, high=120, sampling_discipline="continuous"), + "b": make_bound(unit="EUR", low=90, base=105, high=130, sampling_discipline="continuous"), + "c": make_bound(unit="EUR", low=80, base=100, high=120, sampling_discipline="continuous"), + "d": make_bound(unit="EUR", low=85, base=95, high=105, sampling_discipline="continuous"), + "e": make_bound(unit="EUR", low=500, base=600, high=700, sampling_discipline="continuous"), + "f": make_bound(unit="EUR", low=10, base=20, high=30, sampling_discipline="continuous"), + } + # Mark a/b as data-source, others as assumption to test confidence grading. 
+ for k in ("a", "b"): + bnds[k]["source"] = "data" + for k in ("c", "d", "e", "f"): + bnds[k]["source"] = "assumption" + return run_with_fixture( + tmpdir, + missing_values=missing, + recommended=recommended, + bounds=bnds, + calc_source=calc, + _settings={"n_runs": 500, "seed": 1, + "thresholds": {"weakest_gate": {"operator": ">=", "value": 0}}}, + ) + + def test_binding_gate_identified_for_min_aggregate(self): + with tempfile.TemporaryDirectory() as td: + result = self._faraday_like_fixture(Path(td)) + self.assertIn("binding_gate_analysis", result) + bg = result["binding_gate_analysis"].get("weakest_gate") + self.assertIsNotNone(bg) + # gate_a = a - b is the gate most often negative (a's range sits just below b's), so it should dominate the binding distribution. + top = bg["binding_when_aggregate_fails"][0] + self.assertEqual(top["dependency"], "gate_a") + self.assertGreater(top["frequency"], 0.5) + + def test_quartile_analysis_present_for_thresholds(self): + with tempfile.TemporaryDirectory() as td: + result = self._faraday_like_fixture(Path(td)) + self.assertIn("quartile_analysis", result) + q = result["quartile_analysis"].get("weakest_gate") + self.assertIsNotNone(q) + # Every row has the four required keys. + for row in q: + self.assertIn("id", row) + self.assertIn("p_pass_low_quartile", row) + self.assertIn("p_pass_high_quartile", row) + self.assertIn("delta_pp", row) + + def test_required_input_thresholds_empty_when_unreachable(self): + with tempfile.TemporaryDirectory() as td: + result = self._faraday_like_fixture(Path(td)) + # weakest_gate is structurally infeasible; no single-input restriction reaches 80%. + self.assertEqual(result["required_input_thresholds"].get("weakest_gate", []), []) + + def test_missing_value_priority_ranks_inputs(self): + with tempfile.TemporaryDirectory() as td: + result = self._faraday_like_fixture(Path(td)) + mv = result["missing_value_priority"] + self.assertGreater(len(mv), 0) + # b drives gate_a (the binding gate); it should rank highly. 
+ ids = [e["id"] for e in mv[:2]] + self.assertIn("b", ids) + + def test_model_confidence_grades_present_for_outputs(self): + with tempfile.TemporaryDirectory() as td: + result = self._faraday_like_fixture(Path(td)) + mc = result["model_confidence"] + self.assertIn("weakest_gate", mc) + grade = mc["weakest_gate"]["grade"] + self.assertIn(grade, {"HIGH", "MEDIUM", "LOW"}) + # 2/6 inputs data-sourced (a, b) is 33%: below the 70% HIGH cutoff but not below the 30% LOW cutoff, + # so MEDIUM is expected; LOW only if the average bound-width ratio exceeds 1.5. Either is accepted here. + self.assertIn(grade, {"LOW", "MEDIUM"}) + + if __name__ == "__main__": unittest.main()