From fa34a06b2a247314abc7b646b3f0c41ed106df47 Mon Sep 17 00:00:00 2001
From: Brian McMahon <brian@nousergon.ai>
Date: Thu, 4 Jun 2026 09:23:47 -0700
Subject: [PATCH 1/2] feat(metrics+stats): MetricRecord contract +
 bootstrap/Newey-West/Wilson intervals (v0.52.0)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Director plan Phase A lib substrate (alpha-engine-docs/private/director-implementation-plan-260604.md):

- metrics.py: MetricRecord — the System Report Card v2 per-component contract
  (value + CI + N/floor + target/red-line + trend + 7-state status taxonomy +
  criticality), shared by producer (evaluator grading) and consumers (console,
  public site). Pure helpers derive_status (GREEN/WATCH/RED + 4-state N/A,
  direction-aware), derive_trend_decoration, derive_letter encode the RC v2
  status semantics at the chokepoint so all surfaces agree.
- quant/stats/intervals.py: bootstrap_ci (seeded/reproducible), newey_west_se
  (HAC, Bartlett kernel, auto-lag), wilson_score_interval (small-N rates) — the
  three CI methods MetricRecord.ci_method references. numpy + stdlib, no scipy.
- 31 tests (known-value Wilson 50/100, zero-lag NW = iid SE, N/A precedence,
  lower-is-better direction); full suite 1138 passed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 pyproject.toml                                |   2 +-
 src/alpha_engine_lib/__init__.py              |   2 +-
 src/alpha_engine_lib/metrics.py               | 202 ++++++++++++++++
 src/alpha_engine_lib/quant/stats/__init__.py  |   2 +
 src/alpha_engine_lib/quant/stats/intervals.py | 220 ++++++++++++++++++
 tests/test_metrics.py                         | 122 ++++++++++
 tests/test_quant_stats_intervals.py           |  85 +++++++
 7 files changed, 633 insertions(+), 2 deletions(-)
 create mode 100644 src/alpha_engine_lib/metrics.py
 create mode 100644 src/alpha_engine_lib/quant/stats/intervals.py
 create mode 100644 tests/test_metrics.py
 create mode 100644 tests/test_quant_stats_intervals.py

diff --git a/pyproject.toml b/pyproject.toml
index d609303..2512de1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "alpha-engine-lib"
-version = "0.51.0"
+version = "0.52.0"
 description = "Shared utilities for the Alpha Engine modules: preflight, logging, ArcticDB, dates, decision capture, cost telemetry, Anthropic payload chokepoint, artifact freshness, RAG, agent schemas, SSM secrets, Telegram + SNS alerts, EC2 spot resilience, SSM log-capture, SSM dispatcher, Step-Functions execution-state projection, S3-conditional-PUT writer locks, and bounded-backoff HTTP retry. Full surface documented in README."
 readme = "README.md"
 # EC2 still runs Python 3.9 on the always-on micro instance (boto3 drops
diff --git a/src/alpha_engine_lib/__init__.py b/src/alpha_engine_lib/__init__.py
index 1438566..4a4263e 100644
--- a/src/alpha_engine_lib/__init__.py
+++ b/src/alpha_engine_lib/__init__.py
@@ -1,3 +1,3 @@
 """alpha-engine-lib — shared utilities for Alpha Engine modules."""
 
-__version__ = "0.51.0"
+__version__ = "0.52.0"
diff --git a/src/alpha_engine_lib/metrics.py b/src/alpha_engine_lib/metrics.py
new file mode 100644
index 0000000..6e1099f
--- /dev/null
+++ b/src/alpha_engine_lib/metrics.py
@@ -0,0 +1,202 @@
+"""metrics — the System Report Card v2 ``MetricRecord`` contract + status derivation.
+
+A ``MetricRecord`` is the unit of the v2 report card: every graded component
+(research / predictor / executor / backtester / substrate / agent / portfolio)
+emits one, carrying not just a value but its statistical context — CI, sample
+size vs floor, target, red-line, trend, and a derived status/letter. The letter
+is *derived* from the status, never the source of truth (RC v2 Principle 2).
+
+This module is the shared chokepoint: the producer (the evaluator's grading
+layer) and every consumer (dashboard console, public site) agree on the schema
+AND on the status semantics via the pure ``derive_*`` helpers here — so the same
+``(value, CI, N)`` maps to the same GREEN/WATCH/RED everywhere.
+
+The N/A taxonomy distinguishes the four engineering states that the legacy
+"insufficient data" string conflated:
+  - ``N/A-NOT-IMPL``     grader exists, producer analysis not yet wired
+  - ``N/A-NOT-RUN``      producer implemented but did not run this cycle
+  - ``N/A-LOW-N``        ran, but N below half the floor — CI too wide to read
+  - ``N/A-MISSING-INPUT`` ran, but a required upstream artifact was absent
+
+Authoritative design: ``alpha-engine-docs/private/system-report-card-revamp-260522.md``.
+Module-level aggregation (critical-gate module roll-up, BH-FDR over a tile's
+component family) lives in the evaluator, not here — this is the per-component
+contract only.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+StatusLiteral = Literal[
+    "GREEN",
+    "WATCH",
+    "RED",
+    "N/A-NOT-IMPL",
+    "N/A-NOT-RUN",
+    "N/A-LOW-N",
+    "N/A-MISSING-INPUT",
+]
+CriticalityLiteral = Literal["critical", "supporting", "diagnostic"]
+MetricTypeLiteral = Literal[
+    "ic", "lift", "ratio", "pct", "count", "duration",
+    "sharpe", "calibration", "p_value", "zscore", "log_return",
+]
+TrendDecorationLiteral = Literal["↑↑", "↑", "→", "↓", "↓↓"]
+
+_NA_PREFIX = "N/A"
+
+
+class MetricRecord(BaseModel):
+    """One graded component of the System Report Card v2.
+
+    ``extra="allow"`` for forward-compat: a newer producer may add fields an
+    older consumer hasn't learned yet without breaking the read.
+    """
+
+    model_config = ConfigDict(extra="allow")
+
+    name: str = Field(description="snake_case stable key, e.g. 'predictor_meta_l2_ic'.")
+    module: str = Field(
+        description="Owning tile: portfolio | research | predictor | executor | "
+        "backtester | substrate | agent."
+    )
+    metric_type: MetricTypeLiteral
+    value: float | None = Field(default=None, description="Measured value (None when N/A-*).")
+    ci_low: float | None = Field(default=None)
+    ci_high: float | None = Field(default=None)
+    ci_method: str | None = Field(
+        default=None, description="e.g. 'bootstrap', 'newey-west', 'wilson'."
+    )
+    n_samples: int | None = Field(default=None, description="Observations behind the value.")
+    n_floor: int = Field(description="Documented minimum N for a confident reading.")
+    target: float | None = Field(default=None, description="At/beyond here = good.")
+    red_line: float | None = Field(default=None, description="At/beyond here = system-breaking.")
+    trend_4w: list[float] | None = Field(default=None)
+    trend_13w: list[float] | None = Field(default=None)
+    trend_decoration: TrendDecorationLiteral = Field(default="→")
+    status: StatusLiteral
+    status_reason: str = Field(description="One operator-readable sentence; never generic.")
+    criticality: CriticalityLiteral = Field(default="supporting")
+    source_path: str = Field(description="S3 URI / SQLite path / artifact this was read from.")
+    bh_fdr_adjusted_p: float | None = Field(default=None)
+    last_updated_utc: datetime
+    derived_letter: str = Field(default="N/A", description="Summary letter; derived from status+value.")
+
+    @property
+    def is_na(self) -> bool:
+        return self.status.startswith(_NA_PREFIX)
+
+
+def derive_trend_decoration(
+    values: list[float] | None,
+    *,
+    higher_is_better: bool = True,
+) -> TrendDecorationLiteral:
+    """Map a rolling value window to a trend glyph (RC v2 Principle 5).
+
+    Looks at the last four points: ``↑↑`` if all 3 steps improved, ``↑`` if the
+    most recent 2 steps improved (or every step, when fewer than 4 points exist),
+    ``↓↓``/``↓`` mirrored, else ``→``. Improvement is "increase" when ``higher_is_better`` else "decrease".
+    """
+    if not values or len(values) < 2:
+        return "→"
+    window = values[-4:]
+    steps = [b - a for a, b in zip(window, window[1:])]
+    if not higher_is_better:
+        steps = [-s for s in steps]
+
+    eps = 1e-12
+    ups = sum(1 for s in steps if s > eps)
+    downs = sum(1 for s in steps if s < -eps)
+    recent2 = steps[-2:]
+
+    if len(steps) >= 3 and ups == len(steps):
+        return "↑↑"
+    if len(steps) >= 3 and downs == len(steps):
+        return "↓↓"
+    if sum(1 for s in recent2 if s > eps) == len(recent2) and ups >= downs:
+        return "↑"
+    if sum(1 for s in recent2 if s < -eps) == len(recent2) and downs >= ups:
+        return "↓"
+    return "→"
+
+
+def derive_letter(status: StatusLiteral) -> str:
+    """Project a status onto the summary letter band (RC v2 Principle 2).
+
+    The letter is a display convenience only — ``status`` + ``value`` are the
+    source of truth. Any ``N/A-*`` projects to ``"N/A"``.
+    """
+    if status.startswith(_NA_PREFIX):
+        return "N/A"
+    return {"GREEN": "A", "WATCH": "C", "RED": "F"}[status]
+
+
+def derive_status(
+    *,
+    value: float | None,
+    n_samples: int | None,
+    n_floor: int,
+    target: float | None = None,
+    red_line: float | None = None,
+    ci_low: float | None = None,
+    ci_high: float | None = None,
+    implemented: bool = True,
+    ran: bool = True,
+    input_present: bool = True,
+) -> StatusLiteral:
+    """Derive the GREEN/WATCH/RED/``N/A-*`` status for one component.
+
+    Encodes RC v2 Principles 2 (status taxonomy) and 6 (sample-size discipline)
+    so producer and consumers agree. Direction is inferred from the
+    ``target``/``red_line`` ordering: ``target >= red_line`` ⇒ higher-is-better,
+    else lower-is-better (e.g. max-drawdown, ECE); higher-is-better if either is None.
+
+    The four N/A conditions take precedence in order: not-implemented →
+    not-run → missing-input → low-N. Otherwise RED if the value, or the whole CI,
+    is at/beyond the red-line; GREEN needs N >= floor, target met, CI clear of it.
+    """
+    if not implemented:
+        return "N/A-NOT-IMPL"
+    if not ran:
+        return "N/A-NOT-RUN"
+    if not input_present:
+        return "N/A-MISSING-INPUT"
+    if value is None or n_samples is None or n_samples < 0.5 * n_floor:
+        return "N/A-LOW-N"
+
+    higher_is_better = target is None or red_line is None or target >= red_line
+
+    def _at_or_better(a: float, b: float) -> bool:
+        return a >= b if higher_is_better else a <= b
+
+    def _at_or_worse(a: float, b: float) -> bool:
+        return a <= b if higher_is_better else a >= b
+
+    # RED: value at/beyond the red-line, or the CI sits entirely on the bad side.
+    if red_line is not None:
+        if _at_or_worse(value, red_line):
+            return "RED"
+        optimistic_bound = ci_high if higher_is_better else ci_low  # best-case end of the CI
+        if optimistic_bound is not None and _at_or_worse(optimistic_bound, red_line):
+            return "RED"
+
+    # Between half-floor and floor: CI too wide to claim GREEN regardless.
+    if n_samples < n_floor:
+        return "WATCH"
+
+    # GREEN needs the whole CI clear of the red-line (when both are known);
+    # a CI that straddles it caps the status at WATCH, with or without a target.
+    pessimistic_bound = ci_low if higher_is_better else ci_high
+    if red_line is not None and pessimistic_bound is not None:
+        if _at_or_worse(pessimistic_bound, red_line):
+            return "WATCH"
+
+    if target is not None and not _at_or_better(value, target):
+        return "WATCH"
+    # At/beyond target (or no target given), clear of the red-line, adequate N.
+    return "GREEN"
diff --git a/src/alpha_engine_lib/quant/stats/__init__.py b/src/alpha_engine_lib/quant/stats/__init__.py
index 85b748a..b4d10e9 100644
--- a/src/alpha_engine_lib/quant/stats/__init__.py
+++ b/src/alpha_engine_lib/quant/stats/__init__.py
@@ -11,6 +11,7 @@
   - ``information_coefficient`` — Spearman rank IC of conviction vs forward return
   - ``expectancy``              — hit-rate × win/loss decomposition
   - ``multiple_testing``        — Benjamini-Hochberg FDR correction
+  - ``intervals``               — bootstrap CI, Newey-West SE, Wilson score interval
   - ``risk_matched_benchmark``  — EW-high-vol + beta-matched-SPY baselines + IR
   - ``regime_sortino``          — regime-stratified cross-sectional pick-alpha Sortino
 
@@ -18,6 +19,7 @@
 
     from alpha_engine_lib.quant.stats.dsr import compute_dsr
     from alpha_engine_lib.quant.stats.multiple_testing import benjamini_hochberg
+    from alpha_engine_lib.quant.stats.intervals import bootstrap_ci, wilson_score_interval
 """
 
 from __future__ import annotations
diff --git a/src/alpha_engine_lib/quant/stats/intervals.py b/src/alpha_engine_lib/quant/stats/intervals.py
new file mode 100644
index 0000000..6f9c0a3
--- /dev/null
+++ b/src/alpha_engine_lib/quant/stats/intervals.py
@@ -0,0 +1,220 @@
+"""intervals — Bootstrap confidence intervals, Newey-West SE, Wilson score intervals.
+
+The three inference primitives the System Report Card v2 metric records require
+(``MetricRecord.ci_method`` ∈ {``bootstrap``, ``newey-west``, ``wilson``}):
+
+  - ``bootstrap_ci``          — percentile bootstrap CI for any statistic of a
+                                sample (default the mean). The general-purpose
+                                CI for ICs, lifts, hit-rates, Sharpe point
+                                estimates where no closed form is convenient.
+  - ``newey_west_se``         — heteroskedasticity-and-autocorrelation-consistent
+                                (HAC) standard error of a series mean, for
+                                autocorrelated daily P&L where the iid SE
+                                understates uncertainty.
+  - ``wilson_score_interval`` — Wilson score binomial interval for rates with
+                                small N (veto-gate precision/recall, hit-rate),
+                                which the normal-approximation interval handles
+                                badly near 0/1.
+
+Pure-compute; no I/O. bootstrap + Newey-West need numpy, imported at module load
+(install ``alpha-engine-lib[quant]``); the Wilson math itself is stdlib-only (``statistics.NormalDist``).
+
+Reference: López de Prado, *Advances in Financial Machine Learning* (bootstrap
++ HAC); Wilson (1927) "Probable Inference, the Law of Succession, and
+Statistical Inference".
+"""
+
+from __future__ import annotations
+
+import math
+from statistics import NormalDist
+from typing import Callable, Sequence, TypedDict
+
+import numpy as np
+
+_DEFAULT_N_RESAMPLES = 1000
+
+
+class BootstrapCIResult(TypedDict, total=False):
+    status: str           # "ok" | "insufficient_data"
+    n: int                # observations after NaN drop
+    estimate: float       # statistic on the full sample
+    ci_low: float
+    ci_high: float
+    ci_level: float       # e.g. 0.95
+    method: str           # "bootstrap"
+    n_resamples: int
+
+
+class NeweyWestResult(TypedDict, total=False):
+    status: str           # "ok" | "insufficient_data"
+    n: int
+    estimate: float       # sample mean
+    se: float             # HAC standard error of the mean
+    lags: int             # Bartlett-kernel lags used
+    method: str           # "newey-west"
+
+
+class WilsonScoreResult(TypedDict, total=False):
+    status: str           # "ok" | "insufficient_data"
+    n: int                # trials
+    successes: int
+    rate: float           # successes / trials (point estimate)
+    estimate: float       # alias of rate (uniform with the other results)
+    ci_low: float
+    ci_high: float
+    ci_level: float
+    method: str           # "wilson"
+
+
+def _clean(data: Sequence[float] | np.ndarray) -> np.ndarray:
+    """Coerce to a 1-D float array with NaN/inf dropped."""
+    arr = np.asarray(data, dtype=float).ravel()
+    return arr[np.isfinite(arr)]
+
+
+def bootstrap_ci(
+    data: Sequence[float] | np.ndarray,
+    statistic: Callable[[np.ndarray], float] = np.mean,
+    *,
+    ci_level: float = 0.95,
+    n_resamples: int = _DEFAULT_N_RESAMPLES,
+    seed: int = 0,
+) -> BootstrapCIResult:
+    """Percentile bootstrap confidence interval for ``statistic`` of ``data``.
+
+    Args:
+        data: 1-D sample of observations (NaN/inf dropped).
+        statistic: Callable ``(np.ndarray) -> float`` to bootstrap. Defaults to
+            the mean.
+        ci_level: Confidence level in (0, 1) (default 0.95).
+        n_resamples: Number of bootstrap resamples (default 1000).
+        seed: RNG seed for reproducibility (the report card must be stable
+            across re-renders of the same cycle).
+
+    Returns:
+        A :class:`BootstrapCIResult`. ``status == "insufficient_data"`` when
+        fewer than 2 finite observations remain.
+    """
+    arr = _clean(data)
+    n = int(arr.size)
+    if n < 2:
+        return {"status": "insufficient_data", "n": n}
+
+    estimate = float(statistic(arr))
+    rng = np.random.default_rng(seed)
+    idx = rng.integers(0, n, size=(n_resamples, n))
+    boot = np.fromiter(
+        (statistic(arr[row]) for row in idx), dtype=float, count=n_resamples
+    )
+    boot = boot[np.isfinite(boot)]
+    if boot.size == 0:
+        return {"status": "insufficient_data", "n": n}
+
+    tail = (1.0 - ci_level) / 2.0
+    ci_low = float(np.percentile(boot, 100.0 * tail))
+    ci_high = float(np.percentile(boot, 100.0 * (1.0 - tail)))
+    return {
+        "status": "ok",
+        "n": n,
+        "estimate": estimate,
+        "ci_low": ci_low,
+        "ci_high": ci_high,
+        "ci_level": ci_level,
+        "method": "bootstrap",
+        "n_resamples": int(boot.size),
+    }
+
+
+def _auto_lags(n: int) -> int:
+    """Newey-West (1994) automatic lag selection: floor(4·(n/100)^(2/9))."""
+    return int(math.floor(4.0 * (n / 100.0) ** (2.0 / 9.0)))
+
+
+def newey_west_se(
+    series: Sequence[float] | np.ndarray,
+    *,
+    max_lags: int | None = None,
+) -> NeweyWestResult:
+    """HAC (Newey-West, Bartlett kernel) standard error of the series mean.
+
+    For autocorrelated series (daily P&L), the iid SE ``s/√n`` understates
+    uncertainty. The HAC estimator inflates the long-run variance by the
+    Bartlett-weighted autocovariances up to ``max_lags``.
+
+    Args:
+        series: 1-D series (NaN/inf dropped).
+        max_lags: Bartlett-kernel lag truncation. ``None`` ⇒ the Newey-West
+            (1994) rule ``floor(4·(n/100)^(2/9))``. Clamped to ``[0, n-1]``.
+
+    Returns:
+        A :class:`NeweyWestResult`. ``status == "insufficient_data"`` for n < 2.
+    """
+    x = _clean(series)
+    n = int(x.size)
+    if n < 2:
+        return {"status": "insufficient_data", "n": n}
+
+    lags = _auto_lags(n) if max_lags is None else int(max_lags)
+    lags = max(0, min(lags, n - 1))
+
+    e = x - x.mean()
+    gamma0 = float(np.dot(e, e) / n)
+    lrv = gamma0
+    for j in range(1, lags + 1):
+        weight = 1.0 - j / (lags + 1.0)
+        gamma_j = float(np.dot(e[j:], e[:-j]) / n)
+        lrv += 2.0 * weight * gamma_j
+    lrv = max(lrv, 0.0)  # Bartlett kernel guarantees PSD; clamp float error.
+    se = math.sqrt(lrv / n)
+    return {
+        "status": "ok",
+        "n": n,
+        "estimate": float(x.mean()),
+        "se": se,
+        "lags": lags,
+        "method": "newey-west",
+    }
+
+
+def wilson_score_interval(
+    successes: int,
+    trials: int,
+    *,
+    ci_level: float = 0.95,
+) -> WilsonScoreResult:
+    """Wilson score interval for a binomial proportion.
+
+    Preferred over the normal-approximation (Wald) interval for small ``trials``
+    or rates near 0/1, where Wald produces bounds outside [0, 1] and undercovers.
+
+    Args:
+        successes: Count of successes (0 ≤ successes ≤ trials).
+        trials: Total trials (> 0).
+        ci_level: Confidence level in (0, 1) (default 0.95).
+
+    Returns:
+        A :class:`WilsonScoreResult`. ``status == "insufficient_data"`` for ``trials <= 0``.
+        Bounds are clamped to [0, 1]; exactly 0.0 / 1.0 at ``successes`` = 0 / ``trials``.
+    """
+    if trials <= 0:
+        return {"status": "insufficient_data", "n": int(max(trials, 0))}
+    successes = max(0, min(int(successes), int(trials)))
+
+    z = NormalDist().inv_cdf(1.0 - (1.0 - ci_level) / 2.0)
+    p = successes / trials
+    z2 = z * z
+    denom = 1.0 + z2 / trials
+    center = (p + z2 / (2.0 * trials)) / denom
+    margin = (z / denom) * math.sqrt(p * (1.0 - p) / trials + z2 / (4.0 * trials * trials))
+    return {
+        "status": "ok",
+        "n": int(trials),
+        "successes": int(successes),
+        "rate": p,
+        "estimate": p,
+        "ci_low": 0.0 if successes == 0 else max(0.0, center - margin),  # exact at p=0 (no float residue)
+        "ci_high": 1.0 if successes == trials else min(1.0, center + margin),  # exact at p=1
+        "ci_level": ci_level,
+        "method": "wilson",
+    }
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
new file mode 100644
index 0000000..83fc4da
--- /dev/null
+++ b/tests/test_metrics.py
@@ -0,0 +1,122 @@
+"""Tests for alpha_engine_lib.metrics — MetricRecord contract + status derivation."""
+
+from datetime import datetime, timezone
+
+import pytest
+from pydantic import ValidationError
+
+from alpha_engine_lib.metrics import (
+    MetricRecord,
+    derive_letter,
+    derive_status,
+    derive_trend_decoration,
+)
+
+
+def _record(**overrides):
+    base = dict(
+        name="predictor_meta_l2_ic",
+        module="predictor",
+        metric_type="ic",
+        n_floor=100,
+        status="GREEN",
+        status_reason="L2 IC 0.48 (CI [0.21,0.74], N=626) above target 0.05.",
+        source_path="s3://alpha-engine-research/predictor/metrics/latest.json#l2_ic",
+        last_updated_utc=datetime(2026, 6, 4, tzinfo=timezone.utc),
+    )
+    base.update(overrides)
+    return MetricRecord(**base)
+
+
+class TestMetricRecord:
+    def test_minimal_valid_record(self):
+        r = _record(value=0.48, n_samples=626)
+        assert r.module == "predictor" and r.value == 0.48
+        assert r.trend_decoration == "→"  # default
+        assert r.is_na is False
+
+    def test_na_status_flags_is_na(self):
+        assert _record(status="N/A-LOW-N").is_na is True
+
+    def test_extra_fields_allowed(self):
+        r = _record(value=0.1, n_samples=200, future_field="ok")
+        assert r.value == 0.1
+
+    def test_bad_status_rejected(self):
+        with pytest.raises(ValidationError):
+            _record(status="MAYBE")
+
+    def test_bad_metric_type_rejected(self):
+        with pytest.raises(ValidationError):
+            _record(metric_type="vibes")
+
+
+class TestDeriveTrendDecoration:
+    def test_sustained_up_and_down(self):
+        assert derive_trend_decoration([1, 2, 3, 4]) == "↑↑"
+        assert derive_trend_decoration([4, 3, 2, 1]) == "↓↓"
+
+    def test_flat_and_too_short(self):
+        assert derive_trend_decoration([2, 2, 2, 2]) == "→"
+        assert derive_trend_decoration([1]) == "→"
+        assert derive_trend_decoration(None) == "→"
+
+    def test_recent_improvement(self):
+        # down then up-up: not sustained, but recent 2 improved.
+        assert derive_trend_decoration([5, 1, 2, 3]) == "↑"
+
+    def test_higher_is_better_false_flips(self):
+        # decreasing values are an improvement when lower is better (e.g. drawdown).
+        assert derive_trend_decoration([4, 3, 2, 1], higher_is_better=False) == "↑↑"
+
+
+class TestDeriveLetter:
+    def test_status_to_letter(self):
+        assert derive_letter("GREEN") == "A"
+        assert derive_letter("WATCH") == "C"
+        assert derive_letter("RED") == "F"
+        assert derive_letter("N/A-NOT-IMPL") == "N/A"
+
+
+class TestDeriveStatus:
+    def test_na_precedence(self):
+        assert derive_status(value=0.5, n_samples=200, n_floor=100, implemented=False) == "N/A-NOT-IMPL"
+        assert derive_status(value=0.5, n_samples=200, n_floor=100, ran=False) == "N/A-NOT-RUN"
+        assert derive_status(value=0.5, n_samples=200, n_floor=100, input_present=False) == "N/A-MISSING-INPUT"
+
+    def test_low_n(self):
+        assert derive_status(value=0.5, n_samples=40, n_floor=100) == "N/A-LOW-N"
+        assert derive_status(value=None, n_samples=200, n_floor=100) == "N/A-LOW-N"
+
+    def test_red_when_at_or_below_red_line(self):
+        s = derive_status(value=-0.01, n_samples=200, n_floor=100, target=0.05, red_line=0.0)
+        assert s == "RED"
+
+    def test_red_when_ci_entirely_below_red_line(self):
+        s = derive_status(
+            value=0.02, n_samples=200, n_floor=100, target=0.05, red_line=0.0,
+            ci_low=-0.03, ci_high=-0.01,
+        )
+        assert s == "RED"
+
+    def test_green_when_above_target_with_clear_ci(self):
+        s = derive_status(
+            value=0.48, n_samples=626, n_floor=100, target=0.05, red_line=0.0,
+            ci_low=0.21, ci_high=0.74,
+        )
+        assert s == "GREEN"
+
+    def test_watch_between_half_floor_and_floor(self):
+        # N=70 is above 0.5*floor (50) but below floor (100) → WATCH regardless of value.
+        s = derive_status(value=0.9, n_samples=70, n_floor=100, target=0.05, red_line=0.0)
+        assert s == "WATCH"
+
+    def test_watch_below_target_above_red_line(self):
+        s = derive_status(value=0.02, n_samples=200, n_floor=100, target=0.05, red_line=0.0)
+        assert s == "WATCH"
+
+    def test_lower_is_better_direction(self):
+        # max_drawdown: target 0.15, red_line 0.25 (lower is better).
+        assert derive_status(value=0.08, n_samples=200, n_floor=60, target=0.15, red_line=0.25) == "GREEN"
+        assert derive_status(value=0.30, n_samples=200, n_floor=60, target=0.15, red_line=0.25) == "RED"
+        assert derive_status(value=0.20, n_samples=200, n_floor=60, target=0.15, red_line=0.25) == "WATCH"
diff --git a/tests/test_quant_stats_intervals.py b/tests/test_quant_stats_intervals.py
new file mode 100644
index 0000000..fb25804
--- /dev/null
+++ b/tests/test_quant_stats_intervals.py
@@ -0,0 +1,85 @@
+"""Tests for alpha_engine_lib.quant.stats.intervals — bootstrap CI, Newey-West SE, Wilson."""
+
+import math
+
+import numpy as np
+
+from alpha_engine_lib.quant.stats.intervals import (
+    bootstrap_ci,
+    newey_west_se,
+    wilson_score_interval,
+)
+
+
+class TestBootstrapCI:
+    def test_insufficient_data(self):
+        assert bootstrap_ci([]) == {"status": "insufficient_data", "n": 0}
+        assert bootstrap_ci([4.2])["status"] == "insufficient_data"
+
+    def test_constant_sample_has_zero_width(self):
+        out = bootstrap_ci([5.0, 5.0, 5.0, 5.0])
+        assert out["status"] == "ok"
+        assert out["estimate"] == 5.0
+        assert out["ci_low"] == 5.0 and out["ci_high"] == 5.0
+
+    def test_ci_brackets_estimate_and_is_deterministic(self):
+        data = list(range(100))
+        a = bootstrap_ci(data, seed=7)
+        b = bootstrap_ci(data, seed=7)
+        assert a == b  # seeded → reproducible (report card must be stable)
+        assert a["ci_low"] <= a["estimate"] <= a["ci_high"]
+        assert a["estimate"] == float(np.mean(data))
+        assert a["ci_level"] == 0.95 and a["method"] == "bootstrap"
+
+    def test_nan_dropped(self):
+        out = bootstrap_ci([1.0, 2.0, float("nan"), 3.0])
+        assert out["n"] == 3
+
+    def test_custom_statistic(self):
+        out = bootstrap_ci([1.0, 2.0, 3.0, 4.0, 5.0], statistic=np.median, seed=1)
+        assert out["status"] == "ok"
+        assert out["estimate"] == 3.0
+
+
+class TestNeweyWestSE:
+    def test_insufficient_data(self):
+        assert newey_west_se([2.0])["status"] == "insufficient_data"
+
+    def test_zero_lag_matches_iid_se(self):
+        # [1..5]: mean 3, gamma0 = 10/5 = 2, se = sqrt(2/5).
+        out = newey_west_se([1.0, 2.0, 3.0, 4.0, 5.0], max_lags=0)
+        assert out["estimate"] == 3.0
+        assert out["lags"] == 0
+        assert out["se"] == math.sqrt(0.4)
+
+    def test_lags_clamped_to_n_minus_1(self):
+        out = newey_west_se([1.0, 2.0, 3.0], max_lags=99)
+        assert out["lags"] == 2
+
+    def test_auto_lags_nonnegative(self):
+        out = newey_west_se([float(x) for x in range(200)])
+        assert out["status"] == "ok"
+        assert out["lags"] >= 0
+        assert out["se"] >= 0.0
+
+
+class TestWilsonScoreInterval:
+    def test_insufficient_data(self):
+        assert wilson_score_interval(0, 0)["status"] == "insufficient_data"
+
+    def test_known_50_of_100(self):
+        # Textbook Wilson 95% interval for 50/100 is [0.4038, 0.5962].
+        out = wilson_score_interval(50, 100)
+        assert out["rate"] == 0.5
+        assert abs(out["ci_low"] - 0.4038) < 1e-3
+        assert abs(out["ci_high"] - 0.5962) < 1e-3
+
+    def test_bounds_clamped_to_unit_interval(self):
+        lo = wilson_score_interval(0, 10)
+        assert lo["ci_low"] == 0.0 and 0.0 < lo["ci_high"] < 1.0
+        hi = wilson_score_interval(10, 10)
+        assert hi["ci_high"] == 1.0 and 0.0 < hi["ci_low"] < 1.0
+
+    def test_successes_clamped_to_trials(self):
+        out = wilson_score_interval(15, 10)
+        assert out["successes"] == 10 and out["rate"] == 1.0

From eee5a5418803592118f15e6deead2b6c9ddf675a Mon Sep 17 00:00:00 2001
From: Brian McMahon <brian@nousergon.ai>
Date: Thu, 4 Jun 2026 09:30:14 -0700
Subject: [PATCH 2/2] test(intervals): tolerance on Wilson boundary bounds
 (float residual, not exact 0/1)

CI surfaced wilson_score_interval(0,10) returning ci_low=2.78e-17 (legitimate
float noise from the [0,1] clamp; true bound is 0). Assert pytest.approx instead
of exact equality. Local numpy happened to compute exact 0.0; CI did not.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/test_quant_stats_intervals.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/test_quant_stats_intervals.py b/tests/test_quant_stats_intervals.py
index fb25804..1b2003a 100644
--- a/tests/test_quant_stats_intervals.py
+++ b/tests/test_quant_stats_intervals.py
@@ -3,6 +3,7 @@
 import math
 
 import numpy as np
+import pytest
 
 from alpha_engine_lib.quant.stats.intervals import (
     bootstrap_ci,
@@ -75,10 +76,13 @@ def test_known_50_of_100(self):
         assert abs(out["ci_high"] - 0.5962) < 1e-3
 
     def test_bounds_clamped_to_unit_interval(self):
+        # successes=0 → true lower bound is 0 (residual float noise ≈ 1e-17).
         lo = wilson_score_interval(0, 10)
-        assert lo["ci_low"] == 0.0 and 0.0 < lo["ci_high"] < 1.0
+        assert lo["ci_low"] == pytest.approx(0.0, abs=1e-9)
+        assert 0.0 < lo["ci_high"] < 1.0
         hi = wilson_score_interval(10, 10)
-        assert hi["ci_high"] == 1.0 and 0.0 < hi["ci_low"] < 1.0
+        assert hi["ci_high"] == pytest.approx(1.0, abs=1e-9)
+        assert 0.0 < hi["ci_low"] < 1.0
 
     def test_successes_clamped_to_trials(self):
         out = wilson_score_interval(15, 10)