From fa34a06b2a247314abc7b646b3f0c41ed106df47 Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Thu, 4 Jun 2026 09:23:47 -0700 Subject: [PATCH 1/2] feat(metrics+stats): MetricRecord contract + bootstrap/Newey-West/Wilson intervals (v0.52.0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Director plan Phase A lib substrate (alpha-engine-docs/private/director-implementation-plan-260604.md): - metrics.py: MetricRecord — the System Report Card v2 per-component contract (value + CI + N/floor + target/red-line + trend + 7-state status taxonomy + criticality), shared by producer (evaluator grading) and consumers (console, public site). Pure helpers derive_status (GREEN/WATCH/RED + 4-state N/A, direction-aware), derive_trend_decoration, derive_letter encode the RC v2 status semantics at the chokepoint so all surfaces agree. - quant/stats/intervals.py: bootstrap_ci (seeded/reproducible), newey_west_se (HAC, Bartlett kernel, auto-lag), wilson_score_interval (small-N rates) — the three CI methods MetricRecord.ci_method references. numpy + stdlib, no scipy. - 31 tests (known-value Wilson 50/100, zero-lag NW = iid SE, N/A precedence, lower-is-better direction); full suite 1138 passed. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- pyproject.toml | 2 +- src/alpha_engine_lib/__init__.py | 2 +- src/alpha_engine_lib/metrics.py | 202 ++++++++++++++++ src/alpha_engine_lib/quant/stats/__init__.py | 2 + src/alpha_engine_lib/quant/stats/intervals.py | 220 ++++++++++++++++++ tests/test_metrics.py | 122 ++++++++++ tests/test_quant_stats_intervals.py | 85 +++++++ 7 files changed, 633 insertions(+), 2 deletions(-) create mode 100644 src/alpha_engine_lib/metrics.py create mode 100644 src/alpha_engine_lib/quant/stats/intervals.py create mode 100644 tests/test_metrics.py create mode 100644 tests/test_quant_stats_intervals.py diff --git a/pyproject.toml b/pyproject.toml index d609303..2512de1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "alpha-engine-lib" -version = "0.51.0" +version = "0.52.0" description = "Shared utilities for the Alpha Engine modules: preflight, logging, ArcticDB, dates, decision capture, cost telemetry, Anthropic payload chokepoint, artifact freshness, RAG, agent schemas, SSM secrets, Telegram + SNS alerts, EC2 spot resilience, SSM log-capture, SSM dispatcher, Step-Functions execution-state projection, S3-conditional-PUT writer locks, and bounded-backoff HTTP retry. Full surface documented in README." 
readme = "README.md" # EC2 still runs Python 3.9 on the always-on micro instance (boto3 drops diff --git a/src/alpha_engine_lib/__init__.py b/src/alpha_engine_lib/__init__.py index 1438566..4a4263e 100644 --- a/src/alpha_engine_lib/__init__.py +++ b/src/alpha_engine_lib/__init__.py @@ -1,3 +1,3 @@ """alpha-engine-lib — shared utilities for Alpha Engine modules.""" -__version__ = "0.51.0" +__version__ = "0.52.0" diff --git a/src/alpha_engine_lib/metrics.py b/src/alpha_engine_lib/metrics.py new file mode 100644 index 0000000..6e1099f --- /dev/null +++ b/src/alpha_engine_lib/metrics.py @@ -0,0 +1,202 @@ +"""metrics — the System Report Card v2 ``MetricRecord`` contract + status derivation. + +A ``MetricRecord`` is the unit of the v2 report card: every graded component +(research / predictor / executor / backtester / substrate / agent / portfolio) +emits one, carrying not just a value but its statistical context — CI, sample +size vs floor, target, red-line, trend, and a derived status/letter. The letter +is *derived* from the status, never the source of truth (RC v2 Principle 2). + +This module is the shared chokepoint: the producer (the evaluator's grading +layer) and every consumer (dashboard console, public site) agree on the schema +AND on the status semantics via the pure ``derive_*`` helpers here — so the same +``(value, CI, N)`` maps to the same GREEN/WATCH/RED everywhere. + +The N/A taxonomy distinguishes the four engineering states that the legacy +"insufficient data" string conflated: + - ``N/A-NOT-IMPL`` grader exists, producer analysis not yet wired + - ``N/A-NOT-RUN`` producer implemented but did not run this cycle + - ``N/A-LOW-N`` ran, but N below half the floor — CI too wide to read + - ``N/A-MISSING-INPUT`` ran, but a required upstream artifact was absent + +Authoritative design: ``alpha-engine-docs/private/system-report-card-revamp-260522.md``. 
+Module-level aggregation (critical-gate module roll-up, BH-FDR over a tile's +component family) lives in the evaluator, not here — this is the per-component +contract only. +""" + +from __future__ import annotations + +from datetime import datetime +from typing import Literal + +from pydantic import BaseModel, ConfigDict, Field + +StatusLiteral = Literal[ + "GREEN", + "WATCH", + "RED", + "N/A-NOT-IMPL", + "N/A-NOT-RUN", + "N/A-LOW-N", + "N/A-MISSING-INPUT", +] +CriticalityLiteral = Literal["critical", "supporting", "diagnostic"] +MetricTypeLiteral = Literal[ + "ic", "lift", "ratio", "pct", "count", "duration", + "sharpe", "calibration", "p_value", "zscore", "log_return", +] +TrendDecorationLiteral = Literal["↑↑", "↑", "→", "↓", "↓↓"] + +_NA_PREFIX = "N/A" + + +class MetricRecord(BaseModel): + """One graded component of the System Report Card v2. + + ``extra="allow"`` for forward-compat: a newer producer may add fields an + older consumer hasn't learned yet without breaking the read. + """ + + model_config = ConfigDict(extra="allow") + + name: str = Field(description="snake_case stable key, e.g. 'predictor_meta_l2_ic'.") + module: str = Field( + description="Owning tile: portfolio | research | predictor | executor | " + "backtester | substrate | agent." + ) + metric_type: MetricTypeLiteral + value: float | None = Field(default=None, description="Measured value (None when N/A-*).") + ci_low: float | None = Field(default=None) + ci_high: float | None = Field(default=None) + ci_method: str | None = Field( + default=None, description="e.g. 'bootstrap', 'newey-west', 'wilson'." 
+ ) + n_samples: int | None = Field(default=None, description="Observations behind the value.") + n_floor: int = Field(description="Documented minimum N for a confident reading.") + target: float | None = Field(default=None, description="At/beyond here = good.") + red_line: float | None = Field(default=None, description="At/beyond here = system-breaking.") + trend_4w: list[float] | None = Field(default=None) + trend_13w: list[float] | None = Field(default=None) + trend_decoration: TrendDecorationLiteral = Field(default="→") + status: StatusLiteral + status_reason: str = Field(description="One operator-readable sentence; never generic.") + criticality: CriticalityLiteral = Field(default="supporting") + source_path: str = Field(description="S3 URI / SQLite path / artifact this was read from.") + bh_fdr_adjusted_p: float | None = Field(default=None) + last_updated_utc: datetime + derived_letter: str = Field(default="N/A", description="Summary letter; derived from status+value.") + + @property + def is_na(self) -> bool: + return self.status.startswith(_NA_PREFIX) + + +def derive_trend_decoration( + values: list[float] | None, + *, + higher_is_better: bool = True, +) -> TrendDecorationLiteral: + """Map a rolling value window to a trend glyph (RC v2 Principle 5). + + Looks at the last four points: ``↑↑`` if all 3 steps improved, ``↑`` if the + most recent 2 of those steps improved, ``↓↓``/``↓`` mirrored, else ``→``. + Improvement is "increase" when ``higher_is_better`` else "decrease". 
+ """ + if not values or len(values) < 2: + return "→" + window = values[-4:] + steps = [b - a for a, b in zip(window, window[1:])] + if not higher_is_better: + steps = [-s for s in steps] + + eps = 1e-12 + ups = sum(1 for s in steps if s > eps) + downs = sum(1 for s in steps if s < -eps) + recent2 = steps[-2:] + + if len(steps) >= 3 and ups == len(steps): + return "↑↑" + if len(steps) >= 3 and downs == len(steps): + return "↓↓" + if sum(1 for s in recent2 if s > eps) == len(recent2) and ups >= downs: + return "↑" + if sum(1 for s in recent2 if s < -eps) == len(recent2) and downs >= ups: + return "↓" + return "→" + + +def derive_letter(status: StatusLiteral) -> str: + """Project a status onto the summary letter band (RC v2 Principle 2). + + The letter is a display convenience only — ``status`` + ``value`` are the + source of truth. Any ``N/A-*`` projects to ``"N/A"``. + """ + if status.startswith(_NA_PREFIX): + return "N/A" + return {"GREEN": "A", "WATCH": "C", "RED": "F"}[status] + + +def derive_status( + *, + value: float | None, + n_samples: int | None, + n_floor: int, + target: float | None = None, + red_line: float | None = None, + ci_low: float | None = None, + ci_high: float | None = None, + implemented: bool = True, + ran: bool = True, + input_present: bool = True, +) -> StatusLiteral: + """Derive the GREEN/WATCH/RED/``N/A-*`` status for one component. + + Encodes RC v2 Principles 2 (status taxonomy) and 6 (sample-size discipline) + so producer and consumers agree. Direction is inferred from the + ``target``/``red_line`` ordering: ``target >= red_line`` ⇒ higher-is-better, + else lower-is-better (e.g. max-drawdown, ECE). + + The four N/A conditions take precedence in order: not-implemented → + not-run → missing-input → low-N. Above the floor, status follows the value + and (when provided) the confidence interval relative to target/red-line. 
+ """ + if not implemented: + return "N/A-NOT-IMPL" + if not ran: + return "N/A-NOT-RUN" + if not input_present: + return "N/A-MISSING-INPUT" + if value is None or n_samples is None or n_samples < 0.5 * n_floor: + return "N/A-LOW-N" + + higher_is_better = target is None or red_line is None or target >= red_line + + def _at_or_better(a: float, b: float) -> bool: + return a >= b if higher_is_better else a <= b + + def _at_or_worse(a: float, b: float) -> bool: + return a <= b if higher_is_better else a >= b + + # RED: value at/below the red-line, or the CI sits entirely on the bad side. + if red_line is not None: + if _at_or_worse(value, red_line): + return "RED" + bad_bound = ci_low if higher_is_better else ci_high + if bad_bound is not None and _at_or_worse(bad_bound, red_line): + return "RED" + + # Between half-floor and floor: CI too wide to claim GREEN regardless. + if n_samples < n_floor: + return "WATCH" + + if target is not None: + if _at_or_better(value, target): + # GREEN needs the whole CI clear of the red-line (when both known). + good_bound = ci_low if higher_is_better else ci_high + if red_line is not None and good_bound is not None and _at_or_worse(good_bound, red_line): + return "WATCH" + return "GREEN" + return "WATCH" + + # No target given: above red-line with adequate N ⇒ GREEN. 
+ return "GREEN" diff --git a/src/alpha_engine_lib/quant/stats/__init__.py b/src/alpha_engine_lib/quant/stats/__init__.py index 85b748a..b4d10e9 100644 --- a/src/alpha_engine_lib/quant/stats/__init__.py +++ b/src/alpha_engine_lib/quant/stats/__init__.py @@ -11,6 +11,7 @@ - ``information_coefficient`` — Spearman rank IC of conviction vs forward return - ``expectancy`` — hit-rate × win/loss decomposition - ``multiple_testing`` — Benjamini-Hochberg FDR correction + - ``intervals`` — bootstrap CI, Newey-West SE, Wilson score interval - ``risk_matched_benchmark`` — EW-high-vol + beta-matched-SPY baselines + IR - ``regime_sortino`` — regime-stratified cross-sectional pick-alpha Sortino @@ -18,6 +19,7 @@ from alpha_engine_lib.quant.stats.dsr import compute_dsr from alpha_engine_lib.quant.stats.multiple_testing import benjamini_hochberg + from alpha_engine_lib.quant.stats.intervals import bootstrap_ci, wilson_score_interval """ from __future__ import annotations diff --git a/src/alpha_engine_lib/quant/stats/intervals.py b/src/alpha_engine_lib/quant/stats/intervals.py new file mode 100644 index 0000000..6f9c0a3 --- /dev/null +++ b/src/alpha_engine_lib/quant/stats/intervals.py @@ -0,0 +1,220 @@ +"""intervals — Bootstrap confidence intervals, Newey-West SE, Wilson score intervals. + +The three inference primitives the System Report Card v2 metric records require +(``MetricRecord.ci_method`` ∈ {``bootstrap``, ``newey-west``, ``wilson``}): + + - ``bootstrap_ci`` — percentile bootstrap CI for any statistic of a + sample (default the mean). The general-purpose + CI for ICs, lifts, hit-rates, Sharpe point + estimates where no closed form is convenient. + - ``newey_west_se`` — heteroskedasticity-and-autocorrelation-consistent + (HAC) standard error of a series mean, for + autocorrelated daily P&L where the iid SE + understates uncertainty. 
+ - ``wilson_score_interval`` — Wilson score binomial interval for rates with + small N (veto-gate precision/recall, hit-rate), + which the normal-approximation interval handles + badly near 0/1. + +Pure-compute; no I/O. bootstrap + Newey-West need numpy (install +``alpha-engine-lib[quant]``); Wilson is stdlib-only (``statistics.NormalDist``). + +Reference: López de Prado, *Advances in Financial Machine Learning* (bootstrap ++ HAC); Wilson (1927) "Probable Inference, the Law of Succession, and +Statistical Inference". +""" + +from __future__ import annotations + +import math +from statistics import NormalDist +from typing import Callable, Sequence, TypedDict + +import numpy as np + +_DEFAULT_N_RESAMPLES = 1000 + + +class BootstrapCIResult(TypedDict, total=False): + status: str # "ok" | "insufficient_data" + n: int # observations after NaN drop + estimate: float # statistic on the full sample + ci_low: float + ci_high: float + ci_level: float # e.g. 0.95 + method: str # "bootstrap" + n_resamples: int + + +class NeweyWestResult(TypedDict, total=False): + status: str # "ok" | "insufficient_data" + n: int + estimate: float # sample mean + se: float # HAC standard error of the mean + lags: int # Bartlett-kernel lags used + method: str # "newey-west" + + +class WilsonScoreResult(TypedDict, total=False): + status: str # "ok" | "insufficient_data" + n: int # trials + successes: int + rate: float # successes / trials (point estimate) + estimate: float # alias of rate (uniform with the other results) + ci_low: float + ci_high: float + ci_level: float + method: str # "wilson" + + +def _clean(data: Sequence[float] | np.ndarray) -> np.ndarray: + """Coerce to a 1-D float array with NaN/inf dropped.""" + arr = np.asarray(data, dtype=float).ravel() + return arr[np.isfinite(arr)] + + +def bootstrap_ci( + data: Sequence[float] | np.ndarray, + statistic: Callable[[np.ndarray], float] = np.mean, + *, + ci_level: float = 0.95, + n_resamples: int = _DEFAULT_N_RESAMPLES, + seed: int = 0, 
+) -> BootstrapCIResult: + """Percentile bootstrap confidence interval for ``statistic`` of ``data``. + + Args: + data: 1-D sample of observations (NaN/inf dropped). + statistic: Callable ``(np.ndarray) -> float`` to bootstrap. Defaults to + the mean. + ci_level: Confidence level in (0, 1) (default 0.95). + n_resamples: Number of bootstrap resamples (default 1000). + seed: RNG seed for reproducibility (the report card must be stable + across re-renders of the same cycle). + + Returns: + A :class:`BootstrapCIResult`. ``status == "insufficient_data"`` when + fewer than 2 finite observations remain. + """ + arr = _clean(data) + n = int(arr.size) + if n < 2: + return {"status": "insufficient_data", "n": n} + + estimate = float(statistic(arr)) + rng = np.random.default_rng(seed) + idx = rng.integers(0, n, size=(n_resamples, n)) + boot = np.fromiter( + (statistic(arr[row]) for row in idx), dtype=float, count=n_resamples + ) + boot = boot[np.isfinite(boot)] + if boot.size == 0: + return {"status": "insufficient_data", "n": n} + + tail = (1.0 - ci_level) / 2.0 + ci_low = float(np.percentile(boot, 100.0 * tail)) + ci_high = float(np.percentile(boot, 100.0 * (1.0 - tail))) + return { + "status": "ok", + "n": n, + "estimate": estimate, + "ci_low": ci_low, + "ci_high": ci_high, + "ci_level": ci_level, + "method": "bootstrap", + "n_resamples": int(boot.size), + } + + +def _auto_lags(n: int) -> int: + """Newey-West (1994) automatic lag selection: floor(4·(n/100)^(2/9)).""" + return int(math.floor(4.0 * (n / 100.0) ** (2.0 / 9.0))) + + +def newey_west_se( + series: Sequence[float] | np.ndarray, + *, + max_lags: int | None = None, +) -> NeweyWestResult: + """HAC (Newey-West, Bartlett kernel) standard error of the series mean. + + For autocorrelated series (daily P&L), the iid SE ``s/√n`` understates + uncertainty. The HAC estimator inflates the long-run variance by the + Bartlett-weighted autocovariances up to ``max_lags``. + + Args: + series: 1-D series (NaN/inf dropped). 
+ max_lags: Bartlett-kernel lag truncation. ``None`` ⇒ the Newey-West + (1994) rule ``floor(4·(n/100)^(2/9))``. Clamped to ``[0, n-1]``. + + Returns: + A :class:`NeweyWestResult`. ``status == "insufficient_data"`` for n < 2. + """ + x = _clean(series) + n = int(x.size) + if n < 2: + return {"status": "insufficient_data", "n": n} + + lags = _auto_lags(n) if max_lags is None else int(max_lags) + lags = max(0, min(lags, n - 1)) + + e = x - x.mean() + gamma0 = float(np.dot(e, e) / n) + lrv = gamma0 + for j in range(1, lags + 1): + weight = 1.0 - j / (lags + 1.0) + gamma_j = float(np.dot(e[j:], e[:-j]) / n) + lrv += 2.0 * weight * gamma_j + lrv = max(lrv, 0.0) # Bartlett kernel guarantees PSD; clamp float error. + se = math.sqrt(lrv / n) + return { + "status": "ok", + "n": n, + "estimate": float(x.mean()), + "se": se, + "lags": lags, + "method": "newey-west", + } + + +def wilson_score_interval( + successes: int, + trials: int, + *, + ci_level: float = 0.95, +) -> WilsonScoreResult: + """Wilson score interval for a binomial proportion. + + Preferred over the normal-approximation (Wald) interval for small ``trials`` + or rates near 0/1, where Wald produces bounds outside [0, 1] and undercovers. + + Args: + successes: Count of successes (0 ≤ successes ≤ trials). + trials: Total trials (> 0). + ci_level: Confidence level in (0, 1) (default 0.95). + + Returns: + A :class:`WilsonScoreResult`. ``status == "insufficient_data"`` for + ``trials <= 0``. Bounds are clamped to [0, 1]. 
+ """ + if trials <= 0: + return {"status": "insufficient_data", "n": int(max(trials, 0))} + successes = max(0, min(int(successes), int(trials))) + + z = NormalDist().inv_cdf(1.0 - (1.0 - ci_level) / 2.0) + p = successes / trials + z2 = z * z + denom = 1.0 + z2 / trials + center = (p + z2 / (2.0 * trials)) / denom + margin = (z / denom) * math.sqrt(p * (1.0 - p) / trials + z2 / (4.0 * trials * trials)) + return { + "status": "ok", + "n": int(trials), + "successes": int(successes), + "rate": p, + "estimate": p, + "ci_low": max(0.0, center - margin), + "ci_high": min(1.0, center + margin), + "ci_level": ci_level, + "method": "wilson", + } diff --git a/tests/test_metrics.py b/tests/test_metrics.py new file mode 100644 index 0000000..83fc4da --- /dev/null +++ b/tests/test_metrics.py @@ -0,0 +1,122 @@ +"""Tests for alpha_engine_lib.metrics — MetricRecord contract + status derivation.""" + +from datetime import datetime, timezone + +import pytest +from pydantic import ValidationError + +from alpha_engine_lib.metrics import ( + MetricRecord, + derive_letter, + derive_status, + derive_trend_decoration, +) + + +def _record(**overrides): + base = dict( + name="predictor_meta_l2_ic", + module="predictor", + metric_type="ic", + n_floor=100, + status="GREEN", + status_reason="L2 IC 0.48 (CI [0.21,0.74], N=626) above target 0.05.", + source_path="s3://alpha-engine-research/predictor/metrics/latest.json#l2_ic", + last_updated_utc=datetime(2026, 6, 4, tzinfo=timezone.utc), + ) + base.update(overrides) + return MetricRecord(**base) + + +class TestMetricRecord: + def test_minimal_valid_record(self): + r = _record(value=0.48, n_samples=626) + assert r.module == "predictor" and r.value == 0.48 + assert r.trend_decoration == "→" # default + assert r.is_na is False + + def test_na_status_flags_is_na(self): + assert _record(status="N/A-LOW-N").is_na is True + + def test_extra_fields_allowed(self): + r = _record(value=0.1, n_samples=200, future_field="ok") + assert r.value == 0.1 + + def 
test_bad_status_rejected(self): + with pytest.raises(ValidationError): + _record(status="MAYBE") + + def test_bad_metric_type_rejected(self): + with pytest.raises(ValidationError): + _record(metric_type="vibes") + + +class TestDeriveTrendDecoration: + def test_sustained_up_and_down(self): + assert derive_trend_decoration([1, 2, 3, 4]) == "↑↑" + assert derive_trend_decoration([4, 3, 2, 1]) == "↓↓" + + def test_flat_and_too_short(self): + assert derive_trend_decoration([2, 2, 2, 2]) == "→" + assert derive_trend_decoration([1]) == "→" + assert derive_trend_decoration(None) == "→" + + def test_recent_improvement(self): + # down then up-up: not sustained, but recent 2 improved. + assert derive_trend_decoration([5, 1, 2, 3]) == "↑" + + def test_higher_is_better_false_flips(self): + # decreasing values are an improvement when lower is better (e.g. drawdown). + assert derive_trend_decoration([4, 3, 2, 1], higher_is_better=False) == "↑↑" + + +class TestDeriveLetter: + def test_status_to_letter(self): + assert derive_letter("GREEN") == "A" + assert derive_letter("WATCH") == "C" + assert derive_letter("RED") == "F" + assert derive_letter("N/A-NOT-IMPL") == "N/A" + + +class TestDeriveStatus: + def test_na_precedence(self): + assert derive_status(value=0.5, n_samples=200, n_floor=100, implemented=False) == "N/A-NOT-IMPL" + assert derive_status(value=0.5, n_samples=200, n_floor=100, ran=False) == "N/A-NOT-RUN" + assert derive_status(value=0.5, n_samples=200, n_floor=100, input_present=False) == "N/A-MISSING-INPUT" + + def test_low_n(self): + assert derive_status(value=0.5, n_samples=40, n_floor=100) == "N/A-LOW-N" + assert derive_status(value=None, n_samples=200, n_floor=100) == "N/A-LOW-N" + + def test_red_when_at_or_below_red_line(self): + s = derive_status(value=-0.01, n_samples=200, n_floor=100, target=0.05, red_line=0.0) + assert s == "RED" + + def test_red_when_ci_entirely_below_red_line(self): + s = derive_status( + value=0.02, n_samples=200, n_floor=100, target=0.05, 
red_line=0.0, + ci_low=-0.03, ci_high=-0.01, + ) + assert s == "RED" + + def test_green_when_above_target_with_clear_ci(self): + s = derive_status( + value=0.48, n_samples=626, n_floor=100, target=0.05, red_line=0.0, + ci_low=0.21, ci_high=0.74, + ) + assert s == "GREEN" + + def test_watch_between_half_floor_and_floor(self): + # N=70 is above 0.5*floor (50) but below floor (100) → WATCH regardless of value. + s = derive_status(value=0.9, n_samples=70, n_floor=100, target=0.05, red_line=0.0) + assert s == "WATCH" + + def test_watch_below_target_above_red_line(self): + s = derive_status(value=0.02, n_samples=200, n_floor=100, target=0.05, red_line=0.0) + assert s == "WATCH" + + def test_lower_is_better_direction(self): + # max_drawdown: target 0.15, red_line 0.25 (lower is better). + assert derive_status(value=0.08, n_samples=200, n_floor=60, target=0.15, red_line=0.25) == "GREEN" + assert derive_status(value=0.30, n_samples=200, n_floor=60, target=0.15, red_line=0.25) == "RED" + assert derive_status(value=0.20, n_samples=200, n_floor=60, target=0.15, red_line=0.25) == "WATCH" diff --git a/tests/test_quant_stats_intervals.py b/tests/test_quant_stats_intervals.py new file mode 100644 index 0000000..fb25804 --- /dev/null +++ b/tests/test_quant_stats_intervals.py @@ -0,0 +1,85 @@ +"""Tests for alpha_engine_lib.quant.stats.intervals — bootstrap CI, Newey-West SE, Wilson.""" + +import math + +import numpy as np + +from alpha_engine_lib.quant.stats.intervals import ( + bootstrap_ci, + newey_west_se, + wilson_score_interval, +) + + +class TestBootstrapCI: + def test_insufficient_data(self): + assert bootstrap_ci([]) == {"status": "insufficient_data", "n": 0} + assert bootstrap_ci([4.2])["status"] == "insufficient_data" + + def test_constant_sample_has_zero_width(self): + out = bootstrap_ci([5.0, 5.0, 5.0, 5.0]) + assert out["status"] == "ok" + assert out["estimate"] == 5.0 + assert out["ci_low"] == 5.0 and out["ci_high"] == 5.0 + + def 
test_ci_brackets_estimate_and_is_deterministic(self): + data = list(range(100)) + a = bootstrap_ci(data, seed=7) + b = bootstrap_ci(data, seed=7) + assert a == b # seeded → reproducible (report card must be stable) + assert a["ci_low"] <= a["estimate"] <= a["ci_high"] + assert a["estimate"] == float(np.mean(data)) + assert a["ci_level"] == 0.95 and a["method"] == "bootstrap" + + def test_nan_dropped(self): + out = bootstrap_ci([1.0, 2.0, float("nan"), 3.0]) + assert out["n"] == 3 + + def test_custom_statistic(self): + out = bootstrap_ci([1.0, 2.0, 3.0, 4.0, 5.0], statistic=np.median, seed=1) + assert out["status"] == "ok" + assert out["estimate"] == 3.0 + + +class TestNeweyWestSE: + def test_insufficient_data(self): + assert newey_west_se([2.0])["status"] == "insufficient_data" + + def test_zero_lag_matches_iid_se(self): + # [1..5]: mean 3, gamma0 = 10/5 = 2, se = sqrt(2/5). + out = newey_west_se([1.0, 2.0, 3.0, 4.0, 5.0], max_lags=0) + assert out["estimate"] == 3.0 + assert out["lags"] == 0 + assert out["se"] == math.sqrt(0.4) + + def test_lags_clamped_to_n_minus_1(self): + out = newey_west_se([1.0, 2.0, 3.0], max_lags=99) + assert out["lags"] == 2 + + def test_auto_lags_nonnegative(self): + out = newey_west_se([float(x) for x in range(200)]) + assert out["status"] == "ok" + assert out["lags"] >= 0 + assert out["se"] >= 0.0 + + +class TestWilsonScoreInterval: + def test_insufficient_data(self): + assert wilson_score_interval(0, 0)["status"] == "insufficient_data" + + def test_known_50_of_100(self): + # Textbook Wilson 95% interval for 50/100 is [0.4038, 0.5962]. 
+ out = wilson_score_interval(50, 100) + assert out["rate"] == 0.5 + assert abs(out["ci_low"] - 0.4038) < 1e-3 + assert abs(out["ci_high"] - 0.5962) < 1e-3 + + def test_bounds_clamped_to_unit_interval(self): + lo = wilson_score_interval(0, 10) + assert lo["ci_low"] == 0.0 and 0.0 < lo["ci_high"] < 1.0 + hi = wilson_score_interval(10, 10) + assert hi["ci_high"] == 1.0 and 0.0 < hi["ci_low"] < 1.0 + + def test_successes_clamped_to_trials(self): + out = wilson_score_interval(15, 10) + assert out["successes"] == 10 and out["rate"] == 1.0 From eee5a5418803592118f15e6deead2b6c9ddf675a Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Thu, 4 Jun 2026 09:30:14 -0700 Subject: [PATCH 2/2] test(intervals): tolerance on Wilson boundary bounds (float residual, not exact 0/1) CI surfaced wilson_score_interval(0,10) returning ci_low=2.78e-17 (legitimate float noise from the [0,1] clamp; true bound is 0). Assert pytest.approx instead of exact equality. Local numpy happened to compute exact 0.0; CI did not. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_quant_stats_intervals.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_quant_stats_intervals.py b/tests/test_quant_stats_intervals.py index fb25804..1b2003a 100644 --- a/tests/test_quant_stats_intervals.py +++ b/tests/test_quant_stats_intervals.py @@ -3,6 +3,7 @@ import math import numpy as np +import pytest from alpha_engine_lib.quant.stats.intervals import ( bootstrap_ci, @@ -75,10 +76,13 @@ def test_known_50_of_100(self): assert abs(out["ci_high"] - 0.5962) < 1e-3 def test_bounds_clamped_to_unit_interval(self): + # successes=0 → true lower bound is 0 (residual float noise ≈ 1e-17). 
lo = wilson_score_interval(0, 10) - assert lo["ci_low"] == 0.0 and 0.0 < lo["ci_high"] < 1.0 + assert lo["ci_low"] == pytest.approx(0.0, abs=1e-9) + assert 0.0 < lo["ci_high"] < 1.0 hi = wilson_score_interval(10, 10) - assert hi["ci_high"] == 1.0 and 0.0 < hi["ci_low"] < 1.0 + assert hi["ci_high"] == pytest.approx(1.0, abs=1e-9) + assert 0.0 < hi["ci_low"] < 1.0 def test_successes_clamped_to_trials(self): out = wilson_score_interval(15, 10)