diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 95bebaf..ee881af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -46,9 +46,10 @@ jobs: env: IPAX_BACKENDS: numpy,torch,array_api_strict run: pytest -q --cov=ipax --cov-report=xml -n auto - - uses: codecov/codecov-action@v4 + - uses: codecov/codecov-action@v5 with: files: coverage.xml + token: ${{ secrets.CODECOV_TOKEN }} if: always() qc: diff --git a/CHANGELOG.md b/CHANGELOG.md index d9fca30..144d8cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,29 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ## [Unreleased] +## [0.2.0] - 2026-06-21 + +### Added +- GPU/device-efficiency profiling harness: `DeviceMetrics` and + `measure_device_solve` in `benchmarks/harness` (host↔device sync counter plus a + CuPy GPU/CPU time split), the `benchmarks/runners/device_efficiency.py` CLI + runner (GPU-gated, a no-op on CI), and kernel micro-benchmarks (matvec / dense + solve / one Newton step) parametrized over every installed backend. +- Device reporting: backend `capabilities()` now discovers available devices via + the Array-API inspection API (`__array_namespace_info__().devices()`) instead of + assuming CPU, and `Result.device` records where the solve ran — surfaced in the + tier-1 result summary. + +### Changed +- Vectorized `fraction_to_boundary`, removing a per-element Python loop that forced + `O(n)` host↔device synchronizations per call (and it is called six times per + iteration). On a CUDA backend this cut the per-iteration sync count from + thousands — scaling linearly with `n` — to a small constant, speeding up + matrix-free GPU iterations by roughly 20× at scale; iterates and results are + unchanged (CPU behavior is identical). +- CI uploads coverage with `codecov/codecov-action@v5` using a repository + `CODECOV_TOKEN`. + ## [0.1.1] - 2026-06-21 ### Fixed @@ -62,6 +85,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Contract batteries (`tests/contracts/`) plus unit/property/integration/backends/ regression layers; benchmark suite (`benchmarks/`, asv); MkDocs documentation. -[Unreleased]: https://github.com/wahln/ipax/compare/v0.1.1...HEAD +[Unreleased]: https://github.com/wahln/ipax/compare/v0.2.0...HEAD +[0.2.0]: https://github.com/wahln/ipax/compare/v0.1.1...v0.2.0 [0.1.1]: https://github.com/wahln/ipax/compare/v0.1.0...v0.1.1 [0.1.0]: https://github.com/wahln/ipax/releases/tag/v0.1.0 diff --git a/README.md b/README.md index 0ff611b..4da0ae4 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![PyPI version](https://img.shields.io/pypi/v/ipax.svg)](https://pypi.org/project/ipax/) [![CI](https://github.com/wahln/ipax/actions/workflows/ci.yml/badge.svg)](https://github.com/wahln/ipax/actions/workflows/ci.yml) +[![codecov](https://codecov.io/gh/wahln/ipax/branch/main/graph/badge.svg)](https://codecov.io/gh/wahln/ipax) [![Documentation](https://readthedocs.org/projects/ipax/badge/?version=latest)](https://ipax.readthedocs.io/en/latest/) [![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/) [![License: Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py index 0312202..a78bcfd 100644 --- a/benchmarks/harness/__init__.py +++ b/benchmarks/harness/__init__.py @@ -68,14 +68,27 @@ def capture_environment() -> dict[str, object]: "machine": platform.machine(), "ipax": getattr(ipax, "__version__", "unknown"), } - for pkg in ("numpy", "scipy", "torch"): + for pkg in ("numpy", "scipy", "torch", "cupy", "jax"): try: env[pkg] = __import__(pkg).__version__ except Exception: # optional backend, record absence env[pkg] = None + env["gpu"] = _detect_gpu_name() return env +def _detect_gpu_name() -> str | None: + """Best-effort CUDA device name for the report header (None if no GPU).""" + try: + import cupy as cp + + props = cp.cuda.runtime.getDeviceProperties(0) + name = props["name"] + return name.decode() if isinstance(name, bytes) else str(name) + except Exception: # no CuPy / no CUDA device + return None + + def _inf_norm(xp: Namespace, a: object, b: object) -> float: return float(xp.max(xp.abs(a - b))) @@ -468,16 +481,241 @@ def _exp(value: float) -> str: return "—" if not math.isfinite(value) else f"{value:.2f}" +# -- device-efficiency study (GPU profiling) --------------------------------- +# +# The iterative IPM loop is where Array-API GPU performance is won or lost: every +# ``float()``/``bool()`` on a 0-d device array forces a host<->device sync that +# serializes the GPU. This study measures, per solve, the **host-sync count** and +# **wall time per iteration** (plus the GPU-vs-CPU time split on CuPy) so the +# no-cost optimization — consolidating per-iteration scalar reads to one sync — +# can be quantified before and after. Backend specifics (CuPy/Torch timers) live +# here in ``benchmarks/``, never in the ``ipax/`` core (invariant #1). + + +@dataclass(frozen=True) +class DeviceMetrics: + """Host-sync / timing profile for one ``(backend, route, n)`` solve.""" + + backend: str + device: str + route: str + n_vars: int + n_iter: int + success: bool + solve_time: float # total wall (s), device-synchronized at both ends + time_per_iter: float # solve_time / n_iter + gpu_time: float | None # measured GPU compute time (s); CuPy only + host_syncs: int | None # device->host scalar materializations during solve + syncs_per_iter: float | None # host_syncs / n_iter (the headline metric) + peak_device_mb: float | None # device-memory high-water (CuPy/Torch-CUDA) + kkt_error: float + + +class _ScalarSyncCounter: + """Count device->host scalar materializations during a solve. + + Patches the array type's scalar dunders (``__float__``/``__int__``/ + ``__bool__``/``__index__``/``item``) for the measurement window. Each forces a + host sync on a GPU backend, so the tally is the per-solve host-sync total the + driver-loop optimization targets. Built-in array types that forbid attribute + assignment (NumPy's ``ndarray``) make counting unavailable — :attr:`result` + is then ``None`` (on CPU a host scalar read is free anyway). + """ + + _NAMES = ("__float__", "__int__", "__bool__", "__index__", "item") + + def __init__(self, array_type: type) -> None: + self._type = array_type + self._orig: dict[str, object] = {} + self.count = 0 + self.available = True + + def __enter__(self) -> _ScalarSyncCounter: + counter = self + for name in self._NAMES: + orig = getattr(self._type, name, None) + if orig is None: + continue + + def make(orig: object): + def wrapped(self, *args, **kwargs): + counter.count += 1 + return orig(self, *args, **kwargs) # type: ignore[operator] + + return wrapped + + try: + setattr(self._type, name, make(orig)) + except TypeError: # built-in/extension type (NumPy) — cannot patch + self.available = False + self._restore() + break + self._orig[name] = orig + return self + + def _restore(self) -> None: + for name, orig in self._orig.items(): + setattr(self._type, name, orig) + self._orig.clear() + + def __exit__(self, *exc: object) -> None: + self._restore() + + @property + def result(self) -> int | None: + return self.count if self.available else None + + +def _sync_device(backend: str) -> None: + """Block until the device finishes queued work (no-op off GPU).""" + if backend == "cupy": + import cupy as cp + + cp.cuda.Device().synchronize() + elif backend == "torch": + import torch + + if torch.cuda.is_available(): + torch.cuda.synchronize() + + +def _reset_device_memory(backend: str) -> None: + """Reset the device-memory high-water mark before a measured solve.""" + if backend == "cupy": + import cupy as cp + + cp.get_default_memory_pool().free_all_blocks() + elif backend == "torch": + import torch + + if torch.cuda.is_available(): + torch.cuda.reset_peak_memory_stats() + + +def _peak_device_mb(backend: str) -> float | None: + """Device-memory high-water mark in MB (None off GPU).""" + if backend == "cupy": + import cupy as cp + + return cp.get_default_memory_pool().total_bytes() / 1e6 + if backend == "torch": + import torch + + if torch.cuda.is_available(): + return torch.cuda.max_memory_allocated() / 1e6 + return None + + +def _gpu_time(backend: str, fn: object) -> float | None: + """Measured GPU compute time of ``fn`` in seconds (CuPy only, else None). + + Uses ``cupyx.profiler.benchmark`` (one extra run), whose ``gpu_times`` is the + on-stream device time. A large gap between wall time and this value is the + signature of a host-sync-bound loop. + """ + if backend != "cupy": + return None + try: + from cupyx.profiler import benchmark + + measured = benchmark(fn, n_repeat=1, n_warmup=0) # type: ignore[arg-type] + return float(measured.gpu_times.mean()) + except Exception: # profiler unavailable / non-idempotent call + return None + + +def measure_device_solve( + problem: object, + x0: object, + options: ipax.Options, + *, + backend: str, + route: str, + warmup: bool = True, +) -> DeviceMetrics: + """Profile one solve for host-sync count and per-iteration timing. + + ``warmup`` runs (and discards) one solve first so device handle/allocator + init and any JIT cost stay out of the measured run. The measured solve is + device-synchronized at both ends so ``solve_time`` is true end-to-end wall. + """ + if warmup: + ipax.solve(problem, x0, options=options) # type: ignore[arg-type] + + _reset_device_memory(backend) + _sync_device(backend) + with _ScalarSyncCounter(type(x0)) as counter: + start = perf_counter() + result = ipax.solve(problem, x0, options=options) # type: ignore[arg-type] + _sync_device(backend) + wall = perf_counter() - start + host_syncs = counter.result + + n_iter = result.n_iter + gpu_time = _gpu_time(backend, lambda: ipax.solve(problem, x0, options=options)) + peak = _peak_device_mb(backend) + return DeviceMetrics( + backend=backend, + device=result.device or "cpu", + route=route, + n_vars=int(problem.n_vars), # type: ignore[attr-defined] + n_iter=n_iter, + success=result.success, + solve_time=wall, + time_per_iter=wall / n_iter if n_iter else float("nan"), + gpu_time=gpu_time, + host_syncs=host_syncs, + syncs_per_iter=( + host_syncs / n_iter if (host_syncs is not None and n_iter) else None + ), + peak_device_mb=peak, + kkt_error=result.kkt_error, + ) + + +def format_device(metrics: list[DeviceMetrics], environment: dict[str, object]) -> str: + """Render the device-efficiency study as Markdown (syncs/iter + timing).""" + lines = [ + "# ipax device-efficiency study", + "", + f"- generated: `{environment.get('timestamp')}`", + f"- gpu: `{environment.get('gpu')}`", + f"- cupy `{environment.get('cupy')}` · torch `{environment.get('torch')}`", + "- headline metric: **host syncs / iter** " + "(device->host scalar reads in the loop).", + "", + "| backend | device | route | n | iters | wall (s) | s/iter " + "| syncs | syncs/iter | gpu (s) | peak MB | kkt |", + "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |", + ] + for m in sorted(metrics, key=lambda m: (m.backend, m.route, m.n_vars)): + lines.append( + f"| {m.backend} | {m.device} | {m.route} | {m.n_vars} | {m.n_iter} " + f"| {m.solve_time:.3f} | {_fmt(m.time_per_iter)} " + f"| {_fmt_count(m.host_syncs)} | {_fmt_opt(m.syncs_per_iter)} " + f"| {_fmt_opt(m.gpu_time)} | {_fmt_opt(m.peak_device_mb)} " + f"| {_fmt(m.kkt_error)} |" + ) + return "\n".join(lines) + "\n" + + +def _fmt_count(value: int | None) -> str: + return "—" if value is None else str(value) + + __all__ = [ "CaseResult", "CrossCheckResult", + "DeviceMetrics", "ScalingPoint", "capture_environment", "cross_check", "fit_exponent", "format_crosscheck", + "format_device", "format_markdown", "format_scaling", + "measure_device_solve", "measure_solve", "run_case", "to_payload", diff --git a/benchmarks/runners/device_efficiency.py b/benchmarks/runners/device_efficiency.py new file mode 100644 index 0000000..79fc59e --- /dev/null +++ b/benchmarks/runners/device_efficiency.py @@ -0,0 +1,126 @@ +"""Device-efficiency study runner (GPU profiling). + +Profiles the IPM loop for **host<->device sync count** and **wall time per +iteration** (plus the GPU-vs-CPU time split on CuPy) across solver routes and +problem sizes on the synthetic RT-like generator. The headline metric is +``syncs/iter``: every ``float()``/``bool()`` on a 0-d device array in the driver +loop forces a host sync that serializes the GPU, so this is the number the +no-cost loop optimization (consolidating per-iteration scalar reads) drives down. + +Run from the repository root on a CUDA machine:: + + python -m benchmarks.runners.device_efficiency \ + --backends cupy --routes krylov,sparse --sizes 1000,10000,100000 + +GPU-gated: a requested backend that is not installed is skipped with a note, so +this is a **no-op on CI** (no CuPy) and real work on a local GPU. Exits non-zero +if any solve fails. For kernel-launch counts, wrap the same command in an external +profiler:: + + nsys profile -o device python -m benchmarks.runners.device_efficiency --backends cupy ... +""" + +from __future__ import annotations + +import argparse +import json +from dataclasses import asdict +from pathlib import Path + +import ipax +from benchmarks.harness import ( + DeviceMetrics, + capture_environment, + format_device, + measure_device_solve, +) +from ipax.backend.namespace import capabilities +from ipax.testing.backends import import_namespace +from ipax.testing.problems import make_rt_like_problem + +_ROUTES: dict[str, ipax.Options] = { + "dense": ipax.Options(hessian="exact", linsolve="dense"), + "krylov": ipax.Options(hessian="exact", linsolve="krylov"), + "sparse": ipax.Options(hessian="exact", linsolve="sparse"), +} + + +def run_device_efficiency( + backends: list[str], routes: list[str], sizes: list[int] +) -> list[DeviceMetrics]: + """Profile each installed backend over the route × size grid.""" + metrics: list[DeviceMetrics] = [] + for backend in backends: + try: + xp = import_namespace(backend) + except ImportError: + print(f" skip backend {backend!r}: not installed") + continue + + has_sparse = capabilities(xp).has_sparse_adapter + for route in routes: + if route == "sparse" and not has_sparse: + print(f" skip {backend}/sparse: no sparse adapter") + continue + options = _ROUTES[route] + for n in sizes: + problem = make_rt_like_problem( + xp, n, n_structures=6, density=0.1, seed=0 + ) + x0 = xp.full((n,), 0.01, dtype=xp.float64) + metrics.append( + measure_device_solve( + problem, x0, options, backend=backend, route=route + ) + ) + return metrics + + +def _parse_int_list(text: str) -> list[int]: + return [int(x) for x in text.split(",") if x.strip()] + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="ipax device-efficiency study") + parser.add_argument( + "--backends", default="cupy", help="comma-separated backends (default: cupy)" + ) + parser.add_argument( + "--routes", default="krylov,sparse", help="comma-separated routes" + ) + parser.add_argument( + "--sizes", default="1000,10000,100000", help="comma-separated n" + ) + parser.add_argument("--out", default="benchmarks/reports/device") + args = parser.parse_args(argv) + + backends = [b.strip() for b in args.backends.split(",") if b.strip()] + routes = [r.strip() for r in args.routes.split(",") if r.strip()] + sizes = _parse_int_list(args.sizes) + unknown = [r for r in routes if r not in _ROUTES] + if unknown: + parser.error(f"unknown route(s): {', '.join(unknown)}") + + environment = capture_environment() + metrics = run_device_efficiency(backends, routes, sizes) + + out = Path(args.out) + out.parent.mkdir(parents=True, exist_ok=True) + payload = {"environment": environment, "metrics": [asdict(m) for m in metrics]} + out.with_suffix(".json").write_text(json.dumps(payload, indent=2)) + out.with_suffix(".md").write_text( + format_device(metrics, environment), encoding="utf-8" + ) + + failures = [m for m in metrics if not m.success] + print( + f"device-efficiency: {len(metrics) - len(failures)}/{len(metrics)} solved " + f"-> {out.with_suffix('.json')}, {out.with_suffix('.md')}" + ) + for m in failures: + print(f" FAIL {m.backend}/{m.route} n={m.n_vars}: kkt={m.kkt_error:.2e}") + return 1 if failures else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/benchmarks/runners/micro/bench_kernels.py b/benchmarks/runners/micro/bench_kernels.py index 368072e..2f626c2 100644 --- a/benchmarks/runners/micro/bench_kernels.py +++ b/benchmarks/runners/micro/bench_kernels.py @@ -1,18 +1,95 @@ -"""pytest-benchmark micro-benchmarks: matvec, factor, one Newton step (§9.4). - -Run with ``pytest benchmarks/runners/micro --benchmark-only``. -""" - -from __future__ import annotations - -import pytest - - -@pytest.mark.skip(reason="kernels") -def test_matvec_throughput(benchmark): - raise NotImplementedError - - -@pytest.mark.skip(reason="kernels") -def test_one_newton_step(benchmark): - raise NotImplementedError +"""pytest-benchmark micro-benchmarks: matvec, dense KKT solve, one Newton step. + +These isolate the per-kernel GPU cost (matmul, factor/solve) from the IPM loop +overhead measured by ``benchmarks.runners.device_efficiency``. Each benchmark is +parametrized over every installed backend (NumPy / Torch / CuPy), so the same +kernels are timed on CPU and — when CuPy is present — on the GPU. + +Run with:: + + pytest benchmarks/runners/micro --benchmark-only + pytest benchmarks/runners/micro --benchmark-only -k cupy # GPU kernels only + +Each callable ends with a scalar read so a lazy/async backend (CuPy) is forced to +synchronize *inside* the timed region — otherwise the benchmark would clock only +kernel-launch latency, not the kernel. +""" + +from __future__ import annotations + +import pytest + +from ipax.backend.operators import Dense +from ipax.linalg.dense import DenseSolver +from ipax.testing.backends import import_namespace + + +def _installed_backends() -> list[str]: + names: list[str] = [] + for name in ("numpy", "torch", "cupy"): + try: + import_namespace(name) + except ImportError: + continue + names.append(name) + return names + + +@pytest.fixture(params=_installed_backends(), ids=str) +def xp(request: pytest.FixtureRequest): + return import_namespace(request.param) + + +def _spd_matrix(xp, n: int): + """A symmetric positive-definite ``n×n`` matrix (well-conditioned).""" + m = xp.reshape(xp.linspace(-1.0, 1.0, n * n, dtype=xp.float64), (n, n)) + return xp.matmul(m, xp.matrix_transpose(m)) + n * xp.eye(n, dtype=xp.float64) + + +def test_matvec_throughput(benchmark, xp): + """Dense operator matvec — the dominant matrix-free kernel.""" + n = 1024 + op = Dense(_spd_matrix(xp, n)) + v = xp.ones((n,), dtype=xp.float64) + + def run() -> float: + return float(xp.sum(op.matvec(v))) + + benchmark(run) + + +def test_dense_kkt_solve(benchmark, xp): + """Factor + solve of a dense SPD system — the condensed KKT linear solve.""" + n = 512 + operator = Dense(_spd_matrix(xp, n)) + rhs = xp.ones((n,), dtype=xp.float64) + solver = DenseSolver() + + def run() -> float: + solver.factor(operator) + return float(xp.sum(solver.solve(rhs))) + + benchmark(run) + + +def test_one_newton_step(benchmark, xp): + """One Newton-system solve: assemble RHS, factor, solve, recover (proxy). + + Times the linear-algebra heart of a single IPM iteration — a symmetric solve + plus the surrounding vector arithmetic — without the convergence/line-search + machinery, so a GPU regression here is a kernel regression, not loop overhead. + """ + n = 512 + operator = Dense(_spd_matrix(xp, n)) + grad = xp.linspace(0.1, 1.0, n, dtype=xp.float64) + sigma = xp.ones((n,), dtype=xp.float64) + solver = DenseSolver() + + def run() -> float: + rhs = -(grad + sigma) + solver.factor(operator) + dx = solver.solve(rhs) + dz = sigma * dx # representative recovery of an eliminated block + return float(xp.max(xp.abs(dx)) + xp.max(xp.abs(dz))) + + benchmark(run) diff --git a/ipax/__init__.py b/ipax/__init__.py index 37f3b9c..e249486 100644 --- a/ipax/__init__.py +++ b/ipax/__init__.py @@ -63,4 +63,4 @@ "solve", ] -__version__ = "0.1.1" +__version__ = "0.2.0" diff --git a/ipax/_logging.py b/ipax/_logging.py index 709ac5a..3aef35e 100644 --- a/ipax/_logging.py +++ b/ipax/_logging.py @@ -209,6 +209,7 @@ def format_result(result: Result) -> str: f" infeasibility = {result.constraint_violation:.3e}\n" f" solve time = {result.solve_time:.3e}s\n" f" linear solver = {result.linear_solver}\n" + f" device = {result.device}\n" f" derivatives = grad:{src.gradient} eq_jac:{src.eq_jacobian} " f"ineq_jac:{src.ineq_jacobian} hess:{src.hessian}" ) diff --git a/ipax/backend/namespace.py b/ipax/backend/namespace.py index 0158b15..c341282 100644 --- a/ipax/backend/namespace.py +++ b/ipax/backend/namespace.py @@ -103,6 +103,26 @@ class Capabilities: default_float: str # "float64" preferred; read from inputs in practice +def _detect_devices(xp: Namespace) -> tuple[str, ...]: + """Available devices via the Array-API info namespace, stringified. + + Uses ``xp.__array_namespace_info__().devices()`` (Array API 2023.12 inspection + API) so the probe stays inside the standard — no ``cupy.cuda``/``torch.cuda`` + in the core. Backends expose device objects (````, + ``device(type='cpu')``, ``'cpu'``); we stringify them for reporting. Falls + back to ``("cpu",)`` for namespaces lacking the inspection API. + """ + info_fn = getattr(xp, "__array_namespace_info__", None) + if info_fn is None: + return ("cpu",) + try: + devices = info_fn().devices() + except Exception: # best-effort probe; never fail capability detection + return ("cpu",) + names = tuple(str(d) for d in devices) + return names or ("cpu",) + + def capabilities(xp: Namespace) -> Capabilities: """Probe ``xp`` for optional Array-API features and adapter availability. @@ -131,7 +151,7 @@ def capabilities(xp: Namespace) -> Capabilities: linalg_functions=linalg_functions, has_sparse_adapter=name in _SPARSE_ADAPTER_BACKENDS, supports_autodiff=name in _AUTODIFF_BACKENDS, - devices=("cpu",), + devices=_detect_devices(xp), default_float=default_float, ) diff --git a/ipax/ipm/barrier.py b/ipax/ipm/barrier.py index 4fbf91b..b188da0 100644 --- a/ipax/ipm/barrier.py +++ b/ipax/ipm/barrier.py @@ -16,8 +16,11 @@ from __future__ import annotations +import math from typing import TYPE_CHECKING +from ipax.backend.namespace import array_namespace + if TYPE_CHECKING: from ipax.options import BarrierOptions from ipax.typing import Array @@ -43,11 +46,19 @@ def fraction_to_boundary(v: Array, dv: Array, tau: float) -> float: if not 0.0 < tau <= 1.0: raise ValueError("tau must be in (0, 1]") - alpha = 1.0 - for idx in range(int(v.shape[0])): - step = float(dv[idx]) - if step < 0.0: - alpha = min(alpha, tau * float(v[idx]) / (-step)) + if int(v.shape[0]) == 0: + return 1.0 + + # Only components moving toward the boundary (dv < 0) limit the step; for + # those alpha_i = tau * v_i / (-dv_i) (Wachter & Biegler 2006, eq. 15). + # Vectorized so the whole rule costs a single host<->device sync (the final + # float()) rather than one per element — the element-wise Python loop made + # this O(n) syncs/call and dominated GPU iteration time. + xp = array_namespace(v, dv) + blocking = dv < 0.0 + safe_denom = xp.where(blocking, -dv, xp.ones_like(dv)) + ratios = xp.where(blocking, tau * v / safe_denom, xp.full_like(v, math.inf)) + alpha = float(xp.min(ratios)) return max(0.0, min(1.0, alpha)) diff --git a/ipax/result.py b/ipax/result.py index 9abef8f..b23e7d3 100644 --- a/ipax/result.py +++ b/ipax/result.py @@ -156,6 +156,7 @@ class Result: constraint_violation: float = float("inf") solve_time: float = 0.0 # total wall-clock seconds for the solve linear_solver: str = "" # the linear solver used internally (e.g. "dense") + device: str = "" # device the solve ran on (e.g. "cpu", "") derivative_sources: DerivativeSources = field(default_factory=DerivativeSources) history: tuple[IterationRecord, ...] = () diff --git a/ipax/solve.py b/ipax/solve.py index 887b625..d36ae37 100644 --- a/ipax/solve.py +++ b/ipax/solve.py @@ -157,6 +157,7 @@ def solve( kkt_error=float("inf"), constraint_violation=_bound_violation(xp, x0, lower, upper), solve_time=perf_counter() - start_time, + device=_describe_device(x0), message="infeasible bounds: x_L > x_U", ) @@ -228,6 +229,7 @@ def transform_record(record: IterationRecord) -> IterationRecord: result, solve_time=perf_counter() - start_time, linear_solver=_describe_solver(solver), + device=_describe_device(result.x), ) # Post-solve summary (tier 1) and the timing split (tier 2). @@ -286,6 +288,16 @@ def _describe_solver(solver: object) -> str: return describe() if callable(describe) else type(solver).__name__ +def _describe_device(x: Array) -> str: + """Stringified device of the solution array (e.g. ``"cpu"``, CUDA device). + + Read from the array's standard ``.device`` attribute so it reflects where the + solve actually ran without importing any concrete backend. + """ + device = getattr(x, "device", None) + return "" if device is None else str(device) + + def _count_finite(xp: Namespace, bound: Array | None) -> int: """Number of finite entries in a bound vector (``None`` → 0).""" if bound is None: diff --git a/tests/integration/test_device_efficiency.py b/tests/integration/test_device_efficiency.py new file mode 100644 index 0000000..bd3fe5e --- /dev/null +++ b/tests/integration/test_device_efficiency.py @@ -0,0 +1,86 @@ +"""Smoke test for the device-efficiency profiling harness. + +Runs on CPU (the namespace fixture) so the host-sync counter, ``DeviceMetrics``, +the runner, and the report formatter stay correct as the solver evolves. The +host-sync *count* is unavailable on NumPy (its ``ndarray`` scalar dunders cannot +be patched), available on Torch — both paths are exercised. Real GPU numbers come +from running ``benchmarks.runners.device_efficiency`` on CUDA hardware. +""" + +from __future__ import annotations + +import json + +import ipax +from benchmarks.harness import ( + DeviceMetrics, + capture_environment, + format_device, + measure_device_solve, +) +from benchmarks.runners.device_efficiency import main, run_device_efficiency +from ipax.testing.problems import make_rt_like_problem + + +def test_measure_device_solve_populates_metrics(namespace, backend_name): + problem = make_rt_like_problem(namespace, 60, n_structures=4, density=0.2, seed=0) + x0 = namespace.full((60,), 0.01, dtype=namespace.float64) + + m = measure_device_solve( + problem, + x0, + ipax.Options(hessian="exact", linsolve="krylov"), + backend=backend_name, + route="krylov", + ) + + assert isinstance(m, DeviceMetrics) + assert m.success + assert m.n_iter > 0 + assert m.solve_time > 0.0 + assert m.time_per_iter > 0.0 + # NumPy's built-in ndarray cannot be patched -> count unavailable (None); + # Torch's Tensor can, so the loop's scalar reads are counted (> 0). + if backend_name == "numpy": + assert m.host_syncs is None + else: + assert m.host_syncs is not None and m.host_syncs > 0 + assert m.syncs_per_iter is not None and m.syncs_per_iter > 0 + + +def test_sync_counter_restores_dunders_after_window(namespace): + """The counter must not leave the array type permanently patched.""" + problem = make_rt_like_problem(namespace, 40, n_structures=4, density=0.2, seed=0) + x0 = namespace.full((40,), 0.01, dtype=namespace.float64) + before = namespace.asarray([2.5])[0] + assert float(before) == 2.5 + + measure_device_solve( + problem, + x0, + ipax.Options(hessian="exact", linsolve="dense"), + backend="numpy", + route="dense", + ) + + after = namespace.asarray([3.5])[0] + assert float(after) == 3.5 # dunder behaves normally post-window + + +def test_runner_produces_report(tmp_path): + metrics = run_device_efficiency(["numpy"], ["dense"], [40]) + assert metrics and all(m.success for m in metrics) + + env = capture_environment() + md = format_device(metrics, env) + assert "# ipax device-efficiency study" in md + assert "syncs/iter" in md + + out = tmp_path / "device" + rc = main( + ["--backends", "numpy", "--routes", "dense", "--sizes", "40", "--out", str(out)] + ) + assert rc == 0 + payload = json.loads(out.with_suffix(".json").read_text()) + assert payload["metrics"][0]["route"] == "dense" + assert out.with_suffix(".md").exists() diff --git a/tests/integration/test_known_optima.py b/tests/integration/test_known_optima.py index 60a6d09..1f79b6c 100644 --- a/tests/integration/test_known_optima.py +++ b/tests/integration/test_known_optima.py @@ -171,6 +171,9 @@ def gradient(self, x): assert result.status is Status.INFEASIBLE assert not result.success assert "infeasible" in result.message.lower() + # device must be reported on the early-return failure path too, consistent + # with the main solve path (Copilot PR #2 review). + assert result.device == str(array(namespace, [0.5]).device) def test_inconsistent_equalities_report_infeasible(namespace): diff --git a/tests/regression/test_fraction_to_boundary_no_elementwise_sync.py b/tests/regression/test_fraction_to_boundary_no_elementwise_sync.py new file mode 100644 index 0000000..59240aa --- /dev/null +++ b/tests/regression/test_fraction_to_boundary_no_elementwise_sync.py @@ -0,0 +1,39 @@ +"""Regression: fraction-to-boundary must not host-sync per element (GPU perf). + +``fraction_to_boundary`` originally looped over the array in Python, calling +``float(v[idx])``/``float(dv[idx])`` per element — O(n) host<->device syncs per +call, and it is called 6x per IPM iteration. On a CUDA backend this dominated +iteration time: a device-efficiency profile showed ~5,500 syncs/iter at n=1000 +(scaling linearly with n). The rule is now vectorized to a single sync. + +This guards the property directly: counting the device->host scalar +materializations during one call on a large vector must be O(1), not O(n). The +counter patches the array type's scalar dunders, which only works on backends +whose array type allows it (Torch yes, NumPy's built-in ndarray no), so the test +skips where counting is unavailable — the cost it guards is a GPU cost anyway. +""" + +from __future__ import annotations + +import pytest + +from benchmarks.harness import _ScalarSyncCounter +from ipax.ipm.barrier import fraction_to_boundary +from tests._helpers import array + + +def test_fraction_to_boundary_is_constant_sync(namespace): + n = 2000 + v = array(namespace, [1.0] * n) + dv = array(namespace, [-0.5 if i % 2 else 0.5 for i in range(n)]) + + with _ScalarSyncCounter(type(v)) as counter: + alpha = fraction_to_boundary(v, dv, tau=0.95) + + if not counter.available: + pytest.skip(f"scalar-sync counting unavailable for {type(v)!r}") + + # Vectorized form does a single float() on the reduced scalar; a constant + # bound (independent of n) is the invariant. The old loop would be ~n here. + assert counter.count <= 4, f"{counter.count} host syncs for n={n} (expected O(1))" + assert 0.0 <= alpha <= 1.0 diff --git a/tests/unit/test_backend_namespace.py b/tests/unit/test_backend_namespace.py index 57e5659..dce0edd 100644 --- a/tests/unit/test_backend_namespace.py +++ b/tests/unit/test_backend_namespace.py @@ -73,3 +73,21 @@ def test_capabilities_flags_jax_as_autodiff_not_sparse_numpy(): caps = capabilities(types.ModuleType("jax.numpy")) assert caps.name == "jax" assert caps.supports_autodiff is True + + +def test_capabilities_reports_real_devices(namespace): + """Devices come from the Array-API info namespace, not a hardcoded ``cpu``.""" + with implemented("device probe"): + caps = capabilities(namespace) + + assert isinstance(caps.devices, tuple) + assert len(caps.devices) >= 1 + assert all(isinstance(d, str) for d in caps.devices) + # Every installed CI backend (numpy/torch/array_api_strict) exposes a CPU. + assert any("cpu" in d.lower() for d in caps.devices) + + +def test_capabilities_devices_falls_back_when_info_missing(): + """A bare namespace without ``__array_namespace_info__`` degrades to ``cpu``.""" + caps = capabilities(types.ModuleType("numpy")) + assert caps.devices == ("cpu",) diff --git a/tests/unit/test_barrier.py b/tests/unit/test_barrier.py index 744630e..2b6bb72 100644 --- a/tests/unit/test_barrier.py +++ b/tests/unit/test_barrier.py @@ -36,3 +36,29 @@ def test_fraction_to_boundary_returns_one_for_nonnegative_step(namespace): alpha = fraction_to_boundary(v, dv, tau=0.99) assert alpha == 1.0 + + +def test_fraction_to_boundary_empty_vector_is_unconstrained(namespace): + v = array(namespace, []) + dv = array(namespace, []) + assert fraction_to_boundary(v, dv, tau=0.99) == 1.0 + + +def test_fraction_to_boundary_matches_elementwise_reference(namespace): + """Vectorized rule equals the scalar loop it replaced (regression). + + The element-wise loop was O(n) host syncs/call and dominated GPU iteration + time; this pins the vectorized form to identical results on mixed signs. + """ + raw_v = [1.0, 2.0, 0.5, 3.0, 0.25, 10.0] + raw_dv = [-0.5, 4.0, -4.0, 0.0, -0.1, -2.0] + tau = 0.95 + + expected = 1.0 + for vi, dvi in zip(raw_v, raw_dv, strict=True): + if dvi < 0.0: + expected = min(expected, tau * vi / (-dvi)) + expected = max(0.0, min(1.0, expected)) + + alpha = fraction_to_boundary(array(namespace, raw_v), array(namespace, raw_dv), tau) + assert_scalar_close(alpha, expected) diff --git a/tests/unit/test_logging.py b/tests/unit/test_logging.py index cb700ad..86a1839 100644 --- a/tests/unit/test_logging.py +++ b/tests/unit/test_logging.py @@ -63,6 +63,7 @@ def test_format_result_reports_status_and_sources(): constraint_violation=0.0, solve_time=0.042, linear_solver="sparse [Feral LDL^T (CPU)]", + device="cpu", derivative_sources=DerivativeSources(gradient="analytic", hessian="lbfgs"), message="converged", ) @@ -71,6 +72,7 @@ def test_format_result_reports_status_and_sources(): assert "iterations = 7" in text assert "solve time = 4.200e-02s" in text assert "linear solver = sparse [Feral LDL^T (CPU)]" in text + assert "device = cpu" in text assert "grad:analytic" in text and "hess:lbfgs" in text