wahln · wahln · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -46,9 +46,10 @@ jobs:
         env:
           IPAX_BACKENDS: numpy,torch,array_api_strict
         run: pytest -q --cov=ipax --cov-report=xml -n auto
-      - uses: codecov/codecov-action@v4
+      - uses: codecov/codecov-action@v5
         with:
           files: coverage.xml
+          token: ${{ secrets.CODECOV_TOKEN }}
         if: always()
 
   qc:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,29 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased]
 
+## [0.2.0] - 2026-06-21
+
+### Added
+- GPU/device-efficiency profiling harness: `DeviceMetrics` and
+  `measure_device_solve` in `benchmarks/harness` (host↔device sync counter plus a
+  CuPy GPU/CPU time split), the `benchmarks/runners/device_efficiency.py` CLI
+  runner (GPU-gated, a no-op on CI), and kernel micro-benchmarks (matvec / dense
+  solve / one Newton step) parametrized over every installed backend.
+- Device reporting: backend `capabilities()` now discovers available devices via
+  the Array-API inspection API (`__array_namespace_info__().devices()`) instead of
+  assuming CPU, and `Result.device` records where the solve ran — surfaced in the
+  tier-1 result summary.
+
+### Changed
+- Vectorized `fraction_to_boundary`, removing a per-element Python loop that forced
+  `O(n)` host↔device synchronizations per call; the function is called six times
+  per iteration. On a CUDA backend this cut the per-iteration sync count from
+  thousands (growing linearly with `n`) to a small constant, speeding up
+  matrix-free GPU iterations by roughly 20× at scale. Iterates and results are
+  unchanged, and CPU behavior is identical.
+- CI uploads coverage with `codecov/codecov-action@v5` using a repository
+  `CODECOV_TOKEN`.
+
 ## [0.1.1] - 2026-06-21
 
 ### Fixed
@@ -62,6 +85,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Contract batteries (`tests/contracts/`) plus unit/property/integration/backends/
   regression layers; benchmark suite (`benchmarks/`, asv); MkDocs documentation.
 
-[Unreleased]: https://github.com/wahln/ipax/compare/v0.1.1...HEAD
+[Unreleased]: https://github.com/wahln/ipax/compare/v0.2.0...HEAD
+[0.2.0]: https://github.com/wahln/ipax/compare/v0.1.1...v0.2.0
 [0.1.1]: https://github.com/wahln/ipax/compare/v0.1.0...v0.1.1
 [0.1.0]: https://github.com/wahln/ipax/releases/tag/v0.1.0
diff --git a/README.md b/README.md
@@ -2,6 +2,7 @@
 
 [![PyPI version](https://img.shields.io/pypi/v/ipax.svg)](https://pypi.org/project/ipax/)
 [![CI](https://github.com/wahln/ipax/actions/workflows/ci.yml/badge.svg)](https://github.com/wahln/ipax/actions/workflows/ci.yml)
+[![codecov](https://codecov.io/gh/wahln/ipax/branch/main/graph/badge.svg)](https://codecov.io/gh/wahln/ipax)
 [![Documentation](https://readthedocs.org/projects/ipax/badge/?version=latest)](https://ipax.readthedocs.io/en/latest/)
 [![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
 [![License: Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE)

diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py
@@ -68,14 +68,27 @@ def capture_environment() -> dict[str, object]:
         "machine": platform.machine(),
         "ipax": getattr(ipax, "__version__", "unknown"),
     }
-    for pkg in ("numpy", "scipy", "torch"):
+    for pkg in ("numpy", "scipy", "torch", "cupy", "jax"):
         try:
             env[pkg] = __import__(pkg).__version__
         except Exception:  # optional backend, record absence
             env[pkg] = None
+    env["gpu"] = _detect_gpu_name()
     return env
 
 
+def _detect_gpu_name() -> str | None:
+    """Name of the current CUDA device via CuPy; None when CuPy/CUDA is absent."""
+    try:
+        import cupy as cp
+
+        device_id = cp.cuda.Device().id  # current device, as in _sync_device
+        name = cp.cuda.runtime.getDeviceProperties(device_id)["name"]
+        return name.decode() if isinstance(name, bytes) else str(name)
+    except Exception:  # best-effort header field: no CuPy / no CUDA device
+        return None
+
+
 def _inf_norm(xp: Namespace, a: object, b: object) -> float:
     return float(xp.max(xp.abs(a - b)))
 
@@ -468,16 +481,241 @@ def _exp(value: float) -> str:
     return "—" if not math.isfinite(value) else f"{value:.2f}"
 
 
+# -- device-efficiency study (GPU profiling) ---------------------------------
+#
+# The iterative IPM loop is where Array-API GPU performance is won or lost: every
+# ``float()``/``bool()`` on a 0-d device array forces a host<->device sync that
+# serializes the GPU. This study measures, per solve, the **host-sync count** and
+# **wall time per iteration** (plus the GPU-vs-CPU time split on CuPy) so the
+# no-cost optimization — consolidating per-iteration scalar reads to one sync —
+# can be quantified before and after. Backend specifics (CuPy/Torch timers) live
+# here in ``benchmarks/``, never in the ``ipax/`` core (invariant #1).
+
+
+@dataclass(frozen=True)
+class DeviceMetrics:
+    """Immutable host-sync / timing record for one ``(backend, route, n)`` solve."""
+
+    backend: str
+    device: str
+    route: str
+    n_vars: int
+    n_iter: int
+    success: bool
+    solve_time: float  # total wall (s), device-synchronized at both ends
+    time_per_iter: float  # solve_time / n_iter (NaN when n_iter == 0)
+    gpu_time: float | None  # measured GPU compute time (s); CuPy only
+    host_syncs: int | None  # device->host scalar materializations during solve
+    syncs_per_iter: float | None  # host_syncs / n_iter (the headline metric)
+    peak_device_mb: float | None  # device memory in MB: CuPy pool / Torch-CUDA peak
+    kkt_error: float
+
+
+class _ScalarSyncCounter:
+    """Count device->host scalar materializations during a solve.
+
+    Wraps the array type's scalar converters (``__float__``/``__int__``/
+    ``__bool__``/``__index__``/``item``) with counting shims for the ``with``
+    window; each call forces a host sync on a GPU backend. On exit the type is
+    put back exactly: own attributes are re-set, inherited ones un-shadowed.
+    Types that forbid attribute assignment (NumPy's ``ndarray``) make counting
+    unavailable — :attr:`result` is then ``None`` (a CPU scalar read is free).
+    """
+
+    _NAMES = ("__float__", "__int__", "__bool__", "__index__", "item")
+
+    def __init__(self, array_type: type) -> None:
+        self._type = array_type
+        self._orig: dict[str, object] = {}  # name -> own attribute, None if inherited
+        self.count = 0
+        self.available = True
+
+    def __enter__(self) -> _ScalarSyncCounter:
+        for name in self._NAMES:
+            if (orig := getattr(self._type, name, None)) is None:
+                continue
+            saved = vars(self._type).get(name)  # None => inherited, not own
+            try:
+                setattr(self._type, name, self._counting(orig))
+            except (TypeError, AttributeError):  # immutable type (NumPy)
+                self.available = False
+                self._restore()
+                break
+            self._orig[name] = saved
+        return self
+
+    def _counting(self, orig: object):
+        def wrapped(obj, *args, **kwargs):
+            self.count += 1
+            return orig(obj, *args, **kwargs)  # type: ignore[operator]
+
+        return wrapped
+
+    def _restore(self) -> None:
+        for name, saved in self._orig.items():
+            delattr(self._type, name)  # drop the counting shim
+            if saved is not None:  # re-install only what the type itself defined
+                setattr(self._type, name, saved)
+        self._orig.clear()
+
+    def __exit__(self, *exc: object) -> None:
+        self._restore()
+
+    @property
+    def result(self) -> int | None:
+        return self.count if self.available else None
+
+
+def _sync_device(backend: str) -> None:
+    """Wait for all queued device work to complete; does nothing off GPU."""
+    if backend == "torch":
+        import torch
+
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+    elif backend == "cupy":
+        import cupy as cp
+
+        cp.cuda.Device().synchronize()
+
+
+def _reset_device_memory(backend: str) -> None:
+    """Prepare device-memory accounting before a measured solve (CuPy/Torch-CUDA)."""
+    if backend == "cupy":
+        import cupy as cp
+
+        cp.get_default_memory_pool().free_all_blocks()  # free the default pool's blocks
+    elif backend == "torch":
+        import torch
+
+        if torch.cuda.is_available():
+            torch.cuda.reset_peak_memory_stats()  # restart Torch's peak counter
+
+
+def _peak_device_mb(backend: str) -> float | None:
+    """Return the device-memory figure in MB (1e6 bytes), or None when off GPU."""
+    if backend == "torch":
+        import torch
+
+        if torch.cuda.is_available():
+            return torch.cuda.max_memory_allocated() / 1e6
+    elif backend == "cupy":
+        import cupy as cp
+
+        return cp.get_default_memory_pool().total_bytes() / 1e6
+    return None
+
+
+def _gpu_time(backend: str, fn: object) -> float | None:
+    """Measured GPU compute time of ``fn`` in seconds (CuPy only, else None).
+
+    Runs ``fn`` once more under ``cupyx.profiler.benchmark`` (no warm-up) and
+    averages its ``gpu_times``. Best-effort: any exception is swallowed and
+    reported as ``None``. A wall-time/GPU-time gap signals a host-sync-bound loop.
+    """
+    if backend != "cupy":
+        return None
+    try:
+        from cupyx.profiler import benchmark
+
+        measured = benchmark(fn, n_repeat=1, n_warmup=0)  # type: ignore[arg-type]
+        return float(measured.gpu_times.mean())
+    except Exception:  # profiler unavailable / non-idempotent call
+        return None
+
+
+def measure_device_solve(
+    problem: object,
+    x0: object,
+    options: ipax.Options,
+    *,
+    backend: str,
+    route: str,
+    warmup: bool = True,
+) -> DeviceMetrics:
+    """Profile one solve for host-sync count and per-iteration timing.
+
+    ``warmup`` runs (and discards) one solve first, keeping device/allocator init
+    and JIT cost out of the timing. The measured solve is device-synchronized at
+    both ends (true wall time); on CuPy one more solve then runs for ``gpu_time``.
+    """
+    if warmup:
+        ipax.solve(problem, x0, options=options)  # type: ignore[arg-type]
+
+    _reset_device_memory(backend)
+    _sync_device(backend)
+    with _ScalarSyncCounter(type(x0)) as counter:
+        start = perf_counter()
+        result = ipax.solve(problem, x0, options=options)  # type: ignore[arg-type]
+        _sync_device(backend)
+        wall = perf_counter() - start
+    host_syncs = counter.result
+    peak = _peak_device_mb(backend)  # before the extra GPU-timing solve inflates it
+
+    n_iter = result.n_iter
+    gpu_time = _gpu_time(backend, lambda: ipax.solve(problem, x0, options=options))
+    return DeviceMetrics(
+        backend=backend,
+        device=result.device or "cpu",
+        route=route,
+        n_vars=int(problem.n_vars),  # type: ignore[attr-defined]
+        n_iter=n_iter,
+        success=result.success,
+        solve_time=wall,
+        time_per_iter=wall / n_iter if n_iter else float("nan"),
+        gpu_time=gpu_time,
+        host_syncs=host_syncs,
+        syncs_per_iter=(
+            host_syncs / n_iter if (host_syncs is not None and n_iter) else None
+        ),
+        peak_device_mb=peak,
+        kkt_error=result.kkt_error,
+    )
+
+
+def format_device(metrics: list[DeviceMetrics], environment: dict[str, object]) -> str:
+    """Return the device-efficiency report: Markdown header plus one sorted table."""
+    header = [
+        "# ipax device-efficiency study",
+        "",
+        f"- generated: `{environment.get('timestamp')}`",
+        f"- gpu: `{environment.get('gpu')}`",
+        f"- cupy `{environment.get('cupy')}` · torch `{environment.get('torch')}`",
+        "- headline metric: **host syncs / iter** "
+        "(device->host scalar reads in the loop).",
+        "",
+        "| backend | device | route | n | iters | wall (s) | s/iter "
+        "| syncs | syncs/iter | gpu (s) | peak MB | kkt |",
+        "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |",
+    ]
+    rows = [
+        f"| {m.backend} | {m.device} | {m.route} | {m.n_vars} | {m.n_iter} "
+        f"| {m.solve_time:.3f} | {_fmt(m.time_per_iter)} "
+        f"| {_fmt_count(m.host_syncs)} | {_fmt_opt(m.syncs_per_iter)} "
+        f"| {_fmt_opt(m.gpu_time)} | {_fmt_opt(m.peak_device_mb)} "
+        f"| {_fmt(m.kkt_error)} |"
+        for m in sorted(metrics, key=lambda r: (r.backend, r.route, r.n_vars))
+    ]
+    return "\n".join(header + rows) + "\n"
+
+
+def _fmt_count(value: int | None) -> str:
+    return str(value) if value is not None else "—"  # em dash for a missing count
+
+
 __all__ = [
     "CaseResult",
     "CrossCheckResult",
+    "DeviceMetrics",
     "ScalingPoint",
     "capture_environment",
     "cross_check",
     "fit_exponent",
     "format_crosscheck",
+    "format_device",
     "format_markdown",
     "format_scaling",
+    "measure_device_solve",
     "measure_solve",
     "run_case",
     "to_payload",