Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,10 @@ jobs:
env:
IPAX_BACKENDS: numpy,torch,array_api_strict
run: pytest -q --cov=ipax --cov-report=xml -n auto
- uses: codecov/codecov-action@v4
- uses: codecov/codecov-action@v5
with:
files: coverage.xml
token: ${{ secrets.CODECOV_TOKEN }}
if: always()

qc:
Expand Down
26 changes: 25 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,29 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

## [Unreleased]

## [0.2.0] - 2026-06-21

### Added
- GPU/device-efficiency profiling harness: `DeviceMetrics` and
`measure_device_solve` in `benchmarks/harness` (host↔device sync counter plus a
CuPy GPU/CPU time split), the `benchmarks/runners/device_efficiency.py` CLI
runner (GPU-gated, a no-op on CI), and kernel micro-benchmarks (matvec / dense
solve / one Newton step) parametrized over every installed backend.
- Device reporting: backend `capabilities()` now discovers available devices via
the Array-API inspection API (`__array_namespace_info__().devices()`) instead of
assuming CPU, and `Result.device` records where the solve ran — surfaced in the
tier-1 result summary.

### Changed
- Vectorized `fraction_to_boundary`, removing a per-element Python loop that forced
`O(n)` host↔device synchronizations per call (and it is called six times per
iteration). On a CUDA backend this cut the per-iteration sync count from
thousands — scaling linearly with `n` — to a small constant, speeding up
matrix-free GPU iterations by roughly 20× at scale; iterates and results are
unchanged (CPU behavior is identical).
- CI uploads coverage with `codecov/codecov-action@v5` using a repository
`CODECOV_TOKEN`.

## [0.1.1] - 2026-06-21

### Fixed
Expand Down Expand Up @@ -62,6 +85,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Contract batteries (`tests/contracts/`) plus unit/property/integration/backends/
regression layers; benchmark suite (`benchmarks/`, asv); MkDocs documentation.

[Unreleased]: https://github.com/wahln/ipax/compare/v0.1.1...HEAD
[Unreleased]: https://github.com/wahln/ipax/compare/v0.2.0...HEAD
[0.2.0]: https://github.com/wahln/ipax/compare/v0.1.1...v0.2.0
[0.1.1]: https://github.com/wahln/ipax/compare/v0.1.0...v0.1.1
[0.1.0]: https://github.com/wahln/ipax/releases/tag/v0.1.0
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

[![PyPI version](https://img.shields.io/pypi/v/ipax.svg)](https://pypi.org/project/ipax/)
[![CI](https://github.com/wahln/ipax/actions/workflows/ci.yml/badge.svg)](https://github.com/wahln/ipax/actions/workflows/ci.yml)
[![codecov](https://codecov.io/gh/wahln/ipax/branch/main/graph/badge.svg)](https://codecov.io/gh/wahln/ipax)
[![Documentation](https://readthedocs.org/projects/ipax/badge/?version=latest)](https://ipax.readthedocs.io/en/latest/)
[![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
[![License: Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE)
Expand Down
240 changes: 239 additions & 1 deletion benchmarks/harness/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,27 @@ def capture_environment() -> dict[str, object]:
"machine": platform.machine(),
"ipax": getattr(ipax, "__version__", "unknown"),
}
for pkg in ("numpy", "scipy", "torch"):
for pkg in ("numpy", "scipy", "torch", "cupy", "jax"):
try:
env[pkg] = __import__(pkg).__version__
except Exception: # optional backend, record absence
env[pkg] = None
env["gpu"] = _detect_gpu_name()
return env


def _detect_gpu_name() -> str | None:
"""Best-effort CUDA device name for the report header (None if no GPU)."""
try:
import cupy as cp

props = cp.cuda.runtime.getDeviceProperties(0)
name = props["name"]
return name.decode() if isinstance(name, bytes) else str(name)
except Exception: # no CuPy / no CUDA device
return None


def _inf_norm(xp: Namespace, a: object, b: object) -> float:
return float(xp.max(xp.abs(a - b)))

Expand Down Expand Up @@ -468,16 +481,241 @@ def _exp(value: float) -> str:
return "—" if not math.isfinite(value) else f"{value:.2f}"


# -- device-efficiency study (GPU profiling) ---------------------------------
#
# The iterative IPM loop is where Array-API GPU performance is won or lost: every
# ``float()``/``bool()`` on a 0-d device array forces a host<->device sync that
# serializes the GPU. This study measures, per solve, the **host-sync count** and
# **wall time per iteration** (plus the GPU-vs-CPU time split on CuPy) so the
# no-cost optimization — consolidating per-iteration scalar reads to one sync —
# can be quantified before and after. Backend specifics (CuPy/Torch timers) live
# here in ``benchmarks/``, never in the ``ipax/`` core (invariant #1).


@dataclass(frozen=True)
class DeviceMetrics:
    """Immutable timing / host-sync record for one ``(backend, route, n)`` solve.

    Fields typed ``... | None`` are ``None`` when the measurement is not
    available for the backend in question.
    """

    # -- identification of the measured configuration --
    backend: str
    device: str
    route: str
    n_vars: int

    # -- solver outcome --
    n_iter: int
    success: bool

    # -- timing --
    # Total wall time in seconds, device-synchronized at both ends.
    solve_time: float
    # solve_time / n_iter.
    time_per_iter: float
    # Measured GPU compute time in seconds; CuPy only.
    gpu_time: float | None

    # -- host synchronization --
    # Device->host scalar materializations during the solve.
    host_syncs: int | None
    # host_syncs / n_iter (the headline metric).
    syncs_per_iter: float | None

    # -- memory and accuracy --
    # Device-memory high-water mark (CuPy / Torch-CUDA).
    peak_device_mb: float | None
    kkt_error: float


class _ScalarSyncCounter:
"""Count device->host scalar materializations during a solve.

Patches the array type's scalar dunders (``__float__``/``__int__``/
``__bool__``/``__index__``/``item``) for the measurement window. Each forces a
host sync on a GPU backend, so the tally is the per-solve host-sync total the
driver-loop optimization targets. Built-in array types that forbid attribute
assignment (NumPy's ``ndarray``) make counting unavailable — :attr:`result`
is then ``None`` (on CPU a host scalar read is free anyway).
"""

_NAMES = ("__float__", "__int__", "__bool__", "__index__", "item")

def __init__(self, array_type: type) -> None:
self._type = array_type
self._orig: dict[str, object] = {}
self.count = 0
self.available = True

def __enter__(self) -> _ScalarSyncCounter:
counter = self
for name in self._NAMES:
orig = getattr(self._type, name, None)
if orig is None:
continue

def make(orig: object):
def wrapped(self, *args, **kwargs):
counter.count += 1
return orig(self, *args, **kwargs) # type: ignore[operator]

return wrapped

try:
setattr(self._type, name, make(orig))
except TypeError: # built-in/extension type (NumPy) — cannot patch
self.available = False
self._restore()
break
self._orig[name] = orig
return self

def _restore(self) -> None:
for name, orig in self._orig.items():
setattr(self._type, name, orig)
self._orig.clear()

def __exit__(self, *exc: object) -> None:
self._restore()

@property
def result(self) -> int | None:
return self.count if self.available else None


def _sync_device(backend: str) -> None:
"""Block until the device finishes queued work (no-op off GPU)."""
if backend == "cupy":
import cupy as cp

cp.cuda.Device().synchronize()
elif backend == "torch":
import torch

if torch.cuda.is_available():
torch.cuda.synchronize()


def _reset_device_memory(backend: str) -> None:
"""Reset the device-memory high-water mark before a measured solve."""
if backend == "cupy":
import cupy as cp

cp.get_default_memory_pool().free_all_blocks()
elif backend == "torch":
import torch

if torch.cuda.is_available():
torch.cuda.reset_peak_memory_stats()


def _peak_device_mb(backend: str) -> float | None:
"""Device-memory high-water mark in MB (None off GPU)."""
if backend == "cupy":
import cupy as cp

return cp.get_default_memory_pool().total_bytes() / 1e6
if backend == "torch":
import torch

if torch.cuda.is_available():
return torch.cuda.max_memory_allocated() / 1e6
return None


def _gpu_time(backend: str, fn: object) -> float | None:
"""Measured GPU compute time of ``fn`` in seconds (CuPy only, else None).

Uses ``cupyx.profiler.benchmark`` (one extra run), whose ``gpu_times`` is the
on-stream device time. A large gap between wall time and this value is the
signature of a host-sync-bound loop.
"""
if backend != "cupy":
return None
try:
from cupyx.profiler import benchmark

measured = benchmark(fn, n_repeat=1, n_warmup=0) # type: ignore[arg-type]
return float(measured.gpu_times.mean())
except Exception: # profiler unavailable / non-idempotent call
return None


def measure_device_solve(
    problem: object,
    x0: object,
    options: ipax.Options,
    *,
    backend: str,
    route: str,
    warmup: bool = True,
) -> DeviceMetrics:
    """Profile one solve for host-sync count and per-iteration timing.

    Args:
        problem: Problem passed straight to ``ipax.solve``; its ``n_vars``
            attribute is recorded in the result.
        x0: Starting point. Its type is the array type whose scalar hooks are
            counted during the measured solve.
        options: Solver options forwarded to every ``ipax.solve`` call.
        backend: Backend name; selects the device sync / memory / GPU-timer
            helpers (``"cupy"`` and ``"torch"`` are device-aware, others no-op).
        route: Free-form label copied into the result.
        warmup: When true, run (and discard) one solve first so device
            handle/allocator init and any JIT cost stay out of the measured run.

    Returns:
        A :class:`DeviceMetrics` for the measured solve. ``time_per_iter`` is
        NaN and ``syncs_per_iter`` is ``None`` when the solve reports zero
        iterations.

    The measured solve is device-synchronized at both ends so ``solve_time`` is
    true end-to-end wall time.
    """
    if warmup:
        ipax.solve(problem, x0, options=options)  # type: ignore[arg-type]

    _reset_device_memory(backend)
    _sync_device(backend)
    with _ScalarSyncCounter(type(x0)) as counter:
        start = perf_counter()
        result = ipax.solve(problem, x0, options=options)  # type: ignore[arg-type]
        _sync_device(backend)
        wall = perf_counter() - start
    host_syncs = counter.result

    # Read the memory figure now, before the extra profiling solve below runs:
    # that re-run allocates while ``result`` is still alive and would otherwise
    # leak into the number reported for the measured solve.
    peak = _peak_device_mb(backend)

    n_iter = result.n_iter
    # One additional, uncounted solve purely to obtain the on-device time.
    gpu_time = _gpu_time(backend, lambda: ipax.solve(problem, x0, options=options))
    return DeviceMetrics(
        backend=backend,
        device=result.device or "cpu",
        route=route,
        n_vars=int(problem.n_vars),  # type: ignore[attr-defined]
        n_iter=n_iter,
        success=result.success,
        solve_time=wall,
        time_per_iter=wall / n_iter if n_iter else float("nan"),
        gpu_time=gpu_time,
        host_syncs=host_syncs,
        syncs_per_iter=(
            host_syncs / n_iter if (host_syncs is not None and n_iter) else None
        ),
        peak_device_mb=peak,
        kkt_error=result.kkt_error,
    )


def format_device(metrics: list[DeviceMetrics], environment: dict[str, object]) -> str:
    """Render the device-efficiency study as a Markdown report.

    The report has a short header built from ``environment`` (timestamp, GPU
    name, CuPy/Torch versions) followed by one table row per metric, ordered by
    ``(backend, route, n_vars)``. The returned text ends with a newline.
    """
    column_count = 12
    report = [
        "# ipax device-efficiency study",
        "",
        f"- generated: `{environment.get('timestamp')}`",
        f"- gpu: `{environment.get('gpu')}`",
        f"- cupy `{environment.get('cupy')}` · torch `{environment.get('torch')}`",
        "- headline metric: **host syncs / iter** "
        "(device->host scalar reads in the loop).",
        "",
        "| backend | device | route | n | iters | wall (s) | s/iter "
        "| syncs | syncs/iter | gpu (s) | peak MB | kkt |",
        "| " + " | ".join(["---"] * column_count) + " |",
    ]

    ordered = sorted(metrics, key=lambda row: (row.backend, row.route, row.n_vars))
    for row in ordered:
        cells = [
            f"{row.backend}",
            f"{row.device}",
            f"{row.route}",
            f"{row.n_vars}",
            f"{row.n_iter}",
            f"{row.solve_time:.3f}",
            _fmt(row.time_per_iter),
            _fmt_count(row.host_syncs),
            _fmt_opt(row.syncs_per_iter),
            _fmt_opt(row.gpu_time),
            _fmt_opt(row.peak_device_mb),
            _fmt(row.kkt_error),
        ]
        report.append("| " + " | ".join(cells) + " |")

    return "\n".join(report) + "\n"


def _fmt_count(value: int | None) -> str:
return "—" if value is None else str(value)


__all__ = [
"CaseResult",
"CrossCheckResult",
"DeviceMetrics",
"ScalingPoint",
"capture_environment",
"cross_check",
"fit_exponent",
"format_crosscheck",
"format_device",
"format_markdown",
"format_scaling",
"measure_device_solve",
"measure_solve",
"run_case",
"to_payload",
Expand Down
Loading
Loading