diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 95bebaf..ee881af 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -46,9 +46,10 @@ jobs:
         env:
           IPAX_BACKENDS: numpy,torch,array_api_strict
         run: pytest -q --cov=ipax --cov-report=xml -n auto
-      - uses: codecov/codecov-action@v4
+      - uses: codecov/codecov-action@v5
         with:
           files: coverage.xml
+          token: ${{ secrets.CODECOV_TOKEN }}
         if: always()
 
   qc:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d9fca30..144d8cf 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,29 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 
 ## [Unreleased]
 
+## [0.2.0] - 2026-06-21
+
+### Added
+- GPU/device-efficiency profiling harness: `DeviceMetrics` and
+  `measure_device_solve` in `benchmarks/harness` (host↔device sync counter plus a
+  CuPy GPU/CPU time split), the `benchmarks/runners/device_efficiency.py` CLI
+  runner (GPU-gated, a no-op on CI), and kernel micro-benchmarks (matvec / dense
+  solve / one Newton step) parametrized over every installed backend.
+- Device reporting: backend `capabilities()` now discovers available devices via
+  the Array-API inspection API (`__array_namespace_info__().devices()`) instead of
+  assuming CPU, and `Result.device` records where the solve ran — surfaced in the
+  tier-1 result summary.
+
+### Changed
+- Vectorized `fraction_to_boundary`, removing a per-element Python loop that forced
+  `O(n)` host↔device synchronizations per call (and it is called six times per
+  iteration). On a CUDA backend this cut the per-iteration sync count from
+  thousands — scaling linearly with `n` — to a small constant, speeding up
+  matrix-free GPU iterations by roughly 20× at scale; iterates and results are
+  unchanged (CPU behavior is identical).
+- CI uploads coverage with `codecov/codecov-action@v5` using a repository
+  `CODECOV_TOKEN`.
+
 ## [0.1.1] - 2026-06-21
 
 ### Fixed
@@ -62,6 +85,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Contract batteries (`tests/contracts/`) plus unit/property/integration/backends/
   regression layers; benchmark suite (`benchmarks/`, asv); MkDocs documentation.
 
-[Unreleased]: https://github.com/wahln/ipax/compare/v0.1.1...HEAD
+[Unreleased]: https://github.com/wahln/ipax/compare/v0.2.0...HEAD
+[0.2.0]: https://github.com/wahln/ipax/compare/v0.1.1...v0.2.0
 [0.1.1]: https://github.com/wahln/ipax/compare/v0.1.0...v0.1.1
 [0.1.0]: https://github.com/wahln/ipax/releases/tag/v0.1.0
diff --git a/README.md b/README.md
index 0ff611b..4da0ae4 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 
 [![PyPI version](https://img.shields.io/pypi/v/ipax.svg)](https://pypi.org/project/ipax/)
 [![CI](https://github.com/wahln/ipax/actions/workflows/ci.yml/badge.svg)](https://github.com/wahln/ipax/actions/workflows/ci.yml)
+[![codecov](https://codecov.io/gh/wahln/ipax/branch/main/graph/badge.svg)](https://codecov.io/gh/wahln/ipax)
 [![Documentation](https://readthedocs.org/projects/ipax/badge/?version=latest)](https://ipax.readthedocs.io/en/latest/)
 [![Python](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
 [![License: Apache 2.0](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE)
diff --git a/benchmarks/harness/__init__.py b/benchmarks/harness/__init__.py
index 0312202..a78bcfd 100644
--- a/benchmarks/harness/__init__.py
+++ b/benchmarks/harness/__init__.py
@@ -68,14 +68,27 @@ def capture_environment() -> dict[str, object]:
         "machine": platform.machine(),
         "ipax": getattr(ipax, "__version__", "unknown"),
     }
-    for pkg in ("numpy", "scipy", "torch"):
+    for pkg in ("numpy", "scipy", "torch", "cupy", "jax"):
         try:
             env[pkg] = __import__(pkg).__version__
         except Exception:  # optional backend, record absence
             env[pkg] = None
+    env["gpu"] = _detect_gpu_name()
     return env
 
 
+def _detect_gpu_name() -> str | None:
+    """Best-effort CUDA device name for the report header (None if no GPU)."""
+    try:
+        import cupy as cp
+
+        props = cp.cuda.runtime.getDeviceProperties(0)
+        name = props["name"]
+        return name.decode() if isinstance(name, bytes) else str(name)
+    except Exception:  # no CuPy / no CUDA device
+        return None
+
+
 def _inf_norm(xp: Namespace, a: object, b: object) -> float:
     return float(xp.max(xp.abs(a - b)))
 
@@ -468,16 +481,241 @@ def _exp(value: float) -> str:
     return "—" if not math.isfinite(value) else f"{value:.2f}"
 
 
+# -- device-efficiency study (GPU profiling) ---------------------------------
+#
+# The iterative IPM loop is where Array-API GPU performance is won or lost: every
+# ``float()``/``bool()`` on a 0-d device array forces a host<->device sync that
+# serializes the GPU. This study measures, per solve, the **host-sync count** and
+# **wall time per iteration** (plus the GPU-vs-CPU time split on CuPy) so the
+# no-cost optimization — consolidating per-iteration scalar reads to one sync —
+# can be quantified before and after. Backend specifics (CuPy/Torch timers) live
+# here in ``benchmarks/``, never in the ``ipax/`` core (invariant #1).
+
+
+@dataclass(frozen=True)
+class DeviceMetrics:
+    """Host-sync / timing profile for one ``(backend, route, n)`` solve."""
+
+    backend: str
+    device: str
+    route: str
+    n_vars: int
+    n_iter: int
+    success: bool
+    solve_time: float  # total wall (s), device-synchronized at both ends
+    time_per_iter: float  # solve_time / n_iter
+    gpu_time: float | None  # measured GPU compute time (s); CuPy only
+    host_syncs: int | None  # device->host scalar materializations during solve
+    syncs_per_iter: float | None  # host_syncs / n_iter (the headline metric)
+    peak_device_mb: float | None  # device-memory high-water (CuPy/Torch-CUDA)
+    kkt_error: float
+
+
+class _ScalarSyncCounter:
+    """Count device->host scalar materializations during a solve.
+
+    Patches the array type's scalar dunders (``__float__``/``__int__``/
+    ``__bool__``/``__index__``/``item``) for the measurement window. Each forces a
+    host sync on a GPU backend, so the tally is the per-solve host-sync total the
+    driver-loop optimization targets. Built-in array types that forbid attribute
+    assignment (NumPy's ``ndarray``) make counting unavailable — :attr:`result`
+    is then ``None`` (on CPU a host scalar read is free anyway).
+    """
+
+    _NAMES = ("__float__", "__int__", "__bool__", "__index__", "item")
+
+    def __init__(self, array_type: type) -> None:
+        self._type = array_type
+        self._orig: dict[str, object] = {}
+        self.count = 0
+        self.available = True
+
+    def __enter__(self) -> _ScalarSyncCounter:
+        counter = self
+        for name in self._NAMES:
+            orig = getattr(self._type, name, None)
+            if orig is None:
+                continue
+
+            def make(orig: object):
+                def wrapped(self, *args, **kwargs):
+                    counter.count += 1
+                    return orig(self, *args, **kwargs)  # type: ignore[operator]
+
+                return wrapped
+
+            try:
+                setattr(self._type, name, make(orig))
+            except TypeError:  # built-in/extension type (NumPy) — cannot patch
+                self.available = False
+                self._restore()
+                break
+            self._orig[name] = orig
+        return self
+
+    def _restore(self) -> None:
+        for name, orig in self._orig.items():
+            setattr(self._type, name, orig)
+        self._orig.clear()
+
+    def __exit__(self, *exc: object) -> None:
+        self._restore()
+
+    @property
+    def result(self) -> int | None:
+        return self.count if self.available else None
+
+
+def _sync_device(backend: str) -> None:
+    """Block until the device finishes queued work (no-op off GPU)."""
+    if backend == "cupy":
+        import cupy as cp
+
+        cp.cuda.Device().synchronize()
+    elif backend == "torch":
+        import torch
+
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+
+
+def _reset_device_memory(backend: str) -> None:
+    """Reset the device-memory high-water mark before a measured solve."""
+    if backend == "cupy":
+        import cupy as cp
+
+        cp.get_default_memory_pool().free_all_blocks()
+    elif backend == "torch":
+        import torch
+
+        if torch.cuda.is_available():
+            torch.cuda.reset_peak_memory_stats()
+
+
+def _peak_device_mb(backend: str) -> float | None:
+    """Device-memory high-water mark in MB (None off GPU)."""
+    if backend == "cupy":
+        import cupy as cp
+
+        return cp.get_default_memory_pool().total_bytes() / 1e6
+    if backend == "torch":
+        import torch
+
+        if torch.cuda.is_available():
+            return torch.cuda.max_memory_allocated() / 1e6
+    return None
+
+
+def _gpu_time(backend: str, fn: object) -> float | None:
+    """Measured GPU compute time of ``fn`` in seconds (CuPy only, else None).
+
+    Uses ``cupyx.profiler.benchmark`` (one extra run), whose ``gpu_times`` is the
+    on-stream device time. A large gap between wall time and this value is the
+    signature of a host-sync-bound loop.
+    """
+    if backend != "cupy":
+        return None
+    try:
+        from cupyx.profiler import benchmark
+
+        measured = benchmark(fn, n_repeat=1, n_warmup=0)  # type: ignore[arg-type]
+        return float(measured.gpu_times.mean())
+    except Exception:  # profiler unavailable / non-idempotent call
+        return None
+
+
+def measure_device_solve(
+    problem: object,
+    x0: object,
+    options: ipax.Options,
+    *,
+    backend: str,
+    route: str,
+    warmup: bool = True,
+) -> DeviceMetrics:
+    """Profile one solve for host-sync count and per-iteration timing.
+
+    ``warmup`` runs (and discards) one solve first so device handle/allocator
+    init and any JIT cost stay out of the measured run. The measured solve is
+    device-synchronized at both ends so ``solve_time`` is true end-to-end wall.
+    """
+    if warmup:
+        ipax.solve(problem, x0, options=options)  # type: ignore[arg-type]
+
+    _reset_device_memory(backend)
+    _sync_device(backend)
+    with _ScalarSyncCounter(type(x0)) as counter:
+        start = perf_counter()
+        result = ipax.solve(problem, x0, options=options)  # type: ignore[arg-type]
+        _sync_device(backend)
+        wall = perf_counter() - start
+    host_syncs = counter.result
+
+    n_iter = result.n_iter
+    gpu_time = _gpu_time(backend, lambda: ipax.solve(problem, x0, options=options))
+    peak = _peak_device_mb(backend)
+    return DeviceMetrics(
+        backend=backend,
+        device=result.device or "cpu",
+        route=route,
+        n_vars=int(problem.n_vars),  # type: ignore[attr-defined]
+        n_iter=n_iter,
+        success=result.success,
+        solve_time=wall,
+        time_per_iter=wall / n_iter if n_iter else float("nan"),
+        gpu_time=gpu_time,
+        host_syncs=host_syncs,
+        syncs_per_iter=(
+            host_syncs / n_iter if (host_syncs is not None and n_iter) else None
+        ),
+        peak_device_mb=peak,
+        kkt_error=result.kkt_error,
+    )
+
+
+def format_device(metrics: list[DeviceMetrics], environment: dict[str, object]) -> str:
+    """Render the device-efficiency study as Markdown (syncs/iter + timing)."""
+    lines = [
+        "# ipax device-efficiency study",
+        "",
+        f"- generated: `{environment.get('timestamp')}`",
+        f"- gpu: `{environment.get('gpu')}`",
+        f"- cupy `{environment.get('cupy')}` · torch `{environment.get('torch')}`",
+        "- headline metric: **host syncs / iter** "
+        "(device->host scalar reads in the loop).",
+        "",
+        "| backend | device | route | n | iters | wall (s) | s/iter "
+        "| syncs | syncs/iter | gpu (s) | peak MB | kkt |",
+        "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |",
+    ]
+    for m in sorted(metrics, key=lambda m: (m.backend, m.route, m.n_vars)):
+        lines.append(
+            f"| {m.backend} | {m.device} | {m.route} | {m.n_vars} | {m.n_iter} "
+            f"| {m.solve_time:.3f} | {_fmt(m.time_per_iter)} "
+            f"| {_fmt_count(m.host_syncs)} | {_fmt_opt(m.syncs_per_iter)} "
+            f"| {_fmt_opt(m.gpu_time)} | {_fmt_opt(m.peak_device_mb)} "
+            f"| {_fmt(m.kkt_error)} |"
+        )
+    return "\n".join(lines) + "\n"
+
+
+def _fmt_count(value: int | None) -> str:
+    return "—" if value is None else str(value)
+
+
 __all__ = [
     "CaseResult",
     "CrossCheckResult",
+    "DeviceMetrics",
     "ScalingPoint",
     "capture_environment",
     "cross_check",
     "fit_exponent",
     "format_crosscheck",
+    "format_device",
     "format_markdown",
     "format_scaling",
+    "measure_device_solve",
     "measure_solve",
     "run_case",
     "to_payload",
diff --git a/benchmarks/runners/device_efficiency.py b/benchmarks/runners/device_efficiency.py
new file mode 100644
index 0000000..79fc59e
--- /dev/null
+++ b/benchmarks/runners/device_efficiency.py
@@ -0,0 +1,126 @@
+"""Device-efficiency study runner (GPU profiling).
+
+Profiles the IPM loop for **host<->device sync count** and **wall time per
+iteration** (plus the GPU-vs-CPU time split on CuPy) across solver routes and
+problem sizes on the synthetic RT-like generator. The headline metric is
+``syncs/iter``: every ``float()``/``bool()`` on a 0-d device array in the driver
+loop forces a host sync that serializes the GPU, so this is the number the
+no-cost loop optimization (consolidating per-iteration scalar reads) drives down.
+
+Run from the repository root on a CUDA machine::
+
+    python -m benchmarks.runners.device_efficiency \
+        --backends cupy --routes krylov,sparse --sizes 1000,10000,100000
+
+GPU-gated: a requested backend that is not installed is skipped with a note, so
+this is a **no-op on CI** (no CuPy) and real work on a local GPU. Exits non-zero
+if any solve fails. For kernel-launch counts, wrap the same command in an external
+profiler::
+
+    nsys profile -o device python -m benchmarks.runners.device_efficiency --backends cupy ...
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from dataclasses import asdict
+from pathlib import Path
+
+import ipax
+from benchmarks.harness import (
+    DeviceMetrics,
+    capture_environment,
+    format_device,
+    measure_device_solve,
+)
+from ipax.backend.namespace import capabilities
+from ipax.testing.backends import import_namespace
+from ipax.testing.problems import make_rt_like_problem
+
+_ROUTES: dict[str, ipax.Options] = {
+    "dense": ipax.Options(hessian="exact", linsolve="dense"),
+    "krylov": ipax.Options(hessian="exact", linsolve="krylov"),
+    "sparse": ipax.Options(hessian="exact", linsolve="sparse"),
+}
+
+
+def run_device_efficiency(
+    backends: list[str], routes: list[str], sizes: list[int]
+) -> list[DeviceMetrics]:
+    """Profile each installed backend over the route × size grid."""
+    metrics: list[DeviceMetrics] = []
+    for backend in backends:
+        try:
+            xp = import_namespace(backend)
+        except ImportError:
+            print(f"  skip backend {backend!r}: not installed")
+            continue
+
+        has_sparse = capabilities(xp).has_sparse_adapter
+        for route in routes:
+            if route == "sparse" and not has_sparse:
+                print(f"  skip {backend}/sparse: no sparse adapter")
+                continue
+            options = _ROUTES[route]
+            for n in sizes:
+                problem = make_rt_like_problem(
+                    xp, n, n_structures=6, density=0.1, seed=0
+                )
+                x0 = xp.full((n,), 0.01, dtype=xp.float64)
+                metrics.append(
+                    measure_device_solve(
+                        problem, x0, options, backend=backend, route=route
+                    )
+                )
+    return metrics
+
+
+def _parse_int_list(text: str) -> list[int]:
+    return [int(x) for x in text.split(",") if x.strip()]
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description="ipax device-efficiency study")
+    parser.add_argument(
+        "--backends", default="cupy", help="comma-separated backends (default: cupy)"
+    )
+    parser.add_argument(
+        "--routes", default="krylov,sparse", help="comma-separated routes"
+    )
+    parser.add_argument(
+        "--sizes", default="1000,10000,100000", help="comma-separated n"
+    )
+    parser.add_argument("--out", default="benchmarks/reports/device")
+    args = parser.parse_args(argv)
+
+    backends = [b.strip() for b in args.backends.split(",") if b.strip()]
+    routes = [r.strip() for r in args.routes.split(",") if r.strip()]
+    sizes = _parse_int_list(args.sizes)
+    unknown = [r for r in routes if r not in _ROUTES]
+    if unknown:
+        parser.error(f"unknown route(s): {', '.join(unknown)}")
+
+    environment = capture_environment()
+    metrics = run_device_efficiency(backends, routes, sizes)
+
+    out = Path(args.out)
+    out.parent.mkdir(parents=True, exist_ok=True)
+    payload = {"environment": environment, "metrics": [asdict(m) for m in metrics]}
+    out.with_suffix(".json").write_text(json.dumps(payload, indent=2))
+    out.with_suffix(".md").write_text(
+        format_device(metrics, environment), encoding="utf-8"
+    )
+
+    failures = [m for m in metrics if not m.success]
+    print(
+        f"device-efficiency: {len(metrics) - len(failures)}/{len(metrics)} solved "
+        f"-> {out.with_suffix('.json')}, {out.with_suffix('.md')}"
+    )
+    for m in failures:
+        print(f"  FAIL {m.backend}/{m.route} n={m.n_vars}: kkt={m.kkt_error:.2e}")
+    return 1 if failures else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/benchmarks/runners/micro/bench_kernels.py b/benchmarks/runners/micro/bench_kernels.py
index 368072e..2f626c2 100644
--- a/benchmarks/runners/micro/bench_kernels.py
+++ b/benchmarks/runners/micro/bench_kernels.py
@@ -1,18 +1,95 @@
-"""pytest-benchmark micro-benchmarks: matvec, factor, one Newton step (§9.4).
-
-Run with ``pytest benchmarks/runners/micro --benchmark-only``.
-"""
-
-from __future__ import annotations
-
-import pytest
-
-
-@pytest.mark.skip(reason="kernels")
-def test_matvec_throughput(benchmark):
-    raise NotImplementedError
-
-
-@pytest.mark.skip(reason="kernels")
-def test_one_newton_step(benchmark):
-    raise NotImplementedError
+"""pytest-benchmark micro-benchmarks: matvec, dense KKT solve, one Newton step.
+
+These isolate the per-kernel GPU cost (matmul, factor/solve) from the IPM loop
+overhead measured by ``benchmarks.runners.device_efficiency``. Each benchmark is
+parametrized over every installed backend (NumPy / Torch / CuPy), so the same
+kernels are timed on CPU and — when CuPy is present — on the GPU.
+
+Run with::
+
+    pytest benchmarks/runners/micro --benchmark-only
+    pytest benchmarks/runners/micro --benchmark-only -k cupy   # GPU kernels only
+
+Each callable ends with a scalar read so a lazy/async backend (CuPy) is forced to
+synchronize *inside* the timed region — otherwise the benchmark would clock only
+kernel-launch latency, not the kernel.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from ipax.backend.operators import Dense
+from ipax.linalg.dense import DenseSolver
+from ipax.testing.backends import import_namespace
+
+
+def _installed_backends() -> list[str]:
+    names: list[str] = []
+    for name in ("numpy", "torch", "cupy"):
+        try:
+            import_namespace(name)
+        except ImportError:
+            continue
+        names.append(name)
+    return names
+
+
+@pytest.fixture(params=_installed_backends(), ids=str)
+def xp(request: pytest.FixtureRequest):
+    return import_namespace(request.param)
+
+
+def _spd_matrix(xp, n: int):
+    """A symmetric positive-definite ``n×n`` matrix (well-conditioned)."""
+    m = xp.reshape(xp.linspace(-1.0, 1.0, n * n, dtype=xp.float64), (n, n))
+    return xp.matmul(m, xp.matrix_transpose(m)) + n * xp.eye(n, dtype=xp.float64)
+
+
+def test_matvec_throughput(benchmark, xp):
+    """Dense operator matvec — the dominant matrix-free kernel."""
+    n = 1024
+    op = Dense(_spd_matrix(xp, n))
+    v = xp.ones((n,), dtype=xp.float64)
+
+    def run() -> float:
+        return float(xp.sum(op.matvec(v)))
+
+    benchmark(run)
+
+
+def test_dense_kkt_solve(benchmark, xp):
+    """Factor + solve of a dense SPD system — the condensed KKT linear solve."""
+    n = 512
+    operator = Dense(_spd_matrix(xp, n))
+    rhs = xp.ones((n,), dtype=xp.float64)
+    solver = DenseSolver()
+
+    def run() -> float:
+        solver.factor(operator)
+        return float(xp.sum(solver.solve(rhs)))
+
+    benchmark(run)
+
+
+def test_one_newton_step(benchmark, xp):
+    """One Newton-system solve: assemble RHS, factor, solve, recover (proxy).
+
+    Times the linear-algebra heart of a single IPM iteration — a symmetric solve
+    plus the surrounding vector arithmetic — without the convergence/line-search
+    machinery, so a GPU regression here is a kernel regression, not loop overhead.
+    """
+    n = 512
+    operator = Dense(_spd_matrix(xp, n))
+    grad = xp.linspace(0.1, 1.0, n, dtype=xp.float64)
+    sigma = xp.ones((n,), dtype=xp.float64)
+    solver = DenseSolver()
+
+    def run() -> float:
+        rhs = -(grad + sigma)
+        solver.factor(operator)
+        dx = solver.solve(rhs)
+        dz = sigma * dx  # representative recovery of an eliminated block
+        return float(xp.max(xp.abs(dx)) + xp.max(xp.abs(dz)))
+
+    benchmark(run)
diff --git a/ipax/__init__.py b/ipax/__init__.py
index 37f3b9c..e249486 100644
--- a/ipax/__init__.py
+++ b/ipax/__init__.py
@@ -63,4 +63,4 @@
     "solve",
 ]
 
-__version__ = "0.1.1"
+__version__ = "0.2.0"
diff --git a/ipax/_logging.py b/ipax/_logging.py
index 709ac5a..3aef35e 100644
--- a/ipax/_logging.py
+++ b/ipax/_logging.py
@@ -209,6 +209,7 @@ def format_result(result: Result) -> str:
         f"  infeasibility = {result.constraint_violation:.3e}\n"
         f"  solve time    = {result.solve_time:.3e}s\n"
         f"  linear solver = {result.linear_solver}\n"
+        f"  device        = {result.device}\n"
         f"  derivatives   = grad:{src.gradient} eq_jac:{src.eq_jacobian} "
         f"ineq_jac:{src.ineq_jacobian} hess:{src.hessian}"
     )
diff --git a/ipax/backend/namespace.py b/ipax/backend/namespace.py
index 0158b15..c341282 100644
--- a/ipax/backend/namespace.py
+++ b/ipax/backend/namespace.py
@@ -103,6 +103,26 @@ class Capabilities:
     default_float: str  # "float64" preferred; read from inputs in practice
 
 
+def _detect_devices(xp: Namespace) -> tuple[str, ...]:
+    """Available devices via the Array-API info namespace, stringified.
+
+    Uses ``xp.__array_namespace_info__().devices()`` (Array API 2023.12 inspection
+    API) so the probe stays inside the standard — no ``cupy.cuda``/``torch.cuda``
+    in the core. Backends expose device objects (``<CUDA Device 0>``,
+    ``device(type='cpu')``, ``'cpu'``); we stringify them for reporting. Falls
+    back to ``("cpu",)`` for namespaces lacking the inspection API.
+    """
+    info_fn = getattr(xp, "__array_namespace_info__", None)
+    if info_fn is None:
+        return ("cpu",)
+    try:
+        devices = info_fn().devices()
+    except Exception:  # best-effort probe; never fail capability detection
+        return ("cpu",)
+    names = tuple(str(d) for d in devices)
+    return names or ("cpu",)
+
+
 def capabilities(xp: Namespace) -> Capabilities:
     """Probe ``xp`` for optional Array-API features and adapter availability.
 
@@ -131,7 +151,7 @@ def capabilities(xp: Namespace) -> Capabilities:
         linalg_functions=linalg_functions,
         has_sparse_adapter=name in _SPARSE_ADAPTER_BACKENDS,
         supports_autodiff=name in _AUTODIFF_BACKENDS,
-        devices=("cpu",),
+        devices=_detect_devices(xp),
         default_float=default_float,
     )
 
diff --git a/ipax/ipm/barrier.py b/ipax/ipm/barrier.py
index 4fbf91b..b188da0 100644
--- a/ipax/ipm/barrier.py
+++ b/ipax/ipm/barrier.py
@@ -16,8 +16,11 @@
 
 from __future__ import annotations
 
+import math
 from typing import TYPE_CHECKING
 
+from ipax.backend.namespace import array_namespace
+
 if TYPE_CHECKING:
     from ipax.options import BarrierOptions
     from ipax.typing import Array
@@ -43,11 +46,19 @@ def fraction_to_boundary(v: Array, dv: Array, tau: float) -> float:
     if not 0.0 < tau <= 1.0:
         raise ValueError("tau must be in (0, 1]")
 
-    alpha = 1.0
-    for idx in range(int(v.shape[0])):
-        step = float(dv[idx])
-        if step < 0.0:
-            alpha = min(alpha, tau * float(v[idx]) / (-step))
+    if int(v.shape[0]) == 0:
+        return 1.0
+
+    # Only components moving toward the boundary (dv < 0) limit the step; for
+    # those alpha_i = tau * v_i / (-dv_i) (Wachter & Biegler 2006, eq. 15).
+    # Vectorized so the whole rule costs a single host<->device sync (the final
+    # float()) rather than one per element — the element-wise Python loop made
+    # this O(n) syncs/call and dominated GPU iteration time.
+    xp = array_namespace(v, dv)
+    blocking = dv < 0.0
+    safe_denom = xp.where(blocking, -dv, xp.ones_like(dv))
+    ratios = xp.where(blocking, tau * v / safe_denom, xp.full_like(v, math.inf))
+    alpha = float(xp.min(ratios))
     return max(0.0, min(1.0, alpha))
 
 
diff --git a/ipax/result.py b/ipax/result.py
index 9abef8f..b23e7d3 100644
--- a/ipax/result.py
+++ b/ipax/result.py
@@ -156,6 +156,7 @@ class Result:
     constraint_violation: float = float("inf")
     solve_time: float = 0.0  # total wall-clock seconds for the solve
     linear_solver: str = ""  # the linear solver used internally (e.g. "dense")
+    device: str = ""  # device the solve ran on (e.g. "cpu", "<CUDA Device 0>")
 
     derivative_sources: DerivativeSources = field(default_factory=DerivativeSources)
     history: tuple[IterationRecord, ...] = ()
diff --git a/ipax/solve.py b/ipax/solve.py
index 887b625..d36ae37 100644
--- a/ipax/solve.py
+++ b/ipax/solve.py
@@ -157,6 +157,7 @@ def solve(
             kkt_error=float("inf"),
             constraint_violation=_bound_violation(xp, x0, lower, upper),
             solve_time=perf_counter() - start_time,
+            device=_describe_device(x0),
             message="infeasible bounds: x_L > x_U",
         )
 
@@ -228,6 +229,7 @@ def transform_record(record: IterationRecord) -> IterationRecord:
         result,
         solve_time=perf_counter() - start_time,
         linear_solver=_describe_solver(solver),
+        device=_describe_device(result.x),
     )
 
     # Post-solve summary (tier 1) and the timing split (tier 2).
@@ -286,6 +288,16 @@ def _describe_solver(solver: object) -> str:
     return describe() if callable(describe) else type(solver).__name__
 
 
+def _describe_device(x: Array) -> str:
+    """Stringified device of the solution array (e.g. ``"cpu"``, CUDA device).
+
+    Read from the array's standard ``.device`` attribute so it reflects where the
+    solve actually ran without importing any concrete backend.
+    """
+    device = getattr(x, "device", None)
+    return "" if device is None else str(device)
+
+
 def _count_finite(xp: Namespace, bound: Array | None) -> int:
     """Number of finite entries in a bound vector (``None`` → 0)."""
     if bound is None:
diff --git a/tests/integration/test_device_efficiency.py b/tests/integration/test_device_efficiency.py
new file mode 100644
index 0000000..bd3fe5e
--- /dev/null
+++ b/tests/integration/test_device_efficiency.py
@@ -0,0 +1,86 @@
+"""Smoke test for the device-efficiency profiling harness.
+
+Runs on CPU (the namespace fixture) so the host-sync counter, ``DeviceMetrics``,
+the runner, and the report formatter stay correct as the solver evolves. The
+host-sync *count* is unavailable on NumPy (its ``ndarray`` scalar dunders cannot
+be patched), available on Torch — both paths are exercised. Real GPU numbers come
+from running ``benchmarks.runners.device_efficiency`` on CUDA hardware.
+"""
+
+from __future__ import annotations
+
+import json
+
+import ipax
+from benchmarks.harness import (
+    DeviceMetrics,
+    capture_environment,
+    format_device,
+    measure_device_solve,
+)
+from benchmarks.runners.device_efficiency import main, run_device_efficiency
+from ipax.testing.problems import make_rt_like_problem
+
+
+def test_measure_device_solve_populates_metrics(namespace, backend_name):
+    problem = make_rt_like_problem(namespace, 60, n_structures=4, density=0.2, seed=0)
+    x0 = namespace.full((60,), 0.01, dtype=namespace.float64)
+
+    m = measure_device_solve(
+        problem,
+        x0,
+        ipax.Options(hessian="exact", linsolve="krylov"),
+        backend=backend_name,
+        route="krylov",
+    )
+
+    assert isinstance(m, DeviceMetrics)
+    assert m.success
+    assert m.n_iter > 0
+    assert m.solve_time > 0.0
+    assert m.time_per_iter > 0.0
+    # NumPy's built-in ndarray cannot be patched -> count unavailable (None);
+    # Torch's Tensor can, so the loop's scalar reads are counted (> 0).
+    if backend_name == "numpy":
+        assert m.host_syncs is None
+    else:
+        assert m.host_syncs is not None and m.host_syncs > 0
+        assert m.syncs_per_iter is not None and m.syncs_per_iter > 0
+
+
+def test_sync_counter_restores_dunders_after_window(namespace):
+    """The counter must not leave the array type permanently patched."""
+    problem = make_rt_like_problem(namespace, 40, n_structures=4, density=0.2, seed=0)
+    x0 = namespace.full((40,), 0.01, dtype=namespace.float64)
+    before = namespace.asarray([2.5])[0]
+    assert float(before) == 2.5
+
+    measure_device_solve(
+        problem,
+        x0,
+        ipax.Options(hessian="exact", linsolve="dense"),
+        backend="numpy",
+        route="dense",
+    )
+
+    after = namespace.asarray([3.5])[0]
+    assert float(after) == 3.5  # dunder behaves normally post-window
+
+
+def test_runner_produces_report(tmp_path):
+    metrics = run_device_efficiency(["numpy"], ["dense"], [40])
+    assert metrics and all(m.success for m in metrics)
+
+    env = capture_environment()
+    md = format_device(metrics, env)
+    assert "# ipax device-efficiency study" in md
+    assert "syncs/iter" in md
+
+    out = tmp_path / "device"
+    rc = main(
+        ["--backends", "numpy", "--routes", "dense", "--sizes", "40", "--out", str(out)]
+    )
+    assert rc == 0
+    payload = json.loads(out.with_suffix(".json").read_text())
+    assert payload["metrics"][0]["route"] == "dense"
+    assert out.with_suffix(".md").exists()
diff --git a/tests/integration/test_known_optima.py b/tests/integration/test_known_optima.py
index 60a6d09..1f79b6c 100644
--- a/tests/integration/test_known_optima.py
+++ b/tests/integration/test_known_optima.py
@@ -171,6 +171,9 @@ def gradient(self, x):
     assert result.status is Status.INFEASIBLE
     assert not result.success
     assert "infeasible" in result.message.lower()
+    # device must be reported on the early-return failure path too, consistent
+    # with the main solve path (Copilot PR #2 review).
+    assert result.device == str(array(namespace, [0.5]).device)
 
 
 def test_inconsistent_equalities_report_infeasible(namespace):
diff --git a/tests/regression/test_fraction_to_boundary_no_elementwise_sync.py b/tests/regression/test_fraction_to_boundary_no_elementwise_sync.py
new file mode 100644
index 0000000..59240aa
--- /dev/null
+++ b/tests/regression/test_fraction_to_boundary_no_elementwise_sync.py
@@ -0,0 +1,39 @@
+"""Regression: fraction-to-boundary must not host-sync per element (GPU perf).
+
+``fraction_to_boundary`` originally looped over the array in Python, calling
+``float(v[idx])``/``float(dv[idx])`` per element — O(n) host<->device syncs per
+call, and it is called 6x per IPM iteration. On a CUDA backend this dominated
+iteration time: a device-efficiency profile showed ~5,500 syncs/iter at n=1000
+(scaling linearly with n). The rule is now vectorized to a single sync.
+
+This guards the property directly: counting the device->host scalar
+materializations during one call on a large vector must be O(1), not O(n). The
+counter patches the array type's scalar dunders, which only works on backends
+whose array type allows it (Torch yes, NumPy's built-in ndarray no), so the test
+skips where counting is unavailable — the cost it guards is a GPU cost anyway.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from benchmarks.harness import _ScalarSyncCounter
+from ipax.ipm.barrier import fraction_to_boundary
+from tests._helpers import array
+
+
+def test_fraction_to_boundary_is_constant_sync(namespace):
+    n = 2000
+    v = array(namespace, [1.0] * n)
+    dv = array(namespace, [-0.5 if i % 2 else 0.5 for i in range(n)])
+
+    with _ScalarSyncCounter(type(v)) as counter:
+        alpha = fraction_to_boundary(v, dv, tau=0.95)
+
+    if not counter.available:
+        pytest.skip(f"scalar-sync counting unavailable for {type(v)!r}")
+
+    # Vectorized form does a single float() on the reduced scalar; a constant
+    # bound (independent of n) is the invariant. The old loop would be ~n here.
+    assert counter.count <= 4, f"{counter.count} host syncs for n={n} (expected O(1))"
+    assert 0.0 <= alpha <= 1.0
diff --git a/tests/unit/test_backend_namespace.py b/tests/unit/test_backend_namespace.py
index 57e5659..dce0edd 100644
--- a/tests/unit/test_backend_namespace.py
+++ b/tests/unit/test_backend_namespace.py
@@ -73,3 +73,21 @@ def test_capabilities_flags_jax_as_autodiff_not_sparse_numpy():
     caps = capabilities(types.ModuleType("jax.numpy"))
     assert caps.name == "jax"
     assert caps.supports_autodiff is True
+
+
+def test_capabilities_reports_real_devices(namespace):
+    """Devices come from the Array-API info namespace, not a hardcoded ``cpu``."""
+    with implemented("device probe"):
+        caps = capabilities(namespace)
+
+    assert isinstance(caps.devices, tuple)
+    assert len(caps.devices) >= 1
+    assert all(isinstance(d, str) for d in caps.devices)
+    # Every installed CI backend (numpy/torch/array_api_strict) exposes a CPU.
+    assert any("cpu" in d.lower() for d in caps.devices)
+
+
+def test_capabilities_devices_falls_back_when_info_missing():
+    """A bare namespace without ``__array_namespace_info__`` degrades to ``cpu``."""
+    caps = capabilities(types.ModuleType("numpy"))
+    assert caps.devices == ("cpu",)
diff --git a/tests/unit/test_barrier.py b/tests/unit/test_barrier.py
index 744630e..2b6bb72 100644
--- a/tests/unit/test_barrier.py
+++ b/tests/unit/test_barrier.py
@@ -36,3 +36,29 @@ def test_fraction_to_boundary_returns_one_for_nonnegative_step(namespace):
         alpha = fraction_to_boundary(v, dv, tau=0.99)
 
     assert alpha == 1.0
+
+
+def test_fraction_to_boundary_empty_vector_is_unconstrained(namespace):
+    v = array(namespace, [])
+    dv = array(namespace, [])
+    assert fraction_to_boundary(v, dv, tau=0.99) == 1.0
+
+
+def test_fraction_to_boundary_matches_elementwise_reference(namespace):
+    """Vectorized rule equals the scalar loop it replaced (regression).
+
+    The element-wise loop was O(n) host syncs/call and dominated GPU iteration
+    time; this pins the vectorized form to identical results on mixed signs.
+    """
+    raw_v = [1.0, 2.0, 0.5, 3.0, 0.25, 10.0]
+    raw_dv = [-0.5, 4.0, -4.0, 0.0, -0.1, -2.0]
+    tau = 0.95
+
+    expected = 1.0
+    for vi, dvi in zip(raw_v, raw_dv, strict=True):
+        if dvi < 0.0:
+            expected = min(expected, tau * vi / (-dvi))
+    expected = max(0.0, min(1.0, expected))
+
+    alpha = fraction_to_boundary(array(namespace, raw_v), array(namespace, raw_dv), tau)
+    assert_scalar_close(alpha, expected)
diff --git a/tests/unit/test_logging.py b/tests/unit/test_logging.py
index cb700ad..86a1839 100644
--- a/tests/unit/test_logging.py
+++ b/tests/unit/test_logging.py
@@ -63,6 +63,7 @@ def test_format_result_reports_status_and_sources():
         constraint_violation=0.0,
         solve_time=0.042,
         linear_solver="sparse [Feral LDL^T (CPU)]",
+        device="cpu",
         derivative_sources=DerivativeSources(gradient="analytic", hessian="lbfgs"),
         message="converged",
     )
@@ -71,6 +72,7 @@ def test_format_result_reports_status_and_sources():
     assert "iterations    = 7" in text
     assert "solve time    = 4.200e-02s" in text
     assert "linear solver = sparse [Feral LDL^T (CPU)]" in text
+    assert "device        = cpu" in text
     assert "grad:analytic" in text and "hess:lbfgs" in text