diff --git a/.claude/rules/docs-sync.md b/.claude/rules/docs-sync.md
index ad05793..fa8f595 100644
--- a/.claude/rules/docs-sync.md
+++ b/.claude/rules/docs-sync.md
@@ -31,11 +31,11 @@ When the user is committing, pushing, or otherwise wrapping up a change that tou
   - `docs/getting-started.md` (build instructions)
   - `AGENTS.md` (build section)
   - `README.md` (Quick Start commands)
-- `src/kernels.cu` / `src/kernels.h` — reorder or quantize kernel changes affect `docs/tutorials/benchmarking_examples.md` and the Reorder & quantize kernels subsection in `AGENTS.md`.
+- `src/kernels.cu` / `src/kernels.h` — reorder or quantize kernel changes affect `docs/benchmarks/raw_benchmarking.md` and the Reorder & quantize kernels subsection in `AGENTS.md`.
 
 ### Benchmark and example changes (medium impact)
 - `examples/*.cpp` or `examples/*.yaml` — new benchmarks, changed CLI flags, or new YAML config keys may need updating in:
-  - `docs/tutorials/benchmarking_examples.md`
+  - `docs/benchmarks/raw_benchmarking.md`
   - `docs/tutorials/configuration-walkthrough.md` — when adding or removing a YAML, add or remove a leaf in the **"Choosing an example config"** decision tree (`#choosing-an-example-config`). CI's `scripts/check_doc_refs.py` enforces that every YAML in `examples/` is referenced in this file; a new config without a tree leaf will fail the check.
   - `AGENTS.md` (benchmark table)
 - When adding or removing a benchmark executable, also update the benchmark table in `AGENTS.md`.
@@ -77,10 +77,10 @@ When the user is committing, pushing, or otherwise wrapping up a change that tou
 | `src/manager.h` | `docs/api-reference/cpp.md`, `docs/concepts.md`, `AGENTS.md` |
 | `src/managers/*/` | `docs/getting-started.md`, `docs/concepts.md` (backend list + maturity), `docs/api-reference/configuration.md`, `docs/tutorials/configuration-walkthrough.md`, `README.md`, `AGENTS.md` |
 | `src/CMakeLists.txt` | `docs/getting-started.md`, `AGENTS.md`, `README.md` |
-| `src/kernels.cu` | `docs/tutorials/benchmarking_examples.md`, `AGENTS.md` |
+| `src/kernels.cu` | `docs/benchmarks/raw_benchmarking.md`, `AGENTS.md` |
 | `python/daqiri_common_pybind.cpp` | `docs/api-reference/python.md`, `AGENTS.md` |
-| `examples/*.cpp` | `docs/tutorials/benchmarking_examples.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` |
-| `examples/*.yaml` | `docs/tutorials/benchmarking_examples.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` |
+| `examples/*.cpp` | `docs/benchmarks/raw_benchmarking.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` |
+| `examples/*.yaml` | `docs/benchmarks/raw_benchmarking.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` |
 | `examples/*.py` | `docs/api-reference/python.md`, `AGENTS.md` |
 | `mkdocs.yml` | `docs/index.html` (nav links) |
 | Any `docs/*` rename/move | `README.md` (Documentation table), `AGENTS.md` (Documentation section), `mkdocs.yml`, `docs/index.html` |
diff --git a/.greptile/config.json b/.greptile/config.json
index 75d2798..5436c2f 100644
--- a/.greptile/config.json
+++ b/.greptile/config.json
@@ -62,7 +62,7 @@
     },
     {
       "id": "doc-sync",
-      "rule": "DAQIRI has no automated doc-sync gate beyond mkdocs/strict link checks. When a PR changes any of the files listed in .claude/rules/docs-sync.md, the matching docs must be updated in the same PR. Specifically: src/common.h | src/types.h | src/manager.h => docs/api-guide.md + docs/daqiri-api.html + AGENTS.md (Architecture); src/managers/* => docs/getting-started.md + docs/configuration.md + docs/tutorials/configuration-walkthrough.md + README.md (Backends) + AGENTS.md; src/CMakeLists.txt => docs/getting-started.md + AGENTS.md (Build & run) + README.md (Quick Start); src/kernels.cu => docs/tutorials/benchmarking_examples.md + AGENTS.md; examples/*.{cpp,yaml} => docs/tutorials/benchmarking_examples.md + docs/tutorials/configuration-walkthrough.md + AGENTS.md (benchmark table). If the PR touches code in these paths but does not update the matching docs, flag it as medium severity and list the specific docs to update.",
+      "rule": "DAQIRI has no automated doc-sync gate beyond mkdocs/strict link checks. When a PR changes any of the files listed in .claude/rules/docs-sync.md, the matching docs must be updated in the same PR. Specifically: src/common.h | src/types.h | src/manager.h => docs/api-guide.md + docs/daqiri-api.html + AGENTS.md (Architecture); src/managers/* => docs/getting-started.md + docs/configuration.md + docs/tutorials/configuration-walkthrough.md + README.md (Backends) + AGENTS.md; src/CMakeLists.txt => docs/getting-started.md + AGENTS.md (Build & run) + README.md (Quick Start); src/kernels.cu => docs/benchmarks/raw_benchmarking.md + AGENTS.md; examples/*.{cpp,yaml} => docs/benchmarks/raw_benchmarking.md + docs/tutorials/configuration-walkthrough.md + AGENTS.md (benchmark table). If the PR touches code in these paths but does not update the matching docs, flag it as medium severity and list the specific docs to update.",
       "scope": ["src/**", "examples/**", "mkdocs.yml", "README.md", "AGENTS.md", "docs/**"],
       "severity": "medium"
     },
diff --git a/.greptile/rules.md b/.greptile/rules.md
index af38c34..09b38a4 100644
--- a/.greptile/rules.md
+++ b/.greptile/rules.md
@@ -106,8 +106,8 @@ The mapping (mirrored from `.claude/rules/docs-sync.md`):
 | `src/manager.h` | `docs/api-guide.md`, `AGENTS.md` (Manager abstraction) |
 | `src/managers/*/` | `docs/getting-started.md`, `docs/configuration.md`, `docs/tutorials/configuration-walkthrough.md`, `README.md` (Backends), `AGENTS.md` |
 | `src/CMakeLists.txt` (CMake options, `DAQIRI_MGR` default, CUDA arch) | `docs/getting-started.md`, `AGENTS.md` (Build & run), `README.md` (Quick Start) |
-| `src/kernels.cu` / `src/kernels.h` | `docs/tutorials/benchmarking_examples.md`, `AGENTS.md` (Reorder & quantize kernels) |
-| `examples/*.cpp`, `examples/*.yaml` (new bench, new CLI flag, new YAML key) | `docs/tutorials/benchmarking_examples.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` (benchmark table) |
+| `src/kernels.cu` / `src/kernels.h` | `docs/benchmarks/raw_benchmarking.md`, `AGENTS.md` (Reorder & quantize kernels) |
+| `examples/*.cpp`, `examples/*.yaml` (new bench, new CLI flag, new YAML key) | `docs/benchmarks/raw_benchmarking.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` (benchmark table) |
 | `mkdocs.yml` nav | `docs/index.html` (landing page links) |
 | Any `docs/*` rename or move | `README.md` (Documentation table), `AGENTS.md` (Documentation section), `mkdocs.yml`, `docs/index.html` |
 
diff --git a/AGENTS.md b/AGENTS.md
index effe7ff..3e78e3e 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -97,7 +97,10 @@ The web docs live in `docs/` and are built with [MkDocs Material](https://squidf
 - `docs/api-reference/index.md` — API guide (6-step application lifecycle, configuration-first model)
 - `docs/api-reference/configuration.md`, `docs/api-reference/cpp.md`, `docs/api-reference/python.md` — YAML schema, C++ API, and Python bindings docs
 - `docs/tutorials/` — tutorial walkthroughs (system config, config-file walkthrough)
-- `docs/tutorials/benchmarking_examples.md` — surfaced as a top-level "Benchmarks" nav entry in `mkdocs.yml` and `docs/index.html`; file kept at its original path for inbound-link stability
+- `docs/benchmarks/` — benchmark guide pages, surfaced as a top-level "Benchmarking" nav section in `mkdocs.yml` and `docs/index.html`:
+  - `docs/benchmarks/benchmarks.md` — overview and backend-selection decision tree
+  - `docs/benchmarks/socket_benchmarking.md` — "Socket and RDMA Benchmarking" (TCP/UDP and RoCE/RDMA)
+  - `docs/benchmarks/raw_benchmarking.md` — "Raw Ethernet Benchmarking" (DPDK `raw_*` benches)
 - `docs/stylesheets/extra.css` — custom theme overrides
 
 **User-facing vocabulary:** docs and the YAML schema use `stream_type` (`raw`, `socket`, future `pcie`) and `protocol` (`udp`, `tcp`, `roce`). The word "backend" is internal-only — accurate for `src/managers/<name>/`, the `Manager` ABC, CMake `DAQIRI_MGR`, and API-reference function blurbs, but should not appear in tutorials, the landing page, or concept pages. The mapping: `stream_type: "raw"` is implemented by the `dpdk` manager; `stream_type: "socket"` with `protocol: "udp"` / `"tcp"` is implemented by the `socket` manager; `stream_type: "socket"` with `protocol: "roce"` is implemented by the `rdma` manager.
@@ -105,7 +108,7 @@ The web docs live in `docs/` and are built with [MkDocs Material](https://squidf
 **Keeping docs in sync with code:** before committing changes, scan for the recurring drift hotspots:
 - **Stream-type list** (`src/managers/*/`) — README Backends table, `docs/getting-started.md`, `docs/concepts.md` (Stream Types section + Support and testing admonition), `docs/api-reference/configuration.md`
 - **CMake options / `DAQIRI_MGR` default** (`src/CMakeLists.txt:137`) — README Quick Start, `docs/getting-started.md`, this file's Build & run section
-- **Benchmark binary or YAML names** (`examples/`) — the benchmark table above, `docs/tutorials/benchmarking_examples.md`, and the "Choosing an example config" decision tree in `docs/tutorials/configuration-walkthrough.md` (every YAML must have a leaf; CI's `scripts/check_doc_refs.py` enforces coverage)
+- **Benchmark binary or YAML names** (`examples/`) — the benchmark table above, `docs/benchmarks/raw_benchmarking.md`, and the "Choosing an example config" decision tree in `docs/tutorials/configuration-walkthrough.md` (every YAML must have a leaf; CI's `scripts/check_doc_refs.py` enforces coverage)
 - **Public API include** (`#include <daqiri/daqiri.h>`; source files under `include/daqiri/`) — `docs/api-reference/index.md`, `docs/api-reference/cpp.md`, `docs/api-reference/python.md`; if the change adds or renames a user-facing concept, also `docs/concepts.md`
 - **Python bindings** (`python/daqiri_common_pybind.cpp`) — `docs/api-reference/python.md` (function reference tables, enums/classes tables, GIL Behavior section)
 - **Doc reorganization** (any rename in `docs/`) — `docs/index.html` landing page, `mkdocs.yml` nav, README Documentation table
diff --git a/README.md b/README.md
index 44867bd..617ed90 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ target storage stack to be reported as supported by `gdscheck.py -p`.
 Container build:
 
 ```bash
-BASE_TARGET=dpdk DAQIRI_MGR="dpdk rdma" scripts/build-container.sh
+BASE_TARGET=dpdk DAQIRI_MGR="dpdk socket rdma" scripts/build-container.sh
 ```
 
 OpenTelemetry metrics are opt-in. Build with `-DDAQIRI_ENABLE_OTEL_METRICS=ON`
@@ -81,6 +81,30 @@ exporters.
 See [Getting Started](https://nvidia.github.io/daqiri/getting-started/) for requirements, CMake options, and
 running the benchmarks.
 
+## Benchmarking
+
+Start with the [Benchmarking overview](https://nvidia.github.io/daqiri/benchmarks/benchmarks/) to choose between Linux sockets, RoCE/RDMA, and raw Ethernet.
+
+For Spark-style on-wire tests, use the same client/server namespace shape for Linux sockets and RDMA/RoCE: put the client-facing NIC in one namespace, the server-facing NIC in another, pin routes and neighbors to those interfaces, then verify `tx_packets_phy` on the client and `rx_packets_phy` on the server before trusting bandwidth numbers.
+
+```bash
+# Linux TCP/UDP sockets, split by namespace
+ip netns exec dq_wire_server ./build/examples/daqiri_bench_socket \
+  /tmp/socket-server.yaml --seconds 10 --mode server &
+ip netns exec dq_wire_client ./build/examples/daqiri_bench_socket \
+  /tmp/socket-client.yaml --seconds 10 --mode client
+wait
+
+# RoCE/RDMA, using the same namespace pair
+ip netns exec dq_wire_server ./build/examples/daqiri_bench_rdma \
+  /tmp/rdma-server.yaml --seconds 10 --mode server &
+ip netns exec dq_wire_client ./build/examples/daqiri_bench_rdma \
+  /tmp/rdma-client.yaml --seconds 10 --mode client
+wait
+```
+
+See [Socket and RDMA Benchmarking](https://nvidia.github.io/daqiri/benchmarks/socket_benchmarking/) for the full namespace setup and YAML templates. See [Raw Ethernet Benchmarking](https://nvidia.github.io/daqiri/benchmarks/raw_benchmarking/) for DPDK/raw Ethernet loopback tests.
+
 ## Documentation
 
 Reference material for the DAQIRI codebase:
@@ -98,7 +122,9 @@ Reference material for the DAQIRI codebase:
 Step-by-step walkthroughs to get hands-on:
 
 - [System Configuration](https://nvidia.github.io/daqiri/tutorials/system_configuration/) — NIC drivers, link layers, GPUDirect, hugepages, CPU isolation, GPU clocks
-- [Benchmarking Examples](https://nvidia.github.io/daqiri/tutorials/benchmarking_examples/) — run `daqiri_bench_raw_gpudirect` with a loopback test
+- [Benchmarking Overview](https://nvidia.github.io/daqiri/benchmarks/benchmarks/) — choose between Linux sockets, RoCE/RDMA, and raw Ethernet benchmarks
+- [Socket and RDMA Benchmarking](https://nvidia.github.io/daqiri/benchmarks/socket_benchmarking/) — run TCP/UDP sockets and RoCE/RDMA with matching namespace isolation
+- [Raw Ethernet Benchmarking](https://nvidia.github.io/daqiri/benchmarks/raw_benchmarking/) — run `daqiri_bench_raw_gpudirect` with a physical loopback test
 - [Understanding the Configuration File](https://nvidia.github.io/daqiri/tutorials/configuration-walkthrough/) — annotated YAML walkthrough
 
 ## License
diff --git a/docs/benchmarks/benchmarks.md b/docs/benchmarks/benchmarks.md
new file mode 100644
index 0000000..c8dc400
--- /dev/null
+++ b/docs/benchmarks/benchmarks.md
@@ -0,0 +1,49 @@
+# Benchmarking
+
+DAQIRI ships with several backends to handle different types of incoming and outgoing streams. The right stream type depends on the sensor being used and its capabilities. Use the decision tree below to choose the `stream_type`:
+
+![DAQIRI networking backend decision tree](../images/backend-decision-tree.svg)
+
+## Choose a backend
+
+| Use case | DAQIRI config | Benchmark | Start here |
+|---|---|---|---|
+| Ingest from or egress to a programmable PCIe sensor, such as an FPGA on the PCIe bus. | `stream_type: "pcie"` | Coming soon | PCIe benchmarking docs are coming soon. |
+| Compare against normal Linux networking, run on a non-NVIDIA NIC, or test a peer that speaks TCP/UDP sockets. | `stream_type: "socket"` with `protocol: "tcp"` or `protocol: "udp"` | `daqiri_bench_socket` | [Socket and RDMA Benchmarking](socket_benchmarking.md) |
+| Test a peer that already implements RDMA verbs over RoCE. | `stream_type: "socket"` with `protocol: "roce"` | `daqiri_bench_rdma` | [Socket and RDMA Benchmarking](socket_benchmarking.md#run-the-rdma-roce-benchmark) |
+| Drive raw Ethernet packets directly from an NVIDIA NIC under DAQIRI control. | `stream_type: "raw"` | `daqiri_bench_raw_gpudirect` and the other `raw_*` benches | [Raw Ethernet Benchmarking](raw_benchmarking.md) |
+
+!!! note "PCIe backend status"
+
+    The PCIe programmable-sensor path is under development. Once completed, it will allow third-party PCIe devices
+    to read from and write to the GPU's BAR1 memory.
+
+!!! note "Why RDMA is listed under socket"
+
+    The RoCE benchmark uses the connection-oriented socket/RDMA configuration model. The executable is named `daqiri_bench_rdma` to show the RDMA-specific API calls.
+
+## Common benchmark workflow
+
+1. Build the examples with the backend you plan to test. The container build command below enables all three (`dpdk`, `socket`, and `rdma`):
+
+    ```bash
+    BASE_TARGET=dpdk DAQIRI_MGR="dpdk socket rdma" scripts/build-container.sh
+    ```
+
+2. Pick the physical port pair or host pair that should carry the traffic. For same-host Spark wire tests, prefer a client namespace and a server namespace so the route cannot silently fall back to loopback.
+
+3. Prove the direction with hardware counters before trusting bandwidth numbers. For one-way client-to-server tests, the important counters are the client-side `tx_packets_phy` / `tx_bytes_phy` and the server-side `rx_packets_phy` / `rx_bytes_phy`.
+
+4. Run the DAQIRI benchmark and a known baseline such as `iperf3` or `ib_send_bw` with the same namespace, interface, and message-size assumptions.
+
+5. Monitor line rate with NIC counters or `mlnx_perf`; application-side byte counts are useful, but hardware counters answer whether packets actually reached the physical path.
+
+## Page map
+
+- [Socket and RDMA Benchmarking](socket_benchmarking.md) covers Linux TCP/UDP and RoCE/RDMA runs with matching client/server namespace setup.
+- [Raw Ethernet Benchmarking](raw_benchmarking.md) covers the DPDK/raw Ethernet examples, hugepage sizing, physical loopback configuration, and raw benchmark troubleshooting.
+- [Understanding the Configuration File](../tutorials/configuration-walkthrough.md) explains the YAML fields once you have selected the backend and example config.
+
+---
+**Previous:** [System Configuration](../tutorials/system_configuration.md)<br>
+**Next:** [Socket and RDMA Benchmarking](socket_benchmarking.md)
diff --git a/docs/tutorials/benchmarking_examples.md b/docs/benchmarks/raw_benchmarking.md
similarity index 93%
rename from docs/tutorials/benchmarking_examples.md
rename to docs/benchmarks/raw_benchmarking.md
index 1cb98fe..1eb415c 100644
--- a/docs/tutorials/benchmarking_examples.md
+++ b/docs/benchmarks/raw_benchmarking.md
@@ -1,19 +1,14 @@
----
-hide:
-  - navigation
----
-
-# Benchmarking Examples
+# Raw Ethernet Benchmarking
 
-DAQIRI provides a benchmarking application named `daqiri_bench_raw_gpudirect` that can be used to test the performance of the networking configuration. In this section, we'll walk you through the steps needed to configure the application for your NIC for Tx and Rx, and run a loopback test between the two interfaces with a [physical SFP cable](https://www.nvidia.com/en-us/networking/interconnect/) connecting them.
+DAQIRI provides raw Ethernet benchmark applications that use DPDK to drive an NVIDIA NIC directly. This page walks through `daqiri_bench_raw_gpudirect`, the TX/RX loopback config, and the raw Ethernet checks needed before interpreting throughput results.
 
 Make sure to [build the DAQIRI library](../getting-started.md#build-the-daqiri-library) beforehand.
 
-**Not sure which YAML to start from?** See [Choosing an example config](configuration-walkthrough.md#choosing-an-example-config) in the configuration tutorial — a use-case-driven decision tree from "I just want to verify the build" through reorder, recording, RDMA, and sockets.
+**Not sure which backend to benchmark?** Start with the [Benchmarking overview](benchmarks.md). Use this page after you have chosen the raw Ethernet backend. Use [Socket and RDMA Benchmarking](socket_benchmarking.md) for TCP, UDP, and RoCE/RDMA runs.
 
 !!! note "Prerequisites"
 
-    Before running the benchmarking application, ensure your system has been fully configured per the [System Configuration](system_configuration.md) page.
+    Before running the benchmarking application, ensure your system has been fully configured per the [System Configuration](../tutorials/system_configuration.md) page.
 
 ## Configure hugepages first
 
@@ -23,11 +18,11 @@ Size the hugepage pool to your YAML's `memory_regions` plus DPDK overhead before
 grep Huge /proc/meminfo
 ```
 
-For a persistent allocation across reboots, use the grub recipe in [Step 4 of System Configuration](system_configuration.md#step-4-enable-huge-pages).
+For a persistent allocation across reboots, use the grub recipe in [Step 4 of System Configuration](../tutorials/system_configuration.md#step-4-enable-huge-pages).
 
 ## Running the DAQIRI container
 
-If you built DAQIRI using the container approach, use the following command to launch the container with Raw Ethernet (DPDK) and GPU support. The host system must be fully configured (see [System Configuration](system_configuration.md)) before the container can access the NIC and GPU hardware.
+If you built DAQIRI using the container approach, use the following command to launch the container with Raw Ethernet (DPDK) and GPU support. The host system must be fully configured (see [System Configuration](../tutorials/system_configuration.md)) before the container can access the NIC and GPU hardware.
 
 ```bash
 docker run --rm -it --privileged \
@@ -41,7 +36,7 @@ docker run --rm -it --privileged \
 
     | Flag | Purpose |
     |------|---------|
-    | `--privileged` | DPDK requires raw access to NIC hardware (PCI devices, hugepage files). Also covers `/dev/infiniband` for RDMA. |
+    | `--privileged` | DPDK requires raw access to NIC hardware, PCI devices, and hugepage files. |
     | `--runtime=nvidia` | Makes the host GPU visible inside the container via the NVIDIA Container Toolkit |
     | `--network=host` | Shares the host network namespace so DPDK can discover the physical NIC interfaces and their PCIe topology |
     | `-v /dev/hugepages:/dev/hugepages` | Mounts the hugepage filesystem for DPDK memory allocation (`--privileged` alone does not cover mounted filesystems) |
@@ -50,7 +45,7 @@ docker run --rm -it --privileged \
 
 !!! tip "DGX Spark"
 
-    For systems configured per the [DGX Spark profile](system_configuration.md#dgx-spark-profile), use these configs to skip the PCIe/IP/CPU-core edits below:
+    For systems configured per the [DGX Spark profile](../tutorials/system_configuration.md#dgx-spark-profile), use these configs to skip the PCIe/IP/CPU-core edits below:
 
     - [`daqiri_bench_raw_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx_spark.yaml) for `daqiri_bench_raw_gpudirect` — still set `eth_dst_addr` to the RX MAC. The rx_port is `0002:01:00.1` (physical port p1), so read its MAC: `cat /sys/class/net/enP2p1s0f1np1/address`. This p0-to-p1 pairing is intentional for an over-the-wire single-machine loopback; using two PFs that map to the same physical port exercises the on-chip eswitch path instead.
     - [`daqiri_bench_rdma_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_rdma_tx_rx_spark.yaml) for `daqiri_bench_rdma` — no further edits needed.
@@ -62,7 +57,7 @@ The benchmark executables and example YAML configurations are located at:
 | **Container** | `/opt/daqiri/bin/` | `/opt/daqiri/bin/` |
 | **From source** | `./build/examples/` | `./examples/` |
 
-The fields in the YAML configs will be explained in more detail in [Understanding the Configuration File](configuration-walkthrough.md). For now, we'll stick to modifying the strict minimum required fields to run the application as-is on your system.
+The fields in the YAML configs will be explained in more detail in [Understanding the Configuration File](../tutorials/configuration-walkthrough.md). For now, we'll stick to modifying the strict minimum required fields to run the application as-is on your system.
 
 ##### Identify your NIC's PCIe addresses
 
@@ -426,7 +421,7 @@ The `*_packets_phy` and `*_bytes_phy` counters are physical-link counters. They
         [critical] [adv_network_dpdk_mgr.cpp:430] Failed to map MRs
         ```
 
-        Check the [GPUDirect setup](system_configuration.md#enable-gpudirect) for your
+        Check the [GPUDirect setup](../tutorials/system_configuration.md#enable-gpudirect) for your
         deployment. Some host builds use `nvidia-peermem`; the container path uses
         dma-buf support from the patched DPDK build.
 
@@ -450,13 +445,13 @@ The `*_packets_phy` and `*_bytes_phy` counters are physical-link counters. They
         EAL: x hugepages of size x reserved, no mounted hugetlbfs found for that size
         ```
 
-        Ensure your [hugepages are mounted](system_configuration.md#step-4-enable-huge-pages).
+        Ensure your [hugepages are mounted](../tutorials/system_configuration.md#step-4-enable-huge-pages).
 
         ```log
         EAL: No free x kB hugepages reported on node 0
         ```
 
-        Reachable only when the in-process preflight is bypassed (e.g. running an older binary against a host with hugepages reserved but not mounted). Mount per [System Configuration: Step 4](system_configuration.md#step-4-enable-huge-pages) and re-run.
+        Reachable only when the in-process preflight is bypassed (e.g. running an older binary against a host with hugepages reserved but not mounted). Mount per [System Configuration: Step 4](../tutorials/system_configuration.md#step-4-enable-huge-pages) and re-run.
 
     ??? failure "Stale `<file-prefix>map_*` files in /dev/hugepages after a SIGKILL"
 
@@ -478,5 +473,5 @@ The `*_packets_phy` and `*_bytes_phy` counters are physical-link counters. They
         You might need to kill some of the listed processes to free up GPU VRAM.
 
 ---
-**Previous:** [System Configuration](system_configuration.md)  
-**Next:** [Understanding the Configuration File](configuration-walkthrough.md) — deep dive into the YAML parameters
+**Previous:** [Benchmarking](benchmarks.md)<br>
+**Next:** [Understanding the Configuration File](../tutorials/configuration-walkthrough.md) — deep dive into the YAML parameters
diff --git a/docs/benchmarks/socket_benchmarking.md b/docs/benchmarks/socket_benchmarking.md
new file mode 100644
index 0000000..eb6731c
--- /dev/null
+++ b/docs/benchmarks/socket_benchmarking.md
@@ -0,0 +1,369 @@
+# Socket and RDMA Benchmarking
+
+Use this page when the peer protocol is TCP, UDP, or RoCE/RDMA. These benchmarks use the Linux networking stack for TCP/UDP and RDMA verbs for RoCE, so the same client/server namespace shape is useful for proving that traffic leaves the host through the expected NIC path.
+
+Make sure to [build DAQIRI](../getting-started.md#build-the-daqiri-library) with the socket and RDMA backends first.
+
+## Backend choices
+
+| Protocol | YAML selector | Benchmark executable | Typical reason to use it |
+|---|---|---|---|
+| TCP | `stream_type: "socket"`, `protocol: "tcp"` | `daqiri_bench_socket` | Baseline against normal Linux streams or test a TCP-speaking peer. |
+| UDP | `stream_type: "socket"`, `protocol: "udp"` | `daqiri_bench_socket` | Datagram baseline against Linux networking. UDP payloads must be at most `65507` bytes. |
+| RoCE/RDMA | `stream_type: "socket"`, `protocol: "roce"` | `daqiri_bench_rdma` | Compare DAQIRI RDMA verbs against tools such as `ib_send_bw` or `ib_write_bw`. |
+
+## Build and launch a test shell
+
+Build the socket and RDMA benchmarks inside the DAQIRI container:
+
+```bash
+docker run --rm --privileged --network=host --gpus all --ipc=host \
+  --user "$(id -u):$(id -g)" \
+  -v /dev/hugepages:/dev/hugepages \
+  -v "$PWD:/work" \
+  -w /work daqiri:local \
+  bash -lc 'cmake -S . -B build-socket-rdma \
+    -DBUILD_SHARED_LIBS=ON \
+    -DDAQIRI_BUILD_PYTHON=OFF \
+    -DDAQIRI_MGR="dpdk socket rdma" &&
+    cmake --build build-socket-rdma \
+      --target daqiri_bench_socket daqiri_bench_rdma -j"$(nproc)"'
+```
+
+Run the benchmark setup commands as root. The easiest path is a privileged, host-networked DAQIRI container:
+
+```bash
+docker run --rm -it --privileged --network=host --pid=host --ipc=host \
+  --gpus all \
+  -v "$PWD:/work" \
+  -v /tmp:/tmp \
+  -w /work daqiri:local bash
+```
+
+Install network tools inside the container if needed:
+
+```bash
+apt-get update
+apt-get install -y iproute2 iputils-ping ethtool iperf3 rdma-core ibverbs-utils
+```
+
+## Create isolated namespaces
+
+Choose one transmit-facing interface and one receive-facing interface. The example below uses the Spark pair that was verified to increment physical counters on the tested system; adjust names, IPs, and MAC addresses on other machines.
+
+```bash
+CLIENT_NS=dq_wire_client
+SERVER_NS=dq_wire_server
+
+CLIENT_IF=enp1s0f0np0
+SERVER_IF=enp1s0f1np1
+
+CLIENT_IP=10.250.0.1
+SERVER_IP=10.250.0.2
+
+CLIENT_MAC=4c:bb:47:2a:ea:ed
+SERVER_MAC=4c:bb:47:2a:ea:ee
+
+MTU=9082
+```
+
+Create namespaces and pin routes to the physical interfaces:
+
+```bash
+ip netns delete "$CLIENT_NS" >/dev/null 2>&1 || true
+ip netns delete "$SERVER_NS" >/dev/null 2>&1 || true
+
+ip addr flush dev "$CLIENT_IF" || true
+ip addr flush dev "$SERVER_IF" || true
+
+ip netns add "$CLIENT_NS"
+ip netns add "$SERVER_NS"
+
+ip link set "$CLIENT_IF" netns "$CLIENT_NS"
+ip link set "$SERVER_IF" netns "$SERVER_NS"
+
+ip -n "$CLIENT_NS" addr add "$CLIENT_IP/24" dev "$CLIENT_IF"
+ip -n "$SERVER_NS" addr add "$SERVER_IP/24" dev "$SERVER_IF"
+
+ip -n "$CLIENT_NS" link set lo up
+ip -n "$SERVER_NS" link set lo up
+ip -n "$CLIENT_NS" link set "$CLIENT_IF" mtu "$MTU" up
+ip -n "$SERVER_NS" link set "$SERVER_IF" mtu "$MTU" up
+
+ip -n "$CLIENT_NS" route add "$SERVER_IP/32" dev "$CLIENT_IF"
+ip -n "$SERVER_NS" route add "$CLIENT_IP/32" dev "$SERVER_IF"
+
+ip -n "$CLIENT_NS" neigh replace "$SERVER_IP" \
+  lladdr "$SERVER_MAC" dev "$CLIENT_IF" nud permanent
+ip -n "$SERVER_NS" neigh replace "$CLIENT_IP" \
+  lladdr "$CLIENT_MAC" dev "$SERVER_IF" nud permanent
+```
+
+Verify the route and a short control packet:
+
+```bash
+ip -n "$CLIENT_NS" route get "$SERVER_IP" from "$CLIENT_IP"
+ip -n "$SERVER_NS" route get "$CLIENT_IP" from "$SERVER_IP"
+ip netns exec "$CLIENT_NS" ping -c 1 -W 1 "$SERVER_IP"
+```
+
+The route output should name the namespace interface, not `lo`.
+
+!!! note "RDMA device visibility"
+
+    On most RoCE setups, the RDMA device follows the netdev/GID association used by the namespace. If `ibv_devinfo` or `rdma link show` inside a namespace cannot see the expected device, move the matching RDMA device into the namespace with `rdma dev set <rdma_device> netns <namespace>`, or run the RDMA benchmark in the host namespace and still verify the same physical counters.
+
+## Prove the pair hits the wire
+
+Capture directional PHY counters before and after a short transfer. For one-way client-to-server traffic:
+
+```bash
+ip netns exec "$CLIENT_NS" ethtool -S "$CLIENT_IF" | \
+  grep -E 'tx_packets_phy|tx_bytes_phy|tx_vport_unicast'
+ip netns exec "$SERVER_NS" ethtool -S "$SERVER_IF" | \
+  grep -E 'rx_packets_phy|rx_bytes_phy|rx_vport_unicast'
+```
+
+Use `iperf3` as a quick proof before running DAQIRI:
+
+```bash
+ip netns exec "$SERVER_NS" iperf3 -s -B "$SERVER_IP" -1 &
+sleep 1
+ip netns exec "$CLIENT_NS" iperf3 -c "$SERVER_IP" -B "$CLIENT_IP" -t 2 -P 1
+wait
+```
+
+Then check the counters again. Treat the result as on-wire only when the client `tx_packets_phy` and server `rx_packets_phy` counters increase by matching packet counts. If only vport counters move, pick a different port pair.
+
+## Run the Linux socket benchmark
+
+The shipped configs run both endpoints on `127.0.0.1` and are useful for a smoke test:
+
+```bash
+./build-socket-rdma/examples/daqiri_bench_socket \
+  examples/daqiri_bench_socket_udp_tx_rx.yaml \
+  --seconds 10 --mode both
+
+./build-socket-rdma/examples/daqiri_bench_socket \
+  examples/daqiri_bench_socket_tcp_tx_rx.yaml \
+  --seconds 10 --mode both
+```
+
+For an on-wire namespace test, use separate server and client YAML files. The important fields are the protocol, namespace IPs, server port, `max_payload_size`, memory-region `buf_size`, and benchmark `message_size`.
+
+Server-side UDP template:
+
+```yaml
+%YAML 1.2
+---
+daqiri:
+  cfg:
+    version: 1
+    stream_type: "socket"
+    protocol: "udp"
+    master_core: 3
+    debug: false
+    log_level: "info"
+    memory_regions:
+    - name: "DATA_SOCKET_SERVER"
+      kind: "host"
+      affinity: 0
+      num_bufs: 1024
+      buf_size: 65507
+    interfaces:
+    - name: udp_server
+      address: 10.250.0.2
+      socket_config:
+        mode: server
+        local_ip: 10.250.0.2
+        local_port: 5021
+        max_payload_size: 65535
+      rx:
+        queues:
+        - name: "RX_Queue"
+          id: 0
+          cpu_core: 8
+          batch_size: 1
+          memory_regions: ["DATA_SOCKET_SERVER"]
+      tx:
+        queues:
+        - name: "TX_Queue"
+          id: 0
+          cpu_core: 7
+          batch_size: 1
+          memory_regions: ["DATA_SOCKET_SERVER"]
+
+socket_bench_server:
+  server: true
+  send: false
+  receive: true
+  iterations: 1000000000
+  message_size: 65507
+  server_address: 10.250.0.2
+  client_address: 10.250.0.1
+  server_port: 5021
+```
+
+Client-side UDP template:
+
+```yaml
+%YAML 1.2
+---
+daqiri:
+  cfg:
+    version: 1
+    stream_type: "socket"
+    protocol: "udp"
+    master_core: 3
+    debug: false
+    log_level: "info"
+    memory_regions:
+    - name: "DATA_SOCKET_CLIENT"
+      kind: "host"
+      affinity: 0
+      num_bufs: 1024
+      buf_size: 65507
+    interfaces:
+    - name: udp_client
+      address: 10.250.0.1
+      socket_config:
+        mode: client
+        local_ip: 10.250.0.1
+        local_port: 5121
+        remote_ip: 10.250.0.2
+        remote_port: 5021
+        max_payload_size: 65535
+      rx:
+        queues:
+        - name: "RX_Queue"
+          id: 0
+          cpu_core: 8
+          batch_size: 1
+          memory_regions: ["DATA_SOCKET_CLIENT"]
+      tx:
+        queues:
+        - name: "TX_Queue"
+          id: 0
+          cpu_core: 7
+          batch_size: 1
+          memory_regions: ["DATA_SOCKET_CLIENT"]
+
+socket_bench_client:
+  server: false
+  send: true
+  receive: false
+  iterations: 1000000000
+  message_size: 65507
+  server_address: 10.250.0.2
+  client_address: 10.250.0.1
+  server_port: 5021
+```
+
+For TCP, change `protocol: "udp"` to `protocol: "tcp"` in both files. For UDP, keep `message_size` at or below `65507`.
+
+Run the server and client in their namespaces:
+
+```bash
+export LD_LIBRARY_PATH=/work/build-socket-rdma/src:${LD_LIBRARY_PATH:-}
+BIN=/work/build-socket-rdma/examples/daqiri_bench_socket
+
+ip netns exec "$SERVER_NS" env LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \
+  "$BIN" /tmp/socket-server.yaml --seconds 11 --mode server &
+
+sleep 1
+
+ip netns exec "$CLIENT_NS" env LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \
+  "$BIN" /tmp/socket-client.yaml --seconds 10 --mode client
+
+wait
+```
+
+For a four-process run, create four server/client YAML pairs with unique server ports such as `5021`, `5022`, `5023`, and `5024`, and unique client local ports such as `5121`, `5122`, `5123`, and `5124`.
+
+## Run the RDMA RoCE benchmark
+
+Start from `examples/daqiri_bench_rdma_tx_rx.yaml` or `examples/daqiri_bench_rdma_tx_rx_spark.yaml`. The full config can run both endpoints in one process:
+
+```bash
+./build-socket-rdma/examples/daqiri_bench_rdma \
+  examples/daqiri_bench_rdma_tx_rx_spark.yaml \
+  --seconds 10 --mode both
+```
+
+For namespace testing, split the file by role just as in the Linux socket test:
+
+- The server YAML keeps the server memory regions, the server interface with `socket_config.mode: server`, and `rdma_bench_server`.
+- The client YAML keeps the client memory regions, the client interface with `socket_config.mode: client`, and `rdma_bench_client`.
+- Both files use `stream_type: "socket"` and `protocol: "roce"`.
+- `rdma_bench_client.client_address` should be the client namespace IP.
+
+Run the split RDMA test with the same namespace pair:
+
+```bash
+export LD_LIBRARY_PATH=/work/build-socket-rdma/src:${LD_LIBRARY_PATH:-}
+BIN=/work/build-socket-rdma/examples/daqiri_bench_rdma
+
+ip netns exec "$SERVER_NS" env LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \
+  "$BIN" /tmp/rdma-server.yaml --seconds 11 --mode server &
+
+sleep 1
+
+ip netns exec "$CLIENT_NS" env LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \
+  "$BIN" /tmp/rdma-client.yaml --seconds 10 --mode client
+
+wait
+```
+
+Use `ib_send_bw` or `ib_write_bw` in the same namespaces as a comparison baseline, and monitor `mlnx_perf` or `ethtool -S` on the same directional interfaces.
+
+## Example Spark socket results
+
+The following DAQIRI socket matrix was run on the verified physical path `enp1s0f0np0 -> enp1s0f1np1` with four client/server process pairs:
+
+| Protocol | Message size | App TX | App RX | Loss | Client `tx_packets_phy` | Server `rx_packets_phy` |
+|---|---:|---:|---:|---:|---:|---:|
+| TCP | 1000 | 10.93 Gb/s | 10.93 Gb/s | 0.00% | 1,513,047 | 1,513,047 |
+| TCP | 8000 | 11.20 Gb/s | 11.20 Gb/s | 0.00% | 1,550,052 | 1,550,052 |
+| TCP | 1 MiB | 11.67 Gb/s | 11.67 Gb/s | 0.00% | 1,615,399 | 1,615,399 |
+| UDP | 1000 | 12.28 Gb/s | 11.68 Gb/s | 4.88% | 15,350,463 | 15,350,463 |
+| UDP | 8000 | 12.93 Gb/s | 10.10 Gb/s | 21.91% | 2,020,461 | 2,020,461 |
+| UDP | 65507 | 12.84 Gb/s | 12.41 Gb/s | 3.34% | 1,960,392 | 1,960,392 |
+
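+UDP 1 MiB is intentionally skipped because a single IPv4 UDP datagram can carry at most `65507` payload bytes (65535 minus the 20-byte IP header and the 8-byte UDP header); larger messages would require application-level segmentation, which is outside the benchmark's supported payload model.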
+
+## Restore host networking
+
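+After tests, move the interfaces back to the host namespace, delete the test namespaces, and clear the test addresses. The snippet below flushes addresses and resets the MTU but does not reassign host IPs, so re-add the usual addresses afterward if your network manager does not do so. Adjust interface names and the MTU for the target machine: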
+
+```bash
+for ns in "$CLIENT_NS" "$SERVER_NS"; do
+  ip netns exec "$ns" ip link set "$CLIENT_IF" netns 1 >/dev/null 2>&1 || true
+  ip netns exec "$ns" ip link set "$SERVER_IF" netns 1 >/dev/null 2>&1 || true
+done
+
+ip netns delete "$CLIENT_NS" >/dev/null 2>&1 || true
+ip netns delete "$SERVER_NS" >/dev/null 2>&1 || true
+
+for ifc in enp1s0f0np0 enp1s0f1np1 enP2p1s0f0np0 enP2p1s0f1np1; do
+  ip addr flush dev "$ifc" >/dev/null 2>&1 || true
+  ip link set dev "$ifc" mtu 9082 up >/dev/null 2>&1 || true
+done
+```
+
+## Loopback disable knobs
+
+If namespace isolation still increments only vport counters, check whether the platform exposes loopback control:
+
+```bash
+ethtool --show-priv-flags <interface>
+ethtool --set-priv-flags <interface> local_lb off
+
+mlxconfig -d <device> q | grep FORCE_LOOPBACK_DISABLE
+mlxconfig -d <device> set FORCE_LOOPBACK_DISABLE=1
+```
+
+Treat firmware settings as maintenance-window changes: query first, change a setting only when the proper Mellanox tooling is available, then reset or reboot as required and rerun the same `rx_packets_phy` proof.
+
+---
+**Previous:** [Benchmarking](benchmarks.md)<br>
+**Next:** [Raw Ethernet Benchmarking](raw_benchmarking.md)
diff --git a/docs/concepts.md b/docs/concepts.md
index ba1b46d..c9ca280 100644
--- a/docs/concepts.md
+++ b/docs/concepts.md
@@ -22,6 +22,11 @@ choice is configured per-application in YAML by two keys:
 - `protocol` — required when `stream_type: "socket"`; selects the
   socket-level protocol.
 
+The shipped Ethernet stream types use NICs as their hardware endpoint.
+The planned PCIe programmable-sensor path uses the same DAQIRI model for
+devices that sit directly on the PCIe bus, such as FPGAs, frame grabbers,
+or custom acquisition cards.
+
 ### Raw Ethernet
 
 *YAML:* `stream_type: "raw"`.
@@ -57,8 +62,11 @@ Requires an NVIDIA SmartNIC (ConnectX-6 Dx or later).
 
 *YAML:* `stream_type: "pcie"`.
 
-Placeholder for an upcoming direct-PCIe stream type. Not implemented
-yet.
+Coming-soon path for sensors that appear directly on the PCIe bus, such
+as FPGAs, frame grabbers, or custom acquisition cards. The goal is to
+move data into or out of CPU or NVIDIA GPU memory through the same
+DAQIRI C++/Python API while avoiding unnecessary copies. This stream
+type does not currently ship with a runnable benchmark or example YAML.
 
 ### Choosing a stream type
 
@@ -87,6 +95,7 @@ in the configuration walkthrough.
     - **Socket — RoCE** (`stream_type: "socket"`,
       `protocol: "roce"`) is supported and distributed; integration
       testing is under development.
+    - The **PCIe programmable-sensor** path is under development.
 
 ## GPUDirect
 
diff --git a/docs/getting-started.md b/docs/getting-started.md
index a752f18..cd61b5a 100644
--- a/docs/getting-started.md
+++ b/docs/getting-started.md
@@ -201,5 +201,5 @@ Once DAQIRI is built, follow the tutorials to configure your system and run your
 1. [**Concepts**](concepts.md) — terminology (stream types and protocols, packet, burst, segment, flow, queue, memory region), GPUDirect, and zero-copy ownership. Keep this open in a second tab.
 2. [**API Guide**](api-reference/index.md) — the six-step DAQIRI application lifecycle and configuration-first model
 3. [**System Configuration**](tutorials/system_configuration.md) — NIC drivers, link layers, GPUDirect, hugepages, CPU isolation, GPU clocks, and more
-4. [**Benchmarking Examples**](tutorials/benchmarking_examples.md) — run `daqiri_bench_raw_gpudirect` with a loopback test
+4. [**Benchmarking**](benchmarks/benchmarks.md) — choose a backend, then run socket/RDMA or raw Ethernet benchmarks
 5. [**Understanding the Configuration File**](tutorials/configuration-walkthrough.md) — annotated YAML walkthrough
diff --git a/docs/images/architecture.svg b/docs/images/architecture.svg
index b336eec..c0a9272 100644
--- a/docs/images/architecture.svg
+++ b/docs/images/architecture.svg
@@ -42,7 +42,7 @@
   <!-- ── NIC (GPUDirect RDMA) ──────────────────────────── -->
   <rect x="118" y="230" width="148" height="56" rx="6" fill="#0d0d0d" stroke="#76b900" stroke-width="1.5"/>
   <text x="192" y="247" text-anchor="middle" fill="#4a7a00" font-size="8" font-weight="700" font-family="Inter,sans-serif" letter-spacing="1">INGEST</text>
-  <text x="192" y="262" text-anchor="middle" fill="#f0f0f0" font-weight="700" font-size="12" font-family="Inter,sans-serif">NIC</text>
+  <text x="192" y="262" text-anchor="middle" fill="#f0f0f0" font-weight="700" font-size="12" font-family="Inter,sans-serif">NIC/PCIe</text>
   <text x="192" y="277" text-anchor="middle" fill="#76b900" font-size="10" font-family="Inter,sans-serif">GPUDirect RDMA</text>
 
   <!-- Arrow 3: NIC → GPU, green -->
diff --git a/docs/images/backend-decision-tree.svg b/docs/images/backend-decision-tree.svg
new file mode 100644
index 0000000..25c1db3
--- /dev/null
+++ b/docs/images/backend-decision-tree.svg
@@ -0,0 +1,80 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1600 1200" role="img" aria-labelledby="title desc">
+  <title id="title">DAQIRI networking backend decision tree</title>
+  <desc id="desc">Choose the coming-soon PCIe path for programmable PCIe sensors, socket TCP or UDP for non-NVIDIA NICs, socket RoCE for NVIDIA NICs talking to an RDMA endpoint, or raw Ethernet for NVIDIA NICs without an existing RoCE endpoint.</desc>
+  <defs>
+    <filter id="shadow" x="-20%" y="-20%" width="140%" height="140%">
+      <feDropShadow dx="0" dy="12" stdDeviation="12" flood-color="#1f2937" flood-opacity="0.16"/>
+    </filter>
+    <marker id="arrow" viewBox="0 0 20 20" refX="16" refY="10" markerWidth="11" markerHeight="11" orient="auto">
+      <path d="M0 0 L20 10 L0 20 Z" fill="#2f3b4f"/>
+    </marker>
+  </defs>
+
+  <rect width="1600" height="1200" fill="#f3f6fa"/>
+  <rect width="1600" height="138" fill="#0b1017"/>
+  <rect y="132" width="1600" height="7" fill="#76b900"/>
+  <rect x="64" y="42" width="82" height="48" rx="6" fill="#76b900"/>
+  <text x="164" y="69" fill="#ffffff" font-family="Inter, Arial, sans-serif" font-size="42" font-weight="800">DAQIRI Networking Backend Decision Tree</text>
+  <text x="164" y="102" fill="#c7d0dd" font-family="Inter, Arial, sans-serif" font-size="21">Choose the backend that matches the local sensor, NIC, and peer protocol.</text>
+
+  <!-- Connectors -->
+  <g fill="none" stroke="#2f3b4f" stroke-width="4" stroke-linecap="round" stroke-linejoin="round">
+    <path d="M1160 275 H1250" marker-end="url(#arrow)"/>
+    <path d="M780 395 V460" marker-end="url(#arrow)"/>
+    <path d="M520 575 H400" marker-end="url(#arrow)"/>
+    <path d="M780 670 V745" marker-end="url(#arrow)"/>
+    <path d="M1110 870 H1225" marker-end="url(#arrow)"/>
+    <path d="M780 980 V1060" marker-end="url(#arrow)"/>
+  </g>
+
+  <!-- Decisions -->
+  <g fill="#ffffff" stroke="#2f3b4f" stroke-width="4" stroke-linejoin="round">
+    <path d="M780 155 L1160 275 L780 395 L400 275 Z"/>
+    <path d="M780 480 L1040 575 L780 670 L520 575 Z"/>
+    <path d="M780 760 L1110 870 L780 980 L450 870 Z"/>
+  </g>
+
+  <g font-family="Inter, Arial, sans-serif" fill="#121827">
+    <text x="780" y="258" text-anchor="middle" font-size="30" font-weight="800">PCIe Programmable</text>
+    <text x="780" y="298" text-anchor="middle" font-size="30" font-weight="800">Sensor (FPGA, etc.)?</text>
+
+    <text x="780" y="586" text-anchor="middle" font-size="34" font-weight="800">NVIDIA NIC?</text>
+
+    <text x="780" y="858" text-anchor="middle" font-size="30" font-weight="800">Existing endpoint</text>
+    <text x="780" y="896" text-anchor="middle" font-size="30" font-weight="800">implementing RoCE?</text>
+  </g>
+
+  <!-- Branch labels -->
+  <g fill="#344054" font-family="Inter, Arial, sans-serif" font-size="22" font-weight="800">
+    <text x="1168" y="246">Yes</text>
+    <text x="808" y="430">No</text>
+    <text x="454" y="540">No</text>
+    <text x="808" y="715">Yes</text>
+    <text x="1145" y="846">Yes</text>
+    <text x="808" y="1035">No</text>
+  </g>
+
+  <!-- Leaves -->
+  <g filter="url(#shadow)">
+    <rect x="1250" y="220" width="310" height="110" rx="14" fill="#e6f7ff" stroke="#0b6f8f" stroke-width="3"/>
+    <text x="1405" y="278" text-anchor="middle" fill="#121827" font-family="Inter, Arial, sans-serif" font-size="25" font-weight="800">stream_type = pcie</text>
+    <text x="1405" y="307" text-anchor="middle" fill="#0b6f8f" font-family="Inter, Arial, sans-serif" font-size="18" font-weight="800">coming soon</text>
+  </g>
+
+  <g filter="url(#shadow)">
+    <rect x="20" y="515" width="380" height="130" rx="14" fill="#eef2ff" stroke="#4f46e5" stroke-width="3"/>
+    <text x="210" y="570" text-anchor="middle" fill="#121827" font-family="Inter, Arial, sans-serif" font-size="26" font-weight="800">stream_type = socket,</text>
+    <text x="210" y="608" text-anchor="middle" fill="#121827" font-family="Inter, Arial, sans-serif" font-size="26" font-weight="800">protocol = tcp/udp</text>
+  </g>
+
+  <g filter="url(#shadow)">
+    <rect x="1225" y="805" width="355" height="140" rx="14" fill="#ecfdf3" stroke="#16a34a" stroke-width="3"/>
+    <text x="1402" y="873" text-anchor="middle" fill="#121827" font-family="Inter, Arial, sans-serif" font-size="25" font-weight="800">stream_type = socket,</text>
+    <text x="1402" y="910" text-anchor="middle" fill="#121827" font-family="Inter, Arial, sans-serif" font-size="25" font-weight="800">protocol = roce</text>
+  </g>
+
+  <g filter="url(#shadow)">
+    <rect x="600" y="1060" width="360" height="110" rx="14" fill="#fff7ed" stroke="#c2410c" stroke-width="3"/>
+    <text x="780" y="1128" text-anchor="middle" fill="#121827" font-family="Inter, Arial, sans-serif" font-size="28" font-weight="800">stream_type = raw</text>
+  </g>
+</svg>
diff --git a/docs/images/daqiri-landing-graphic.svg b/docs/images/daqiri-landing-graphic.svg
new file mode 100644
index 0000000..26967e7
--- /dev/null
+++ b/docs/images/daqiri-landing-graphic.svg
@@ -0,0 +1,256 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 820 560" role="img" aria-labelledby="title desc">
+  <title id="title">DAQIRI sensor data paths to CPU, NVIDIA GPU memory, and storage</title>
+  <desc id="desc">A diagram showing sensor data entering or leaving CPU memory or NVIDIA GPU memory through DAQIRI, either from a PCIe FPGA path or from a network-capable sensor path through a NIC, with GPU-resident data then written out through GPUDirect Storage.</desc>
+  <defs>
+    <linearGradient id="panel" x1="0" y1="0" x2="1" y2="1">
+      <stop offset="0" stop-color="#171a18"/>
+      <stop offset="1" stop-color="#080a08"/>
+    </linearGradient>
+    <linearGradient id="greenPanel" x1="0" y1="0" x2="1" y2="1">
+      <stop offset="0" stop-color="#203a09"/>
+      <stop offset="1" stop-color="#0d1704"/>
+    </linearGradient>
+    <linearGradient id="gpuPanel" x1="0" y1="0" x2="1" y2="1">
+      <stop offset="0" stop-color="#193609"/>
+      <stop offset="0.58" stop-color="#102404"/>
+      <stop offset="1" stop-color="#070b05"/>
+    </linearGradient>
+    <linearGradient id="pciePanel" x1="0" y1="0" x2="1" y2="1">
+      <stop offset="0" stop-color="#132e36"/>
+      <stop offset="1" stop-color="#071316"/>
+    </linearGradient>
+    <linearGradient id="networkPanel" x1="0" y1="0" x2="1" y2="1">
+      <stop offset="0" stop-color="#18293d"/>
+      <stop offset="1" stop-color="#070d16"/>
+    </linearGradient>
+    <linearGradient id="storagePanel" x1="0" y1="0" x2="1" y2="1">
+      <stop offset="0" stop-color="#3c2a10"/>
+      <stop offset="1" stop-color="#120d05"/>
+    </linearGradient>
+    <radialGradient id="glow" cx="50%" cy="45%" r="62%">
+      <stop offset="0" stop-color="#76b900" stop-opacity="0.18"/>
+      <stop offset="0.55" stop-color="#76b900" stop-opacity="0.05"/>
+      <stop offset="1" stop-color="#76b900" stop-opacity="0"/>
+    </radialGradient>
+    <filter id="softGlow" x="-30%" y="-30%" width="160%" height="160%">
+      <feGaussianBlur stdDeviation="8" result="blur"/>
+      <feMerge>
+        <feMergeNode in="blur"/>
+        <feMergeNode in="SourceGraphic"/>
+      </feMerge>
+    </filter>
+    <marker id="arrowGreen" markerWidth="5" markerHeight="5" refX="4.3" refY="2.5" orient="auto-start-reverse">
+      <path d="M0.8 0.7 L4.6 2.5 L0.8 4.3 Z" fill="#76b900"/>
+    </marker>
+    <marker id="arrowCyan" markerWidth="5" markerHeight="5" refX="4.3" refY="2.5" orient="auto-start-reverse">
+      <path d="M0.8 0.7 L4.6 2.5 L0.8 4.3 Z" fill="#5dd8f2"/>
+    </marker>
+    <marker id="arrowBlue" markerWidth="5" markerHeight="5" refX="4.3" refY="2.5" orient="auto-start-reverse">
+      <path d="M0.8 0.7 L4.6 2.5 L0.8 4.3 Z" fill="#82aaff"/>
+    </marker>
+    <marker id="arrowMuted" markerWidth="5" markerHeight="5" refX="4.3" refY="2.5" orient="auto-start-reverse">
+      <path d="M0.8 0.7 L4.6 2.5 L0.8 4.3 Z" fill="#888"/>
+    </marker>
+    <marker id="arrowAmber" markerWidth="5" markerHeight="5" refX="4.3" refY="2.5" orient="auto">
+      <path d="M0.8 0.7 L4.6 2.5 L0.8 4.3 Z" fill="#ffcb6b"/>
+    </marker>
+    <style>
+      .label { font-family: Inter, "Segoe UI", Arial, sans-serif; letter-spacing: 0; }
+      .mono { font-family: "JetBrains Mono", "Fira Code", Consolas, monospace; letter-spacing: 0; }
+      .flow-green { animation: flow 2.6s linear infinite; stroke-dasharray: 6 10; }
+      .flow-cyan { animation: flow 3.1s linear infinite; stroke-dasharray: 6 10; }
+      .flow-blue { animation: flow 3.4s linear infinite; stroke-dasharray: 6 10; }
+      .flow-amber { animation: flow 2.9s linear infinite; stroke-dasharray: 6 10; }
+      .travel-dot { opacity: 0.95; }
+      .pulse { animation: pulse 2.8s ease-in-out infinite; transform-origin: center; }
+      .pulse-2 { animation-delay: 0.85s; }
+      .pulse-3 { animation-delay: 1.55s; }
+      .pulse-4 { animation-delay: 2.15s; }
+      @keyframes flow { to { stroke-dashoffset: -52; } }
+      @keyframes pulse {
+        0%, 100% { opacity: 0.35; transform: scale(1); }
+        50% { opacity: 1; transform: scale(1.18); }
+      }
+      @media (prefers-reduced-motion: reduce) {
+        .flow-green, .flow-cyan, .flow-blue, .flow-amber, .pulse { animation: none; }
+      }
+    </style>
+  </defs>
+
+  <rect width="820" height="560" rx="24" fill="#090a09"/>
+  <rect x="1" y="1" width="818" height="558" rx="23" fill="none" stroke="#1d2518"/>
+  <rect width="820" height="560" fill="url(#glow)"/>
+
+  <g opacity="0.4">
+    <path d="M40 88 H780 M40 144 H780 M40 200 H780 M40 256 H780 M40 312 H780 M40 368 H780 M40 424 H780 M40 480 H780" stroke="#76b900" stroke-opacity="0.08"/>
+    <path d="M84 52 V520 M148 52 V520 M212 52 V520 M276 52 V520 M340 52 V520 M404 52 V520 M468 52 V520 M532 52 V520 M596 52 V520 M660 52 V520 M724 52 V520 M788 52 V520" stroke="#76b900" stroke-opacity="0.08"/>
+  </g>
+
+  <g class="label">
+    <text x="410" y="48" text-anchor="middle" fill="#f0f0f0" font-size="24" font-weight="800">Sensor streams meet CPU or GPU memory</text>
+    <text x="410" y="74" text-anchor="middle" fill="#8f9a88" font-size="13">DAQIRI carries PCIe and network-capable streams into memory, then GPU data out to storage</text>
+  </g>
+
+  <!-- PCIe sensor path -->
+  <g transform="translate(30 302)">
+    <rect width="222" height="142" rx="12" fill="url(#pciePanel)" stroke="#2e7180" stroke-width="1.5"/>
+    <g transform="translate(18 19)">
+      <rect x="0" y="0" width="82" height="82" rx="8" fill="#0b171a" stroke="#5dd8f2" stroke-opacity="0.65"/>
+      <rect x="18" y="18" width="46" height="46" rx="6" fill="#12303a" stroke="#5dd8f2"/>
+      <text x="41" y="47" text-anchor="middle" class="mono" fill="#c7f7ff" font-size="10" font-weight="700">FPGA</text>
+      <g stroke="#5dd8f2" stroke-width="2" stroke-linecap="round" opacity="0.7">
+        <path d="M8 16 H0 M8 30 H0 M8 44 H0 M8 58 H0 M8 72 H0"/>
+        <path d="M82 16 H74 M82 30 H74 M82 44 H74 M82 58 H74 M82 72 H74"/>
+        <path d="M16 8 V0 M30 8 V0 M44 8 V0 M58 8 V0 M72 8 V0"/>
+        <path d="M16 82 V74 M30 82 V74 M44 82 V74 M58 82 V74 M72 82 V74"/>
+      </g>
+    </g>
+    <text x="118" y="35" fill="#f0f0f0" font-size="18" font-weight="800">PCIe device</text>
+    <text x="118" y="57" fill="#9fb0b4" font-size="11">FPGA / frame grabber</text>
+    <text x="118" y="73" fill="#9fb0b4" font-size="11">sensor front-end</text>
+    <rect x="118" y="92" width="82" height="26" rx="5" fill="#061216" stroke="#2e7180"/>
+    <text x="159" y="109" text-anchor="middle" class="mono" fill="#5dd8f2" font-size="11" font-weight="700">PCIe BAR</text>
+  </g>
+
+  <!-- Network-capable sensor path -->
+  <g transform="translate(30 116)">
+    <rect width="222" height="142" rx="12" fill="url(#networkPanel)" stroke="#394d86" stroke-width="1.5"/>
+    <g transform="translate(16 26)" fill="none" stroke="#82aaff" stroke-width="2">
+      <circle cx="18" cy="18" r="12" fill="#0b1320"/>
+      <circle cx="58" cy="18" r="12" fill="#0b1320"/>
+      <circle cx="38" cy="58" r="12" fill="#0b1320"/>
+      <path d="M28 20 H48 M24 29 L34 47 M53 29 L43 47"/>
+      <circle cx="18" cy="18" r="4" fill="#82aaff"/>
+      <circle cx="58" cy="18" r="4" fill="#82aaff"/>
+      <circle cx="38" cy="58" r="4" fill="#82aaff"/>
+    </g>
+    <text x="96" y="30" fill="#f0f0f0" font-size="14" font-weight="800">Network-capable</text>
+    <text x="96" y="48" fill="#f0f0f0" font-size="14" font-weight="800">sensor</text>
+    <rect x="96" y="64" width="104" height="30" rx="5" fill="#07101c" stroke="#394d86"/>
+    <text x="148" y="77" text-anchor="middle" class="mono" fill="#d1ddff" font-size="8.5" font-weight="800">Raw Ethernet</text>
+    <text x="148" y="89" text-anchor="middle" class="mono" fill="#82aaff" font-size="8.5" font-weight="700">UDP/TCP</text>
+    <rect x="96" y="102" width="62" height="24" rx="5" fill="#07101c" stroke="#394d86"/>
+    <text x="127" y="118" text-anchor="middle" class="mono" fill="#82aaff" font-size="10" font-weight="800">RoCE</text>
+  </g>
+
+  <!-- Host NIC between the network-capable sensor and DAQIRI -->
+  <g transform="translate(283 188)">
+    <rect width="32" height="100" rx="9" fill="#091119" stroke="#82aaff" stroke-width="1.5"/>
+    <text x="16" y="20" text-anchor="middle" class="mono" fill="#d1ddff" font-size="11" font-weight="900">NIC</text>
+    <g fill="#07101c" stroke="#82aaff" stroke-width="1">
+      <rect x="7" y="34" width="8" height="8" rx="1.5"/>
+      <rect x="18" y="34" width="8" height="8" rx="1.5"/>
+      <rect x="7" y="49" width="8" height="8" rx="1.5"/>
+      <rect x="18" y="49" width="8" height="8" rx="1.5"/>
+    </g>
+    <circle cx="9" cy="76" r="2.2" fill="#82aaff"/>
+    <circle cx="16" cy="76" r="2.2" fill="#76b900"/>
+    <circle cx="23" cy="76" r="2.2" fill="#82aaff"/>
+    <text x="16" y="92" text-anchor="middle" class="mono" fill="#82aaff" font-size="7" font-weight="700">RX/TX</text>
+  </g>
+
+  <!-- DAQIRI core -->
+  <g transform="translate(346 168)">
+    <rect width="142" height="224" rx="14" fill="url(#greenPanel)" stroke="#76b900" stroke-width="2" filter="url(#softGlow)"/>
+    <rect x="17" y="18" width="108" height="32" rx="6" fill="#76b900"/>
+    <text x="71" y="40" text-anchor="middle" class="mono" fill="#050805" font-size="18" font-weight="900">DAQIRI</text>
+    <text x="71" y="72" text-anchor="middle" fill="#a0d000" font-size="10" font-weight="700">C++/Python Library</text>
+
+    <g transform="translate(22 88)">
+      <rect width="98" height="26" rx="5" fill="#0a0f08" stroke="#385b16"/>
+      <text x="49" y="17" text-anchor="middle" class="mono" fill="#c8e840" font-size="10">RX/TX bursts</text>
+      <rect y="32" width="98" height="26" rx="5" fill="#0a0f08" stroke="#385b16"/>
+      <text x="49" y="49" text-anchor="middle" class="mono" fill="#c8e840" font-size="10">flow steering</text>
+      <rect y="64" width="98" height="26" rx="5" fill="#0a0f08" stroke="#385b16"/>
+      <text x="49" y="81" text-anchor="middle" class="mono" fill="#c8e840" font-size="10">zero-copy API</text>
+      <rect y="96" width="98" height="26" rx="5" fill="#0a0f08" stroke="#385b16"/>
+      <text x="49" y="113" text-anchor="middle" class="mono" fill="#c8e840" font-size="10">GDS writes</text>
+    </g>
+  </g>
+
+  <!-- Zero-copy destination/source targets -->
+  <g transform="translate(532 142)">
+    <text x="95" y="22" text-anchor="middle" fill="#f0f0f0" font-size="13" font-weight="900">Zero-copy to GPU or CPU</text>
+
+    <g transform="translate(20 42)">
+      <rect width="150" height="82" rx="12" fill="url(#gpuPanel)" stroke="#76b900" stroke-width="2"/>
+      <rect x="16" y="16" width="58" height="50" rx="8" fill="#162d08" stroke="#a0d000"/>
+      <text x="45" y="40" text-anchor="middle" class="mono" fill="#f0f0f0" font-size="16" font-weight="900">GPU</text>
+      <text x="45" y="56" text-anchor="middle" fill="#a0d000" font-size="9" font-weight="700">NVIDIA</text>
+      <rect x="86" y="17" width="48" height="48" rx="8" fill="#050805" stroke="#76b900" stroke-opacity="0.75"/>
+      <text x="110" y="36" text-anchor="middle" fill="#ffffff" font-size="9" font-weight="900">GPU</text>
+      <text x="110" y="50" text-anchor="middle" fill="#ffffff" font-size="9" font-weight="900">memory</text>
+      <rect x="97" y="57" width="26" height="4" rx="2" fill="#76b900" opacity="0.9"/>
+    </g>
+
+    <g transform="translate(194 42)">
+      <rect width="86" height="82" rx="12" fill="url(#storagePanel)" stroke="#ffcb6b" stroke-width="1.8"/>
+      <g transform="translate(21 9)" fill="none" stroke="#ffcb6b" stroke-width="1.25">
+        <ellipse cx="22" cy="7" rx="22" ry="6" fill="#171006"/>
+        <path d="M0 7 V25 C0 29 10 32 22 32 C34 32 44 29 44 25 V7"/>
+        <path d="M0 16 C0 20 10 23 22 23 C34 23 44 20 44 16"/>
+      </g>
+      <rect x="11" y="51" width="64" height="26" rx="5" fill="#120d05" stroke="#5b431c"/>
+      <text x="43" y="63" text-anchor="middle" class="mono" fill="#ffefc7" font-size="8" font-weight="900">GPUDirect</text>
+      <text x="43" y="74" text-anchor="middle" fill="#ffcb6b" font-size="9.5" font-weight="900">Storage</text>
+    </g>
+
+    <g transform="translate(20 154)">
+      <rect width="150" height="82" rx="12" fill="#101208" stroke="#b8e85a" stroke-width="2"/>
+      <rect x="16" y="16" width="58" height="50" rx="8" fill="#181c10" stroke="#b8e85a"/>
+      <text x="45" y="46" text-anchor="middle" class="mono" fill="#f0f0f0" font-size="16" font-weight="900">CPU</text>
+      <rect x="86" y="17" width="48" height="48" rx="8" fill="#080a05" stroke="#b8e85a" stroke-opacity="0.75"/>
+      <text x="110" y="36" text-anchor="middle" fill="#ffffff" font-size="9" font-weight="900">CPU</text>
+      <text x="110" y="50" text-anchor="middle" fill="#ffffff" font-size="9" font-weight="900">memory</text>
+      <rect x="97" y="57" width="26" height="4" rx="2" fill="#b8e85a" opacity="0.9"/>
+    </g>
+  </g>
+
+  <!-- Path labels -->
+  <g class="label">
+    <rect x="270" y="330" width="58" height="24" rx="6" fill="#061216" stroke="#2e7180"/>
+    <text x="299" y="346" text-anchor="middle" class="mono" fill="#5dd8f2" font-size="10" font-weight="800">PCIe</text>
+  </g>
+
+  <!-- Data paths -->
+  <path d="M252 378 C286 378 314 362 346 362" fill="none" stroke="#5dd8f2" stroke-width="2.5" stroke-linecap="round" marker-start="url(#arrowCyan)" marker-end="url(#arrowCyan)"/>
+  <path class="flow-cyan" d="M252 378 C286 378 314 362 346 362" fill="none" stroke="#c7f7ff" stroke-width="1" stroke-linecap="round" opacity="0.75"/>
+
+  <path d="M252 238 H283" fill="none" stroke="#82aaff" stroke-width="2" stroke-linecap="round" marker-start="url(#arrowBlue)" marker-end="url(#arrowBlue)"/>
+  <path class="flow-blue" d="M252 238 H283" fill="none" stroke="#d1ddff" stroke-width="0.8" stroke-linecap="round" opacity="0.75"/>
+  <path d="M315 238 H346" fill="none" stroke="#82aaff" stroke-width="2" stroke-linecap="round" marker-start="url(#arrowBlue)" marker-end="url(#arrowBlue)"/>
+  <path class="flow-blue" d="M315 238 H346" fill="none" stroke="#d1ddff" stroke-width="0.8" stroke-linecap="round" opacity="0.75"/>
+
+  <path d="M488 236 C505 218 520 218 552 225" fill="none" stroke="#76b900" stroke-width="2.6" stroke-linecap="round" marker-start="url(#arrowGreen)" marker-end="url(#arrowGreen)"/>
+  <path class="flow-green" d="M488 236 C505 218 520 218 552 225" fill="none" stroke="#d8ff6a" stroke-width="1.1" stroke-linecap="round" opacity="0.75"/>
+  <path d="M686 225 H726" fill="none" stroke="#ffcb6b" stroke-width="2.3" stroke-linecap="round" marker-end="url(#arrowAmber)"/>
+  <path class="flow-amber" d="M686 225 H726" fill="none" stroke="#ffefc7" stroke-width="0.9" stroke-linecap="round" opacity="0.8"/>
+  <path d="M488 318 C506 332 522 336 552 338" fill="none" stroke="#b8e85a" stroke-width="2.4" stroke-linecap="round" marker-start="url(#arrowGreen)" marker-end="url(#arrowGreen)"/>
+  <path class="flow-green" d="M488 318 C506 332 522 336 552 338" fill="none" stroke="#ecffb0" stroke-width="1" stroke-linecap="round" opacity="0.72"/>
+
+  <text x="509" y="206" text-anchor="middle" class="mono" fill="#a0d000" font-size="8.5" font-weight="700">GPU</text>
+  <text x="509" y="350" text-anchor="middle" class="mono" fill="#d3f08a" font-size="8.5" font-weight="700">CPU</text>
+
+  <!-- Activity dots -->
+  <g filter="url(#softGlow)">
+    <circle class="travel-dot" r="3.2" fill="#82aaff">
+      <animateMotion dur="3.2s" repeatCount="indefinite" calcMode="linear" keyPoints="0;1;0" keyTimes="0;0.5;1" path="M252 238 H346"/>
+    </circle>
+    <circle class="travel-dot" r="3.2" fill="#5dd8f2">
+      <animateMotion dur="3.4s" repeatCount="indefinite" calcMode="linear" keyPoints="0;1;0" keyTimes="0;0.5;1" path="M252 378 C286 378 314 362 346 362"/>
+    </circle>
+    <circle class="travel-dot" r="3.4" fill="#76b900">
+      <animateMotion dur="3s" repeatCount="indefinite" calcMode="linear" keyPoints="0;1;0" keyTimes="0;0.5;1" path="M488 236 C505 218 520 218 552 225"/>
+    </circle>
+    <circle class="travel-dot" r="3.2" fill="#b8e85a">
+      <animateMotion dur="3.2s" repeatCount="indefinite" calcMode="linear" keyPoints="0;1;0" keyTimes="0;0.5;1" path="M488 318 C506 332 522 336 552 338"/>
+    </circle>
+    <circle class="pulse pulse-4" cx="706" cy="225" r="3" fill="#ffcb6b"/>
+  </g>
+
+  <g class="label">
+    <rect x="158" y="472" width="504" height="46" rx="10" fill="#0b0d0a" stroke="#27331d"/>
+    <text x="410" y="491" text-anchor="middle" fill="#f0f0f0" font-size="12" font-weight="800">one application, sensor I/O plus GPU storage writes</text>
+    <text x="410" y="508" text-anchor="middle" fill="#8f9a88" font-size="9.5">PCIe or network sensors into CPU/GPU memory, then GPU data out to storage</text>
+  </g>
+</svg>
diff --git a/docs/index.html b/docs/index.html
index f5b49e8..20edd91 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -31,6 +31,7 @@
     pre .str { color:#c3e88d; }
     pre .nm { color:var(--nv-green-l); }
     .container { max-width:1200px; margin:0 auto; padding:0 2rem; }
+    #hero .container { max-width:1360px; }
     section { padding:6rem 0; }
     .section-label { font-size:.72rem; font-weight:700; letter-spacing:.15em; text-transform:uppercase; color:var(--nv-green); margin-bottom:.75rem; }
     .section-title { color:var(--text-pri); margin-bottom:1rem; }
@@ -76,26 +77,45 @@
     .btn-outline:hover { color:var(--text-pri); border-color:#444; background:rgba(255,255,255,.04); }
 
     /* HERO */
-    #hero { min-height:100vh; display:flex; align-items:center; padding-top:var(--nav-h); position:relative; overflow:hidden; }
-    .hero-inner { display:grid; grid-template-columns:1fr auto; gap:4rem; align-items:center; }
-    .hero-logo-wrap { display:flex; align-items:center; justify-content:center; flex-shrink:0; }
-    .hero-logo-wrap img { width:320px; max-width:38vw; filter:drop-shadow(0 0 40px rgba(118,185,0,0.18)); }
+    #hero { min-height:calc(100vh - 24px); display:flex; align-items:center; padding:calc(var(--nav-h) + 3rem) 0 4rem; position:relative; overflow:hidden; }
+    .hero-inner { display:grid; grid-template-columns:minmax(0,1fr) minmax(460px,700px); column-gap:2rem; row-gap:2rem; align-items:center; }
+    .hero-visual { position:relative; z-index:1; grid-column:2; grid-row:1 / span 3; display:flex; align-items:center; justify-content:center; }
+    .hero-visual::before { content:''; position:absolute; inset:-5% -8% -2%; background:radial-gradient(ellipse,rgba(118,185,0,.28) 0%,transparent 68%); filter:blur(20px); z-index:-1; }
+    .hero-visual-link { display:block; width:min(100%,700px); padding:.45rem; border-radius:20px; appearance:none; background:linear-gradient(135deg,rgba(118,185,0,.34),rgba(93,216,242,.18)); border:1px solid rgba(160,208,0,.38); box-shadow:0 24px 70px rgba(0,0,0,.58),0 0 0 1px rgba(255,255,255,.05) inset; transition:transform var(--ease),border-color var(--ease),box-shadow var(--ease); cursor:zoom-in; }
+    .hero-visual-link:hover { transform:translateY(-2px); border-color:rgba(160,208,0,.68); box-shadow:0 30px 90px rgba(0,0,0,.65),0 0 34px rgba(118,185,0,.16); }
+    .hero-visual-link:focus-visible { outline:2px solid var(--nv-green); outline-offset:4px; }
+    .hero-visual img { width:100%; display:block; border-radius:16px; filter:contrast(1.08) brightness(1.08) saturate(1.12) drop-shadow(0 0 44px rgba(118,185,0,0.18)); }
     .hero-grid { position:absolute; inset:0; z-index:0; background-image:linear-gradient(rgba(118,185,0,.04) 1px,transparent 1px),linear-gradient(90deg,rgba(118,185,0,.04) 1px,transparent 1px); background-size:60px 60px; mask-image:radial-gradient(ellipse 80% 60% at 50% 40%,black 30%,transparent 100%); }
     .hero-glow { position:absolute; top:-20%; left:50%; transform:translateX(-50%); width:800px; height:500px; background:radial-gradient(ellipse,rgba(118,185,0,.11) 0%,transparent 70%); pointer-events:none; z-index:0; }
-    .hero-content { position:relative; z-index:1; max-width:780px; }
+    .hero-content { position:relative; z-index:1; grid-column:1; max-width:760px; }
     .hero-eyebrow { display:inline-flex; align-items:center; gap:.5rem; font-size:.72rem; font-weight:700; letter-spacing:.15em; text-transform:uppercase; color:var(--nv-green); background:rgba(118,185,0,.08); border:1px solid rgba(118,185,0,.2); border-radius:99px; padding:.35rem 1rem; margin-bottom:2rem; }
     .hero-eyebrow::before { content:''; width:6px; height:6px; border-radius:50%; background:var(--nv-green); animation:pulse 2s ease-in-out infinite; }
     @keyframes pulse { 0%,100%{opacity:1;transform:scale(1)}50%{opacity:.4;transform:scale(.8)} }
-    .hero-title { margin-bottom:1.5rem; }
+    .hero-title { font-size:clamp(1.85rem,2.8vw,2.35rem); margin-bottom:1.5rem; }
     .hero-title .hi { color:var(--nv-green); }
-    .hero-desc { font-size:1.15rem; color:var(--text-mut); max-width:620px; margin-bottom:2rem; line-height:1.75; }
-    .hero-actions { display:flex; align-items:center; gap:1rem; flex-wrap:wrap; margin-bottom:3.5rem; }
-    .hero-stats { display:flex; gap:3rem; flex-wrap:wrap; border-top:1px solid var(--border); padding-top:2.5rem; }
-    .stat-value { font-size:1.75rem; font-weight:800; color:var(--text-pri); letter-spacing:-.03em; }
+    .hero-desc { font-size:1.15rem; color:var(--text-mut); max-width:620px; margin-bottom:0; line-height:1.75; }
+    .hero-actions { position:relative; z-index:1; grid-column:1; display:flex; align-items:center; gap:1rem; flex-wrap:wrap; }
+    .hero-stats { grid-column:1 / -1; display:flex; gap:3rem; flex-wrap:wrap; border-top:1px solid var(--border); padding-top:2.5rem; }
+    .stat-value { font-size:1.45rem; font-weight:800; color:var(--text-pri); letter-spacing:-.03em; }
     .stat-label { font-size:.78rem; color:var(--text-dim); font-weight:500; text-transform:uppercase; letter-spacing:.1em; margin-top:.2rem; }
 
+    /* GRAPHIC OVERLAY */
+    body.graphic-overlay-open { overflow:hidden; }
+    .graphic-overlay { position:fixed; inset:calc(var(--nav-h) + 1rem) 1.5rem 1.5rem; z-index:2000; display:grid; place-items:center; opacity:0; visibility:hidden; pointer-events:none; transition:opacity var(--ease),visibility var(--ease); }
+    .graphic-overlay.is-open { opacity:1; visibility:visible; pointer-events:auto; }
+    .graphic-overlay-backdrop { position:absolute; inset:0; border-radius:22px; background:rgba(4,7,4,.74); border:1px solid rgba(118,185,0,.22); box-shadow:0 24px 120px rgba(0,0,0,.72); backdrop-filter:blur(6px); }
+    .graphic-overlay-panel { position:relative; width:min(1120px,calc(100vw - 6rem)); max-height:calc(100vh - var(--nav-h) - 4.5rem); display:flex; flex-direction:column; border-radius:18px; background:#070907; border:1px solid rgba(160,208,0,.5); box-shadow:0 0 0 1px rgba(255,255,255,.05) inset,0 0 48px rgba(118,185,0,.16); overflow:hidden; transform:translateY(8px) scale(.985); transition:transform var(--ease); }
+    .graphic-overlay.is-open .graphic-overlay-panel { transform:translateY(0) scale(1); }
+    .graphic-overlay-bar { display:flex; align-items:center; justify-content:space-between; gap:1rem; padding:.85rem 1rem; border-bottom:1px solid rgba(118,185,0,.2); background:rgba(17,22,13,.96); }
+    .graphic-overlay-title { margin:0; color:var(--text-pri); font-size:.95rem; font-weight:800; letter-spacing:0; }
+    .graphic-overlay-close { width:36px; height:36px; display:inline-flex; align-items:center; justify-content:center; flex-shrink:0; border-radius:8px; border:1px solid rgba(255,255,255,.14); background:rgba(255,255,255,.05); color:var(--text-pri); cursor:pointer; transition:background var(--ease),border-color var(--ease),color var(--ease); }
+    .graphic-overlay-close:hover { background:rgba(118,185,0,.14); border-color:rgba(160,208,0,.5); color:var(--nv-green-l); }
+    .graphic-overlay-close:focus-visible { outline:2px solid var(--nv-green); outline-offset:3px; }
+    .graphic-overlay-close svg { width:18px; height:18px; display:block; }
+    .graphic-overlay-img { width:100%; height:auto; max-height:calc(100vh - var(--nav-h) - 8.5rem); display:block; object-fit:contain; background:#050705; }
+
     /* WARNING BANNER */
-    .warn-banner { background:rgba(255,193,7,.07); border:1px solid rgba(255,193,7,.25); border-radius:var(--radius); padding:.9rem 1.25rem; display:flex; gap:.75rem; align-items:flex-start; margin-bottom:2rem; }
+    .warn-banner { position:relative; z-index:1; grid-column:1; background:rgba(255,193,7,.07); border:1px solid rgba(255,193,7,.25); border-radius:var(--radius); padding:.9rem 1.25rem; display:flex; gap:.75rem; align-items:flex-start; }
     .warn-banner p { color:rgba(255,203,107,.85); font-size:.875rem; }
 
     /* FEATURES */
@@ -253,9 +273,9 @@
         font-size:.85rem;
       }
     }
-    @media (max-width:1000px) { .hero-inner { grid-template-columns:1fr; } .hero-logo-wrap { display:none; } }
+    @media (max-width:1000px) { #hero .container { max-width:1200px; } .hero-inner { grid-template-columns:1fr; } .hero-content { grid-column:auto; order:1; } .hero-visual { grid-column:auto; grid-row:auto; order:2; margin:.5rem 0; } .warn-banner { grid-column:auto; order:3; } .hero-actions { grid-column:auto; order:4; } .hero-stats { order:5; } .hero-visual-link { width:min(100%,720px); } }
     @media (max-width:900px) { .gs-layout { grid-template-columns:1fr; } .gs-code-panel { position:static; } .footer-inner { grid-template-columns:1fr 1fr; } }
-    @media (max-width:640px) { section { padding:4rem 0; } .footer-inner { grid-template-columns:1fr; } .tut-meta { display:none; } .nav-actions .btn-outline { display:none; } }
+    @media (max-width:640px) { section { padding:4rem 0; } .hero-stats { gap:1.5rem 2rem; } .stat-value { font-size:1.2rem; } .footer-inner { grid-template-columns:1fr; } .tut-meta { display:none; } .nav-actions .btn-outline { display:none; } .graphic-overlay { inset:calc(var(--nav-h) + .5rem) .5rem .5rem; } .graphic-overlay-panel { width:100%; max-height:calc(100vh - var(--nav-h) - 1rem); } .graphic-overlay-img { max-height:calc(100vh - var(--nav-h) - 5rem); } }
   </style>
 </head>
 <body>
@@ -269,13 +289,21 @@
       </a>
       <div class="nav-links" id="nav-links">
         <a href="#features">Features</a>
+        <a href="#getting-started">Quick Start</a>
         <a href="concepts/" class="nav-ext">Concepts</a>
-        <a href="tutorials/benchmarking_examples/" class="nav-ext">Benchmarks</a>
-        <a href="#examples">Examples</a>
+        <div class="nav-item nav-has-dropdown">
+          <a href="benchmarks/benchmarks/" class="nav-ext">Benchmarking</a>
+          <ul class="nav-dropdown">
+            <li><a href="benchmarks/benchmarks/">Overview</a></li>
+            <li><a href="benchmarks/socket_benchmarking/">Socket and RDMA Benchmarking</a></li>
+            <li><a href="benchmarks/raw_benchmarking/">Raw Ethernet Benchmarking</a></li>
+          </ul>
+        </div>
         <div class="nav-item nav-has-dropdown">
           <a href="#tutorials">Tutorials</a>
           <ul class="nav-dropdown">
             <li><a href="tutorials/system_configuration/">System Configuration</a></li>
+            <li><a href="tutorials/bare-metal-cmake-build/">Bare-Metal CMake Build</a></li>
             <li><a href="tutorials/configuration-walkthrough/">Configuration YAML Walkthrough</a></li>
           </ul>
         </div>
@@ -285,6 +313,7 @@
             <li><a href="api-reference/">API Guide</a></li>
             <li><a href="api-reference/configuration/">Configuration YAML Reference</a></li>
             <li><a href="api-reference/cpp/">C++ API Usage</a></li>
+            <li><a href="api-reference/python/">Python API Usage</a></li>
           </ul>
         </div>
         <a href="#publications">News</a>
@@ -309,15 +338,21 @@
     <div class="hero-glow"></div>
     <div class="container">
       <div class="hero-inner">
-      <div class="hero-content">
-        <div class="hero-eyebrow">NVIDIA Open Source · Data Acquisition</div>
-        <h1 class="hero-title">DAQIRI — Command the<br>Data Deluge at the <span class="hi">Source</span></h1>
-        <p class="hero-desc">
-          DAQIRI (Data Acquisition for Integrated Real-time Instruments) connects high-bandwidth streaming sensors
-          directly to the NVIDIA compute ecosystem. By abstracting zero-copy data movement from sensor to GPU,
-          DAQIRI puts scalable, real-time AI, signal processing, and scientific computing within reach of the next
-          generation of instruments.
-        </p>
+        <div class="hero-content">
+          <div class="hero-eyebrow">NVIDIA Open Source · Data Acquisition</div>
+          <h1 class="hero-title">DAQIRI for Sensor Data<br>in <span class="hi">CPU or NVIDIA GPU Memory</span></h1>
+          <p class="hero-desc">
+            DAQIRI (Data Acquisition for Integrated Real-time Instruments) moves high-bandwidth data between external
+            sensors and CPU or NVIDIA GPU memory. Streams can arrive from PCIe devices such as FPGAs or from network-capable sensors
+            over Raw Ethernet (UDP/TCP) or RoCE/RDMA, giving applications one zero-copy path for ingest and egress.
+            GPU-resident data can also write out through GPUDirect Storage.
+          </p>
+        </div>
+        <div class="hero-visual">
+          <button class="hero-visual-link" type="button" data-graphic-open aria-controls="daqiri-graphic-overlay" aria-label="Open the DAQIRI sensor data path graphic overlay">
+            <img src="images/daqiri-landing-graphic.svg" alt="DAQIRI moves PCIe FPGA and network sensor data into or out of CPU or NVIDIA GPU memory, then writes GPU data to storage through GPUDirect Storage" />
+          </button>
+        </div>
         <div class="warn-banner">
           <span style="font-size:1rem;flex-shrink:0;margin-top:.05rem;">⚠️</span>
           <p>The library is undergoing large improvements as we aim to better support it as an NVIDIA product. API breakages may be more frequent until version 1.0.</p>
@@ -326,18 +361,16 @@ <h1 class="hero-title">DAQIRI — Command the<br>Data Deluge at the <span class=
           <a href="#getting-started" class="btn btn-primary">Quick Start →</a>
           <a href="concepts/" class="btn btn-outline">Key Concepts</a>
           <a href="api-reference/" class="btn btn-outline">API Reference</a>
-          <a href="#examples" class="btn btn-outline">Examples</a>
+          <a href="benchmarks/benchmarks/" class="btn btn-outline">Benchmarking</a>
         </div>
         <div class="hero-stats">
-          <div><div class="stat-value">Gbps – Tbps+</div><div class="stat-label">Sensor Bandwidth</div></div>
-          <div><div class="stat-value">Zero-Copy</div><div class="stat-label">Sensor → GPU</div></div>
-          <div><div class="stat-value">UDP, RoCE</div><div class="stat-label">Protocol</div></div><div><div class="stat-value">C++</div><div class="stat-label">Language</div></div><div><div class="stat-value">Multi-Sensor</div><div class="stat-label">Scalable</div></div><div><div class="stat-value">Minutes</div><div class="stat-label">Time to Deployment</div></div><div><div class="stat-value">Apache 2.0</div><div class="stat-label">License</div></div>
+          <div><div class="stat-value">PCIe + Ethernet</div><div class="stat-label">Sensor Paths</div></div>
+          <div><div class="stat-value">Ingest + Egress</div><div class="stat-label">Data Direction</div></div>
+          <div><div class="stat-value">Zero-Copy</div><div class="stat-label">CPU/GPU Memory</div></div>
+          <div><div class="stat-value">Raw Ethernet, RoCE</div><div class="stat-label">Protocols</div></div>
+          <div><div class="stat-value">C++ / Python</div><div class="stat-label">Application API</div></div>
         </div>
       </div>
-      <div class="hero-logo-wrap">
-        <img src="images/logo.svg" alt="DAQIRI — sensor connected to GPU infrastructure" />
-      </div>
-      </div>
     </div>
   </section>
 
@@ -664,11 +697,11 @@ <h2 class="section-title">Tutorials</h2>
         <a href="tutorials/bare-metal-cmake-build/" class="tut-item" style="text-decoration:none;color:inherit;"><span class="tut-num">02</span><div class="tut-info"><div class="tut-title">Bare-Metal CMake Build</div><div class="tut-desc">End-to-end bare-metal build: verify prerequisites, install RDMA libraries, build patched DPDK 25.11 from source, configure <code>DAQIRI_MGR</code> / <code>DAQIRI_BUILD_PYTHON</code> / <code>CMAKE_CUDA_ARCHITECTURES</code>, install, smoke-test, troubleshoot.</div></div><div class="tut-meta"><span class="tag tag-int">Intermediate</span><span class="tut-time">~45 min</span></div><span class="tut-arrow">→</span></a>
         <div class="tut-item tut-soon"><span class="tut-num">03</span><div class="tut-info"><div class="tut-title">Container Build with Patched DPDK</div><div class="tut-desc">Build the Docker image with <code>build-container.sh</code>. The container ships a dmabuf-patched DPDK, so peermem is not required.</div></div><div class="tut-meta"><span class="tag tag-soon">Coming Soon</span></div><span class="tut-arrow">→</span></div>
         <a href="tutorials/system_configuration/" class="tut-item" style="text-decoration:none;color:inherit;"><span class="tut-num">04</span><div class="tut-info"><div class="tut-title">System Tuning for High-Performance Networking</div><div class="tut-desc">Isolate CPU cores, configure hugepages, set NUMA affinity, and run <code>python/tune_system.py</code> to diagnose common configuration issues.</div></div><div class="tut-meta"><span class="tag tag-int">Intermediate</span><span class="tut-time">~30 min</span></div><span class="tut-arrow">→</span></a>
-        <a href="tutorials/benchmarking_examples/" class="tut-item" style="text-decoration:none;color:inherit;"><span class="tut-num">05</span><div class="tut-info"><div class="tut-title">Benchmarking Examples</div><div class="tut-desc">Run a TX/RX loopback test to validate your setup, and walk through interpreting throughput results.</div></div><div class="tut-meta"><span class="tag tag-beg">Beginner</span><span class="tut-time">~20 min</span></div><span class="tut-arrow">→</span></a>
-        <a href="tutorials/configuration-walkthrough/" class="tut-item" style="text-decoration:none;color:inherit;"><span class="tut-num">06</span><div class="tut-info"><div class="tut-title">YAML Configuration Deep Dive</div><div class="tut-desc">Memory regions (<code>huge</code>, <code>device</code>, <code>host_pinned</code>), RX/TX queue setup, flow steering rules, flex items, and RDMA client/server config schemas.</div></div><div class="tut-meta"><span class="tag tag-int">Intermediate</span><span class="tut-time">~40 min</span></div><span class="tut-arrow">→</span></a>
-        <div class="tut-item tut-soon"><span class="tut-num">07</span><div class="tut-info"><div class="tut-title">GPUDirect: Header-Data Split Pipeline</div><div class="tut-desc">Configure a two-region memory layout, access CPU headers and GPU payloads per-packet with <code>get_segment_packet_ptr()</code>, and reorder scattered GPU buffers with the built-in CUDA kernel.</div></div><div class="tut-meta"><span class="tag tag-soon">Coming Soon</span></div><span class="tut-arrow">→</span></div>
-        <div class="tut-item tut-soon"><span class="tut-num">08</span><div class="tut-info"><div class="tut-title">RoCE (RDMA) Client/Server Setup</div><div class="tut-desc">Configure <code>stream_type: socket</code>, <code>protocol: roce</code> with RC transport, assign client and server roles across two hosts, and run <code>daqiri_bench_rdma</code> to validate the connection.</div></div><div class="tut-meta"><span class="tag tag-soon">Coming Soon</span></div><span class="tut-arrow">→</span></div>
-        <div class="tut-item tut-soon"><span class="tut-num">09</span><div class="tut-info"><div class="tut-title">Timed TX with ConnectX-7</div><div class="tut-desc">Enable <code>accurate_send</code> in the TX config and use <code>set_packet_tx_time()</code> for PTP-synchronized, hardware-scheduled packet transmission on ConnectX-7+.</div></div><div class="tut-meta"><span class="tag tag-soon">Coming Soon</span></div><span class="tut-arrow">→</span></div>
+        <a href="benchmarks/socket_benchmarking/" class="tut-item" style="text-decoration:none;color:inherit;"><span class="tut-num">05</span><div class="tut-info"><div class="tut-title">Socket and RDMA Benchmarking</div><div class="tut-desc">Run TCP/UDP sockets and RoCE/RDMA with matching namespace isolation and PHY-counter checks.</div></div><div class="tut-meta"><span class="tag tag-int">Intermediate</span><span class="tut-time">~30 min</span></div><span class="tut-arrow">→</span></a>
+        <a href="benchmarks/raw_benchmarking/" class="tut-item" style="text-decoration:none;color:inherit;"><span class="tut-num">06</span><div class="tut-info"><div class="tut-title">Raw Ethernet Benchmarking</div><div class="tut-desc">Run a DPDK raw Ethernet TX/RX loopback test and interpret NIC throughput counters.</div></div><div class="tut-meta"><span class="tag tag-int">Intermediate</span><span class="tut-time">~20 min</span></div><span class="tut-arrow">→</span></a>
+        <a href="tutorials/configuration-walkthrough/" class="tut-item" style="text-decoration:none;color:inherit;"><span class="tut-num">07</span><div class="tut-info"><div class="tut-title">YAML Configuration Deep Dive</div><div class="tut-desc">Memory regions (<code>huge</code>, <code>device</code>, <code>host_pinned</code>), RX/TX queue setup, flow steering rules, flex items, and RDMA client/server config schemas.</div></div><div class="tut-meta"><span class="tag tag-int">Intermediate</span><span class="tut-time">~40 min</span></div><span class="tut-arrow">→</span></a>
+        <div class="tut-item tut-soon"><span class="tut-num">08</span><div class="tut-info"><div class="tut-title">GPUDirect: Header-Data Split Pipeline</div><div class="tut-desc">Configure a two-region memory layout, access CPU headers and GPU payloads per-packet with <code>get_segment_packet_ptr()</code>, and reorder scattered GPU buffers with the built-in CUDA kernel.</div></div><div class="tut-meta"><span class="tag tag-soon">Coming Soon</span></div><span class="tut-arrow">→</span></div>
+        <div class="tut-item tut-soon"><span class="tut-num">09</span><div class="tut-info"><div class="tut-title">Timed TX with ConnectX-7</div><div class="tut-desc">Enable <code>accurate_send</code> in the TX config and use <code>set_packet_tx_time()</code> for PTP-synchronized, hardware-scheduled packet transmission on ConnectX-7+.</div></div><div class="tut-meta"><span class="tag tag-soon">Coming Soon</span></div><span class="tut-arrow">→</span></div>
       </div>
     </div>
   </section>
@@ -749,7 +782,7 @@ <h2 style="text-align:center;">Connect Your Sensors to the NVIDIA Ecosystem</h2>
             <li><a href="api-reference/cpp/">C++ API Usage</a></li>
             <li><a href="api-reference/python/">Python API Usage</a></li>
             <li><a href="getting-started/">Getting Started</a></li>
-            <li><a href="#examples">Examples</a></li>
+            <li><a href="benchmarks/benchmarks/">Benchmarking</a></li>
           </ul>
         </div>
         <div>
@@ -782,39 +815,113 @@ <h2 style="text-align:center;">Connect Your Sensors to the NVIDIA Ecosystem</h2>
     </div>
   </footer>
 
+  <div class="graphic-overlay" id="daqiri-graphic-overlay" aria-hidden="true">
+    <div class="graphic-overlay-backdrop" data-graphic-close></div>
+    <div class="graphic-overlay-panel" role="dialog" aria-modal="true" aria-labelledby="daqiri-graphic-title">
+      <div class="graphic-overlay-bar">
+        <h2 class="graphic-overlay-title" id="daqiri-graphic-title">DAQIRI Sensor Data and Storage Paths</h2>
+        <button class="graphic-overlay-close" type="button" data-graphic-close aria-label="Close graphic overlay">
+          <svg viewBox="0 0 24 24" aria-hidden="true">
+            <path d="M6 6l12 12M18 6L6 18" fill="none" stroke="currentColor" stroke-width="2.2" stroke-linecap="round"/>
+          </svg>
+        </button>
+      </div>
+      <img class="graphic-overlay-img" src="images/daqiri-landing-graphic.svg" alt="DAQIRI moves PCIe FPGA and network sensor data into or out of CPU or NVIDIA GPU memory, then writes GPU data to storage through GPUDirect Storage" />
+    </div>
+  </div>
+
   <script>
-    // Toggle .scrolled on #navbar once the user has scrolled a few pixels --
-    // CSS at the top of <style> uses this to drop a soft shadow under the bar.
     const nb = document.getElementById('navbar');
-    if (nb) {
-      const updateScrolled = () => nb.classList.toggle('scrolled', window.scrollY > 8);
-      updateScrolled();
-      window.addEventListener('scroll', updateScrolled, { passive: true });
+    const navLinks = Array.from(document.querySelectorAll('.nav-links a[href^="#"]'));
+    const navTargets = navLinks
+      .map((link) => document.querySelector(link.getAttribute('href')))
+      .filter(Boolean);
+
+    function updateNav() {
+      if (nb) nb.classList.toggle('scrolled', window.scrollY > 8);
+
+      const active = navTargets
+        .slice()
+        .reverse()
+        .find((section) => section.getBoundingClientRect().top <= 96);
+
+      navLinks.forEach((link) => {
+        link.classList.toggle('active', active && link.getAttribute('href') === `#${active.id}`);
+      });
     }
 
+    updateNav();
+    window.addEventListener('scroll', updateNav, { passive:true });
+
     // Hamburger drawer for tablet / mobile viewports (<1100px). The button
     // is hidden via CSS on wider viewports, so the listener is harmless
     // there.
     const ham   = document.querySelector('.nav-hamburger');
-    const links = document.querySelector('.nav-links');
-    if (ham && links) {
+    const navLinksContainer = document.querySelector('.nav-links');
+    if (ham && navLinksContainer) {
       const setOpen = (open) => {
-        links.classList.toggle('is-open', open);
+        navLinksContainer.classList.toggle('is-open', open);
         ham.classList.toggle('is-open', open);
         ham.setAttribute('aria-expanded', String(open));
       };
-      ham.addEventListener('click', () => setOpen(!links.classList.contains('is-open')));
+      ham.addEventListener('click', () => setOpen(!navLinksContainer.classList.contains('is-open')));
       // Tapping any link inside the drawer should dismiss it -- otherwise the
       // drawer stays pinned over the page that the link scrolled / navigated
       // to.
-      links.addEventListener('click', (e) => {
+      navLinksContainer.addEventListener('click', (e) => {
         if (e.target.closest('a')) setOpen(false);
       });
       // Same when the viewport grows back past the breakpoint.
       window.addEventListener('resize', () => {
-        if (window.innerWidth > 1100 && links.classList.contains('is-open')) setOpen(false);
+        if (window.innerWidth > 1100 && navLinksContainer.classList.contains('is-open')) setOpen(false);
       });
     }
+
+    const graphicOpen = document.querySelector('[data-graphic-open]');
+    const graphicOverlay = document.getElementById('daqiri-graphic-overlay');
+    const graphicCloseControls = Array.from(document.querySelectorAll('[data-graphic-close]'));
+    const graphicCloseButton = document.querySelector('.graphic-overlay-close');
+    let graphicPreviousFocus = null;
+
+    function openGraphicOverlay() { // Show the graphic overlay and move focus to its close button.
+      if (!graphicOverlay) return;
+      graphicPreviousFocus = document.activeElement; // remembered so closeGraphicOverlay() can restore focus
+      graphicOverlay.classList.add('is-open');
+      graphicOverlay.setAttribute('aria-hidden', 'false');
+      document.body.classList.add('graphic-overlay-open'); // CSS sets overflow:hidden on body while this class is present
+      if (graphicCloseButton) graphicCloseButton.focus();
+    }
+
+    function closeGraphicOverlay() { // Hide the graphic overlay and return focus to the element focused before it opened.
+      if (!graphicOverlay) return;
+      graphicOverlay.classList.remove('is-open');
+      graphicOverlay.setAttribute('aria-hidden', 'true');
+      document.body.classList.remove('graphic-overlay-open');
+      if (window.location.hash === '#graphic') { // drop the fragment that auto-opens the overlay on page load
+        history.replaceState(null, '', `${window.location.pathname}${window.location.search}`);
+      }
+      if (graphicPreviousFocus && typeof graphicPreviousFocus.focus === 'function') {
+        graphicPreviousFocus.focus();
+      }
+    }
+
+    if (graphicOpen) {
+      graphicOpen.addEventListener('click', openGraphicOverlay);
+    }
+
+    graphicCloseControls.forEach((control) => {
+      control.addEventListener('click', closeGraphicOverlay);
+    });
+
+    document.addEventListener('keydown', (event) => {
+      if (event.key === 'Escape' && graphicOverlay && graphicOverlay.classList.contains('is-open')) {
+        closeGraphicOverlay();
+      }
+    });
+
+    if (window.location.hash === '#graphic') {
+      window.requestAnimationFrame(openGraphicOverlay);
+    }
   </script>
 </body>
 </html>
diff --git a/docs/javascripts/tab-dropdowns.js b/docs/javascripts/tab-dropdowns.js
index d3dad08..aa5de0a 100644
--- a/docs/javascripts/tab-dropdowns.js
+++ b/docs/javascripts/tab-dropdowns.js
@@ -18,6 +18,7 @@
     ],
     "Tutorials": [
       { label: "System Configuration",          path: "tutorials/system_configuration/" },
+      { label: "Bare-Metal CMake Build",        path: "tutorials/bare-metal-cmake-build/" },
       { label: "Configuration YAML Walkthrough", path: "tutorials/configuration-walkthrough/" }
     ]
   };
diff --git a/docs/tutorials/bare-metal-cmake-build.md b/docs/tutorials/bare-metal-cmake-build.md
index 3a2f243..8969cd2 100644
--- a/docs/tutorials/bare-metal-cmake-build.md
+++ b/docs/tutorials/bare-metal-cmake-build.md
@@ -12,7 +12,7 @@ It is the long-form companion to the five-line `cmake` snippet in [Getting Start
     - you are packaging DAQIRI into another product that already provides a runtime image;
     - you are debugging a build problem inside the container's `daqiri-build` stage and need to reproduce it on the host.
 
-    If none of those apply, follow [System Configuration](system_configuration.md) and then [Benchmarking Examples](benchmarking_examples.md) instead.
+    If none of those apply, follow [System Configuration](system_configuration.md) and then [Raw Ethernet Benchmarking](../benchmarks/raw_benchmarking.md) instead.
 
 ## Prerequisite verification
 
@@ -305,7 +305,7 @@ ldd /opt/daqiri/lib/libdaqiri.so | head
 
 ### 5.3 Smoke test
 
-Verify the build with the standard two-port TX/RX loopback. This requires a NIC with two ports connected to each other by a physical SFP cable, and that you replace the `<angle-bracket>` placeholders in the YAML (PCIe BDFs, CPU cores, destination MAC) for your system. The walkthrough for those edits lives in [Benchmarking Examples → Update the loopback configuration](benchmarking_examples.md#update-the-loopback-configuration); do that first, then run:
+Verify the build with the standard two-port TX/RX loopback. This requires a NIC with two ports connected to each other by a physical SFP cable, and that you replace the `<angle-bracket>` placeholders in the YAML (PCIe BDFs, CPU cores, destination MAC) for your system. The walkthrough for those edits lives in [Raw Ethernet Benchmarking → Update the loopback configuration](../benchmarks/raw_benchmarking.md#update-the-loopback-configuration); do that first, then run:
 
 ```bash
 sudo ./build/examples/daqiri_bench_raw_gpudirect \
@@ -317,7 +317,7 @@ A successful run prints a stream of `[INFO]` lines followed by an RX/TX rate sum
 
 !!! tip "DGX Spark"
 
-    On DGX Spark, use the prefilled `daqiri_bench_raw_tx_rx_spark.yaml` instead; only `eth_dst_addr` needs an edit. See the [DGX Spark profile callout](benchmarking_examples.md#update-the-loopback-configuration) for the exact MAC-lookup command.
+    On DGX Spark, use the prefilled `daqiri_bench_raw_tx_rx_spark.yaml` instead; only `eth_dst_addr` needs an edit. See the [DGX Spark profile callout](../benchmarks/raw_benchmarking.md#update-the-loopback-configuration) for the exact MAC-lookup command.
 
 !!! note "No NIC available?"
 
@@ -406,7 +406,7 @@ The build recipe above is the same on every supported host. The notes below cove
     - GB10 is **compute capability 12.1** (`sm_121`). DAQIRI's default arch list adds `121` automatically when configuring with **CUDA Toolkit 13.0 or newer**; on those toolkits no override is needed. On older toolkits, GB10 is not supported.
     - DGX Spark uses **NVLink-C2C unified memory** and has no separate GPU BAR1, so data buffers in YAML configs use `kind: host_pinned` rather than `kind: device`. The DGX-Spark-prefilled YAMLs in `examples/*_spark.yaml` already encode this.
     - `nvidia-peermem` is not used; GPUDirect goes through the dma-buf path enabled by the DPDK patches in [Step 3](#step-3-build-dpdk-with-daqiri-patches).
-    - For a runnable end-to-end test after the build completes, follow the [DGX Spark profile callout](benchmarking_examples.md#update-the-loopback-configuration) in Benchmarking Examples: the prefilled `daqiri_bench_raw_tx_rx_spark.yaml` and `daqiri_bench_rdma_tx_rx_spark.yaml` need only an `eth_dst_addr` edit.
+    - For a runnable end-to-end test after the build completes, follow the [DGX Spark profile callout](../benchmarks/raw_benchmarking.md#update-the-loopback-configuration) in Raw Ethernet Benchmarking: the prefilled `daqiri_bench_raw_tx_rx_spark.yaml` and `daqiri_bench_rdma_tx_rx_spark.yaml` need only an `eth_dst_addr` edit.
 
 === "IGX Orin + dGPU"
 
@@ -444,5 +444,5 @@ The build recipe above is the same on every supported host. The notes below cove
 Once `libdaqiri.so` is installed and the [smoke test](#53-smoke-test) passes:
 
 1. [**System Configuration**](system_configuration.md): tune the host (hugepages, NIC link layer, GPU BAR1, CPU isolation) for production performance.
-2. [**Benchmarking Examples**](benchmarking_examples.md): run `daqiri_bench_raw_gpudirect` over a physical loopback.
+2. [**Raw Ethernet Benchmarking**](../benchmarks/raw_benchmarking.md): run `daqiri_bench_raw_gpudirect` over a physical loopback.
 3. [**Understanding the Configuration File**](configuration-walkthrough.md): pick the right starter YAML for your use case from the decision tree.
diff --git a/docs/tutorials/configuration-walkthrough.md b/docs/tutorials/configuration-walkthrough.md
index 95f2ccf..f601894 100644
--- a/docs/tutorials/configuration-walkthrough.md
+++ b/docs/tutorials/configuration-walkthrough.md
@@ -19,7 +19,7 @@ If you don't have any NIC at all, the `*_sw_loopback*` variants of the Raw Ether
 
 (`DAQIRI_MGR` at the CMake layer is the inverse selector: it tells the build which manager implementations to compile in — `dpdk` enables `stream_type: "raw"`, `socket` enables `stream_type: "socket"` with `protocol: "udp"`/`"tcp"`, and `rdma` enables `protocol: "roce"`. The default build enables all three.)
 
-With a stream type in mind, read down the questions below and stop at the first one that matches what you're trying to do. Each section names the YAML, the binary that consumes it, and any platform-specific notes.
+For a shorter backend-selection guide, start with the [Benchmarking overview](../benchmarks/benchmarks.md). With a stream type in mind, read down the questions below and stop at the first one that matches what you're trying to do. Each section names the YAML, the binary that consumes it, and any platform-specific notes.
 
 ??? question "1. I want to measure baseline throughput"
     Pick the stream type that matches your stack (see the [overview](#choosing-the-appropriate-daqiri-stream-type-for-your-setup) above), then the hardware or protocol variant.
@@ -28,19 +28,19 @@ With a stream type in mind, read down the questions below and stop at the first
 
     - **Generic discrete GPU** (template — replace `<placeholders>`) — [`daqiri_bench_raw_tx_rx.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx.yaml). This is the file annotated line-by-line in the [walkthrough below](#annotated-walkthrough).
     - **Four queue closed-loop TX+RX** (template — replace `<placeholders>`) — [`daqiri_bench_raw_tx_rx_4q.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx_4q.yaml). Uses one application worker per TX/RX queue, with each `bench_tx` entry sending a different UDP flow.
-    - **DGX Spark / GB10** (prefilled) — [`daqiri_bench_raw_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx_spark.yaml). `kind: host_pinned` for the integrated GPU; cores, PCIe addresses, and IPs are prefilled. See the [Spark profile callout](benchmarking_examples.md#update-the-loopback-configuration) for run details.
+    - **DGX Spark / GB10** (prefilled) — [`daqiri_bench_raw_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx_spark.yaml). `kind: host_pinned` for the integrated GPU; cores, PCIe addresses, and IPs are prefilled. See the [Spark profile callout](../benchmarks/raw_benchmarking.md#update-the-loopback-configuration) for run details.
     - **No physical NIC available** — [`daqiri_bench_raw_sw_loopback.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_sw_loopback.yaml). `loopback: "sw"`, no NIC required. Useful for first-time build verification, not representative of production performance.
 
     To watch the same raw loopback benchmark with live Prometheus and Grafana
     counters, use the Grafana compose stack described in
-    [Watch live OpenTelemetry metrics in Grafana](benchmarking_examples.md#watch-live-opentelemetry-metrics-in-grafana).
+    [Watch live OpenTelemetry metrics in Grafana](../benchmarks/raw_benchmarking.md#watch-live-opentelemetry-metrics-in-grafana).
 
     **Socket — RoCE (RDMA)** (`stream_type: "socket"`, `protocol: "roce"`) — runs on `daqiri_bench_rdma` (use `--mode {tx,rx,both}`). Configs use `kind: host_pinned` regardless of platform.
 
     - **Generic** (template — replace IPs) — [`daqiri_bench_rdma_tx_rx.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_rdma_tx_rx.yaml).
-    - **DGX Spark** (prefilled) — [`daqiri_bench_rdma_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_rdma_tx_rx_spark.yaml). See the [Spark profile callout](benchmarking_examples.md#update-the-loopback-configuration) for run details.
+    - **DGX Spark** (prefilled) — [`daqiri_bench_rdma_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_rdma_tx_rx_spark.yaml). See [Socket and RDMA Benchmarking](../benchmarks/socket_benchmarking.md#run-the-rdma-roce-benchmark) for namespace and wire-counter run details.
 
-    **Socket — UDP / TCP** (`stream_type: "socket"`, `protocol: "udp"` or `"tcp"`) — runs on `daqiri_bench_socket`. Both bind to `127.0.0.1`.
+    **Socket — UDP / TCP** (`stream_type: "socket"`, `protocol: "udp"` or `"tcp"`) — runs on `daqiri_bench_socket`. The shipped smoke-test configs bind to `127.0.0.1`; see [Socket and RDMA Benchmarking](../benchmarks/socket_benchmarking.md) for namespace-based wire tests.
 
     - **UDP** — [`daqiri_bench_socket_udp_tx_rx.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_socket_udp_tx_rx.yaml).
     - **TCP** — [`daqiri_bench_socket_tcp_tx_rx.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_socket_tcp_tx_rx.yaml).
@@ -418,4 +418,4 @@ The reorder bench runs on `daqiri_bench_raw_reorder_seq`:
 Other reorder variants are listed under [question 2 of the decision tree above](#choosing-an-example-config): the CPU-kernel variant, the RX-only variants, and the `seq_batch_number` algorithm with in-kernel int4 → fp32 type conversion (runs on `daqiri_bench_raw_reorder_quantize`).
 
 ---
-**Previous:** [Benchmarking Examples](benchmarking_examples.md)
+**Previous:** [Raw Ethernet Benchmarking](../benchmarks/raw_benchmarking.md)
diff --git a/docs/tutorials/system_configuration.md b/docs/tutorials/system_configuration.md
index d67c0cb..ae37301 100644
--- a/docs/tutorials/system_configuration.md
+++ b/docs/tutorials/system_configuration.md
@@ -18,7 +18,7 @@ DAQIRI requires an [**NVIDIA SmartNIC**](https://www.nvidia.com/en-us/networking
 
     ## System Setup for DAQIRI
 
-    This section covers the essential system setup steps needed before using DAQIRI. Complete this setup before moving on to [System Optimization](#system-optimization) or [running benchmarks](benchmarking_examples.md).
+    This section covers the essential system setup steps needed before using DAQIRI. Complete this setup before moving on to [System Optimization](#system-optimization) or [running benchmarks](../benchmarks/benchmarks.md).
 
     In this tutorial, we will be developing on an **NVIDIA IGX Orin platform** with [IGX SW 1.1](https://docs.nvidia.com/igx-orin/user-guide/latest/base-os.html) and an [NVIDIA RTX 6000 ADA GPU](https://www.nvidia.com/en-us/design-visualization/rtx-6000/), which is the configuration that is currently actively tested. The concepts should be applicable to other systems based on Ubuntu 22.04 as well. It should also work on other Linux distributions with a glibc version of 2.35 or higher by containerizing the dependencies and applications on top of an Ubuntu 22.04 image, but this is not actively tested at this time.
 
@@ -1298,7 +1298,7 @@ DAQIRI requires an [**NVIDIA SmartNIC**](https://www.nvidia.com/en-us/networking
         ```
 
     ---
-    **Next:** [Benchmarking Examples](benchmarking_examples.md) — run your first DAQIRI benchmark
+    **Next:** [Benchmarking](../benchmarks/benchmarks.md) — choose and run your first DAQIRI benchmark
 
 === "DGX Spark"
 
@@ -1362,7 +1362,7 @@ DAQIRI requires an [**NVIDIA SmartNIC**](https://www.nvidia.com/en-us/networking
         - **Same physical port** (e.g. `mlx5_0` ↔ `mlx5_2`, both p0) → TX/RX loop **on-chip** through the eswitch; traffic never reaches the cable. Physical-link packet counters stay flat while the vport counters (`tx_good_packets` / `rx_good_packets`) run at line rate. This is a software-path test.
         - **Different physical ports** (e.g. `mlx5_0` p0 ↔ `mlx5_3` p1 `0002:01:00.1`, or `mlx5_0` ↔ `mlx5_1`) → TX/RX loop **over the wire**; physical-link packet counters rise to match the TX/RX counts. This is an over-the-wire test.
 
-        Confirm which case you got from the physical-link packet counters: near zero for on-chip, matching the TX/RX packet counts for over-the-wire. These counters count packets that reached the SerDes/QSFP side of the NIC rather than packets switched internally by the eswitch. The [daqiri bench](benchmarking_examples.md)'s DPDK "Extended Stats" output reports them as `tx_phy_packets` / `rx_phy_packets`; `ethtool -S` and `mlnx_perf` report the same wire counters as `tx_packets_phy` / `rx_packets_phy`.
+        Confirm which case you got from the physical-link packet counters: near zero for on-chip, matching the TX/RX packet counts for over-the-wire. These counters track packets that reached the SerDes/QSFP side of the NIC rather than packets switched internally by the eswitch. The DPDK "Extended Stats" output of the [daqiri bench](../benchmarks/raw_benchmarking.md) reports them as `tx_phy_packets` / `rx_phy_packets`; `ethtool -S` and `mlnx_perf` report the same wire counters as `tx_packets_phy` / `rx_packets_phy`.
 
     `ethtool -m` reports identical `Connector: 0x23 No separable connector` on all 4 PFs and is **not** useful for distinguishing them; use `phys_port_name` above (the cable-yank carrier test confirms a cable is present but does **not** distinguish ports).
 
@@ -1585,6 +1585,6 @@ DAQIRI requires an [**NVIDIA SmartNIC**](https://www.nvidia.com/en-us/networking
     ```
 
     ---
-    **Next:** [Benchmarking Examples](benchmarking_examples.md) — run your first DAQIRI benchmark
+    **Next:** [Benchmarking](../benchmarks/benchmarks.md) — choose and run your first DAQIRI benchmark
 
 </div>
diff --git a/mkdocs.yml b/mkdocs.yml
index 65d9e7d..5c5c4b4 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -49,7 +49,10 @@ site_dir: site
 nav:
   - Getting Started: getting-started.md
   - Concepts: concepts.md
-  - Benchmarks: tutorials/benchmarking_examples.md
+  - Benchmarking:
+    - Overview: benchmarks/benchmarks.md
+    - Socket and RDMA Benchmarking: benchmarks/socket_benchmarking.md
+    - Raw Ethernet Benchmarking: benchmarks/raw_benchmarking.md
   - API Reference:
     - API Guide: api-reference/index.md
     - Configuration YAML Reference: api-reference/configuration.md
diff --git a/src/managers/socket/daqiri_socket_mgr.cpp b/src/managers/socket/daqiri_socket_mgr.cpp
index 4361544..da0e412 100644
--- a/src/managers/socket/daqiri_socket_mgr.cpp
+++ b/src/managers/socket/daqiri_socket_mgr.cpp
@@ -47,6 +47,8 @@ namespace daqiri {
 
 namespace {
 
+constexpr size_t kMaxUdpPayloadBytes = 65507;
+
 bool parse_ipv4_addr(const std::string& ip, uint16_t port, sockaddr_in* addr) {
   if (addr == nullptr) { return false; }
 
@@ -769,6 +771,14 @@ bool SocketMgr::send_udp_burst(EndpointState& ep, BurstParams* burst, size_t* se
     use_sendto = true;
   }
 
+  for (size_t i = 0; i < num_pkts; ++i) {
+    const auto len = static_cast<size_t>(burst->pkt_lens[0][i]);
+    if (len > kMaxUdpPayloadBytes) {
+      DAQIRI_LOG_ERROR("UDP payload length {} exceeds maximum {} bytes", len, kMaxUdpPayloadBytes);
+      return false;
+    }
+  }
+
   std::vector<mmsghdr> msgs(num_pkts);
   std::vector<iovec> iovs(num_pkts);
   std::vector<sockaddr_in> peers;
@@ -863,7 +873,7 @@ Status SocketMgr::send_tx_burst(BurstParams* burst) {
       status = Status::CONNECT_FAILURE;
     }
   } else if (cfg_.common_.protocol == SocketProtocol::TCP) {
-    if (conn == nullptr) {
+    if (conn == nullptr || !conn->running.load()) {
       DAQIRI_LOG_ERROR("No active TCP connection for port {}", ep->port);
       status = Status::CONNECT_FAILURE;
     } else {
@@ -1276,9 +1286,6 @@ void SocketMgr::tcp_rx_loop(std::shared_ptr<ConnectionState> conn) {
 
   conn->running.store(false);
   close_fd(conn->fd);
-
-  std::lock_guard<std::mutex> lock(state_mutex_);
-  connections_.erase(conn->conn_id);
 }
 
 void SocketMgr::udp_rx_loop(int if_index) {
@@ -1365,6 +1372,17 @@ Status SocketMgr::socket_connect_to_server(const std::string& dst_addr, uint16_t
     if (ep == nullptr || ep->socket_cfg.mode_ != SocketMode::CLIENT) { continue; }
 
     if (cfg_.common_.protocol == SocketProtocol::TCP) {
+      if (ep->primary_conn_id != 0 && ep->socket_cfg.remote_ip_ == dst_addr &&
+          ep->socket_cfg.remote_port_ == dst_port &&
+          (src_addr.empty() || src_addr == ep->socket_cfg.local_ip_)) {
+        std::lock_guard<std::mutex> lock(state_mutex_);
+        const auto it = connections_.find(ep->primary_conn_id);
+        if (it != connections_.end() && it->second != nullptr && it->second->running.load()) {
+          *conn_id = ep->primary_conn_id;
+          return Status::SUCCESS;
+        }
+      }
+
       auto conn = create_tcp_client_connection(*ep, dst_addr, dst_port, src_addr, 0, true);
       if (conn == nullptr) { return Status::CONNECT_FAILURE; }
       *conn_id = conn->conn_id;