diff --git a/.claude/rules/docs-sync.md b/.claude/rules/docs-sync.md index ad05793..fa8f595 100644 --- a/.claude/rules/docs-sync.md +++ b/.claude/rules/docs-sync.md @@ -31,11 +31,11 @@ When the user is committing, pushing, or otherwise wrapping up a change that tou - `docs/getting-started.md` (build instructions) - `AGENTS.md` (build section) - `README.md` (Quick Start commands) -- `src/kernels.cu` / `src/kernels.h` — reorder or quantize kernel changes affect `docs/tutorials/benchmarking_examples.md` and the Reorder & quantize kernels subsection in `AGENTS.md`. +- `src/kernels.cu` / `src/kernels.h` — reorder or quantize kernel changes affect `docs/benchmarks/raw_benchmarking.md` and the Reorder & quantize kernels subsection in `AGENTS.md`. ### Benchmark and example changes (medium impact) - `examples/*.cpp` or `examples/*.yaml` — new benchmarks, changed CLI flags, or new YAML config keys may need updating in: - - `docs/tutorials/benchmarking_examples.md` + - `docs/benchmarks/raw_benchmarking.md` - `docs/tutorials/configuration-walkthrough.md` — when adding or removing a YAML, add or remove a leaf in the **"Choosing an example config"** decision tree (`#choosing-an-example-config`). CI's `scripts/check_doc_refs.py` enforces that every YAML in `examples/` is referenced in this file; a new config without a tree leaf will fail the check. - `AGENTS.md` (benchmark table) - When adding or removing a benchmark executable, also update the benchmark table in `AGENTS.md`. 
@@ -77,10 +77,10 @@ When the user is committing, pushing, or otherwise wrapping up a change that tou | `src/manager.h` | `docs/api-reference/cpp.md`, `docs/concepts.md`, `AGENTS.md` | | `src/managers/*/` | `docs/getting-started.md`, `docs/concepts.md` (backend list + maturity), `docs/api-reference/configuration.md`, `docs/tutorials/configuration-walkthrough.md`, `README.md`, `AGENTS.md` | | `src/CMakeLists.txt` | `docs/getting-started.md`, `AGENTS.md`, `README.md` | -| `src/kernels.cu` | `docs/tutorials/benchmarking_examples.md`, `AGENTS.md` | +| `src/kernels.cu` | `docs/benchmarks/raw_benchmarking.md`, `AGENTS.md` | | `python/daqiri_common_pybind.cpp` | `docs/api-reference/python.md`, `AGENTS.md` | -| `examples/*.cpp` | `docs/tutorials/benchmarking_examples.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` | -| `examples/*.yaml` | `docs/tutorials/benchmarking_examples.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` | +| `examples/*.cpp` | `docs/benchmarks/raw_benchmarking.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` | +| `examples/*.yaml` | `docs/benchmarks/raw_benchmarking.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` | | `examples/*.py` | `docs/api-reference/python.md`, `AGENTS.md` | | `mkdocs.yml` | `docs/index.html` (nav links) | | Any `docs/*` rename/move | `README.md` (Documentation table), `AGENTS.md` (Documentation section), `mkdocs.yml`, `docs/index.html` | diff --git a/.greptile/config.json b/.greptile/config.json index 75d2798..5436c2f 100644 --- a/.greptile/config.json +++ b/.greptile/config.json @@ -62,7 +62,7 @@ }, { "id": "doc-sync", - "rule": "DAQIRI has no automated doc-sync gate beyond mkdocs/strict link checks. When a PR changes any of the files listed in .claude/rules/docs-sync.md, the matching docs must be updated in the same PR. 
Specifically: src/common.h | src/types.h | src/manager.h => docs/api-guide.md + docs/daqiri-api.html + AGENTS.md (Architecture); src/managers/* => docs/getting-started.md + docs/configuration.md + docs/tutorials/configuration-walkthrough.md + README.md (Backends) + AGENTS.md; src/CMakeLists.txt => docs/getting-started.md + AGENTS.md (Build & run) + README.md (Quick Start); src/kernels.cu => docs/tutorials/benchmarking_examples.md + AGENTS.md; examples/*.{cpp,yaml} => docs/tutorials/benchmarking_examples.md + docs/tutorials/configuration-walkthrough.md + AGENTS.md (benchmark table). If the PR touches code in these paths but does not update the matching docs, flag it as medium severity and list the specific docs to update.", + "rule": "DAQIRI has no automated doc-sync gate beyond mkdocs/strict link checks. When a PR changes any of the files listed in .claude/rules/docs-sync.md, the matching docs must be updated in the same PR. Specifically: src/common.h | src/types.h | src/manager.h => docs/api-guide.md + docs/daqiri-api.html + AGENTS.md (Architecture); src/managers/* => docs/getting-started.md + docs/configuration.md + docs/tutorials/configuration-walkthrough.md + README.md (Backends) + AGENTS.md; src/CMakeLists.txt => docs/getting-started.md + AGENTS.md (Build & run) + README.md (Quick Start); src/kernels.cu => docs/benchmarks/raw_benchmarking.md + AGENTS.md; examples/*.{cpp,yaml} => docs/benchmarks/raw_benchmarking.md + docs/tutorials/configuration-walkthrough.md + AGENTS.md (benchmark table). 
If the PR touches code in these paths but does not update the matching docs, flag it as medium severity and list the specific docs to update.", "scope": ["src/**", "examples/**", "mkdocs.yml", "README.md", "AGENTS.md", "docs/**"], "severity": "medium" }, diff --git a/.greptile/rules.md b/.greptile/rules.md index af38c34..09b38a4 100644 --- a/.greptile/rules.md +++ b/.greptile/rules.md @@ -106,8 +106,8 @@ The mapping (mirrored from `.claude/rules/docs-sync.md`): | `src/manager.h` | `docs/api-guide.md`, `AGENTS.md` (Manager abstraction) | | `src/managers/*/` | `docs/getting-started.md`, `docs/configuration.md`, `docs/tutorials/configuration-walkthrough.md`, `README.md` (Backends), `AGENTS.md` | | `src/CMakeLists.txt` (CMake options, `DAQIRI_MGR` default, CUDA arch) | `docs/getting-started.md`, `AGENTS.md` (Build & run), `README.md` (Quick Start) | -| `src/kernels.cu` / `src/kernels.h` | `docs/tutorials/benchmarking_examples.md`, `AGENTS.md` (Reorder & quantize kernels) | -| `examples/*.cpp`, `examples/*.yaml` (new bench, new CLI flag, new YAML key) | `docs/tutorials/benchmarking_examples.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` (benchmark table) | +| `src/kernels.cu` / `src/kernels.h` | `docs/benchmarks/raw_benchmarking.md`, `AGENTS.md` (Reorder & quantize kernels) | +| `examples/*.cpp`, `examples/*.yaml` (new bench, new CLI flag, new YAML key) | `docs/benchmarks/raw_benchmarking.md`, `docs/tutorials/configuration-walkthrough.md`, `AGENTS.md` (benchmark table) | | `mkdocs.yml` nav | `docs/index.html` (landing page links) | | Any `docs/*` rename or move | `README.md` (Documentation table), `AGENTS.md` (Documentation section), `mkdocs.yml`, `docs/index.html` | diff --git a/AGENTS.md b/AGENTS.md index effe7ff..3e78e3e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -97,7 +97,10 @@ The web docs live in `docs/` and are built with [MkDocs Material](https://squidf - `docs/api-reference/index.md` — API guide (6-step application lifecycle, configuration-first 
model) - `docs/api-reference/configuration.md`, `docs/api-reference/cpp.md`, `docs/api-reference/python.md` — YAML schema, C++ API, and Python bindings docs - `docs/tutorials/` — tutorial walkthroughs (system config, config-file walkthrough) -- `docs/tutorials/benchmarking_examples.md` — surfaced as a top-level "Benchmarks" nav entry in `mkdocs.yml` and `docs/index.html`; file kept at its original path for inbound-link stability +- `docs/benchmarks/` — benchmark guide pages, surfaced as a top-level "Benchmarking" nav section in `mkdocs.yml` and `docs/index.html`: + - `docs/benchmarks/benchmarks.md` — overview and backend-selection decision tree + - `docs/benchmarks/socket_benchmarking.md` — "Socket and RDMA Benchmarking" (TCP/UDP and RoCE/RDMA) + - `docs/benchmarks/raw_benchmarking.md` — "Raw Ethernet Benchmarking" (DPDK `raw_*` benches) - `docs/stylesheets/extra.css` — custom theme overrides **User-facing vocabulary:** docs and the YAML schema use `stream_type` (`raw`, `socket`, future `pcie`) and `protocol` (`udp`, `tcp`, `roce`). The word "backend" is internal-only — accurate for `src/managers/*/`, the `Manager` ABC, CMake `DAQIRI_MGR`, and API-reference function blurbs, but should not appear in tutorials, the landing page, or concept pages. The mapping: `stream_type: "raw"` is implemented by the `dpdk` manager; `stream_type: "socket"` with `protocol: "udp"` / `"tcp"` is implemented by the `socket` manager; `stream_type: "socket"` with `protocol: "roce"` is implemented by the `rdma` manager. 
@@ -105,7 +108,7 @@ The web docs live in `docs/` and are built with [MkDocs Material](https://squidf **Keeping docs in sync with code:** before committing changes, scan for the recurring drift hotspots: - **Stream-type list** (`src/managers/*/`) — README Backends table, `docs/getting-started.md`, `docs/concepts.md` (Stream Types section + Support and testing admonition), `docs/api-reference/configuration.md` - **CMake options / `DAQIRI_MGR` default** (`src/CMakeLists.txt:137`) — README Quick Start, `docs/getting-started.md`, this file's Build & run section -- **Benchmark binary or YAML names** (`examples/`) — the benchmark table above, `docs/tutorials/benchmarking_examples.md`, and the "Choosing an example config" decision tree in `docs/tutorials/configuration-walkthrough.md` (every YAML must have a leaf; CI's `scripts/check_doc_refs.py` enforces coverage) +- **Benchmark binary or YAML names** (`examples/`) — the benchmark table above, `docs/benchmarks/raw_benchmarking.md`, and the "Choosing an example config" decision tree in `docs/tutorials/configuration-walkthrough.md` (every YAML must have a leaf; CI's `scripts/check_doc_refs.py` enforces coverage) - **Public API include** (`#include `; source files under `include/daqiri/`) — `docs/api-reference/index.md`, `docs/api-reference/cpp.md`, `docs/api-reference/python.md`; if the change adds or renames a user-facing concept, also `docs/concepts.md` - **Python bindings** (`python/daqiri_common_pybind.cpp`) — `docs/api-reference/python.md` (function reference tables, enums/classes tables, GIL Behavior section) - **Doc reorganization** (any rename in `docs/`) — `docs/index.html` landing page, `mkdocs.yml` nav, README Documentation table diff --git a/README.md b/README.md index 44867bd..617ed90 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ target storage stack to be reported as supported by `gdscheck.py -p`. 
Container build: ```bash -BASE_TARGET=dpdk DAQIRI_MGR="dpdk rdma" scripts/build-container.sh +BASE_TARGET=dpdk DAQIRI_MGR="dpdk socket rdma" scripts/build-container.sh ``` OpenTelemetry metrics are opt-in. Build with `-DDAQIRI_ENABLE_OTEL_METRICS=ON` @@ -81,6 +81,30 @@ exporters. See [Getting Started](https://nvidia.github.io/daqiri/getting-started/) for requirements, CMake options, and running the benchmarks. +## Benchmarking + +Start with the [Benchmarking overview](https://nvidia.github.io/daqiri/benchmarks/benchmarks/) to choose between Linux sockets, RoCE/RDMA, and raw Ethernet. + +For Spark-style on-wire tests, use the same client/server namespace shape for Linux sockets and RDMA/RoCE: put the client-facing NIC in one namespace, the server-facing NIC in another, pin routes and neighbors to those interfaces, then verify `tx_packets_phy` on the client and `rx_packets_phy` on the server before trusting bandwidth numbers. + +```bash +# Linux TCP/UDP sockets, split by namespace +ip netns exec dq_wire_server ./build/examples/daqiri_bench_socket \ + /tmp/socket-server.yaml --seconds 10 --mode server & +ip netns exec dq_wire_client ./build/examples/daqiri_bench_socket \ + /tmp/socket-client.yaml --seconds 10 --mode client +wait + +# RoCE/RDMA, using the same namespace pair +ip netns exec dq_wire_server ./build/examples/daqiri_bench_rdma \ + /tmp/rdma-server.yaml --seconds 10 --mode server & +ip netns exec dq_wire_client ./build/examples/daqiri_bench_rdma \ + /tmp/rdma-client.yaml --seconds 10 --mode client +wait +``` + +See [Socket and RDMA Benchmarking](https://nvidia.github.io/daqiri/benchmarks/socket_benchmarking/) for the full namespace setup and YAML templates. See [Raw Ethernet Benchmarking](https://nvidia.github.io/daqiri/benchmarks/raw_benchmarking/) for DPDK/raw Ethernet loopback tests. 
+ ## Documentation Reference material for the DAQIRI codebase: @@ -98,7 +122,9 @@ Reference material for the DAQIRI codebase: Step-by-step walkthroughs to get hands-on: - [System Configuration](https://nvidia.github.io/daqiri/tutorials/system_configuration/) — NIC drivers, link layers, GPUDirect, hugepages, CPU isolation, GPU clocks -- [Benchmarking Examples](https://nvidia.github.io/daqiri/tutorials/benchmarking_examples/) — run `daqiri_bench_raw_gpudirect` with a loopback test +- [Benchmarking Overview](https://nvidia.github.io/daqiri/benchmarks/benchmarks/) — choose between Linux sockets, RoCE/RDMA, and raw Ethernet benchmarks +- [Socket and RDMA Benchmarking](https://nvidia.github.io/daqiri/benchmarks/socket_benchmarking/) — run TCP/UDP sockets and RoCE/RDMA with matching namespace isolation +- [Raw Ethernet Benchmarking](https://nvidia.github.io/daqiri/benchmarks/raw_benchmarking/) — run `daqiri_bench_raw_gpudirect` with a physical loopback test - [Understanding the Configuration File](https://nvidia.github.io/daqiri/tutorials/configuration-walkthrough/) — annotated YAML walkthrough ## License diff --git a/docs/benchmarks/benchmarks.md b/docs/benchmarks/benchmarks.md new file mode 100644 index 0000000..c8dc400 --- /dev/null +++ b/docs/benchmarks/benchmarks.md @@ -0,0 +1,49 @@ +# Benchmarking + +DAQIRI ships with several backends to handle different types of incoming and outgoing streams. Choosing the stream type depends on the type of sensor being used and its capabilities. The `stream_type` is decided from the decision tree below: + +![DAQIRI networking backend decision tree](../images/backend-decision-tree.svg) + +## Choose a backend + +| Use case | DAQIRI config | Benchmark | Start here | +|---|---|---|---| +| Ingest from or egress to a programmable PCIe sensor, such as an FPGA on the PCIe bus. | `stream_type: "pcie"` | Coming soon | PCIe benchmarking docs are coming soon. 
| +| Compare against normal Linux networking, run on a non-NVIDIA NIC, or test a peer that speaks TCP/UDP sockets. | `stream_type: "socket"` with `protocol: "tcp"` or `protocol: "udp"` | `daqiri_bench_socket` | [Socket and RDMA Benchmarking](socket_benchmarking.md) | +| Test a peer that already implements RDMA verbs over RoCE. | `stream_type: "socket"` with `protocol: "roce"` | `daqiri_bench_rdma` | [Socket and RDMA Benchmarking](socket_benchmarking.md#run-the-rdma-roce-benchmark) | +| Drive raw Ethernet packets directly from an NVIDIA NIC under DAQIRI control. | `stream_type: "raw"` | `daqiri_bench_raw_gpudirect` and the other `raw_*` benches | [Raw Ethernet Benchmarking](raw_benchmarking.md) | + +!!! note "PCIe backend status" + + The PCIe programmable-sensor path is under development. Once completed, it will allow third-party PCIe devices + to read from and write to the GPU's BAR1 memory. + +!!! note "Why RDMA is listed under socket" + + The RoCE benchmark uses the connection-oriented socket/RDMA configuration model. The executable is named `daqiri_bench_rdma` to show the RDMA-specific API calls. + +## Common benchmark workflow + +1. Build the examples with the backend you plan to test. The default container build enables all three: + + ```bash + BASE_TARGET=dpdk DAQIRI_MGR="dpdk socket rdma" scripts/build-container.sh + ``` + +2. Pick the physical pair or host pair that should carry the traffic. For same-host Spark wire tests, prefer a client namespace and a server namespace so the route cannot silently fall back to loopback. + +3. Prove the direction with hardware counters before trusting bandwidth numbers. For one-way client-to-server tests, the important counters are the client-side `tx_packets_phy` / `tx_bytes_phy` and the server-side `rx_packets_phy` / `rx_bytes_phy`. + +4. Run the DAQIRI benchmark and a known baseline such as `iperf3` or `ib_send_bw` with the same namespace, interface, and message-size assumptions. + +5. 
Monitor line rate with NIC counters or `mlnx_perf`; application-side byte counts are useful, but hardware counters answer whether packets actually reached the physical path. + +## Page map + +- [Socket and RDMA Benchmarking](socket_benchmarking.md) covers Linux TCP/UDP and RoCE/RDMA runs with matching client/server namespace setup. +- [Raw Ethernet Benchmarking](raw_benchmarking.md) covers the DPDK/raw Ethernet examples, hugepage sizing, physical loopback configuration, and raw benchmark troubleshooting. +- [Understanding the Configuration File](../tutorials/configuration-walkthrough.md) explains the YAML fields once you have selected the backend and example config. + +--- +**Previous:** [System Configuration](../tutorials/system_configuration.md)
+**Next:** [Socket and RDMA Benchmarking](socket_benchmarking.md) diff --git a/docs/tutorials/benchmarking_examples.md b/docs/benchmarks/raw_benchmarking.md similarity index 93% rename from docs/tutorials/benchmarking_examples.md rename to docs/benchmarks/raw_benchmarking.md index 1cb98fe..1eb415c 100644 --- a/docs/tutorials/benchmarking_examples.md +++ b/docs/benchmarks/raw_benchmarking.md @@ -1,19 +1,14 @@ ---- -hide: - - navigation ---- - -# Benchmarking Examples +# Raw Ethernet Benchmarking -DAQIRI provides a benchmarking application named `daqiri_bench_raw_gpudirect` that can be used to test the performance of the networking configuration. In this section, we'll walk you through the steps needed to configure the application for your NIC for Tx and Rx, and run a loopback test between the two interfaces with a [physical SFP cable](https://www.nvidia.com/en-us/networking/interconnect/) connecting them. +DAQIRI provides raw Ethernet benchmark applications that use DPDK to drive an NVIDIA NIC directly. This page walks through `daqiri_bench_raw_gpudirect`, the TX/RX loopback config, and the raw Ethernet checks needed before interpreting throughput results. Make sure to [build the DAQIRI library](../getting-started.md#build-the-daqiri-library) beforehand. -**Not sure which YAML to start from?** See [Choosing an example config](configuration-walkthrough.md#choosing-an-example-config) in the configuration tutorial — a use-case-driven decision tree from "I just want to verify the build" through reorder, recording, RDMA, and sockets. +**Not sure which backend to benchmark?** Start with the [Benchmarking overview](benchmarks.md). Use this page after you have chosen the raw Ethernet backend. Use [Socket and RDMA Benchmarking](socket_benchmarking.md) for TCP, UDP, and RoCE/RDMA runs. !!! note "Prerequisites" - Before running the benchmarking application, ensure your system has been fully configured per the [System Configuration](system_configuration.md) page. 
+ Before running the benchmarking application, ensure your system has been fully configured per the [System Configuration](../tutorials/system_configuration.md) page. ## Configure hugepages first @@ -23,11 +18,11 @@ Size the hugepage pool to your YAML's `memory_regions` plus DPDK overhead before grep Huge /proc/meminfo ``` -For a persistent allocation across reboots, use the grub recipe in [Step 4 of System Configuration](system_configuration.md#step-4-enable-huge-pages). +For a persistent allocation across reboots, use the grub recipe in [Step 4 of System Configuration](../tutorials/system_configuration.md#step-4-enable-huge-pages). ## Running the DAQIRI container -If you built DAQIRI using the container approach, use the following command to launch the container with Raw Ethernet (DPDK) and GPU support. The host system must be fully configured (see [System Configuration](system_configuration.md)) before the container can access the NIC and GPU hardware. +If you built DAQIRI using the container approach, use the following command to launch the container with Raw Ethernet (DPDK) and GPU support. The host system must be fully configured (see [System Configuration](../tutorials/system_configuration.md)) before the container can access the NIC and GPU hardware. ```bash docker run --rm -it --privileged \ @@ -41,7 +36,7 @@ docker run --rm -it --privileged \ | Flag | Purpose | |------|---------| - | `--privileged` | DPDK requires raw access to NIC hardware (PCI devices, hugepage files). Also covers `/dev/infiniband` for RDMA. | + | `--privileged` | DPDK requires raw access to NIC hardware, PCI devices, and hugepage files. 
| | `--runtime=nvidia` | Makes the host GPU visible inside the container via the NVIDIA Container Toolkit | | `--network=host` | Shares the host network namespace so DPDK can discover the physical NIC interfaces and their PCIe topology | | `-v /dev/hugepages:/dev/hugepages` | Mounts the hugepage filesystem for DPDK memory allocation (`--privileged` alone does not cover mounted filesystems) | @@ -50,7 +45,7 @@ docker run --rm -it --privileged \ !!! tip "DGX Spark" - For systems configured per the [DGX Spark profile](system_configuration.md#dgx-spark-profile), use these configs to skip the PCIe/IP/CPU-core edits below: + For systems configured per the [DGX Spark profile](../tutorials/system_configuration.md#dgx-spark-profile), use these configs to skip the PCIe/IP/CPU-core edits below: - [`daqiri_bench_raw_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx_spark.yaml) for `daqiri_bench_raw_gpudirect` — still set `eth_dst_addr` to the RX MAC. The rx_port is `0002:01:00.1` (physical port p1), so read its MAC: `cat /sys/class/net/enP2p1s0f1np1/address`. This p0-to-p1 pairing is intentional for an over-the-wire single-machine loopback; using two PFs that map to the same physical port exercises the on-chip eswitch path instead. - [`daqiri_bench_rdma_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_rdma_tx_rx_spark.yaml) for `daqiri_bench_rdma` — no further edits needed. @@ -62,7 +57,7 @@ The benchmark executables and example YAML configurations are located at: | **Container** | `/opt/daqiri/bin/` | `/opt/daqiri/bin/` | | **From source** | `./build/examples/` | `./examples/` | -The fields in the YAML configs will be explained in more detail in [Understanding the Configuration File](configuration-walkthrough.md). For now, we'll stick to modifying the strict minimum required fields to run the application as-is on your system. 
+The fields in the YAML configs will be explained in more detail in [Understanding the Configuration File](../tutorials/configuration-walkthrough.md). For now, we'll stick to modifying the strict minimum required fields to run the application as-is on your system. ##### Identify your NIC's PCIe addresses @@ -426,7 +421,7 @@ The `*_packets_phy` and `*_bytes_phy` counters are physical-link counters. They [critical] [adv_network_dpdk_mgr.cpp:430] Failed to map MRs ``` - Check the [GPUDirect setup](system_configuration.md#enable-gpudirect) for your + Check the [GPUDirect setup](../tutorials/system_configuration.md#enable-gpudirect) for your deployment. Some host builds use `nvidia-peermem`; the container path uses dma-buf support from the patched DPDK build. @@ -450,13 +445,13 @@ The `*_packets_phy` and `*_bytes_phy` counters are physical-link counters. They EAL: x hugepages of size x reserved, no mounted hugetlbfs found for that size ``` - Ensure your [hugepages are mounted](system_configuration.md#step-4-enable-huge-pages). + Ensure your [hugepages are mounted](../tutorials/system_configuration.md#step-4-enable-huge-pages). ```log EAL: No free x kB hugepages reported on node 0 ``` - Reachable only when the in-process preflight is bypassed (e.g. running an older binary against a host with hugepages reserved but not mounted). Mount per [System Configuration: Step 4](system_configuration.md#step-4-enable-huge-pages) and re-run. + Reachable only when the in-process preflight is bypassed (e.g. running an older binary against a host with hugepages reserved but not mounted). Mount per [System Configuration: Step 4](../tutorials/system_configuration.md#step-4-enable-huge-pages) and re-run. ??? failure "Stale `map_*` files in /dev/hugepages after a SIGKILL" @@ -478,5 +473,5 @@ The `*_packets_phy` and `*_bytes_phy` counters are physical-link counters. They You might need to kill some of the listed processes to free up GPU VRAM. 
--- -**Previous:** [System Configuration](system_configuration.md) -**Next:** [Understanding the Configuration File](configuration-walkthrough.md) — deep dive into the YAML parameters +**Previous:** [Benchmarking](benchmarks.md)
+**Next:** [Understanding the Configuration File](../tutorials/configuration-walkthrough.md) — deep dive into the YAML parameters diff --git a/docs/benchmarks/socket_benchmarking.md b/docs/benchmarks/socket_benchmarking.md new file mode 100644 index 0000000..eb6731c --- /dev/null +++ b/docs/benchmarks/socket_benchmarking.md @@ -0,0 +1,369 @@ +# Socket and RDMA Benchmarking + +Use this page when the peer protocol is TCP, UDP, or RoCE/RDMA. These benchmarks use the Linux networking stack for TCP/UDP and RDMA verbs for RoCE, so the same client/server namespace shape is useful for proving that traffic leaves the host through the expected NIC path. + +Make sure to [build DAQIRI](../getting-started.md#build-the-daqiri-library) with the socket and RDMA backends first. + +## Backend choices + +| Protocol | YAML selector | Benchmark executable | Typical reason to use it | +|---|---|---|---| +| TCP | `stream_type: "socket"`, `protocol: "tcp"` | `daqiri_bench_socket` | Baseline against normal Linux streams or test a TCP-speaking peer. | +| UDP | `stream_type: "socket"`, `protocol: "udp"` | `daqiri_bench_socket` | Datagram baseline against Linux networking. UDP payloads must be at most `65507` bytes. | +| RoCE/RDMA | `stream_type: "socket"`, `protocol: "roce"` | `daqiri_bench_rdma` | Compare DAQIRI RDMA verbs against tools such as `ib_send_bw` or `ib_write_bw`. | + +## Build and launch a test shell + +Build the socket and RDMA benchmarks inside the DAQIRI container: + +```bash +docker run --rm --privileged --network=host --gpus all --ipc=host \ + --user "$(id -u):$(id -g)" \ + -v /dev/hugepages:/dev/hugepages \ + -v "$PWD:/work" \ + -w /work daqiri:local \ + bash -lc 'cmake -S . -B build-socket-rdma \ + -DBUILD_SHARED_LIBS=ON \ + -DDAQIRI_BUILD_PYTHON=OFF \ + -DDAQIRI_MGR="dpdk socket rdma" && + cmake --build build-socket-rdma \ + --target daqiri_bench_socket daqiri_bench_rdma -j"$(nproc)"' +``` + +Run the benchmark setup commands as root. 
The easiest path is a privileged, host-networked DAQIRI container: + +```bash +docker run --rm -it --privileged --network=host --pid=host --ipc=host \ + --gpus all \ + -v "$PWD:/work" \ + -v /tmp:/tmp \ + -w /work daqiri:local bash +``` + +Install network tools inside the container if needed: + +```bash +apt-get update +apt-get install -y iproute2 iputils-ping ethtool iperf3 rdma-core ibverbs-utils +``` + +## Create isolated namespaces + +Choose one transmit-facing interface and one receive-facing interface. The example below uses the Spark pair that was verified to increment physical counters on the tested system; adjust names, IPs, and MAC addresses on other machines. + +```bash +CLIENT_NS=dq_wire_client +SERVER_NS=dq_wire_server + +CLIENT_IF=enp1s0f0np0 +SERVER_IF=enp1s0f1np1 + +CLIENT_IP=10.250.0.1 +SERVER_IP=10.250.0.2 + +CLIENT_MAC=4c:bb:47:2a:ea:ed +SERVER_MAC=4c:bb:47:2a:ea:ee + +MTU=9082 +``` + +Create namespaces and pin routes to the physical interfaces: + +```bash +ip netns delete "$CLIENT_NS" >/dev/null 2>&1 || true +ip netns delete "$SERVER_NS" >/dev/null 2>&1 || true + +ip addr flush dev "$CLIENT_IF" || true +ip addr flush dev "$SERVER_IF" || true + +ip netns add "$CLIENT_NS" +ip netns add "$SERVER_NS" + +ip link set "$CLIENT_IF" netns "$CLIENT_NS" +ip link set "$SERVER_IF" netns "$SERVER_NS" + +ip -n "$CLIENT_NS" addr add "$CLIENT_IP/24" dev "$CLIENT_IF" +ip -n "$SERVER_NS" addr add "$SERVER_IP/24" dev "$SERVER_IF" + +ip -n "$CLIENT_NS" link set lo up +ip -n "$SERVER_NS" link set lo up +ip -n "$CLIENT_NS" link set "$CLIENT_IF" mtu "$MTU" up +ip -n "$SERVER_NS" link set "$SERVER_IF" mtu "$MTU" up + +ip -n "$CLIENT_NS" route add "$SERVER_IP/32" dev "$CLIENT_IF" +ip -n "$SERVER_NS" route add "$CLIENT_IP/32" dev "$SERVER_IF" + +ip -n "$CLIENT_NS" neigh replace "$SERVER_IP" \ + lladdr "$SERVER_MAC" dev "$CLIENT_IF" nud permanent +ip -n "$SERVER_NS" neigh replace "$CLIENT_IP" \ + lladdr "$CLIENT_MAC" dev "$SERVER_IF" nud permanent +``` + +Verify the route 
and a short control packet: + +```bash +ip -n "$CLIENT_NS" route get "$SERVER_IP" from "$CLIENT_IP" +ip -n "$SERVER_NS" route get "$CLIENT_IP" from "$SERVER_IP" +ip netns exec "$CLIENT_NS" ping -c 1 -W 1 "$SERVER_IP" +``` + +The route output should name the namespace interface, not `lo`. + +!!! note "RDMA device visibility" + + On most RoCE setups, the RDMA device follows the netdev/GID association used by the namespace. If `ibv_devinfo` or `rdma link show` inside a namespace cannot see the expected device, move the matching RDMA device into the namespace with `rdma dev set <rdma_dev> netns <namespace>` (this requires the RDMA subsystem to be in exclusive mode: `rdma system set netns exclusive`), or run the RDMA benchmark in the host namespace and still verify the same physical counters. + +## Prove the pair hits the wire + +Capture directional PHY counters before and after a short transfer. For one-way client-to-server traffic: + +```bash +ip netns exec "$CLIENT_NS" ethtool -S "$CLIENT_IF" | \ + grep -E 'tx_packets_phy|tx_bytes_phy|tx_vport_unicast' +ip netns exec "$SERVER_NS" ethtool -S "$SERVER_IF" | \ + grep -E 'rx_packets_phy|rx_bytes_phy|rx_vport_unicast' +``` + +Use `iperf3` as a quick proof before running DAQIRI: + +```bash +ip netns exec "$SERVER_NS" iperf3 -s -B "$SERVER_IP" -1 & +sleep 1 +ip netns exec "$CLIENT_NS" iperf3 -c "$SERVER_IP" -B "$CLIENT_IP" -t 2 -P 1 +wait +``` + +Then check the counters again. Treat the result as on-wire only when the client `tx_packets_phy` and server `rx_packets_phy` counters increase by matching packet counts. If only vport counters move, pick a different port pair. + +## Run the Linux socket benchmark + +The shipped configs run both endpoints on `127.0.0.1` and are useful for a smoke test: + +```bash +./build-socket-rdma/examples/daqiri_bench_socket \ + examples/daqiri_bench_socket_udp_tx_rx.yaml \ + --seconds 10 --mode both + +./build-socket-rdma/examples/daqiri_bench_socket \ + examples/daqiri_bench_socket_tcp_tx_rx.yaml \ + --seconds 10 --mode both +``` + +For an on-wire namespace test, use separate server and client YAML files. 
The important fields are the protocol, namespace IPs, server port, `max_payload_size`, memory-region `buf_size`, and benchmark `message_size`. + +Server-side UDP template: + +```yaml +%YAML 1.2 +--- +daqiri: + cfg: + version: 1 + stream_type: "socket" + protocol: "udp" + master_core: 3 + debug: false + log_level: "info" + memory_regions: + - name: "DATA_SOCKET_SERVER" + kind: "host" + affinity: 0 + num_bufs: 1024 + buf_size: 65507 + interfaces: + - name: udp_server + address: 10.250.0.2 + socket_config: + mode: server + local_ip: 10.250.0.2 + local_port: 5021 + max_payload_size: 65535 + rx: + queues: + - name: "RX_Queue" + id: 0 + cpu_core: 8 + batch_size: 1 + memory_regions: ["DATA_SOCKET_SERVER"] + tx: + queues: + - name: "TX_Queue" + id: 0 + cpu_core: 7 + batch_size: 1 + memory_regions: ["DATA_SOCKET_SERVER"] + +socket_bench_server: + server: true + send: false + receive: true + iterations: 1000000000 + message_size: 65507 + server_address: 10.250.0.2 + client_address: 10.250.0.1 + server_port: 5021 +``` + +Client-side UDP template: + +```yaml +%YAML 1.2 +--- +daqiri: + cfg: + version: 1 + stream_type: "socket" + protocol: "udp" + master_core: 3 + debug: false + log_level: "info" + memory_regions: + - name: "DATA_SOCKET_CLIENT" + kind: "host" + affinity: 0 + num_bufs: 1024 + buf_size: 65507 + interfaces: + - name: udp_client + address: 10.250.0.1 + socket_config: + mode: client + local_ip: 10.250.0.1 + local_port: 5121 + remote_ip: 10.250.0.2 + remote_port: 5021 + max_payload_size: 65535 + rx: + queues: + - name: "RX_Queue" + id: 0 + cpu_core: 8 + batch_size: 1 + memory_regions: ["DATA_SOCKET_CLIENT"] + tx: + queues: + - name: "TX_Queue" + id: 0 + cpu_core: 7 + batch_size: 1 + memory_regions: ["DATA_SOCKET_CLIENT"] + +socket_bench_client: + server: false + send: true + receive: false + iterations: 1000000000 + message_size: 65507 + server_address: 10.250.0.2 + client_address: 10.250.0.1 + server_port: 5021 +``` + +For TCP, change `protocol: "udp"` to `protocol: 
"tcp"` in both files. For UDP, keep `message_size` at or below `65507`. + +Run the server and client in their namespaces: + +```bash +export LD_LIBRARY_PATH=/work/build-socket-rdma/src:${LD_LIBRARY_PATH:-} +BIN=/work/build-socket-rdma/examples/daqiri_bench_socket + +ip netns exec "$SERVER_NS" env LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \ + "$BIN" /tmp/socket-server.yaml --seconds 11 --mode server & + +sleep 1 + +ip netns exec "$CLIENT_NS" env LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \ + "$BIN" /tmp/socket-client.yaml --seconds 10 --mode client + +wait +``` + +For a four-process run, create four server/client YAML pairs with unique server ports such as `5021`, `5022`, `5023`, and `5024`, and unique client local ports such as `5121`, `5122`, `5123`, and `5124`. + +## Run the RDMA RoCE benchmark + +Start from `examples/daqiri_bench_rdma_tx_rx.yaml` or `examples/daqiri_bench_rdma_tx_rx_spark.yaml`. The full config can run both endpoints in one process: + +```bash +./build-socket-rdma/examples/daqiri_bench_rdma \ + examples/daqiri_bench_rdma_tx_rx_spark.yaml \ + --seconds 10 --mode both +``` + +For namespace testing, split the file by role just as in the Linux socket test: + +- The server YAML keeps the server memory regions, the server interface with `socket_config.mode: server`, and `rdma_bench_server`. +- The client YAML keeps the client memory regions, the client interface with `socket_config.mode: client`, and `rdma_bench_client`. +- Both files use `stream_type: "socket"` and `protocol: "roce"`. +- `rdma_bench_client.client_address` should be the client namespace IP. 
+ +Run the split RDMA test with the same namespace pair: + +```bash +export LD_LIBRARY_PATH=/work/build-socket-rdma/src:${LD_LIBRARY_PATH:-} +BIN=/work/build-socket-rdma/examples/daqiri_bench_rdma + +ip netns exec "$SERVER_NS" env LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \ + "$BIN" /tmp/rdma-server.yaml --seconds 11 --mode server & + +sleep 1 + +ip netns exec "$CLIENT_NS" env LD_LIBRARY_PATH="$LD_LIBRARY_PATH" \ + "$BIN" /tmp/rdma-client.yaml --seconds 10 --mode client + +wait +``` + +Use `ib_send_bw` or `ib_write_bw` in the same namespaces as a comparison baseline, and monitor `mlnx_perf` or `ethtool -S` on the same directional interfaces. + +## Example Spark socket results + +The following DAQIRI socket matrix was run on the verified physical path `enp1s0f0np0 -> enp1s0f1np1` with four client/server process pairs: + +| Protocol | Message size | App TX | App RX | Loss | Client `tx_packets_phy` | Server `rx_packets_phy` | +|---|---:|---:|---:|---:|---:|---:| +| TCP | 1000 | 10.93 Gb/s | 10.93 Gb/s | 0.00% | 1,513,047 | 1,513,047 | +| TCP | 8000 | 11.20 Gb/s | 11.20 Gb/s | 0.00% | 1,550,052 | 1,550,052 | +| TCP | 1 MiB | 11.67 Gb/s | 11.67 Gb/s | 0.00% | 1,615,399 | 1,615,399 | +| UDP | 1000 | 12.28 Gb/s | 11.68 Gb/s | 4.88% | 15,350,463 | 15,350,463 | +| UDP | 8000 | 12.93 Gb/s | 10.10 Gb/s | 21.91% | 2,020,461 | 2,020,461 | +| UDP | 65507 | 12.84 Gb/s | 12.41 Gb/s | 3.34% | 1,960,392 | 1,960,392 | + +UDP 1 MiB is intentionally skipped because a single IPv4 UDP datagram carries at most `65507` bytes of payload; larger messages would require application-level segmentation, which is outside the benchmark's supported payload model. + +## Restore host networking + +After tests, move interfaces back to the host and restore the usual IPs.
Adjust names and addresses for the target machine: + +```bash +for ns in "$CLIENT_NS" "$SERVER_NS"; do + ip netns exec "$ns" ip link set "$CLIENT_IF" netns 1 >/dev/null 2>&1 || true + ip netns exec "$ns" ip link set "$SERVER_IF" netns 1 >/dev/null 2>&1 || true +done + +ip netns delete "$CLIENT_NS" >/dev/null 2>&1 || true +ip netns delete "$SERVER_NS" >/dev/null 2>&1 || true + +for ifc in enp1s0f0np0 enp1s0f1np1 enP2p1s0f0np0 enP2p1s0f1np1; do + ip addr flush dev "$ifc" >/dev/null 2>&1 || true + ip link set dev "$ifc" mtu 9082 up >/dev/null 2>&1 || true +done +``` + +## Loopback disable knobs + +If namespace isolation still increments only vport counters, check whether the platform exposes loopback control: + +```bash +ethtool --show-priv-flags <iface> +ethtool --set-priv-flags <iface> local_lb off + +mlxconfig -d <device> q | grep FORCE_LOOPBACK_DISABLE +mlxconfig -d <device> set FORCE_LOOPBACK_DISABLE=1 +``` + +Treat firmware settings as maintenance-window changes: query first, set only with the proper Mellanox tooling available, then reset or reboot as required and rerun the same `rx_packets_phy` proof. + +--- +**Previous:** [Benchmarking](benchmarks.md)
+**Next:** [Raw Ethernet Benchmarking](raw_benchmarking.md) diff --git a/docs/concepts.md b/docs/concepts.md index ba1b46d..c9ca280 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -22,6 +22,11 @@ choice is configured per-application in YAML by two keys: - `protocol` — required when `stream_type: "socket"`; selects the socket-level protocol. +The shipped Ethernet stream types use NICs as their hardware endpoint. +The planned PCIe programmable-sensor path uses the same DAQIRI model for +devices that sit directly on the PCIe bus, such as FPGAs, frame grabbers, +or custom acquisition cards. + ### Raw Ethernet *YAML:* `stream_type: "raw"`. @@ -57,8 +62,11 @@ Requires an NVIDIA SmartNIC (ConnectX-6 Dx or later). *YAML:* `stream_type: "pcie"`. -Placeholder for an upcoming direct-PCIe stream type. Not implemented -yet. +Coming-soon path for sensors that appear directly on the PCIe bus, such +as FPGAs, frame grabbers, or custom acquisition cards. The goal is to +move data into or out of CPU or NVIDIA GPU memory through the same +DAQIRI C++/Python API while avoiding unnecessary copies. This stream +type does not currently ship with a runnable benchmark or example YAML. ### Choosing a stream type @@ -87,6 +95,7 @@ in the configuration walkthrough. - **Socket — RoCE** (`stream_type: "socket"`, `protocol: "roce"`) is supported and distributed; integration testing is under development. + - The **PCIe programmable-sensor** path is under development. ## GPUDirect diff --git a/docs/getting-started.md b/docs/getting-started.md index a752f18..cd61b5a 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -201,5 +201,5 @@ Once DAQIRI is built, follow the tutorials to configure your system and run your 1. [**Concepts**](concepts.md) — terminology (stream types and protocols, packet, burst, segment, flow, queue, memory region), GPUDirect, and zero-copy ownership. Keep this open in a second tab. 2. 
[**API Guide**](api-reference/index.md) — the six-step DAQIRI application lifecycle and configuration-first model 3. [**System Configuration**](tutorials/system_configuration.md) — NIC drivers, link layers, GPUDirect, hugepages, CPU isolation, GPU clocks, and more -4. [**Benchmarking Examples**](tutorials/benchmarking_examples.md) — run `daqiri_bench_raw_gpudirect` with a loopback test +4. [**Benchmarking**](benchmarks/benchmarks.md) — choose a backend, then run socket/RDMA or raw Ethernet benchmarks 5. [**Understanding the Configuration File**](tutorials/configuration-walkthrough.md) — annotated YAML walkthrough diff --git a/docs/images/architecture.svg b/docs/images/architecture.svg index b336eec..c0a9272 100644 --- a/docs/images/architecture.svg +++ b/docs/images/architecture.svg @@ -42,7 +42,7 @@ INGEST - NIC + NIC/PCIe GPUDirect RDMA diff --git a/docs/images/backend-decision-tree.svg b/docs/images/backend-decision-tree.svg new file mode 100644 index 0000000..25c1db3 --- /dev/null +++ b/docs/images/backend-decision-tree.svg @@ -0,0 +1,80 @@ + + DAQIRI networking backend decision tree + Choose the coming-soon PCIe path for programmable PCIe sensors, socket TCP or UDP for non-NVIDIA NICs, socket RoCE for NVIDIA NICs talking to an RDMA endpoint, or raw Ethernet for NVIDIA NICs without an existing RoCE endpoint. + + + + + + + + + + + + + + DAQIRI Networking Backend Decision Tree + Choose the backend that matches the local sensor, NIC, and peer protocol. + + + + + + + + + + + + + + + + + + + + PCIe Programmable + Sensor (FPGA, etc)? + + NVIDIA NIC? + + Existing endpoint + implementing RoCE? 
+ + + + + Yes + No + No + Yes + Yes + No + + + + + + stream_type = pcie + coming soon + + + + + stream_type = socket, + protocol = tcp/udp + + + + + stream_type = socket, + protocol = roce + + + + + stream_type = raw + + diff --git a/docs/images/daqiri-landing-graphic.svg b/docs/images/daqiri-landing-graphic.svg new file mode 100644 index 0000000..26967e7 --- /dev/null +++ b/docs/images/daqiri-landing-graphic.svg @@ -0,0 +1,256 @@ + + DAQIRI sensor data paths to CPU, NVIDIA GPU memory, and storage + A diagram showing sensor data entering or leaving CPU memory or NVIDIA GPU memory through DAQIRI from a PCIe FPGA path or a network-capable sensor path through a NIC, then GPU-resident data writing out through GPUDirect Storage. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Sensor streams meet CPU or GPU memory + DAQIRI carries PCIe and network-capable streams into memory, then GPU data out to storage + + + + + + + + + FPGA + + + + + + + + PCIe device + FPGA / frame grabber + sensor front-end + + PCIe BAR + + + + + + + + + + + + + + + Network-capable + sensor + + Raw Ethernet + UDP/TCP + + RoCE + + + + + + NIC + + + + + + + + + + RX/TX + + + + + + + DAQIRI + C++/Python Library + + + + RX/TX bursts + + flow steering + + zero-copy API + + GDS writes + + + + + + Zero-copy to GPU or CPU + + + + + GPU + NVIDIA + + GPU + memory + + + + + + + + + + + + GPUDirect + Storage + + + + + + CPU + + CPU + memory + + + + + + + + PCIe + + + + + + + + + + + + + + + + + + + GPU + CPU + + + + + + + + + + + + + + + + + + + + + one application, sensor I/O plus GPU storage writes + PCIe or network sensors into CPU/GPU memory, then GPU data out to storage + + diff --git a/docs/index.html b/docs/index.html index f5b49e8..20edd91 100644 --- a/docs/index.html +++ b/docs/index.html @@ -31,6 +31,7 @@ pre .str { color:#c3e88d; } pre .nm { color:var(--nv-green-l); } .container { max-width:1200px; margin:0 auto; 
padding:0 2rem; } + #hero .container { max-width:1360px; } section { padding:6rem 0; } .section-label { font-size:.72rem; font-weight:700; letter-spacing:.15em; text-transform:uppercase; color:var(--nv-green); margin-bottom:.75rem; } .section-title { color:var(--text-pri); margin-bottom:1rem; } @@ -76,26 +77,45 @@ .btn-outline:hover { color:var(--text-pri); border-color:#444; background:rgba(255,255,255,.04); } /* HERO */ - #hero { min-height:100vh; display:flex; align-items:center; padding-top:var(--nav-h); position:relative; overflow:hidden; } - .hero-inner { display:grid; grid-template-columns:1fr auto; gap:4rem; align-items:center; } - .hero-logo-wrap { display:flex; align-items:center; justify-content:center; flex-shrink:0; } - .hero-logo-wrap img { width:320px; max-width:38vw; filter:drop-shadow(0 0 40px rgba(118,185,0,0.18)); } + #hero { min-height:calc(100vh - 24px); display:flex; align-items:center; padding:calc(var(--nav-h) + 3rem) 0 4rem; position:relative; overflow:hidden; } + .hero-inner { display:grid; grid-template-columns:minmax(0,1fr) minmax(460px,700px); column-gap:2rem; row-gap:2rem; align-items:center; } + .hero-visual { position:relative; z-index:1; grid-column:2; grid-row:1 / span 3; display:flex; align-items:center; justify-content:center; } + .hero-visual::before { content:''; position:absolute; inset:-5% -8% -2%; background:radial-gradient(ellipse,rgba(118,185,0,.28) 0%,transparent 68%); filter:blur(20px); z-index:-1; } + .hero-visual-link { display:block; width:min(100%,700px); padding:.45rem; border-radius:20px; appearance:none; background:linear-gradient(135deg,rgba(118,185,0,.34),rgba(93,216,242,.18)); border:1px solid rgba(160,208,0,.38); box-shadow:0 24px 70px rgba(0,0,0,.58),0 0 0 1px rgba(255,255,255,.05) inset; transition:transform var(--ease),border-color var(--ease),box-shadow var(--ease); cursor:zoom-in; } + .hero-visual-link:hover { transform:translateY(-2px); border-color:rgba(160,208,0,.68); box-shadow:0 30px 90px 
rgba(0,0,0,.65),0 0 34px rgba(118,185,0,.16); } + .hero-visual-link:focus-visible { outline:2px solid var(--nv-green); outline-offset:4px; } + .hero-visual img { width:100%; display:block; border-radius:16px; filter:contrast(1.08) brightness(1.08) saturate(1.12) drop-shadow(0 0 44px rgba(118,185,0,0.18)); } .hero-grid { position:absolute; inset:0; z-index:0; background-image:linear-gradient(rgba(118,185,0,.04) 1px,transparent 1px),linear-gradient(90deg,rgba(118,185,0,.04) 1px,transparent 1px); background-size:60px 60px; mask-image:radial-gradient(ellipse 80% 60% at 50% 40%,black 30%,transparent 100%); } .hero-glow { position:absolute; top:-20%; left:50%; transform:translateX(-50%); width:800px; height:500px; background:radial-gradient(ellipse,rgba(118,185,0,.11) 0%,transparent 70%); pointer-events:none; z-index:0; } - .hero-content { position:relative; z-index:1; max-width:780px; } + .hero-content { position:relative; z-index:1; grid-column:1; max-width:760px; } .hero-eyebrow { display:inline-flex; align-items:center; gap:.5rem; font-size:.72rem; font-weight:700; letter-spacing:.15em; text-transform:uppercase; color:var(--nv-green); background:rgba(118,185,0,.08); border:1px solid rgba(118,185,0,.2); border-radius:99px; padding:.35rem 1rem; margin-bottom:2rem; } .hero-eyebrow::before { content:''; width:6px; height:6px; border-radius:50%; background:var(--nv-green); animation:pulse 2s ease-in-out infinite; } @keyframes pulse { 0%,100%{opacity:1;transform:scale(1)}50%{opacity:.4;transform:scale(.8)} } - .hero-title { margin-bottom:1.5rem; } + .hero-title { font-size:clamp(1.85rem,2.8vw,2.35rem); margin-bottom:1.5rem; } .hero-title .hi { color:var(--nv-green); } - .hero-desc { font-size:1.15rem; color:var(--text-mut); max-width:620px; margin-bottom:2rem; line-height:1.75; } - .hero-actions { display:flex; align-items:center; gap:1rem; flex-wrap:wrap; margin-bottom:3.5rem; } - .hero-stats { display:flex; gap:3rem; flex-wrap:wrap; border-top:1px solid var(--border); 
padding-top:2.5rem; } - .stat-value { font-size:1.75rem; font-weight:800; color:var(--text-pri); letter-spacing:-.03em; } + .hero-desc { font-size:1.15rem; color:var(--text-mut); max-width:620px; margin-bottom:0; line-height:1.75; } + .hero-actions { position:relative; z-index:1; grid-column:1; display:flex; align-items:center; gap:1rem; flex-wrap:wrap; } + .hero-stats { grid-column:1 / -1; display:flex; gap:3rem; flex-wrap:wrap; border-top:1px solid var(--border); padding-top:2.5rem; } + .stat-value { font-size:1.45rem; font-weight:800; color:var(--text-pri); letter-spacing:-.03em; } .stat-label { font-size:.78rem; color:var(--text-dim); font-weight:500; text-transform:uppercase; letter-spacing:.1em; margin-top:.2rem; } + /* GRAPHIC OVERLAY */ + body.graphic-overlay-open { overflow:hidden; } + .graphic-overlay { position:fixed; inset:calc(var(--nav-h) + 1rem) 1.5rem 1.5rem; z-index:2000; display:grid; place-items:center; opacity:0; visibility:hidden; pointer-events:none; transition:opacity var(--ease),visibility var(--ease); } + .graphic-overlay.is-open { opacity:1; visibility:visible; pointer-events:auto; } + .graphic-overlay-backdrop { position:absolute; inset:0; border-radius:22px; background:rgba(4,7,4,.74); border:1px solid rgba(118,185,0,.22); box-shadow:0 24px 120px rgba(0,0,0,.72); backdrop-filter:blur(6px); } + .graphic-overlay-panel { position:relative; width:min(1120px,calc(100vw - 6rem)); max-height:calc(100vh - var(--nav-h) - 4.5rem); display:flex; flex-direction:column; border-radius:18px; background:#070907; border:1px solid rgba(160,208,0,.5); box-shadow:0 0 0 1px rgba(255,255,255,.05) inset,0 0 48px rgba(118,185,0,.16); overflow:hidden; transform:translateY(8px) scale(.985); transition:transform var(--ease); } + .graphic-overlay.is-open .graphic-overlay-panel { transform:translateY(0) scale(1); } + .graphic-overlay-bar { display:flex; align-items:center; justify-content:space-between; gap:1rem; padding:.85rem 1rem; border-bottom:1px solid 
rgba(118,185,0,.2); background:rgba(17,22,13,.96); } + .graphic-overlay-title { margin:0; color:var(--text-pri); font-size:.95rem; font-weight:800; letter-spacing:0; } + .graphic-overlay-close { width:36px; height:36px; display:inline-flex; align-items:center; justify-content:center; flex-shrink:0; border-radius:8px; border:1px solid rgba(255,255,255,.14); background:rgba(255,255,255,.05); color:var(--text-pri); cursor:pointer; transition:background var(--ease),border-color var(--ease),color var(--ease); } + .graphic-overlay-close:hover { background:rgba(118,185,0,.14); border-color:rgba(160,208,0,.5); color:var(--nv-green-l); } + .graphic-overlay-close:focus-visible { outline:2px solid var(--nv-green); outline-offset:3px; } + .graphic-overlay-close svg { width:18px; height:18px; display:block; } + .graphic-overlay-img { width:100%; height:auto; max-height:calc(100vh - var(--nav-h) - 8.5rem); display:block; object-fit:contain; background:#050705; } + /* WARNING BANNER */ - .warn-banner { background:rgba(255,193,7,.07); border:1px solid rgba(255,193,7,.25); border-radius:var(--radius); padding:.9rem 1.25rem; display:flex; gap:.75rem; align-items:flex-start; margin-bottom:2rem; } + .warn-banner { position:relative; z-index:1; grid-column:1; background:rgba(255,193,7,.07); border:1px solid rgba(255,193,7,.25); border-radius:var(--radius); padding:.9rem 1.25rem; display:flex; gap:.75rem; align-items:flex-start; } .warn-banner p { color:rgba(255,203,107,.85); font-size:.875rem; } /* FEATURES */ @@ -253,9 +273,9 @@ font-size:.85rem; } } - @media (max-width:1000px) { .hero-inner { grid-template-columns:1fr; } .hero-logo-wrap { display:none; } } + @media (max-width:1000px) { #hero .container { max-width:1200px; } .hero-inner { grid-template-columns:1fr; } .hero-content { grid-column:auto; order:1; } .hero-visual { grid-column:auto; grid-row:auto; order:2; margin:.5rem 0; } .warn-banner { grid-column:auto; order:3; } .hero-actions { grid-column:auto; order:4; } .hero-stats 
{ order:5; } .hero-visual-link { width:min(100%,720px); } } @media (max-width:900px) { .gs-layout { grid-template-columns:1fr; } .gs-code-panel { position:static; } .footer-inner { grid-template-columns:1fr 1fr; } } - @media (max-width:640px) { section { padding:4rem 0; } .footer-inner { grid-template-columns:1fr; } .tut-meta { display:none; } .nav-actions .btn-outline { display:none; } } + @media (max-width:640px) { section { padding:4rem 0; } .hero-stats { gap:1.5rem 2rem; } .stat-value { font-size:1.2rem; } .footer-inner { grid-template-columns:1fr; } .tut-meta { display:none; } .nav-actions .btn-outline { display:none; } .graphic-overlay { inset:calc(var(--nav-h) + .5rem) .5rem .5rem; } .graphic-overlay-panel { width:100%; max-height:calc(100vh - var(--nav-h) - 1rem); } .graphic-overlay-img { max-height:calc(100vh - var(--nav-h) - 5rem); } } @@ -269,13 +289,21 @@ News @@ -309,15 +338,21 @@
-
-
NVIDIA Open Source · Data Acquisition
-

DAQIRI — Command the
Data Deluge at the Source

-

- DAQIRI (Data Acquisition for Integrated Real-time Instruments) connects high-bandwidth streaming sensors - directly to the NVIDIA compute ecosystem. By abstracting zero-copy data movement from sensor to GPU, - DAQIRI puts scalable, real-time AI, signal processing, and scientific computing within reach of the next - generation of instruments. -

+
+
NVIDIA Open Source · Data Acquisition
+

DAQIRI for Sensor Data
in CPU or NVIDIA GPU Memory

+

+ DAQIRI (Data Acquisition for Integrated Real-time Instruments) moves high-bandwidth data between external + sensors and CPU or NVIDIA GPU memory. Streams can arrive from PCIe devices such as FPGAs or from network-capable sensors + over Raw Ethernet (UDP/TCP) or RoCE/RDMA, giving applications one zero-copy path for ingest and egress. + GPU-resident data can also be written out through GPUDirect Storage. +

+
+
+ +
⚠️

The library is undergoing large improvements as we aim to better support it as an NVIDIA product. API breakages may be more frequent until version 1.0.

@@ -326,18 +361,16 @@

DAQIRI — Command the
Data Deluge at the Quick Start → Key Concepts API Reference - Examples + Benchmarking

-
Gbps – Tbps+
Sensor Bandwidth
-
Zero-Copy
Sensor → GPU
-
UDP, RoCE
Protocol
C++
Language
Multi-Sensor
Scalable
Minutes
Time to Deployment
Apache 2.0
License
+
PCIe + Ethernet
Sensor Paths
+
Ingest + Egress
Data Direction
+
Zero-Copy
CPU/GPU Memory
+
Raw Ethernet, RoCE
Protocols
+
C++ / Python
Application API
-
- DAQIRI — sensor connected to GPU infrastructure -
-
@@ -664,11 +697,11 @@

Tutorials

02
Bare-Metal CMake Build
End-to-end bare-metal build: verify prerequisites, install RDMA libraries, build patched DPDK 25.11 from source, configure DAQIRI_MGR / DAQIRI_BUILD_PYTHON / CMAKE_CUDA_ARCHITECTURES, install, smoke-test, troubleshoot.
Intermediate~45 min
03
Container Build with Patched DPDK
Build the Docker image with build-container.sh. The container ships a dmabuf-patched DPDK, so peermem is not required.
Coming Soon
04
System Tuning for High-Performance Networking
Isolate CPU cores, configure hugepages, set NUMA affinity, and run python/tune_system.py to diagnose common configuration issues.
Intermediate~30 min
- 05
Benchmarking Examples
Run a TX/RX loopback test to validate your setup, and walk through interpreting throughput results.
Beginner~20 min
- 06
YAML Configuration Deep Dive
Memory regions (huge, device, host_pinned), RX/TX queue setup, flow steering rules, flex items, and RDMA client/server config schemas.
Intermediate~40 min
-
07
GPUDirect: Header-Data Split Pipeline
Configure a two-region memory layout, access CPU headers and GPU payloads per-packet with get_segment_packet_ptr(), and reorder scattered GPU buffers with the built-in CUDA kernel.
Coming Soon
-
08
RoCE (RDMA) Client/Server Setup
Configure stream_type: socket, protocol: roce with RC transport, assign client and server roles across two hosts, and run daqiri_bench_rdma to validate the connection.
Coming Soon
-
09
Timed TX with ConnectX-7
Enable accurate_send in the TX config and use set_packet_tx_time() for PTP-synchronized, hardware-scheduled packet transmission on ConnectX-7+.
Coming Soon
+ 05
Socket and RDMA Benchmarking
Run TCP/UDP sockets and RoCE/RDMA with matching namespace isolation and PHY-counter checks.
Intermediate~30 min
+ 06
Raw Ethernet Benchmarking
Run a DPDK raw Ethernet TX/RX loopback test and interpret NIC throughput counters.
Intermediate~20 min
+ 07
YAML Configuration Deep Dive
Memory regions (huge, device, host_pinned), RX/TX queue setup, flow steering rules, flex items, and RDMA client/server config schemas.
Intermediate~40 min
+
08
GPUDirect: Header-Data Split Pipeline
Configure a two-region memory layout, access CPU headers and GPU payloads per-packet with get_segment_packet_ptr(), and reorder scattered GPU buffers with the built-in CUDA kernel.
Coming Soon
+
09
Timed TX with ConnectX-7
Enable accurate_send in the TX config and use set_packet_tx_time() for PTP-synchronized, hardware-scheduled packet transmission on ConnectX-7+.
Coming Soon
@@ -749,7 +782,7 @@

Connect Your Sensors to the NVIDIA Ecosystem

  • C++ API Usage
  • Python API Usage
  • Getting Started
  • -
  • Examples
  • +
  • Benchmarking
  • @@ -782,39 +815,113 @@

    Connect Your Sensors to the NVIDIA Ecosystem

    + + diff --git a/docs/javascripts/tab-dropdowns.js b/docs/javascripts/tab-dropdowns.js index d3dad08..aa5de0a 100644 --- a/docs/javascripts/tab-dropdowns.js +++ b/docs/javascripts/tab-dropdowns.js @@ -18,6 +18,7 @@ ], "Tutorials": [ { label: "System Configuration", path: "tutorials/system_configuration/" }, + { label: "Bare-Metal CMake Build", path: "tutorials/bare-metal-cmake-build/" }, { label: "Configuration YAML Walkthrough", path: "tutorials/configuration-walkthrough/" } ] }; diff --git a/docs/tutorials/bare-metal-cmake-build.md b/docs/tutorials/bare-metal-cmake-build.md index 3a2f243..8969cd2 100644 --- a/docs/tutorials/bare-metal-cmake-build.md +++ b/docs/tutorials/bare-metal-cmake-build.md @@ -12,7 +12,7 @@ It is the long-form companion to the five-line `cmake` snippet in [Getting Start - you are packaging DAQIRI into another product that already provides a runtime image; - you are debugging a build problem inside the container's `daqiri-build` stage and need to reproduce it on the host. - If none of those apply, follow [System Configuration](system_configuration.md) and then [Benchmarking Examples](benchmarking_examples.md) instead. + If none of those apply, follow [System Configuration](system_configuration.md) and then [Raw Ethernet Benchmarking](../benchmarks/raw_benchmarking.md) instead. ## Prerequisite verification @@ -305,7 +305,7 @@ ldd /opt/daqiri/lib/libdaqiri.so | head ### 5.3 Smoke test -Verify the build with the standard two-port TX/RX loopback. This requires a NIC with two ports connected to each other by a physical SFP cable, and that you replace the `` placeholders in the YAML (PCIe BDFs, CPU cores, destination MAC) for your system. The walkthrough for those edits lives in [Benchmarking Examples → Update the loopback configuration](benchmarking_examples.md#update-the-loopback-configuration); do that first, then run: +Verify the build with the standard two-port TX/RX loopback. 
This requires a NIC with two ports connected to each other by a physical SFP cable, and that you replace the `` placeholders in the YAML (PCIe BDFs, CPU cores, destination MAC) for your system. The walkthrough for those edits lives in [Raw Ethernet Benchmarking → Update the loopback configuration](../benchmarks/raw_benchmarking.md#update-the-loopback-configuration); do that first, then run: ```bash sudo ./build/examples/daqiri_bench_raw_gpudirect \ @@ -317,7 +317,7 @@ A successful run prints a stream of `[INFO]` lines followed by an RX/TX rate sum !!! tip "DGX Spark" - On DGX Spark, use the prefilled `daqiri_bench_raw_tx_rx_spark.yaml` instead; only `eth_dst_addr` needs an edit. See the [DGX Spark profile callout](benchmarking_examples.md#update-the-loopback-configuration) for the exact MAC-lookup command. + On DGX Spark, use the prefilled `daqiri_bench_raw_tx_rx_spark.yaml` instead; only `eth_dst_addr` needs an edit. See the [DGX Spark profile callout](../benchmarks/raw_benchmarking.md#update-the-loopback-configuration) for the exact MAC-lookup command. !!! note "No NIC available?" @@ -406,7 +406,7 @@ The build recipe above is the same on every supported host. The notes below cove - GB10 is **compute capability 12.1** (`sm_121`). DAQIRI's default arch list adds `121` automatically when configuring with **CUDA Toolkit 13.0 or newer**; on those toolkits no override is needed. On older toolkits, GB10 is not supported. - DGX Spark uses **NVLink-C2C unified memory** and has no separate GPU BAR1, so data buffers in YAML configs use `kind: host_pinned` rather than `kind: device`. The DGX-Spark-prefilled YAMLs in `examples/*_spark.yaml` already encode this. - `nvidia-peermem` is not used; GPUDirect goes through the dma-buf path enabled by the DPDK patches in [Step 3](#step-3-build-dpdk-with-daqiri-patches). 
- - For a runnable end-to-end test after the build completes, follow the [DGX Spark profile callout](benchmarking_examples.md#update-the-loopback-configuration) in Benchmarking Examples: the prefilled `daqiri_bench_raw_tx_rx_spark.yaml` and `daqiri_bench_rdma_tx_rx_spark.yaml` need only an `eth_dst_addr` edit. + - For a runnable end-to-end test after the build completes, follow the [DGX Spark profile callout](../benchmarks/raw_benchmarking.md#update-the-loopback-configuration) in Raw Ethernet Benchmarking: the prefilled `daqiri_bench_raw_tx_rx_spark.yaml` and `daqiri_bench_rdma_tx_rx_spark.yaml` need only an `eth_dst_addr` edit. === "IGX Orin + dGPU" @@ -444,5 +444,5 @@ The build recipe above is the same on every supported host. The notes below cove Once `libdaqiri.so` is installed and the [smoke test](#53-smoke-test) passes: 1. [**System Configuration**](system_configuration.md): tune the host (hugepages, NIC link layer, GPU BAR1, CPU isolation) for production performance. -2. [**Benchmarking Examples**](benchmarking_examples.md): run `daqiri_bench_raw_gpudirect` over a physical loopback. +2. [**Raw Ethernet Benchmarking**](../benchmarks/raw_benchmarking.md): run `daqiri_bench_raw_gpudirect` over a physical loopback. 3. [**Understanding the Configuration File**](configuration-walkthrough.md): pick the right starter YAML for your use case from the decision tree. diff --git a/docs/tutorials/configuration-walkthrough.md b/docs/tutorials/configuration-walkthrough.md index 95f2ccf..f601894 100644 --- a/docs/tutorials/configuration-walkthrough.md +++ b/docs/tutorials/configuration-walkthrough.md @@ -19,7 +19,7 @@ If you don't have any NIC at all, the `*_sw_loopback*` variants of the Raw Ether (`DAQIRI_MGR` at the CMake layer is the inverse selector: it tells the build which manager implementations to compile in — `dpdk` enables `stream_type: "raw"`, `socket` enables `stream_type: "socket"` with `protocol: "udp"`/`"tcp"`, and `rdma` enables `protocol: "roce"`. 
The default build enables all three.) -With a stream type in mind, read down the questions below and stop at the first one that matches what you're trying to do. Each section names the YAML, the binary that consumes it, and any platform-specific notes. +For a shorter backend-selection guide, start with the [Benchmarking overview](../benchmarks/benchmarks.md). With a stream type in mind, read down the questions below and stop at the first one that matches what you're trying to do. Each section names the YAML, the binary that consumes it, and any platform-specific notes. ??? question "1. I want to measure baseline throughput" Pick the stream type that matches your stack (see the [overview](#choosing-the-appropriate-daqiri-stream-type-for-your-setup) above), then the hardware or protocol variant. @@ -28,19 +28,19 @@ With a stream type in mind, read down the questions below and stop at the first - **Generic discrete GPU** (template — replace ``) — [`daqiri_bench_raw_tx_rx.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx.yaml). This is the file annotated line-by-line in the [walkthrough below](#annotated-walkthrough). - **Four queue closed-loop TX+RX** (template — replace ``) — [`daqiri_bench_raw_tx_rx_4q.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx_4q.yaml). Uses one application worker per TX/RX queue, with each `bench_tx` entry sending a different UDP flow. - - **DGX Spark / GB10** (prefilled) — [`daqiri_bench_raw_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx_spark.yaml). `kind: host_pinned` for the integrated GPU; cores, PCIe addresses, and IPs are prefilled. See the [Spark profile callout](benchmarking_examples.md#update-the-loopback-configuration) for run details. + - **DGX Spark / GB10** (prefilled) — [`daqiri_bench_raw_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_tx_rx_spark.yaml). 
`kind: host_pinned` for the integrated GPU; cores, PCIe addresses, and IPs are prefilled. See the [Spark profile callout](../benchmarks/raw_benchmarking.md#update-the-loopback-configuration) for run details. - **No physical NIC available** — [`daqiri_bench_raw_sw_loopback.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_raw_sw_loopback.yaml). `loopback: "sw"`, no NIC required. Useful for first-time build verification, not representative of production performance. To watch the same raw loopback benchmark with live Prometheus and Grafana counters, use the Grafana compose stack described in - [Watch live OpenTelemetry metrics in Grafana](benchmarking_examples.md#watch-live-opentelemetry-metrics-in-grafana). + [Watch live OpenTelemetry metrics in Grafana](../benchmarks/raw_benchmarking.md#watch-live-opentelemetry-metrics-in-grafana). **Socket — RoCE (RDMA)** (`stream_type: "socket"`, `protocol: "roce"`) — runs on `daqiri_bench_rdma` (use `--mode {tx,rx,both}`). Configs use `kind: host_pinned` regardless of platform. - **Generic** (template — replace IPs) — [`daqiri_bench_rdma_tx_rx.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_rdma_tx_rx.yaml). - - **DGX Spark** (prefilled) — [`daqiri_bench_rdma_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_rdma_tx_rx_spark.yaml). See the [Spark profile callout](benchmarking_examples.md#update-the-loopback-configuration) for run details. + - **DGX Spark** (prefilled) — [`daqiri_bench_rdma_tx_rx_spark.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_rdma_tx_rx_spark.yaml). See [Socket and RDMA Benchmarking](../benchmarks/socket_benchmarking.md#run-the-rdma-roce-benchmark) for namespace and wire-counter run details. - **Socket — UDP / TCP** (`stream_type: "socket"`, `protocol: "udp"` or `"tcp"`) — runs on `daqiri_bench_socket`. Both bind to `127.0.0.1`. 
+ **Socket — UDP / TCP** (`stream_type: "socket"`, `protocol: "udp"` or `"tcp"`) — runs on `daqiri_bench_socket`. The shipped smoke-test configs bind to `127.0.0.1`; see [Socket and RDMA Benchmarking](../benchmarks/socket_benchmarking.md) for namespace-based wire tests. - **UDP** — [`daqiri_bench_socket_udp_tx_rx.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_socket_udp_tx_rx.yaml). - **TCP** — [`daqiri_bench_socket_tcp_tx_rx.yaml`](https://github.com/nvidia/daqiri/blob/main/examples/daqiri_bench_socket_tcp_tx_rx.yaml). @@ -418,4 +418,4 @@ The reorder bench runs on `daqiri_bench_raw_reorder_seq`: Other reorder variants are listed under [question 2 of the decision tree above](#choosing-an-example-config): the CPU-kernel variant, the RX-only variants, and the `seq_batch_number` algorithm with in-kernel int4 → fp32 type conversion (runs on `daqiri_bench_raw_reorder_quantize`). --- -**Previous:** [Benchmarking Examples](benchmarking_examples.md) +**Previous:** [Raw Ethernet Benchmarking](../benchmarks/raw_benchmarking.md) diff --git a/docs/tutorials/system_configuration.md b/docs/tutorials/system_configuration.md index d67c0cb..ae37301 100644 --- a/docs/tutorials/system_configuration.md +++ b/docs/tutorials/system_configuration.md @@ -18,7 +18,7 @@ DAQIRI requires an [**NVIDIA SmartNIC**](https://www.nvidia.com/en-us/networking ## System Setup for DAQIRI - This section covers the essential system setup steps needed before using DAQIRI. Complete this setup before moving on to [System Optimization](#system-optimization) or [running benchmarks](benchmarking_examples.md). + This section covers the essential system setup steps needed before using DAQIRI. Complete this setup before moving on to [System Optimization](#system-optimization) or [running benchmarks](../benchmarks/benchmarks.md). 
In this tutorial, we will be developing on an **NVIDIA IGX Orin platform** with [IGX SW 1.1](https://docs.nvidia.com/igx-orin/user-guide/latest/base-os.html) and an [NVIDIA RTX 6000 ADA GPU](https://www.nvidia.com/en-us/design-visualization/rtx-6000/), which is the configuration that is currently actively tested. The concepts should be applicable to other systems based on Ubuntu 22.04 as well. It should also work on other Linux distributions with a glibc version of 2.35 or higher by containerizing the dependencies and applications on top of an Ubuntu 22.04 image, but this is not actively tested at this time. @@ -1298,7 +1298,7 @@ DAQIRI requires an [**NVIDIA SmartNIC**](https://www.nvidia.com/en-us/networking ``` --- - **Next:** [Benchmarking Examples](benchmarking_examples.md) — run your first DAQIRI benchmark + **Next:** [Benchmarking](../benchmarks/benchmarks.md) — choose and run your first DAQIRI benchmark === "DGX Spark" @@ -1362,7 +1362,7 @@ DAQIRI requires an [**NVIDIA SmartNIC**](https://www.nvidia.com/en-us/networking - **Same physical port** (e.g. `mlx5_0` ↔ `mlx5_2`, both p0) → TX/RX loop **on-chip** through the eswitch; traffic never reaches the cable. Physical-link packet counters stay flat while the vport counters (`tx_good_packets` / `rx_good_packets`) run at line rate. This is a software-path test. - **Different physical ports** (e.g. `mlx5_0` p0 ↔ `mlx5_3` p1 `0002:01:00.1`, or `mlx5_0` ↔ `mlx5_1`) → TX/RX loop **over the wire**; physical-link packet counters rise to match the TX/RX counts. This is an over-the-wire test. - Confirm which case you got from the physical-link packet counters: near zero for on-chip, matching the TX/RX packet counts for over-the-wire. These counters count packets that reached the SerDes/QSFP side of the NIC rather than packets switched internally by the eswitch. 
The [daqiri bench](benchmarking_examples.md)'s DPDK "Extended Stats" output reports them as `tx_phy_packets` / `rx_phy_packets`; `ethtool -S` and `mlnx_perf` report the same wire counters as `tx_packets_phy` / `rx_packets_phy`. + Confirm which case you got from the physical-link packet counters: near zero for on-chip, matching the TX/RX packet counts for over-the-wire. These counters count packets that reached the SerDes/QSFP side of the NIC rather than packets switched internally by the eswitch. The [daqiri bench](../benchmarks/raw_benchmarking.md)'s DPDK "Extended Stats" output reports them as `tx_phy_packets` / `rx_phy_packets`; `ethtool -S` and `mlnx_perf` report the same wire counters as `tx_packets_phy` / `rx_packets_phy`. `ethtool -m` reports identical `Connector: 0x23 No separable connector` on all 4 PFs and is **not** useful for distinguishing them; use `phys_port_name` above (the cable-yank carrier test confirms a cable is present but does **not** distinguish ports). @@ -1585,6 +1585,6 @@ DAQIRI requires an [**NVIDIA SmartNIC**](https://www.nvidia.com/en-us/networking ``` --- - **Next:** [Benchmarking Examples](benchmarking_examples.md) — run your first DAQIRI benchmark + **Next:** [Benchmarking](../benchmarks/benchmarks.md) — choose and run your first DAQIRI benchmark diff --git a/mkdocs.yml b/mkdocs.yml index 65d9e7d..5c5c4b4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -49,7 +49,10 @@ site_dir: site nav: - Getting Started: getting-started.md - Concepts: concepts.md - - Benchmarks: tutorials/benchmarking_examples.md + - Benchmarking: + - Overview: benchmarks/benchmarks.md + - Socket and RDMA Benchmarking: benchmarks/socket_benchmarking.md + - Raw Ethernet Benchmarking: benchmarks/raw_benchmarking.md - API Reference: - API Guide: api-reference/index.md - Configuration YAML Reference: api-reference/configuration.md diff --git a/src/managers/socket/daqiri_socket_mgr.cpp b/src/managers/socket/daqiri_socket_mgr.cpp index 4361544..da0e412 100644 --- 
a/src/managers/socket/daqiri_socket_mgr.cpp +++ b/src/managers/socket/daqiri_socket_mgr.cpp @@ -47,6 +47,8 @@ namespace daqiri { namespace { +constexpr size_t kMaxUdpPayloadBytes = 65507; + bool parse_ipv4_addr(const std::string& ip, uint16_t port, sockaddr_in* addr) { if (addr == nullptr) { return false; } @@ -769,6 +771,14 @@ bool SocketMgr::send_udp_burst(EndpointState& ep, BurstParams* burst, size_t* se use_sendto = true; } + for (size_t i = 0; i < num_pkts; ++i) { + const auto len = static_cast(burst->pkt_lens[0][i]); + if (len > kMaxUdpPayloadBytes) { + DAQIRI_LOG_ERROR("UDP payload length {} exceeds maximum {} bytes", len, kMaxUdpPayloadBytes); + return false; + } + } + std::vector msgs(num_pkts); std::vector iovs(num_pkts); std::vector peers; @@ -863,7 +873,7 @@ Status SocketMgr::send_tx_burst(BurstParams* burst) { status = Status::CONNECT_FAILURE; } } else if (cfg_.common_.protocol == SocketProtocol::TCP) { - if (conn == nullptr) { + if (conn == nullptr || !conn->running.load()) { DAQIRI_LOG_ERROR("No active TCP connection for port {}", ep->port); status = Status::CONNECT_FAILURE; } else { @@ -1276,9 +1286,6 @@ void SocketMgr::tcp_rx_loop(std::shared_ptr conn) { conn->running.store(false); close_fd(conn->fd); - - std::lock_guard lock(state_mutex_); - connections_.erase(conn->conn_id); } void SocketMgr::udp_rx_loop(int if_index) { @@ -1365,6 +1372,17 @@ Status SocketMgr::socket_connect_to_server(const std::string& dst_addr, uint16_t if (ep == nullptr || ep->socket_cfg.mode_ != SocketMode::CLIENT) { continue; } if (cfg_.common_.protocol == SocketProtocol::TCP) { + if (ep->primary_conn_id != 0 && ep->socket_cfg.remote_ip_ == dst_addr && + ep->socket_cfg.remote_port_ == dst_port && + (src_addr.empty() || src_addr == ep->socket_cfg.local_ip_)) { + std::lock_guard lock(state_mutex_); + const auto it = connections_.find(ep->primary_conn_id); + if (it != connections_.end() && it->second != nullptr && it->second->running.load()) { + *conn_id = ep->primary_conn_id; 
+ return Status::SUCCESS; + } + } + auto conn = create_tcp_client_connection(*ep, dst_addr, dst_port, src_addr, 0, true); if (conn == nullptr) { return Status::CONNECT_FAILURE; } *conn_id = conn->conn_id;