diff --git a/.gitignore b/.gitignore index 1c01658..2f30b5e 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,9 @@ comm/Makefile comm/cmake_install.cmake install_manifest.txt patches/ +.tmp +tests/udma/logs +*.diff +*.bz2 +tmp +.kilo \ No newline at end of file diff --git a/docs/TileXR_UDMA_P2P_PERF_GUIDE.md b/docs/TileXR_UDMA_P2P_PERF_GUIDE.md new file mode 100644 index 0000000..3b4d24e --- /dev/null +++ b/docs/TileXR_UDMA_P2P_PERF_GUIDE.md @@ -0,0 +1,596 @@ +# TileXR UDMA 2-Card P2P Performance Test Guide + +## Background + +This guide designs a 2-card single-pair P2P performance test for the TileXR +UDMA path, using commit `378724c583c417ab2388a3c5354bfdb0e57dc589` as the +reference baseline. That commit already contains the UDMA demo infrastructure +under `tests/udma/demo`, including local rank launch, TileXR communicator +initialization, registered device memory, UDMA kernel launches, and per-rank logs. + +The P2P test should reuse that infrastructure and add a focused perf mode for +two ranks only. + +## Goal + +Measure one directed UDMA P2P transfer at a time on 2 cards: + +- `rank0 -> rank1` +- `rank1 -> rank0` + +The final output should include: + +- correctness status for each transfer size and direction; +- latency in microseconds; +- effective bandwidth in GB/s; +- a CSV result file; +- a bandwidth curve comparing `0->1` and `1->0`. + +## Test Model + +Use `rank_size=2`. Each run launches two local ranks, one rank per NPU. + +The P2P perf mode supports these user-facing transport modes: + +- `direct_urma`: registered-memory UDMA transfer. Internally this path uses + the parallel multi-jetty kernel; `block_dim=1` with one QP is the single-jetty + baseline, while `block_dim=N` with `TILEXR_UDMA_QP_NUM=N` uses up to `N` + QPs/jettys in parallel. +- `memory`: peer-memory IPC comparison using Ascend C `DataCopyPad`. +- `memory_consume`: peer-memory IPC comparison with a separate outer sync flag + and a receiver-side copy from the IPC window into the destination buffer. +- `data_as_flag`: peer-memory IPC comparison where each 512B block carries + 480B payload plus a 32B ready flag. +- `data_as_flag_epoch_ordered`: data-as-flag comparison with per-batch commit + epochs to avoid cross-iteration ready flag reuse. + +The diagnostic transports `memory_segmented` and `memory_segmented_rotate` +are intended for large-message root-cause experiments. They keep `block_dim=1` +but split the `memory` copy helper into 16 MiB segments; the rotate variant +keeps writing inside one 16 MiB destination span and skips payload validation. +The trace variants `memory_segmented_trace` and +`memory_segmented_rotate_trace` use 8 MiB segments and record block-0 +per-segment cycle sums in the debug buffer. Enable +`TILEXR_P2P_DEBUG_SUMMARY=1` to print `traceSegments` and `seg0Cycles` through +`seg7Cycles`. + +For `direct_urma`, for a selected direction: + +- the sender rank posts one UDMA write to the receiver's registered memory; +- the sender calls `UDMAQuietStatus` and records the completion status; +- the receiver does not send data in that measurement round; +- after stream synchronization and a host barrier, the receiver copies its + destination buffer back and validates the expected byte pattern. + +For `memory`, for the same selected direction: + +- the sender rank launches `tilexr_memory_p2p_perf_kernel`; +- only the sender rank performs the device-side copy; +- the destination address is its peer IPC data window on the receiver; +- the receiver validates by copying its local IPC data window back to host. + +For `data_as_flag`, for the same selected direction: + +- the sender rank launches `tilexr_data_as_flag_p2p_perf_kernel`; +- each 512B block carries 480B payload plus a 32B ready flag in the receiver's + peer IPC memory window; +- the receiver validates by copying its local IPC data window back to host. + +Run both directions independently so each result row has an unambiguous source +and destination. For current comparison work, `0->1` is usually enough because +`0->1` and `1->0` were observed to be effectively identical on the tested +2-card setup. + +Important scope notes: + +- `direct_urma` measures the TileXR registered-memory URMA/UDMA path. +- `memory` measures peer-memory IPC semantics implemented with AIV + `DataCopyPad`. It is a useful baseline, not the same hardware data path as + UDMA queue based direct URMA. +- `memory` and `data_as_flag` are limited by the TileXR IPC data window. The + current wrapper rejects sizes above 100 MiB. +- The current IPC comparison kernels use one AIV block in the runner. Large + messages therefore reflect single-block GM->UB->peer-GM copy throughput. + +## Metrics + +Recommended CSV fields: + +```text +direction,src,dst,ranks,bytes,iters,avg_us,min_us,max_us,bw_GBps,status,errors,log_dir +``` + +Definitions: + +- `direction`: `0to1` or `1to0`. +- `bytes`: transfer bytes per measured UDMA write. +- `avg_us`: average measured time per transfer, excluding warmup. +- `min_us` and `max_us`: per-transfer minimum and maximum from measured + iterations. +- `bw_GBps`: `bytes / avg_us / 1000`. +- `status`: UDMA completion status from `UDMAQuietStatus`; `0` means success. +- `errors`: validation mismatch count on the receiver side. + +Do not include communicator initialization, memory registration, H2D/D2H copies, +or TCP barriers in the measured interval. + +For both transports, timing is taken on the source rank and shared through +per-rank status files before rank 0 writes the CSV row. This avoids reporting +receiver-side empty-kernel timing for the reverse direction. + +## Recommended Sweep + +Use this default sweep first: + +```text +rank_size = 2 +npu_count = 2 +first_npu = 0 +directions = 0->1, 1->0 +min_bytes = 4096 +max_bytes = 268435456 +step_factor = 2 +warmup_iters = 10 +iters = 100 +check = 1 +``` + +If large messages are unstable during bring-up, start with: + +```text +min_bytes = 4096 +max_bytes = 16777216 +warmup_iters = 5 +iters = 20 +``` + +Then expand the range after both directions pass correctness. + +## Code Changes + +### 1. Add A P2P Perf Test Type + +Extend `tests/udma/demo/tilexr_udma_demo.cpp` and +`tests/udma/demo/tilexr_udma_demo_kernel.cpp` with a new mode: + +```text +test_type=4: 2-card directed UDMA P2P performance test +``` + +Suggested host options: + +```text +--src-rank <0|1> +--dst-rank <0|1> +--min-bytes +--max-bytes +--step-factor +--warmup-iters +--iters +--csv +--check <0|1> +``` + +For compatibility with the existing positional demo style, a wrapper script can +translate simple positional arguments into these options. + +### 2. Add A Device Kernel + +Add a kernel shaped like this: + +```cpp +extern "C" __global__ __aicore__ void tilexr_udma_p2p_perf_kernel( + GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR debugGM, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern) +{ + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM); + auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM); + + const int32_t rank = args->rank; + if (rank != srcRank) { + return; + } + + TileXR::UDMAPutNbi(args, dstRank, src, dstByteOffset, bytes); + uint32_t status = TileXR::UDMAQuietStatus(args, dstRank); + if (debug != nullptr) { + debug[0] = status; + debug[1] = bytes; + debug[2] = pattern; + } +} +``` + +The source buffer should already contain the host-generated pattern before the +measured launch. The receiver validates the destination buffer after the measured +iteration batch. + +### 3. Allocate Registered Buffers + +For each rank, allocate one registered memory region with: + +```text +src buffer +dst buffer +debug/status buffer +``` + +Round the total registered bytes to the existing 2 MiB UDMA registration +alignment used by the demo. + +The remote destination offset passed to the kernel is the byte offset of the +receiver's `dst buffer` inside that registered region. + +### 4. Measure The Transfer + +Use one stream and event timing around each measured kernel launch batch: + +```text +warmup: + launch kernel + synchronize stream + +measured: + record start event + repeat iters: + launch kernel + record stop event + synchronize stop event +``` + +Compute: + +```text +avg_us = elapsed_ms * 1000 / iters +bw_GBps = bytes / avg_us / 1000 +``` + +If per-iteration min and max are required, record events around each measured +iteration. For initial bring-up, average timing is enough; min and max can be +added once correctness is stable. + +### 5. Write CSV + +Only rank 0 should create or append to the CSV. For direction `1->0`, rank 0 is +the receiver, so rank 0 can still write the result after copying back debug and +validation state. + +Each size and direction emits one row. + +Example: + +```csv +direction,src,dst,ranks,bytes,iters,avg_us,min_us,max_us,bw_GBps,status,errors,log_dir +0to1,0,1,2,4096,100,8.31,0,0,0.493,0,0,logs/tilexr_udma_p2p_perf_20260622_120000 +1to0,1,0,2,4096,100,8.45,0,0,0.485,0,0,logs/tilexr_udma_p2p_perf_20260622_120000 +``` + +When min and max are not implemented yet, write `0` for those fields and keep +the columns stable. + +## Runner Script + +Add a script: + +```text +tests/udma/demo/run_tilexr_udma_p2p_perf.sh +``` + +Suggested usage: + +```bash +bash demo/run_tilexr_udma_p2p_perf.sh \ + \ + +``` + +Example: + +```bash +cd /path/to/TileXR/tests/udma +bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 direct_urma +bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 memory +bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 data_as_flag +``` + +Arguments: + +- `src_rank`, `dst_rank`: directed transfer, normally `0 1` or `1 0`. +- `min_bytes`, `max_bytes`, `step_factor`: byte sweep definition. +- `iters`, `warmup_iters`: measured and warmup launch counts. +- `first_npu`: first physical NPU id used by local rank 0. For physical cards + 2 and 3, pass `2`. +- `check`: `1` validates destination bytes after each size. +- `transport`: `direct_urma`, `memory`, or `data_as_flag`; default is `direct_urma`. + +The script should: + +- source `scripts/common_env.sh`; +- set `TILEXR_COMM_ID` if it is not already set; +- set `TILEXR_DEMO_NPUS=2`; +- set `TILEXR_DEMO_FIRST_NPU`; +- launch exactly two ranks; +- write per-rank logs under + `tests/udma/logs/tilexr_udma_p2p_perf_*__`; +- write CSV under the same log directory. + +## Build + +Build TileXR core first: + +```bash +cd /path/to/TileXR +source scripts/common_env.sh +mkdir -p build +cd build +cmake -DCMAKE_INSTALL_PREFIX=../install .. +make -j$(nproc) +make install +``` + +Build UDMA tests and demo: + +```bash +cd /path/to/TileXR/tests/udma +bash build.sh +``` + +Expected binary: + +```text +tests/udma/install/bin/tilexr_udma_demo +``` + +## Run + +### Run direct URMA + +Run a short correctness-oriented sweep first: + +```bash +cd /path/to/TileXR/tests/udma +bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 16777216 2 20 5 2 1 direct_urma +``` + +Then run the larger sweep. On the tested `141.62.19.144` environment, 64 MiB +was used as the stable upper bound because 128 MiB and above failed during +UDMA registration due to the registered region size. + +```bash +bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 direct_urma +``` + +### Run memory semantics + +Use the same sweep and only change the transport: + +```bash +cd /path/to/TileXR/tests/udma +bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 memory +``` + +For diagnosis around the observed large-message knee, run single-size points: + +```bash +for bytes in 33554432 41943040 50331648 58720256 62914560 67108864 75497472 83886080 100663296; do + TILEXR_COMM_ID=127.0.0.1:$((12000 + bytes / 1048576)) \ + bash demo/run_tilexr_udma_p2p_perf.sh 0 1 ${bytes} ${bytes} 2 50 5 2 1 memory +done +``` + +### Run data-as-flag semantics + +Use the same sweep and only change the transport: + +```bash +cd /path/to/TileXR/tests/udma +bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 data_as_flag +``` + +Success criteria: + +- both rank processes exit with code 0; +- every row has `status=0`; +- every row has `errors=0`; +- for `direct_urma`, per-rank logs include UDMA enabled in `CommArgs`; +- for `memory` and `data_as_flag`, per-rank logs show non-null `peerMems[]` + for the tested ranks; +- no rank log contains `MISMATCH`, `TileXR UDMA demo failed`, or `ERROR`. + +## Plot The Curve + +Use the plotting helper: + +```text +tests/udma/demo/plot_tilexr_udma_p2p_perf.py +``` + +For the old direction comparison: + +```bash +python3 demo/plot_tilexr_udma_p2p_perf.py \ + logs/tilexr_udma_p2p_perf_*/p2p_perf.csv \ + --output logs/tilexr_udma_p2p_perf_curve.png +``` + +For a transport comparison, pass both CSVs and label the +series explicitly: + +```bash +python3 demo/plot_tilexr_udma_p2p_perf.py \ + logs/tilexr_udma_p2p_perf_20260622_173102_direct_urma_0to1/p2p_perf.csv \ + logs/tilexr_udma_p2p_perf_20260622_173138_memory_0to1/p2p_perf.csv \ + --direction 0to1 \ + --labels direct_urma,memory \ + --output logs/tilexr_udma_vs_memory_0to1_bandwidth.png \ + --latency-output logs/tilexr_udma_vs_memory_0to1_latency.png \ + --latency-max-bytes 1048576 +``` + +The script generates: + +- bandwidth curve: all rows in the selected CSVs; +- latency curve: rows up to `--latency-max-bytes`, default 1 MiB. + +The plot uses: + +- x-axis: `bytes`, log scale; +- x tick labels: human-readable `KB`, `MB`, or `GB`; +- y-axis: `bw_GBps` for bandwidth or `avg_us` for latency; +- one line per direction or per explicit label. + +Use labels that match the compared transport set, for example +`direct_urma,memory` or `direct_urma,data_as_flag`. + +If multiple CSV files are passed, merge rows by direction and bytes before +plotting. When `--labels` is used, merge rows by label and bytes instead, which +allows two `0to1` CSV files to appear as separate series. + +### Local CSV and plot workflow + +When the test runs on a remote Ascend server, pull only the CSVs to the local +workspace and plot locally: + +```powershell +$out = "D:\workspace\TileXR\tests\udma\logs\p2p_compare_20260622_173138_64MiB" +New-Item -ItemType Directory -Force -Path $out | Out-Null + +scp -i C:\Users\h30059441\.ssh\id_ed25519 ` + root@141.62.19.144:/home/h30059441/TileXR/tests/udma/logs/tilexr_udma_p2p_perf_20260622_173102_direct_urma_0to1/p2p_perf.csv ` + "$out\p2p_perf_direct_urma_0to1.csv" + +scp -i C:\Users\h30059441\.ssh\id_ed25519 ` + root@141.62.19.144:/home/h30059441/TileXR/tests/udma/logs/tilexr_udma_p2p_perf_20260622_173138_memory_0to1/p2p_perf.csv ` + "$out\p2p_perf_memory_0to1.csv" + +python tests\udma\demo\plot_tilexr_udma_p2p_perf.py ` + "$out\p2p_perf_direct_urma_0to1.csv" ` + "$out\p2p_perf_memory_0to1.csv" ` + --direction 0to1 ` + --labels direct_urma,memory ` + --output "$out\tilexr_udma_vs_memory_0to1_bandwidth.png" ` + --latency-output "$out\tilexr_udma_vs_memory_0to1_latency.png" ` + --latency-max-bytes 1048576 +``` + +Example output files: + +```text +tests/udma/logs/p2p_compare_20260622_173138_64MiB/p2p_perf_direct_urma_0to1.csv +tests/udma/logs/p2p_compare_20260622_173138_64MiB/p2p_perf_memory_0to1.csv +tests/udma/logs/p2p_compare_20260622_173138_64MiB/tilexr_udma_vs_memory_0to1_bandwidth.png +tests/udma/logs/p2p_compare_20260622_173138_64MiB/tilexr_udma_vs_memory_0to1_latency.png +``` + +## Result Review + +Check the CSV first: + +```bash +column -s, -t logs/tilexr_udma_p2p_perf_*/p2p_perf.csv | less -S +``` + +Look for: + +- `status != 0`: UDMA completion failure or CQ polling issue; +- `errors != 0`: receiver data mismatch; +- sharp bandwidth drop at a specific size: possible registration range, + alignment, CQ depth, or timeout issue; +- large directional difference between `0->1` and `1->0`: possible topology or + route asymmetry. +- `memory` bandwidth decreasing after 32 MiB: this is not necessarily a data + error. On the tested setup, single-size memory runs showed a gradual drop + from about 48 GB/s at 32 MiB to about 39 GB/s at 64 MiB and above. That shape + is consistent with the current single-AIV-block `DataCopyPad` IPC-window + baseline entering sustained GM->UB->peer-GM throughput, not with a 64 MiB + correctness boundary. +- `data_as_flag` differs from `memory` because it spends part of each 512B block + on the embedded ready flag; compare it as a separate IPC baseline. + +Keep the raw logs with the CSV and curve. They are needed to confirm UDMA was +enabled and to inspect `CommArgs`, registered memory offsets, and debug words. + +## Example Result From 141.62.19.144 + +Environment: + +```text +CANN: /usr/local/Ascend/ascend-toolkit +MPI: /usr/local/mpich +NPUs: physical 2 and 3 +Direction: 0->1 +Sweep: 4096 -> 67108864, step_factor=2, iters=100, warmup=10 +``` + +Selected rows: + +```text +transport bytes avg_us bw_GBps status errors +direct_urma 4096 6.504 0.630 0 0 +direct_urma 1048576 26.040 40.267 0 0 +direct_urma 16777216 322.074 52.091 0 0 +direct_urma 67108864 1269.934 52.844 0 0 +memory 4096 4.059 1.009 0 0 +memory 1048576 23.484 44.651 0 0 +memory 16777216 334.924 50.093 0 0 +memory 67108864 1691.058 39.685 0 0 +``` + +Memory-only diagnostic points: + +```text +32MiB 48.16 GB/s +40MiB 45.54 GB/s +48MiB 42.09 GB/s +56MiB 40.59 GB/s +60MiB 40.16 GB/s +64MiB 39.56 GB/s +72MiB 39.21 GB/s +80MiB 39.15 GB/s +96MiB 39.10 GB/s +``` + +Interpretation: + +- direct URMA reaches about 52.8 GB/s at 64 MiB on this pair; +- memory semantics is competitive for small and mid-sized messages, but the + current single-block DataCopyPad baseline settles near 39-40 GB/s for larger + messages; +- both paths passed payload validation with `status=0` and `errors=0`. + +For the specific `memory block_dim=1` drop after 16 MiB, first compare +`memory`, `memory_segmented`, and `memory_segmented_rotate` at `block_dim=1`, +then repeat the rotate run at `block_dim=2` or higher. If splitting into +16 MiB helper calls and rotating inside a 16 MiB destination window still +follows `memory`, then helper-call size, destination address span, and +host-side sweep shape are unlikely to be the root cause. Then run: + +```bash +TILEXR_P2P_DEBUG_SUMMARY=1 bash demo/run_tilexr_udma_p2p_concurrency_sweep.sh \ + 16777216 67108864 2 20 5 0 1 \ + memory_segmented_trace,memory_segmented_rotate_trace \ + unidir 1,2 +``` + +Interpret the trace as follows: + +- `block_dim=1` later 8 MiB segments get progressively more expensive while + `block_dim=2` stays flat: sustained single-stream peer IPC writes are + accumulating credit/backpressure or remote write-queue drain delay. +- every `block_dim=1` segment is similarly slower than `block_dim=2`: the + bottleneck is a fixed single-stream bandwidth limit rather than segment-local + accumulation. +- rotate trace follows normal trace: destination window span is still excluded. +- rotate trace diverges sharply: re-check address mapping, aliasing, or overwrite + effects before using the trace as a bandwidth conclusion. + +## Minimal Acceptance Checklist + +- `test_type=4` exists and is documented in the UDMA demo README. +- `run_tilexr_udma_p2p_perf.sh` can run `0->1` and `1->0` with `rank_size=2`. +- CSV rows are produced for every requested size. +- The receiver validates the copied payload for every measured size. +- The plot helper generates a two-line bandwidth curve. +- A short sweep and full sweep both pass with `status=0` and `errors=0`. diff --git a/docs/UDMA_P2P_TRANSPORT_COMPARISON.md b/docs/UDMA_P2P_TRANSPORT_COMPARISON.md new file mode 100644 index 0000000..2efe838 --- /dev/null +++ b/docs/UDMA_P2P_TRANSPORT_COMPARISON.md @@ -0,0 +1,219 @@ +# UDMA P2P 传输方式对比 + +**主题:** `memory_consume` 与 `data_as_flag_epoch_ordered` + +## 摘要 + +`data_as_flag_epoch_ordered` 已经消除了最严重的按块写 flag 瓶颈,但其 payload 路径仍比 `memory_consume` 重得多。在相同的有效 payload 带宽统计口径下,`~10 GB/s` 对 `~50 GB/s` 这样的结果与当前实现是相符的。 + +总体来说: + +```text +memory_consume: + 连续 payload 传输 + 独立同步 flag + +data_as_flag_epoch_ordered: + data-as-flag 协议校验,提交信息内嵌 + 在 480B payload + 32B flag/gap 的块布局中 +``` + +`memory_consume` 更接近高效的连续 payload 传输。`data_as_flag_epoch_ordered` 更接近一条协议路径——通过在数据窗口中内嵌就绪信息来证明数据已就绪。 + +## `memory_consume` 数据路径 + +`memory_consume` kernel 保持 payload 窗口连续: + +```text +发送端: + 连续 src payload + -> 对端 IPC 数据窗口 + 设置独立的外部同步 flag + +接收端: + 等待独立的外部同步 flag + 连续的本地对端 IPC 数据窗口 + -> 连续的 dst payload +``` + +在代码中,发送端从 `srcGM + offset` 拷贝到 `peerBase + dstByteOffset + offset`,然后调用 `sync.SetOuterFlag(magic, step)`。接收端用 `sync.WaitOuterFlag(...)` 等待,再从 `localBase + dstByteOffset + offset` 拷贝到 `dst + offset`。 + +两端的 payload 布局都保持连续。虽然 helper 内部仍会经过 UB 中转,但拷贝的 payload 是简单的连续 GM 到 GM 流。对于大消息,独立同步 flag 的开销会被大 payload 摊薄,因此这条路径可以达到高得多的带宽。 + +相关文件: + +```text +tests/udma/demo/tilexr_udma_demo_kernel.cpp + tilexr_memory_consume_p2p_perf_kernel + TileXRUdmaDemoCopyBytesGmToGm +``` + +## `data_as_flag_epoch_ordered` 数据路径 + +`data_as_flag_epoch_ordered` kernel 使用 data-as-flag 块布局: + +```text +512B 块: + 480B payload + 32B flag/gap +``` + +发送端路径: + +```text +连续 src payload + -> UB 暂存 + -> 打包成 512B 块: + 480B payload + 32B flag/gap + -> 对端 data-as-flag IPC 窗口 + -> 在该 batch 最后一个块的 flag 区写入 32B batch commit flag +``` + +接收端路径: + +```text +轮询 batch commit flag + -> 从 512B data-as-flag 窗口读取 + -> 解包/提取 480B payload 区域 + -> 写入连续的 dst payload +``` + +epoch-ordered 变体相比旧的按块 data-as-flag 路径有所改进:每个 batch 只写一次 commit flag,而不是每个 480B payload 块写一次。这消除了破坏性最大的小写瓶颈,但该路径仍然存在额外的布局转换、轮询,以及每个 batch 一次的小 commit 写入。 + +相关文件: + +```text +src/include/tilexr_data_as_flag.h + DATA_AS_FLAG_BLOCK_BYTES = 512 + DATA_AS_FLAG_PAYLOAD_BYTES = 480 + DATA_AS_FLAG_FLAG_BYTES = 32 + DataAsFlagSendEpochOrdered + DataAsFlagWriteBatchCommitFlag + DataAsFlagCheckAndRecvEpochOrdered + DataAsFlagCopyBatchToRecvGM + +tests/udma/demo/tilexr_udma_demo_kernel.cpp + tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel +``` + +## 差距的主要来源 + +### 1. 布局开销 + +`data_as_flag_epoch_ordered` 把每个 `480B` payload 存进一个 `512B` 块: + +```text +512 / 480 = 1.067x +``` + +因此在考虑任何控制或解包开销之前,数据窗口就至少有约 `6.7%` 的额外空间和传输开销。 + +该开销在 host 侧窗口计算中可见: + +```text +DataAsFlagWindowBytes(payloadBytes) + = ceil(payloadBytes / 480) * 512 +``` + +但带宽报告使用的是有效 payload 字节数,而不是展开后的 data-as-flag 窗口字节数。因此额外的块布局字节会拉低报告的有效带宽。 + +相关文件: + +```text +tests/udma/demo/tilexr_udma_p2p_perf_config.h + DataAsFlagWindowBytes + P2PEffectiveTransferBytes + FormatP2PPerfCsvRow +``` + +### 2. 经过 UB 的打包与解包 + +`memory_consume` 是从连续 payload 拷贝到连续 payload。 + +`data_as_flag_epoch_ordered` 必须对 payload 做重塑: + +```text +发送端: + 连续 src + -> UB + -> 512B 块中的 480B payload 分块 + -> 对端 GM + +接收端: + 512B 块/gap 窗口 + -> UB + -> 提取 480B payload 分块 + -> 连续 dst +``` + +这增加了连续 `memory_consume` 路径所不需要的额外 MTE 操作、barrier 以及跨步拷贝行为。 + +### 3. Batch Commit 小写入 + +epoch-ordered 路径不再为每个 `480B` payload 块写一次 flag。而是在一个 batch 的 payload 写完后,向该 batch 最后一个块的 flag 区写入一个 `32B` commit flag: + +```text +payload batch 写入: + batchBlocks * 512B + +batch commit 写入: + 32B commit flag +``` + +这就是“batch commit 小写入”。 + +它比按块写 flag 好得多,但仍存在固定开销: + +```text +准备 32B epoch flag +S -> MTE3 同步 +MTE3 向 GM/对端窗口写入 32B commit flag +MTE3 -> S 同步 +接收端轮询并读取 commit flag +``` + +对于大 batch,该开销会被摊薄,但并非免费。结合打包/解包和 512B 块布局,它仍会限制最终带宽。 + +## 为什么 `0.7 GB/s` 能恢复到 `~10 GB/s` + +旧的 data-as-flag 路径实际上以极高频率付出小 flag 写入/检查开销——大约每个 `480B` payload 块一次。这使小写入和轮询主导了 payload 传输。 + +epoch-ordered 路径将其降为每个 batch 一次 commit 写入: + +```text +旧路径: + 每个 480B payload 块都有 ready/flag 开销 + +epoch-ordered: + 每个 batch 只有一个最终的 32B commit flag +``` + +这就解释了为什么性能可以显著恢复,例如从约 `0.7 GB/s` 恢复到约 `10 GB/s`。 + +但它并未把该路径变成连续 payload 传输。该协议仍要为 data-as-flag 布局付出代价,接收端仍需等待,再从内嵌布局窗口中解包。 + +## 最终解读 + +`memory_consume`: + +```text +用独立的同步区来证明 payload 就绪。 +保持 payload 数据窗口连续。 +针对大块连续 payload 搬移做优化。 +``` + +`data_as_flag_epoch_ordered`: + +```text +用数据窗口内嵌的 commit flag 来证明 payload 就绪。 +以 480B + 32B 的 data-as-flag 块存放 payload。 +付出打包/解包、轮询和 batch commit 开销。 +``` + +因此在当前实现和相同有效 payload 带宽统计口径下,出现如下差距: + +```text +data_as_flag_epoch_ordered: ~10 GB/s +memory_consume: ~50 GB/s +``` + +是符合预期的。这并不一定说明 epoch-ordered 的修复失败;它说明该修复消除了最严重的按块 flag 瓶颈,而剩余的 data-as-flag 协议路径仍比带外部同步的连续 payload 传输重得多。 diff --git a/docs/superpowers/plans/2026-06-24-p2p-transport-trim.md b/docs/superpowers/plans/2026-06-24-p2p-transport-trim.md new file mode 100644 index 0000000..fa998b4 --- /dev/null +++ b/docs/superpowers/plans/2026-06-24-p2p-transport-trim.md @@ -0,0 +1,650 @@ +# P2P Transport Trim Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Trim the UDMA P2P performance demo to expose only `direct_urma`, `memory`, and `data_as_flag`, with `direct_urma` using the current parallel multi-jetty implementation internally. + +**Architecture:** Keep one UDMA benchmark path and two peer-memory comparison paths. The public transport enum/parser/CSV/docs will only know the three user-facing names, while the `direct_urma` host launcher calls the existing parallel multi-jetty kernel. Remove obsolete single-WQE, multi-WQE, serial multi-jetty, and fixed-WQE benchmark paths. + +**Tech Stack:** C++14, Ascend C kernels, Bash demo runners, host-only C++ unit tests, GitHub PR docs. + +## Global Constraints + +- User-facing transport names must be exactly `direct_urma`, `memory`, and `data_as_flag`. +- `direct_urma` must use the current parallel multi-jetty implementation internally. +- `direct_urma` with `block_dim=1` and one QP must behave like the previous single-QP direct URMA path. +- `direct_urma` with `block_dim=N` and `TILEXR_UDMA_QP_NUM=N` must use up to `N` QPs/jettys in parallel. +- Keep QP-indexed device helpers and UDMA layout support. +- Remove fixed-WQE-only window sizing and mismatch-check logic. +- Do not touch unrelated untracked files such as `.kilo/`. +- Source repository guidance from `CLAUDE.md` remains in effect. + +--- + +## File Structure + +- `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`: host-only behavior tests for supported transports, rejected aliases, CSV formatting, window sizing, and aggregation. +- `tests/udma/demo/tilexr_udma_p2p_perf_config.h`: transport enum, parsing, names, validation, window sizing, effective byte accounting, mismatch checking, CSV row formatting, and row aggregation. +- `tests/udma/demo/tilexr_udma_demo.cpp`: host launch declarations and transport dispatch. This should dispatch `P2PTransport::DirectUrma` to the parallel multi-jetty launcher. +- `tests/udma/demo/tilexr_udma_demo_kernel.cpp`: Ascend C benchmark kernels and launch wrappers. Keep the parallel multi-jetty kernel, rename its launcher to the public `launch_tilexr_udma_p2p_perf`, and remove obsolete benchmark kernels/wrappers. +- `tests/udma/demo/run_tilexr_udma_p2p_perf.sh`: runner argument handling and `TILEXR_UDMA_QP_NUM` setup. +- `tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh`: sweep defaults, already expected to remain `direct_urma,memory,data_as_flag`. +- `docs/TileXR_UDMA_P2P_PERF_GUIDE.md` and `tests/udma/demo/README.md`: user-facing documentation. + +--- + +### Task 1: Update Config Tests First + +**Files:** +- Modify: `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp` +- Test: `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp` + +**Interfaces:** +- Consumes: current `TileXR::Demo::P2PTransport`, `ParseP2PTransport`, `P2PTransportName`, `P2PTransportWindowBytes`, `P2PEffectiveTransferBytes`, `CountP2PTransportMismatches`, `FormatP2PPerfCsvRow`, `ValidateP2PPerfOptions` +- Produces: expected behavior that later implementation must satisfy: + - only `DirectUrma`, `Memory`, `DataAsFlag`, and `Invalid` transport enum values are referenced by tests + - obsolete transport strings parse to `Invalid` + - `direct_urma` CSV rows keep the name `direct_urma` + - direct URMA effective bytes are the payload bytes, not `payload * blockDim` + +- [ ] **Step 1: Replace transport name assertions** + +Replace the block after `DirectionName` so it only checks the three supported names: + +```cpp + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DirectUrma) == "direct_urma", + "direct_urma transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::Memory) == "memory", + "memory transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DataAsFlag) == "data_as_flag", + "data_as_flag transport name mismatch"); +``` + +- [ ] **Step 2: Replace parse assertions** + +Keep the positive parse checks for `direct_urma`, `memory`, and `data_as_flag`, and add explicit rejection checks: + +```cpp + Require(TileXR::Demo::ParseP2PTransport("direct_urma") == TileXR::Demo::P2PTransport::DirectUrma, + "direct_urma transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("udma") == TileXR::Demo::P2PTransport::DirectUrma, + "udma alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory") == TileXR::Demo::P2PTransport::Memory, + "memory transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("data_as_flag") == TileXR::Demo::P2PTransport::DataAsFlag, + "data_as_flag transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_wqe") == TileXR::Demo::P2PTransport::Invalid, + "direct_urma_multi_wqe must be rejected"); + Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty") == TileXR::Demo::P2PTransport::Invalid, + "direct_urma_multi_jetty must be rejected"); + Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty_parallel") == TileXR::Demo::P2PTransport::Invalid, + "direct_urma_multi_jetty_parallel must be rejected"); + Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty_parallel_fixed_wqe") == + TileXR::Demo::P2PTransport::Invalid, + "direct_urma_multi_jetty_parallel_fixed_wqe must be rejected"); +``` + +- [ ] **Step 3: Remove fixed-WQE helper assertions** + +Delete assertions that call: + +```cpp +TileXR::Demo::P2PFixedWqeStrideBytes(...) +TileXR::Demo::P2PFixedWqeWindowBytes(...) +TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DirectUrmaMultiJettyParallelFixedWqe, ...) +``` + +Add this replacement assertion: + +```cpp + Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DirectUrma, 4096, 8) == 4096, + "direct_urma window must equal payload bytes"); +``` + +- [ ] **Step 4: Replace validation assertions for removed transports** + +Remove validation checks for `DirectUrmaMultiWqe`, `DirectUrmaMultiJetty`, `DirectUrmaMultiJettyParallel`, and `DirectUrmaMultiJettyParallelFixedWqe`. Add this direct URMA validation check: + +```cpp + options.transport = TileXR::Demo::P2PTransport::DirectUrma; + options.traffic = TileXR::Demo::P2PTraffic::BiDir; + options.blockDim = 8; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "valid direct_urma multi-jetty options rejected"); +``` + +- [ ] **Step 5: Replace mismatch-check assertions** + +Delete the `fixedWqeBytes` setup and the two fixed-WQE mismatch-check assertions. Add this simple direct URMA assertion: + +```cpp + Require(TileXR::Demo::CountP2PTransportMismatches( + bytes, pattern, 4096, TileXR::Demo::P2PTransport::DirectUrma, 8) == 1, + "direct_urma mismatch checker must validate payload bytes"); +``` + +This assertion comes after `bytes[17] ^= 0xff;`. + +- [ ] **Step 6: Replace CSV assertions for removed transports** + +Delete the CSV assertions for `DirectUrmaMultiWqe`, `DirectUrmaMultiJetty`, `DirectUrmaMultiJettyParallel`, and `DirectUrmaMultiJettyParallelFixedWqe`. Add this direct URMA multi-block CSV assertion: + +```cpp + row.transport = TileXR::Demo::P2PTransport::DirectUrma; + row.traffic = TileXR::Demo::P2PTraffic::UniDir; + row.blockDim = 8; + const std::string directUrmaParallelCsv = TileXR::Demo::FormatP2PPerfCsvRow(row); + Require(directUrmaParallelCsv == + "direct_urma,unidir,8,1to0,1,0,2,4096,20,8.000,0.000,0.000,0.512,0.512,0,0,logs/run\n", + "direct_urma parallel csv row mismatch"); +``` + +- [ ] **Step 7: Run focused unit test and verify RED** + +Run: + +```powershell +cmake --build tests/udma/build --target test_tilexr_udma_p2p_perf_config +``` + +If the build directory does not exist, run: + +```powershell +cmake -S tests/udma -B tests/udma/build +cmake --build tests/udma/build --target test_tilexr_udma_p2p_perf_config +``` + +Expected before production changes: build fails because enum members such as `DirectUrmaMultiWqe` still exist in production but tests now expect old strings to parse as `Invalid`, or the executable fails with one of the new rejection messages. + +- [ ] **Step 8: Commit failing tests only** + +Do not commit a failing test state. This step is intentionally a review gate: inspect `git diff tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`, then proceed to Task 2 without committing. + +--- + +### Task 2: Trim Config Helpers + +**Files:** +- Modify: `tests/udma/demo/tilexr_udma_p2p_perf_config.h` +- Test: `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp` + +**Interfaces:** +- Consumes: failing tests from Task 1 +- Produces: + - `enum class P2PTransport { DirectUrma, Memory, DataAsFlag, Invalid }` + - `ParseP2PTransport("direct_urma"|"udma") -> DirectUrma` + - removed transport strings return `Invalid` + - `P2PTransportWindowBytes(DirectUrma, bytes, blockDim) -> bytes` + - `P2PEffectiveTransferBytes(DirectUrma, bytes, blockDim) -> bytes` + +- [ ] **Step 1: Trim `P2PTransport` enum** + +Change the enum to: + +```cpp +enum class P2PTransport { + DirectUrma, + Memory, + DataAsFlag, + Invalid, +}; +``` + +- [ ] **Step 2: Trim `P2PTransportName`** + +Keep only: + +```cpp +inline const char* P2PTransportName(P2PTransport transport) +{ + switch (transport) { + case P2PTransport::DirectUrma: + return "direct_urma"; + case P2PTransport::Memory: + return "memory"; + case P2PTransport::DataAsFlag: + return "data_as_flag"; + default: + return "invalid"; + } +} +``` + +- [ ] **Step 3: Trim `ParseP2PTransport`** + +Keep only supported names and aliases: + +```cpp +inline P2PTransport ParseP2PTransport(const std::string& name) +{ + if (name == "direct_urma" || name == "udma") { + return P2PTransport::DirectUrma; + } + if (name == "memory" || name == "ipc" || name == "datacopy") { + return P2PTransport::Memory; + } + if (name == "data_as_flag" || name == "data-as-flag" || name == "daf") { + return P2PTransport::DataAsFlag; + } + return P2PTransport::Invalid; +} +``` + +- [ ] **Step 4: Remove fixed-WQE helpers** + +Delete: + +```cpp +inline uint64_t P2PAlignUp(uint64_t value, uint64_t alignment) +inline uint64_t P2PFixedWqeStrideBytes(uint64_t payloadBytes) +inline uint64_t P2PFixedWqeWindowBytes(uint64_t payloadBytes, uint32_t blockDim) +``` + +Then simplify: + +```cpp +inline uint64_t P2PTransportWindowBytes(P2PTransport transport, uint64_t payloadBytes, uint32_t blockDim) +{ + (void)blockDim; + return P2PTransportWindowBytes(transport, payloadBytes); +} +``` + +- [ ] **Step 5: Simplify effective bytes** + +Replace `P2PEffectiveTransferBytes` with: + +```cpp +inline uint64_t P2PEffectiveTransferBytes(P2PTransport transport, uint64_t payloadBytes, uint32_t blockDim) +{ + (void)transport; + (void)blockDim; + return payloadBytes; +} +``` + +- [ ] **Step 6: Update validation message** + +Change the invalid transport message to: + +```cpp +return fail("transport must be direct_urma, memory, or data_as_flag"); +``` + +- [ ] **Step 7: Simplify mismatch checker** + +Replace `CountP2PTransportMismatches` with: + +```cpp +inline uint64_t CountP2PTransportMismatches( + const std::vector& data, uint32_t pattern, uint64_t payloadBytes, + P2PTransport transport, uint32_t blockDim) +{ + (void)transport; + (void)blockDim; + return CountP2PMismatches(data, pattern, payloadBytes); +} +``` + +- [ ] **Step 8: Run focused unit test and verify GREEN** + +Run: + +```powershell +cmake --build tests/udma/build --target test_tilexr_udma_p2p_perf_config +ctest --test-dir tests/udma/build -R test_tilexr_udma_p2p_perf_config --output-on-failure +``` + +Expected: target builds and the config test passes. + +- [ ] **Step 9: Commit config test and helper changes** + +Run: + +```powershell +git add tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp tests/udma/demo/tilexr_udma_p2p_perf_config.h +git commit -m "test: trim P2P transport config modes" +``` + +--- + +### Task 3: Route `direct_urma` To Parallel Multi-Jetty And Remove Obsolete Kernels + +**Files:** +- Modify: `tests/udma/demo/tilexr_udma_demo.cpp` +- Modify: `tests/udma/demo/tilexr_udma_demo_kernel.cpp` +- Test: UDMA demo build target + +**Interfaces:** +- Consumes: + - `P2PTransport::DirectUrma` + - `launch_tilexr_udma_p2p_perf(uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic)` +- Produces: + - `launch_tilexr_udma_p2p_perf` launches the parallel multi-jetty kernel body + - no host references to removed enum values + - no benchmark wrappers named `launch_tilexr_udma_p2p_perf_multi_wqe`, `launch_tilexr_udma_p2p_perf_multi_jetty`, `launch_tilexr_udma_p2p_perf_multi_jetty_parallel`, or `launch_tilexr_udma_p2p_perf_multi_jetty_parallel_fixed_wqe` + +- [ ] **Step 1: Remove obsolete extern declarations from host** + +In `tests/udma/demo/tilexr_udma_demo.cpp`, delete declarations for: + +```cpp +launch_tilexr_udma_p2p_perf_multi_wqe +launch_tilexr_udma_p2p_perf_multi_jetty +launch_tilexr_udma_p2p_perf_multi_jetty_parallel +launch_tilexr_udma_p2p_perf_multi_jetty_parallel_fixed_wqe +``` + +Keep the existing `launch_tilexr_udma_p2p_perf` declaration. + +- [ ] **Step 2: Simplify host dispatch** + +In `LaunchP2PKernel`, remove branches for removed transports. The final structure should be: + +```cpp + if (options.transport == TileXR::Demo::P2PTransport::Memory) { + launch_tilexr_memory_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic); + return; + } + if (options.transport == TileXR::Demo::P2PTransport::DataAsFlag) { + launch_tilexr_data_as_flag_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic); + return; + } + launch_tilexr_udma_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic); +``` + +- [ ] **Step 3: Remove fixed-WQE host window special cases** + +In `RunP2PPerfMode`, delete `useFixedWqeSharedWindow` and `useSeparateDebugBuffer`. Replace the offset calculations with: + +```cpp + const uint64_t dstOffset = useIpcTransport ? TileXR::IPC_DATA_OFFSET : dstWindowBytes; + const uint64_t localDstOffset = dstWindowBytes; + const uint64_t debugOffset = useIpcTransport ? localDstOffset + dstWindowBytes : dstOffset + dstWindowBytes; + const uint64_t payloadBytes = debugOffset + kP2PDebugWords * sizeof(uint32_t); + const uint64_t registeredPayloadBytes = payloadBytes; +``` + +Remove `debugMemory` allocation and cleanup if it is only used by the fixed-WQE path after this change. + +- [ ] **Step 4: Simplify host source/destination initialization** + +Replace: + +```cpp +const bool initSrc = !useFixedWqeSharedWindow || IsP2PSourceRank(rank, options); +const bool initDst = !useFixedWqeSharedWindow || IsP2PReceiveRank(rank, options); +``` + +with: + +```cpp +const bool initSrc = true; +const bool initDst = true; +``` + +or inline the copy conditions so both local source and destination buffers are initialized on every rank. + +- [ ] **Step 5: Simplify UDMA QP support warning** + +Near the CLI option handling in `tilexr_udma_demo.cpp`, replace conditions that check removed transports: + +```cpp +p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrmaMultiWqe || +p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrmaMultiJetty || +p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrmaMultiJettyParallel || +p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrmaMultiJettyParallelFixedWqe +``` + +with: + +```cpp +p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrma +``` + +- [ ] **Step 6: Rename the parallel multi-jetty kernel body** + +In `tests/udma/demo/tilexr_udma_demo_kernel.cpp`, delete the old `tilexr_udma_p2p_perf_kernel` body and rename: + +```cpp +tilexr_udma_p2p_perf_multi_jetty_parallel_kernel +``` + +to: + +```cpp +tilexr_udma_p2p_perf_kernel +``` + +The resulting `tilexr_udma_p2p_perf_kernel` must retain the parallel logic: + +```cpp +uint32_t qpNum = enabled ? TileXR::GetUDMAInfo(args)->qpNum : 0; +uint32_t jettyCount = blockNum < qpNum ? blockNum : qpNum; +if (blockIdx >= jettyCount) { + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + return; +} +TileXRUdmaDemoWqeSlice(bytes, jettyCount, blockIdx, offset, sliceBytes); +TileXR::UDMAPutNbiQp(args, peer, blockIdx, src + offset, dstByteOffset + offset, sliceBytes); +uint32_t status = TileXR::UDMAQuietStatusQp(args, peer, blockIdx); +``` + +- [ ] **Step 7: Delete obsolete benchmark kernels** + +Delete these kernel functions: + +```cpp +tilexr_udma_p2p_perf_multi_wqe_kernel +tilexr_udma_p2p_perf_multi_jetty_kernel +tilexr_udma_p2p_perf_multi_jetty_parallel_fixed_wqe_kernel +``` + +Keep helper `TileXRUdmaDemoWqeSlice` because the renamed direct URMA kernel uses it. Delete `TileXRUdmaDemoFixedWqeSlice` if it becomes unused. + +- [ ] **Step 8: Delete obsolete launch wrappers** + +Delete wrappers: + +```cpp +launch_tilexr_udma_p2p_perf_multi_wqe +launch_tilexr_udma_p2p_perf_multi_jetty +launch_tilexr_udma_p2p_perf_multi_jetty_parallel +launch_tilexr_udma_p2p_perf_multi_jetty_parallel_fixed_wqe +``` + +Make existing `launch_tilexr_udma_p2p_perf` launch the renamed parallel kernel: + +```cpp +void launch_tilexr_udma_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic) +{ + tilexr_udma_p2p_perf_kernel<<>>( + commArgs, src, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic); +} +``` + +- [ ] **Step 9: Search for removed enum/function references** + +Run: + +```powershell +rg -n "DirectUrmaMulti|direct_urma_multi|multi_wqe|multi_jetty_parallel_fixed_wqe|fixed_wqe|launch_tilexr_udma_p2p_perf_multi" tests src docs +``` + +Expected after Task 3 code changes but before docs cleanup: only docs or tests planned for Task 4 may still mention removed user-facing names. + +- [ ] **Step 10: Build demo/unit targets** + +Run: + +```powershell +cmake --build tests/udma/build +ctest --test-dir tests/udma/build -R "test_tilexr_udma_p2p_perf_config|test_tilexr_udma_transport_layout" --output-on-failure +``` + +Expected: build succeeds; focused host-only tests pass. If CANN/Ascend build dependencies are unavailable on this host, capture the exact failure and continue to docs cleanup. + +- [ ] **Step 11: Commit host/kernel trim** + +Run: + +```powershell +git add tests/udma/demo/tilexr_udma_demo.cpp tests/udma/demo/tilexr_udma_demo_kernel.cpp +git commit -m "feat: fold P2P direct URMA into parallel jetty path" +``` + +--- + +### Task 4: Update Scripts, Documentation, And PR Description + +**Files:** +- Modify: `tests/udma/demo/run_tilexr_udma_p2p_perf.sh` +- Modify: `tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh` +- Modify: `docs/TileXR_UDMA_P2P_PERF_GUIDE.md` +- Modify: `tests/udma/demo/README.md` +- External: GitHub PR #40 description + +**Interfaces:** +- Consumes: supported transport names from Task 2 +- Produces: scripts and docs that mention only `direct_urma`, `memory`, and `data_as_flag` + +- [ ] **Step 1: Update runner QP setup** + +In `tests/udma/demo/run_tilexr_udma_p2p_perf.sh`, replace the transport conditional with: + +```bash +if [ "${transport}" = "direct_urma" ] || [ "${transport}" = "udma" ]; then + export TILEXR_UDMA_QP_NUM="${block_dim}" +else + export TILEXR_UDMA_QP_NUM="${TILEXR_UDMA_QP_NUM:-1}" +fi +``` + +- [ ] **Step 2: Verify sweep defaults** + +Confirm `tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh` default remains: + +```bash +transports_csv=${8:-direct_urma,memory,data_as_flag} +``` + +If it differs, change it to that exact line. + +- [ ] **Step 3: Update docs transport model** + +In `docs/TileXR_UDMA_P2P_PERF_GUIDE.md`, update the transport list to: + +```markdown +The P2P perf mode supports three user-facing transport modes: + +- `direct_urma`: registered-memory UDMA transfer. Internally this path uses + the parallel multi-jetty kernel; `block_dim=1` with one QP is the single-jetty + baseline, while `block_dim=N` with `TILEXR_UDMA_QP_NUM=N` uses up to `N` + QPs/jettys in parallel. +- `memory`: peer-memory IPC comparison using Ascend C `DataCopyPad`. +- `data_as_flag`: peer-memory IPC comparison where each 512B block carries + 480B payload plus a 32B ready flag. +``` + +Also update argument descriptions so `transport` says: + +```markdown +- `transport`: `direct_urma`, `memory`, or `data_as_flag`; default is `direct_urma`. +``` + +- [ ] **Step 4: Remove obsolete docs wording** + +Run: + +```powershell +rg -n "direct_urma_multi|multi-WQE|multi WQE|fixed-WQE|fixed_wqe|direct_urma_multi_jetty_parallel" docs tests/udma/demo +``` + +Edit `docs/TileXR_UDMA_P2P_PERF_GUIDE.md` and `tests/udma/demo/README.md` until no user-facing obsolete transport names remain. It is acceptable for the design spec and plan files under `docs/superpowers/` to mention removed names as historical scope. + +- [ ] **Step 5: Update PR #40 body with `gh`** + +Use the installed GitHub CLI full path if `gh` is not active in `PATH`: + +```powershell +& 'C:\Users\h30059441\AppData\Local\GitHubCLI\bin\gh.exe' pr edit 40 --repo LingquLab/TileXR --body @" +## Summary + +Extend the UDMA P2P performance demo with a trimmed three-mode transport matrix. + +This PR adds: + +- `test_type=4` UDMA P2P performance mode and usage documentation +- `direct_urma` registered-memory UDMA transfer with built-in parallel multi-jetty support +- `memory` peer-memory IPC comparison +- `data_as_flag` peer-memory IPC comparison with embedded ready flags +- unidirectional and bidirectional P2P traffic modes +- CSV output, plotting helper, P2P runner script, and concurrency sweep script +- host-side unit coverage for P2P config parsing/window sizing and UDMA transport layout metadata + +## Test plan + +- [x] `git diff --cached --check` +- [ ] `bash tests/udma/run_tests.sh` +- [ ] Run `tests/udma/demo/run_tilexr_udma_p2p_perf.sh` on supported Ascend hardware +- [ ] Run `tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh` on supported Ascend hardware +"@ +``` + +- [ ] **Step 6: Run final searches** + +Run: + +```powershell +rg -n "direct_urma_multi|multi-WQE|multi WQE|fixed-WQE|fixed_wqe|DirectUrmaMulti|launch_tilexr_udma_p2p_perf_multi" tests src docs --glob "!docs/superpowers/**" +``` + +Expected: no matches. + +- [ ] **Step 7: Run verification** + +Run: + +```powershell +git diff --check +ctest --test-dir tests/udma/build -R "test_tilexr_udma_p2p_perf_config|test_tilexr_udma_transport_layout" --output-on-failure +``` + +If `tests/udma/build` is unavailable, run: + +```powershell +cmake -S tests/udma -B tests/udma/build +cmake --build tests/udma/build +ctest --test-dir tests/udma/build -R "test_tilexr_udma_p2p_perf_config|test_tilexr_udma_transport_layout" --output-on-failure +``` + +Expected: whitespace check passes; focused tests pass. If environment dependencies prevent building, record the exact command and failure. + +- [ ] **Step 8: Commit docs/script cleanup** + +Run: + +```powershell +git add tests/udma/demo/run_tilexr_udma_p2p_perf.sh tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh docs/TileXR_UDMA_P2P_PERF_GUIDE.md tests/udma/demo/README.md +git commit -m "docs: describe trimmed P2P transport modes" +``` + +--- + +## Self-Review Checklist + +- Spec coverage: + - Three transport names covered in Tasks 1, 2, and 4. + - `direct_urma` folded into parallel multi-jetty path covered in Task 3. + - Removed modes covered in Tasks 1, 2, 3, and 4. + - Script QP behavior covered in Task 4. + - TDD red/green covered in Tasks 1 and 2. +- Placeholder scan: no unfinished placeholder wording is intentional in executable steps. +- Type consistency: + - `P2PTransport::DirectUrma`, `Memory`, `DataAsFlag`, and `Invalid` are the only public enum values used after Task 2. + - `launch_tilexr_udma_p2p_perf` remains the host-visible UDMA launcher name. diff --git a/docs/superpowers/plans/2026-06-26-data-as-flag-epoch-ordered.md b/docs/superpowers/plans/2026-06-26-data-as-flag-epoch-ordered.md new file mode 100644 index 0000000..7b93261 --- /dev/null +++ b/docs/superpowers/plans/2026-06-26-data-as-flag-epoch-ordered.md @@ -0,0 +1,1215 @@ +# DataAsFlag Epoch-Ordered P2P Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a side-by-side `data_as_flag_epoch_ordered` P2P transport that uses per-launch epochs and ordered payload/flag MTE3 writes before later replacing legacy `data_as_flag`. + +**Architecture:** The implementation extends the existing UDMA P2P demo transport table, keeps the legacy data-as-flag helpers intact, and adds epoch-ordered helper APIs in `src/include/tilexr_data_as_flag.h`. The new P2P kernel reuses the existing data-as-flag role and slice logic, but sender writes payload and epoch flag in separate MTE3 stages while receiver waits on one batch commit flag. + +**Tech Stack:** C++14 host code, AscendC AICore device code, CMake UDMA test targets, remote Ascend950 validation through `docs/ASCEND_REMOTE_BUILD_RUNBOOK.md`. + +## Global Constraints + +- Keep the existing block layout: `512B = 480B payload + 32B flag`. +- Keep ready information embedded in the data-as-flag window; do not move it to `SyncCollectives` or an independent flag region. +- Use per-launch `magic` and `step` to build a 64-bit epoch. +- Do not depend on payload and flag visibility within the same MTE3 operation. +- Phase 1 must keep legacy `data_as_flag` selectable for A/B comparison. +- `data_as_flag_epoch_ordered` must use the IPC peer data window and the same window byte calculation as legacy `data_as_flag`. +- Remote validation targets the existing Ascend environment in `docs/ASCEND_REMOTE_BUILD_RUNBOOK.md`; prefer NPU6/7 if 4/5 or 2/3 are suspect. +- Do not store the remote password in repo files or scripts. + +--- + +## File Structure + +- Modify `tests/udma/demo/tilexr_udma_p2p_perf_config.h` + - Add the `DataAsFlagEpochOrdered` transport enum value and helpers. + - Keep legacy `DataAsFlag` behavior unchanged. +- Modify `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp` + - Add failing then passing config tests for the new transport. +- Modify `src/include/tilexr_data_as_flag.h` + - Add epoch construction and epoch-ordered send/receive helpers. + - Keep existing `DataAsFlagSend` and `DataAsFlagCheckAndRecv` untouched for legacy mode. +- Modify `tests/udma/demo/tilexr_udma_demo_kernel.cpp` + - Add the new epoch-ordered P2P kernel and launcher wrapper. + - Reuse existing data-as-flag role resolution and slicing helpers. +- Modify `tests/udma/demo/tilexr_udma_demo.cpp` + - Add the launcher declaration and host dispatch. + - Pass `magic` and `step` to the new mode. + - Ensure legacy data-as-flag per-iteration clear barriers do not apply to the new mode. +- Modify `tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp` + - Add source structure checks for the new helper, kernel, launcher, and host wiring. +- Optionally modify `tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh` + - Include `data_as_flag_epoch_ordered` in default sweeps only after smoke validation passes. +- Add CSV outputs under `run/csv_full/` only during validation, not as part of core implementation commits unless the user explicitly asks to keep results. + +--- + +### Task 1: Add Transport Config and Unit Tests + +**Files:** +- Modify: `tests/udma/demo/tilexr_udma_p2p_perf_config.h` +- Modify: `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp` + +**Interfaces:** +- Consumes: Existing `P2PTransport`, `P2PTransportName`, `ParseP2PTransport`, `P2PTransportWindowBytes`, `P2PTransportUsesIpc`, `P2PTransportBothRanksActive`, `ValidateP2PPerfOptions`, `FormatP2PPerfCsvRow`. +- Produces: + - `P2PTransport::DataAsFlagEpochOrdered` + - Transport name: `"data_as_flag_epoch_ordered"` + - Aliases: `"data-as-flag-epoch-ordered"`, `"daf_epoch_ordered"` + +- [ ] **Step 1: Write failing enum/name/parse tests** + +Add these checks near the existing `DataAsFlag` transport checks in `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`: + +```cpp + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered) == + "data_as_flag_epoch_ordered", + "data_as_flag_epoch_ordered transport name mismatch"); + Require(TileXR::Demo::ParseP2PTransport("data_as_flag_epoch_ordered") == + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, + "data_as_flag_epoch_ordered transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("data-as-flag-epoch-ordered") == + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, + "data-as-flag-epoch-ordered alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("daf_epoch_ordered") == + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, + "daf_epoch_ordered alias parse mismatch"); +``` + +- [ ] **Step 2: Write failing layout/helper tests** + +Add these checks after the existing data-as-flag window checks: + +```cpp + Require(TileXR::Demo::P2PTransportWindowBytes( + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 0) == 0, + "data_as_flag_epoch_ordered zero layout mismatch"); + Require(TileXR::Demo::P2PTransportWindowBytes( + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 480) == 512, + "data_as_flag_epoch_ordered 480B layout mismatch"); + Require(TileXR::Demo::P2PTransportWindowBytes( + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 481) == 1024, + "data_as_flag_epoch_ordered 481B layout mismatch"); + Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered), + "data_as_flag_epoch_ordered must use IPC peer window"); + Require(TileXR::Demo::P2PTransportBothRanksActive( + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, TileXR::Demo::P2PTraffic::UniDir), + "data_as_flag_epoch_ordered unidir must keep receiver active"); +``` + +- [ ] **Step 3: Write failing validation and CSV tests** + +Add these checks near existing `DataAsFlag` validation and CSV assertions: + +```cpp + options.transport = TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered; + options.traffic = TileXR::Demo::P2PTraffic::UniDir; + options.blockDim = 4; + options.maxBytes = 16384; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "valid data_as_flag_epoch_ordered options rejected"); + + row.transport = TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered; + row.traffic = TileXR::Demo::P2PTraffic::UniDir; + row.blockDim = 4; + const std::string epochOrderedCsv = TileXR::Demo::FormatP2PPerfCsvRow(row); + Require(epochOrderedCsv == + "data_as_flag_epoch_ordered,unidir,4,1to0,1,0,2,4096,20,8.000,0.000,0.000,0.512,0.512,0,0,logs/run\n", + "data_as_flag_epoch_ordered csv row mismatch"); +``` + +- [ ] **Step 4: Run the config test and confirm it fails** + +Run on a Linux/Ascend build environment: + +```bash +cd /home/h30059441/TileXR +source scripts/common_env.sh +cd tests/udma +bash build.sh +./install/bin/test_tilexr_udma_p2p_perf_config +``` + +Expected before implementation: compile failure mentioning `DataAsFlagEpochOrdered` is not a member of `P2PTransport`. + +- [ ] **Step 5: Implement transport config** + +Modify `tests/udma/demo/tilexr_udma_p2p_perf_config.h`: + +```cpp +enum class P2PTransport { + DirectUrma, + DirectUrmaPostOnly, + Memory, + MemoryConsume, + DataAsFlag, + DataAsFlagEpochOrdered, + Invalid, +}; +``` + +Add name mapping: + +```cpp + case P2PTransport::DataAsFlagEpochOrdered: + return "data_as_flag_epoch_ordered"; +``` + +Add parser mapping: + +```cpp + if (name == "data_as_flag_epoch_ordered" || + name == "data-as-flag-epoch-ordered" || + name == "daf_epoch_ordered") { + return P2PTransport::DataAsFlagEpochOrdered; + } +``` + +Update window bytes: + +```cpp + return (transport == P2PTransport::DataAsFlag || + transport == P2PTransport::DataAsFlagEpochOrdered) ? + DataAsFlagWindowBytes(payloadBytes) : payloadBytes; +``` + +Update IPC and active-rank helpers: + +```cpp +inline bool P2PTransportUsesIpc(P2PTransport transport) +{ + return transport == P2PTransport::Memory || + transport == P2PTransport::MemoryConsume || + transport == P2PTransport::DataAsFlag || + transport == P2PTransport::DataAsFlagEpochOrdered; +} + +inline bool P2PTransportBothRanksActive(P2PTransport transport, P2PTraffic traffic) +{ + return traffic == P2PTraffic::BiDir || + transport == P2PTransport::MemoryConsume || + transport == P2PTransport::DataAsFlag || + transport == P2PTransport::DataAsFlagEpochOrdered; +} +``` + +Update the validation error string: + +```cpp +return fail("transport must be direct_urma, direct_urma_post_only, memory, memory_consume, data_as_flag, or data_as_flag_epoch_ordered"); +``` + +- [ ] **Step 6: Run the config test and confirm it passes** + +Run: + +```bash +cd /home/h30059441/TileXR/tests/udma +bash build.sh +./install/bin/test_tilexr_udma_p2p_perf_config +``` + +Expected: exit code `0`. + +- [ ] **Step 7: Commit Task 1** + +```bash +git add tests/udma/demo/tilexr_udma_p2p_perf_config.h \ + tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp +git commit -m "feat: add epoch ordered data-as-flag transport config" +``` + +--- + +### Task 2: Add Epoch-Ordered Device Helpers + +**Files:** +- Modify: `src/include/tilexr_data_as_flag.h` + +**Interfaces:** +- Consumes: Existing constants `DATA_AS_FLAG_BLOCK_BYTES`, `DATA_AS_FLAG_PAYLOAD_BYTES`, `DATA_AS_FLAG_FLAG_BYTES`, `DATA_AS_FLAG_FLAG_OFFSET_BYTES`, `DataAsFlagBlockCountForPayloadBytes`, `DataAsFlagScratchBytes`, `DataAsFlagMaxRecvBlocks`, `DataAsFlagCopyPayloadToScratch`, `DataAsFlagCopyBatchToRecvGM`. +- Produces: + - `uint64_t DataAsFlagEpoch(int32_t magic, int32_t step)` + - `uint32_t DataAsFlagMaxEpochOrderedSendBlocks(uint32_t scratchBytes)` + - `uint32_t DataAsFlagMaxEpochOrderedRecvBlocks(uint32_t scratchBytes)` + - `uint32_t DataAsFlagSendEpochOrdered(...)` + - `bool DataAsFlagCheckAndRecvEpochOrdered(...)` + +- [ ] **Step 1: Add source guard tests before implementation** + +This task's implementation will be guarded in Task 5, but first verify absence manually: + +```bash +rg -n "DataAsFlagEpoch|DataAsFlagSendEpochOrdered|DataAsFlagCheckAndRecvEpochOrdered" src/include/tilexr_data_as_flag.h +``` + +Expected before implementation: no matches. + +- [ ] **Step 2: Add epoch helper** + +Add this helper in `src/include/tilexr_data_as_flag.h` outside the AscendC-only block so unit/source checks can find it: + +```cpp +TILEXR_DATA_AS_FLAG_INLINE uint64_t DataAsFlagEpoch(int32_t magic, int32_t step) +{ + return (static_cast(static_cast(magic)) << 32) | + static_cast(step); +} +``` + +- [ ] **Step 3: Add epoch ordered batch capacity helpers** + +Inside `#if TILEXR_ASCENDC_AICORE_COMPILE`, add: + +```cpp +__aicore__ inline uint32_t DataAsFlagMaxEpochOrderedSendBlocks(uint32_t scratchBytes) +{ + uint32_t blocks = scratchBytes / DATA_AS_FLAG_BLOCK_BYTES; + while (blocks > 0U) { + const uint64_t requiredBytes = + static_cast(blocks) * DATA_AS_FLAG_BLOCK_BYTES + + static_cast(blocks) * DATA_AS_FLAG_FLAG_BYTES; + if (static_cast(scratchBytes) >= requiredBytes) { + return blocks; + } + --blocks; + } + return 0U; +} + +__aicore__ inline uint32_t DataAsFlagMaxEpochOrderedRecvBlocks(uint32_t scratchBytes) +{ + return DataAsFlagMaxRecvBlocks(scratchBytes); +} +``` + +This conservative v1 uses one scratch region for packed payload blocks and a second scratch region for epoch flags. + +- [ ] **Step 4: Add flag fill and flag copy helpers** + +Add: + +```cpp +__aicore__ inline void DataAsFlagFillEpochFlags( + AscendC::LocalTensor& flagScratch, + uint32_t blockCount, + uint64_t epoch) +{ + AscendC::LocalTensor flagWords = flagScratch.template ReinterpretCast(); + const uint32_t words = blockCount * DATA_AS_FLAG_FLAG_BYTES / sizeof(uint64_t); + for (uint32_t i = 0; i < words; ++i) { + flagWords.SetValue(i, epoch); + } + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); +} + +__aicore__ inline void DataAsFlagCopyEpochFlagsToGM( + __gm__ uint8_t* dstDataAsFlagGM, + uint32_t dstBlockOffset, + AscendC::LocalTensor& flagScratch, + uint32_t batchBlocks) +{ + AscendC::GlobalTensor flagGlobal; + flagGlobal.SetGlobalBuffer( + dstDataAsFlagGM + static_cast(dstBlockOffset) * DATA_AS_FLAG_BLOCK_BYTES + + DATA_AS_FLAG_FLAG_OFFSET_BYTES); + AscendC::DataCopyExtParams flagParams { + static_cast(batchBlocks), + DATA_AS_FLAG_FLAG_BYTES, + 0U, + DATA_AS_FLAG_PAYLOAD_BYTES / DATA_AS_FLAG_ALIGN_BYTES, + 0U}; + AscendC::DataCopyPadExtParams padParams {false, 0U, 0U, 0U}; + AscendC::DataCopyPad(flagGlobal, flagScratch, flagParams, padParams); +} +``` + +- [ ] **Step 5: Add `DataAsFlagSendEpochOrdered`** + +Add: + +```cpp +__aicore__ inline uint32_t DataAsFlagSendEpochOrdered( + __gm__ uint8_t* dstDataAsFlagGM, + const __gm__ uint8_t* srcGM, + uint64_t dataBytes, + uint64_t epoch, + AscendC::LocalTensor& scratch) +{ + if (dstDataAsFlagGM == nullptr || srcGM == nullptr || dataBytes == 0U) { + return 0U; + } + + const uint32_t totalBlocks = DataAsFlagBlockCountForPayloadBytes(dataBytes); + const uint32_t batchCapacity = DataAsFlagMaxEpochOrderedSendBlocks(DataAsFlagScratchBytes(scratch)); + if (batchCapacity == 0U) { + return 0U; + } + + uint32_t sentBlocks = 0U; + uint64_t sentBytes = 0U; + while (sentBlocks < totalBlocks) { + const uint32_t remainingBlocks = totalBlocks - sentBlocks; + const uint32_t batchBlocks = remainingBlocks < batchCapacity ? remainingBlocks : batchCapacity; + const uint64_t maxBatchBytes = static_cast(batchBlocks) * DATA_AS_FLAG_PAYLOAD_BYTES; + const uint64_t remainingBytes = dataBytes - sentBytes; + const uint32_t batchPayloadBytes = static_cast( + remainingBytes < maxBatchBytes ? remainingBytes : maxBatchBytes); + const uint32_t fullBlocks = batchPayloadBytes / DATA_AS_FLAG_PAYLOAD_BYTES; + const uint32_t tailBytes = batchPayloadBytes % DATA_AS_FLAG_PAYLOAD_BYTES; + + AscendC::LocalTensor payloadScratch = scratch; + AscendC::LocalTensor flagScratch = + scratch[static_cast(batchBlocks) * DATA_AS_FLAG_BLOCK_BYTES]; + + AscendC::Duplicate(payloadScratch, 0U, batchBlocks * DATA_AS_FLAG_BLOCK_BYTES); + AscendC::PipeBarrier(); + DataAsFlagCopyPayloadToScratch(payloadScratch, srcGM, sentBytes, fullBlocks, tailBytes); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + DataAsFlagCopyScratchToDataAsFlagGM(dstDataAsFlagGM, sentBlocks, payloadScratch, batchBlocks); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + + DataAsFlagFillEpochFlags(flagScratch, batchBlocks, epoch); + DataAsFlagCopyEpochFlagsToGM(dstDataAsFlagGM, sentBlocks, flagScratch, batchBlocks); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + + sentBlocks += batchBlocks; + sentBytes += batchPayloadBytes; + } + AscendC::PipeBarrier(); + return totalBlocks; +} +``` + +Note: the first MTE3 writes packed 512B blocks with zero flags. The second MTE3 writes current epoch flags. This satisfies the ordered protocol while avoiding a risky first version of strided payload-only GM writes. + +- [ ] **Step 6: Add commit flag polling helper** + +Add: + +```cpp +__aicore__ inline uint64_t DataAsFlagLoadEpochFlag( + const __gm__ uint8_t* dataAsFlagGM, + uint32_t blockIndex, + AscendC::LocalTensor& scratch) +{ + AscendC::GlobalTensor flagGlobal; + flagGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>( + const_cast<__gm__ uint8_t*>( + dataAsFlagGM + static_cast(blockIndex) * DATA_AS_FLAG_BLOCK_BYTES + + DATA_AS_FLAG_FLAG_OFFSET_BYTES))); + AscendC::LocalTensor flagLocal = scratch.template ReinterpretCast(); + AscendC::DataCopyExtParams params {1U, sizeof(uint64_t), 0U, 0U, 0U}; + AscendC::DataCopyPadExtParams padParams {false, 0U, 0U, 0U}; + AscendC::DataCopyPad(flagLocal, flagGlobal, params, padParams); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + return flagLocal.GetValue(0); +} +``` + +- [ ] **Step 7: Add strict batch checker** + +Add: + +```cpp +__aicore__ inline bool DataAsFlagCheckBatchEpochStrict( + const __gm__ uint8_t* dataAsFlagGM, + uint32_t blockOffset, + uint32_t batchBlocks, + uint64_t epoch, + AscendC::LocalTensor& scratch) +{ + if (batchBlocks == 0U) { + return false; + } + for (uint32_t i = 0; i < batchBlocks; ++i) { + if (DataAsFlagLoadEpochFlag(dataAsFlagGM, blockOffset + i, scratch) != epoch) { + return false; + } + } + return true; +} +``` + +- [ ] **Step 8: Add `DataAsFlagCheckAndRecvEpochOrdered`** + +Add: + +```cpp +__aicore__ inline bool DataAsFlagCheckAndRecvEpochOrdered( + const __gm__ uint8_t* dataAsFlagGM, + uint64_t dataBytes, + __gm__ uint8_t* recvGM, + uint64_t epoch, + AscendC::LocalTensor& recvScratch, + bool strict) +{ + if (dataAsFlagGM == nullptr || recvGM == nullptr) { + return false; + } + if (dataBytes == 0U) { + return true; + } + + const uint32_t totalBlocks = DataAsFlagBlockCountForPayloadBytes(dataBytes); + const uint32_t batchCapacity = DataAsFlagMaxEpochOrderedRecvBlocks(DataAsFlagScratchBytes(recvScratch)); + if (batchCapacity == 0U) { + return false; + } + + uint32_t processedBlocks = 0U; + uint64_t processedBytes = 0U; + while (processedBlocks < totalBlocks) { + const uint32_t remainingBlocks = totalBlocks - processedBlocks; + const uint32_t batchBlocks = remainingBlocks < batchCapacity ? remainingBlocks : batchCapacity; + const uint32_t lastBlock = processedBlocks + batchBlocks - 1U; + while (DataAsFlagLoadEpochFlag(dataAsFlagGM, lastBlock, recvScratch) != epoch) { + } + if (strict && !DataAsFlagCheckBatchEpochStrict(dataAsFlagGM, processedBlocks, batchBlocks, epoch, recvScratch)) { + return false; + } + + const uint64_t remainingBytes = dataBytes - processedBytes; + const uint64_t maxBatchBytes = static_cast(batchBlocks) * DATA_AS_FLAG_PAYLOAD_BYTES; + const uint32_t batchBytes = static_cast( + remainingBytes < maxBatchBytes ? remainingBytes : maxBatchBytes); + DataAsFlagCopyBatchToRecvGM( + dataAsFlagGM, processedBlocks, processedBytes, batchBytes, recvGM, recvScratch); + processedBlocks += batchBlocks; + processedBytes += batchBytes; + } + AscendC::PipeBarrier(); + return true; +} +``` + +- [ ] **Step 9: Build and fix compiler issues** + +Run: + +```bash +cd /home/h30059441/TileXR +source scripts/common_env.sh +cd tests/udma +bash build.sh +``` + +Expected: build succeeds. If AscendC rejects `Duplicate` or `DataCopyPad` template parameters, adjust only the helper internals while preserving the same signatures and two-stage MTE3 protocol. + +- [ ] **Step 10: Commit Task 2** + +```bash +git add src/include/tilexr_data_as_flag.h +git commit -m "feat: add epoch ordered data-as-flag helpers" +``` + +--- + +### Task 3: Add P2P Kernel and Launcher + +**Files:** +- Modify: `tests/udma/demo/tilexr_udma_demo_kernel.cpp` + +**Interfaces:** +- Consumes: + - `TileXRUdmaDemoResolveDataAsFlagRole` + - `TileXRUdmaDemoDataAsFlagSlice` + - `TileXR::DataAsFlagEpoch` + - `TileXR::DataAsFlagSendEpochOrdered` + - `TileXR::DataAsFlagCheckAndRecvEpochOrdered` +- Produces: + - `tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel` + - `launch_tilexr_data_as_flag_epoch_ordered_p2p_perf` + +- [ ] **Step 1: Add failing source guard expectation** + +Temporarily add these checks to `tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp` in a new helper `TestDataAsFlagEpochOrderedSource()`: + +```cpp +void TestDataAsFlagEpochOrderedSource() +{ + const std::string kernelPath = "tests/udma/demo/tilexr_udma_demo_kernel.cpp"; + const std::string kernelText = ReadFile(kernelPath); + CheckContains(kernelPath, kernelText, "tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel"); + CheckContains(kernelPath, kernelText, "launch_tilexr_data_as_flag_epoch_ordered_p2p_perf"); + CheckContains(kernelPath, kernelText, "DataAsFlagEpoch(magic, step)"); + CheckContains(kernelPath, kernelText, "DataAsFlagSendEpochOrdered"); + CheckContains(kernelPath, kernelText, "DataAsFlagCheckAndRecvEpochOrdered"); + CheckContains(kernelPath, kernelText, "int32_t magic, int32_t step"); +} +``` + +Call it from `main()`: + +```cpp + TestDataAsFlagEpochOrderedSource(); +``` + +Run: + +```bash +cd /home/h30059441/TileXR/tests/udma +bash build.sh +./install/bin/test_tilexr_udma_p2p_source_guard +``` + +Expected before kernel implementation: source guard failure listing missing strings. + +- [ ] **Step 2: Add the epoch-ordered kernel** + +Add this kernel beside `tilexr_data_as_flag_p2p_perf_kernel` in `tests/udma/demo/tilexr_udma_demo_kernel.cpp`: + +```cpp +extern "C" __global__ __aicore__ void tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel( + GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR dstGM, GM_ADDR debugGM, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step) +{ + if constexpr (g_coreType == AscendC::AIV) { + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM); + auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM); + auto dst = reinterpret_cast<__gm__ uint8_t*>(dstGM); + + int32_t rank = args->rank; + uint32_t blockIdx = AscendC::GetBlockIdx(); + uint32_t blockNum = AscendC::GetBlockNum(); + const uint64_t epoch = TileXR::DataAsFlagEpoch(magic, step); + if (debug != nullptr && blockIdx == 0) { + debug[0] = TILEXR_UDMA_DEMO_MAGIC; + debug[1] = rank; + debug[2] = 1; + debug[3] = bytes; + debug[4] = pattern; + debug[5] = 0xffffffffu; + debug[6] = blockNum; + debug[7] = static_cast(epoch & 0xffffffffu); + } + + bool isSender = false; + bool isReceiver = false; + int32_t peer = -1; + if (blockNum == 0 || + !TileXRUdmaDemoResolveDataAsFlagRole(rank, srcRank, dstRank, traffic, isSender, isReceiver, peer) || + peer < 0 || peer >= args->rankSize) { + return; + } + + AscendC::GlobalTensor peerMems; + peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE); + GM_ADDR peerBase = peerMems.GetValue(peer); + GM_ADDR localBase = peerMems.GetValue(rank); + if (peerBase == nullptr || localBase == nullptr || dst == nullptr || (isSender && src == nullptr)) { + uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + } + return; + } + + uint32_t payloadOffset = 0; + uint32_t sliceBytes = 0; + uint32_t dataAsFlagOffset = 0; + TileXRUdmaDemoDataAsFlagSlice(bytes, blockNum, blockIdx, payloadOffset, sliceBytes, dataAsFlagOffset); + if (debug != nullptr && blockIdx < 8) { + debug[16 + blockIdx] = payloadOffset; + debug[24 + blockIdx] = sliceBytes; + } + if (sliceBytes == 0) { + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + return; + } + + AscendC::TPipe pipe; + AscendC::TBuf tBuf; + pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES); + AscendC::LocalTensor scratch = tBuf.Get(); + + uint32_t status = 0; + if (isSender) { + uint32_t sentBlocks = TileXR::DataAsFlagSendEpochOrdered( + reinterpret_cast<__gm__ uint8_t*>(peerBase + dstByteOffset + dataAsFlagOffset), + src + payloadOffset, sliceBytes, epoch, scratch); + if (sentBlocks == 0U) { + status = 3; + } + } + if (isReceiver && status == 0) { + bool strict = false; + bool received = TileXR::DataAsFlagCheckAndRecvEpochOrdered( + reinterpret_cast<__gm__ uint8_t*>(localBase + dstByteOffset + dataAsFlagOffset), + sliceBytes, dst + payloadOffset, epoch, scratch, strict); + if (!received) { + status = 4; + } + } + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + } + } +} +``` + +- [ ] **Step 3: Add launcher wrapper** + +Add near the existing `launch_tilexr_data_as_flag_p2p_perf`: + +```cpp +void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step) +{ + tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel<<>>( + commArgs, src, dst, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic, magic, step); +} +``` + +- [ ] **Step 4: Build and run source guard** + +Run: + +```bash +cd /home/h30059441/TileXR +source scripts/common_env.sh +cd tests/udma +bash build.sh +./install/bin/test_tilexr_udma_p2p_source_guard +``` + +Expected: build succeeds and source guard exits `0`. + +- [ ] **Step 5: Commit Task 3** + +```bash +git add tests/udma/demo/tilexr_udma_demo_kernel.cpp \ + tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp +git commit -m "feat: add epoch ordered data-as-flag p2p kernel" +``` + +--- + +### Task 4: Wire Host Launch and Clear-Window Behavior + +**Files:** +- Modify: `tests/udma/demo/tilexr_udma_demo.cpp` +- Modify: `tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp` + +**Interfaces:** +- Consumes: + - `P2PTransport::DataAsFlagEpochOrdered` + - `launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(...)` +- Produces: + - Host dispatch from CLI transport `data_as_flag_epoch_ordered`. + - No per-iteration clear-window barrier for the new transport. + +- [ ] **Step 1: Extend source guard for host wiring** + +Add checks to `TestDataAsFlagEpochOrderedSource()`: + +```cpp + const std::string hostPath = "tests/udma/demo/tilexr_udma_demo.cpp"; + const std::string hostText = ReadFile(hostPath); + CheckContains(hostPath, hostText, "launch_tilexr_data_as_flag_epoch_ordered_p2p_perf"); + CheckContains(hostPath, hostText, "P2PTransport::DataAsFlagEpochOrdered"); + CheckContains(hostPath, hostText, "magic, step"); + CheckContains(hostPath, hostText, "useLegacyDataAsFlagTransport"); +``` + +Run: + +```bash +cd /home/h30059441/TileXR/tests/udma +bash build.sh +./install/bin/test_tilexr_udma_p2p_source_guard +``` + +Expected before host implementation: source guard failure. + +- [ ] **Step 2: Add launcher declaration** + +In `tests/udma/demo/tilexr_udma_demo.cpp`, add: + +```cpp +extern void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step); +``` + +- [ ] **Step 3: Dispatch the new transport** + +In `LaunchP2PKernel`, add the new branch before the legacy `DataAsFlag` branch: + +```cpp + if (options.transport == TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered) { + launch_tilexr_data_as_flag_epoch_ordered_p2p_perf( + options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic, magic, step); + return; + } +``` + +- [ ] **Step 4: Split legacy clear behavior from new behavior** + +Replace: + +```cpp + const bool useDataAsFlagTransport = options.transport == TileXR::Demo::P2PTransport::DataAsFlag; +``` + +with: + +```cpp + const bool useLegacyDataAsFlagTransport = options.transport == TileXR::Demo::P2PTransport::DataAsFlag; +``` + +Then replace warmup/measured per-iteration data-as-flag clear branches to use `useLegacyDataAsFlagTransport` only: + +```cpp + if (useLegacyDataAsFlagTransport && IsP2PReceiveRank(rank, options) && + !ClearLocalPeerWindow(rank, commArgsHost, dstOffset, + TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes), "p2p data_as_flag warmup window")) { + ok = false; + break; + } + if (useLegacyDataAsFlagTransport && + !DemoBarrierAll(rank, rankSize, + "p2p data_as_flag warmup clear bytes=" + std::to_string(bytes) + + " iter=" + std::to_string(i))) { + ok = false; + break; + } +``` + +And measured: + +```cpp + if (useLegacyDataAsFlagTransport && IsP2PReceiveRank(rank, options) && + !ClearLocalPeerWindow(rank, commArgsHost, dstOffset, + TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes), "p2p data_as_flag measured window")) { + ok = false; + break; + } + if (useLegacyDataAsFlagTransport && + !DemoBarrierAll(rank, rankSize, + "p2p data_as_flag measured clear bytes=" + std::to_string(bytes))) { + ok = false; + break; + } +``` + +Keep the existing once-per-size IPC destination clear: + +```cpp + if (useIpcTransport && IsP2PReceiveRank(rank, options)) { + const uint64_t clearBytes = TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes); + ... + } +``` + +This one-time clear remains valid for epoch-ordered startup hygiene. + +- [ ] **Step 5: Build and run unit/source tests** + +Run: + +```bash +cd /home/h30059441/TileXR +source scripts/common_env.sh +cd tests/udma +bash build.sh +./install/bin/test_tilexr_udma_p2p_perf_config +./install/bin/test_tilexr_udma_p2p_source_guard +``` + +Expected: both tests exit `0`. + +- [ ] **Step 6: Commit Task 4** + +```bash +git add tests/udma/demo/tilexr_udma_demo.cpp \ + tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp +git commit -m "feat: wire epoch ordered data-as-flag p2p host path" +``` + +--- + +### Task 5: Add Strict Mode Host Control + +**Files:** +- Modify: `tests/udma/demo/tilexr_udma_demo_kernel.cpp` +- Modify: `tests/udma/demo/tilexr_udma_demo.cpp` +- Modify: `tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp` + +**Interfaces:** +- Consumes: `DataAsFlagCheckAndRecvEpochOrdered(..., bool strict)`. +- Produces: + - Environment variable `TILEXR_DATA_AS_FLAG_STRICT`. + - Launcher and kernel parameter `int32_t strict`. + +- [ ] **Step 1: Add source guard checks** + +Add checks: + +```cpp + CheckContains(hostPath, hostText, "TILEXR_DATA_AS_FLAG_STRICT"); + CheckContains(kernelPath, kernelText, "int32_t strict"); + CheckContains(kernelPath, kernelText, "strict != 0"); +``` + +Run source guard and expect failure before implementation. + +- [ ] **Step 2: Extend launcher signatures** + +In `tests/udma/demo/tilexr_udma_demo.cpp`, change the extern declaration: + +```cpp +extern void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step, int32_t strict); +``` + +In `tests/udma/demo/tilexr_udma_demo_kernel.cpp`, extend the launcher wrapper and kernel parameters with `int32_t strict`. + +- [ ] **Step 3: Pass strict from host** + +In `LaunchP2PKernel`, compute: + +```cpp + const int32_t dataAsFlagStrict = GetEnvFlag("TILEXR_DATA_AS_FLAG_STRICT", false) ? 1 : 0; +``` + +Pass it only to the epoch-ordered launcher: + +```cpp + launch_tilexr_data_as_flag_epoch_ordered_p2p_perf( + options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic, + magic, step, dataAsFlagStrict); +``` + +- [ ] **Step 4: Use strict in kernel** + +Replace: + +```cpp + bool strict = false; +``` + +with: + +```cpp + bool strictMode = strict != 0; +``` + +Pass `strictMode` to `DataAsFlagCheckAndRecvEpochOrdered`. + +- [ ] **Step 5: Build and run source guard** + +Run: + +```bash +cd /home/h30059441/TileXR +source scripts/common_env.sh +cd tests/udma +bash build.sh +./install/bin/test_tilexr_udma_p2p_source_guard +``` + +Expected: source guard exits `0`. + +- [ ] **Step 6: Commit Task 5** + +```bash +git add tests/udma/demo/tilexr_udma_demo.cpp \ + tests/udma/demo/tilexr_udma_demo_kernel.cpp \ + tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp +git commit -m "feat: add strict flag checks for epoch ordered data-as-flag" +``` + +--- + +### Task 6: Remote Smoke and Sweep Validation + +**Files:** +- Read: `docs/ASCEND_REMOTE_BUILD_RUNBOOK.md` +- May create local validation outputs under: `run/csv_full/` + +**Interfaces:** +- Consumes: Built demo binary `tests/udma/install/bin/tilexr_udma_demo`. +- Produces: + - Remote logs for focused smoke and sweep runs. + - Local CSV copies for old/new transport comparison. + +- [ ] **Step 1: Sync changed files to remote** + +Use Paramiko from local PowerShell. Keep the password in the environment only: + +```powershell +@' +import os +import pathlib +import paramiko + +host = "141.62.19.144" +user = "root" +password = os.environ["TILEXR_REMOTE_PASS"] +local_root = pathlib.Path(r"D:\workspace\TileXR") +remote_root = "/home/h30059441/TileXR" + +files = [ + "src/include/tilexr_data_as_flag.h", + "tests/udma/demo/tilexr_udma_demo.cpp", + "tests/udma/demo/tilexr_udma_demo_kernel.cpp", + "tests/udma/demo/tilexr_udma_p2p_perf_config.h", + "tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp", + "tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp", +] + +ssh = paramiko.SSHClient() +ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +ssh.connect(host, username=user, password=password, timeout=15, + banner_timeout=15, auth_timeout=15) +sftp = ssh.open_sftp() +for path in files: + sftp.put(str(local_root / path), remote_root + "/" + path) + print("uploaded", path) +sftp.close() +ssh.close() +'@ | python - +``` + +- [ ] **Step 2: Build and run tests remotely** + +Run: + +```powershell +@' +import os +import paramiko +host = "141.62.19.144" +user = "root" +password = os.environ["TILEXR_REMOTE_PASS"] +cmd = r''' +cd /home/h30059441/TileXR +pids=$(ps -eo pid=,comm= | awk '$2=="tilexr_udma_demo" {print $1}') +[ -n "$pids" ] && kill -TERM $pids || true +sleep 2 +pids=$(ps -eo pid=,comm= | awk '$2=="tilexr_udma_demo" {print $1}') +[ -n "$pids" ] && kill -KILL $pids || true +source scripts/common_env.sh +cd tests/udma +bash build.sh +bash run_tests.sh +''' +ssh = paramiko.SSHClient() +ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +ssh.connect(host, username=user, password=password, timeout=15, + banner_timeout=15, auth_timeout=15) +stdin, stdout, stderr = ssh.exec_command(cmd, get_pty=True, timeout=900) +for line in iter(stdout.readline, ""): + print(line, end="") +err = stderr.read().decode("utf-8", errors="replace") +if err: + print(err, end="") +rc = stdout.channel.recv_exit_status() +ssh.close() +raise SystemExit(rc) +'@ | python - +``` + +Expected: build succeeds, `run_tests.sh` summary reports PASS for layout, registry, P2P perf config, source guard, and single-process integration. + +- [ ] **Step 3: Run focused fast-path smoke on NPU6/7** + +Run a 4KB to 1MB smoke: + +```bash +cd /home/h30059441/TileXR +source scripts/common_env.sh +cd tests/udma +export TILEXR_COMM_ID=127.0.0.1:10267 +export TILEXR_DEMO_NPUS=2 +export TILEXR_DEMO_FIRST_NPU=6 +export LD_LIBRARY_PATH="$PWD/install/lib:$PWD/install/lib64:/home/h30059441/TileXR/install/lib:/home/h30059441/TileXR/install/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}" +log_dir="$PWD/logs/tilexr_daf_epoch_smoke_$(date +%Y%m%d_%H%M%S)" +mkdir -p "$log_dir" +csv="$log_dir/p2p_perf.csv" +bin="$PWD/install/bin/tilexr_udma_demo" +for rank in 0 1; do + RANK=$rank RANK_SIZE=2 TILEXR_P2P_LOG_DIR="$log_dir" TILEXR_P2P_CSV="$csv" "$bin" \ + 2 "$rank" 4 0 2 6 \ + 0 1 4096 1048576 2 \ + 20 5 1 "$csv" "$log_dir" data_as_flag_epoch_ordered \ + 1 unidir >"$log_dir/rank_${rank}.log" 2>&1 & + pids[$rank]=$! +done +ret=0 +for pid in ${pids[0]} ${pids[1]}; do + wait "$pid" || ret=$? +done +echo "log_dir=$log_dir" +cat "$csv" +exit $ret +``` + +Expected: all CSV rows have `status=0` and `errors=0`. + +- [ ] **Step 4: Run strict smoke** + +Repeat Step 3 with: + +```bash +export TILEXR_DATA_AS_FLAG_STRICT=1 +``` + +Use a smaller range: + +```text +8 65536 2 +``` + +Expected: all CSV rows have `status=0` and `errors=0`. + +- [ ] **Step 5: Run full unidir sweep 8B to 64MB** + +Run with: + +```text +min_bytes=8 +max_bytes=67108864 +step_factor=2 +iters=20 +warmup_iters=5 +transport=data_as_flag_epoch_ordered +block_dim=1 +traffic=unidir +``` + +Expected: 24 data rows, all `status=0`, all `errors=0`. + +- [ ] **Step 6: Download CSVs** + +Download the new sweep: + +```powershell +@' +import os +import pathlib +import paramiko +host = "141.62.19.144" +user = "root" +password = os.environ["TILEXR_REMOTE_PASS"] +local_csv = pathlib.Path(r"D:\workspace\TileXR\run\csv_full\data_as_flag_epoch_ordered_unidir_bd1_8B_64MB.csv") +ssh = paramiko.SSHClient() +ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +ssh.connect(host, username=user, password=password, timeout=15, + banner_timeout=15, auth_timeout=15) +stdin, stdout, stderr = ssh.exec_command( + "ls -td /home/h30059441/TileXR/tests/udma/logs/tilexr_daf_epoch_* 2>/dev/null | head -1", + timeout=30) +log_dir = stdout.read().decode("utf-8", errors="replace").strip() +err = stderr.read().decode("utf-8", errors="replace").strip() +if err: + raise RuntimeError(err) +if not log_dir: + raise RuntimeError("no tilexr_daf_epoch log directory found") +remote_csv = log_dir + "/p2p_perf.csv" +sftp = ssh.open_sftp() +sftp.get(remote_csv, str(local_csv)) +sftp.close() +ssh.close() +print("downloaded", local_csv) +'@ | python - +``` + +- [ ] **Step 7: Confirm no stale demo processes** + +Run: + +```bash +ps -eo pid=,comm=,args= | awk '$2=="tilexr_udma_demo" {print}' +``` + +Expected: no output. + +- [ ] **Step 8: Commit Task 6 if validation artifacts should be tracked** + +Only commit CSVs or plot changes if the user asks to keep them in the branch: + +```bash +git add run/csv_full/data_as_flag_epoch_ordered_unidir_bd1_8B_64MB.csv +git commit -m "test: add epoch ordered data-as-flag p2p sweep results" +``` + +If the user does not ask to track artifacts, do not commit this task. + +--- + +### Task 7: Decide Migration to Public `data_as_flag` + +**Files:** +- Modify later only after user approval: + - `tests/udma/demo/tilexr_udma_p2p_perf_config.h` + - `tests/udma/demo/tilexr_udma_demo.cpp` + - `tests/udma/demo/tilexr_udma_demo_kernel.cpp` + - `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp` + - `tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp` + - plotting scripts under `run/csv_full/` if needed + +**Interfaces:** +- Consumes: successful Task 6 validation evidence. +- Produces: a follow-up plan or patch that maps `data_as_flag` to the epoch-ordered implementation. + +- [ ] **Step 1: Summarize validation** + +Prepare a short table: + +```text +transport range block_dim strict status/errors +data_as_flag 8B-64MB 1 no ... +data_as_flag_epoch_ordered 8B-64MB 1 no ... +data_as_flag_epoch_ordered 8B-64KB 1 yes ... +``` + +- [ ] **Step 2: Ask for migration approval** + +Ask the user: + +```text +Validation is clean. Should I now replace legacy data_as_flag with the epoch-ordered implementation and keep data_as_flag_epoch_ordered as a temporary alias? +``` + +- [ ] **Step 3: Stop if approval is not explicit** + +Do not remove legacy `data_as_flag` in this plan without explicit user approval after validation. + +--- + +## Self-Review Checklist + +- Spec coverage: + - New transport config: Task 1. + - Per-launch epoch: Tasks 2, 3, 4. + - Embedded 512B block layout unchanged: Tasks 2 and 3. + - Separate ordered MTE3 payload/flag stages: Task 2. + - Fast receiver checks last flag: Task 2. + - Strict/debug path: Task 5 and Task 6 strict smoke. + - Side-by-side validation with old `data_as_flag`: Task 6 and Task 7. + - Later migration to public `data_as_flag`: Task 7. +- Placeholder scan: no red-flag placeholder words or unnamed tests remain. +- Type consistency: + - Enum name is consistently `DataAsFlagEpochOrdered`. + - Public transport string is consistently `data_as_flag_epoch_ordered`. + - Device epoch type is consistently `uint64_t`. + - Launcher carries `int32_t magic`, `int32_t step`, and later `int32_t strict`. diff --git a/docs/superpowers/specs/2026-06-24-p2p-transport-trim-design.md b/docs/superpowers/specs/2026-06-24-p2p-transport-trim-design.md new file mode 100644 index 0000000..18da592 --- /dev/null +++ b/docs/superpowers/specs/2026-06-24-p2p-transport-trim-design.md @@ -0,0 +1,63 @@ +# P2P Transport Trim Design + +## Goal + +Reduce the UDMA P2P performance demo transport matrix to three user-facing modes: + +- `direct_urma` +- `memory` +- `data_as_flag` + +`direct_urma` will remain the only UDMA benchmark transport name, but it will use the current parallel multi-jetty implementation internally. + +## Behavior + +`direct_urma` represents direct registered-memory UDMA transfer. It supports both a single-jetty baseline and multi-jetty parallel transfer through the existing `block_dim` and `TILEXR_UDMA_QP_NUM` controls: + +- `block_dim=1` with one QP behaves like the previous single-QP direct URMA path. +- `block_dim=N` with `TILEXR_UDMA_QP_NUM=N` uses up to `N` QPs/jettys in parallel. + +The removed user-facing variants are: + +- `direct_urma_multi_wqe` +- `direct_urma_multi_jetty` +- `direct_urma_multi_jetty_parallel` +- `direct_urma_multi_jetty_parallel_fixed_wqe` + +The previous `direct_urma_multi_jetty_parallel` behavior is folded into `direct_urma`. + +## Code Changes + +Update the P2P transport enum, parser, formatter, validation messages, CSV output tests, runner scripts, and documentation so only the three supported names appear. + +Remove the obsolete benchmark kernels and host launch wrappers for: + +- single-WQE `direct_urma` +- `multi_wqe` +- serial `multi_jetty` +- fixed-WQE parallel multi-jetty + +Keep the existing QP-indexed device helpers and UDMA transport layout support because the new `direct_urma` implementation depends on them. + +Remove fixed-WQE-only window sizing and mismatch-check logic from the P2P config helpers and host demo code. + +## Scripts And Docs + +The P2P runner should set `TILEXR_UDMA_QP_NUM` from `block_dim` when `transport=direct_urma`. The concurrency sweep defaults should cover: + +- `direct_urma` +- `memory` +- `data_as_flag` + +Documentation and PR wording should describe `direct_urma` as a scalable direct UDMA path rather than listing multiple UDMA variants. + +## Testing + +Use TDD for the implementation: + +1. First update host-only unit tests to expect the trimmed transport set and the new `direct_urma` CSV semantics. +2. Run the unit target and confirm the tests fail before production changes. +3. Apply the production changes. +4. Re-run the focused UDMA unit tests and any available build/test script. + +Hardware P2P demo runs remain a manual verification item on supported Ascend hardware. diff --git a/docs/superpowers/specs/2026-06-26-data-as-flag-epoch-ordered-design.md b/docs/superpowers/specs/2026-06-26-data-as-flag-epoch-ordered-design.md new file mode 100644 index 0000000..69b6abf --- /dev/null +++ b/docs/superpowers/specs/2026-06-26-data-as-flag-epoch-ordered-design.md @@ -0,0 +1,355 @@ +# DataAsFlag Epoch-Ordered P2P Design + +Date: 2026-06-26 + +## Background + +The current P2P `data_as_flag` transport uses a 512-byte block layout: + +```text +[0, 480) payload +[480, 512) ready flag area +``` + +The sender initializes the local UB scratch with ready flags and then writes the +packed `payload + flag` block layout to the peer data-as-flag IPC data window. +The receiver polls each batch by copying one flag value per block and using +`Sum` to decide whether all blocks in the batch are ready. + +This has two important limitations in the P2P benchmark: + +- The host does not pass a per-iteration value into `data_as_flag`. Measured + iterations currently rely on clearing the receiver window. If the window is + cleared only once before a measured loop, later queued kernel launches can see + stale ready flags from earlier iterations. +- A flag written as part of the same MTE3 transfer as the payload is not a + sufficient proof that the payload bytes from that same MTE3 transfer are + already visible. The correctness argument should depend on the order between + distinct MTE3 writes, not on byte-level visibility within a single MTE3 write. + +`memory_consume` already avoids stale synchronization by passing `magic` and +`step` for each launch. `data_as_flag_epoch_ordered` brings the same +per-iteration discipline to data-as-flag while keeping the flag embedded in the +data window. + +## Goals + +- Add a new P2P transport named `data_as_flag_epoch_ordered`. +- Keep the existing 512-byte data-as-flag block layout unchanged. +- Keep ready information embedded in the data-as-flag window, not in an + independent sync region. +- Use a per-launch epoch so receiver polling waits for the current iteration + and does not depend on clearing the window between launches. +- Send payload and flag with ordered, separate MTE3 stages: + 1. MTE3 payload write. + 2. MTE3 flag write with the current epoch. +- Reduce receiver overhead by making the default receive path poll only the last + flag in each batch as a batch commit marker. +- Keep a strict/debug path that can validate all flags in a batch after the + commit flag is observed. +- Validate the new transport side-by-side with the existing `data_as_flag`. + +## Non-Goals + +- Do not change the block layout or payload-per-block ratio. +- Do not increase payload bytes per flag. +- Do not move flags to `SyncCollectives` outer flags or any other independent + sync area. +- Do not immediately remove the existing `data_as_flag` implementation. Removal + happens only after the new mode is validated. +- Do not depend on same-MTE3 payload and flag visibility. + +## Transport Rollout + +The rollout has two phases. + +Phase 1 adds a new transport: + +```text +data_as_flag_epoch_ordered +``` + +Suggested aliases: + +```text +daf_epoch_ordered +data-as-flag-epoch-ordered +``` + +The old `data_as_flag` remains available for A/B comparison. + +Phase 2 happens after remote validation shows correct data and stable +performance. In that phase, remove or hide the legacy implementation and make +the epoch-ordered implementation use the public `data_as_flag` transport name. + +## Epoch Format + +The P2P host already creates one `magic` per phase and size, then passes +`step = iteration + 1` for each warmup or measured launch. The new transport +uses the same inputs. + +Device-side data-as-flag epoch: + +```cpp +uint64_t epoch = (static_cast(static_cast(magic)) << 32) | + static_cast(step); +``` + +The full 64-bit epoch is written into the 32-byte flag area. The default +implementation should duplicate it four times as `uint64_t` values: + +```text +flag area: epoch, epoch, epoch, epoch +``` + +The receiver fast path only needs to compare one `uint64_t`, preferably the +first word in the flag area. The strict path can check all four words and all +blocks in the batch. + +## Sender Protocol + +For each batch: + +```text +MTE3 #1: payload blocks -> peer data-as-flag payload areas +MTE3 #2: flag blocks -> peer data-as-flag flag areas, value = epoch +``` + +The sender must not prove payload arrival using a flag from the same MTE3 write +as the payload. The flag is a commit marker only because the flag MTE3 is issued +after the payload MTE3. + +Suggested device helper: + +```cpp +__aicore__ inline uint32_t DataAsFlagSendEpochOrdered( + __gm__ uint8_t* dstDataAsFlagGM, + const __gm__ uint8_t* srcGM, + uint64_t dataBytes, + uint64_t epoch, + AscendC::LocalTensor& scratch); +``` + +Implementation shape: + +1. Compute `totalBlocks = ceil(dataBytes / 480)`. +2. Split by UB capacity. +3. Copy continuous payload from `srcGM` into a payload scratch area. +4. MTE3-copy payload into block payload regions in `dstDataAsFlagGM`, using a + destination stride that skips each 32-byte flag area. +5. Prepare a flag scratch area filled with the current epoch. +6. MTE3-copy flags into the block flag regions. +7. Advance to the next batch. + +If a direct strided `DataCopyPad` from continuous payload scratch to +`480B payload + 32B gap` GM layout is not accepted by AscendC or performs badly, +the conservative first implementation may keep the existing payload packing +shape but must still issue the flag write as a separate MTE3 stage after the +payload write. + +## Receiver Protocol + +For each batch: + +```text +poll last block flag == epoch +copy batch payload -> dstGM +``` + +Suggested device helper: + +```cpp +__aicore__ inline bool DataAsFlagCheckAndRecvEpochOrdered( + const __gm__ uint8_t* dataAsFlagGM, + uint64_t dataBytes, + __gm__ uint8_t* recvGM, + uint64_t epoch, + AscendC::LocalTensor& scratch, + bool strict); +``` + +Implementation shape: + +1. Compute `totalBlocks = ceil(dataBytes / 480)`. +2. Split by receive UB capacity. +3. For each batch, compute `lastBlock = processedBlocks + batchBlocks - 1`. +4. Poll the selected word in `lastBlock`'s flag area until it equals `epoch`. +5. If strict mode is enabled, validate the batch flags after the last flag is + observed. +6. Copy the batch payload from data-as-flag layout into continuous `recvGM`. +7. Advance to the next batch. + +The fast path removes the repeated copy of all flags and the vector `Sum` used +by the legacy receiver. The strict path keeps a validation option for testing +the MTE3 order assumption and diagnosing data errors. + +## Strict Mode + +Strict mode is not the default performance path. + +It should be enabled by a host option or environment flag, for example: + +```text +TILEXR_DATA_AS_FLAG_STRICT=1 +``` + +After the last block flag equals the current epoch, strict mode checks the whole +batch. A simple first implementation may copy one `uint64_t` flag per block and +compare each value to `epoch`. It does not need to use `Sum`, because the epoch +is a 64-bit token rather than `float(1.0f)`. + +Strict mode failure should set a nonzero debug status so the host CSV row reports +the issue through the existing P2P status path. + +## Host Changes + +The new launcher takes the same launch metadata style as `memory_consume`: + +```cpp +void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf( + uint32_t blockDim, + void* stream, + GM_ADDR commArgs, + GM_ADDR src, + GM_ADDR dst, + GM_ADDR debug, + int32_t srcRank, + int32_t dstRank, + uint64_t dstByteOffset, + uint32_t bytes, + uint32_t pattern, + int32_t traffic, + int32_t magic, + int32_t step); +``` + +`LaunchP2PKernel` passes `magic` and `step` to the new transport. Warmup and +measured loops use the existing values: + +```text +warmup: magic = warmupMagic, step = i + 1 +measure: magic = measuredMagic, step = i + 1 +``` + +The new transport does not need per-iteration clear-window calls or the extra +clear barriers currently used for legacy `data_as_flag`. A single initialization +clear per size is still acceptable, mainly to reduce the chance that old data +accidentally equals an epoch during early bring-up. The epoch value should be +large enough that accidental matches are practically impossible. + +The transport helper predicates should treat `data_as_flag_epoch_ordered` like +`data_as_flag`: + +- Uses IPC peer window. +- Both ranks are active in unidirectional traffic. +- Window size is still `ceil(payload / 480) * 512`. + +## Kernel Integration + +Add a new P2P kernel beside the legacy one: + +```cpp +extern "C" __global__ __aicore__ void tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel(...); +``` + +The role resolution can reuse `TileXRUdmaDemoResolveDataAsFlagRole`. + +The slicing can reuse `TileXRUdmaDemoDataAsFlagSlice` because the layout and +block ownership do not change. + +Sender path: + +```text +DataAsFlagSendEpochOrdered(peerBase + dstByteOffset + dataAsFlagOffset, + src + payloadOffset, + sliceBytes, + epoch, + scratch) +``` + +Receiver path: + +```text +DataAsFlagCheckAndRecvEpochOrdered(localBase + dstByteOffset + dataAsFlagOffset, + sliceBytes, + dst + payloadOffset, + epoch, + scratch, + strict) +``` + +Debug words should record enough information to distinguish the new mode from +legacy `data_as_flag`, including block count, status, and at least the low 32 +bits of the epoch. + +## Correctness Argument + +Stale flags are avoided because every launch writes and waits for a unique epoch. +The receiver no longer treats a generic ready value from an older launch as +current data. + +Payload visibility depends on MTE3 ordering across distinct MTE3 operations: +payload MTE3 is issued before flag MTE3. When the receiver sees the batch commit +flag from the flag MTE3, it can treat the earlier payload MTE3 for the same +batch as visible. + +The protocol does not assume that all bytes from a single MTE3 write become +visible at the same time. In particular, it does not use a flag written by the +same MTE3 operation as proof for that MTE3 operation's payload. + +The block semantic remains data-as-flag because the flag is still embedded in +the 512-byte data window block. + +## Tests + +Unit tests: + +- `P2PTransportName` returns `data_as_flag_epoch_ordered`. +- Parser accepts the new name and aliases. +- Window size matches legacy `data_as_flag`. +- `P2PTransportUsesIpc` and `P2PTransportBothRanksActive` include the new mode. +- CSV formatting preserves the transport name. + +Source guard tests: + +- New launcher declaration exists. +- New kernel exists. +- New device helpers exist in `tilexr_data_as_flag.h`. +- New helper signatures include `epoch`. +- Host passes `magic` and `step` to the new launcher. +- Legacy per-iteration clear-window branches do not apply to the new transport. + +Remote validation: + +- Build `tests/udma` on the Ascend environment. +- Run unit tests. +- Run focused P2P smoke tests for `data_as_flag_epoch_ordered` on 6/7 or other + healthy cards. +- Run 8B to 64MB unidirectional sweep with `block_dim=1`. +- Compare old `data_as_flag`, new `data_as_flag_epoch_ordered`, `memory`, and + `memory_consume` CSVs. +- Verify all rows have `status=0` and `errors=0`. +- Run strict mode on a smaller sweep to validate all batch flags. + +## Completion Criteria + +- `data_as_flag_epoch_ordered` is selectable from the P2P demo CLI. +- The new mode uses per-launch epoch values and does not require per-iteration + clear-window barriers. +- Sender emits payload and flag in separate ordered MTE3 stages. +- Receiver fast path waits on one batch commit flag and then copies payload. +- Strict mode can validate all flags for debug runs. +- Remote build and tests pass. +- Remote P2P smoke and sweep runs complete with `status=0` and `errors=0`. +- CSV output allows direct A/B comparison with legacy `data_as_flag`. + +## Later Migration + +After validation, replace the public `data_as_flag` behavior with the +epoch-ordered implementation: + +1. Remove or hide the legacy transport from sweeps. +2. Make `data_as_flag` parse to the epoch-ordered implementation. +3. Keep `data_as_flag_epoch_ordered` as a temporary alias if useful. +4. Update docs and plots so the public name is again `data_as_flag`. +5. Remove legacy clear-window workarounds from the data-as-flag host path. diff --git a/src/comm/udma/tilexr_udma_layout.cpp b/src/comm/udma/tilexr_udma_layout.cpp index 401b5f6..0a6d6d0 100644 --- a/src/comm/udma/tilexr_udma_layout.cpp +++ b/src/comm/udma/tilexr_udma_layout.cpp @@ -8,10 +8,6 @@ #include namespace TileXR { -namespace { - -constexpr uint32_t TILEXR_UDMA_QP_NUM = 1; - template void CopyVector(std::vector& dst, size_t offset, const std::vector& src) { @@ -20,10 +16,9 @@ void CopyVector(std::vector& dst, size_t offset, const std::vector& } } -} // namespace - int BuildUDMAInfoImage( uintptr_t deviceBase, + uint32_t qpNum, const std::vector& sq, const std::vector& rq, const std::vector& scq, @@ -32,9 +27,9 @@ int BuildUDMAInfoImage( UDMAInfo& info, std::vector& bytes) { - const size_t rankCount = sq.size(); - if (rankCount == 0 || rq.size() != rankCount || scq.size() != rankCount || - rcq.size() != rankCount || mem.size() != rankCount) { + if (qpNum == 0 || sq.empty() || sq.size() % qpNum != 0 || + rq.size() != sq.size() || scq.size() != sq.size() || + rcq.size() != sq.size() || mem.size() != sq.size()) { return TILEXR_UDMA_LAYOUT_INVALID; } @@ -46,7 +41,7 @@ int BuildUDMAInfoImage( const size_t totalBytes = memOffset + mem.size() * sizeof(UDMAMemInfo); info = {}; - info.qpNum = TILEXR_UDMA_QP_NUM; + info.qpNum = qpNum; info.sqPtr = deviceBase + sqOffset; info.rqPtr = deviceBase + rqOffset; info.scqPtr = deviceBase + scqOffset; diff --git a/src/comm/udma/tilexr_udma_layout.h b/src/comm/udma/tilexr_udma_layout.h index fefd49e..21cc138 100644 --- a/src/comm/udma/tilexr_udma_layout.h +++ b/src/comm/udma/tilexr_udma_layout.h @@ -18,6 +18,7 @@ constexpr int TILEXR_UDMA_LAYOUT_INVALID = -3; int BuildUDMAInfoImage( uintptr_t deviceBase, + uint32_t qpNum, const std::vector& sq, const std::vector& rq, const std::vector& scq, diff --git a/src/comm/udma/tilexr_udma_transport.cpp b/src/comm/udma/tilexr_udma_transport.cpp index 7bafa5f..0429a19 100644 --- a/src/comm/udma/tilexr_udma_transport.cpp +++ b/src/comm/udma/tilexr_udma_transport.cpp @@ -33,6 +33,20 @@ uint32_t Log2Uint64(uint64_t value) return result; } +uint32_t GetEnvUint(const char* name, uint32_t defaultValue, uint32_t minValue, uint32_t maxValue) +{ + const char* value = std::getenv(name); + if (value == nullptr || value[0] == '\0') { + return defaultValue; + } + char* end = nullptr; + unsigned long parsed = std::strtoul(value, &end, 10); + if (end == value || *end != '\0' || parsed < minValue || parsed > maxValue) { + return defaultValue; + } + return static_cast(parsed); +} + HccpEid SwapEidForDevice(const HccpEid& hccpEid) { HccpEid swapped {}; @@ -293,20 +307,20 @@ struct TileXRUDMATransport::PerEidState { void* ctxHandle = nullptr; void* tokenHandle = nullptr; void* chanHandle = nullptr; - void* cqHandle = nullptr; - void* qpHandle = nullptr; - CqInfoT cqInfo {}; - QpCreateInfo qpInfo {}; - std::vector remoteQpHandles; - std::vector tpnList; - void* cqPiAddr = nullptr; - void* cqCiAddr = nullptr; - void* sqPiAddr = nullptr; - void* sqCiAddr = nullptr; - void* wqeCntAddr = nullptr; - void* amoAddr = nullptr; - UDMAWQCtx localWq {}; - UDMACQCtx localCq {}; + std::vector cqHandles; + std::vector cqInfos; + std::vector qpHandles; + std::vector qpInfos; + std::vector> remoteQpHandlesByQp; + std::vector> tpnListByQp; + std::vector cqPiAddrs; + std::vector cqCiAddrs; + std::vector sqPiAddrs; + std::vector sqCiAddrs; + std::vector wqeCntAddrs; + std::vector amoAddrs; + std::vector localWqs; + std::vector localCqs; }; TileXRUDMATransport::TileXRUDMATransport() = default; @@ -328,6 +342,7 @@ int TileXRUDMATransport::Init(const TileXRUDMATransportOptions& options) return TILEXR_ERROR_PARA_CHECK_FAIL; } options_ = options; + qpNum_ = GetEnvUint("TILEXR_UDMA_QP_NUM", 1, 1, 64); int ret = loader_.Load(); if (ret != TILEXR_HCCP_LOADER_SUCCESS) { @@ -586,8 +601,20 @@ int TileXRUDMATransport::CreateQueues() state.eidIndex = ctxEntry.first; state.ctxHandle = ctxEntry.second; state.tokenHandle = tokenHandleByEid_[ctxEntry.first]; - state.remoteQpHandles.assign(options_.rankSize, nullptr); - state.tpnList.assign(options_.rankSize, 0); + state.qpHandles.assign(qpNum_, nullptr); + state.qpInfos.resize(qpNum_); + state.remoteQpHandlesByQp.assign(qpNum_, std::vector(options_.rankSize, nullptr)); + state.tpnListByQp.assign(qpNum_, std::vector(options_.rankSize, 0)); + state.cqHandles.assign(qpNum_, nullptr); + state.cqInfos.resize(qpNum_); + state.cqPiAddrs.assign(qpNum_, nullptr); + state.cqCiAddrs.assign(qpNum_, nullptr); + state.sqPiAddrs.assign(qpNum_, nullptr); + state.sqCiAddrs.assign(qpNum_, nullptr); + state.wqeCntAddrs.assign(qpNum_, nullptr); + state.amoAddrs.assign(qpNum_, nullptr); + state.localWqs.resize(qpNum_); + state.localCqs.resize(qpNum_); ChanInfoT chanInfo {}; chanInfo.in.dataPlaneFlag.bs.poolCqCstm = 1; @@ -596,62 +623,66 @@ int TileXRUDMATransport::CreateQueues() return TILEXR_ERROR_INTERNAL; } - state.cqInfo.in.chanHandle = state.chanHandle; - state.cqInfo.in.depth = TILEXR_UDMA_CQ_DEPTH; - state.cqInfo.in.ub.mode = JFC_MODE_USER_CTL_NORMAL; - ret = loader_.RaCtxCqCreate(state.ctxHandle, &state.cqInfo, &state.cqHandle); - if (ret != 0) { - return TILEXR_ERROR_INTERNAL; - } - state.localCq.cqn = 0; - state.localCq.bufAddr = state.cqInfo.out.bufAddr; - state.localCq.baseBkShift = Log2Uint64(state.cqInfo.out.cqeSize); - state.localCq.depth = state.cqInfo.in.depth; - if (AllocDeviceScalar(&state.cqPiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || - AllocDeviceScalar(&state.cqCiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS) { - return TILEXR_ERROR_INTERNAL; - } - state.localCq.headAddr = reinterpret_cast(state.cqPiAddr); - state.localCq.tailAddr = reinterpret_cast(state.cqCiAddr); - state.localCq.dbMode = UDMADBMode::SW_DB; - state.localCq.dbAddr = state.cqInfo.out.swdbAddr; - - QpCreateAttr qpAttr {}; - qpAttr.scqHandle = state.cqHandle; - qpAttr.rcqHandle = state.cqHandle; - qpAttr.srqHandle = state.cqHandle; - qpAttr.sqDepth = TILEXR_UDMA_SQ_DEPTH; - qpAttr.rqDepth = TILEXR_UDMA_RQ_DEPTH_DEFAULT; - qpAttr.transportMode = CONN_RM; - qpAttr.ub.mode = JETTY_MODE_USER_CTL_NORMAL; - qpAttr.ub.flag.value = 1; - qpAttr.ub.jfsFlag.value = 2; - qpAttr.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE; - qpAttr.ub.rnrRetry = 7; - qpAttr.ub.extMode.piType = 0; - qpAttr.ub.extMode.cstmFlag.bs.sqCstm = 0; - qpAttr.ub.extMode.sqebbNum = TILEXR_UDMA_SQ_DEPTH; - qpAttr.ub.tokenIdHandle = state.tokenHandle; - ret = loader_.RaCtxQpCreate(state.ctxHandle, &qpAttr, &state.qpInfo, &state.qpHandle); - if (ret != 0) { - return TILEXR_ERROR_INTERNAL; - } - state.localWq.wqn = 0; - state.localWq.bufAddr = state.qpInfo.ub.sqBuffVa; - state.localWq.baseBkShift = Log2Uint64(state.qpInfo.ub.wqebbSize); - state.localWq.depth = TILEXR_UDMA_SQ_BB_COUNT; - if (AllocDeviceScalar(&state.sqPiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || - AllocDeviceScalar(&state.sqCiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || - AllocDeviceScalar(&state.wqeCntAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || - AllocDeviceScalar(&state.amoAddr, sizeof(uint64_t)) != TILEXR_SUCCESS) { - return TILEXR_ERROR_INTERNAL; + for (uint32_t qpIdx = 0; qpIdx < qpNum_; ++qpIdx) { + state.cqInfos[qpIdx].in.chanHandle = state.chanHandle; + state.cqInfos[qpIdx].in.depth = TILEXR_UDMA_CQ_DEPTH; + state.cqInfos[qpIdx].in.ub.mode = JFC_MODE_USER_CTL_NORMAL; + ret = loader_.RaCtxCqCreate(state.ctxHandle, &state.cqInfos[qpIdx], &state.cqHandles[qpIdx]); + if (ret != 0) { + return TILEXR_ERROR_INTERNAL; + } + auto& localCq = state.localCqs[qpIdx]; + localCq.cqn = qpIdx; + localCq.bufAddr = state.cqInfos[qpIdx].out.bufAddr; + localCq.baseBkShift = Log2Uint64(state.cqInfos[qpIdx].out.cqeSize); + localCq.depth = state.cqInfos[qpIdx].in.depth; + if (AllocDeviceScalar(&state.cqPiAddrs[qpIdx], sizeof(uint32_t)) != TILEXR_SUCCESS || + AllocDeviceScalar(&state.cqCiAddrs[qpIdx], sizeof(uint32_t)) != TILEXR_SUCCESS) { + return TILEXR_ERROR_INTERNAL; + } + localCq.headAddr = reinterpret_cast(state.cqPiAddrs[qpIdx]); + localCq.tailAddr = reinterpret_cast(state.cqCiAddrs[qpIdx]); + localCq.dbMode = UDMADBMode::SW_DB; + localCq.dbAddr = state.cqInfos[qpIdx].out.swdbAddr; + + QpCreateAttr qpAttr {}; + qpAttr.scqHandle = state.cqHandles[qpIdx]; + qpAttr.rcqHandle = state.cqHandles[qpIdx]; + qpAttr.srqHandle = state.cqHandles[qpIdx]; + qpAttr.sqDepth = TILEXR_UDMA_SQ_DEPTH; + qpAttr.rqDepth = TILEXR_UDMA_RQ_DEPTH_DEFAULT; + qpAttr.transportMode = CONN_RM; + qpAttr.ub.mode = JETTY_MODE_USER_CTL_NORMAL; + qpAttr.ub.flag.value = 1; + qpAttr.ub.jfsFlag.value = 2; + qpAttr.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE; + qpAttr.ub.rnrRetry = 7; + qpAttr.ub.extMode.piType = 0; + qpAttr.ub.extMode.cstmFlag.bs.sqCstm = 0; + qpAttr.ub.extMode.sqebbNum = TILEXR_UDMA_SQ_DEPTH; + qpAttr.ub.tokenIdHandle = state.tokenHandle; + ret = loader_.RaCtxQpCreate(state.ctxHandle, &qpAttr, &state.qpInfos[qpIdx], &state.qpHandles[qpIdx]); + if (ret != 0) { + return TILEXR_ERROR_INTERNAL; + } + auto& localWq = state.localWqs[qpIdx]; + localWq.wqn = qpIdx; + localWq.bufAddr = state.qpInfos[qpIdx].ub.sqBuffVa; + localWq.baseBkShift = Log2Uint64(state.qpInfos[qpIdx].ub.wqebbSize); + localWq.depth = TILEXR_UDMA_SQ_BB_COUNT; + if (AllocDeviceScalar(&state.sqPiAddrs[qpIdx], sizeof(uint32_t)) != TILEXR_SUCCESS || + AllocDeviceScalar(&state.sqCiAddrs[qpIdx], sizeof(uint32_t)) != TILEXR_SUCCESS || + AllocDeviceScalar(&state.wqeCntAddrs[qpIdx], sizeof(uint32_t)) != TILEXR_SUCCESS || + AllocDeviceScalar(&state.amoAddrs[qpIdx], sizeof(uint64_t)) != TILEXR_SUCCESS) { + return TILEXR_ERROR_INTERNAL; + } + localWq.headAddr = reinterpret_cast(state.sqPiAddrs[qpIdx]); + localWq.tailAddr = reinterpret_cast(state.sqCiAddrs[qpIdx]); + localWq.dbMode = UDMADBMode::SW_DB; + localWq.dbAddr = state.qpInfos[qpIdx].ub.dbAddr; + localWq.wqeCntAddr = reinterpret_cast(state.wqeCntAddrs[qpIdx]); + localWq.amoAddr = reinterpret_cast(state.amoAddrs[qpIdx]); } - state.localWq.headAddr = reinterpret_cast(state.sqPiAddr); - state.localWq.tailAddr = reinterpret_cast(state.sqCiAddr); - state.localWq.dbMode = UDMADBMode::SW_DB; - state.localWq.dbAddr = state.qpInfo.ub.dbAddr; - state.localWq.wqeCntAddr = reinterpret_cast(state.wqeCntAddr); - state.localWq.amoAddr = reinterpret_cast(state.amoAddr); states_[state.eidIndex] = state; } return states_.empty() ? TILEXR_ERROR_INTERNAL : TILEXR_SUCCESS; @@ -659,28 +690,32 @@ int TileXRUDMATransport::CreateQueues() int TileXRUDMATransport::ImportQueues() { - std::vector localImports(eidCount_); - std::vector localKeys(eidCount_); + const size_t routeCount = static_cast(eidCount_) * qpNum_; + std::vector localImports(routeCount); + std::vector localKeys(routeCount); for (const auto& stateEntry : states_) { const auto& state = stateEntry.second; if (state.eidIndex >= eidCount_) { return TILEXR_ERROR_INTERNAL; } - localImports[state.eidIndex].in.ub.mode = JETTY_IMPORT_MODE_NORMAL; - localImports[state.eidIndex].in.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE; - localImports[state.eidIndex].in.ub.policy = JETTY_GRP_POLICY_RR; - localImports[state.eidIndex].in.ub.type = TARGET_TYPE_JETTY; - localImports[state.eidIndex].in.ub.flag.bs.tokenPolicy = TOKEN_POLICY_PLAIN_TEXT; - localImports[state.eidIndex].in.ub.tpType = 1; - localKeys[state.eidIndex] = state.qpInfo.key; + for (uint32_t qpIdx = 0; qpIdx < qpNum_; ++qpIdx) { + const size_t localIndex = static_cast(state.eidIndex) * qpNum_ + qpIdx; + localImports[localIndex].in.ub.mode = JETTY_IMPORT_MODE_NORMAL; + localImports[localIndex].in.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE; + localImports[localIndex].in.ub.policy = JETTY_GRP_POLICY_RR; + localImports[localIndex].in.ub.type = TARGET_TYPE_JETTY; + localImports[localIndex].in.ub.flag.bs.tokenPolicy = TOKEN_POLICY_PLAIN_TEXT; + localImports[localIndex].in.ub.tpType = 1; + localKeys[localIndex] = state.qpInfos[qpIdx].key; + } } - std::vector allImports(options_.rankSize * eidCount_); + std::vector allImports(options_.rankSize * routeCount); int ret = options_.exchange->AllGather(localImports.data(), localImports.size(), allImports.data()); if (ret != TILEXR_SUCCESS) { return ret; } - std::vector allKeys(options_.rankSize * eidCount_); + std::vector allKeys(options_.rankSize * routeCount); ret = options_.exchange->AllGather(localKeys.data(), localKeys.size(), allKeys.data()); if (ret != TILEXR_SUCCESS) { return ret; @@ -700,13 +735,16 @@ int TileXRUDMATransport::ImportQueues() if (remoteEid >= eidCount_) { return TILEXR_ERROR_INTERNAL; } - QpImportInfoT importInfo = allImports[peer * eidCount_ + remoteEid]; - importInfo.in.key = allKeys[peer * eidCount_ + remoteEid]; - ret = loader_.RaCtxQpImport(state.ctxHandle, &importInfo, &state.remoteQpHandles[peer]); - if (ret != 0) { - return TILEXR_ERROR_INTERNAL; + for (uint32_t qpIdx = 0; qpIdx < qpNum_; ++qpIdx) { + const size_t remoteIndex = (static_cast(peer) * eidCount_ + remoteEid) * qpNum_ + qpIdx; + QpImportInfoT importInfo = allImports[remoteIndex]; + importInfo.in.key = allKeys[remoteIndex]; + ret = loader_.RaCtxQpImport(state.ctxHandle, &importInfo, &state.remoteQpHandlesByQp[qpIdx][peer]); + if (ret != 0) { + return TILEXR_ERROR_INTERNAL; + } + state.tpnListByQp[qpIdx][peer] = importInfo.out.ub.tpn; } - state.tpnList[peer] = importInfo.out.ub.tpn; } } return TILEXR_SUCCESS; @@ -768,11 +806,12 @@ int TileXRUDMATransport::RefreshUDMAInfo() return TILEXR_ERROR_INTERNAL; } - std::vector sq(options_.rankSize); - std::vector rq(options_.rankSize); - std::vector scq(options_.rankSize); - std::vector rcq(options_.rankSize); - std::vector mem(options_.rankSize); + const size_t queueEntries = static_cast(options_.rankSize) * qpNum_; + std::vector sq(queueEntries); + std::vector rq(queueEntries); + std::vector scq(queueEntries); + std::vector rcq(queueEntries); + std::vector mem(queueEntries); for (int rank = 0; rank < options_.rankSize; ++rank) { uint32_t localEid = fallbackEid; @@ -786,26 +825,29 @@ int TileXRUDMATransport::RefreshUDMAInfo() stateIt = fallbackIt; } const auto& state = stateIt->second; - sq[rank] = state.localWq; - rq[rank] = state.localWq; - scq[rank] = state.localCq; - rcq[rank] = state.localCq; - if (rank == options_.rank) { - const auto localMemIt = localMemInfoByEid_.find(localEid); - if (localMemIt != localMemInfoByEid_.end()) { - mem[rank] = localMemIt->second; + for (uint32_t qpIdx = 0; qpIdx < qpNum_; ++qpIdx) { + const size_t entryIndex = static_cast(rank) * qpNum_ + qpIdx; + sq[entryIndex] = state.localWqs[qpIdx]; + rq[entryIndex] = state.localWqs[qpIdx]; + scq[entryIndex] = state.localCqs[qpIdx]; + rcq[entryIndex] = state.localCqs[qpIdx]; + if (rank == options_.rank) { + const auto localMemIt = localMemInfoByEid_.find(localEid); + if (localMemIt != localMemInfoByEid_.end()) { + mem[entryIndex] = localMemIt->second; + } + } else { + mem[entryIndex] = allMem[rank * eidCount_ + remoteEid]; + mem[entryIndex].tpn = state.tpnListByQp[qpIdx][rank]; } - } else { - mem[rank] = allMem[rank * eidCount_ + remoteEid]; - mem[rank].tpn = state.tpnList[rank]; + mem[entryIndex].eidAddr = reinterpret_cast( + eidTableDev_ + (rank * eidCount_ + remoteEid) * sizeof(HccpEid)); } - mem[rank].eidAddr = reinterpret_cast( - eidTableDev_ + (rank * eidCount_ + remoteEid) * sizeof(HccpEid)); } if (udmaInfoDev_ == nullptr) { const size_t oneRankSize = 2 * sizeof(UDMAWQCtx) + 2 * sizeof(UDMACQCtx) + sizeof(UDMAMemInfo); - udmaInfoSize_ = static_cast(sizeof(UDMAInfo) + oneRankSize * options_.rankSize); + udmaInfoSize_ = static_cast(sizeof(UDMAInfo) + oneRankSize * options_.rankSize * qpNum_); ret = aclrtMalloc(reinterpret_cast(&udmaInfoDev_), udmaInfoSize_, ACL_MEM_MALLOC_HUGE_FIRST); if (ret != ACL_SUCCESS) { return TILEXR_ERROR_INTERNAL; @@ -814,7 +856,7 @@ int TileXRUDMATransport::RefreshUDMAInfo() UDMAInfo info {}; std::vector image; - ret = BuildUDMAInfoImage(reinterpret_cast(udmaInfoDev_), sq, rq, scq, rcq, mem, info, image); + ret = BuildUDMAInfoImage(reinterpret_cast(udmaInfoDev_), qpNum_, sq, rq, scq, rcq, mem, info, image); if (ret != TILEXR_UDMA_LAYOUT_SUCCESS) { return TILEXR_ERROR_PARA_CHECK_FAIL; } @@ -1002,26 +1044,44 @@ void TileXRUDMATransport::CleanupQueues() { for (auto& stateEntry : states_) { auto& state = stateEntry.second; - for (void* remoteQp : state.remoteQpHandles) { - if (remoteQp != nullptr && state.ctxHandle != nullptr) { - loader_.RaCtxQpUnimport(state.ctxHandle, remoteQp); + for (auto& remoteQpHandles : state.remoteQpHandlesByQp) { + for (void* remoteQp : remoteQpHandles) { + if (remoteQp != nullptr && state.ctxHandle != nullptr) { + loader_.RaCtxQpUnimport(state.ctxHandle, remoteQp); + } } } - if (state.qpHandle != nullptr) { - loader_.RaCtxQpDestroy(state.qpHandle); + for (void* qpHandle : state.qpHandles) { + if (qpHandle != nullptr) { + loader_.RaCtxQpDestroy(qpHandle); + } } - if (state.cqHandle != nullptr && state.ctxHandle != nullptr) { - loader_.RaCtxCqDestroy(state.ctxHandle, state.cqHandle); + for (void* cqHandle : state.cqHandles) { + if (cqHandle != nullptr && state.ctxHandle != nullptr) { + loader_.RaCtxCqDestroy(state.ctxHandle, cqHandle); + } } if (state.chanHandle != nullptr && state.ctxHandle != nullptr) { loader_.RaCtxChanDestroy(state.ctxHandle, state.chanHandle); } - FreeDeviceScalar(state.cqPiAddr); - FreeDeviceScalar(state.cqCiAddr); - FreeDeviceScalar(state.sqPiAddr); - FreeDeviceScalar(state.sqCiAddr); - FreeDeviceScalar(state.wqeCntAddr); - FreeDeviceScalar(state.amoAddr); + for (void*& ptr : state.cqPiAddrs) { + FreeDeviceScalar(ptr); + } + for (void*& ptr : state.cqCiAddrs) { + FreeDeviceScalar(ptr); + } + for (void*& ptr : state.sqPiAddrs) { + FreeDeviceScalar(ptr); + } + for (void*& ptr : state.sqCiAddrs) { + FreeDeviceScalar(ptr); + } + for (void*& ptr : state.wqeCntAddrs) { + FreeDeviceScalar(ptr); + } + for (void*& ptr : state.amoAddrs) { + FreeDeviceScalar(ptr); + } } states_.clear(); } diff --git a/src/comm/udma/tilexr_udma_transport.h b/src/comm/udma/tilexr_udma_transport.h index 0a787d1..8c5cd9a 100644 --- a/src/comm/udma/tilexr_udma_transport.h +++ b/src/comm/udma/tilexr_udma_transport.h @@ -73,6 +73,7 @@ class TileXRUDMATransport { uint32_t logicDevId_ = 0; uint32_t deviceIdOffset_ = 0; uint32_t eidCount_ = 0; + uint32_t qpNum_ = 1; std::map ctxHandleByEid_; std::map tokenHandleByEid_; std::map peerLocalEid_; diff --git a/src/include/tilexr_data_as_flag.h b/src/include/tilexr_data_as_flag.h index 93c6460..21bf9d2 100644 --- a/src/include/tilexr_data_as_flag.h +++ b/src/include/tilexr_data_as_flag.h @@ -27,6 +27,7 @@ constexpr uint32_t DATA_AS_FLAG_ALIGN_BYTES = 32; constexpr uint32_t DATA_AS_FLAG_FLOAT_BYTES = sizeof(float); constexpr uint32_t DATA_AS_FLAG_FLAG_FLOATS = DATA_AS_FLAG_FLAG_BYTES / DATA_AS_FLAG_FLOAT_BYTES; constexpr uint32_t DATA_AS_FLAG_SUM_RESULT_BYTES = DATA_AS_FLAG_ALIGN_BYTES; +constexpr uint64_t DATA_AS_FLAG_COMMIT_BIT = 0x80000000ULL; constexpr float DATA_AS_FLAG_READY_VALUE = 1.0f; static_assert(DATA_AS_FLAG_PAYLOAD_BYTES + DATA_AS_FLAG_FLAG_BYTES == DATA_AS_FLAG_BLOCK_BYTES, @@ -52,6 +53,23 @@ TILEXR_DATA_AS_FLAG_INLINE uint64_t DataAsFlagAlignUp(uint64_t value, uint64_t a return remainder == 0U ? value : value + alignment - remainder; } +TILEXR_DATA_AS_FLAG_INLINE uint64_t DataAsFlagEpoch(int32_t magic, int32_t step) +{ + return (static_cast(static_cast(magic)) << 32) | + static_cast(step); +} + +TILEXR_DATA_AS_FLAG_INLINE bool DataAsFlagEpochReady(uint64_t observed, uint64_t expected) +{ + constexpr uint64_t kMagicMask = 0xffffffff00000000ULL; + return (observed & kMagicMask) == (expected & kMagicMask) && observed >= expected; +} + +TILEXR_DATA_AS_FLAG_INLINE uint64_t DataAsFlagCommitEpoch(uint64_t epoch) +{ + return epoch | DATA_AS_FLAG_COMMIT_BIT; +} + #if TILEXR_ASCENDC_AICORE_COMPILE __aicore__ inline uint32_t DataAsFlagScratchBytes(const AscendC::LocalTensor& scratch) @@ -141,6 +159,16 @@ __aicore__ inline uint32_t DataAsFlagMaxRecvBlocks(uint32_t scratchBytes) return 0U; } +__aicore__ inline uint32_t DataAsFlagMaxEpochOrderedSendBlocks(uint32_t scratchBytes) +{ + return scratchBytes / DATA_AS_FLAG_BLOCK_BYTES; +} + +__aicore__ inline uint32_t DataAsFlagMaxEpochOrderedRecvBlocks(uint32_t scratchBytes) +{ + return DataAsFlagMaxEpochOrderedSendBlocks(scratchBytes); +} + __aicore__ inline uint32_t DataAsFlagInit(AscendC::LocalTensor& sendScratch) { const uint32_t sendBlocks = DataAsFlagScratchBytes(sendScratch) / DATA_AS_FLAG_BLOCK_BYTES; @@ -248,6 +276,94 @@ __aicore__ inline uint32_t DataAsFlagSend( return totalBlocks; } +__aicore__ inline void DataAsFlagFillEpochFlags( + AscendC::LocalTensor& flagScratch, + uint32_t blockCount, + uint64_t epoch) +{ + AscendC::LocalTensor flagWords = flagScratch.template ReinterpretCast(); + const uint32_t words = blockCount * DATA_AS_FLAG_FLAG_BYTES / sizeof(int64_t); + const int64_t signedEpoch = static_cast(epoch); + for (uint32_t i = 0; i < words; ++i) { + flagWords.SetValue(i, signedEpoch); + } + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); +} + +__aicore__ inline void DataAsFlagWriteBatchCommitFlag( + __gm__ uint8_t* dstDataAsFlagGM, + uint32_t dstBlockOffset, + AscendC::LocalTensor& scratch, + uint32_t batchBlocks, + uint64_t commitEpoch) +{ + if (batchBlocks == 0U) { + return; + } + DataAsFlagFillEpochFlags(scratch, 1U, commitEpoch); + AscendC::GlobalTensor flagGlobal; + const uint32_t lastBlock = dstBlockOffset + batchBlocks - 1U; + flagGlobal.SetGlobalBuffer( + reinterpret_cast<__gm__ int64_t*>( + dstDataAsFlagGM + static_cast(lastBlock) * DATA_AS_FLAG_BLOCK_BYTES + + DATA_AS_FLAG_FLAG_OFFSET_BYTES)); + AscendC::LocalTensor flagWords = scratch.template ReinterpretCast(); + constexpr uint32_t kFlagWords = DATA_AS_FLAG_FLAG_BYTES / sizeof(int64_t); + AscendC::DataCopy(flagGlobal, flagWords, kFlagWords); +} + +__aicore__ inline uint32_t DataAsFlagSendEpochOrdered( + __gm__ uint8_t* dstDataAsFlagGM, + const __gm__ uint8_t* srcGM, + uint64_t dataBytes, + uint64_t epoch, + AscendC::LocalTensor& scratch) +{ + if (dstDataAsFlagGM == nullptr || srcGM == nullptr || dataBytes == 0U) { + return 0U; + } + + const uint32_t totalBlocks = DataAsFlagBlockCountForPayloadBytes(dataBytes); + const uint32_t batchCapacity = DataAsFlagMaxEpochOrderedSendBlocks(DataAsFlagScratchBytes(scratch)); + if (batchCapacity == 0U) { + return 0U; + } + + uint32_t sentBlocks = 0U; + uint64_t sentBytes = 0U; + while (sentBlocks < totalBlocks) { + const uint32_t remainingBlocks = totalBlocks - sentBlocks; + const uint32_t batchBlocks = remainingBlocks < batchCapacity ? remainingBlocks : batchCapacity; + const uint64_t maxBatchBytes = static_cast(batchBlocks) * DATA_AS_FLAG_PAYLOAD_BYTES; + const uint64_t remainingBytes = dataBytes - sentBytes; + const uint32_t batchPayloadBytes = static_cast( + remainingBytes < maxBatchBytes ? remainingBytes : maxBatchBytes); + const uint32_t fullBlocks = batchPayloadBytes / DATA_AS_FLAG_PAYLOAD_BYTES; + const uint32_t tailBytes = batchPayloadBytes % DATA_AS_FLAG_PAYLOAD_BYTES; + + AscendC::LocalTensor payloadScratch = scratch; + + AscendC::Duplicate(payloadScratch, 0U, batchBlocks * DATA_AS_FLAG_BLOCK_BYTES); + AscendC::PipeBarrier(); + DataAsFlagCopyPayloadToScratch(payloadScratch, srcGM, sentBytes, fullBlocks, tailBytes); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + DataAsFlagCopyScratchToDataAsFlagGM(dstDataAsFlagGM, sentBlocks, payloadScratch, batchBlocks); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + + DataAsFlagWriteBatchCommitFlag(dstDataAsFlagGM, sentBlocks, scratch, batchBlocks, DataAsFlagCommitEpoch(epoch)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + + sentBlocks += batchBlocks; + sentBytes += batchPayloadBytes; + } + AscendC::PipeBarrier(); + return totalBlocks; +} + __aicore__ inline bool DataAsFlagCheckBatch( const __gm__ uint8_t* dataAsFlagGM, uint32_t blockOffset, @@ -290,6 +406,40 @@ __aicore__ inline bool DataAsFlagCheckBatch( return sumOut.GetValue(0) == static_cast(batchBlocks); } +__aicore__ inline uint64_t DataAsFlagLoadEpochFlag( + const __gm__ uint8_t* dataAsFlagGM, + uint32_t blockIndex, + AscendC::LocalTensor& scratch) +{ + AscendC::GlobalTensor flagGlobal; + flagGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>( + const_cast<__gm__ uint8_t*>( + dataAsFlagGM + static_cast(blockIndex) * DATA_AS_FLAG_BLOCK_BYTES + + DATA_AS_FLAG_FLAG_OFFSET_BYTES))); + AscendC::LocalTensor flagLocal = scratch.template ReinterpretCast(); + AscendC::DataCopy(flagLocal, flagGlobal, DATA_AS_FLAG_FLAG_BYTES / sizeof(int64_t)); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + return static_cast(flagLocal.GetValue(0)); +} + +__aicore__ inline bool DataAsFlagCheckBatchEpochStrict( + const __gm__ uint8_t* dataAsFlagGM, + uint32_t blockOffset, + uint32_t batchBlocks, + uint64_t commitEpoch, + AscendC::LocalTensor& scratch) +{ + if (batchBlocks == 0U) { + return false; + } + const uint32_t lastBlock = blockOffset + batchBlocks - 1U; + if (!DataAsFlagEpochReady(DataAsFlagLoadEpochFlag(dataAsFlagGM, lastBlock, scratch), commitEpoch)) { + return false; + } + return true; +} + __aicore__ inline bool DataAsFlagCheck( const __gm__ uint8_t* dataAsFlagGM, uint64_t dataBytes, @@ -363,6 +513,54 @@ __aicore__ inline void DataAsFlagCopyBatchToRecvGM( AscendC::WaitFlag(EVENT_ID0); } +__aicore__ inline bool DataAsFlagCheckAndRecvEpochOrdered( + const __gm__ uint8_t* dataAsFlagGM, + uint64_t dataBytes, + __gm__ uint8_t* recvGM, + uint64_t epoch, + AscendC::LocalTensor& recvScratch, + bool strict) +{ + if (dataAsFlagGM == nullptr || recvGM == nullptr) { + return false; + } + if (dataBytes == 0U) { + return true; + } + + const uint32_t totalBlocks = DataAsFlagBlockCountForPayloadBytes(dataBytes); + const uint32_t batchCapacity = DataAsFlagMaxEpochOrderedRecvBlocks(DataAsFlagScratchBytes(recvScratch)); + if (batchCapacity == 0U) { + return false; + } + + uint32_t processedBlocks = 0U; + uint64_t processedBytes = 0U; + const uint64_t commitEpoch = DataAsFlagCommitEpoch(epoch); + while (processedBlocks < totalBlocks) { + const uint32_t remainingBlocks = totalBlocks - processedBlocks; + const uint32_t batchBlocks = remainingBlocks < batchCapacity ? remainingBlocks : batchCapacity; + const uint32_t lastBlock = processedBlocks + batchBlocks - 1U; + while (!DataAsFlagEpochReady(DataAsFlagLoadEpochFlag(dataAsFlagGM, lastBlock, recvScratch), commitEpoch)) { + } + if (strict && + !DataAsFlagCheckBatchEpochStrict(dataAsFlagGM, processedBlocks, batchBlocks, commitEpoch, recvScratch)) { + return false; + } + + const uint64_t remainingBytes = dataBytes - processedBytes; + const uint64_t maxBatchBytes = static_cast(batchBlocks) * DATA_AS_FLAG_PAYLOAD_BYTES; + const uint32_t batchBytes = static_cast( + remainingBytes < maxBatchBytes ? remainingBytes : maxBatchBytes); + DataAsFlagCopyBatchToRecvGM( + dataAsFlagGM, processedBlocks, processedBytes, batchBytes, recvGM, recvScratch); + processedBlocks += batchBlocks; + processedBytes += batchBytes; + } + AscendC::PipeBarrier(); + return true; +} + __aicore__ inline bool DataAsFlagCheckAndRecv( const __gm__ uint8_t* dataAsFlagGM, uint64_t dataBytes, diff --git a/src/include/tilexr_udma.h b/src/include/tilexr_udma.h index 43f15c0..e8d4b33 100644 --- a/src/include/tilexr_udma.h +++ b/src/include/tilexr_udma.h @@ -105,9 +105,10 @@ __aicore__ inline __gm__ UDMACQCtx* UDMAGetSCQCtx(__gm__ UDMAInfo* udmaInfo, uin return reinterpret_cast<__gm__ UDMACQCtx*>(udmaInfo->scqPtr + (pe * qpNum + qpIdx) * sizeof(UDMACQCtx)); } -__aicore__ inline __gm__ UDMAMemInfo* UDMAGetRemoteMemInfo(__gm__ UDMAInfo* udmaInfo, uint32_t pe) +__aicore__ inline __gm__ UDMAMemInfo* UDMAGetRemoteMemInfo(__gm__ UDMAInfo* udmaInfo, uint32_t pe, uint32_t qpIdx) { - return reinterpret_cast<__gm__ UDMAMemInfo*>(udmaInfo->memPtr + sizeof(UDMAMemInfo) * pe); + uint32_t qpNum = udmaInfo->qpNum; + return reinterpret_cast<__gm__ UDMAMemInfo*>(udmaInfo->memPtr + sizeof(UDMAMemInfo) * (pe * qpNum + qpIdx)); } __aicore__ inline void UDMAPollCQUpdateInfo( @@ -244,7 +245,7 @@ __aicore__ inline void UDMAPostSend( uint32_t wqeCnt = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->wqeCntAddr), 0); UDMAPollCQWhenSQOverflow(udmaInfo, qpCtxEntry, wqeCnt, pe, qpIdx); - __gm__ UDMAMemInfo* remoteMemInfo = UDMAGetRemoteMemInfo(udmaInfo, pe); + __gm__ UDMAMemInfo* remoteMemInfo = UDMAGetRemoteMemInfo(udmaInfo, pe, qpIdx); __gm__ uint8_t* wqeAddr = reinterpret_cast<__gm__ uint8_t*>(qpCtxEntry->bufAddr + wqeSize * (curHead % TILEXR_UDMA_SQ_BB_COUNT)); __gm__ UDMASqeCtx* sqeCtx = reinterpret_cast<__gm__ UDMASqeCtx*>(wqeAddr); @@ -289,17 +290,27 @@ __aicore__ inline void UDMAWriteNotify( } template -__aicore__ inline void UDMAPutNbi( - const __gm__ CommArgs* args, int targetRank, const __gm__ T* localSrc, uint64_t byteOffset, uint32_t byteCount) +__aicore__ inline void UDMAPutNbiQp( + const __gm__ CommArgs* args, int targetRank, uint32_t qpIdx, + const __gm__ T* localSrc, uint64_t byteOffset, uint32_t byteCount) { if (!UDMARegistryEnabled(args)) return; + __gm__ UDMAInfo* udmaInfo = GetUDMAInfo(args); + if (qpIdx >= udmaInfo->qpNum) return; auto registry = GetUDMARegistry(args); if (!UDMARegisteredRangeValid(registry, targetRank, byteOffset, byteCount)) return; auto remoteAddr = UDMARegisteredRemoteAddr(registry, targetRank, byteOffset); UDMAWrite(args, remoteAddr, reinterpret_cast<__gm__ uint8_t*>(const_cast<__gm__ T*>(localSrc)), - targetRank, 0, byteCount); + targetRank, qpIdx, byteCount); +} + +template +__aicore__ inline void UDMAPutNbi( + const __gm__ CommArgs* args, int targetRank, const __gm__ T* localSrc, uint64_t byteOffset, uint32_t byteCount) +{ + UDMAPutNbiQp(args, targetRank, 0, localSrc, byteOffset, byteCount); } template @@ -310,16 +321,26 @@ __aicore__ inline void UDMAPutRegisteredNbi( } template -__aicore__ inline void UDMAGetNbi( - const __gm__ CommArgs* args, int sourceRank, __gm__ T* localDst, uint64_t byteOffset, uint32_t byteCount) +__aicore__ inline void UDMAGetNbiQp( + const __gm__ CommArgs* args, int sourceRank, uint32_t qpIdx, + __gm__ T* localDst, uint64_t byteOffset, uint32_t byteCount) { if (!UDMARegistryEnabled(args)) return; + __gm__ UDMAInfo* udmaInfo = GetUDMAInfo(args); + if (qpIdx >= udmaInfo->qpNum) return; auto registry = GetUDMARegistry(args); if (!UDMARegisteredRangeValid(registry, sourceRank, byteOffset, byteCount)) return; auto remoteAddr = UDMARegisteredRemoteAddr(registry, sourceRank, byteOffset); - UDMARead(args, reinterpret_cast<__gm__ uint8_t*>(localDst), remoteAddr, sourceRank, 0, byteCount); + UDMARead(args, reinterpret_cast<__gm__ uint8_t*>(localDst), remoteAddr, sourceRank, qpIdx, byteCount); +} + +template +__aicore__ inline void UDMAGetNbi( + const __gm__ CommArgs* args, int sourceRank, __gm__ T* localDst, uint64_t byteOffset, uint32_t byteCount) +{ + UDMAGetNbiQp(args, sourceRank, 0, localDst, byteOffset, byteCount); } template @@ -359,13 +380,24 @@ __aicore__ inline void UDMAPutRegisteredSignalNbi( UDMAPutSignalNbi(args, targetRank, localSrc, byteOffset, byteCount, signalByteOffset, signal); } -__aicore__ inline void UDMAQuiet(const __gm__ CommArgs* args, int targetRank) +__aicore__ inline uint32_t UDMAQuietStatusQp(const __gm__ CommArgs* args, int targetRank, uint32_t qpIdx) { - if (!UDMAEnabled(args)) return; + if (!UDMAEnabled(args)) return 0xFFFFFFFFU; __gm__ UDMAInfo* udmaInfo = GetUDMAInfo(args); - __gm__ UDMAWQCtx* qpCtxEntry = UDMAGetWQCtx(udmaInfo, targetRank, 0); + if (qpIdx >= udmaInfo->qpNum) return 0xFFFFFFFFU; + __gm__ UDMAWQCtx* qpCtxEntry = UDMAGetWQCtx(udmaInfo, targetRank, qpIdx); uint32_t wqeCnt = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->wqeCntAddr), 0); - (void)UDMAPollCQ(udmaInfo, targetRank, 0, wqeCnt); + return UDMAPollCQ(udmaInfo, targetRank, qpIdx, wqeCnt); +} + +__aicore__ inline uint32_t UDMAQuietStatus(const __gm__ CommArgs* args, int targetRank) +{ + return UDMAQuietStatusQp(args, targetRank, 0); +} + +__aicore__ inline void UDMAQuiet(const __gm__ CommArgs* args, int targetRank) +{ + (void)UDMAQuietStatus(args, targetRank); } } // namespace TileXR diff --git a/tests/udma/CMakeLists.txt b/tests/udma/CMakeLists.txt index 6c1e41a..7ac0edd 100644 --- a/tests/udma/CMakeLists.txt +++ b/tests/udma/CMakeLists.txt @@ -43,7 +43,7 @@ message(STATUS "TILEXR_ROOT: ${TILEXR_ROOT}") # 查找库 find_library(TILEXR_LIB tile-comm - HINTS "${TILEXR_ROOT}/install/lib" "${TILEXR_ROOT}/build/src/comm" + HINTS "${TILEXR_ROOT}/install/lib" "${TILEXR_ROOT}/install/lib64" "${TILEXR_ROOT}/build/src/comm" REQUIRED) message(STATUS "Found tile-comm: ${TILEXR_LIB}") @@ -52,6 +52,7 @@ message(STATUS "Found tile-comm: ${TILEXR_LIB}") include_directories( ${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/ ${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/runtime/ + ${ASCEND_HOME_PATH}/${ARCH}-linux/asc/include/ ${ASCEND_HOME_PATH}/${ARCH}-linux/include/ ${ASCEND_DRIVER_PATH}/kernel/inc ${TILEXR_ROOT}/3rdparty @@ -83,6 +84,22 @@ target_include_directories(test_tilexr_udma_transport_layout PRIVATE ${TILEXR_ROOT}/src/comm ) +add_executable(test_tilexr_udma_p2p_perf_config + unit/test_tilexr_udma_p2p_perf_config.cpp +) + +target_include_directories(test_tilexr_udma_p2p_perf_config PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} +) + +add_executable(test_tilexr_udma_p2p_source_guard + unit/test_tilexr_udma_p2p_source_guard.cpp +) + +target_compile_definitions(test_tilexr_udma_p2p_source_guard PRIVATE + TILEXR_SOURCE_ROOT="${TILEXR_ROOT}" +) + # 集成测试:TileXR UDMA add_executable(test_tilexr_udma integration/test_tilexr_udma.cpp @@ -99,6 +116,8 @@ set(INSTALL_TARGETS test_tilexr_udma test_tilexr_udma_registry test_tilexr_udma_transport_layout + test_tilexr_udma_p2p_perf_config + test_tilexr_udma_p2p_source_guard ) if(BUILD_TILEXR_UDMA_DEMO) @@ -161,6 +180,7 @@ if(BUILD_TILEXR_UDMA_DEMO) -I${ASCEND_HOME_PATH}/${ARCH}-linux/tikcpp/tikcfw/interface -I${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/ -I${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/runtime/ + -I${ASCEND_HOME_PATH}/${ARCH}-linux/asc/include/ -I${ASCEND_HOME_PATH}/${ARCH}-linux/include/ -I${ASCEND_DRIVER_PATH}/kernel/inc -I${TILEXR_ROOT}/3rdparty @@ -175,6 +195,7 @@ if(BUILD_TILEXR_UDMA_DEMO) -shared ${TILEXR_UDMA_KERNEL_LINK_OPTIONS} -DCATLASS_ARCH=${TILEXR_UDMA_CATLASS_ARCH} + -DTILEXR_UDMA_FORCE_ENABLE=1 ${TILEXR_UDMA_DEMO_KERNEL_INCLUDES} "${CMAKE_CURRENT_SOURCE_DIR}/demo/tilexr_udma_demo_kernel.cpp" -L${ASCEND_DRIVER_PATH}/lib64/driver @@ -193,6 +214,7 @@ if(BUILD_TILEXR_UDMA_DEMO) -o "${TILEXR_UDMA_DEMO_KERNEL_SO}" DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/demo/tilexr_udma_demo_kernel.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/demo/tilexr_udma_p2p_perf_config.h" "${TILEXR_ROOT}/src/include/tilexr_udma.h" VERBATIM COMMENT "Building TileXR UDMA demo kernel with bisheng" diff --git a/tests/udma/demo/README.md b/tests/udma/demo/README.md index 5894707..b6273bd 100644 --- a/tests/udma/demo/README.md +++ b/tests/udma/demo/README.md @@ -17,6 +17,10 @@ The demo target requires `bisheng`. If `bisheng` is not available, `build.sh` st cd /path/to/TileXR/tests/udma bash demo/run_tilexr_udma_demo.sh 0 2 16 2 0 bash demo/run_tilexr_udma_demo.sh 1 2 16 2 0 +bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 16777216 2 20 5 0 +bash demo/run_tilexr_udma_p2p_concurrency_sweep.sh 4096 16777216 2 20 5 0 1 direct_urma,memory,data_as_flag unidir,bidir 1,2,4,8 +bash demo/run_tilexr_udma_p2p_concurrency_sweep.sh 16777216 67108864 2 20 5 0 1 memory,memory_segmented,memory_segmented_rotate unidir 1 +TILEXR_P2P_DEBUG_SUMMARY=1 bash demo/run_tilexr_udma_p2p_concurrency_sweep.sh 16777216 67108864 2 20 5 0 1 memory_segmented_trace,memory_segmented_rotate_trace unidir 1,2 ``` Arguments: @@ -27,12 +31,48 @@ run_tilexr_udma_demo.sh - `test_type=0`: all-gather style UDMA put. - `test_type=1`: UDMA put with signal. +- `test_type=4`: directed 2-card P2P performance mode. Use + `demo/run_tilexr_udma_p2p_perf.sh` instead of calling the binary directly. - `rank_size`: number of local ranks to launch. - `elements_per_rank`: `int32_t` elements in each rank segment. - `npu_count`: number of NPUs available to this run. - `first_npu`: first physical NPU id to use. Each run writes per-rank logs under `tests/udma/logs/tilexr_udma_demo_*`. +P2P performance runs write logs under `tests/udma/logs/tilexr_udma_p2p_perf_*` +and append CSV rows to `p2p_perf.csv`. + +The P2P performance scripts expose these user-facing transports: +`direct_urma`, `memory`, `memory_consume`, `data_as_flag`, and +`data_as_flag_epoch_ordered`. `direct_urma` uses the current parallel +multi-jetty implementation internally; `block_dim=1` with one QP matches the +previous single-QP direct URMA baseline, while `block_dim=N` with +`TILEXR_UDMA_QP_NUM=N` uses up to `N` QPs/jettys in parallel. + +Two additional diagnostic IPC transports help isolate large-message `memory` +throughput drops: + +- `memory_segmented`: one `block_dim=1` kernel still transfers the full + payload, but calls the internal memory copy helper in 16 MiB segments while + writing the normal continuous peer window. +- `memory_segmented_rotate`: transfers the full payload but rotates destination + writes inside one 16 MiB peer-window span. This is a performance diagnostic + only; payload validation is skipped for this transport because later segments + overwrite earlier destination bytes. +- `memory_segmented_trace`: same as `memory_segmented`, but uses 8 MiB + segments and records per-segment copy cycles for block 0 in the debug + summary. Set `TILEXR_P2P_DEBUG_SUMMARY=1` to print `seg0Cycles` through + `seg7Cycles`. +- `memory_segmented_rotate_trace`: same as `memory_segmented_rotate`, with the + same 8 MiB per-segment cycle trace. Compare it with + `memory_segmented_trace` to keep destination address span separate from + sustained single-stream write behavior. + +For the `memory` large-message knee, use the trace variants as a root-cause +probe. If `block_dim=1` later segments cost more cycles while `block_dim=2` +stays flat, that supports accumulated peer IPC write backpressure or remote +write-queue drain limits. If all `block_dim=1` segments are uniformly slower +than `block_dim=2`, the result points to a fixed single-stream bandwidth cap. Run this demo only on A5 / Ascend950 / 950 hardware. Builds or smoke tests on other Ascend chips are not valid UDMA runtime validation. diff --git a/tests/udma/demo/plot_tilexr_udma_p2p_perf.py b/tests/udma/demo/plot_tilexr_udma_p2p_perf.py new file mode 100644 index 0000000..52e049e --- /dev/null +++ b/tests/udma/demo/plot_tilexr_udma_p2p_perf.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +"""Plot TileXR UDMA P2P performance CSV files.""" + +import argparse +import csv +import math +from collections import defaultdict +from pathlib import Path +from glob import glob + + +def parse_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("csv", nargs="+", help="p2p_perf.csv files to merge") + parser.add_argument("--output", default="tilexr_udma_p2p_perf_curve.png", help="bandwidth PNG path") + parser.add_argument("--latency-output", default=None, help="latency PNG path") + parser.add_argument("--latency-max-bytes", type=int, default=1024 * 1024, + help="only plot latency rows up to this size; default: 1048576") + parser.add_argument("--direction", default=None, help="only plot one direction, for example 0to1") + parser.add_argument("--labels", default=None, + help="comma-separated labels matching input CSV files, for example direct_urma,memory") + parser.add_argument("--series-by", default="auto", choices=["auto", "label", "transport", "traffic", "block_dim"], + help="series grouping for new CSVs; default uses transport+traffic+block_dim") + parser.add_argument("--metric", default="bw_GBps", choices=["bw_GBps", "per_flow_bw_GBps"], + help="bandwidth metric to plot; default plots aggregate bandwidth") + return parser.parse_args() + + +def infer_label(path): + text = str(path) + if "direct_urma" in text: + return "direct_urma" + if "memory" in text: + return "memory" + return Path(path).parent.name or Path(path).stem + + +def expand_paths(paths): + expanded = [] + for path in paths: + matches = sorted(glob(path)) + expanded.extend(matches if matches else [path]) + return expanded + + +def row_value(row, key, default): + value = row.get(key) + return default if value is None or value == "" else value + + +def build_series(row, path, fallback_label, series_by): + transport = row_value(row, "transport", fallback_label) + traffic = row_value(row, "traffic", "unidir") + block_dim = row_value(row, "block_dim", "1") + if series_by == "label": + return fallback_label + if series_by == "transport": + return transport + if series_by == "traffic": + return traffic + if series_by == "block_dim": + return f"bd={block_dim}" + return f"{transport} {traffic} bd={block_dim}" + + +def load_rows(paths, labels=None, direction_filter=None, series_by="auto"): + if labels is not None and len(labels) != len(paths): + raise SystemExit("--labels count must match input CSV count") + merged = {} + for index, path in enumerate(paths): + label = labels[index] if labels is not None else infer_label(path) + with open(path, newline="") as handle: + reader = csv.DictReader(handle) + for row in reader: + direction = row_value(row, "direction", "") + if direction_filter is not None and direction != direction_filter: + continue + series = label if labels is not None else build_series(row, path, label, series_by) + key = (series, int(row["bytes"])) + merged[key] = { + "series": series, + "direction": direction, + "transport": row_value(row, "transport", label), + "traffic": row_value(row, "traffic", "unidir"), + "block_dim": int(row_value(row, "block_dim", "1")), + "bytes": int(row["bytes"]), + "avg_us": float(row["avg_us"]), + "bw_GBps": float(row["bw_GBps"]), + "per_flow_bw_GBps": float(row_value(row, "per_flow_bw_GBps", row["bw_GBps"])), + "src": row_value(row, "src", ""), + "dst": row_value(row, "dst", ""), + "ranks": row_value(row, "ranks", ""), + "status": int(row["status"]), + "errors": int(row["errors"]), + } + grouped = defaultdict(list) + for row in merged.values(): + grouped[row["series"]].append(row) + for rows in grouped.values(): + rows.sort(key=lambda item: item["bytes"]) + return grouped + + +def default_latency_output(output): + path = Path(output) + suffix = path.suffix or ".png" + return str(path.with_name(path.stem + "_latency" + suffix)) + + +def format_bytes(value): + units = [("GB", 1024 ** 3), ("MB", 1024 ** 2), ("KB", 1024)] + for suffix, scale in units: + if value >= scale and value % scale == 0: + return f"{value // scale}{suffix}" + return f"{value}B" + + +def byte_ticks(grouped): + values = sorted({row["bytes"] for rows in grouped.values() for row in rows}) + return values, [format_bytes(value) for value in values] + + +def filter_by_max_bytes(grouped, max_bytes): + filtered = defaultdict(list) + for direction, rows in grouped.items(): + filtered[direction] = [row for row in rows if row["bytes"] <= max_bytes] + return {direction: rows for direction, rows in filtered.items() if rows} + + +def plot_metric(grouped, metric, ylabel, title, output): + import matplotlib.pyplot as plt + + plt.figure(figsize=(9, 5.2)) + for direction in sorted(grouped): + rows = grouped[direction] + plt.plot([row["bytes"] for row in rows], [row[metric] for row in rows], marker="o", label=direction) + + plt.xscale("log", base=2) + ticks, labels = byte_ticks(grouped) + plt.xticks(ticks, labels, rotation=35, ha="right") + plt.xlabel("message size") + plt.ylabel(ylabel) + plt.title(title) + plt.grid(True, which="both", linestyle="--", alpha=0.35) + plt.legend() + plt.tight_layout() + plt.savefig(output, dpi=160) + plt.close() + print(output) + + +def main(): + args = parse_args() + try: + import matplotlib # noqa: F401 + except ImportError as exc: + raise SystemExit("matplotlib is required to plot the curve: python3 -m pip install matplotlib") from exc + + paths = expand_paths(args.csv) + labels = args.labels.split(",") if args.labels else None + grouped = load_rows(paths, labels=labels, direction_filter=args.direction, series_by=args.series_by) + if not grouped: + raise SystemExit("no rows found") + + bad_rows = [ + row for rows in grouped.values() for row in rows + if row["status"] != 0 or row["errors"] != 0 + ] + if bad_rows: + raise SystemExit("refuse to plot rows with nonzero status/errors") + + metric_label = "aggregate bw_GBps" if args.metric == "bw_GBps" else "per-flow bw_GBps" + plot_metric(grouped, args.metric, metric_label, "TileXR P2P bandwidth, rank_size=2", args.output) + latency_grouped = filter_by_max_bytes(grouped, args.latency_max_bytes) + if not latency_grouped: + raise SystemExit(f"no rows found for latency <= {args.latency_max_bytes} bytes") + plot_metric(latency_grouped, "avg_us", "avg_us", + f"TileXR UDMA P2P latency, rank_size=2, <= {format_bytes(args.latency_max_bytes)}", + args.latency_output or default_latency_output(args.output)) + + +if __name__ == "__main__": + main() diff --git a/tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh b/tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh new file mode 100644 index 0000000..652f81f --- /dev/null +++ b/tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# +# Run TileXR P2P perf across transports, traffic modes, and blockDim values. +# + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +UDMA_DIR=$(cd "${SCRIPT_DIR}/.." && pwd) + +min_bytes=${1:-4096} +max_bytes=${2:-16777216} +step_factor=${3:-2} +iters=${4:-20} +warmup_iters=${5:-5} +first_npu=${6:-2} +check=${7:-1} +transports_csv=${8:-direct_urma,memory,memory_consume,data_as_flag} +traffic_csv=${9:-unidir,bidir} +block_dims_csv=${10:-1,2,4,8} + +IFS=',' read -r -a transports <<<"${transports_csv}" +IFS=',' read -r -a traffics <<<"${traffic_csv}" +IFS=',' read -r -a block_dims <<<"${block_dims_csv}" + +base_port=${TILEXR_P2P_SWEEP_BASE_PORT:-13600} +run_index=0 + +echo "==========================================" +echo " TileXR P2P Concurrency Sweep" +echo "==========================================" +echo "Range: ${min_bytes} -> ${max_bytes}, step=${step_factor}" +echo "Iters: ${iters}, warmup=${warmup_iters}" +echo "First NPU: ${first_npu}" +echo "Check: ${check}" +echo "Transports: ${transports_csv}" +echo "Traffic: ${traffic_csv}" +echo "Block dims: ${block_dims_csv}" +echo "==========================================" + +cd "${UDMA_DIR}" +for transport in "${transports[@]}"; do + for traffic in "${traffics[@]}"; do + for block_dim in "${block_dims[@]}"; do + port=$((base_port + run_index)) + export TILEXR_COMM_ID="127.0.0.1:${port}" + echo "==== transport=${transport} traffic=${traffic} block_dim=${block_dim} port=${port} ====" + bash demo/run_tilexr_udma_p2p_perf.sh \ + 0 1 "${min_bytes}" "${max_bytes}" "${step_factor}" "${iters}" "${warmup_iters}" \ + "${first_npu}" "${check}" "${transport}" "${block_dim}" "${traffic}" + run_index=$((run_index + 1)) + done + done +done diff --git a/tests/udma/demo/run_tilexr_udma_p2p_perf.sh b/tests/udma/demo/run_tilexr_udma_p2p_perf.sh new file mode 100755 index 0000000..0ffd9a3 --- /dev/null +++ b/tests/udma/demo/run_tilexr_udma_p2p_perf.sh @@ -0,0 +1,112 @@ +#!/bin/bash +# +# Run the TileXR 2-card directed UDMA P2P performance demo. +# + +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +UDMA_DIR=$(cd "${SCRIPT_DIR}/.." && pwd) +TILEXR_ROOT=$(cd "${UDMA_DIR}/../.." && pwd) +INSTALL_DIR="${UDMA_DIR}/install" + +src_rank=${1:-0} +dst_rank=${2:-1} +min_bytes=${3:-4096} +max_bytes=${4:-16777216} +step_factor=${5:-2} +iters=${6:-20} +warmup_iters=${7:-5} +first_npu=${8:-0} +check=${9:-1} +transport=${10:-direct_urma} +block_dim=${11:-1} +traffic=${12:-unidir} + +source "${TILEXR_ROOT}/scripts/common_env.sh" + +export TILEXR_COMM_ID=${TILEXR_COMM_ID:-127.0.0.1:10067} +export TILEXR_DEMO_NPUS=2 +export TILEXR_DEMO_FIRST_NPU=${first_npu} +if [ "${transport}" = "direct_urma" ] || [ "${transport}" = "udma" ]; then + export TILEXR_UDMA_QP_NUM="${block_dim}" +else + export TILEXR_UDMA_QP_NUM="${TILEXR_UDMA_QP_NUM:-1}" +fi +export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib64:${TILEXR_ROOT}/install/lib:${TILEXR_ROOT}/install/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}" + +bin="${INSTALL_DIR}/bin/tilexr_udma_demo" +if [ ! -x "${bin}" ]; then + echo "ERROR: ${bin} not found. Run: cd ${UDMA_DIR} && bash build.sh" + exit 1 +fi + +log_dir="${UDMA_DIR}/logs/tilexr_udma_p2p_perf_$(date +%Y%m%d_%H%M%S)_${transport}_${traffic}_bd${block_dim}_${src_rank}to${dst_rank}" +csv_path="${log_dir}/p2p_perf.csv" +mkdir -p "${log_dir}" + +echo "==========================================" +echo " TileXR UDMA P2P Performance Demo" +echo "==========================================" +echo "Binary: ${bin}" +echo "Direction: ${src_rank}->${dst_rank}" +echo "Rank size: 2" +echo "Min bytes: ${min_bytes}" +echo "Max bytes: ${max_bytes}" +echo "Step factor: ${step_factor}" +echo "Iters: ${iters}" +echo "Warmup iters: ${warmup_iters}" +echo "First NPU: ${first_npu}" +echo "Check: ${check}" +echo "Transport: ${transport}" +echo "Block dim: ${block_dim}" +echo "Traffic: ${traffic}" +echo "UDMA QP num: ${TILEXR_UDMA_QP_NUM}" +echo "TILEXR_COMM_ID:${TILEXR_COMM_ID}" +echo "Log dir: ${log_dir}" +echo "CSV: ${csv_path}" +echo "==========================================" + +pids=() +for rank in 0 1; do + log_file="${log_dir}/rank_${rank}.log" + echo "Starting rank ${rank}, log=${log_file}" + RANK=${rank} RANK_SIZE=2 TILEXR_P2P_LOG_DIR="${log_dir}" TILEXR_P2P_CSV="${csv_path}" "${bin}" \ + 2 "${rank}" 4 0 2 "${first_npu}" \ + "${src_rank}" "${dst_rank}" "${min_bytes}" "${max_bytes}" "${step_factor}" \ + "${iters}" "${warmup_iters}" "${check}" "${csv_path}" "${log_dir}" "${transport}" \ + "${block_dim}" "${traffic}" \ + >"${log_file}" 2>&1 & + pids+=("$!") +done + +ret=0 +for idx in "${!pids[@]}"; do + pid=${pids[$idx]} + rank=${idx} + if wait "${pid}"; then + echo "rank ${rank} finished successfully" + else + r=$? + echo "rank ${rank} failed with exit code ${r}" + ret=${r} + fi +done + +echo "==========================================" +echo " Rank Log Tails" +echo "==========================================" +for rank in 0 1; do + log_file="${log_dir}/rank_${rank}.log" + echo "----- rank ${rank}: ${log_file} -----" + tail -n 80 "${log_file}" || true +done + +if [ -f "${csv_path}" ]; then + echo "==========================================" + echo " CSV" + echo "==========================================" + cat "${csv_path}" +fi + +exit "${ret}" diff --git a/tests/udma/demo/tilexr_udma_demo.cpp b/tests/udma/demo/tilexr_udma_demo.cpp index eabe662..f57d76b 100644 --- a/tests/udma/demo/tilexr_udma_demo.cpp +++ b/tests/udma/demo/tilexr_udma_demo.cpp @@ -9,7 +9,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -21,12 +23,37 @@ #include "acl/acl.h" #include "tilexr_api.h" #include "tilexr_types.h" +#include "tilexr_udma_p2p_perf_config.h" extern void launch_tilexr_udma_all_gather( uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR data, GM_ADDR debug, int32_t elementsPerRank); extern void launch_tilexr_udma_put_signal( uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR data, GM_ADDR signals, GM_ADDR debug, int32_t elementsPerRank, uint64_t signal); +extern void launch_tilexr_udma_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic); +extern void launch_tilexr_udma_p2p_post_only_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic); +extern void launch_tilexr_memory_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic); +extern void launch_tilexr_memory_segmented_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, + int32_t traffic, uint32_t segmentBytes, int32_t rotateWindow, uint32_t traceSegments); +extern void launch_tilexr_memory_consume_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step); +extern void launch_tilexr_data_as_flag_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic); +extern void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step, int32_t strict); namespace { constexpr int32_t kDefaultElementsPerRank = 16; @@ -37,6 +64,9 @@ constexpr int kDemoBarrierPortOffset = 97; constexpr size_t kUdmaRegistrationAlignment = 2 * 1024 * 1024; constexpr int kConnectRetryCount = 500; constexpr int kConnectRetrySleepMs = 10; +constexpr size_t kP2PDebugWords = 64; +constexpr uint32_t kA5CycleToUsDivisor = 1000; +constexpr int32_t kP2PMagicBase = 0x5444554d; // "TDUM" struct BarrierEndpoint { uint16_t port; @@ -48,6 +78,21 @@ int GetEnvInt(const char* name, int defaultValue) return value == nullptr ? defaultValue : std::atoi(value); } +const char* GetEnvString(const char* name, const char* defaultValue) +{ + const char* value = std::getenv(name); + return value == nullptr || value[0] == '\0' ? defaultValue : value; +} + +bool GetEnvFlag(const char* name, bool defaultValue) +{ + const char* value = std::getenv(name); + if (value == nullptr || value[0] == '\0') { + return defaultValue; + } + return std::atoi(value) != 0; +} + int GetDeviceIdFromEnv(int rank, int npuCount, int firstNpu) { const char* devices = std::getenv("TILEXR_DEMO_DEVICES"); @@ -359,6 +404,515 @@ bool ValidateSignals(int rank, int rankSize, const std::vector& signal return ok; } +uint64_t RoundUpToAlignment(uint64_t value, uint64_t alignment) +{ + return ((value + alignment - 1) / alignment) * alignment; +} + +bool WriteTextFile(const std::string& path, const std::string& text) +{ + std::ofstream out(path.c_str(), std::ios::out | std::ios::trunc); + if (!out) { + return false; + } + out << text; + return static_cast(out); +} + +bool ReadP2PStatusFile(const std::string& path, TileXR::Demo::P2PRankStatus* status) +{ + std::ifstream in(path.c_str()); + if (!in) { + return false; + } + in >> status->status >> status->errors; + if (!in) { + return false; + } + if (in >> status->elapsedMs) { + if (in >> status->avgUsOverride) { + return true; + } + status->avgUsOverride = 0.0; + return true; + } + return in.eof(); +} + +bool AppendP2PPerfCsvRow(const TileXR::Demo::P2PPerfOptions& options, const TileXR::Demo::P2PPerfRow& row) +{ + if (options.csvPath.empty()) { + return true; + } + std::ifstream check(options.csvPath.c_str()); + bool exists = static_cast(check); + check.close(); + std::ofstream out(options.csvPath.c_str(), std::ios::out | std::ios::app); + if (!out) { + return false; + } + if (!exists) { + out << TileXR::Demo::P2PPerfCsvHeader(); + } + out << TileXR::Demo::FormatP2PPerfCsvRow(row); + return static_cast(out); +} + +bool CheckPeerMemsReady(int rank, int rankSize, const TileXR::CommArgs& args) +{ + for (int peer = 0; peer < rankSize; ++peer) { + if (args.peerMems[peer] == nullptr) { + std::cerr << "[rank " << rank << "] ERROR: peerMems[" << peer << "] is null" << std::endl; + return false; + } + } + return true; +} + +bool IsP2PSourceRank(int rank, const TileXR::Demo::P2PPerfOptions& options) +{ + return options.traffic == TileXR::Demo::P2PTraffic::BiDir ? + (rank == options.srcRank || rank == options.dstRank) : rank == options.srcRank; +} + +bool IsP2PReceiveRank(int rank, const TileXR::Demo::P2PPerfOptions& options) +{ + return options.traffic == TileXR::Demo::P2PTraffic::BiDir ? + (rank == options.srcRank || rank == options.dstRank) : rank == options.dstRank; +} + +uint32_t FoldP2PDebugStatus(const std::vector& debug, uint32_t blockDim) +{ + uint32_t status = debug.size() > 5 ? debug[5] : 0xffffffffu; + const uint32_t limit = std::min(blockDim, 16U); + for (uint32_t i = 0; i < limit && 8U + i < debug.size(); ++i) { + status |= debug[8U + i]; + } + return status; +} + +void PrintP2PDebugSummary(int rank, const std::vector& debug, uint32_t blockDim) +{ + std::ostringstream out; + out << "p2p debug:" + << " magic=0x" << std::hex << (debug.size() > 0 ? debug[0] : 0) + << std::dec + << " kernelRank=" << (debug.size() > 1 ? debug[1] : 0) + << " enabled=" << (debug.size() > 2 ? debug[2] : 0) + << " bytes=" << (debug.size() > 3 ? debug[3] : 0) + << " status=" << (debug.size() > 5 ? debug[5] : 0xffffffffu) + << " blockNum=" << (debug.size() > 6 ? debug[6] : 0) + << " qpNum=" << (debug.size() > 7 ? debug[7] : 0) + << " jettyCount=" << (debug.size() > 15 ? debug[15] : 0) + << " postCycles=" << (debug.size() > 29 ? + ((static_cast(debug[29]) << 32) | debug[28]) : 0) + << " postCycleSum=" << (debug.size() > 31 ? + ((static_cast(debug[31]) << 32) | debug[30]) : 0) + << " validateSum=" << (debug.size() > 33 ? + ((static_cast(debug[33]) << 32) | debug[32]) : 0) + << " ctxSum=" << (debug.size() > 35 ? + ((static_cast(debug[35]) << 32) | debug[34]) : 0) + << " sqeSum=" << (debug.size() > 37 ? + ((static_cast(debug[37]) << 32) | debug[36]) : 0) + << " sgeSum=" << (debug.size() > 39 ? + ((static_cast(debug[39]) << 32) | debug[38]) : 0) + << " cleanSum=" << (debug.size() > 41 ? + ((static_cast(debug[41]) << 32) | debug[40]) : 0) + << " doorbellSum=" << (debug.size() > 43 ? + ((static_cast(debug[43]) << 32) | debug[42]) : 0) + << " quietSum=" << (debug.size() > 45 ? + ((static_cast(debug[45]) << 32) | debug[44]) : 0); + const uint32_t traceSegmentCount = debug.size() > 46 ? debug[46] : 0; + if (traceSegmentCount != 0) { + out << " traceSegments=" << traceSegmentCount; + const uint32_t traceLimit = std::min(traceSegmentCount, 8U); + for (uint32_t i = 0; i < traceLimit; ++i) { + const uint32_t lowIdx = 48U + i * 2U; + const uint64_t cycles = lowIdx + 1U < debug.size() ? + ((static_cast(debug[lowIdx + 1U]) << 32) | debug[lowIdx]) : 0U; + out << " seg" << i << "Cycles=" << cycles; + } + } + const uint32_t limit = std::min(blockDim, 8U); + for (uint32_t i = 0; i < limit; ++i) { + out << " b" << i + << "{status=" << (8U + i < debug.size() ? debug[8U + i] : 0xffffffffu) + << ",offset=" << (16U + i < debug.size() ? debug[16U + i] : 0) + << ",bytes=" << (24U + i < debug.size() ? debug[24U + i] : 0) + << "}"; + } + PrintStatus(rank, out.str()); +} + +bool ClearLocalPeerWindow( + int rank, const TileXR::CommArgs& commArgsHost, uint64_t offset, uint64_t bytes, const std::string& name) +{ + if (bytes == 0) { + return true; + } + void* localWindow = reinterpret_cast(commArgsHost.peerMems[rank] + offset); + return CheckAcl(rank, "aclrtMemset " + name, aclrtMemset(localWindow, static_cast(bytes), 0, + static_cast(bytes))); +} + +void LaunchP2PKernel( + aclrtStream stream, GM_ADDR commArgsDev, GM_ADDR srcDev, GM_ADDR dstDev, GM_ADDR debugDev, + const TileXR::Demo::P2PPerfOptions& options, uint64_t dstOffset, + uint32_t transferBytes, uint32_t pattern, int32_t magic, int32_t step) +{ + const int32_t traffic = options.traffic == TileXR::Demo::P2PTraffic::BiDir ? 1 : 0; + const int32_t dataAsFlagStrict = GetEnvFlag("TILEXR_DATA_AS_FLAG_STRICT", false) ? 1 : 0; + if (options.transport == TileXR::Demo::P2PTransport::Memory) { + launch_tilexr_memory_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic); + return; + } + if (options.transport == TileXR::Demo::P2PTransport::MemorySegmented || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotate || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedTrace || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace) { + const bool rotateWindow = + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotate || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace; + const bool traceSegments = + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedTrace || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace; + launch_tilexr_memory_segmented_p2p_perf( + options.blockDim, stream, commArgsDev, srcDev, debugDev, options.srcRank, options.dstRank, + dstOffset, transferBytes, pattern, traffic, + static_cast(traceSegments ? TileXR::Demo::kP2PMemoryTraceSegmentBytes : + TileXR::Demo::kP2PMemorySegmentBytes), + rotateWindow ? 1 : 0, traceSegments ? 8U : 0U); + return; + } + if (options.transport == TileXR::Demo::P2PTransport::MemoryConsume) { + launch_tilexr_memory_consume_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic, magic, step); + return; + } + if (options.transport == TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered) { + launch_tilexr_data_as_flag_epoch_ordered_p2p_perf( + options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic, magic, step, + dataAsFlagStrict); + return; + } + if (options.transport == TileXR::Demo::P2PTransport::DataAsFlag) { + launch_tilexr_data_as_flag_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic); + return; + } + if (options.transport == TileXR::Demo::P2PTransport::DirectUrmaPostOnly) { + launch_tilexr_udma_p2p_post_only_perf(options.blockDim, stream, commArgsDev, srcDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic); + return; + } + launch_tilexr_udma_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, debugDev, + options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic); +} + +bool RunP2PPerfMode( + int rank, int rankSize, TileXRCommPtr comm, const TileXR::CommArgs& commArgsHost, + GM_ADDR commArgsDev, aclrtStream stream, + const TileXR::Demo::P2PPerfOptions& options) +{ + std::string error; + if (!TileXR::Demo::ValidateP2PPerfOptions(options, rankSize, &error)) { + std::cerr << "[rank " << rank << "] ERROR: " << error << std::endl; + return false; + } + + void* registeredMemory = nullptr; + uint32_t* debug = nullptr; + TileXRUDMAMemHandle udmaHandle = 0; + bool udmaRegistered = false; + aclrtEvent startEvent = nullptr; + aclrtEvent stopEvent = nullptr; + + const bool useMemoryTransport = options.transport == TileXR::Demo::P2PTransport::Memory; + const bool validatePeerWindow = + options.transport == TileXR::Demo::P2PTransport::Memory || + options.transport == TileXR::Demo::P2PTransport::MemorySegmented || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotate || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedTrace || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace; + const bool useLegacyDataAsFlagTransport = options.transport == TileXR::Demo::P2PTransport::DataAsFlag; + const bool useIpcTransport = TileXR::Demo::P2PTransportUsesIpc(options.transport); + const bool bothRanksActive = TileXR::Demo::P2PTransportBothRanksActive(options.transport, options.traffic); + const uint64_t srcOffset = 0; + const uint64_t dstWindowBytes = + TileXR::Demo::P2PTransportWindowBytes(options.transport, options.maxBytes, options.blockDim); + const uint64_t dstOffset = useIpcTransport ? TileXR::IPC_DATA_OFFSET : dstWindowBytes; + const uint64_t localDstOffset = dstWindowBytes; + const uint64_t debugOffset = useIpcTransport ? localDstOffset + dstWindowBytes : dstOffset + dstWindowBytes; + const uint64_t payloadBytes = debugOffset + kP2PDebugWords * sizeof(uint32_t); + const uint64_t registeredPayloadBytes = payloadBytes; + const uint64_t registeredBytes = RoundUpToAlignment(registeredPayloadBytes, kUdmaRegistrationAlignment); + + auto cleanup = [&]() { + if (startEvent != nullptr) { + CheckAcl(rank, "aclrtDestroyEvent start", aclrtDestroyEvent(startEvent)); + } + if (stopEvent != nullptr) { + CheckAcl(rank, "aclrtDestroyEvent stop", aclrtDestroyEvent(stopEvent)); + } + if (udmaRegistered) { + CheckTileXR(rank, "TileXRUDMAUnregister", TileXRUDMAUnregister(comm, udmaHandle)); + } + if (registeredMemory != nullptr) { + PrintStatus(rank, "aclrtFree p2p registered memory"); + aclrtFree(registeredMemory); + } + }; + + const std::string memoryName = useMemoryTransport ? "p2p memory local scratch" : "p2p registered memory"; + if (!CheckAcl(rank, "aclrtMalloc " + memoryName, + aclrtMalloc(®isteredMemory, registeredBytes, ACL_MEM_MALLOC_HUGE_FIRST))) { + cleanup(); + return false; + } + auto base = static_cast(registeredMemory); + auto srcDev = base + srcOffset; + auto dstDev = base + localDstOffset; + debug = reinterpret_cast(base + debugOffset); + + if (!useIpcTransport) { + if (!CheckTileXR(rank, "TileXRUDMARegister p2p", + TileXRUDMARegister(comm, static_cast(registeredMemory), registeredBytes, &udmaHandle))) { + cleanup(); + return false; + } + udmaRegistered = true; + } + PrintStatus(rank, std::string("p2p transport=") + TileXR::Demo::P2PTransportName(options.transport) + + " traffic=" + TileXR::Demo::P2PTrafficName(options.traffic) + + " blockDim=" + std::to_string(options.blockDim) + + " memory base=" + std::to_string(reinterpret_cast(registeredMemory)) + + " bytes=" + std::to_string(registeredBytes) + + " srcOffset=" + std::to_string(srcOffset) + + " dstOffset=" + std::to_string(dstOffset) + + " debugOffset=" + std::to_string(debugOffset)); + + if (!CheckAcl(rank, "aclrtCreateEvent start", aclrtCreateEvent(&startEvent)) || + !CheckAcl(rank, "aclrtCreateEvent stop", aclrtCreateEvent(&stopEvent))) { + cleanup(); + return false; + } + + bool ok = true; + int32_t launchMagic = kP2PMagicBase; + for (uint64_t bytes : TileXR::Demo::BuildP2PPerfSizeSweep(options)) { + const uint32_t transferBytes = static_cast(bytes); + const uint32_t pattern = TileXR::Demo::P2PPattern(options.srcRank, options.dstRank, bytes); + const uint64_t transferWindowBytes = + TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes, options.blockDim); + std::vector hostSrc(static_cast(transferWindowBytes), 0); + std::vector hostDst(static_cast(transferWindowBytes), 0); + std::vector hostDebug(kP2PDebugWords, 0); + TileXR::Demo::FillP2PPattern(hostSrc, pattern); + + const bool initSrc = true; + const bool initDst = true; + if ((initSrc && !CopyHostToDevice(rank, srcDev, static_cast(transferWindowBytes), hostSrc.data(), + static_cast(transferWindowBytes), "p2p src")) || + (initDst && !CopyHostToDevice(rank, dstDev, static_cast(transferWindowBytes), hostDst.data(), + static_cast(transferWindowBytes), "p2p dst")) || + !CopyHostToDevice(rank, debug, hostDebug.size() * sizeof(uint32_t), hostDebug.data(), + hostDebug.size() * sizeof(uint32_t), "p2p debug")) { + ok = false; + break; + } + if (useIpcTransport && IsP2PReceiveRank(rank, options)) { + const uint64_t clearBytes = TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes); + if (!ClearLocalPeerWindow(rank, commArgsHost, dstOffset, clearBytes, "p2p ipc dst window")) { + ok = false; + break; + } + } + if (!DemoBarrierAll(rank, rankSize, "p2p initialized bytes=" + std::to_string(bytes))) { + ok = false; + break; + } + + const int32_t warmupMagic = ++launchMagic; + for (int i = 0; i < options.warmupIters; ++i) { + if (useLegacyDataAsFlagTransport && IsP2PReceiveRank(rank, options) && + !ClearLocalPeerWindow(rank, commArgsHost, dstOffset, + TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes), "p2p data_as_flag warmup window")) { + ok = false; + break; + } + if (useLegacyDataAsFlagTransport && + !DemoBarrierAll(rank, rankSize, + "p2p data_as_flag warmup clear bytes=" + std::to_string(bytes) + + " iter=" + std::to_string(i))) { + ok = false; + break; + } + LaunchP2PKernel(stream, commArgsDev, reinterpret_cast(srcDev), reinterpret_cast(dstDev), + reinterpret_cast(debug), options, dstOffset, transferBytes, pattern, warmupMagic, i + 1); + if (!CheckAcl(rank, "aclrtSynchronizeStream p2p warmup", aclrtSynchronizeStream(stream))) { + ok = false; + break; + } + } + if (!ok || !DemoBarrierAll(rank, rankSize, "p2p warmup done bytes=" + std::to_string(bytes))) { + ok = false; + break; + } + + if (useLegacyDataAsFlagTransport && IsP2PReceiveRank(rank, options) && + !ClearLocalPeerWindow(rank, commArgsHost, dstOffset, + TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes), "p2p data_as_flag measured window")) { + ok = false; + break; + } + if (useLegacyDataAsFlagTransport && + !DemoBarrierAll(rank, rankSize, + "p2p data_as_flag measured clear bytes=" + std::to_string(bytes))) { + ok = false; + break; + } + const bool resetDebugBeforeMeasured = + options.transport == TileXR::Demo::P2PTransport::DirectUrmaPostOnly || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedTrace || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace; + if (resetDebugBeforeMeasured && + !CopyHostToDevice(rank, debug, hostDebug.size() * sizeof(uint32_t), hostDebug.data(), + hostDebug.size() * sizeof(uint32_t), "p2p post-only debug reset")) { + ok = false; + break; + } + + if (!CheckAcl(rank, "aclrtRecordEvent start", aclrtRecordEvent(startEvent, stream))) { + ok = false; + break; + } + const int32_t measuredMagic = ++launchMagic; + for (int i = 0; i < options.iters; ++i) { + LaunchP2PKernel(stream, commArgsDev, reinterpret_cast(srcDev), reinterpret_cast(dstDev), + reinterpret_cast(debug), options, dstOffset, transferBytes, pattern, measuredMagic, i + 1); + } + if (!ok || + !CheckAcl(rank, "aclrtRecordEvent stop", aclrtRecordEvent(stopEvent, stream)) || + !CheckAcl(rank, "aclrtSynchronizeStream p2p measured", aclrtSynchronizeStream(stream))) { + ok = false; + break; + } + + float elapsedMs = 0.0f; + if (!CheckAcl(rank, "aclrtEventElapsedTime p2p", aclrtEventElapsedTime(&elapsedMs, startEvent, stopEvent))) { + ok = false; + break; + } + if (!DemoBarrierAll(rank, rankSize, "p2p measured done bytes=" + std::to_string(bytes))) { + ok = false; + break; + } + + uint64_t errors = 0; + const bool skipPayloadCheck = + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotate || + options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace; + if (IsP2PReceiveRank(rank, options) && options.check && !skipPayloadCheck) { + const void* validateSrc = validatePeerWindow ? + reinterpret_cast(commArgsHost.peerMems[rank] + TileXR::IPC_DATA_OFFSET) : + static_cast(dstDev); + if (!CopyDeviceToHost(rank, hostDst.data(), static_cast(transferWindowBytes), validateSrc, + static_cast(transferWindowBytes), "p2p dst")) { + ok = false; + break; + } + errors = TileXR::Demo::CountP2PTransportMismatches( + hostDst, pattern, bytes, options.transport, options.blockDim); + } + if (IsP2PSourceRank(rank, options) || bothRanksActive) { + if (!CopyDeviceToHost(rank, hostDebug.data(), hostDebug.size() * sizeof(uint32_t), debug, + hostDebug.size() * sizeof(uint32_t), "p2p debug")) { + ok = false; + break; + } + } + + TileXR::Demo::P2PRankStatus localStatus; + localStatus.status = (IsP2PSourceRank(rank, options) || bothRanksActive) ? + FoldP2PDebugStatus(hostDebug, options.blockDim) : 0; + localStatus.errors = IsP2PReceiveRank(rank, options) ? errors : 0; + localStatus.elapsedMs = elapsedMs; + if (options.transport == TileXR::Demo::P2PTransport::DirectUrmaPostOnly && + IsP2PSourceRank(rank, options) && hostDebug.size() > 31) { + const uint64_t postCycleSum = (static_cast(hostDebug[31]) << 32) | hostDebug[30]; + localStatus.avgUsOverride = options.iters > 0 ? + static_cast(postCycleSum) / + static_cast(options.iters) / + static_cast(kA5CycleToUsDivisor) : + 0.0; + } + if (IsP2PSourceRank(rank, options) && + (GetEnvFlag("TILEXR_P2P_DEBUG_SUMMARY", false) || localStatus.status != 0 || errors != 0)) { + PrintP2PDebugSummary(rank, hostDebug, options.blockDim); + } + if (!options.logDir.empty()) { + std::string statusPath = options.logDir + "/p2p_rank_" + std::to_string(rank) + + "_" + std::to_string(bytes) + ".status"; + std::ostringstream statusText; + statusText << localStatus.status << ' ' << localStatus.errors << ' ' + << localStatus.elapsedMs << ' ' << localStatus.avgUsOverride << '\n'; + if (!WriteTextFile(statusPath, statusText.str())) { + std::cerr << "[rank " << rank << "] ERROR: failed to write " << statusPath << std::endl; + ok = false; + break; + } + } + if (!DemoBarrierAll(rank, rankSize, "p2p status files bytes=" + std::to_string(bytes))) { + ok = false; + break; + } + + if (rank == 0) { + TileXR::Demo::P2PRankStatus srcStatus = localStatus; + TileXR::Demo::P2PRankStatus dstStatus = localStatus; + if (options.srcRank != 0) { + std::string srcPath = options.logDir + "/p2p_rank_" + std::to_string(options.srcRank) + + "_" + std::to_string(bytes) + ".status"; + if (!ReadP2PStatusFile(srcPath, &srcStatus)) { + std::cerr << "[rank " << rank << "] ERROR: failed to read " << srcPath << std::endl; + ok = false; + break; + } + } + if (options.dstRank != 0) { + std::string dstPath = options.logDir + "/p2p_rank_" + std::to_string(options.dstRank) + + "_" + std::to_string(bytes) + ".status"; + if (!ReadP2PStatusFile(dstPath, &dstStatus)) { + std::cerr << "[rank " << rank << "] ERROR: failed to read " << dstPath << std::endl; + ok = false; + break; + } + } + TileXR::Demo::P2PPerfRow row = + TileXR::Demo::BuildP2PPerfRow(options, rankSize, bytes, srcStatus, dstStatus); + if (!AppendP2PPerfCsvRow(options, row)) { + std::cerr << "[rank " << rank << "] ERROR: failed to append CSV " << options.csvPath << std::endl; + ok = false; + break; + } + std::cout << "[rank " << rank << "] p2p row: " + << TileXR::Demo::FormatP2PPerfCsvRow(row); + if (row.status != 0 || row.errors != 0) { + ok = false; + } + } + if (!DemoBarrierAll(rank, rankSize, "p2p csv written bytes=" + std::to_string(bytes))) { + ok = false; + break; + } + } + + cleanup(); + return ok; +} + void Cleanup( TileXRCommPtr comm, aclrtStream stream, void* registeredMemory, int32_t* debug, int rank, int deviceId) { @@ -390,6 +944,29 @@ int main(int argc, char** argv) int32_t elementsPerRank = argc > argIndex ? std::atoi(argv[argIndex++]) : kDefaultElementsPerRank; int npuCount = argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_DEMO_NPUS", 8); int firstNpu = argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_DEMO_FIRST_NPU", 0); + TileXR::Demo::P2PPerfOptions p2pOptions; + if (testType == 4) { + p2pOptions.srcRank = argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_P2P_SRC_RANK", 0); + p2pOptions.dstRank = argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_P2P_DST_RANK", 1); + p2pOptions.minBytes = argc > argIndex ? std::strtoull(argv[argIndex++], nullptr, 10) : + static_cast(GetEnvInt("TILEXR_P2P_MIN_BYTES", 4096)); + p2pOptions.maxBytes = argc > argIndex ? std::strtoull(argv[argIndex++], nullptr, 10) : + static_cast(GetEnvInt("TILEXR_P2P_MAX_BYTES", 4096)); + p2pOptions.stepFactor = argc > argIndex ? std::strtoull(argv[argIndex++], nullptr, 10) : + static_cast(GetEnvInt("TILEXR_P2P_STEP_FACTOR", 2)); + p2pOptions.iters = argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_P2P_ITERS", 100); + p2pOptions.warmupIters = argc > argIndex ? std::atoi(argv[argIndex++]) : + GetEnvInt("TILEXR_P2P_WARMUP_ITERS", 10); + p2pOptions.check = (argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_P2P_CHECK", 1)) != 0; + p2pOptions.csvPath = argc > argIndex ? argv[argIndex++] : GetEnvString("TILEXR_P2P_CSV", ""); + p2pOptions.logDir = argc > argIndex ? argv[argIndex++] : GetEnvString("TILEXR_P2P_LOG_DIR", ""); + std::string transportName = argc > argIndex ? argv[argIndex++] : GetEnvString("TILEXR_P2P_TRANSPORT", "direct_urma"); + p2pOptions.transport = TileXR::Demo::ParseP2PTransport(transportName); + p2pOptions.blockDim = argc > argIndex ? static_cast(std::strtoul(argv[argIndex++], nullptr, 10)) : + static_cast(GetEnvInt("TILEXR_P2P_BLOCK_DIM", 1)); + std::string trafficName = argc > argIndex ? argv[argIndex++] : GetEnvString("TILEXR_P2P_TRAFFIC", "unidir"); + p2pOptions.traffic = TileXR::Demo::ParseP2PTraffic(trafficName); + } int deviceId = GetDeviceIdFromEnv(rank, npuCount, firstNpu); std::cout << "========================================" << std::endl; @@ -398,6 +975,21 @@ int main(int argc, char** argv) std::cout << "[rank " << rank << "] argv: rankSize=" << rankSize << " rank=" << rank << " testType=" << testType << " elementsPerRank=" << elementsPerRank << " npuCount=" << npuCount << " firstNpu=" << firstNpu << std::endl; + if (testType == 4) { + std::cout << "[rank " << rank << "] p2p: src=" << p2pOptions.srcRank + << " dst=" << p2pOptions.dstRank + << " minBytes=" << p2pOptions.minBytes + << " maxBytes=" << p2pOptions.maxBytes + << " stepFactor=" << p2pOptions.stepFactor + << " iters=" << p2pOptions.iters + << " warmupIters=" << p2pOptions.warmupIters + << " check=" << (p2pOptions.check ? 1 : 0) + << " transport=" << TileXR::Demo::P2PTransportName(p2pOptions.transport) + << " blockDim=" << p2pOptions.blockDim + << " traffic=" << TileXR::Demo::P2PTrafficName(p2pOptions.traffic) + << " csv=" << p2pOptions.csvPath + << " logDir=" << p2pOptions.logDir << std::endl; + } std::cout << "[rank " << rank << "] PID=" << getpid() << " TILEXR_COMM_ID=" << (std::getenv("TILEXR_COMM_ID") ? std::getenv("TILEXR_COMM_ID") : "") << " LD_LIBRARY_PATH=" << (std::getenv("LD_LIBRARY_PATH") ? std::getenv("LD_LIBRARY_PATH") : "") @@ -439,13 +1031,33 @@ int main(int argc, char** argv) } PrintCommArgs(rank, *commArgsHost, commArgsDev); - if ((commArgsHost->extraFlag & TileXR::ExtraFlag::UDMA) == 0 || commArgsHost->udmaInfoPtr == nullptr) { + if (testType == 4 && + TileXR::Demo::P2PTransportUsesIpc(p2pOptions.transport) && + !CheckPeerMemsReady(rank, rankSize, *commArgsHost)) { + Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); + return 1; + } + + if ((testType != 4 || + p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrma) && + ((commArgsHost->extraFlag & TileXR::ExtraFlag::UDMA) == 0 || commArgsHost->udmaInfoPtr == nullptr)) { std::cerr << "[rank " << rank << "] ERROR: TileXR UDMA is not enabled. " << "Check A5/Ascend950 hardware support, CANN/driver setup, and LD_LIBRARY_PATH." << std::endl; Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); return 1; } + if (testType == 4) { + bool ok = RunP2PPerfMode(rank, rankSize, comm, *commArgsHost, commArgsDev, stream, p2pOptions); + Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); + if (!ok) { + std::cerr << "[rank " << rank << "] TileXR UDMA P2P perf failed" << std::endl; + return 1; + } + std::cout << "[rank " << rank << "] TileXR UDMA P2P perf success" << std::endl; + return 0; + } + size_t dataCount = static_cast(rankSize) * elementsPerRank; size_t dataBytes = dataCount * sizeof(int32_t); size_t signalBytes = static_cast(rankSize) * sizeof(uint64_t); diff --git a/tests/udma/demo/tilexr_udma_demo_kernel.cpp b/tests/udma/demo/tilexr_udma_demo_kernel.cpp index a49ca3c..b452260 100644 --- a/tests/udma/demo/tilexr_udma_demo_kernel.cpp +++ b/tests/udma/demo/tilexr_udma_demo_kernel.cpp @@ -4,9 +4,223 @@ */ #include "kernel_operator.h" +#include "tilexr_data_as_flag.h" +#include "tilexr_sync.h" #include "tilexr_udma.h" constexpr int32_t TILEXR_UDMA_DEMO_MAGIC = 0x5444554d; // "TDUM" +constexpr uint32_t TILEXR_UDMA_DEMO_P2P_UB_BYTES = 64 * 1024; +constexpr uint32_t TILEXR_UDMA_DEMO_P2P_SYNC_UB_BYTES = 4 * 1024; +constexpr uint32_t TILEXR_UDMA_DEMO_P2P_COPY_TILE_BYTES = + TILEXR_UDMA_DEMO_P2P_UB_BYTES - TILEXR_UDMA_DEMO_P2P_SYNC_UB_BYTES; +constexpr uint32_t TILEXR_UDMA_DEMO_P2P_MAX_DEBUG_BLOCKS = 16; + +__aicore__ inline uint32_t TileXRUdmaDemoCeilDiv(uint32_t value, uint32_t divisor) +{ + return divisor == 0 ? 0 : (value + divisor - 1) / divisor; +} + +__aicore__ inline void TileXRUdmaDemoBlockSlice( + uint32_t total, uint32_t blockNum, uint32_t blockIdx, uint32_t& offset, uint32_t& bytes) +{ + uint32_t perBlock = TileXRUdmaDemoCeilDiv(total, blockNum); + perBlock = ((perBlock + TileXR::BLOCK_UNIT_BYTE - 1) / TileXR::BLOCK_UNIT_BYTE) * TileXR::BLOCK_UNIT_BYTE; + offset = blockIdx * perBlock; + if (offset >= total) { + offset = total; + bytes = 0; + return; + } + bytes = total - offset; + if (bytes > perBlock) { + bytes = perBlock; + } +} + +__aicore__ inline void TileXRUdmaDemoWqeSlice( + uint32_t total, uint32_t wqeCount, uint32_t wqeIdx, uint32_t& offset, uint32_t& bytes) +{ + uint32_t perWqe = TileXRUdmaDemoCeilDiv(total, wqeCount); + perWqe = ((perWqe + TileXR::BLOCK_UNIT_BYTE - 1) / TileXR::BLOCK_UNIT_BYTE) * TileXR::BLOCK_UNIT_BYTE; + offset = wqeIdx * perWqe; + if (wqeCount == 0 || offset >= total) { + offset = total; + bytes = 0; + return; + } + bytes = total - offset; + if (bytes > perWqe) { + bytes = perWqe; + } +} + +__aicore__ inline bool TileXRUdmaDemoResolvePeer( + int32_t rank, int32_t srcRank, int32_t dstRank, int32_t traffic, int32_t& peer) +{ + const bool bidir = traffic == 1; + if (bidir) { + if (rank == srcRank) { + peer = dstRank; + return true; + } + if (rank == dstRank) { + peer = srcRank; + return true; + } + peer = -1; + return false; + } + peer = dstRank; + return rank == srcRank; +} + +__aicore__ inline bool TileXRUdmaDemoResolveDataAsFlagRole( + int32_t rank, int32_t srcRank, int32_t dstRank, int32_t traffic, + bool& isSender, bool& isReceiver, int32_t& peer) +{ + const bool bidir = traffic == 1; + isSender = false; + isReceiver = false; + peer = -1; + if (bidir) { + if (rank == srcRank) { + isSender = true; + isReceiver = true; + peer = dstRank; + return true; + } + if (rank == dstRank) { + isSender = true; + isReceiver = true; + peer = srcRank; + return true; + } + return false; + } + if (rank == srcRank) { + isSender = true; + peer = dstRank; + return true; + } + if (rank == dstRank) { + isReceiver = true; + peer = srcRank; + return true; + } + return false; +} + +__aicore__ inline bool TileXRUdmaDemoResolveMemoryConsumeRole( + int32_t rank, int32_t srcRank, int32_t dstRank, int32_t traffic, + bool& isSender, bool& isReceiver, int32_t& peer) +{ + const bool bidir = traffic == 1; + isSender = false; + isReceiver = false; + peer = -1; + if (bidir) { + if (rank == srcRank) { + isSender = true; + isReceiver = true; + peer = dstRank; + return true; + } + if (rank == dstRank) { + isSender = true; + isReceiver = true; + peer = srcRank; + return true; + } + return false; + } + if (rank == srcRank) { + isSender = true; + peer = dstRank; + return true; + } + if (rank == dstRank) { + isReceiver = true; + peer = srcRank; + return true; + } + return false; +} + +__aicore__ inline uint32_t TileXRUdmaDemoFoldDebugStatus( + __gm__ uint32_t* debug, uint32_t blockIdx, uint32_t status) +{ + if (debug != nullptr && blockIdx < TILEXR_UDMA_DEMO_P2P_MAX_DEBUG_BLOCKS) { + debug[8 + blockIdx] = status; + } + return status; +} + +__aicore__ inline void TileXRUdmaDemoAddCycleSum(__gm__ uint32_t* debug, uint32_t lowIdx, uint64_t cycles) +{ + if (debug == nullptr) { + return; + } + uint64_t sum = (static_cast(debug[lowIdx + 1]) << 32) | debug[lowIdx]; + sum += cycles; + debug[lowIdx] = static_cast(sum & 0xffffffffu); + debug[lowIdx + 1] = static_cast(sum >> 32); +} + +__aicore__ inline void TileXRUdmaDemoDataAsFlagSlice( + uint32_t bytes, uint32_t blockNum, uint32_t blockIdx, + uint32_t& payloadOffset, uint32_t& sliceBytes, uint32_t& dataAsFlagOffset) +{ + uint32_t totalBlocks = TileXR::DataAsFlagBlockCountForPayloadBytes(bytes); + uint32_t perBlock = TileXRUdmaDemoCeilDiv(totalBlocks, blockNum); + uint32_t startBlock = blockIdx * perBlock; + if (blockNum == 0 || perBlock == 0 || startBlock >= totalBlocks) { + payloadOffset = bytes; + sliceBytes = 0; + dataAsFlagOffset = totalBlocks * TileXR::DATA_AS_FLAG_BLOCK_BYTES; + return; + } + uint32_t blockCount = totalBlocks - startBlock; + if (blockCount > perBlock) { + blockCount = perBlock; + } + payloadOffset = startBlock * TileXR::DATA_AS_FLAG_PAYLOAD_BYTES; + uint32_t maxPayloadBytes = blockCount * TileXR::DATA_AS_FLAG_PAYLOAD_BYTES; + sliceBytes = bytes - payloadOffset; + if (sliceBytes > maxPayloadBytes) { + sliceBytes = maxPayloadBytes; + } + dataAsFlagOffset = startBlock * TileXR::DATA_AS_FLAG_BLOCK_BYTES; +} + +__aicore__ inline void TileXRUdmaDemoCopyBytesGmToGm( + GM_ADDR dstGM, GM_ADDR srcGM, AscendC::TBuf& tBuf, uint32_t bytes) +{ + if (dstGM == nullptr || srcGM == nullptr || bytes == 0) { + return; + } + + AscendC::LocalTensor local = + tBuf.GetWithOffset(TILEXR_UDMA_DEMO_P2P_COPY_TILE_BYTES, TILEXR_UDMA_DEMO_P2P_SYNC_UB_BYTES); + AscendC::GlobalTensor src; + AscendC::GlobalTensor dst; + src.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t*>(srcGM), bytes); + dst.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t*>(dstGM), bytes); + + for (uint32_t copied = 0; copied < bytes; copied += TILEXR_UDMA_DEMO_P2P_COPY_TILE_BYTES) { + uint32_t tileBytes = bytes - copied; + if (tileBytes > TILEXR_UDMA_DEMO_P2P_COPY_TILE_BYTES) { + tileBytes = TILEXR_UDMA_DEMO_P2P_COPY_TILE_BYTES; + } + AscendC::DataCopyPadParams padParams {false, 0, 0, 0}; + AscendC::DataCopyParams copyParams {1, static_cast(tileBytes), 0, 0}; + AscendC::DataCopyPad(local, src[copied], copyParams, padParams); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::DataCopyPad(dst[copied], local, copyParams); + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + } +} extern "C" __global__ __aicore__ void tilexr_udma_all_gather_kernel( GM_ADDR commArgsGM, GM_ADDR dataGM, GM_ADDR debugGM, int32_t elementsPerRank) @@ -110,6 +324,561 @@ extern "C" __global__ __aicore__ void tilexr_udma_registered_smoke_kernel( } } +extern "C" __global__ __aicore__ void tilexr_udma_p2p_perf_kernel( + GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR debugGM, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic) +{ + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM); + auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM); + + int32_t rank = args->rank; + uint32_t blockIdx = AscendC::GetBlockIdx(); + uint32_t blockNum = AscendC::GetBlockNum(); + bool enabled = TileXR::UDMARegistryEnabled(args); + uint32_t qpNum = enabled ? TileXR::GetUDMAInfo(args)->qpNum : 0; + if (debug != nullptr && blockIdx == 0) { + debug[0] = TILEXR_UDMA_DEMO_MAGIC; + debug[1] = rank; + debug[2] = enabled ? 1 : 0; + debug[3] = bytes; + debug[4] = pattern; + debug[5] = 0xffffffffu; + debug[6] = blockNum; + debug[7] = qpNum; + } + + int32_t peer = -1; + if (!enabled || blockNum == 0 || qpNum == 0 || + !TileXRUdmaDemoResolvePeer(rank, srcRank, dstRank, traffic, peer)) { + return; + } + uint32_t jettyCount = blockNum < qpNum ? blockNum : qpNum; + if (blockIdx >= jettyCount) { + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + return; + } + + uint32_t offset = 0; + uint32_t sliceBytes = 0; + TileXRUdmaDemoWqeSlice(bytes, jettyCount, blockIdx, offset, sliceBytes); + if (debug != nullptr && blockIdx < 8) { + debug[16 + blockIdx] = offset; + debug[24 + blockIdx] = sliceBytes; + } + if (sliceBytes == 0) { + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + return; + } + TileXR::UDMAPutNbiQp(args, peer, blockIdx, src + offset, dstByteOffset + offset, sliceBytes); + uint32_t status = TileXR::UDMAQuietStatusQp(args, peer, blockIdx); + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + debug[15] = jettyCount; + } +} + +extern "C" __global__ __aicore__ void tilexr_udma_p2p_post_only_perf_kernel( + GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR debugGM, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic) +{ + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM); + auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM); + + int32_t rank = args->rank; + uint32_t blockIdx = AscendC::GetBlockIdx(); + uint32_t blockNum = AscendC::GetBlockNum(); + bool enabled = TileXR::UDMARegistryEnabled(args); + uint32_t qpNum = enabled ? TileXR::GetUDMAInfo(args)->qpNum : 0; + if (debug != nullptr && blockIdx == 0) { + debug[0] = TILEXR_UDMA_DEMO_MAGIC; + debug[1] = rank; + debug[2] = enabled ? 1 : 0; + debug[3] = bytes; + debug[4] = pattern; + debug[5] = 0xffffffffu; + debug[6] = blockNum; + debug[7] = qpNum; + } + + int32_t peer = -1; + if (!enabled || blockNum == 0 || qpNum == 0 || + !TileXRUdmaDemoResolvePeer(rank, srcRank, dstRank, traffic, peer)) { + return; + } + uint32_t jettyCount = blockNum < qpNum ? blockNum : qpNum; + if (blockIdx >= jettyCount) { + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + return; + } + + uint32_t offset = 0; + uint32_t sliceBytes = 0; + TileXRUdmaDemoWqeSlice(bytes, jettyCount, blockIdx, offset, sliceBytes); + if (debug != nullptr && blockIdx < 8) { + debug[16 + blockIdx] = offset; + debug[24 + blockIdx] = sliceBytes; + } + if (sliceBytes == 0) { + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + return; + } + + uint64_t startCycle = static_cast(AscendC::GetSystemCycle()); + auto udmaInfo = TileXR::GetUDMAInfo(args); + auto registry = TileXR::GetUDMARegistry(args); + if (!TileXR::UDMARegisteredRangeValid(registry, peer, dstByteOffset + offset, sliceBytes)) { + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0xffffffffu); + return; + } + uint64_t validateCycle = static_cast(AscendC::GetSystemCycle()); + __gm__ TileXR::UDMAWQCtx* qpCtxEntry = TileXR::UDMAGetWQCtx(udmaInfo, peer, blockIdx); + uint32_t wqeSize = 1U << qpCtxEntry->baseBkShift; + uint32_t curHead = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->headAddr), 0); + uint32_t wqeCnt = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->wqeCntAddr), 0); + TileXR::UDMAPollCQWhenSQOverflow(udmaInfo, qpCtxEntry, wqeCnt, peer, blockIdx); + __gm__ TileXR::UDMAMemInfo* remoteMemInfo = TileXR::UDMAGetRemoteMemInfo(udmaInfo, peer, blockIdx); + __gm__ uint8_t* remoteAddr = TileXR::UDMARegisteredRemoteAddr(registry, peer, dstByteOffset + offset); + __gm__ uint8_t* wqeAddr = + reinterpret_cast<__gm__ uint8_t*>(qpCtxEntry->bufAddr + wqeSize * (curHead % TileXR::TILEXR_UDMA_SQ_BB_COUNT)); + __gm__ TileXR::UDMASqeCtx* sqeCtx = reinterpret_cast<__gm__ TileXR::UDMASqeCtx*>(wqeAddr); + uint64_t ctxCycle = static_cast(AscendC::GetSystemCycle()); + TileXR::UDMAFillSqeCtx(sqeCtx, remoteAddr, remoteMemInfo, curHead, TileXR::UDMAOpcode::WRITE, nullptr); + uint64_t sqeCycle = static_cast(AscendC::GetSystemCycle()); + __gm__ TileXR::UDMASgeCtx* sgeCtx = + reinterpret_cast<__gm__ TileXR::UDMASgeCtx*>(TileXR::UDMAGetSgeCtxAddr(wqeAddr, TileXR::UDMAOpcode::WRITE)); + TileXR::UDMAFillSgeCtx(sgeCtx, sliceBytes, src + offset); + uint64_t sgeCycle = static_cast(AscendC::GetSystemCycle()); + uint32_t wqeBbCnt = TileXR::UDMAWqeBBCnt(TileXR::UDMAOpcode::WRITE); + TileXR::UDMACleanCacheLines(wqeAddr, wqeSize * wqeBbCnt); + uint64_t cleanCycle = static_cast(AscendC::GetSystemCycle()); + curHead += wqeBbCnt; + TileXR::UDMAPostSendUpdateInfo(curHead, qpCtxEntry); + ++wqeCnt; + st_dev(wqeCnt, reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->wqeCntAddr), 0); + uint64_t endCycle = static_cast(AscendC::GetSystemCycle()); + uint32_t status = TileXR::UDMAQuietStatusQp(args, peer, blockIdx); + uint64_t quietCycle = static_cast(AscendC::GetSystemCycle()); + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status); + if (debug != nullptr && blockIdx == 0) { + uint64_t delta = endCycle - startCycle; + TileXRUdmaDemoAddCycleSum(debug, 30, delta); + TileXRUdmaDemoAddCycleSum(debug, 32, validateCycle - startCycle); + TileXRUdmaDemoAddCycleSum(debug, 34, ctxCycle - validateCycle); + TileXRUdmaDemoAddCycleSum(debug, 36, sqeCycle - ctxCycle); + TileXRUdmaDemoAddCycleSum(debug, 38, sgeCycle - sqeCycle); + TileXRUdmaDemoAddCycleSum(debug, 40, cleanCycle - sgeCycle); + TileXRUdmaDemoAddCycleSum(debug, 42, endCycle - cleanCycle); + TileXRUdmaDemoAddCycleSum(debug, 44, quietCycle - endCycle); + debug[5] = status; + debug[15] = jettyCount; + debug[28] = static_cast(delta & 0xffffffffu); + debug[29] = static_cast(delta >> 32); + } +} + +extern "C" __global__ __aicore__ void tilexr_memory_p2p_perf_kernel( + GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR debugGM, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic) +{ + if constexpr (g_coreType == AscendC::AIV) { + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM); + + int32_t rank = args->rank; + uint32_t blockIdx = AscendC::GetBlockIdx(); + uint32_t blockNum = AscendC::GetBlockNum(); + if (debug != nullptr && blockIdx == 0) { + debug[0] = TILEXR_UDMA_DEMO_MAGIC; + debug[1] = rank; + debug[2] = 1; + debug[3] = bytes; + debug[4] = pattern; + debug[5] = 0xffffffffu; + } + + int32_t peer = -1; + if (blockNum == 0 || !TileXRUdmaDemoResolvePeer(rank, srcRank, dstRank, traffic, peer) || + peer < 0 || peer >= args->rankSize) { + return; + } + + AscendC::GlobalTensor peerMems; + peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE); + GM_ADDR dstBase = peerMems.GetValue(peer); + if (dstBase == nullptr) { + uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + } + return; + } + + uint32_t offset = 0; + uint32_t sliceBytes = 0; + TileXRUdmaDemoBlockSlice(bytes, blockNum, blockIdx, offset, sliceBytes); + AscendC::TPipe pipe; + AscendC::TBuf tBuf; + pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES); + TileXRUdmaDemoCopyBytesGmToGm( + dstBase + dstByteOffset + offset, srcGM + offset, tBuf, sliceBytes); + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + if (debug != nullptr && blockIdx == 0) { + debug[5] = 0; + } + } +} + +extern "C" __global__ __aicore__ void tilexr_memory_segmented_p2p_perf_kernel( + GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR debugGM, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, uint32_t segmentBytes, + int32_t rotateWindow, uint32_t traceSegments) +{ + if constexpr (g_coreType == AscendC::AIV) { + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM); + + int32_t rank = args->rank; + uint32_t blockIdx = AscendC::GetBlockIdx(); + uint32_t blockNum = AscendC::GetBlockNum(); + if (debug != nullptr && blockIdx == 0) { + debug[0] = TILEXR_UDMA_DEMO_MAGIC; + debug[1] = rank; + debug[2] = 1; + debug[3] = bytes; + debug[4] = pattern; + debug[5] = 0xffffffffu; + debug[6] = segmentBytes; + debug[7] = static_cast(rotateWindow != 0); + debug[46] = traceSegments; + } + + int32_t peer = -1; + if (blockNum == 0 || segmentBytes == 0U || + !TileXRUdmaDemoResolvePeer(rank, srcRank, dstRank, traffic, peer) || + peer < 0 || peer >= args->rankSize) { + return; + } + + AscendC::GlobalTensor peerMems; + peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE); + GM_ADDR dstBase = peerMems.GetValue(peer); + if (dstBase == nullptr) { + uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + } + return; + } + + uint32_t offset = 0; + uint32_t sliceBytes = 0; + TileXRUdmaDemoBlockSlice(bytes, blockNum, blockIdx, offset, sliceBytes); + AscendC::TPipe pipe; + AscendC::TBuf tBuf; + pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES); + + uint32_t copiedInSlice = 0; + uint32_t traceIndex = 0; + while (copiedInSlice < sliceBytes) { + uint32_t chunkBytes = sliceBytes - copiedInSlice; + if (chunkBytes > segmentBytes) { + chunkBytes = segmentBytes; + } + const uint32_t srcOffset = offset + copiedInSlice; + uint32_t dstInWindow = srcOffset; + if (rotateWindow != 0) { + dstInWindow = srcOffset % segmentBytes; + } + const bool recordTrace = debug != nullptr && blockIdx == 0 && traceIndex < traceSegments && traceIndex < 8U; + uint64_t startCycle = 0U; + if (recordTrace) { + startCycle = static_cast(AscendC::GetSystemCycle()); + } + TileXRUdmaDemoCopyBytesGmToGm( + dstBase + dstByteOffset + dstInWindow, srcGM + srcOffset, tBuf, chunkBytes); + if (recordTrace) { + const uint64_t endCycle = static_cast(AscendC::GetSystemCycle()); + const uint32_t traceLowIdx = 48U + traceIndex * 2U; + TileXRUdmaDemoAddCycleSum(debug, traceLowIdx, endCycle - startCycle); + } + copiedInSlice += chunkBytes; + ++traceIndex; + } + + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + if (debug != nullptr && blockIdx == 0) { + debug[5] = 0; + } + } +} + +extern "C" __global__ __aicore__ void tilexr_memory_consume_p2p_perf_kernel( + GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR dstGM, GM_ADDR debugGM, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step) +{ + if constexpr (g_coreType == AscendC::AIV) { + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM); + auto dst = reinterpret_cast<__gm__ uint8_t*>(dstGM); + + int32_t rank = args->rank; + uint32_t blockIdx = AscendC::GetBlockIdx(); + uint32_t blockNum = AscendC::GetBlockNum(); + if (debug != nullptr && blockIdx == 0) { + debug[0] = TILEXR_UDMA_DEMO_MAGIC; + debug[1] = rank; + debug[2] = 1; + debug[3] = bytes; + debug[4] = pattern; + debug[5] = 0xffffffffu; + debug[6] = blockNum; + debug[7] = static_cast(magic); + } + + bool isSender = false; + bool isReceiver = false; + int32_t peer = -1; + if (blockNum == 0 || + !TileXRUdmaDemoResolveMemoryConsumeRole(rank, srcRank, dstRank, traffic, isSender, isReceiver, peer) || + peer < 0 || peer >= args->rankSize) { + return; + } + + AscendC::GlobalTensor peerMems; + peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE); + GM_ADDR shareAddrs[TileXR::TILEXR_MAX_RANK_SIZE]; + for (int32_t i = 0; i < args->rankSize; ++i) { + shareAddrs[i] = peerMems.GetValue(i); + } + GM_ADDR peerBase = shareAddrs[peer]; + GM_ADDR localBase = shareAddrs[rank]; + if (peerBase == nullptr || localBase == nullptr || (isReceiver && dst == nullptr) || + (isSender && srcGM == nullptr)) { + uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + } + return; + } + + uint32_t offset = 0; + uint32_t sliceBytes = 0; + TileXRUdmaDemoBlockSlice(bytes, blockNum, blockIdx, offset, sliceBytes); + if (debug != nullptr && blockIdx < 8) { + debug[16 + blockIdx] = offset; + debug[24 + blockIdx] = sliceBytes; + } + if (sliceBytes == 0) { + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + return; + } + + AscendC::TPipe pipe; + AscendC::TBuf tBuf; + pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES); + SyncCollectives sync; + sync.Init(rank, args->rankSize, shareAddrs, tBuf); + + uint32_t status = 0; + if (isSender) { + TileXRUdmaDemoCopyBytesGmToGm( + peerBase + dstByteOffset + offset, srcGM + offset, tBuf, sliceBytes); + sync.SetOuterFlag(magic, step); + } + if (isReceiver && status == 0) { + sync.WaitOuterFlag(magic, step, peer, blockIdx); + TileXRUdmaDemoCopyBytesGmToGm( + reinterpret_cast(dst + offset), localBase + dstByteOffset + offset, tBuf, sliceBytes); + } + + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + } + } +} + +extern "C" __global__ __aicore__ void tilexr_data_as_flag_p2p_perf_kernel( + GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR dstGM, GM_ADDR debugGM, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic) +{ + if constexpr (g_coreType == AscendC::AIV) { + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM); + auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM); + auto dst = reinterpret_cast<__gm__ uint8_t*>(dstGM); + + int32_t rank = args->rank; + uint32_t blockIdx = AscendC::GetBlockIdx(); + uint32_t blockNum = AscendC::GetBlockNum(); + if (debug != nullptr && blockIdx == 0) { + debug[0] = TILEXR_UDMA_DEMO_MAGIC; + debug[1] = rank; + debug[2] = 1; + debug[3] = bytes; + debug[4] = pattern; + debug[5] = 0xffffffffu; + } + + bool isSender = false; + bool isReceiver = false; + int32_t peer = -1; + if (blockNum == 0 || + !TileXRUdmaDemoResolveDataAsFlagRole(rank, srcRank, dstRank, traffic, isSender, isReceiver, peer) || + peer < 0 || peer >= args->rankSize) { + return; + } + + AscendC::GlobalTensor peerMems; + peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE); + GM_ADDR peerBase = peerMems.GetValue(peer); + GM_ADDR localBase = peerMems.GetValue(rank); + if (peerBase == nullptr || localBase == nullptr || dst == nullptr) { + uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + } + return; + } + + uint32_t payloadOffset = 0; + uint32_t sliceBytes = 0; + uint32_t dataAsFlagOffset = 0; + TileXRUdmaDemoDataAsFlagSlice(bytes, blockNum, blockIdx, payloadOffset, sliceBytes, dataAsFlagOffset); + if (sliceBytes == 0) { + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + return; + } + + AscendC::TPipe pipe; + AscendC::TBuf tBuf; + pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES); + AscendC::LocalTensor scratch = tBuf.Get(); + + uint32_t status = 0; + if (isSender) { + uint32_t initBlocks = TileXR::DataAsFlagInit(scratch); + uint32_t sentBlocks = TileXR::DataAsFlagSend( + reinterpret_cast<__gm__ uint8_t*>(peerBase + dstByteOffset + dataAsFlagOffset), + src + payloadOffset, sliceBytes, scratch); + if (initBlocks == 0 || sentBlocks == 0) { + status = 3; + } + } + if (isReceiver && status == 0) { + bool received = TileXR::DataAsFlagCheckAndRecv( + reinterpret_cast<__gm__ uint8_t*>(localBase + dstByteOffset + dataAsFlagOffset), + sliceBytes, dst + payloadOffset, scratch); + if (!received) { + status = 4; + } + } + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + } + } +} + +extern "C" __global__ __aicore__ void tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel( + GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR dstGM, GM_ADDR debugGM, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step, int32_t strict) +{ + if constexpr (g_coreType == AscendC::AIV) { + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM); + auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM); + auto dst = reinterpret_cast<__gm__ uint8_t*>(dstGM); + + int32_t rank = args->rank; + uint32_t blockIdx = AscendC::GetBlockIdx(); + uint32_t blockNum = AscendC::GetBlockNum(); + const uint64_t epoch = TileXR::DataAsFlagEpoch(magic, step); + if (debug != nullptr && blockIdx == 0) { + debug[0] = TILEXR_UDMA_DEMO_MAGIC; + debug[1] = rank; + debug[2] = 1; + debug[3] = bytes; + debug[4] = pattern; + debug[5] = 0xffffffffu; + debug[6] = blockNum; + debug[7] = static_cast(epoch & 0xffffffffu); + } + + bool isSender = false; + bool isReceiver = false; + int32_t peer = -1; + if (blockNum == 0 || + !TileXRUdmaDemoResolveDataAsFlagRole(rank, srcRank, dstRank, traffic, isSender, isReceiver, peer) || + peer < 0 || peer >= args->rankSize) { + return; + } + + AscendC::GlobalTensor peerMems; + peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE); + GM_ADDR peerBase = peerMems.GetValue(peer); + GM_ADDR localBase = peerMems.GetValue(rank); + if (peerBase == nullptr || localBase == nullptr || dst == nullptr || (isSender && src == nullptr)) { + uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + } + return; + } + + uint32_t payloadOffset = 0; + uint32_t sliceBytes = 0; + uint32_t dataAsFlagOffset = 0; + TileXRUdmaDemoDataAsFlagSlice(bytes, blockNum, blockIdx, payloadOffset, sliceBytes, dataAsFlagOffset); + if (debug != nullptr && blockIdx < 8) { + debug[16 + blockIdx] = payloadOffset; + debug[24 + blockIdx] = sliceBytes; + } + if (sliceBytes == 0) { + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0); + return; + } + + AscendC::TPipe pipe; + AscendC::TBuf tBuf; + pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES); + AscendC::LocalTensor scratch = tBuf.Get(); + + uint32_t status = 0; + if (isSender) { + uint32_t sentBlocks = TileXR::DataAsFlagSendEpochOrdered( + reinterpret_cast<__gm__ uint8_t*>(peerBase + dstByteOffset + dataAsFlagOffset), + src + payloadOffset, sliceBytes, epoch, scratch); + if (sentBlocks == 0U) { + status = 3; + } + } + if (isReceiver && status == 0) { + bool strictMode = strict != 0; + bool received = TileXR::DataAsFlagCheckAndRecvEpochOrdered( + reinterpret_cast<__gm__ uint8_t*>(localBase + dstByteOffset + dataAsFlagOffset), + sliceBytes, dst + payloadOffset, epoch, scratch, strictMode); + if (!received) { + status = 4; + } + } + TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status); + if (debug != nullptr && blockIdx == 0) { + debug[5] = status; + } + } +} + void launch_tilexr_udma_all_gather( uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR data, GM_ADDR debug, int32_t elementsPerRank) { @@ -131,3 +900,63 @@ void launch_tilexr_udma_registered_smoke( tilexr_udma_registered_smoke_kernel<<>>( commArgs, local, debug, bytes, signal); } + +void launch_tilexr_udma_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic) +{ + tilexr_udma_p2p_perf_kernel<<>>( + commArgs, src, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic); +} + +void launch_tilexr_udma_p2p_post_only_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic) +{ + tilexr_udma_p2p_post_only_perf_kernel<<>>( + commArgs, src, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic); +} + +void launch_tilexr_memory_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic) +{ + tilexr_memory_p2p_perf_kernel<<>>( + commArgs, src, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic); +} + +void launch_tilexr_memory_segmented_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, + int32_t traffic, uint32_t segmentBytes, int32_t rotateWindow, uint32_t traceSegments) +{ + tilexr_memory_segmented_p2p_perf_kernel<<>>( + commArgs, src, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic, + segmentBytes, rotateWindow, traceSegments); +} + +void launch_tilexr_memory_consume_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step) +{ + tilexr_memory_consume_p2p_perf_kernel<<>>( + commArgs, src, dst, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic, magic, step); +} + +void launch_tilexr_data_as_flag_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic) +{ + tilexr_data_as_flag_p2p_perf_kernel<<>>( + commArgs, src, dst, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic); +} + +void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug, + int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, + uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step, int32_t strict) +{ + tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel<<>>( + commArgs, src, dst, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic, magic, step, strict); +} diff --git a/tests/udma/demo/tilexr_udma_p2p_perf_config.h b/tests/udma/demo/tilexr_udma_p2p_perf_config.h new file mode 100644 index 0000000..36dc98c --- /dev/null +++ b/tests/udma/demo/tilexr_udma_p2p_perf_config.h @@ -0,0 +1,401 @@ +#ifndef TILEXR_UDMA_P2P_PERF_CONFIG_H +#define TILEXR_UDMA_P2P_PERF_CONFIG_H + +#include +#include +#include +#include +#include + +namespace TileXR { +namespace Demo { + +constexpr uint64_t kP2PMemoryMaxBytes = 100ULL * 1024ULL * 1024ULL; +constexpr uint64_t kP2PMemorySegmentBytes = 16ULL * 1024ULL * 1024ULL; +constexpr uint64_t kP2PMemoryTraceSegmentBytes = 8ULL * 1024ULL * 1024ULL; + +enum class P2PTransport { + DirectUrma, + DirectUrmaPostOnly, + Memory, + MemorySegmented, + MemorySegmentedRotate, + MemorySegmentedTrace, + MemorySegmentedRotateTrace, + MemoryConsume, + DataAsFlag, + DataAsFlagEpochOrdered, + Invalid, +}; + +enum class P2PTraffic { + UniDir, + BiDir, + Invalid, +}; + +inline const char* P2PTransportName(P2PTransport transport) +{ + switch (transport) { + case P2PTransport::DirectUrma: + return "direct_urma"; + case P2PTransport::DirectUrmaPostOnly: + return "direct_urma_post_only"; + case P2PTransport::Memory: + return "memory"; + case P2PTransport::MemorySegmented: + return "memory_segmented"; + case P2PTransport::MemorySegmentedRotate: + return "memory_segmented_rotate"; + case P2PTransport::MemorySegmentedTrace: + return "memory_segmented_trace"; + case P2PTransport::MemorySegmentedRotateTrace: + return "memory_segmented_rotate_trace"; + case P2PTransport::MemoryConsume: + return "memory_consume"; + case P2PTransport::DataAsFlag: + return "data_as_flag"; + case P2PTransport::DataAsFlagEpochOrdered: + return "data_as_flag_epoch_ordered"; + default: + return "invalid"; + } +} + +inline const char* P2PTrafficName(P2PTraffic traffic) +{ + switch (traffic) { + case P2PTraffic::UniDir: + return "unidir"; + case P2PTraffic::BiDir: + return "bidir"; + default: + return "invalid"; + } +} + +inline P2PTransport ParseP2PTransport(const std::string& name) +{ + if (name == "direct_urma" || name == "udma") { + return P2PTransport::DirectUrma; + } + if (name == "direct_urma_post_only" || name == "udma_post_only" || name == "post_only") { + return P2PTransport::DirectUrmaPostOnly; + } + if (name == "memory" || name == "ipc" || name == "datacopy") { + return P2PTransport::Memory; + } + if (name == "memory_segmented" || name == "memory-segmented" || name == "ipc_segmented") { + return P2PTransport::MemorySegmented; + } + if (name == "memory_segmented_rotate" || + name == "memory-segmented-rotate" || + name == "ipc_segmented_rotate") { + return P2PTransport::MemorySegmentedRotate; + } + if (name == "memory_segmented_trace" || + name == "memory-segmented-trace" || + name == "ipc_segmented_trace") { + return P2PTransport::MemorySegmentedTrace; + } + if (name == "memory_segmented_rotate_trace" || + name == "memory-segmented-rotate-trace" || + name == "ipc_segmented_rotate_trace") { + return P2PTransport::MemorySegmentedRotateTrace; + } + if (name == "memory_consume" || name == "memory-consume" || name == "mem_consume") { + return P2PTransport::MemoryConsume; + } + if (name == "data_as_flag" || name == "data-as-flag" || name == "daf") { + return P2PTransport::DataAsFlag; + } + if (name == "data_as_flag_epoch_ordered" || + name == "data-as-flag-epoch-ordered" || + name == "daf_epoch_ordered") { + return P2PTransport::DataAsFlagEpochOrdered; + } + return P2PTransport::Invalid; +} + +inline P2PTraffic ParseP2PTraffic(const std::string& name) +{ + if (name == "unidir" || name == "uni" || name == "single") { + return P2PTraffic::UniDir; + } + if (name == "bidir" || name == "bi" || name == "duplex" || name == "full_duplex") { + return P2PTraffic::BiDir; + } + return P2PTraffic::Invalid; +} + +struct P2PPerfOptions { + int srcRank = 0; + int dstRank = 1; + uint64_t minBytes = 4096; + uint64_t maxBytes = 4096; + uint64_t stepFactor = 2; + int iters = 100; + int warmupIters = 10; + bool check = true; + std::string csvPath; + std::string logDir; + P2PTransport transport = P2PTransport::DirectUrma; + P2PTraffic traffic = P2PTraffic::UniDir; + uint32_t blockDim = 1; +}; + +struct P2PPerfRow { + P2PTransport transport = P2PTransport::DirectUrma; + P2PTraffic traffic = P2PTraffic::UniDir; + uint32_t blockDim = 1; + int srcRank = 0; + int dstRank = 1; + int rankSize = 2; + uint64_t bytes = 0; + int iters = 0; + double avgUs = 0.0; + double minUs = 0.0; + double maxUs = 0.0; + uint32_t status = 0; + uint64_t errors = 0; + std::string logDir; +}; + +struct P2PRankStatus { + uint32_t status = 0; + uint64_t errors = 0; + float elapsedMs = 0.0f; + double avgUsOverride = 0.0; +}; + +inline std::string DirectionName(int srcRank, int dstRank) +{ + return std::to_string(srcRank) + "to" + std::to_string(dstRank); +} + +inline std::string TrafficDirectionName(int srcRank, int dstRank, P2PTraffic traffic) +{ + if (traffic == P2PTraffic::BiDir) { + return DirectionName(srcRank, dstRank) + "+" + DirectionName(dstRank, srcRank); + } + return DirectionName(srcRank, dstRank); +} + +inline uint64_t DataAsFlagWindowBytes(uint64_t payloadBytes) +{ + return ((payloadBytes + 479ULL) / 480ULL) * 512ULL; +} + +inline uint64_t P2PTransportWindowBytes(P2PTransport transport, uint64_t payloadBytes) +{ + return (transport == P2PTransport::DataAsFlag || + transport == P2PTransport::DataAsFlagEpochOrdered) ? + DataAsFlagWindowBytes(payloadBytes) : payloadBytes; +} + +inline uint64_t P2PTransportWindowBytes(P2PTransport transport, uint64_t payloadBytes, uint32_t blockDim) +{ + (void)blockDim; + return P2PTransportWindowBytes(transport, payloadBytes); +} + +inline bool P2PTransportUsesIpc(P2PTransport transport) +{ + return transport == P2PTransport::Memory || + transport == P2PTransport::MemorySegmented || + transport == P2PTransport::MemorySegmentedRotate || + transport == P2PTransport::MemorySegmentedTrace || + transport == P2PTransport::MemorySegmentedRotateTrace || + transport == P2PTransport::MemoryConsume || + transport == P2PTransport::DataAsFlag || + transport == P2PTransport::DataAsFlagEpochOrdered; +} + +inline bool P2PTransportBothRanksActive(P2PTransport transport, P2PTraffic traffic) +{ + return traffic == P2PTraffic::BiDir || + transport == P2PTransport::MemoryConsume || + transport == P2PTransport::DataAsFlag || + transport == P2PTransport::DataAsFlagEpochOrdered; +} + +inline int ActiveP2PFlowCount(P2PTraffic traffic) +{ + return traffic == P2PTraffic::BiDir ? 2 : 1; +} + +inline uint64_t P2PEffectiveTransferBytes(P2PTransport transport, uint64_t payloadBytes, uint32_t blockDim) +{ + (void)transport; + (void)blockDim; + return payloadBytes; +} + +inline bool ValidateP2PPerfOptions(const P2PPerfOptions& options, int rankSize, std::string* error) +{ + auto fail = [error](const std::string& message) { + if (error != nullptr) { + *error = message; + } + return false; + }; + if (rankSize != 2) { + return fail("test_type=4 requires rank_size=2"); + } + if (options.srcRank < 0 || options.srcRank >= rankSize || + options.dstRank < 0 || options.dstRank >= rankSize) { + return fail("src_rank and dst_rank must be in [0, rank_size)"); + } + if (options.srcRank == options.dstRank) { + return fail("src_rank and dst_rank must be different"); + } + if (options.minBytes == 0 || options.maxBytes == 0 || options.minBytes > options.maxBytes) { + return fail("byte range must be nonzero and min_bytes <= max_bytes"); + } + if (options.stepFactor < 2) { + return fail("step_factor must be at least 2"); + } + if (options.iters <= 0 || options.warmupIters < 0) { + return fail("iters must be positive and warmup_iters must be nonnegative"); + } + if (options.blockDim == 0U || options.blockDim > 64U) { + return fail("block_dim must be in [1, 64]"); + } + if (options.transport == P2PTransport::Invalid) { + return fail( + "transport must be direct_urma, direct_urma_post_only, memory, memory_segmented, memory_segmented_rotate, memory_segmented_trace, memory_segmented_rotate_trace, memory_consume, data_as_flag, or data_as_flag_epoch_ordered"); + } + if (options.traffic == P2PTraffic::Invalid) { + return fail("traffic must be unidir or bidir"); + } + if (P2PTransportUsesIpc(options.transport) && + P2PTransportWindowBytes(options.transport, options.maxBytes, options.blockDim) > kP2PMemoryMaxBytes) { + return fail("memory/memory_segmented/memory_consume/data_as_flag transport max_bytes must fit in the TileXR IPC data window"); + } + return true; +} + +inline std::vector BuildP2PPerfSizeSweep(const P2PPerfOptions& options) +{ + std::vector sizes; + for (uint64_t bytes = options.minBytes; bytes <= options.maxBytes;) { + sizes.push_back(bytes); + if (bytes > options.maxBytes / options.stepFactor) { + break; + } + bytes *= options.stepFactor; + } + if (sizes.empty() || sizes.back() != options.maxBytes) { + sizes.push_back(options.maxBytes); + } + return sizes; +} + +inline uint32_t P2PPattern(int srcRank, int dstRank, uint64_t bytes) +{ + uint32_t pattern = 0x5a000000u; + pattern ^= static_cast((srcRank & 0xff) << 16); + pattern ^= static_cast((dstRank & 0xff) << 8); + pattern ^= static_cast(bytes & 0xffu); + return pattern; +} + +inline uint8_t P2PPatternByte(uint32_t pattern, uint64_t index) +{ + return static_cast((pattern >> ((index & 3u) * 8u)) & 0xffu); +} + +inline void FillP2PPattern(std::vector& data, uint32_t pattern) +{ + for (uint64_t i = 0; i < data.size(); ++i) { + data[static_cast(i)] = P2PPatternByte(pattern, i); + } +} + +inline uint64_t CountP2PMismatches(const std::vector& data, uint32_t pattern, uint64_t bytes) +{ + uint64_t errors = 0; + const uint64_t limit = bytes < data.size() ? bytes : data.size(); + for (uint64_t i = 0; i < limit; ++i) { + if (data[static_cast(i)] != P2PPatternByte(pattern, i)) { + ++errors; + } + } + return errors; +} + +inline uint64_t CountP2PTransportMismatches( + const std::vector& data, uint32_t pattern, uint64_t payloadBytes, + P2PTransport transport, uint32_t blockDim) +{ + (void)transport; + (void)blockDim; + return CountP2PMismatches(data, pattern, payloadBytes); +} + +inline std::string P2PPerfCsvHeader() +{ + return "transport,traffic,block_dim,direction,src,dst,ranks,bytes,iters,avg_us,min_us,max_us," + "bw_GBps,per_flow_bw_GBps,status,errors,log_dir\n"; +} + +inline std::string FormatP2PPerfCsvRow(const P2PPerfRow& row) +{ + const uint64_t effectiveBytes = P2PEffectiveTransferBytes(row.transport, row.bytes, row.blockDim); + const double perFlowBwGBps = row.avgUs > 0.0 ? static_cast(effectiveBytes) / row.avgUs / 1000.0 : 0.0; + const double bwGBps = perFlowBwGBps * static_cast(ActiveP2PFlowCount(row.traffic)); + std::ostringstream out; + out << P2PTransportName(row.transport) << ',' + << P2PTrafficName(row.traffic) << ',' + << row.blockDim << ',' + << TrafficDirectionName(row.srcRank, row.dstRank, row.traffic) << ',' + << row.srcRank << ',' + << row.dstRank << ',' + << row.rankSize << ',' + << row.bytes << ',' + << row.iters << ',' + << std::fixed << std::setprecision(3) + << row.avgUs << ',' + << row.minUs << ',' + << row.maxUs << ',' + << bwGBps << ',' + << perFlowBwGBps << ',' + << row.status << ',' + << row.errors << ',' + << row.logDir << '\n'; + return out.str(); +} + +inline P2PPerfRow BuildP2PPerfRow( + const P2PPerfOptions& options, + int rankSize, + uint64_t bytes, + const P2PRankStatus& srcStatus, + const P2PRankStatus& dstStatus) +{ + P2PPerfRow row; + row.transport = options.transport; + row.traffic = options.traffic; + row.blockDim = options.blockDim; + row.srcRank = options.srcRank; + row.dstRank = options.dstRank; + row.rankSize = rankSize; + row.bytes = bytes; + row.iters = options.iters; + const bool bothRanksActive = P2PTransportBothRanksActive(options.transport, options.traffic); + const float elapsedMs = bothRanksActive && dstStatus.elapsedMs > srcStatus.elapsedMs ? + dstStatus.elapsedMs : srcStatus.elapsedMs; + row.avgUs = options.iters > 0 ? static_cast(elapsedMs) * 1000.0 / static_cast(options.iters) : 0.0; + if (options.transport == P2PTransport::DirectUrmaPostOnly && srcStatus.avgUsOverride > 0.0) { + row.avgUs = srcStatus.avgUsOverride; + } + row.status = bothRanksActive ? (srcStatus.status | dstStatus.status) : srcStatus.status; + row.errors = bothRanksActive ? (srcStatus.errors + dstStatus.errors) : dstStatus.errors; + row.logDir = options.logDir; + return row; +} + +} // namespace Demo +} // namespace TileXR + +#endif // TILEXR_UDMA_P2P_PERF_CONFIG_H diff --git a/tests/udma/run_tests.sh b/tests/udma/run_tests.sh index f020b58..f4f37da 100755 --- a/tests/udma/run_tests.sh +++ b/tests/udma/run_tests.sh @@ -13,7 +13,7 @@ INSTALL_DIR="${SCRIPT_DIR}/install" source "${TILEXR_ROOT}/scripts/common_env.sh" # 设置 LD_LIBRARY_PATH:优先使用当前仓库刚编译安装的库,避免被 /usr/local/lib 中的旧库覆盖 -export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${TILEXR_ROOT}/install/lib:/usr/local/lib:${LD_LIBRARY_PATH}" +export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib64:${TILEXR_ROOT}/install/lib:${TILEXR_ROOT}/install/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}" echo "==========================================" echo " Running UDMA Tests" @@ -46,6 +46,7 @@ fi # 检查测试二进制是否存在 if [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma_transport_layout" ] || [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma_registry" ] || + [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma_p2p_perf_config" ] || [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma" ]; then echo "ERROR: Test binaries not found. Please run build.sh first." exit 1 @@ -67,19 +68,27 @@ echo "==========================================" TEST2_RESULT=$? echo "" -# 测试 3: TileXR 集成测试(单进程,单卡) +# 测试 3: P2P perf 配置单元测试(host-only) echo "==========================================" -echo "Test 3: TileXR Integration Tests (Single Process)" +echo "Test 3: TileXR UDMA P2P Perf Config Unit Test" +echo "==========================================" +"${INSTALL_DIR}/bin/test_tilexr_udma_p2p_perf_config" +TEST3_RESULT=$? +echo "" + +# 测试 4: TileXR 集成测试(单进程,单卡) +echo "==========================================" +echo "Test 4: TileXR Integration Tests (Single Process)" echo "==========================================" export RANK=0 export RANK_SIZE=1 "${INSTALL_DIR}/bin/test_tilexr_udma" -TEST3_RESULT=$? +TEST4_RESULT=$? echo "" -# 测试 4: TileXR 多进程测试(需要 mpirun) +# 测试 5: TileXR 多进程测试(需要 mpirun) echo "==========================================" -echo "Test 4: TileXR Multi-Process Tests (MPI)" +echo "Test 5: TileXR Multi-Process Tests (MPI)" echo "==========================================" # 检查是否有 mpirun @@ -98,14 +107,14 @@ if command -v mpirun &> /dev/null; then unset RANK unset RANK_SIZE mpirun -n 2 "${INSTALL_DIR}/bin/test_tilexr_udma" - TEST4_RESULT=$? + TEST5_RESULT=$? else echo "SKIP: Need at least 2 usable NPUs for multi-rank test" - TEST4_RESULT=0 + TEST5_RESULT=0 fi else echo "SKIP: mpirun not found, skipping multi-process tests" - TEST4_RESULT=0 + TEST5_RESULT=0 fi echo "" @@ -115,13 +124,14 @@ echo " Test Results Summary" echo "==========================================" echo "Test 1 (UDMA Layout): $([ $TEST1_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')" echo "Test 2 (UDMA Registry): $([ $TEST2_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')" -echo "Test 3 (TileXR Single): $([ $TEST3_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')" -echo "Test 4 (TileXR Multi): $([ $TEST4_RESULT -eq 0 ] && echo 'PASS' || echo 'SKIP/FAIL')" +echo "Test 3 (P2P Perf Config): $([ $TEST3_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')" +echo "Test 4 (TileXR Single): $([ $TEST4_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')" +echo "Test 5 (TileXR Multi): $([ $TEST5_RESULT -eq 0 ] && echo 'PASS' || echo 'SKIP/FAIL')" echo "==========================================" # 返回失败状态 if [ $TEST1_RESULT -ne 0 ] || [ $TEST2_RESULT -ne 0 ] || [ $TEST3_RESULT -ne 0 ] || - [ $TEST4_RESULT -ne 0 ]; then + [ $TEST4_RESULT -ne 0 ] || [ $TEST5_RESULT -ne 0 ]; then exit 1 fi diff --git a/tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp b/tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp new file mode 100644 index 0000000..6c828e5 --- /dev/null +++ b/tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp @@ -0,0 +1,335 @@ +#include +#include +#include +#include + +#include "demo/tilexr_udma_p2p_perf_config.h" + +namespace { + +void Require(bool condition, const char* message) +{ + if (!condition) { + throw message; + } +} + +} // namespace + +int main() +{ + TileXR::Demo::P2PPerfOptions options; + options.srcRank = 1; + options.dstRank = 0; + options.minBytes = 4096; + options.maxBytes = 16384; + options.stepFactor = 2; + options.iters = 20; + options.warmupIters = 5; + options.csvPath = "logs/p2p_perf.csv"; + + std::string error; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), "valid options rejected"); + Require(TileXR::Demo::DirectionName(0, 1) == "0to1", "direction name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DirectUrma) == "direct_urma", + "direct transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DirectUrmaPostOnly) == + "direct_urma_post_only", + "direct post-only transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::Memory) == "memory", + "memory transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::MemorySegmented) == "memory_segmented", + "memory_segmented transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::MemorySegmentedRotate) == + "memory_segmented_rotate", + "memory_segmented_rotate transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::MemorySegmentedTrace) == + "memory_segmented_trace", + "memory_segmented_trace transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace) == + "memory_segmented_rotate_trace", + "memory_segmented_rotate_trace transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::MemoryConsume) == "memory_consume", + "memory_consume transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DataAsFlag) == "data_as_flag", + "data_as_flag transport name mismatch"); + Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered) == + "data_as_flag_epoch_ordered", + "data_as_flag_epoch_ordered transport name mismatch"); + Require(TileXR::Demo::P2PTrafficName(TileXR::Demo::P2PTraffic::UniDir) == "unidir", + "unidir traffic name mismatch"); + Require(TileXR::Demo::P2PTrafficName(TileXR::Demo::P2PTraffic::BiDir) == "bidir", + "bidir traffic name mismatch"); + Require(TileXR::Demo::ParseP2PTransport("direct_urma") == TileXR::Demo::P2PTransport::DirectUrma, + "direct transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("udma") == TileXR::Demo::P2PTransport::DirectUrma, + "udma alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("direct_urma_post_only") == + TileXR::Demo::P2PTransport::DirectUrmaPostOnly, + "direct_urma_post_only transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("post_only") == TileXR::Demo::P2PTransport::DirectUrmaPostOnly, + "post_only alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory") == TileXR::Demo::P2PTransport::Memory, + "memory transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory_segmented") == TileXR::Demo::P2PTransport::MemorySegmented, + "memory_segmented transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory-segmented") == TileXR::Demo::P2PTransport::MemorySegmented, + "memory-segmented alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory_segmented_rotate") == + TileXR::Demo::P2PTransport::MemorySegmentedRotate, + "memory_segmented_rotate transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory-segmented-rotate") == + TileXR::Demo::P2PTransport::MemorySegmentedRotate, + "memory-segmented-rotate alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory_segmented_trace") == + TileXR::Demo::P2PTransport::MemorySegmentedTrace, + "memory_segmented_trace transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory-segmented-trace") == + TileXR::Demo::P2PTransport::MemorySegmentedTrace, + "memory-segmented-trace alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory_segmented_rotate_trace") == + TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace, + "memory_segmented_rotate_trace transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory-segmented-rotate-trace") == + TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace, + "memory-segmented-rotate-trace alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory_consume") == TileXR::Demo::P2PTransport::MemoryConsume, + "memory_consume transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("memory-consume") == TileXR::Demo::P2PTransport::MemoryConsume, + "memory-consume alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("mem_consume") == TileXR::Demo::P2PTransport::MemoryConsume, + "mem_consume alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("data_as_flag") == TileXR::Demo::P2PTransport::DataAsFlag, + "data_as_flag transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("data_as_flag_epoch_ordered") == + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, + "data_as_flag_epoch_ordered transport parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("data-as-flag-epoch-ordered") == + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, + "data-as-flag-epoch-ordered alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("daf_epoch_ordered") == + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, + "daf_epoch_ordered alias parse mismatch"); + Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_wqe") == TileXR::Demo::P2PTransport::Invalid, + "direct_urma_multi_wqe must be rejected"); + Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty") == TileXR::Demo::P2PTransport::Invalid, + "direct_urma_multi_jetty must be rejected"); + Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty_parallel") == + TileXR::Demo::P2PTransport::Invalid, + "direct_urma_multi_jetty_parallel must be rejected"); + Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty_parallel_fixed_wqe") == + TileXR::Demo::P2PTransport::Invalid, + "direct_urma_multi_jetty_parallel_fixed_wqe must be rejected"); + Require(TileXR::Demo::ParseP2PTraffic("unidir") == TileXR::Demo::P2PTraffic::UniDir, + "unidir traffic parse mismatch"); + Require(TileXR::Demo::ParseP2PTraffic("bidir") == TileXR::Demo::P2PTraffic::BiDir, + "bidir traffic parse mismatch"); + Require(TileXR::Demo::TrafficDirectionName(0, 1, TileXR::Demo::P2PTraffic::BiDir) == "0to1+1to0", + "bidir direction name mismatch"); + Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DataAsFlag, 0) == 0, + "data_as_flag zero layout mismatch"); + Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DataAsFlag, 480) == 512, + "data_as_flag 480B layout mismatch"); + Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DataAsFlag, 481) == 1024, + "data_as_flag 481B layout mismatch"); + Require(TileXR::Demo::P2PTransportWindowBytes( + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 0) == 0, + "data_as_flag_epoch_ordered zero layout mismatch"); + Require(TileXR::Demo::P2PTransportWindowBytes( + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 480) == 512, + "data_as_flag_epoch_ordered 480B layout mismatch"); + Require(TileXR::Demo::P2PTransportWindowBytes( + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 481) == 1024, + "data_as_flag_epoch_ordered 481B layout mismatch"); + Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DirectUrma, 4096, 8) == 4096, + "direct_urma window must equal payload bytes"); + Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::MemoryConsume, 4096, 4) == 4096, + "memory_consume window must equal payload bytes"); + Require(TileXR::Demo::P2PTransportWindowBytes( + TileXR::Demo::P2PTransport::MemorySegmented, 64ULL * 1024ULL * 1024ULL, 1) == + 64ULL * 1024ULL * 1024ULL, + "memory_segmented window must equal payload bytes"); + Require(TileXR::Demo::P2PTransportWindowBytes( + TileXR::Demo::P2PTransport::MemorySegmentedRotate, 64ULL * 1024ULL * 1024ULL, 1) == + 64ULL * 1024ULL * 1024ULL, + "memory_segmented_rotate allocation window must preserve source payload bytes"); + Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::MemorySegmented), + "memory_segmented must use IPC peer window"); + Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::MemorySegmentedRotate), + "memory_segmented_rotate must use IPC peer window"); + Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::MemorySegmentedTrace), + "memory_segmented_trace must use IPC peer window"); + Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace), + "memory_segmented_rotate_trace must use IPC peer window"); + Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::MemoryConsume), + "memory_consume must use IPC peer window"); + Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered), + "data_as_flag_epoch_ordered must use IPC peer window"); + Require(TileXR::Demo::P2PTransportBothRanksActive( + TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, TileXR::Demo::P2PTraffic::UniDir), + "data_as_flag_epoch_ordered unidir must keep receiver active"); + Require(TileXR::Demo::ActiveP2PFlowCount(TileXR::Demo::P2PTraffic::UniDir) == 1, + "unidir active flow count mismatch"); + Require(TileXR::Demo::ActiveP2PFlowCount(TileXR::Demo::P2PTraffic::BiDir) == 2, + "bidir active flow count mismatch"); + options.transport = TileXR::Demo::P2PTransport::Memory; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), "memory transport options rejected"); + options.transport = TileXR::Demo::P2PTransport::MemorySegmented; + options.maxBytes = 64ULL * 1024ULL * 1024ULL; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "valid memory_segmented options rejected"); + options.transport = TileXR::Demo::P2PTransport::MemorySegmentedRotate; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "valid memory_segmented_rotate options rejected"); + options.transport = TileXR::Demo::P2PTransport::MemorySegmentedTrace; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "valid memory_segmented_trace options rejected"); + options.transport = TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "valid memory_segmented_rotate_trace options rejected"); + options.maxBytes = 16384; + options.transport = TileXR::Demo::P2PTransport::DirectUrma; + options.traffic = TileXR::Demo::P2PTraffic::BiDir; + options.blockDim = 8; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "valid direct_urma multi-jetty options rejected"); + options.transport = TileXR::Demo::P2PTransport::DataAsFlag; + options.traffic = TileXR::Demo::P2PTraffic::BiDir; + options.blockDim = 4; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "valid data_as_flag bidir options rejected"); + options.transport = TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered; + options.traffic = TileXR::Demo::P2PTraffic::UniDir; + options.blockDim = 4; + options.maxBytes = 16384; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "valid data_as_flag_epoch_ordered options rejected"); + options.transport = TileXR::Demo::P2PTransport::MemoryConsume; + options.traffic = TileXR::Demo::P2PTraffic::UniDir; + options.blockDim = 4; + options.maxBytes = 16384; + Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "valid memory_consume options rejected"); + options.maxBytes = TileXR::Demo::kP2PMemoryMaxBytes + 1; + Require(!TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), + "oversized memory_consume transport accepted"); + options.maxBytes = 16384; + options.blockDim = 0; + Require(!TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), "block_dim=0 accepted"); + options.blockDim = 4; + options.maxBytes = TileXR::Demo::kP2PMemoryMaxBytes + 1; + Require(!TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), "oversized ipc transport accepted"); + options.maxBytes = 16384; + options.transport = TileXR::Demo::P2PTransport::Memory; + options.traffic = TileXR::Demo::P2PTraffic::UniDir; + options.blockDim = 4; + + const std::vector sizes = TileXR::Demo::BuildP2PPerfSizeSweep(options); + Require(sizes.size() == 3, "size sweep count mismatch"); + Require(sizes[0] == 4096 && sizes[1] == 8192 && sizes[2] == 16384, "size sweep values mismatch"); + + const uint32_t pattern = TileXR::Demo::P2PPattern(1, 0, 4096); + std::vector bytes(4096); + TileXR::Demo::FillP2PPattern(bytes, pattern); + Require(TileXR::Demo::CountP2PMismatches(bytes, pattern, 4096) == 0, "pattern validation failed"); + bytes[17] ^= 0xff; + Require(TileXR::Demo::CountP2PMismatches(bytes, pattern, 4096) == 1, "mismatch count failed"); + Require(TileXR::Demo::CountP2PTransportMismatches( + bytes, pattern, 4096, TileXR::Demo::P2PTransport::DirectUrma, 8) == 1, + "direct_urma mismatch checker must validate payload bytes"); + + TileXR::Demo::P2PPerfRow row; + row.srcRank = 1; + row.dstRank = 0; + row.rankSize = 2; + row.bytes = 4096; + row.iters = 20; + row.avgUs = 8.0; + row.status = 0; + row.errors = 0; + row.logDir = "logs/run"; + const std::string csv = TileXR::Demo::FormatP2PPerfCsvRow(row); + Require(csv == "direct_urma,unidir,1,1to0,1,0,2,4096,20,8.000,0.000,0.000,0.512,0.512,0,0,logs/run\n", + "csv row mismatch"); + row.traffic = TileXR::Demo::P2PTraffic::BiDir; + row.blockDim = 4; + row.transport = TileXR::Demo::P2PTransport::DataAsFlag; + const std::string bidirCsv = TileXR::Demo::FormatP2PPerfCsvRow(row); + Require(bidirCsv == + "data_as_flag,bidir,4,1to0+0to1,1,0,2,4096,20,8.000,0.000,0.000,1.024,0.512,0,0,logs/run\n", + "bidir csv row mismatch"); + row.transport = TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered; + row.traffic = TileXR::Demo::P2PTraffic::UniDir; + row.blockDim = 4; + const std::string epochOrderedCsv = TileXR::Demo::FormatP2PPerfCsvRow(row); + Require(epochOrderedCsv == + "data_as_flag_epoch_ordered,unidir,4,1to0,1,0,2,4096,20,8.000,0.000,0.000,0.512,0.512,0,0,logs/run\n", + "data_as_flag_epoch_ordered csv row mismatch"); + row.transport = TileXR::Demo::P2PTransport::DirectUrma; + row.traffic = TileXR::Demo::P2PTraffic::UniDir; + row.blockDim = 8; + const std::string directUrmaParallelCsv = TileXR::Demo::FormatP2PPerfCsvRow(row); + Require(directUrmaParallelCsv == + "direct_urma,unidir,8,1to0,1,0,2,4096,20,8.000,0.000,0.000,0.512,0.512,0,0,logs/run\n", + "direct_urma parallel csv row mismatch"); + + TileXR::Demo::P2PRankStatus srcSample; + srcSample.status = 0; + srcSample.errors = 0; + srcSample.elapsedMs = 6.4f; + TileXR::Demo::P2PRankStatus dstSample; + dstSample.status = 0xffffffffu; + dstSample.errors = 0; + dstSample.elapsedMs = 0.1f; + + const TileXR::Demo::P2PPerfRow aggregated = + TileXR::Demo::BuildP2PPerfRow(options, 2, 4096, srcSample, dstSample); + Require(std::fabs(aggregated.avgUs - 320.0) < 0.001, + "p2p row must use src rank elapsed time"); + Require(aggregated.status == 0, "p2p row must use src rank status"); + Require(aggregated.errors == 0, "p2p row must use dst rank errors"); + Require(aggregated.transport == TileXR::Demo::P2PTransport::Memory, + "p2p row must preserve transport"); + Require(aggregated.traffic == TileXR::Demo::P2PTraffic::UniDir, + "p2p row must preserve traffic"); + Require(aggregated.blockDim == 4, "p2p row must preserve block_dim"); + + options.traffic = TileXR::Demo::P2PTraffic::BiDir; + dstSample.status = 4; + dstSample.errors = 7; + dstSample.elapsedMs = 10.0f; + const TileXR::Demo::P2PPerfRow bidirAggregated = + TileXR::Demo::BuildP2PPerfRow(options, 2, 4096, srcSample, dstSample); + Require(std::fabs(bidirAggregated.avgUs - 500.0) < 0.001, + "bidir row must use max rank elapsed time"); + Require(bidirAggregated.status == 4, "bidir row must combine rank status"); + Require(bidirAggregated.errors == 7, "bidir row must sum rank errors"); + + options.transport = TileXR::Demo::P2PTransport::MemoryConsume; + options.traffic = TileXR::Demo::P2PTraffic::UniDir; + options.blockDim = 4; + options.srcRank = 1; + options.dstRank = 0; + srcSample.status = 1; + srcSample.errors = 2; + srcSample.elapsedMs = 6.4f; + dstSample.status = 4; + dstSample.errors = 7; + dstSample.elapsedMs = 10.0f; + const TileXR::Demo::P2PPerfRow memoryConsumeAggregated = + TileXR::Demo::BuildP2PPerfRow(options, 2, 4096, srcSample, dstSample); + Require(std::fabs(memoryConsumeAggregated.avgUs - 500.0) < 0.001, + "memory_consume unidir row must use max rank elapsed time"); + Require(memoryConsumeAggregated.status == 5, + "memory_consume unidir row must combine rank status"); + Require(memoryConsumeAggregated.errors == 9, + "memory_consume unidir row must sum rank errors"); + const std::string memoryConsumeCsv = TileXR::Demo::FormatP2PPerfCsvRow(memoryConsumeAggregated); + Require(memoryConsumeCsv == + "memory_consume,unidir,4,1to0,1,0,2,4096,20,500.000,0.000,0.000,0.008,0.008,5,9,\n", + "memory_consume csv row mismatch"); + + options.dstRank = 1; + Require(!TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), "same src/dst accepted"); + return 0; +} diff --git a/tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp b/tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp new file mode 100644 index 0000000..e0d9430 --- /dev/null +++ b/tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp @@ -0,0 +1,165 @@ +#include +#include +#include +#include + +namespace { + +int g_failures = 0; + +std::string RepoPath(const std::string& path) +{ +#ifdef TILEXR_SOURCE_ROOT + return std::string(TILEXR_SOURCE_ROOT) + "/" + path; +#else + return path; +#endif +} + +std::string ReadFile(const std::string& path) +{ + const std::string fullPath = RepoPath(path); + std::ifstream input(fullPath.c_str()); + if (!input.is_open()) { + std::cerr << "failed to open " << fullPath << std::endl; + ++g_failures; + return {}; + } + std::ostringstream buffer; + buffer << input.rdbuf(); + return buffer.str(); +} + +void CheckContains(const std::string& path, const std::string& text, const std::string& needle) +{ + if (text.find(needle) == std::string::npos) { + std::cerr << path << " does not contain required text: " << needle << std::endl; + ++g_failures; + } +} + +void CheckNotContains(const std::string& path, const std::string& text, const std::string& needle) +{ + if (text.find(needle) != std::string::npos) { + std::cerr << path << " contains forbidden text: " << needle << std::endl; + ++g_failures; + } +} + +void TestMemoryConsumeKernelUsesSyncCollectives() +{ + const std::string path = "tests/udma/demo/tilexr_udma_demo_kernel.cpp"; + const std::string text = ReadFile(path); + CheckContains(path, text, "#include \"tilexr_sync.h\""); + CheckContains(path, text, "tilexr_memory_consume_p2p_perf_kernel"); + CheckContains(path, text, "launch_tilexr_memory_consume_p2p_perf"); + CheckContains(path, text, "SyncCollectives sync"); + CheckContains(path, text, "int32_t magic, int32_t step"); + CheckContains(path, text, "sync.SetOuterFlag(magic, step);"); + CheckContains(path, text, "sync.WaitOuterFlag(magic, step, peer, blockIdx);"); +} + +void TestMemorySegmentedDiagnosticTransport() +{ + const std::string kernelPath = "tests/udma/demo/tilexr_udma_demo_kernel.cpp"; + const std::string kernelText = ReadFile(kernelPath); + CheckContains(kernelPath, kernelText, "tilexr_memory_segmented_p2p_perf_kernel"); + CheckContains(kernelPath, kernelText, "launch_tilexr_memory_segmented_p2p_perf"); + CheckContains(kernelPath, kernelText, "uint32_t segmentBytes, int32_t rotateWindow"); + CheckContains(kernelPath, kernelText, "dstInWindow = srcOffset % segmentBytes"); + CheckContains(kernelPath, kernelText, "uint32_t traceSegments"); + CheckContains(kernelPath, kernelText, "AscendC::GetSystemCycle()"); + CheckContains(kernelPath, kernelText, "TileXRUdmaDemoAddCycleSum(debug, traceLowIdx"); + + const std::string configPath = "tests/udma/demo/tilexr_udma_p2p_perf_config.h"; + const std::string configText = ReadFile(configPath); + CheckContains(configPath, configText, "kP2PMemorySegmentBytes = 16ULL * 1024ULL * 1024ULL"); + CheckContains(configPath, configText, "kP2PMemoryTraceSegmentBytes = 8ULL * 1024ULL * 1024ULL"); + CheckContains(configPath, configText, "MemorySegmented"); + CheckContains(configPath, configText, "MemorySegmentedRotate"); + CheckContains(configPath, configText, "MemorySegmentedTrace"); + CheckContains(configPath, configText, "MemorySegmentedRotateTrace"); + CheckContains(configPath, configText, "memory_segmented_rotate"); + CheckContains(configPath, configText, "memory_segmented_rotate_trace"); + + const std::string hostPath = "tests/udma/demo/tilexr_udma_demo.cpp"; + const std::string hostText = ReadFile(hostPath); + CheckContains(hostPath, hostText, "launch_tilexr_memory_segmented_p2p_perf"); + CheckContains(hostPath, hostText, "P2PTransport::MemorySegmentedRotate"); + CheckContains(hostPath, hostText, "P2PTransport::MemorySegmentedRotateTrace"); + CheckContains(hostPath, hostText, "traceSegments ? 8U : 0U"); + CheckContains(hostPath, hostText, "skipPayloadCheck"); +} + +void TestMemoryConsumeHostWiring() +{ + const std::string path = "tests/udma/demo/tilexr_udma_demo.cpp"; + const std::string text = ReadFile(path); + CheckContains(path, text, "launch_tilexr_memory_consume_p2p_perf"); + CheckContains(path, text, "P2PTransport::MemoryConsume"); + CheckContains(path, text, "P2PTransportUsesIpc"); + CheckContains(path, text, "P2PTransportBothRanksActive"); + CheckContains(path, text, "constexpr int32_t kP2PMagicBase = 0x5444554d"); + CheckContains(path, text, "int32_t launchMagic = kP2PMagicBase"); + CheckContains(path, text, "const int32_t warmupMagic = ++launchMagic"); + CheckContains(path, text, "const int32_t measuredMagic = ++launchMagic"); + CheckContains(path, text, "warmupMagic, i + 1"); + CheckContains(path, text, "measuredMagic, i + 1"); +} + +void TestMemoryConsumeSweepDefault() +{ + const std::string path = "tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh"; + const std::string text = ReadFile(path); + CheckContains(path, text, "direct_urma,memory,memory_consume,data_as_flag"); +} + +void TestDataAsFlagEpochOrderedSource() +{ + const std::string kernelPath = "tests/udma/demo/tilexr_udma_demo_kernel.cpp"; + const std::string kernelText = ReadFile(kernelPath); + CheckContains(kernelPath, kernelText, "tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel"); + CheckContains(kernelPath, kernelText, "launch_tilexr_data_as_flag_epoch_ordered_p2p_perf"); + CheckContains(kernelPath, kernelText, "DataAsFlagEpoch(magic, step)"); + CheckContains(kernelPath, kernelText, "DataAsFlagSendEpochOrdered"); + CheckContains(kernelPath, kernelText, "DataAsFlagCheckAndRecvEpochOrdered"); + CheckContains(kernelPath, kernelText, "int32_t magic, int32_t step"); + CheckContains(kernelPath, kernelText, "int32_t strict"); + CheckContains(kernelPath, kernelText, "strict != 0"); + + const std::string headerPath = "src/include/tilexr_data_as_flag.h"; + const std::string headerText = ReadFile(headerPath); + CheckContains(headerPath, headerText, "DataAsFlagEpochReady"); + CheckContains(headerPath, headerText, "DataAsFlagCommitEpoch"); + CheckContains(headerPath, headerText, "DataAsFlagWriteBatchCommitFlag"); + CheckContains(headerPath, headerText, "DATA_AS_FLAG_COMMIT_BIT"); + CheckContains(headerPath, headerText, "const uint64_t commitEpoch = DataAsFlagCommitEpoch(epoch)"); + CheckContains(headerPath, headerText, "DataAsFlagEpochReady(DataAsFlagLoadEpochFlag(dataAsFlagGM, lastBlock, recvScratch), commitEpoch)"); + CheckNotContains(headerPath, headerText, "DataAsFlagCopyEpochFlagsToGM"); + CheckNotContains(headerPath, headerText, "return DataAsFlagMaxRecvBlocks(scratchBytes);"); + + const std::string hostPath = "tests/udma/demo/tilexr_udma_demo.cpp"; + const std::string hostText = ReadFile(hostPath); + CheckContains(hostPath, hostText, "launch_tilexr_data_as_flag_epoch_ordered_p2p_perf"); + CheckContains(hostPath, hostText, "P2PTransport::DataAsFlagEpochOrdered"); + CheckContains(hostPath, hostText, "magic, step"); + CheckContains(hostPath, hostText, "useLegacyDataAsFlagTransport"); + CheckContains(hostPath, hostText, "TILEXR_DATA_AS_FLAG_STRICT"); +} + +} // namespace + +int main() +{ + TestMemoryConsumeKernelUsesSyncCollectives(); + TestMemorySegmentedDiagnosticTransport(); + TestMemoryConsumeHostWiring(); + TestMemoryConsumeSweepDefault(); + TestDataAsFlagEpochOrderedSource(); + if (g_failures != 0) { + std::cerr << g_failures << " TileXR UDMA P2P source guard checks failed" << std::endl; + return 1; + } + std::cout << "TileXR UDMA P2P source guard checks passed" << std::endl; + return 0; +} diff --git a/tests/udma/unit/test_tilexr_udma_transport_layout.cpp b/tests/udma/unit/test_tilexr_udma_transport_layout.cpp index cb14015..089f411 100644 --- a/tests/udma/unit/test_tilexr_udma_transport_layout.cpp +++ b/tests/udma/unit/test_tilexr_udma_transport_layout.cpp @@ -45,7 +45,7 @@ void TestHostLayoutUsesDeviceRelativePointers() constexpr uintptr_t deviceBase = 0x100000000ULL; TileXR::UDMAInfo info = {}; std::vector bytes; - const int ret = TileXR::BuildUDMAInfoImage(deviceBase, sq, rq, scq, rcq, mem, info, bytes); + const int ret = TileXR::BuildUDMAInfoImage(deviceBase, 1, sq, rq, scq, rcq, mem, info, bytes); CHECK_EQ(ret, TileXR::TILEXR_UDMA_LAYOUT_SUCCESS); CHECK_EQ(info.qpNum, 1U); @@ -81,16 +81,43 @@ void TestRejectsMismatchedArrays() TileXR::UDMAInfo info = {}; std::vector bytes; - const int ret = TileXR::BuildUDMAInfoImage(0x1000, sq, rq, scq, rcq, mem, info, bytes); + const int ret = TileXR::BuildUDMAInfoImage(0x1000, 1, sq, rq, scq, rcq, mem, info, bytes); CHECK_EQ(ret, TileXR::TILEXR_UDMA_LAYOUT_INVALID); } +void TestHostLayoutSupportsMultipleQps() +{ + std::vector sq(4); + std::vector rq(4); + std::vector scq(4); + std::vector rcq(4); + std::vector mem(4); + + sq[3].bufAddr = 0x8000; + mem[3].tpn = 17; + + constexpr uintptr_t deviceBase = 0x200000000ULL; + TileXR::UDMAInfo info = {}; + std::vector bytes; + const int ret = TileXR::BuildUDMAInfoImage(deviceBase, 2, sq, rq, scq, rcq, mem, info, bytes); + + CHECK_EQ(ret, TileXR::TILEXR_UDMA_LAYOUT_SUCCESS); + CHECK_EQ(info.qpNum, 2U); + const auto* imageSq = reinterpret_cast( + bytes.data() + (info.sqPtr - deviceBase)); + const auto* imageMem = reinterpret_cast( + bytes.data() + (info.memPtr - deviceBase)); + CHECK_EQ(imageSq[3].bufAddr, static_cast(0x8000)); + CHECK_EQ(imageMem[3].tpn, 17U); +} + } // namespace int main() { TestHostLayoutUsesDeviceRelativePointers(); TestRejectsMismatchedArrays(); + TestHostLayoutSupportsMultipleQps(); if (g_failures != 0) { std::cerr << g_failures << " UDMA transport layout checks failed" << std::endl; return 1;