diff --git a/docs/superpowers/plans/2026-06-18-udma-alltoall-demo.md b/docs/superpowers/plans/2026-06-18-udma-alltoall-demo.md new file mode 100644 index 0000000..8fdf281 --- /dev/null +++ b/docs/superpowers/plans/2026-06-18-udma-alltoall-demo.md @@ -0,0 +1,580 @@ +# UDMA All-to-All Demo Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a `test_type=2` all-to-all path to the TileXR UDMA demo under `tests/udma/demo`. + +**Architecture:** Reuse the existing `tilexr_udma_demo` host executable, launch script, local TCP barrier, and UDMA registered-memory setup. Add one AICore all-to-all kernel that writes each local destination slice into the matching remote output slice using `TileXR::UDMAPutNbi`, with host-side validation after all ranks complete. + +**Tech Stack:** C++14 host code, Ascend C AICore kernel code, TileXR public C API, `tilexr_udma.h` device wrapper, CMake, bash, remote Ascend950/A5 validation. + +## Global Constraints + +- CANN version: 9.1.0. +- Target OS: Ubuntu 20.04 LTS; root user required for device access. +- UDMA data-plane validation targets A5 / Ascend950 / 950 hardware. +- Do not add shmem includes or shmem API calls. +- Existing `test_type=0` all-gather and `test_type=1` put-signal behavior must remain unchanged. +- All-to-all uses `int32_t` only. +- Remote validation must create a new directory under `/home/aiv-perf/` on `root@141.61.95.18`. + +--- + +## File Structure + +- Modify `tests/udma/demo/tilexr_udma_demo_kernel.cpp`: add `tilexr_udma_all_to_all_kernel` and `launch_tilexr_udma_all_to_all`. +- Modify `tests/udma/demo/tilexr_udma_demo.cpp`: declare the launch wrapper, allocate the all-to-all input/output layout for `test_type=2`, initialize inputs, launch the new kernel, and validate outputs. +- Modify `tests/udma/demo/run_tilexr_udma_demo.sh`: update printed help text to include `test_type=2`. +- Modify `tests/udma/demo/README.md`: document the all-to-all path and example command. +- Optionally modify `tests/udma/demo/ASCEND_VERIFICATION.md` only if it contains an exhaustive test-type list that would become stale. + +--- + +### Task 1: Add Device Kernel And Launch Wrapper + +**Files:** +- Modify: `tests/udma/demo/tilexr_udma_demo_kernel.cpp` + +**Interfaces:** +- Consumes: `TileXR::UDMAPutNbi(args, targetRank, localSrc, byteOffset, byteCount)` and `TileXR::UDMAQuiet(args, targetRank)` from `src/include/tilexr_udma.h`. +- Produces: + ```cpp + void launch_tilexr_udma_all_to_all( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR input, GM_ADDR output, + GM_ADDR debug, int32_t elementsPerPeer, uint64_t outputByteOffset); + ``` + +- [ ] **Step 1: Add the kernel before the registered smoke kernel** + +Add this code to `tests/udma/demo/tilexr_udma_demo_kernel.cpp` after `tilexr_udma_put_signal_kernel`: + +```cpp +extern "C" __global__ __aicore__ void tilexr_udma_all_to_all_kernel( + GM_ADDR commArgsGM, GM_ADDR inputGM, GM_ADDR outputGM, GM_ADDR debugGM, + int32_t elementsPerPeer, uint64_t outputByteOffset) +{ + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto input = reinterpret_cast<__gm__ int32_t*>(inputGM); + auto output = reinterpret_cast<__gm__ int32_t*>(outputGM); + auto debug = reinterpret_cast<__gm__ int32_t*>(debugGM); + + int32_t rank = args->rank; + int32_t rankSize = args->rankSize; + bool enabled = TileXR::UDMARegistryEnabled(args); + + if (debug != nullptr) { + debug[0] = TILEXR_UDMA_DEMO_MAGIC; + debug[1] = rank; + debug[2] = rankSize; + debug[3] = enabled ? 1 : 0; + debug[4] = elementsPerPeer; + debug[5] = static_cast(outputByteOffset); + } + if (!enabled) { + return; + } + + uint32_t bytes = static_cast(elementsPerPeer * sizeof(int32_t)); + for (int32_t peer = 0; peer < rankSize; ++peer) { + auto localSrc = input + peer * elementsPerPeer; + uint64_t remoteOffset = outputByteOffset + + static_cast(rank) * elementsPerPeer * sizeof(int32_t); + if (peer == rank) { + auto localDst = output + rank * elementsPerPeer; + for (int32_t i = 0; i < elementsPerPeer; ++i) { + localDst[i] = localSrc[i]; + } + continue; + } + TileXR::UDMAPutNbi(args, peer, localSrc, remoteOffset, bytes); + TileXR::UDMAQuiet(args, peer); + } +} +``` + +- [ ] **Step 2: Add the host launch wrapper** + +Add this code near the other `launch_tilexr_udma_*` wrappers: + +```cpp +void launch_tilexr_udma_all_to_all( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR input, GM_ADDR output, + GM_ADDR debug, int32_t elementsPerPeer, uint64_t outputByteOffset) +{ + tilexr_udma_all_to_all_kernel<<>>( + commArgs, input, output, debug, elementsPerPeer, outputByteOffset); +} +``` + +- [ ] **Step 3: Run a local source check** + +Run: + +```powershell +rg -n "tilexr_udma_all_to_all|launch_tilexr_udma_all_to_all" tests/udma/demo/tilexr_udma_demo_kernel.cpp +``` + +Expected: one kernel definition and one launch wrapper definition are shown. + +--- + +### Task 2: Add Host-Side All-To-All Layout, Launch, And Validation + +**Files:** +- Modify: `tests/udma/demo/tilexr_udma_demo.cpp` + +**Interfaces:** +- Consumes: + ```cpp + void launch_tilexr_udma_all_to_all( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR input, GM_ADDR output, + GM_ADDR debug, int32_t elementsPerPeer, uint64_t outputByteOffset); + ``` +- Produces: + ```cpp + bool ValidateAllToAllData( + int rank, int rankSize, const std::vector& output, int32_t elementsPerPeer); + ``` + +- [ ] **Step 1: Declare the new launch wrapper** + +Add this declaration after `launch_tilexr_udma_put_signal`: + +```cpp +extern void launch_tilexr_udma_all_to_all( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR input, GM_ADDR output, + GM_ADDR debug, int32_t elementsPerPeer, uint64_t outputByteOffset); +``` + +- [ ] **Step 2: Add all-to-all constants** + +Add this constant beside the existing demo constants: + +```cpp +constexpr int32_t kAllToAllBaseValue = 100000; +``` + +- [ ] **Step 3: Add a validator** + +Add this function after `ValidateData`: + +```cpp +bool ValidateAllToAllData( + int rank, int rankSize, const std::vector& output, int32_t elementsPerPeer) +{ + bool ok = true; + for (int srcRank = 0; srcRank < rankSize; ++srcRank) { + int32_t expected = kAllToAllBaseValue + srcRank * 1000 + rank; + for (int32_t i = 0; i < elementsPerPeer; ++i) { + size_t offset = static_cast(srcRank) * elementsPerPeer + i; + if (output[offset] != expected) { + std::cerr << "[rank " << rank << "] ALLTOALL MISMATCH at src=" << srcRank + << " elem=" << i << " offset=" << offset + << " got=" << output[offset] << " expected=" << expected << std::endl; + ok = false; + break; + } + } + } + + std::cout << "[rank " << rank << "] alltoall output sample:"; + for (int srcRank = 0; srcRank < rankSize; ++srcRank) { + size_t offset = static_cast(srcRank) * elementsPerPeer; + std::cout << " from" << srcRank << "=" << output[offset]; + } + std::cout << std::endl; + return ok; +} +``` + +- [ ] **Step 4: Split payload layout by test type** + +Replace the current data/signal payload sizing block with: + +```cpp +bool isAllToAll = testType == 2; +size_t dataCount = static_cast(rankSize) * elementsPerRank; +size_t dataBytes = dataCount * sizeof(int32_t); +size_t inputOffset = 0; +size_t outputOffset = isAllToAll ? dataBytes : 0; +size_t signalOffset = isAllToAll ? dataBytes * 2 : dataBytes; +size_t signalBytes = static_cast(rankSize) * sizeof(uint64_t); +size_t payloadBytes = signalOffset + signalBytes; +``` + +Keep the existing `registeredBytes` calculation immediately after this block. + +- [ ] **Step 5: Define typed buffer pointers** + +Replace the current `data` and `signals` pointer setup with: + +```cpp +auto data = static_cast(registeredMemory); +auto input = reinterpret_cast(static_cast(registeredMemory) + inputOffset); +auto output = reinterpret_cast(static_cast(registeredMemory) + outputOffset); +auto signals = reinterpret_cast(static_cast(registeredMemory) + signalOffset); +``` + +- [ ] **Step 6: Initialize host buffers for both modes** + +Replace the current `hostData` initialization with: + +```cpp +std::vector hostData(dataCount, -1); +std::vector hostOutput(dataCount, -1); +if (isAllToAll) { + for (int dstRank = 0; dstRank < rankSize; ++dstRank) { + int32_t value = kAllToAllBaseValue + rank * 1000 + dstRank; + std::fill(hostData.begin() + static_cast(dstRank) * elementsPerRank, + hostData.begin() + static_cast(dstRank + 1) * elementsPerRank, + value); + } +} else { + std::fill(hostData.begin() + static_cast(rank) * elementsPerRank, + hostData.begin() + static_cast(rank + 1) * elementsPerRank, + 1000 + rank); +} +``` + +- [ ] **Step 7: Copy input and output buffers to device** + +Replace the data H2D copy part with: + +```cpp +bool initOk = CopyHostToDevice(rank, input, dataCount * sizeof(int32_t), + hostData.data(), dataCount * sizeof(int32_t), isAllToAll ? "alltoall input" : "data"); +if (isAllToAll) { + initOk = CopyHostToDevice(rank, output, dataCount * sizeof(int32_t), + hostOutput.data(), dataCount * sizeof(int32_t), "alltoall output") && initOk; +} +if (!initOk || + !CopyHostToDevice(rank, signals, hostSignals.size() * sizeof(uint64_t), + hostSignals.data(), hostSignals.size() * sizeof(uint64_t), "signals") || + !CopyHostToDevice(rank, debug, hostDebug.size() * sizeof(int32_t), + hostDebug.data(), hostDebug.size() * sizeof(int32_t), "debug")) { +``` + +Keep the existing cleanup block inside this `if`. + +- [ ] **Step 8: Launch the all-to-all kernel for `test_type=2`** + +Replace the launch selection with: + +```cpp +if (testType == 2) { + PrintStatus(rank, "launch all-to-all kernel"); + launch_tilexr_udma_all_to_all( + 1, stream, commArgsDev, reinterpret_cast(input), reinterpret_cast(output), + reinterpret_cast(debug), elementsPerRank, static_cast(outputOffset)); +} else if (testType == 1) { + PrintStatus(rank, "launch put-signal kernel"); + launch_tilexr_udma_put_signal( + 1, stream, commArgsDev, reinterpret_cast(data), reinterpret_cast(signals), + reinterpret_cast(debug), elementsPerRank, kSignalValue); +} else { + PrintStatus(rank, "launch all-gather kernel"); + launch_tilexr_udma_all_gather( + 1, stream, commArgsDev, reinterpret_cast(data), reinterpret_cast(debug), + elementsPerRank); +} +``` + +- [ ] **Step 9: Copy output back for all-to-all** + +Replace the data D2H copy with: + +```cpp +bool copyBackOk = CopyDeviceToHost(rank, hostData.data(), dataCount * sizeof(int32_t), + data, dataCount * sizeof(int32_t), "data"); +if (isAllToAll) { + copyBackOk = CopyDeviceToHost(rank, hostOutput.data(), dataCount * sizeof(int32_t), + output, dataCount * sizeof(int32_t), "alltoall output") && copyBackOk; +} +if (!copyBackOk || + !CopyDeviceToHost(rank, hostSignals.data(), hostSignals.size() * sizeof(uint64_t), + signals, hostSignals.size() * sizeof(uint64_t), "signals") || + !CopyDeviceToHost(rank, hostDebug.data(), hostDebug.size() * sizeof(int32_t), + debug, hostDebug.size() * sizeof(int32_t), "debug")) { +``` + +Keep the existing cleanup block inside this `if`. + +- [ ] **Step 10: Validate all-to-all output** + +Replace the final validation selection with: + +```cpp +bool ok = isAllToAll ? ValidateAllToAllData(rank, rankSize, hostOutput, elementsPerRank) + : ValidateData(rank, rankSize, hostData, elementsPerRank); +if (testType == 1) { + ok = ValidateSignals(rank, rankSize, hostSignals) && ok; +} +``` + +- [ ] **Step 11: Run local source checks** + +Run: + +```powershell +rg -n "all-to-all|alltoall|isAllToAll|ValidateAllToAllData|launch_tilexr_udma_all_to_all" tests/udma/demo/tilexr_udma_demo.cpp +``` + +Expected: declaration, validation function, launch path, and copy/initialization branches are shown. + +--- + +### Task 3: Update Demo Documentation And Script Text + +**Files:** +- Modify: `tests/udma/demo/run_tilexr_udma_demo.sh` +- Modify: `tests/udma/demo/README.md` +- Modify if stale: `tests/udma/demo/ASCEND_VERIFICATION.md` + +**Interfaces:** +- Consumes: `test_type=2` behavior from Tasks 1 and 2. +- Produces: documented command `bash demo/run_tilexr_udma_demo.sh 2 2 16 2 0`. + +- [ ] **Step 1: Update script test-type text** + +In `tests/udma/demo/run_tilexr_udma_demo.sh`, change: + +```bash +echo "Test type: ${test_type} (0=all-gather put, 1=put-signal)" +``` + +to: + +```bash +echo "Test type: ${test_type} (0=all-gather put, 1=put-signal, 2=all-to-all)" +``` + +- [ ] **Step 2: Update README run examples** + +In `tests/udma/demo/README.md`, add this command to the Run section: + +```bash +bash demo/run_tilexr_udma_demo.sh 2 2 16 2 0 +``` + +Update the argument list so it includes: + +```text +- `test_type=2`: all-to-all UDMA put. Rank `src` sends input slice `dst` to rank `dst`; each output is ordered by source rank. +``` + +- [ ] **Step 3: Check `ASCEND_VERIFICATION.md` for stale test-type lists** + +Run: + +```powershell +rg -n "test_type|all-gather|put-signal|all-to-all" tests/udma/demo/ASCEND_VERIFICATION.md +``` + +If it lists only test types 0 and 1, update the list to include: + +```text +test_type=2 validates all-to-all registered-memory UDMA puts with output ordered by source rank. +``` + +- [ ] **Step 4: Run documentation check** + +Run: + +```powershell +rg -n "test_type=2|all-to-all|run_tilexr_udma_demo.sh 2" tests/udma/demo/README.md tests/udma/demo/run_tilexr_udma_demo.sh tests/udma/demo/ASCEND_VERIFICATION.md +``` + +Expected: README and script both mention `test_type=2`. + +--- + +### Task 4: Local Build-Oriented Verification + +**Files:** +- Verify: `tests/udma/demo/tilexr_udma_demo.cpp` +- Verify: `tests/udma/demo/tilexr_udma_demo_kernel.cpp` +- Verify: `tests/udma/CMakeLists.txt` + +**Interfaces:** +- Consumes: code from Tasks 1-3. +- Produces: local confidence before remote hardware validation. + +- [ ] **Step 1: Review the diff** + +Run: + +```powershell +git diff -- tests/udma/demo/tilexr_udma_demo.cpp tests/udma/demo/tilexr_udma_demo_kernel.cpp tests/udma/demo/run_tilexr_udma_demo.sh tests/udma/demo/README.md tests/udma/demo/ASCEND_VERIFICATION.md +``` + +Expected: only all-to-all related changes are present. + +- [ ] **Step 2: Check that CMake tracks the modified kernel source** + +Run: + +```powershell +rg -n "tilexr_udma_demo_kernel.cpp|tilexr_udma_demo.cpp|tilexr_udma_demo" tests/udma/CMakeLists.txt +``` + +Expected: existing custom command depends on `demo/tilexr_udma_demo_kernel.cpp`; no CMake changes are required. + +- [ ] **Step 3: Check formatting-sensitive syntax around modified host blocks** + +Run: + +```powershell +rg -n "payloadBytes|registeredBytes|initOk|copyBackOk|testType == 2|ValidateAllToAllData" tests/udma/demo/tilexr_udma_demo.cpp +``` + +Expected: each symbol appears in the expected host flow. + +- [ ] **Step 4: Check working tree before remote copy** + +Run: + +```powershell +git status --short +``` + +Expected: only the planned demo files and plan file are modified or added. + +--- + +### Task 5: Remote Build And Runtime Validation + +**Files:** +- Verify remotely under a new `/home/aiv-perf/` directory on `root@141.61.95.18`. + +**Interfaces:** +- Consumes: local repository changes from Tasks 1-4. +- Produces: remote build and runtime evidence for all-to-all. + +- [ ] **Step 1: Create a unique remote validation directory** + +Run from the local workspace: + +```powershell +ssh root@141.61.95.18 "set -e; d=/home/aiv-perf/tilexr-udma-alltoall-$(date +%Y%m%d-%H%M%S); mkdir -p \"$d\"; echo \"$d\"" +``` + +Expected: command prints the new remote directory path. + +- [ ] **Step 2: Copy the repository to the remote directory** + +Use `rsync` if available: + +```powershell +rsync -a --delete --exclude .git --exclude build --exclude install --exclude tests/udma/build --exclude tests/udma/install --exclude tests/udma/logs ./ root@141.61.95.18:/ +``` + +If `rsync` is unavailable on Windows, use `scp` with a compressed archive created outside the repo build artifacts: + +```powershell +tar --exclude .git --exclude build --exclude install --exclude tests/udma/build --exclude tests/udma/install --exclude tests/udma/logs -czf $env:TEMP\tilexr-udma-alltoall.tgz . +scp $env:TEMP\tilexr-udma-alltoall.tgz root@141.61.95.18:/ +ssh root@141.61.95.18 "set -e; cd ; tar -xzf tilexr-udma-alltoall.tgz; rm tilexr-udma-alltoall.tgz" +``` + +Expected: remote directory contains `scripts/common_env.sh` and `tests/udma/demo`. + +- [ ] **Step 3: Build TileXR core on remote** + +Run: + +```powershell +ssh root@141.61.95.18 "set -e; cd ; source scripts/common_env.sh; mkdir -p build; cd build; cmake -DCMAKE_INSTALL_PREFIX=../install ..; make -j$(nproc); make install" +``` + +Expected: `install/lib/libtile-comm.so` exists under the remote directory. + +- [ ] **Step 4: Build UDMA demo on remote** + +Run: + +```powershell +ssh root@141.61.95.18 "set -e; cd /tests/udma; bash build.sh" +``` + +Expected: `tests/udma/install/bin/tilexr_udma_demo` exists, and the output does not say the demo was skipped because `bisheng` is missing. + +- [ ] **Step 5: Run all-to-all with 2 ranks** + +Run: + +```powershell +ssh root@141.61.95.18 "set -e; cd /tests/udma; bash demo/run_tilexr_udma_demo.sh 2 2 16 2 0" +``` + +Expected: every rank prints `TileXR UDMA demo success`; log tails include `alltoall output sample` with `from0=100000` on rank 0, `from0=100001` on rank 1, `from1=101000` on rank 0, and `from1=101001` on rank 1. + +- [ ] **Step 6: Run optional wider validation if at least 4 NPUs are available** + +Run: + +```powershell +ssh root@141.61.95.18 "set -e; cd /tests/udma; bash demo/run_tilexr_udma_demo.sh 2 4 16 4 0" +``` + +Expected: every rank prints `TileXR UDMA demo success`. + +- [ ] **Step 7: Capture failure logs if validation fails** + +Run: + +```powershell +ssh root@141.61.95.18 "cd /tests/udma; latest=$(ls -td logs/tilexr_udma_demo_* 2>/dev/null | head -1); if [ -n \"$latest\" ]; then for f in \"$latest\"/rank_*.log; do echo \"===== $f =====\"; tail -n 120 \"$f\"; done; fi" +``` + +Expected: logs show whether failure is UDMA enablement, build/runtime environment, kernel debug words, or output mismatch. + +--- + +### Task 6: Final Review And Commit + +**Files:** +- Review all modified files. + +**Interfaces:** +- Consumes: completed implementation and verification evidence. +- Produces: final commit for all-to-all demo implementation. + +- [ ] **Step 1: Inspect final diff** + +Run: + +```powershell +git diff --stat +git diff -- tests/udma/demo/tilexr_udma_demo.cpp tests/udma/demo/tilexr_udma_demo_kernel.cpp tests/udma/demo/run_tilexr_udma_demo.sh tests/udma/demo/README.md tests/udma/demo/ASCEND_VERIFICATION.md +``` + +Expected: changes match the approved spec and no unrelated files are modified. + +- [ ] **Step 2: Check git status** + +Run: + +```powershell +git status --short +``` + +Expected: modified implementation/doc files plus this plan file. + +- [ ] **Step 3: Commit implementation after successful validation** + +Run: + +```powershell +git add tests/udma/demo/tilexr_udma_demo.cpp tests/udma/demo/tilexr_udma_demo_kernel.cpp tests/udma/demo/run_tilexr_udma_demo.sh tests/udma/demo/README.md tests/udma/demo/ASCEND_VERIFICATION.md docs/superpowers/plans/2026-06-18-udma-alltoall-demo.md +git commit -m "feat: add udma alltoall demo" +``` + +Expected: one implementation commit is created. + +--- + +## Self-Review + +- Spec coverage: Tasks 1 and 2 implement the `test_type=2` all-to-all kernel, layout, launch, synchronization, and validation. Task 3 documents the command. Task 5 covers the required remote validation under `/home/aiv-perf/`. +- Placeholder scan: No task uses TBD/TODO/fill-in wording. `` is an execution-time value produced by Task 5 Step 1 and intentionally reused by later commands. +- Type consistency: The launch wrapper signature is identical in Task 1 and Task 2. Host uses `elementsPerRank` as the user-facing argument and passes it as `elementsPerPeer` to the kernel. diff --git a/docs/superpowers/specs/2026-06-18-udma-alltoall-demo-design.md b/docs/superpowers/specs/2026-06-18-udma-alltoall-demo-design.md new file mode 100644 index 0000000..8eb2dc4 --- /dev/null +++ b/docs/superpowers/specs/2026-06-18-udma-alltoall-demo-design.md @@ -0,0 +1,124 @@ +# UDMA All-to-All Demo Design + +## Goal + +Add an all-to-all UDMA operator demo under `tests/udma/demo`. + +The demo validates the common all-to-all layout: + +- Each rank owns `rank_size` equal input slices. +- Input slice `dst_rank` from rank `src_rank` is sent to rank `dst_rank`. +- Each destination rank writes output slices ordered by source rank. + +For rank `i`, input is laid out as `[to0, to1, ..., toN-1]`. For rank `j`, output is laid out as `[from0, from1, ..., fromN-1]`. + +## Approach + +Extend the existing `tilexr_udma_demo` binary instead of creating a separate demo. + +The current demo already handles: + +- multi-process local rank launch +- TileXR communicator initialization +- UDMA capability checks +- ordinary `aclrtMalloc` memory registration through `TileXRUDMARegister` +- local TCP barriers for demo synchronization +- per-rank logs and result validation + +The all-to-all path will be selected with `test_type=2`. Existing `test_type=0` all-gather and `test_type=1` put-signal behavior must remain unchanged. + +## Host Data Layout + +For `test_type=2`, the host allocates one registered device-memory payload containing: + +- input buffer: `rank_size * elements_per_peer` `int32_t` values +- output buffer: `rank_size * elements_per_peer` `int32_t` values +- signal/debug space if needed by the shared demo structure + +The registered allocation remains rounded up to the existing 2 MiB UDMA registration alignment. + +Input initialization for rank `src`: + +```text +input[dst][elem] = 100000 + src * 1000 + dst +``` + +Expected output for rank `dst`: + +```text +output[src][elem] = 100000 + src * 1000 + dst +``` + +This makes source and destination rank mistakes visible in validation logs. + +## Kernel Behavior + +Add a new AICore kernel and launch wrapper in `tilexr_udma_demo_kernel.cpp`. + +The kernel reads `rank`, `rankSize`, and UDMA registry state from `CommArgs`. + +For each peer: + +- If `peer == rank`, copy the local input slice `input[rank]` into local output slice `output[rank]`. +- Otherwise, issue `TileXR::UDMAPutNbi` to write local `input[peer]` into the remote rank's registered output slice for this source rank. +- Call `TileXR::UDMAQuiet(args, peer)` after posting to each remote peer. + +The remote byte offset is computed against the peer rank's registered base: + +```text +output_offset + rank * elements_per_peer * sizeof(int32_t) +``` + +The local source pointer is: + +```text +input + peer * elements_per_peer +``` + +## Synchronization + +The demo keeps the existing host-side synchronization: + +1. Each rank initializes and registers its buffers. +2. Host barrier ensures every rank's registered-memory metadata is visible. +3. Each rank launches the all-to-all kernel and synchronizes its stream. +4. Host barrier ensures all ranks have completed UDMA writes. +5. Host copies output back and validates. + +The kernel does not add device-side inter-rank polling beyond `UDMAQuiet`. + +## Build And Run + +The existing `tests/udma/CMakeLists.txt` continues to build one demo kernel shared object and one `tilexr_udma_demo` executable. + +Update the run script and README so: + +```bash +bash demo/run_tilexr_udma_demo.sh 2 2 16 2 0 +``` + +runs the all-to-all path. + +## Verification + +Local checks: + +- Build metadata remains scoped to `tests/udma`. +- Existing all-gather and put-signal source paths remain intact. + +Remote hardware validation: + +- Create a new directory under `/home/aiv-perf/` on `root@141.61.95.18`. +- Copy or sync the repository into that directory. +- Build TileXR core and `tests/udma`. +- Run `bash demo/run_tilexr_udma_demo.sh 2 2 16 2 0`. +- If resources permit, also run a wider case such as `rank_size=4`. + +Success requires every rank to print `TileXR UDMA demo success` and all output segments to match the expected all-to-all pattern. + +## Out Of Scope + +- Refactoring the demo runtime into shared helper classes. +- Adding a production all-to-all collective API. +- Optimizing multi-peer posting or batching. +- Supporting non-`int32_t` element types in this demo. diff --git a/src/comm/tilexr_comm.cpp b/src/comm/tilexr_comm.cpp index 6ee87dc..90324f1 100644 --- a/src/comm/tilexr_comm.cpp +++ b/src/comm/tilexr_comm.cpp @@ -346,6 +346,14 @@ int TileXRComm::RegisterUDMAMemory(GM_ADDR localPtr, size_t bytes, TileXRUDMAMem TILEXR_LOG(ERROR) << "TileXR UDMA memory registration failed: " << ret; return TILEXR_ERROR_INTERNAL; } + udmaInfoDev_ = udmaTransport_->GetUDMAInfoDev(); + commArgs_.udmaInfoPtr = udmaInfoDev_; + ret = UpdateCommArgsDev(); + if (ret != TILEXR_SUCCESS) { + TILEXR_LOG(ERROR) << "TileXRUDMARegister failed to refresh CommArgs after UDMA info update: " << ret; + udmaTransport_->UnregisterMemory(localPtr); + return ret; + } if (socketExchange_ == nullptr) { TILEXR_LOG(ERROR) << "TileXRUDMARegister requires live socket exchange"; @@ -646,10 +654,6 @@ int TileXRComm::EnablePeerAccess() } else if (physicalInfo_.physicalLink == PhysicalLink::RESERVED) { physicalInfo_.physicalLink = PhysicalLink::PCIE; commArgs_.extraFlag |= ExtraFlag::TOPO_PCIE; - if (rankSize_ > PING_PONG_SIZE) { - TILEXR_LOG(ERROR) << "do not support pcie > 2 rank! rankSize_ = " << rankSize_; - return TILEXR_ERROR_INTERNAL; - } } physicalInfo_.coreNum = GetCoreNum(physicalInfo_.chipName); @@ -864,6 +868,30 @@ int TileXRComm::InitCommMem() } if (OpenIpcMem(names) != TILEXR_SUCCESS) { + const char *modeEnv = std::getenv("TILEXR_IPC_PID_MODE"); + const bool forceSdid = modeEnv != nullptr && std::string(modeEnv) == "sdid"; + if (forceSdid) { + TILEXR_LOG(WARN) << "OpenIpcMem failed after sdid setup, retry with pid setup"; + string retryName; + if (setenv("TILEXR_IPC_PID_MODE", "pid_retry", 1) != 0 || + SetMemoryName(retryName) != TILEXR_SUCCESS || + SetIpcPidSdid(retryName, pids, sdids) != TILEXR_SUCCESS) { + TILEXR_LOG(ERROR) << "SetIpcPidSdid pid retry failed!"; + setenv("TILEXR_IPC_PID_MODE", "sdid", 1); + return TILEXR_ERROR_INTERNAL; + } + retryName.resize(IPC_NAME_SIZE); + ret = GetName(retryName, names); + if (ret != TILEXR_SUCCESS) { + TILEXR_LOG(ERROR) << "GetName pid retry error! ret: " << ret; + setenv("TILEXR_IPC_PID_MODE", "sdid", 1); + return ret; + } + setenv("TILEXR_IPC_PID_MODE", "sdid", 1); + if (OpenIpcMem(names) == TILEXR_SUCCESS) { + return TILEXR_SUCCESS; + } + } TILEXR_LOG(ERROR) << "rank: " << rank_ << " OpenIpcMem failed!"; return TILEXR_ERROR_INTERNAL; } @@ -909,26 +937,35 @@ int TileXRComm::SetMemoryName(string &name) int TileXRComm::SetIpcPidSdid(string &name, const uint32_t *pids, const int64_t *sdids) const { + const char *modeEnv = std::getenv("TILEXR_IPC_PID_MODE"); + bool forcePid = modeEnv != nullptr && std::string(modeEnv) == "pid"; + bool forceSdid = modeEnv != nullptr && std::string(modeEnv) == "sdid"; + bool defaultSdid = + physicalInfo_.chipName >= ChipName::CHIP_910_9391 && physicalInfo_.chipName < ChipName::CHIP_950; + bool useSdid = forceSdid || (!forcePid && defaultSdid); + TILEXR_LOG(INFO) << "SetIpcPidSdid mode=" << (useSdid ? "sdid" : "pid"); for (int i = 0; i < rankSize_; ++i) { if (i == rank_) { continue; } - if (physicalInfo_.chipName < ChipName::CHIP_910_9391) { - // 910B - int32_t pidInt32 = pids[i]; + int32_t pidInt32 = pids[i]; + if (!useSdid) { int rtRet = rtSetIpcMemPid(name.c_str(), &pidInt32, HCCL_IPC_PID_ARRAY_SIZE); if (rtRet != RT_ERROR_NONE) { TILEXR_LOG(ERROR) << "err " << rtRet; return TILEXR_ERROR_INTERNAL; } } else { - // 910A3 - int32_t pidInt32 = pids[i]; int rtRet = rtSetIpcMemorySuperPodPid(name.c_str(), sdids[i], &pidInt32, HCCL_IPC_PID_ARRAY_SIZE); if (rtRet != RT_ERROR_NONE) { - TILEXR_LOG(ERROR) << "err " << rtRet; - return TILEXR_ERROR_INTERNAL; + TILEXR_LOG(WARN) << "rtSetIpcMemorySuperPodPid err " << rtRet + << ", fallback to rtSetIpcMemPid"; + rtRet = rtSetIpcMemPid(name.c_str(), &pidInt32, HCCL_IPC_PID_ARRAY_SIZE); + if (rtRet != RT_ERROR_NONE) { + TILEXR_LOG(ERROR) << "err " << rtRet; + return TILEXR_ERROR_INTERNAL; + } } } } diff --git a/src/comm/tilexr_internal.cpp b/src/comm/tilexr_internal.cpp index 1e2bc34..0a8494f 100644 --- a/src/comm/tilexr_internal.cpp +++ b/src/comm/tilexr_internal.cpp @@ -39,7 +39,9 @@ const std::unordered_map CHIP_MAP = { {"Ascend950DT", ChipName::CHIP_950}, {"Ascend950DT_9581", ChipName::CHIP_950}, {"Ascend950DT_9584", ChipName::CHIP_950}, - {"Ascend950PR", ChipName::CHIP_950} + {"Ascend950DT_9592", ChipName::CHIP_950}, + {"Ascend950PR", ChipName::CHIP_950}, + {"Ascend950PR_9599", ChipName::CHIP_950} }; /** diff --git a/src/comm/udma/tilexr_udma_transport.cpp b/src/comm/udma/tilexr_udma_transport.cpp index 7bafa5f..be21083 100644 --- a/src/comm/udma/tilexr_udma_transport.cpp +++ b/src/comm/udma/tilexr_udma_transport.cpp @@ -12,8 +12,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -33,6 +35,29 @@ uint32_t Log2Uint64(uint64_t value) return result; } +bool UDMADiagEnabled() +{ + const char* value = std::getenv("TILEXR_UDMA_DEBUG"); + return value != nullptr && value[0] != '\0' && std::strcmp(value, "0") != 0; +} + +std::string PtrToHex(uint64_t value) +{ + std::ostringstream os; + os << "0x" << std::hex << value; + return os.str(); +} + +std::string EidToHex(const HccpEid& eid) +{ + std::ostringstream os; + os << std::hex << std::setfill('0'); + for (uint8_t byte : eid.raw) { + os << std::setw(2) << static_cast(byte); + } + return os.str(); +} + HccpEid SwapEidForDevice(const HccpEid& hccpEid) { HccpEid swapped {}; @@ -289,24 +314,29 @@ bool ResolveLocalEidRoute( } // namespace struct TileXRUDMATransport::PerEidState { + struct PeerQueueState { + int peer = -1; + void* chanHandle = nullptr; + void* cqHandle = nullptr; + void* qpHandle = nullptr; + void* remoteQpHandle = nullptr; + CqInfoT cqInfo {}; + QpCreateInfo qpInfo {}; + uint32_t tpn = 0; + void* cqPiAddr = nullptr; + void* cqCiAddr = nullptr; + void* sqPiAddr = nullptr; + void* sqCiAddr = nullptr; + void* wqeCntAddr = nullptr; + void* amoAddr = nullptr; + UDMAWQCtx localWq {}; + UDMACQCtx localCq {}; + }; + uint32_t eidIndex = 0; void* ctxHandle = nullptr; void* tokenHandle = nullptr; - void* chanHandle = nullptr; - void* cqHandle = nullptr; - void* qpHandle = nullptr; - CqInfoT cqInfo {}; - QpCreateInfo qpInfo {}; - std::vector remoteQpHandles; - std::vector tpnList; - void* cqPiAddr = nullptr; - void* cqCiAddr = nullptr; - void* sqPiAddr = nullptr; - void* sqCiAddr = nullptr; - void* wqeCntAddr = nullptr; - void* amoAddr = nullptr; - UDMAWQCtx localWq {}; - UDMACQCtx localCq {}; + std::map peerQueues; }; TileXRUDMATransport::TileXRUDMATransport() = default; @@ -348,17 +378,7 @@ int TileXRUDMATransport::Init(const TileXRUDMATransportOptions& options) Shutdown(); return ret; } - ret = CreateQueues(); - if (ret != TILEXR_SUCCESS) { - Shutdown(); - return ret; - } - ret = ImportQueues(); - if (ret != TILEXR_SUCCESS) { - Shutdown(); - return ret; - } - ret = RefreshUDMAInfo(); + ret = EnsureUDMAInfoBuffer(); if (ret != TILEXR_SUCCESS) { Shutdown(); return ret; @@ -424,6 +444,24 @@ int TileXRUDMATransport::BuildRoutes() return TILEXR_ERROR_INTERNAL; } eidCount_ = eidNum; + const bool diag = UDMADiagEnabled(); + if (diag) { + TILEXR_LOG(INFO) << "UDMA diag BuildRoutes rank " << options_.rank + << " devId=" << options_.devId + << " logicDevId=" << logicDevId_ + << " deviceIdOffset=" << deviceIdOffset_ + << " phyId=" << (logicDevId_ + deviceIdOffset_) + << " runtimeEidCount=" << eidNum; + for (unsigned int i = 0; i < eidNum; ++i) { + TILEXR_LOG(INFO) << "UDMA diag local runtime eid rank " << options_.rank + << " idx=" << devEids[i].eidIndex + << " name=" << devEids[i].name + << " die=" << devEids[i].dieId + << " chip=" << devEids[i].chipId + << " func=" << devEids[i].funcId + << " eid=" << EidToHex(devEids[i].eid); + } + } uint32_t localId = static_cast(options_.devId); bool topoReady = false; @@ -456,6 +494,16 @@ int TileXRUDMATransport::BuildRoutes() if (ret != TILEXR_SUCCESS) { return ret; } + if (diag) { + std::ostringstream ids; + for (int rank = 0; rank < options_.rankSize; ++rank) { + ids << " rank" << rank << "=" << allLocalIds[rank]; + } + TILEXR_LOG(INFO) << "UDMA diag route ids rank " << options_.rank + << " localId=" << localId + << " topoReady=" << (topoReady ? 1 : 0) + << ids.str(); + } std::vector localRouteByPeer(options_.rankSize, -1); for (int peer = 0; peer < options_.rankSize; ++peer) { @@ -471,6 +519,13 @@ int TileXRUDMATransport::BuildRoutes() } peerLocalEid_[peer] = localEid; localRouteByPeer[peer] = static_cast(localEid); + if (diag) { + TILEXR_LOG(INFO) << "UDMA diag local route rank " << options_.rank + << " devLocalId=" << localId + << " peer=" << peer + << " peerLocalId=" << allLocalIds[peer] + << " localEid=" << localEid; + } } std::vector allRouteByPeer(options_.rankSize * options_.rankSize, -1); @@ -488,6 +543,12 @@ int TileXRUDMATransport::BuildRoutes() remoteEid = static_cast(devEids[0].eidIndex); } peerRemoteEid_[peer] = static_cast(remoteEid); + if (diag) { + TILEXR_LOG(INFO) << "UDMA diag remote route rank " << options_.rank + << " peer=" << peer + << " localEid=" << peerLocalEid_[peer] + << " remoteEid=" << peerRemoteEid_[peer]; + } } return TILEXR_SUCCESS; } @@ -517,6 +578,7 @@ int TileXRUDMATransport::CreateContexts() bool found = false; CtxInitAttr attr {}; auto targetEidIt = localEidByEid_.find(eidIndex); + const DevEidInfo* matchedEid = nullptr; for (unsigned int i = 0; i < eidNum; ++i) { bool matched = infoList[i].eidIndex == eidIndex; if (targetEidIt != localEidByEid_.end()) { @@ -529,6 +591,7 @@ int TileXRUDMATransport::CreateContexts() attr.ub.eid = infoList[i].eid; attr.ub.eidIndex = infoList[i].eidIndex; localEidByEid_[eidIndex] = infoList[i].eid; + matchedEid = &infoList[i]; found = true; break; } @@ -544,6 +607,17 @@ int TileXRUDMATransport::CreateContexts() TILEXR_LOG(WARN) << "TileXR UDMA RaCtxInit failed: " << ret; return TILEXR_ERROR_INTERNAL; } + if (UDMADiagEnabled() && matchedEid != nullptr) { + TILEXR_LOG(INFO) << "UDMA diag ctx init rank " << options_.rank + << " eid=" << eidIndex + << " runtimeIdx=" << matchedEid->eidIndex + << " name=" << matchedEid->name + << " die=" << matchedEid->dieId + << " chip=" << matchedEid->chipId + << " func=" << matchedEid->funcId + << " eidValue=" << EidToHex(matchedEid->eid) + << " ctx=" << ctxHandle; + } void* tokenHandle = nullptr; HccpTokenId tokenId {}; ret = loader_.RaCtxTokenIdAlloc(ctxHandle, &tokenId, &tokenHandle); @@ -586,127 +660,170 @@ int TileXRUDMATransport::CreateQueues() state.eidIndex = ctxEntry.first; state.ctxHandle = ctxEntry.second; state.tokenHandle = tokenHandleByEid_[ctxEntry.first]; - state.remoteQpHandles.assign(options_.rankSize, nullptr); - state.tpnList.assign(options_.rankSize, 0); - - ChanInfoT chanInfo {}; - chanInfo.in.dataPlaneFlag.bs.poolCqCstm = 1; - int ret = loader_.RaCtxChanCreate(state.ctxHandle, &chanInfo, &state.chanHandle); - if (ret != 0) { - return TILEXR_ERROR_INTERNAL; - } - - state.cqInfo.in.chanHandle = state.chanHandle; - state.cqInfo.in.depth = TILEXR_UDMA_CQ_DEPTH; - state.cqInfo.in.ub.mode = JFC_MODE_USER_CTL_NORMAL; - ret = loader_.RaCtxCqCreate(state.ctxHandle, &state.cqInfo, &state.cqHandle); - if (ret != 0) { - return TILEXR_ERROR_INTERNAL; - } - state.localCq.cqn = 0; - state.localCq.bufAddr = state.cqInfo.out.bufAddr; - state.localCq.baseBkShift = Log2Uint64(state.cqInfo.out.cqeSize); - state.localCq.depth = state.cqInfo.in.depth; - if (AllocDeviceScalar(&state.cqPiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || - AllocDeviceScalar(&state.cqCiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS) { - return TILEXR_ERROR_INTERNAL; - } - state.localCq.headAddr = reinterpret_cast(state.cqPiAddr); - state.localCq.tailAddr = reinterpret_cast(state.cqCiAddr); - state.localCq.dbMode = UDMADBMode::SW_DB; - state.localCq.dbAddr = state.cqInfo.out.swdbAddr; - - QpCreateAttr qpAttr {}; - qpAttr.scqHandle = state.cqHandle; - qpAttr.rcqHandle = state.cqHandle; - qpAttr.srqHandle = state.cqHandle; - qpAttr.sqDepth = TILEXR_UDMA_SQ_DEPTH; - qpAttr.rqDepth = TILEXR_UDMA_RQ_DEPTH_DEFAULT; - qpAttr.transportMode = CONN_RM; - qpAttr.ub.mode = JETTY_MODE_USER_CTL_NORMAL; - qpAttr.ub.flag.value = 1; - qpAttr.ub.jfsFlag.value = 2; - qpAttr.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE; - qpAttr.ub.rnrRetry = 7; - qpAttr.ub.extMode.piType = 0; - qpAttr.ub.extMode.cstmFlag.bs.sqCstm = 0; - qpAttr.ub.extMode.sqebbNum = TILEXR_UDMA_SQ_DEPTH; - qpAttr.ub.tokenIdHandle = state.tokenHandle; - ret = loader_.RaCtxQpCreate(state.ctxHandle, &qpAttr, &state.qpInfo, &state.qpHandle); - if (ret != 0) { - return TILEXR_ERROR_INTERNAL; - } - state.localWq.wqn = 0; - state.localWq.bufAddr = state.qpInfo.ub.sqBuffVa; - state.localWq.baseBkShift = Log2Uint64(state.qpInfo.ub.wqebbSize); - state.localWq.depth = TILEXR_UDMA_SQ_BB_COUNT; - if (AllocDeviceScalar(&state.sqPiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || - AllocDeviceScalar(&state.sqCiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || - AllocDeviceScalar(&state.wqeCntAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || - AllocDeviceScalar(&state.amoAddr, sizeof(uint64_t)) != TILEXR_SUCCESS) { - return TILEXR_ERROR_INTERNAL; + for (const auto& route : peerLocalEid_) { + if (route.second != state.eidIndex) { + continue; + } + int ret = CreatePeerQueue(state, route.first); + if (ret != TILEXR_SUCCESS) { + return ret; + } } - state.localWq.headAddr = reinterpret_cast(state.sqPiAddr); - state.localWq.tailAddr = reinterpret_cast(state.sqCiAddr); - state.localWq.dbMode = UDMADBMode::SW_DB; - state.localWq.dbAddr = state.qpInfo.ub.dbAddr; - state.localWq.wqeCntAddr = reinterpret_cast(state.wqeCntAddr); - state.localWq.amoAddr = reinterpret_cast(state.amoAddr); states_[state.eidIndex] = state; } return states_.empty() ? TILEXR_ERROR_INTERNAL : TILEXR_SUCCESS; } +int TileXRUDMATransport::CreatePeerQueue(PerEidState& state, int peer) +{ + const bool diag = UDMADiagEnabled(); + PerEidState::PeerQueueState queue {}; + queue.peer = peer; + + ChanInfoT chanInfo {}; + chanInfo.in.dataPlaneFlag.bs.poolCqCstm = 1; + int ret = loader_.RaCtxChanCreate(state.ctxHandle, &chanInfo, &queue.chanHandle); + if (ret != 0) { + return TILEXR_ERROR_INTERNAL; + } + + queue.cqInfo.in.chanHandle = queue.chanHandle; + queue.cqInfo.in.depth = TILEXR_UDMA_CQ_DEPTH; + queue.cqInfo.in.ub.mode = JFC_MODE_USER_CTL_NORMAL; + ret = loader_.RaCtxCqCreate(state.ctxHandle, &queue.cqInfo, &queue.cqHandle); + if (ret != 0) { + return TILEXR_ERROR_INTERNAL; + } + queue.localCq.cqn = 0; + queue.localCq.bufAddr = queue.cqInfo.out.bufAddr; + queue.localCq.baseBkShift = Log2Uint64(queue.cqInfo.out.cqeSize); + queue.localCq.depth = queue.cqInfo.in.depth; + if (AllocDeviceScalar(&queue.cqPiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || + AllocDeviceScalar(&queue.cqCiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS) { + return TILEXR_ERROR_INTERNAL; + } + queue.localCq.headAddr = reinterpret_cast(queue.cqPiAddr); + queue.localCq.tailAddr = reinterpret_cast(queue.cqCiAddr); + queue.localCq.dbMode = UDMADBMode::SW_DB; + queue.localCq.dbAddr = queue.cqInfo.out.swdbAddr; + + QpCreateAttr qpAttr {}; + qpAttr.scqHandle = queue.cqHandle; + qpAttr.rcqHandle = queue.cqHandle; + qpAttr.srqHandle = queue.cqHandle; + qpAttr.sqDepth = TILEXR_UDMA_SQ_DEPTH; + qpAttr.rqDepth = TILEXR_UDMA_RQ_DEPTH_DEFAULT; + qpAttr.transportMode = CONN_RM; + qpAttr.ub.mode = JETTY_MODE_USER_CTL_NORMAL; + qpAttr.ub.flag.value = 1; + qpAttr.ub.jfsFlag.value = 2; + qpAttr.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE; + qpAttr.ub.rnrRetry = 7; + qpAttr.ub.extMode.piType = 0; + qpAttr.ub.extMode.cstmFlag.bs.sqCstm = 0; + qpAttr.ub.extMode.sqebbNum = TILEXR_UDMA_SQ_DEPTH; + qpAttr.ub.tokenIdHandle = state.tokenHandle; + ret = loader_.RaCtxQpCreate(state.ctxHandle, &qpAttr, &queue.qpInfo, &queue.qpHandle); + if (ret != 0) { + return TILEXR_ERROR_INTERNAL; + } + queue.localWq.wqn = 0; + queue.localWq.bufAddr = queue.qpInfo.ub.sqBuffVa; + queue.localWq.baseBkShift = Log2Uint64(queue.qpInfo.ub.wqebbSize); + queue.localWq.depth = TILEXR_UDMA_SQ_BB_COUNT; + if (AllocDeviceScalar(&queue.sqPiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || + AllocDeviceScalar(&queue.sqCiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || + AllocDeviceScalar(&queue.wqeCntAddr, sizeof(uint32_t)) != TILEXR_SUCCESS || + AllocDeviceScalar(&queue.amoAddr, sizeof(uint64_t)) != TILEXR_SUCCESS) { + return TILEXR_ERROR_INTERNAL; + } + queue.localWq.headAddr = reinterpret_cast(queue.sqPiAddr); + queue.localWq.tailAddr = reinterpret_cast(queue.sqCiAddr); + queue.localWq.dbMode = UDMADBMode::SW_DB; + queue.localWq.dbAddr = queue.qpInfo.ub.dbAddr; + queue.localWq.wqeCntAddr = reinterpret_cast(queue.wqeCntAddr); + queue.localWq.amoAddr = reinterpret_cast(queue.amoAddr); + if (diag) { + TILEXR_LOG(INFO) << "UDMA diag create peer queue rank " << options_.rank + << " peer=" << peer + << " eid=" << state.eidIndex + << " ctx=" << state.ctxHandle + << " chan=" << queue.chanHandle + << " qp=" << queue.qpHandle + << " cq=" << queue.cqHandle + << " sqBuf=" << PtrToHex(queue.localWq.bufAddr) + << " sqDb=" << PtrToHex(queue.localWq.dbAddr) + << " sqHead=" << PtrToHex(queue.localWq.headAddr) + << " sqTail=" << PtrToHex(queue.localWq.tailAddr) + << " wqeCnt=" << PtrToHex(queue.localWq.wqeCntAddr) + << " cqBuf=" << PtrToHex(queue.localCq.bufAddr) + << " cqDb=" << PtrToHex(queue.localCq.dbAddr) + << " cqHead=" << PtrToHex(queue.localCq.headAddr) + << " cqTail=" << PtrToHex(queue.localCq.tailAddr) + << " wqebbSize=" << queue.qpInfo.ub.wqebbSize + << " cqeSize=" << queue.cqInfo.out.cqeSize; + } + state.peerQueues[peer] = queue; + return TILEXR_SUCCESS; +} + int TileXRUDMATransport::ImportQueues() { - std::vector localImports(eidCount_); - std::vector localKeys(eidCount_); + const bool diag = UDMADiagEnabled(); + std::vector localPeerImports(options_.rankSize); + std::vector localPeerKeys(options_.rankSize); for (const auto& stateEntry : states_) { const auto& state = stateEntry.second; - if (state.eidIndex >= eidCount_) { - return TILEXR_ERROR_INTERNAL; + for (const auto& queueEntry : state.peerQueues) { + const int peer = queueEntry.first; + const auto& queue = queueEntry.second; + localPeerImports[peer].in.ub.mode = JETTY_IMPORT_MODE_NORMAL; + localPeerImports[peer].in.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE; + localPeerImports[peer].in.ub.policy = JETTY_GRP_POLICY_RR; + localPeerImports[peer].in.ub.type = TARGET_TYPE_JETTY; + localPeerImports[peer].in.ub.flag.bs.tokenPolicy = TOKEN_POLICY_PLAIN_TEXT; + localPeerImports[peer].in.ub.tpType = 1; + localPeerImports[peer].in.key = queue.qpInfo.key; + localPeerKeys[peer] = queue.qpInfo.key; } - localImports[state.eidIndex].in.ub.mode = JETTY_IMPORT_MODE_NORMAL; - localImports[state.eidIndex].in.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE; - localImports[state.eidIndex].in.ub.policy = JETTY_GRP_POLICY_RR; - localImports[state.eidIndex].in.ub.type = TARGET_TYPE_JETTY; - localImports[state.eidIndex].in.ub.flag.bs.tokenPolicy = TOKEN_POLICY_PLAIN_TEXT; - localImports[state.eidIndex].in.ub.tpType = 1; - localKeys[state.eidIndex] = state.qpInfo.key; } - std::vector allImports(options_.rankSize * eidCount_); - int ret = options_.exchange->AllGather(localImports.data(), localImports.size(), allImports.data()); + std::vector allImports(options_.rankSize * options_.rankSize); + int ret = options_.exchange->AllGather(localPeerImports.data(), localPeerImports.size(), allImports.data()); if (ret != TILEXR_SUCCESS) { return ret; } - std::vector allKeys(options_.rankSize * eidCount_); - ret = options_.exchange->AllGather(localKeys.data(), localKeys.size(), allKeys.data()); + std::vector allKeys(options_.rankSize * options_.rankSize); + ret = options_.exchange->AllGather(localPeerKeys.data(), localPeerKeys.size(), allKeys.data()); if (ret != TILEXR_SUCCESS) { return ret; } for (auto& stateEntry : states_) { auto& state = stateEntry.second; - for (int peer = 0; peer < options_.rankSize; ++peer) { - if (peer == options_.rank) { - continue; - } - const auto localRoute = peerLocalEid_.find(peer); - if (localRoute == peerLocalEid_.end() || localRoute->second != state.eidIndex) { - continue; - } + for (auto& queueEntry : state.peerQueues) { + const int peer = queueEntry.first; + auto& queue = queueEntry.second; const uint32_t remoteEid = peerRemoteEid_[peer]; if (remoteEid >= eidCount_) { return TILEXR_ERROR_INTERNAL; } - QpImportInfoT importInfo = allImports[peer * eidCount_ + remoteEid]; - importInfo.in.key = allKeys[peer * eidCount_ + remoteEid]; - ret = loader_.RaCtxQpImport(state.ctxHandle, &importInfo, &state.remoteQpHandles[peer]); + QpImportInfoT importInfo = allImports[(peer * options_.rankSize + options_.rank)]; + importInfo.in.key = allKeys[(peer * options_.rankSize + options_.rank)]; + ret = loader_.RaCtxQpImport(state.ctxHandle, &importInfo, &queue.remoteQpHandle); if (ret != 0) { return TILEXR_ERROR_INTERNAL; } - state.tpnList[peer] = importInfo.out.ub.tpn; + queue.tpn = importInfo.out.ub.tpn; + if (diag) { + TILEXR_LOG(INFO) << "UDMA diag import qp rank " << options_.rank + << " peer=" << peer + << " localEid=" << state.eidIndex + << " remoteEid=" << remoteEid + << " remoteQp=" << queue.remoteQpHandle + << " tpn=" << queue.tpn + << " keySize=" << static_cast(importInfo.in.key.size); + } } } return TILEXR_SUCCESS; @@ -722,6 +839,7 @@ uint32_t TileXRUDMATransport::FallbackLocalEid() const int TileXRUDMATransport::RefreshUDMAInfo() { + const bool diag = UDMADiagEnabled(); if (eidCount_ == 0 || states_.empty()) { return TILEXR_ERROR_INTERNAL; } @@ -785,11 +903,36 @@ int TileXRUDMATransport::RefreshUDMAInfo() if (stateIt == states_.end()) { stateIt = fallbackIt; } - const auto& state = stateIt->second; - sq[rank] = state.localWq; - rq[rank] = state.localWq; - scq[rank] = state.localCq; - rcq[rank] = state.localCq; + auto& state = stateIt->second; + PerEidState::PeerQueueState* queuePtr = nullptr; + if (rank == options_.rank) { + if (!state.peerQueues.empty()) { + queuePtr = &state.peerQueues.begin()->second; + } else if (!fallbackIt->second.peerQueues.empty()) { + queuePtr = &fallbackIt->second.peerQueues.begin()->second; + } + } else { + const auto queueIt = state.peerQueues.find(rank); + if (queueIt == state.peerQueues.end()) { + return TILEXR_ERROR_INTERNAL; + } + queuePtr = &queueIt->second; + } + if (queuePtr == nullptr) { + return TILEXR_ERROR_INTERNAL; + } + auto& queue = *queuePtr; + if (!registeredMem_.empty()) { + const auto& localMrs = registeredMem_.begin()->second; + const auto localMrIt = localMrs.find(localEid); + if (localMrIt != localMrs.end()) { + queue.localWq.localTokenId = localMrIt->second.tokenId; + } + } + sq[rank] = queue.localWq; + rq[rank] = queue.localWq; + scq[rank] = queue.localCq; + rcq[rank] = queue.localCq; if (rank == options_.rank) { const auto localMemIt = localMemInfoByEid_.find(localEid); if (localMemIt != localMemInfoByEid_.end()) { @@ -797,17 +940,42 @@ int TileXRUDMATransport::RefreshUDMAInfo() } } else { mem[rank] = allMem[rank * eidCount_ + remoteEid]; - mem[rank].tpn = state.tpnList[rank]; + mem[rank].tpn = queue.tpn; } mem[rank].eidAddr = reinterpret_cast( eidTableDev_ + (rank * eidCount_ + remoteEid) * sizeof(HccpEid)); + if (diag) { + TILEXR_LOG(INFO) << "UDMA diag info image rank " << options_.rank + << " entryRank=" << rank + << " localEid=" << localEid + << " remoteEid=" << remoteEid + << " sqBuf=" << PtrToHex(sq[rank].bufAddr) + << " sqHead=" << PtrToHex(sq[rank].headAddr) + << " sqTail=" << PtrToHex(sq[rank].tailAddr) + << " localTokenId=" << sq[rank].localTokenId + << " wqeCnt=" << PtrToHex(sq[rank].wqeCntAddr) + << " cqBuf=" << PtrToHex(scq[rank].bufAddr) + << " cqTail=" << PtrToHex(scq[rank].tailAddr) + << " memAddr=" << PtrToHex(mem[rank].addr) + << " memLen=" << mem[rank].len + << " memTid=" << mem[rank].tid + << " memTpn=" << mem[rank].tpn + << " memEidAddr=" << PtrToHex(mem[rank].eidAddr); + } } - if (udmaInfoDev_ == nullptr) { - const size_t oneRankSize = 2 * sizeof(UDMAWQCtx) + 2 * sizeof(UDMACQCtx) + sizeof(UDMAMemInfo); - udmaInfoSize_ = static_cast(sizeof(UDMAInfo) + oneRankSize * options_.rankSize); + const size_t oneRankSize = 2 * sizeof(UDMAWQCtx) + 2 * sizeof(UDMACQCtx) + sizeof(UDMAMemInfo); + const uint32_t requiredInfoSize = + static_cast(sizeof(UDMAInfo) + oneRankSize * options_.rankSize); + if (udmaInfoDev_ == nullptr || udmaInfoSize_ < requiredInfoSize) { + if (udmaInfoDev_ != nullptr) { + aclrtFree(udmaInfoDev_); + udmaInfoDev_ = nullptr; + } + udmaInfoSize_ = requiredInfoSize; ret = aclrtMalloc(reinterpret_cast(&udmaInfoDev_), udmaInfoSize_, ACL_MEM_MALLOC_HUGE_FIRST); if (ret != ACL_SUCCESS) { + udmaInfoSize_ = 0; return TILEXR_ERROR_INTERNAL; } } @@ -825,28 +993,81 @@ int TileXRUDMATransport::RefreshUDMAInfo() return TILEXR_SUCCESS; } +int TileXRUDMATransport::EnsureUDMAInfoBuffer() +{ + if (udmaInfoDev_ != nullptr) { + return TILEXR_SUCCESS; + } + UDMAInfo info {}; + info.qpNum = 1; + udmaInfoSize_ = static_cast(sizeof(UDMAInfo)); + int ret = aclrtMalloc(reinterpret_cast(&udmaInfoDev_), udmaInfoSize_, ACL_MEM_MALLOC_HUGE_FIRST); + if (ret != ACL_SUCCESS) { + return TILEXR_ERROR_INTERNAL; + } + ret = aclrtMemcpy(udmaInfoDev_, udmaInfoSize_, &info, sizeof(info), ACL_MEMCPY_HOST_TO_DEVICE); + if (ret != ACL_SUCCESS) { + aclrtFree(udmaInfoDev_); + udmaInfoDev_ = nullptr; + udmaInfoSize_ = 0; + return TILEXR_ERROR_INTERNAL; + } + return TILEXR_SUCCESS; +} + int TileXRUDMATransport::RegisterMemory(GM_ADDR localPtr, size_t bytes) { if (!available_ || localPtr == nullptr || bytes == 0) { return TILEXR_ERROR_NOT_FOUND; } + const bool diag = UDMADiagEnabled(); + if (diag) { + TILEXR_LOG(INFO) << "UDMA diag register memory begin rank " << options_.rank + << " ptr=" << PtrToHex(reinterpret_cast(localPtr)) + << " bytes=" << bytes; + } + CleanupMemory(); + CleanupQueues(); + registeredPtr_ = nullptr; int ret = RegisterMemoryOnContexts(localPtr, bytes); if (ret != TILEXR_SUCCESS) { + TILEXR_LOG(WARN) << "UDMA register memory on contexts failed rank " << options_.rank + << " ret=" << ret; return ret; } registeredPtr_ = localPtr; + ret = CreateQueues(); + if (ret != TILEXR_SUCCESS) { + TILEXR_LOG(WARN) << "UDMA create queues after memory register failed rank " << options_.rank + << " ret=" << ret; + return ret; + } + ret = ImportQueues(); + if (ret != TILEXR_SUCCESS) { + TILEXR_LOG(WARN) << "UDMA import queues after memory register failed rank " << options_.rank + << " ret=" << ret; + return ret; + } ret = ExchangeAndImportMemory(); if (ret != TILEXR_SUCCESS) { + TILEXR_LOG(WARN) << "UDMA exchange/import memory failed rank " << options_.rank + << " ret=" << ret; + return ret; + } + ret = RefreshUDMAInfo(); + if (ret != TILEXR_SUCCESS) { + TILEXR_LOG(WARN) << "UDMA refresh info after memory register failed rank " << options_.rank + << " ret=" << ret; return ret; } - return RefreshUDMAInfo(); + if (diag) { + TILEXR_LOG(INFO) << "UDMA diag register memory end rank " << options_.rank; + } + return TILEXR_SUCCESS; } int TileXRUDMATransport::RegisterMemoryOnContexts(GM_ADDR localPtr, size_t bytes) { - if (registeredPtr_ != nullptr) { - UnregisterMemory(registeredPtr_); - } std::map byEid; localMemInfoByEid_.clear(); for (const auto& ctxEntry : ctxHandleByEid_) { @@ -857,12 +1078,22 @@ int TileXRUDMATransport::RegisterMemoryOnContexts(GM_ADDR localPtr, size_t bytes mrInfo.in.mem.size = bytes; mrInfo.in.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE; mrInfo.in.ub.tokenIdHandle = tokenHandle; + mrInfo.in.ub.flags.bs.cacheable = 0; mrInfo.in.ub.flags.bs.access = MEM_SEG_ACCESS_DEFAULT; + mrInfo.in.ub.flags.bs.nonPin = 0; + mrInfo.in.ub.flags.bs.userIova = 0; mrInfo.in.ub.flags.bs.tokenIdValid = 1; mrInfo.in.ub.flags.bs.tokenPolicy = MEM_SEG_TOKEN_PLAIN_TEXT; void* lmemHandle = nullptr; int ret = loader_.RaCtxLmemRegister(ctxEntry.second, &mrInfo, &lmemHandle); if (ret != 0 || lmemHandle == nullptr) { + TILEXR_LOG(WARN) << "UDMA RaCtxLmemRegister failed rank " << options_.rank + << " eid=" << eidIndex + << " ctx=" << ctxEntry.second + << " ptr=" << PtrToHex(reinterpret_cast(localPtr)) + << " bytes=" << bytes + << " ret=" << ret + << " handle=" << lmemHandle; return TILEXR_ERROR_INTERNAL; } @@ -888,6 +1119,15 @@ int TileXRUDMATransport::RegisterMemoryOnContexts(GM_ADDR localPtr, size_t bytes memInfo.len = static_cast(std::min(bytes, UINT32_MAX)); memInfo.addr = reinterpret_cast(localPtr); localMemInfoByEid_[eidIndex] = memInfo; + if (UDMADiagEnabled()) { + TILEXR_LOG(INFO) << "UDMA diag lmem registered rank " << options_.rank + << " eid=" << eidIndex + << " lmem=" << lmemHandle + << " tokenId=" << result.tokenId + << " tid=" << memInfo.tid + << " targetSeg=" << PtrToHex(result.targetSegHandle) + << " keySize=" << static_cast(result.key.size); + } } registeredMem_[reinterpret_cast(localPtr)] = byEid; return TILEXR_SUCCESS; @@ -903,10 +1143,13 @@ int TileXRUDMATransport::ExchangeAndImportMemory() std::vector allCounts(options_.rankSize); int ret = options_.exchange->AllGather(&localCount, 1, allCounts.data()); if (ret != TILEXR_SUCCESS) { + TILEXR_LOG(WARN) << "UDMA memory count allgather failed rank " << options_.rank + << " ret=" << ret; return ret; } const uint32_t maxCount = *std::max_element(allCounts.begin(), allCounts.end()); if (maxCount == 0) { + TILEXR_LOG(WARN) << "UDMA memory exchange found zero max registration count rank " << options_.rank; return TILEXR_ERROR_INTERNAL; } @@ -927,6 +1170,9 @@ int TileXRUDMATransport::ExchangeAndImportMemory() std::vector all(options_.rankSize * maxCount); ret = options_.exchange->AllGather(local.data(), local.size(), all.data()); if (ret != TILEXR_SUCCESS) { + TILEXR_LOG(WARN) << "UDMA memory info allgather failed rank " << options_.rank + << " ret=" << ret + << " localEntries=" << local.size(); return ret; } @@ -945,6 +1191,11 @@ int TileXRUDMATransport::ExchangeAndImportMemory() } } if (remote == nullptr) { + TILEXR_LOG(WARN) << "UDMA remote memory info missing rank " << options_.rank + << " peer=" << peer + << " remoteEid=" << remoteEid + << " peerCount=" << allCounts[peer] + << " maxCount=" << maxCount; return TILEXR_ERROR_INTERNAL; } const uint32_t localEid = peerLocalEid_[peer]; @@ -956,9 +1207,26 @@ int TileXRUDMATransport::ExchangeAndImportMemory() void* remoteHandle = nullptr; ret = loader_.RaCtxRmemImport(ctxHandleByEid_[localEid], &importInfo, &remoteHandle); if (ret != 0 || remoteHandle == nullptr) { + TILEXR_LOG(WARN) << "UDMA RaCtxRmemImport failed rank " << options_.rank + << " peer=" << peer + << " localEid=" << localEid + << " remoteEid=" << remoteEid + << " ctx=" << ctxHandleByEid_[localEid] + << " ret=" << ret + << " handle=" << remoteHandle + << " remoteToken=" << remote->mr.tokenValue + << " remoteTokenId=" << remote->mr.tokenId + << " remoteKeySize=" << static_cast(remote->mr.key.size); return TILEXR_ERROR_INTERNAL; } remoteMemHandles_[peer] = remoteHandle; + if (UDMADiagEnabled()) { + TILEXR_LOG(INFO) << "UDMA diag rmem imported rank " << options_.rank + << " peer=" << peer + << " localEid=" << localEid + << " remoteEid=" << remoteEid + << " remoteHandle=" << remoteHandle; + } } return TILEXR_SUCCESS; } @@ -1002,26 +1270,32 @@ void TileXRUDMATransport::CleanupQueues() { for (auto& stateEntry : states_) { auto& state = stateEntry.second; - for (void* remoteQp : state.remoteQpHandles) { - if (remoteQp != nullptr && state.ctxHandle != nullptr) { - loader_.RaCtxQpUnimport(state.ctxHandle, remoteQp); + for (auto& queueEntry : state.peerQueues) { + auto& queue = queueEntry.second; + if (queue.remoteQpHandle != nullptr && state.ctxHandle != nullptr) { + loader_.RaCtxQpUnimport(state.ctxHandle, queue.remoteQpHandle); + queue.remoteQpHandle = nullptr; } + if (queue.qpHandle != nullptr) { + loader_.RaCtxQpDestroy(queue.qpHandle); + queue.qpHandle = nullptr; + } + if (queue.cqHandle != nullptr && state.ctxHandle != nullptr) { + loader_.RaCtxCqDestroy(state.ctxHandle, queue.cqHandle); + queue.cqHandle = nullptr; + } + if (queue.chanHandle != nullptr && state.ctxHandle != nullptr) { + loader_.RaCtxChanDestroy(state.ctxHandle, queue.chanHandle); + queue.chanHandle = nullptr; + } + FreeDeviceScalar(queue.cqPiAddr); + FreeDeviceScalar(queue.cqCiAddr); + FreeDeviceScalar(queue.sqPiAddr); + FreeDeviceScalar(queue.sqCiAddr); + FreeDeviceScalar(queue.wqeCntAddr); + FreeDeviceScalar(queue.amoAddr); } - if (state.qpHandle != nullptr) { - loader_.RaCtxQpDestroy(state.qpHandle); - } - if (state.cqHandle != nullptr && state.ctxHandle != nullptr) { - loader_.RaCtxCqDestroy(state.ctxHandle, state.cqHandle); - } - if (state.chanHandle != nullptr && state.ctxHandle != nullptr) { - loader_.RaCtxChanDestroy(state.ctxHandle, state.chanHandle); - } - FreeDeviceScalar(state.cqPiAddr); - FreeDeviceScalar(state.cqCiAddr); - FreeDeviceScalar(state.sqPiAddr); - FreeDeviceScalar(state.sqCiAddr); - FreeDeviceScalar(state.wqeCntAddr); - FreeDeviceScalar(state.amoAddr); + state.peerQueues.clear(); } states_.clear(); } diff --git a/src/comm/udma/tilexr_udma_transport.h b/src/comm/udma/tilexr_udma_transport.h index 0a787d1..d3adfac 100644 --- a/src/comm/udma/tilexr_udma_transport.h +++ b/src/comm/udma/tilexr_udma_transport.h @@ -53,7 +53,9 @@ class TileXRUDMATransport { int BuildRoutes(); int CreateContexts(); int CreateQueues(); + int CreatePeerQueue(PerEidState& state, int peer); int ImportQueues(); + int EnsureUDMAInfoBuffer(); int RefreshUDMAInfo(); int RegisterMemoryOnContexts(GM_ADDR localPtr, size_t bytes); int ExchangeAndImportMemory(); diff --git a/src/include/comm_args.h b/src/include/comm_args.h index ed46f82..74931a6 100644 --- a/src/include/comm_args.h +++ b/src/include/comm_args.h @@ -34,7 +34,7 @@ using GM_ADDR = uint8_t*; namespace TileXR { -constexpr int TILEXR_MAX_RANK_SIZE = 128; // 最大支持的npu卡数 +constexpr int TILEXR_MAX_RANK_SIZE = 256; // 最大支持的npu卡数 constexpr int RANK_SIZE_TWO = 2; // 可用SIO的规模,以及是否需要跨卡搬运数据核的分界规模 constexpr int64_t IPC_BUFF_MAX_SIZE = 100 * 1024 * 1024; constexpr int64_t IPC_DATA_OFFSET = 2 * 1024 * 1024; // 前2MB作为flag标志位,之后100MB作为数据存储 diff --git a/src/include/tilexr_udma.h b/src/include/tilexr_udma.h index 43f15c0..c931e80 100644 --- a/src/include/tilexr_udma.h +++ b/src/include/tilexr_udma.h @@ -182,23 +182,28 @@ __aicore__ inline void UDMAFillNotifyData( __aicore__ inline void UDMAFillSqeCtx( __gm__ UDMASqeCtx* sqeCtx, __gm__ uint8_t* remoteAddr, __gm__ UDMAMemInfo* remoteMemInfo, - uint32_t curHead, UDMAOpcode opcode, const UDMASignalParams* signalParams) + uint32_t curHead, uint32_t depth, UDMAOpcode opcode, const UDMASignalParams* signalParams) { + sqeCtx->sqeBbIdx = curHead % depth; sqeCtx->opcode = static_cast(opcode); sqeCtx->flag = 0b00100010; + sqeCtx->rsv0 = 0; sqeCtx->nf = 0; sqeCtx->tokenEn = remoteMemInfo->tokenValueValid; sqeCtx->rmtJettyType = remoteMemInfo->rmtJettyType; - sqeCtx->owner = (curHead & TILEXR_UDMA_SQ_BB_COUNT) == 0 ? 1 : 0; + sqeCtx->owner = (curHead & depth) == 0 ? 1 : 0; sqeCtx->targetHint = remoteMemInfo->targetHint; + sqeCtx->rsv1 = 0; sqeCtx->inlineMsgLen = 0; sqeCtx->tpId = remoteMemInfo->tpn; sqeCtx->sgeNum = 1; sqeCtx->rmtJettyOrSegId = remoteMemInfo->tid; + sqeCtx->rsv2 = 0; sqeCtx->rmtTokenValue = remoteMemInfo->rmtTokenValue; sqeCtx->udfType = 0; sqeCtx->reduceDataType = 0; sqeCtx->reduceOpcode = 0; + sqeCtx->rsv3 = 0; uint64_t remoteAddrValue = reinterpret_cast(remoteAddr); sqeCtx->rmtAddrLOrTokenId = remoteAddrValue & 0xFFFFFFFF; sqeCtx->rmtAddrHOrTokenValue = (remoteAddrValue >> 32) & 0xFFFFFFFF; @@ -209,10 +214,11 @@ __aicore__ inline void UDMAFillSqeCtx( } __aicore__ inline void UDMAFillSgeCtx( - __gm__ UDMASgeCtx* sgeCtx, uint64_t messageLen, __gm__ uint8_t* localAddr) + __gm__ UDMASgeCtx* sgeCtx, uint64_t messageLen, __gm__ uint8_t* localAddr, + __gm__ UDMAWQCtx* qpCtxEntry) { sgeCtx->len = messageLen; - sgeCtx->tokenId = 0; + sgeCtx->tokenId = qpCtxEntry->localTokenId; sgeCtx->va = reinterpret_cast(localAddr); } @@ -240,18 +246,19 @@ __aicore__ inline void UDMAPostSend( { __gm__ UDMAWQCtx* qpCtxEntry = UDMAGetWQCtx(udmaInfo, pe, qpIdx); uint32_t wqeSize = 1U << qpCtxEntry->baseBkShift; + uint32_t depth = qpCtxEntry->depth; uint32_t curHead = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->headAddr), 0); uint32_t wqeCnt = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->wqeCntAddr), 0); UDMAPollCQWhenSQOverflow(udmaInfo, qpCtxEntry, wqeCnt, pe, qpIdx); __gm__ UDMAMemInfo* remoteMemInfo = UDMAGetRemoteMemInfo(udmaInfo, pe); __gm__ uint8_t* wqeAddr = - reinterpret_cast<__gm__ uint8_t*>(qpCtxEntry->bufAddr + wqeSize * (curHead % TILEXR_UDMA_SQ_BB_COUNT)); + reinterpret_cast<__gm__ uint8_t*>(qpCtxEntry->bufAddr + wqeSize * (curHead % depth)); __gm__ UDMASqeCtx* sqeCtx = reinterpret_cast<__gm__ UDMASqeCtx*>(wqeAddr); - UDMAFillSqeCtx(sqeCtx, remoteAddr, remoteMemInfo, curHead, opcode, signalParams); + UDMAFillSqeCtx(sqeCtx, remoteAddr, remoteMemInfo, curHead, qpCtxEntry->depth, opcode, signalParams); __gm__ UDMASgeCtx* sgeCtx = reinterpret_cast<__gm__ UDMASgeCtx*>(UDMAGetSgeCtxAddr(wqeAddr, opcode)); - UDMAFillSgeCtx(sgeCtx, messageLen, localAddr); + UDMAFillSgeCtx(sgeCtx, messageLen, localAddr, qpCtxEntry); uint32_t wqeBbCnt = UDMAWqeBBCnt(opcode); UDMACleanCacheLines(wqeAddr, wqeSize * wqeBbCnt); curHead += wqeBbCnt; @@ -368,6 +375,15 @@ __aicore__ inline void UDMAQuiet(const __gm__ CommArgs* args, int targetRank) (void)UDMAPollCQ(udmaInfo, targetRank, 0, wqeCnt); } +__aicore__ inline uint32_t UDMAQuietStatus(const __gm__ CommArgs* args, int targetRank) +{ + if (!UDMAEnabled(args)) return 0xFFFFFFFFU; + __gm__ UDMAInfo* udmaInfo = GetUDMAInfo(args); + __gm__ UDMAWQCtx* qpCtxEntry = UDMAGetWQCtx(udmaInfo, targetRank, 0); + uint32_t wqeCnt = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->wqeCntAddr), 0); + return UDMAPollCQ(udmaInfo, targetRank, 0, wqeCnt); +} + } // namespace TileXR #endif // TILEXR_UDMA_H diff --git a/src/include/tilexr_udma_types.h b/src/include/tilexr_udma_types.h index 57d77e3..1fd6ebd 100644 --- a/src/include/tilexr_udma_types.h +++ b/src/include/tilexr_udma_types.h @@ -52,6 +52,7 @@ struct UDMAWQCtx { UDMADBMode dbMode; uint64_t dbAddr; uint32_t sl; + uint32_t localTokenId; uint64_t wqeCntAddr; uint64_t amoAddr; }; diff --git a/tests/udma/CMakeLists.txt b/tests/udma/CMakeLists.txt index 6c1e41a..7e5320a 100644 --- a/tests/udma/CMakeLists.txt +++ b/tests/udma/CMakeLists.txt @@ -43,7 +43,7 @@ message(STATUS "TILEXR_ROOT: ${TILEXR_ROOT}") # 查找库 find_library(TILEXR_LIB tile-comm - HINTS "${TILEXR_ROOT}/install/lib" "${TILEXR_ROOT}/build/src/comm" + HINTS "${TILEXR_ROOT}/install/lib" "${TILEXR_ROOT}/install/lib64" "${TILEXR_ROOT}/build/src/comm" REQUIRED) message(STATUS "Found tile-comm: ${TILEXR_LIB}") @@ -71,6 +71,47 @@ add_executable(test_tilexr_udma_registry target_include_directories(test_tilexr_udma_registry PRIVATE ${TILEXR_ROOT}/src/include + ${CMAKE_CURRENT_SOURCE_DIR} +) + +add_executable(test_tilexr_udma_alltoall_layout + unit/test_tilexr_udma_alltoall_layout.cpp +) + +target_include_directories(test_tilexr_udma_alltoall_layout PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} +) + +target_compile_definitions(test_tilexr_udma_alltoall_layout PRIVATE + TILEXR_SOURCE_ROOT="${TILEXR_ROOT}" +) + +add_executable(test_tilexr_udma_allreduce_layout + unit/test_tilexr_udma_allreduce_layout.cpp +) + +target_include_directories(test_tilexr_udma_allreduce_layout PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} +) + +target_compile_definitions(test_tilexr_udma_allreduce_layout PRIVATE + TILEXR_SOURCE_ROOT="${TILEXR_ROOT}" +) + +add_executable(test_tilexr_chip_map_sources + unit/test_tilexr_chip_map_sources.cpp +) + +target_compile_definitions(test_tilexr_chip_map_sources PRIVATE + TILEXR_SOURCE_ROOT="${TILEXR_ROOT}" +) + +add_executable(test_tilexr_ipc_pid_mode_sources + unit/test_tilexr_ipc_pid_mode_sources.cpp +) + +target_compile_definitions(test_tilexr_ipc_pid_mode_sources PRIVATE + TILEXR_SOURCE_ROOT="${TILEXR_ROOT}" ) add_executable(test_tilexr_udma_transport_layout @@ -83,6 +124,10 @@ target_include_directories(test_tilexr_udma_transport_layout PRIVATE ${TILEXR_ROOT}/src/comm ) +target_compile_definitions(test_tilexr_udma_transport_layout PRIVATE + TILEXR_SOURCE_ROOT="${TILEXR_ROOT}" +) + # 集成测试:TileXR UDMA add_executable(test_tilexr_udma integration/test_tilexr_udma.cpp @@ -98,6 +143,10 @@ target_link_libraries(test_tilexr_udma set(INSTALL_TARGETS test_tilexr_udma test_tilexr_udma_registry + test_tilexr_udma_alltoall_layout + test_tilexr_udma_allreduce_layout + test_tilexr_chip_map_sources + test_tilexr_ipc_pid_mode_sources test_tilexr_udma_transport_layout ) @@ -162,6 +211,7 @@ if(BUILD_TILEXR_UDMA_DEMO) -I${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/ -I${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/runtime/ -I${ASCEND_HOME_PATH}/${ARCH}-linux/include/ + -I${ASCEND_HOME_PATH}/${ARCH}-linux/asc/include/ -I${ASCEND_DRIVER_PATH}/kernel/inc -I${TILEXR_ROOT}/3rdparty -I${TILEXR_ROOT}/src/include @@ -175,6 +225,7 @@ if(BUILD_TILEXR_UDMA_DEMO) -shared ${TILEXR_UDMA_KERNEL_LINK_OPTIONS} -DCATLASS_ARCH=${TILEXR_UDMA_CATLASS_ARCH} + -DTILEXR_UDMA_FORCE_ENABLE=1 ${TILEXR_UDMA_DEMO_KERNEL_INCLUDES} "${CMAKE_CURRENT_SOURCE_DIR}/demo/tilexr_udma_demo_kernel.cpp" -L${ASCEND_DRIVER_PATH}/lib64/driver @@ -194,6 +245,7 @@ if(BUILD_TILEXR_UDMA_DEMO) DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/demo/tilexr_udma_demo_kernel.cpp" "${TILEXR_ROOT}/src/include/tilexr_udma.h" + "${TILEXR_ROOT}/src/include/tilexr_data_as_flag.h" VERBATIM COMMENT "Building TileXR UDMA demo kernel with bisheng" ) diff --git a/tests/udma/build.sh b/tests/udma/build.sh index 9486685..1ee2bb3 100755 --- a/tests/udma/build.sh +++ b/tests/udma/build.sh @@ -27,8 +27,28 @@ mkdir -p "${INSTALL_DIR}" cd "${BUILD_DIR}" +find_bisheng_dir() { + if command -v bisheng >/dev/null 2>&1; then + dirname "$(command -v bisheng)" + return 0 + fi + for candidate in \ + "${ASCEND_HOME_PATH}/compiler/bisheng" \ + "${ASCEND_HOME_PATH}/tools/bisheng_compiler/bin/bisheng"; do + if [ -x "${candidate}" ]; then + dirname "${candidate}" + return 0 + fi + done + find /usr/local/Ascend -path "*/tools/bisheng_compiler/bin/bisheng" -type f -executable 2>/dev/null | + head -n 1 | + xargs -r dirname +} + # 配置 -if command -v bisheng >/dev/null 2>&1; then +BISHENG_DIR=$(find_bisheng_dir) +if [ -n "${BISHENG_DIR}" ]; then + export PATH="${BISHENG_DIR}:${PATH}" DEMO_OPTION="-DBUILD_TILEXR_UDMA_DEMO=ON" else echo "WARN: bisheng not found; TileXR UDMA communication demo target will be skipped." @@ -52,6 +72,7 @@ echo "" echo "Available tests:" echo " - test_tilexr_udma_transport_layout : UDMA info layout unit tests" echo " - test_tilexr_udma_registry : registered-memory metadata unit tests" +echo " - test_tilexr_udma_allreduce_layout : all-reduce demo layout unit tests" echo " - test_tilexr_udma : TileXR integration tests" if [ -f "${INSTALL_DIR}/bin/tilexr_udma_demo" ]; then echo " - tilexr_udma_demo : TileXR UDMA communication demo" diff --git a/tests/udma/demo/ALLTOALL_8P_RUNBOOK.md b/tests/udma/demo/ALLTOALL_8P_RUNBOOK.md new file mode 100644 index 0000000..4b9587e --- /dev/null +++ b/tests/udma/demo/ALLTOALL_8P_RUNBOOK.md @@ -0,0 +1,201 @@ +# TileXR UDMA All-to-All 8P Runbook + +This runbook records the 8P all-to-all demo version based on commit +`e552736 Add UDMA demo checker collectives`. + +## Scope + +- Demo path: `tests/udma/demo` +- Operator mode: `test_type=2`, all-to-all UDMA put +- Target hardware: A5 / Ascend950 / 950 +- Process model: one local process per rank +- Validated baseline: `rank_size=8`, `npu_count=8`, `first_npu=0` + +The all-to-all layout is: + +- rank `src` fills input slice `dst` with `100000 + src * 1000 + dst`; +- rank `src` sends slice `dst` to rank `dst`; +- rank `dst` output is ordered by source rank. + +For rank `0`, the output sample should contain: + +```text +from0=100000 from1=101000 from2=102000 from3=103000 from4=104000 from5=105000 from6=106000 from7=107000 +``` + +For rank `7`, the output sample should contain: + +```text +from0=100007 from1=101007 from2=102007 from3=103007 from4=104007 from5=105007 from6=106007 from7=107007 +``` + +## Environment + +Use a root shell on the Ascend machine. + +```bash +cd /path/to/TileXR +source scripts/common_env.sh +npu-smi info +``` + +Expected: + +- `npu-smi info` lists at least 8 usable devices; +- CANN environment variables are available after `source scripts/common_env.sh`; +- `bisheng` is available for building `tilexr_udma_demo_kernel.cpp`; +- MPI, if needed by surrounding scripts, is under `/usr/local/mpi/`. + +## Build + +Build `tile-comm` and install it into the repository `install` directory: + +```bash +cd /path/to/TileXR +source scripts/common_env.sh +cmake -S . -B /tmp/tilexr-build-udma -DCMAKE_INSTALL_PREFIX="$PWD/install" +cmake --build /tmp/tilexr-build-udma --target tile-comm -j"$(nproc)" +cmake --install /tmp/tilexr-build-udma +``` + +Build the UDMA demo: + +```bash +cd /path/to/TileXR/tests/udma +bash build.sh +``` + +Check artifacts: + +```bash +test -x install/bin/tilexr_udma_demo +test -f install/lib/libtilexr_udma_demo_kernel.so +``` + +## Run 8P All-to-All + +The script arguments are: + +```text +run_tilexr_udma_demo.sh +``` + +Run the normal/default IPC initialization path: + +```bash +cd /path/to/TileXR/tests/udma +export TILEXR_COMM_ID=127.0.0.1:10067 +bash demo/run_tilexr_udma_demo.sh 2 8 16 8 0 +``` + +Run with explicit PID IPC mode: + +```bash +cd /path/to/TileXR/tests/udma +export TILEXR_COMM_ID=127.0.0.1:10077 +TILEXR_IPC_PID_MODE=pid bash demo/run_tilexr_udma_demo.sh 2 8 16 8 0 +``` + +Run with explicit SDID IPC mode: + +```bash +cd /path/to/TileXR/tests/udma +export TILEXR_COMM_ID=127.0.0.1:10087 +TILEXR_IPC_PID_MODE=sdid bash demo/run_tilexr_udma_demo.sh 2 8 16 8 0 +``` + +Use a different `TILEXR_COMM_ID` port for concurrent or repeated runs to avoid +the demo TCP barrier colliding with a previous process. + +## 1M Data Run + +For a 1M-elements-per-peer checker-sized run: + +```bash +cd /path/to/TileXR/tests/udma +export TILEXR_COMM_ID=127.0.0.1:10107 +bash demo/run_tilexr_udma_demo.sh 2 8 1048576 8 0 +``` + +This means: + +- `elements_per_rank=1048576` int32 elements per destination slice; +- each rank input buffer has `8 * 1048576` int32 elements; +- each rank input/output buffer is 32 MiB. + +## Log Checks + +Each run writes logs under: + +```text +tests/udma/logs/tilexr_udma_demo_YYYYmmdd_HHMMSS/ +``` + +Quick success check: + +```bash +cd /path/to/TileXR/tests/udma +latest=$(ls -td logs/tilexr_udma_demo_* | head -n1) +grep -R "TileXR UDMA demo success" "$latest" +grep -R "UDMA=enabled" "$latest" +grep -R "TileXRUDMARegister success" "$latest" +``` + +There should be 8 success lines, one for each rank. + +Check all-to-all samples: + +```bash +grep -R "alltoall output sample" "$latest" +``` + +Check for failures: + +```bash +grep -R "ALLTOALL MISMATCH\|DATA MISMATCH\|TileXR UDMA demo failed\|ERROR" "$latest" || true +``` + +Expected: no mismatch or failure lines. + +If UDMA CQ is incomplete, the demo may print: + +```text +alltoall UDMA CQ incomplete, use IPC fallback +alltoall IPC fallback completed +``` + +In that case, the final correctness criterion is still the all-to-all output +validation and `TileXR UDMA demo success` on every rank. + +## Offline Checker + +The layout checker can be built without running the hardware demo: + +```bash +cd /path/to/TileXR/tests/udma +g++ -std=c++14 -O2 \ + -I . \ + -DTILEXR_SOURCE_ROOT='"'/path/to/TileXR'"' \ + unit/test_tilexr_udma_alltoall_layout.cpp \ + -o /tmp/test_tilexr_udma_alltoall_layout +/tmp/test_tilexr_udma_alltoall_layout +``` + +Expected output: + +```text +TileXR UDMA all-to-all layout checks passed +``` + +This checker validates the all-to-all input pattern, expected output layout, and +source-level debug layout assumptions. It does not execute UDMA or AICore code. + +## Notes + +- `test_type=2` is the all-to-all path. +- `test_type=3` is the all-reduce path and is not covered by this runbook. +- `TILEXR_IPC_PID_MODE=sdid` forces `rtSetIpcMemorySuperPodPid`. +- `TILEXR_IPC_PID_MODE=pid` forces `rtSetIpcMemPid`. +- Leaving `TILEXR_IPC_PID_MODE` unset uses TileXR's chip default. +- The demo host source does not use `shmem.h`; process synchronization is a + local TCP barrier derived from `TILEXR_COMM_ID`. diff --git a/tests/udma/demo/ASCEND_VERIFICATION.md b/tests/udma/demo/ASCEND_VERIFICATION.md index e7e1643..247d697 100644 --- a/tests/udma/demo/ASCEND_VERIFICATION.md +++ b/tests/udma/demo/ASCEND_VERIFICATION.md @@ -12,7 +12,7 @@ Verify that the TileXR UDMA demo: - builds against TileXR's public demo API without including `shmem.h` in the host demo source; - initializes UDMA through TileXR's own comm transport, without linking shmem; - registers ordinary `aclrtMalloc` device memory through `TileXRUDMARegister`; -- runs device-side UDMA put and put-signal kernels successfully; +- runs device-side UDMA put, put-signal, and all-to-all kernels successfully; - does not report data mismatches or signal mismatches in rank logs. ## Hardware And Environment @@ -157,14 +157,68 @@ grep -R "TileXR UDMA demo success" "$latest" grep -R "DATA MISMATCH\\|expected non-local signals\\|TileXR UDMA demo failed\\|ERROR" "$latest" || true ``` +## Test 3: UDMA All-To-All + +Run the all-to-all variant: + +```bash +cd /path/to/TileXR/tests/udma +bash demo/run_tilexr_udma_demo.sh 2 2 16 2 0 +``` + +Expected: + +- script exits with code 0; +- each rank log contains `TileXR UDMA demo success`; +- each rank log prints `alltoall output sample`; +- rank 0 output sample includes `from0=100000` and `from1=101000`; +- rank 1 output sample includes `from0=100001` and `from1=101001`; +- no log contains `ALLTOALL MISMATCH`, `ERROR`, or `TileXR UDMA demo failed`. + +Quick log check: + +```bash +latest=$(ls -td logs/tilexr_udma_demo_* | head -n1) +grep -R "alltoall output sample" "$latest" +grep -R "TileXR UDMA demo success" "$latest" +grep -R "ALLTOALL MISMATCH\\|TileXR UDMA demo failed\\|ERROR" "$latest" || true +``` + +## IPC PID And SDID Modes + +The communicator supports an override for peer IPC-memory setup: + +```bash +TILEXR_IPC_PID_MODE=pid bash demo/run_tilexr_udma_demo.sh 2 2 16 2 0 +TILEXR_IPC_PID_MODE=sdid bash demo/run_tilexr_udma_demo.sh 2 2 16 2 0 +``` + +Expected mode behavior: + +- unset: TileXR chooses the chip default; +- `pid`: force `rtSetIpcMemPid`; +- `sdid`: force `rtSetIpcMemorySuperPodPid`. + +On the verified Ascend950DT_9592 host, the default mode is `pid`. Default and +explicit `pid` mode both reached `TileXRCommInitRankLocal success`, +`InitUDMA success`, and `TileXRUDMARegister success`. Explicit `sdid` mode failed +during IPC open with runtime error `507899`, so Ascend950DT_9592 should use PID +mode on that host. + +If default or explicit `pid` mode initializes but a rank's debug words report CQ +status `514`, the failure is in the UDMA data-plane completion path after +registration, not in all-to-all payload layout. The local self-copy segment +should still appear in the all-to-all output sample. + ## Optional Larger Runs -If the machine has more usable devices, repeat both test types with more ranks: +If the machine has more usable devices, repeat all test types with more ranks: ```bash cd /path/to/TileXR/tests/udma bash demo/run_tilexr_udma_demo.sh 0 4 64 4 0 bash demo/run_tilexr_udma_demo.sh 1 4 64 4 0 +bash demo/run_tilexr_udma_demo.sh 2 4 64 4 0 ``` Expected result samples for four ranks should include: @@ -199,9 +253,13 @@ Test 1 command and result: Test 1 log directory: Test 2 command and result: Test 2 log directory: +Test 3 command and result: +Test 3 log directory: +PID/SDID mode results: Optional larger run result: Any ERROR lines: Any DATA MISMATCH lines: +Any ALLTOALL MISMATCH lines: Any signal mismatch lines: ``` diff --git a/tests/udma/demo/README.md b/tests/udma/demo/README.md index 5894707..37f7103 100644 --- a/tests/udma/demo/README.md +++ b/tests/udma/demo/README.md @@ -17,6 +17,8 @@ The demo target requires `bisheng`. If `bisheng` is not available, `build.sh` st cd /path/to/TileXR/tests/udma bash demo/run_tilexr_udma_demo.sh 0 2 16 2 0 bash demo/run_tilexr_udma_demo.sh 1 2 16 2 0 +bash demo/run_tilexr_udma_demo.sh 2 2 16 2 0 +bash demo/run_tilexr_udma_demo.sh 3 8 16 8 0 ``` Arguments: @@ -27,6 +29,10 @@ run_tilexr_udma_demo.sh - `test_type=0`: all-gather style UDMA put. - `test_type=1`: UDMA put with signal. +- `test_type=2`: all-to-all UDMA put. Rank `src` sends input slice `dst` to rank `dst`; + each output is ordered by source rank. +- `test_type=3`: all-reduce sum. Each rank contributes one local vector and receives + the element-wise sum across all ranks. - `rank_size`: number of local ranks to launch. - `elements_per_rank`: `int32_t` elements in each rank segment. - `npu_count`: number of NPUs available to this run. @@ -34,6 +40,12 @@ run_tilexr_udma_demo.sh Each run writes per-rank logs under `tests/udma/logs/tilexr_udma_demo_*`. +IPC peer-memory setup can be forced with `TILEXR_IPC_PID_MODE`: + +- unset: use TileXR's chip default. Ascend950-class chips use `pid`. +- `pid`: force `rtSetIpcMemPid`. +- `sdid`: force `rtSetIpcMemorySuperPodPid`. + Run this demo only on A5 / Ascend950 / 950 hardware. Builds or smoke tests on other Ascend chips are not valid UDMA runtime validation. ## What To Check diff --git a/tests/udma/demo/run_tilexr_udma_demo.sh b/tests/udma/demo/run_tilexr_udma_demo.sh index b3e193d..ae31f6a 100755 --- a/tests/udma/demo/run_tilexr_udma_demo.sh +++ b/tests/udma/demo/run_tilexr_udma_demo.sh @@ -21,7 +21,7 @@ source "${TILEXR_ROOT}/scripts/common_env.sh" export TILEXR_COMM_ID=${TILEXR_COMM_ID:-127.0.0.1:10067} export TILEXR_DEMO_NPUS=${npu_count} export TILEXR_DEMO_FIRST_NPU=${first_npu} -export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${TILEXR_ROOT}/install/lib:/usr/local/lib:${LD_LIBRARY_PATH:-}" +export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib64:${TILEXR_ROOT}/install/lib:${TILEXR_ROOT}/install/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}" bin="${INSTALL_DIR}/bin/tilexr_udma_demo" if [ ! -x "${bin}" ]; then @@ -36,7 +36,7 @@ echo "==========================================" echo " TileXR UDMA Communication Demo" echo "==========================================" echo "Binary: ${bin}" -echo "Test type: ${test_type} (0=all-gather put, 1=put-signal)" +echo "Test type: ${test_type} (0=all-gather put, 1=put-signal, 2=all-to-all, 3=all-reduce)" echo "Rank size: ${rank_size}" echo "Elements/rank: ${elements_per_rank}" echo "NPU count: ${npu_count}" diff --git a/tests/udma/demo/tilexr_udma_allreduce_layout.h b/tests/udma/demo/tilexr_udma_allreduce_layout.h new file mode 100644 index 0000000..ca3adf8 --- /dev/null +++ b/tests/udma/demo/tilexr_udma_allreduce_layout.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2024-2026 TileXR Project + * Licensed under the Apache License, Version 2.0 + */ + +#ifndef TILEXR_UDMA_ALLREDUCE_LAYOUT_H +#define TILEXR_UDMA_ALLREDUCE_LAYOUT_H + +#include +#include +#include +#include + +namespace TileXR { +namespace Demo { + +constexpr int32_t kAllReduceBaseValue = 1000; + +inline int32_t AllReduceValue(int rank) +{ + return kAllReduceBaseValue + rank; +} + +inline int32_t AllReduceExpectedSum(int rankSize) +{ + return rankSize * kAllReduceBaseValue + rankSize * (rankSize - 1) / 2; +} + +inline void FillAllReduceInput( + std::vector& input, int rank, int32_t elementsPerRank) +{ + std::fill(input.begin(), input.begin() + elementsPerRank, AllReduceValue(rank)); +} + +inline bool ValidateAllReduceOutput( + const std::vector& output, int rankSize, int32_t elementsPerRank) +{ + const int32_t expected = AllReduceExpectedSum(rankSize); + for (int32_t i = 0; i < elementsPerRank; ++i) { + if (output[static_cast(i)] != expected) { + return false; + } + } + return true; +} + +inline void BuildAllReduceOutputFromInputs( + const std::vector& allInputs, int rankSize, int32_t elementsPerRank, + std::vector& output) +{ + std::fill(output.begin(), output.begin() + elementsPerRank, 0); + for (int srcRank = 0; srcRank < rankSize; ++srcRank) { + const size_t srcBase = static_cast(srcRank) * elementsPerRank; + for (int32_t i = 0; i < elementsPerRank; ++i) { + output[static_cast(i)] += allInputs[srcBase + i]; + } + } +} + +} // namespace Demo +} // namespace TileXR + +#endif // TILEXR_UDMA_ALLREDUCE_LAYOUT_H diff --git a/tests/udma/demo/tilexr_udma_alltoall_layout.h b/tests/udma/demo/tilexr_udma_alltoall_layout.h new file mode 100644 index 0000000..da4e89d --- /dev/null +++ b/tests/udma/demo/tilexr_udma_alltoall_layout.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024-2026 TileXR Project + * Licensed under the Apache License, Version 2.0 + */ + +#ifndef TILEXR_UDMA_ALLTOALL_LAYOUT_H +#define TILEXR_UDMA_ALLTOALL_LAYOUT_H + +#include +#include +#include +#include + +namespace TileXR { +namespace Demo { + +constexpr int32_t kAllToAllBaseValue = 100000; + +inline int32_t AllToAllValue(int srcRank, int dstRank) +{ + return kAllToAllBaseValue + srcRank * 1000 + dstRank; +} + +inline void FillAllToAllInput( + std::vector& input, int rank, int rankSize, int32_t elementsPerPeer) +{ + for (int dstRank = 0; dstRank < rankSize; ++dstRank) { + std::fill(input.begin() + static_cast(dstRank) * elementsPerPeer, + input.begin() + static_cast(dstRank + 1) * elementsPerPeer, + AllToAllValue(rank, dstRank)); + } +} + +inline bool ValidateAllToAllOutput( + const std::vector& output, int rank, int rankSize, int32_t elementsPerPeer) +{ + for (int srcRank = 0; srcRank < rankSize; ++srcRank) { + const int32_t expected = AllToAllValue(srcRank, rank); + for (int32_t i = 0; i < elementsPerPeer; ++i) { + const size_t offset = static_cast(srcRank) * elementsPerPeer + i; + if (output[offset] != expected) { + return false; + } + } + } + return true; +} + +inline void BuildAllToAllOutputFromInputs( + const std::vector& allInputs, int rank, int rankSize, int32_t elementsPerPeer, + std::vector& output) +{ + for (int srcRank = 0; srcRank < rankSize; ++srcRank) { + const size_t srcBase = static_cast(srcRank) * rankSize * elementsPerPeer + + static_cast(rank) * elementsPerPeer; + const size_t dstBase = static_cast(srcRank) * elementsPerPeer; + std::copy(allInputs.begin() + srcBase, + allInputs.begin() + srcBase + elementsPerPeer, + output.begin() + dstBase); + } +} + +} // namespace Demo +} // namespace TileXR + +#endif // TILEXR_UDMA_ALLTOALL_LAYOUT_H diff --git a/tests/udma/demo/tilexr_udma_demo.cpp b/tests/udma/demo/tilexr_udma_demo.cpp index eabe662..630a3c8 100644 --- a/tests/udma/demo/tilexr_udma_demo.cpp +++ b/tests/udma/demo/tilexr_udma_demo.cpp @@ -20,18 +20,37 @@ #include "acl/acl.h" #include "tilexr_api.h" +#include "tilexr_data_as_flag.h" #include "tilexr_types.h" +#include "tilexr_udma_allreduce_layout.h" +#include "tilexr_udma_alltoall_layout.h" extern void launch_tilexr_udma_all_gather( uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR data, GM_ADDR debug, int32_t elementsPerRank); extern void launch_tilexr_udma_put_signal( uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR data, GM_ADDR signals, GM_ADDR debug, int32_t elementsPerRank, uint64_t signal); +extern void launch_tilexr_udma_all_to_all( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR input, GM_ADDR output, + GM_ADDR debug, int32_t elementsPerPeer, uint64_t outputByteOffset); +extern void launch_tilexr_all_to_all_ipc_scatter( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR input, GM_ADDR debug, int32_t elementsPerPeer); +extern void launch_tilexr_all_to_all_ipc_gather( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR output, GM_ADDR debug, int32_t elementsPerPeer); +extern void launch_tilexr_all_reduce_ipc_scatter( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR input, GM_ADDR debug, int32_t elementsPerRank); +extern void launch_tilexr_all_reduce_ipc_sum( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR output, GM_ADDR debug, int32_t elementsPerRank); namespace { constexpr int32_t kDefaultElementsPerRank = 16; constexpr uint64_t kSignalValue = 1000; -constexpr size_t kDebugWords = 16; +constexpr int kDebugUdmaStatusBase = 6; +constexpr int kDebugIpcScatter = kDebugUdmaStatusBase + TileXR::TILEXR_MAX_RANK_SIZE; +constexpr int kDebugIpcGather = kDebugIpcScatter + 1; +constexpr int kDebugAllReduceScatter = kDebugIpcGather + 1; +constexpr int kDebugAllReduceSum = kDebugAllReduceScatter + 1; +constexpr size_t kDebugWords = kDebugAllReduceSum + 1; constexpr int kDefaultCommPort = 10067; constexpr int kDemoBarrierPortOffset = 97; constexpr size_t kUdmaRegistrationAlignment = 2 * 1024 * 1024; @@ -341,6 +360,56 @@ bool ValidateData(int rank, int rankSize, const std::vector& data, int3 return ok; } +bool ValidateAllToAllData( + int rank, int rankSize, const std::vector& output, int32_t elementsPerPeer) +{ + bool ok = true; + for (int srcRank = 0; srcRank < rankSize; ++srcRank) { + int32_t expected = TileXR::Demo::AllToAllValue(srcRank, rank); + for (int32_t i = 0; i < elementsPerPeer; ++i) { + size_t offset = static_cast(srcRank) * elementsPerPeer + i; + if (output[offset] != expected) { + std::cerr << "[rank " << rank << "] ALLTOALL MISMATCH at src=" << srcRank + << " elem=" << i << " offset=" << offset + << " got=" << output[offset] << " expected=" << expected << std::endl; + ok = false; + break; + } + } + } + + std::cout << "[rank " << rank << "] alltoall output sample:"; + for (int srcRank = 0; srcRank < rankSize; ++srcRank) { + size_t offset = static_cast(srcRank) * elementsPerPeer; + std::cout << " from" << srcRank << "=" << output[offset]; + } + std::cout << std::endl; + return TileXR::Demo::ValidateAllToAllOutput(output, rank, rankSize, elementsPerPeer) && ok; +} + +bool ValidateAllReduceData( + int rank, int rankSize, const std::vector& output, int32_t elementsPerRank) +{ + bool ok = true; + const int32_t expected = TileXR::Demo::AllReduceExpectedSum(rankSize); + for (int32_t i = 0; i < elementsPerRank; ++i) { + if (output[static_cast(i)] != expected) { + std::cerr << "[rank " << rank << "] ALLREDUCE MISMATCH at elem=" << i + << " got=" << output[static_cast(i)] + << " expected=" << expected << std::endl; + ok = false; + break; + } + } + + std::cout << "[rank " << rank << "] allreduce output sample:"; + for (int32_t i = 0; i < std::min(elementsPerRank, 8); ++i) { + std::cout << " elem" << i << "=" << output[static_cast(i)]; + } + std::cout << std::endl; + return TileXR::Demo::ValidateAllReduceOutput(output, rankSize, elementsPerRank) && ok; +} + bool ValidateSignals(int rank, int rankSize, const std::vector& signals) { bool ok = true; @@ -359,6 +428,47 @@ bool ValidateSignals(int rank, int rankSize, const std::vector& signal return ok; } +bool AllToAllUdmaComplete(int rankSize, const std::vector& debug) +{ + for (int peer = 0; peer < rankSize; ++peer) { + if (debug[kDebugUdmaStatusBase + peer] != 0) { + return false; + } + } + return true; +} + +void PrintAllToAllUdmaDebug(int rank, int rankSize, const std::vector& debug) +{ + constexpr int rangeBase = kDebugUdmaStatusBase + 16; + constexpr int wqeBeforeBase = kDebugUdmaStatusBase + 32; + constexpr int wqeAfterBase = kDebugUdmaStatusBase + 48; + constexpr int localTokenBase = kDebugUdmaStatusBase + 64; + constexpr int remoteBaseLowBase = kDebugUdmaStatusBase + 80; + constexpr int memAddrLowBase = kDebugUdmaStatusBase + 96; + constexpr int tpnBase = kDebugUdmaStatusBase + 112; + std::cout << "[rank " << rank << "] alltoall udma peer debug:"; + for (int peer = 0; peer < rankSize && peer < 16; ++peer) { + std::cout << " peer" << peer + << "{status=" << debug[kDebugUdmaStatusBase + peer] + << ",range=" << debug[rangeBase + peer] + << ",wqe=" << debug[wqeBeforeBase + peer] << "->" << debug[wqeAfterBase + peer] + << ",token=" << debug[localTokenBase + peer] + << ",regLo=" << debug[remoteBaseLowBase + peer] + << ",memLo=" << debug[memAddrLowBase + peer] + << ",tpn=" << debug[tpnBase + peer] + << "}"; + } + std::cout << std::endl; +} + +size_t AllToAllDataAsFlagStagingBytes(int rankSize, int32_t elementsPerPeer) +{ + const uint64_t payloadBytes = static_cast(elementsPerPeer) * sizeof(int32_t); + const uint64_t blocks = TileXR::DataAsFlagBlockCountForPayloadBytes(payloadBytes); + return static_cast(static_cast(rankSize) * blocks * TileXR::DATA_AS_FLAG_BLOCK_BYTES); +} + void Cleanup( TileXRCommPtr comm, aclrtStream stream, void* registeredMemory, int32_t* debug, int rank, int deviceId) { @@ -446,11 +556,31 @@ int main(int argc, char** argv) return 1; } + bool isAllToAll = testType == 2; + bool isAllReduce = testType == 3; + bool strictAllToAllUdma = isAllToAll && GetEnvInt("TILEXR_DEMO_ALLTOALL_USE_UDMA", 0) != 0; + bool dumpAllToAllOnStrictFail = isAllToAll && GetEnvInt("TILEXR_DEMO_ALLTOALL_DUMP_ON_STRICT_FAIL", 0) != 0; + bool useAllToAllDataAsFlagIpc = isAllToAll && !strictAllToAllUdma; + bool forceAllToAllIpcFallback = false; + bool hasOutput = isAllToAll || isAllReduce; size_t dataCount = static_cast(rankSize) * elementsPerRank; size_t dataBytes = dataCount * sizeof(int32_t); + if (isAllToAll) { + const size_t stagingBytes = AllToAllDataAsFlagStagingBytes(rankSize, elementsPerRank); + if (stagingBytes > static_cast(TileXR::IPC_BUFF_MAX_SIZE)) { + std::cerr << "[rank " << rank << "] ERROR: alltoall data-as-flag IPC fallback staging requires " + << stagingBytes << " bytes, exceeds IPC data capacity " + << TileXR::IPC_BUFF_MAX_SIZE << std::endl; + Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); + return 1; + } + PrintStatus(rank, "alltoall data-as-flag staging bytes=" + std::to_string(stagingBytes)); + } + size_t inputOffset = 0; + size_t outputOffset = hasOutput ? dataBytes : 0; size_t signalBytes = static_cast(rankSize) * sizeof(uint64_t); - size_t signalOffset = dataBytes; - size_t payloadBytes = dataBytes + signalBytes; + size_t signalOffset = hasOutput ? dataBytes * 2 : dataBytes; + size_t payloadBytes = signalOffset + signalBytes; size_t registeredBytes = ((payloadBytes + kUdmaRegistrationAlignment - 1) / kUdmaRegistrationAlignment) * kUdmaRegistrationAlignment; if (!CheckAcl(rank, "aclrtMalloc debug", aclrtMalloc(reinterpret_cast(&debug), @@ -461,27 +591,63 @@ int main(int argc, char** argv) return 1; } auto data = static_cast(registeredMemory); + auto input = reinterpret_cast(static_cast(registeredMemory) + inputOffset); + auto output = reinterpret_cast(static_cast(registeredMemory) + outputOffset); auto signals = reinterpret_cast(static_cast(registeredMemory) + signalOffset); - if (!CheckTileXR(rank, "TileXRUDMARegister", - TileXRUDMARegister(comm, static_cast(registeredMemory), registeredBytes, &udmaHandle))) { - Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); - return 1; + if (useAllToAllDataAsFlagIpc) { + PrintStatus(rank, "skip TileXRUDMARegister for alltoall data-as-flag IPC path"); + forceAllToAllIpcFallback = true; + } else { + int registerRet = + TileXRUDMARegister(comm, static_cast(registeredMemory), registeredBytes, &udmaHandle); + if (registerRet != TileXR::TILEXR_SUCCESS) { + if (!isAllToAll || strictAllToAllUdma) { + if (strictAllToAllUdma) { + std::cerr << "[rank " << rank << "] ERROR: strict alltoall UDMA registration failed" + << " ret=" << registerRet << std::endl; + } + CheckTileXR(rank, "TileXRUDMARegister", registerRet); + Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); + return 1; + } + std::cerr << "[rank " << rank + << "] WARNING: TileXRUDMARegister failed; use alltoall data-as-flag IPC fallback" + << " ret=" << registerRet << std::endl; + forceAllToAllIpcFallback = true; + } else { + udmaRegistered = true; + } } - udmaRegistered = true; PrintStatus(rank, "registered UDMA memory base=" + std::to_string(reinterpret_cast(registeredMemory)) + " bytes=" + std::to_string(registeredBytes) + - " dataOffset=0 signalOffset=" + std::to_string(signalOffset)); + " inputOffset=" + std::to_string(inputOffset) + + " outputOffset=" + std::to_string(outputOffset) + + " signalOffset=" + std::to_string(signalOffset)); PrintCommArgs(rank, *commArgsHost, commArgsDev); std::vector hostData(dataCount, -1); - std::fill(hostData.begin() + static_cast(rank) * elementsPerRank, - hostData.begin() + static_cast(rank + 1) * elementsPerRank, - 1000 + rank); + std::vector hostOutput(dataCount, -1); + if (isAllToAll) { + TileXR::Demo::FillAllToAllInput(hostData, rank, rankSize, elementsPerRank); + } else if (isAllReduce) { + TileXR::Demo::FillAllReduceInput(hostData, rank, elementsPerRank); + } else { + std::fill(hostData.begin() + static_cast(rank) * elementsPerRank, + hostData.begin() + static_cast(rank + 1) * elementsPerRank, + 1000 + rank); + } std::vector hostSignals(static_cast(rankSize), 0); std::vector hostDebug(kDebugWords, 0); - if (!CopyHostToDevice(rank, data, dataCount * sizeof(int32_t), - hostData.data(), dataCount * sizeof(int32_t), "data") || + const char* inputName = isAllToAll ? "alltoall input" : (isAllReduce ? "allreduce input" : "data"); + bool initOk = CopyHostToDevice(rank, input, dataCount * sizeof(int32_t), + hostData.data(), dataCount * sizeof(int32_t), inputName); + if (hasOutput) { + const char* outputName = isAllToAll ? "alltoall output" : "allreduce output"; + initOk = CopyHostToDevice(rank, output, dataCount * sizeof(int32_t), + hostOutput.data(), dataCount * sizeof(int32_t), outputName) && initOk; + } + if (!initOk || !CopyHostToDevice(rank, signals, hostSignals.size() * sizeof(uint64_t), hostSignals.data(), hostSignals.size() * sizeof(uint64_t), "signals") || !CopyHostToDevice(rank, debug, hostDebug.size() * sizeof(int32_t), @@ -500,12 +666,27 @@ int main(int argc, char** argv) return 1; } - PrintStatus(rank, testType == 1 ? "launch put-signal kernel" : "launch all-gather kernel"); - if (testType == 1) { + if (testType == 2) { + if (forceAllToAllIpcFallback) { + PrintStatus(rank, "skip all-to-all UDMA kernel; use data-as-flag IPC fallback"); + } else { + PrintStatus(rank, "launch all-to-all kernel"); + launch_tilexr_udma_all_to_all( + 1, stream, commArgsDev, reinterpret_cast(input), reinterpret_cast(output), + reinterpret_cast(debug), elementsPerRank, static_cast(outputOffset)); + } + } else if (testType == 3) { + PrintStatus(rank, "launch all-reduce IPC scatter kernel"); + launch_tilexr_all_reduce_ipc_scatter( + 1, stream, commArgsDev, reinterpret_cast(input), reinterpret_cast(debug), + elementsPerRank); + } else if (testType == 1) { + PrintStatus(rank, "launch put-signal kernel"); launch_tilexr_udma_put_signal( 1, stream, commArgsDev, reinterpret_cast(data), reinterpret_cast(signals), reinterpret_cast(debug), elementsPerRank, kSignalValue); } else { + PrintStatus(rank, "launch all-gather kernel"); launch_tilexr_udma_all_gather( 1, stream, commArgsDev, reinterpret_cast(data), reinterpret_cast(debug), elementsPerRank); @@ -525,8 +706,104 @@ int main(int argc, char** argv) return 1; } - if (!CopyDeviceToHost(rank, hostData.data(), dataCount * sizeof(int32_t), - data, dataCount * sizeof(int32_t), "data") || + if (isAllReduce) { + PrintStatus(rank, "launch all-reduce IPC sum kernel"); + launch_tilexr_all_reduce_ipc_sum( + 1, stream, commArgsDev, reinterpret_cast(output), reinterpret_cast(debug), + elementsPerRank); + if (!CheckAcl(rank, "aclrtSynchronizeStream allreduce ipc sum", aclrtSynchronizeStream(stream)) || + !DemoBarrierAll(rank, rankSize, "all ranks completed allreduce ipc sum")) { + if (udmaRegistered) { + CheckTileXR(rank, "TileXRUDMAUnregister", TileXRUDMAUnregister(comm, udmaHandle)); + } + Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); + return 1; + } + } + + if (isAllToAll && + !CopyDeviceToHost(rank, hostDebug.data(), hostDebug.size() * sizeof(int32_t), + debug, hostDebug.size() * sizeof(int32_t), "debug after alltoall udma")) { + if (udmaRegistered) { + CheckTileXR(rank, "TileXRUDMAUnregister", TileXRUDMAUnregister(comm, udmaHandle)); + } + Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); + return 1; + } + + bool usedIpcFallback = false; + bool allToAllUdmaComplete = !isAllToAll || AllToAllUdmaComplete(rankSize, hostDebug); + if (isAllToAll && strictAllToAllUdma && !allToAllUdmaComplete) { + std::cerr << "[rank " << rank << "] ERROR: strict alltoall UDMA CQ incomplete:"; + for (int peer = 0; peer < rankSize; ++peer) { + std::cerr << " peer" << peer << "=" << hostDebug[kDebugUdmaStatusBase + peer]; + } + std::cerr << std::endl; + PrintAllToAllUdmaDebug(rank, rankSize, hostDebug); + if (dumpAllToAllOnStrictFail) { + std::vector strictFailOutput(dataCount, 0); + if (CopyDeviceToHost(rank, strictFailOutput.data(), dataBytes, + output, dataBytes, "alltoall output after strict UDMA fail")) { + (void)ValidateAllToAllData(rank, rankSize, strictFailOutput, elementsPerRank); + } + } + if (udmaRegistered) { + CheckTileXR(rank, "TileXRUDMAUnregister", TileXRUDMAUnregister(comm, udmaHandle)); + } + Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); + return 1; + } + if (isAllToAll && (forceAllToAllIpcFallback || !allToAllUdmaComplete)) { + usedIpcFallback = true; + std::cout << "[rank " << rank << "] alltoall use data-as-flag IPC fallback"; + if (forceAllToAllIpcFallback) { + if (useAllToAllDataAsFlagIpc) { + std::cout << " by default"; + } else { + std::cout << " after UDMA registration failure"; + } + } else { + std::cout << " after UDMA CQ incomplete"; + } + std::cout << ":"; + for (int peer = 0; peer < rankSize; ++peer) { + std::cout << " peer" << peer << "=" << hostDebug[kDebugUdmaStatusBase + peer]; + } + std::cout << std::endl; + + launch_tilexr_all_to_all_ipc_scatter( + 1, stream, commArgsDev, reinterpret_cast(input), + reinterpret_cast(debug), elementsPerRank); + if (!CheckAcl(rank, "aclrtSynchronizeStream alltoall ipc scatter", aclrtSynchronizeStream(stream)) || + !DemoBarrierAll(rank, rankSize, "all ranks completed alltoall ipc scatter")) { + if (udmaRegistered) { + CheckTileXR(rank, "TileXRUDMAUnregister", TileXRUDMAUnregister(comm, udmaHandle)); + } + Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); + return 1; + } + + launch_tilexr_all_to_all_ipc_gather( + 1, stream, commArgsDev, reinterpret_cast(output), + reinterpret_cast(debug), elementsPerRank); + if (!CheckAcl(rank, "aclrtSynchronizeStream alltoall ipc gather", aclrtSynchronizeStream(stream)) || + !DemoBarrierAll(rank, rankSize, "all ranks completed alltoall ipc gather")) { + if (udmaRegistered) { + CheckTileXR(rank, "TileXRUDMAUnregister", TileXRUDMAUnregister(comm, udmaHandle)); + } + Cleanup(comm, stream, registeredMemory, debug, rank, deviceId); + return 1; + } + } + + bool copyBackOk = CopyDeviceToHost(rank, hostData.data(), dataCount * sizeof(int32_t), + data, dataCount * sizeof(int32_t), "data"); + if (hasOutput) { + const char* outputName = isAllToAll ? "alltoall output" : "allreduce output"; + copyBackOk = CopyDeviceToHost(rank, hostOutput.data(), dataCount * sizeof(int32_t), + output, dataCount * sizeof(int32_t), outputName) && copyBackOk; + } + if (!copyBackOk || !CopyDeviceToHost(rank, hostSignals.data(), hostSignals.size() * sizeof(uint64_t), signals, hostSignals.size() * sizeof(uint64_t), "signals") || !CopyDeviceToHost(rank, hostDebug.data(), hostDebug.size() * sizeof(int32_t), @@ -539,12 +816,32 @@ int main(int argc, char** argv) } std::cout << "[rank " << rank << "] debug words:"; - for (size_t i = 0; i < std::min(5, hostDebug.size()); ++i) { + for (size_t i = 0; i < std::min(10, hostDebug.size()); ++i) { std::cout << " d" << i << "=" << hostDebug[i]; } std::cout << std::endl; + if (isAllToAll) { + PrintAllToAllUdmaDebug(rank, rankSize, hostDebug); + } + if (usedIpcFallback) { + std::cout << "[rank " << rank << "] alltoall IPC fallback completed" + << " scatter=" << hostDebug[kDebugIpcScatter] + << " gather=" << hostDebug[kDebugIpcGather] << std::endl; + } + if (isAllReduce) { + std::cout << "[rank " << rank << "] allreduce IPC completed" + << " scatter=" << hostDebug[kDebugAllReduceScatter] + << " sum=" << hostDebug[kDebugAllReduceSum] << std::endl; + } - bool ok = ValidateData(rank, rankSize, hostData, elementsPerRank); + bool ok = false; + if (isAllToAll) { + ok = ValidateAllToAllData(rank, rankSize, hostOutput, elementsPerRank); + } else if (isAllReduce) { + ok = ValidateAllReduceData(rank, rankSize, hostOutput, elementsPerRank); + } else { + ok = ValidateData(rank, rankSize, hostData, elementsPerRank); + } if (testType == 1) { ok = ValidateSignals(rank, rankSize, hostSignals) && ok; } diff --git a/tests/udma/demo/tilexr_udma_demo_kernel.cpp b/tests/udma/demo/tilexr_udma_demo_kernel.cpp index a49ca3c..988ef4b 100644 --- a/tests/udma/demo/tilexr_udma_demo_kernel.cpp +++ b/tests/udma/demo/tilexr_udma_demo_kernel.cpp @@ -4,9 +4,48 @@ */ #include "kernel_operator.h" +#include "tilexr_data_as_flag.h" #include "tilexr_udma.h" constexpr int32_t TILEXR_UDMA_DEMO_MAGIC = 0x5444554d; // "TDUM" +constexpr uint64_t TILEXR_UDMA_DEMO_IPC_STAGING_OFFSET = TileXR::IPC_DATA_OFFSET; +constexpr uint64_t TILEXR_UDMA_DEMO_DATA_AS_FLAG_STAGING_OFFSET = TileXR::IPC_DATA_OFFSET; +constexpr uint32_t TILEXR_UDMA_DEMO_DATA_AS_FLAG_UB_BYTES = 64 * 1024; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE = 6; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_RANGE_VALID_BASE = + TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + 16; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_WQE_BEFORE_BASE = + TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + 32; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_WQE_AFTER_BASE = + TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + 48; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_LOCAL_TOKEN_BASE = + TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + 64; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_REMOTE_BASE_LOW_BASE = + TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + 80; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_MEM_ADDR_LOW_BASE = + TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + 96; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_TPN_BASE = + TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + 112; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_IPC_SCATTER = + TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + TileXR::TILEXR_MAX_RANK_SIZE; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_IPC_GATHER = TILEXR_UDMA_DEMO_DEBUG_IPC_SCATTER + 1; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_ALLREDUCE_SCATTER = TILEXR_UDMA_DEMO_DEBUG_IPC_GATHER + 1; +constexpr int32_t TILEXR_UDMA_DEMO_DEBUG_ALLREDUCE_SUM = TILEXR_UDMA_DEMO_DEBUG_ALLREDUCE_SCATTER + 1; + +namespace { + +__aicore__ inline uint64_t AllToAllPayloadBytes(int32_t elementsPerPeer) +{ + return static_cast(elementsPerPeer) * sizeof(int32_t); +} + +__aicore__ inline uint64_t AllToAllDataAsFlagSegmentBytes(uint64_t payloadBytes) +{ + return static_cast(TileXR::DataAsFlagBlockCountForPayloadBytes(payloadBytes)) * + TileXR::DATA_AS_FLAG_BLOCK_BYTES; +} + +} // namespace extern "C" __global__ __aicore__ void tilexr_udma_all_gather_kernel( GM_ADDR commArgsGM, GM_ADDR dataGM, GM_ADDR debugGM, int32_t elementsPerRank) @@ -80,6 +119,186 @@ extern "C" __global__ __aicore__ void tilexr_udma_put_signal_kernel( } } +extern "C" __global__ __aicore__ void tilexr_udma_all_to_all_kernel( + GM_ADDR commArgsGM, GM_ADDR inputGM, GM_ADDR outputGM, GM_ADDR debugGM, + int32_t elementsPerPeer, uint64_t outputByteOffset) +{ + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto input = reinterpret_cast<__gm__ int32_t*>(inputGM); + auto output = reinterpret_cast<__gm__ int32_t*>(outputGM); + auto debug = reinterpret_cast<__gm__ int32_t*>(debugGM); + + int32_t rank = args->rank; + int32_t rankSize = args->rankSize; + bool enabled = TileXR::UDMARegistryEnabled(args); + + if (debug != nullptr) { + debug[0] = TILEXR_UDMA_DEMO_MAGIC; + debug[1] = rank; + debug[2] = rankSize; + debug[3] = enabled ? 1 : 0; + debug[4] = elementsPerPeer; + debug[5] = static_cast(outputByteOffset); + } + if (!enabled) { + return; + } + + const uint64_t payloadBytes = AllToAllPayloadBytes(elementsPerPeer); + auto selfSrc = input + static_cast(rank) * elementsPerPeer; + auto selfDst = output + static_cast(rank) * elementsPerPeer; + for (int32_t i = 0; i < elementsPerPeer; ++i) { + selfDst[i] = selfSrc[i]; + } + + uint32_t bytes = static_cast(payloadBytes); + for (int32_t peer = 0; peer < rankSize; ++peer) { + if (peer == rank) { + continue; + } + auto localSrc = input + static_cast(peer) * elementsPerPeer; + uint64_t remoteOffset = outputByteOffset + + static_cast(rank) * payloadBytes; + auto registry = TileXR::GetUDMARegistry(args); + auto udmaInfo = TileXR::GetUDMAInfo(args); + auto wqCtx = TileXR::UDMAGetWQCtx(udmaInfo, peer, 0); + auto remoteMemInfo = TileXR::UDMAGetRemoteMemInfo(udmaInfo, peer); + bool rangeValid = TileXR::UDMARegisteredRangeValid(registry, peer, remoteOffset, bytes); + uint32_t wqeBefore = ld_dev(reinterpret_cast<__gm__ uint32_t*>(wqCtx->wqeCntAddr), 0); + if (debug != nullptr && peer < 16) { + debug[TILEXR_UDMA_DEMO_DEBUG_RANGE_VALID_BASE + peer] = rangeValid ? 1 : 0; + debug[TILEXR_UDMA_DEMO_DEBUG_WQE_BEFORE_BASE + peer] = static_cast(wqeBefore); + debug[TILEXR_UDMA_DEMO_DEBUG_LOCAL_TOKEN_BASE + peer] = static_cast(wqCtx->localTokenId); + debug[TILEXR_UDMA_DEMO_DEBUG_REMOTE_BASE_LOW_BASE + peer] = + static_cast(reinterpret_cast(registry->regions[peer].base) & 0xFFFFFFFFU); + debug[TILEXR_UDMA_DEMO_DEBUG_MEM_ADDR_LOW_BASE + peer] = + static_cast(remoteMemInfo->addr & 0xFFFFFFFFU); + debug[TILEXR_UDMA_DEMO_DEBUG_TPN_BASE + peer] = static_cast(remoteMemInfo->tpn); + } + TileXR::UDMAPutNbi(args, peer, localSrc, remoteOffset, bytes); + uint32_t wqeAfter = ld_dev(reinterpret_cast<__gm__ uint32_t*>(wqCtx->wqeCntAddr), 0); + if (debug != nullptr && peer < 16) { + debug[TILEXR_UDMA_DEMO_DEBUG_WQE_AFTER_BASE + peer] = static_cast(wqeAfter); + } + uint32_t status = TileXR::UDMAQuietStatus(args, peer); + if (debug != nullptr) { + debug[TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + peer] = static_cast(status); + } + } +} + +extern "C" __global__ __aicore__ void tilexr_all_to_all_ipc_scatter_kernel( + GM_ADDR commArgsGM, GM_ADDR inputGM, GM_ADDR debugGM, int32_t elementsPerPeer) +{ + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto input = reinterpret_cast<__gm__ int32_t*>(inputGM); + auto debug = reinterpret_cast<__gm__ int32_t*>(debugGM); + + int32_t rank = args->rank; + int32_t rankSize = args->rankSize; + const uint64_t payloadBytes = AllToAllPayloadBytes(elementsPerPeer); + const uint64_t segmentBytes = AllToAllDataAsFlagSegmentBytes(payloadBytes); + auto inputBytes = reinterpret_cast<__gm__ uint8_t*>(input); + + AscendC::TPipe pipe; + AscendC::TBuf tBuf; + pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_DATA_AS_FLAG_UB_BYTES); + AscendC::LocalTensor scratch = tBuf.Get(); + if (TileXR::DataAsFlagInit(scratch) == 0U) { + if (debug != nullptr) { + debug[TILEXR_UDMA_DEMO_DEBUG_IPC_SCATTER] = -1; + } + return; + } + + for (int32_t dstRank = 0; dstRank < rankSize; ++dstRank) { + auto localSrc = inputBytes + static_cast(dstRank) * payloadBytes; + auto remoteDst = reinterpret_cast<__gm__ uint8_t*>( + args->peerMems[dstRank] + TILEXR_UDMA_DEMO_DATA_AS_FLAG_STAGING_OFFSET + + static_cast(rank) * segmentBytes); + (void)TileXR::DataAsFlagSend(remoteDst, localSrc, payloadBytes, scratch); + } + if (debug != nullptr) { + debug[TILEXR_UDMA_DEMO_DEBUG_IPC_SCATTER] = 1; + } +} + +extern "C" __global__ __aicore__ void tilexr_all_to_all_ipc_gather_kernel( + GM_ADDR commArgsGM, GM_ADDR outputGM, GM_ADDR debugGM, int32_t elementsPerPeer) +{ + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto output = reinterpret_cast<__gm__ int32_t*>(outputGM); + auto debug = reinterpret_cast<__gm__ int32_t*>(debugGM); + + const uint64_t payloadBytes = AllToAllPayloadBytes(elementsPerPeer); + const uint64_t segmentBytes = AllToAllDataAsFlagSegmentBytes(payloadBytes); + auto outputBytes = reinterpret_cast<__gm__ uint8_t*>(output); + auto localBase = reinterpret_cast<__gm__ uint8_t*>( + args->peerMems[args->rank] + TILEXR_UDMA_DEMO_DATA_AS_FLAG_STAGING_OFFSET); + + AscendC::TPipe pipe; + AscendC::TBuf tBuf; + pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_DATA_AS_FLAG_UB_BYTES); + AscendC::LocalTensor scratch = tBuf.Get(); + + for (int32_t srcRank = 0; srcRank < args->rankSize; ++srcRank) { + auto localSrc = localBase + static_cast(srcRank) * segmentBytes; + auto localDst = outputBytes + static_cast(srcRank) * payloadBytes; + if (!TileXR::DataAsFlagCheckAndRecv(localSrc, payloadBytes, localDst, scratch)) { + if (debug != nullptr) { + debug[TILEXR_UDMA_DEMO_DEBUG_IPC_GATHER] = -1; + } + return; + } + } + if (debug != nullptr) { + debug[TILEXR_UDMA_DEMO_DEBUG_IPC_GATHER] = 1; + } +} + +extern "C" __global__ __aicore__ void tilexr_all_reduce_ipc_scatter_kernel( + GM_ADDR commArgsGM, GM_ADDR inputGM, GM_ADDR debugGM, int32_t elementsPerRank) +{ + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto input = reinterpret_cast<__gm__ int32_t*>(inputGM); + auto debug = reinterpret_cast<__gm__ int32_t*>(debugGM); + + int32_t rank = args->rank; + int32_t rankSize = args->rankSize; + for (int32_t dstRank = 0; dstRank < rankSize; ++dstRank) { + auto remoteBase = reinterpret_cast<__gm__ int32_t*>( + args->peerMems[dstRank] + TILEXR_UDMA_DEMO_IPC_STAGING_OFFSET); + auto remoteDst = remoteBase + rank * elementsPerRank; + for (int32_t i = 0; i < elementsPerRank; ++i) { + remoteDst[i] = input[i]; + } + } + if (debug != nullptr) { + debug[TILEXR_UDMA_DEMO_DEBUG_ALLREDUCE_SCATTER] = 1; + } +} + +extern "C" __global__ __aicore__ void tilexr_all_reduce_ipc_sum_kernel( + GM_ADDR commArgsGM, GM_ADDR outputGM, GM_ADDR debugGM, int32_t elementsPerRank) +{ + auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM); + auto output = reinterpret_cast<__gm__ int32_t*>(outputGM); + auto debug = reinterpret_cast<__gm__ int32_t*>(debugGM); + + auto localBase = reinterpret_cast<__gm__ int32_t*>( + args->peerMems[args->rank] + TILEXR_UDMA_DEMO_IPC_STAGING_OFFSET); + for (int32_t i = 0; i < elementsPerRank; ++i) { + int32_t sum = 0; + for (int32_t srcRank = 0; srcRank < args->rankSize; ++srcRank) { + sum += localBase[srcRank * elementsPerRank + i]; + } + output[i] = sum; + } + if (debug != nullptr) { + debug[TILEXR_UDMA_DEMO_DEBUG_ALLREDUCE_SUM] = 1; + } +} + extern "C" __global__ __aicore__ void tilexr_udma_registered_smoke_kernel( GM_ADDR commArgsGM, GM_ADDR localGM, GM_ADDR debugGM, uint32_t bytes, uint64_t signal) { @@ -125,6 +344,42 @@ void launch_tilexr_udma_put_signal( commArgs, data, signals, debug, elementsPerRank, signal); } +void launch_tilexr_udma_all_to_all( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR input, GM_ADDR output, + GM_ADDR debug, int32_t elementsPerPeer, uint64_t outputByteOffset) +{ + tilexr_udma_all_to_all_kernel<<>>( + commArgs, input, output, debug, elementsPerPeer, outputByteOffset); +} + +void launch_tilexr_all_to_all_ipc_scatter( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR input, GM_ADDR debug, int32_t elementsPerPeer) +{ + tilexr_all_to_all_ipc_scatter_kernel<<>>( + commArgs, input, debug, elementsPerPeer); +} + +void launch_tilexr_all_to_all_ipc_gather( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR output, GM_ADDR debug, int32_t elementsPerPeer) +{ + tilexr_all_to_all_ipc_gather_kernel<<>>( + commArgs, output, debug, elementsPerPeer); +} + +void launch_tilexr_all_reduce_ipc_scatter( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR input, GM_ADDR debug, int32_t elementsPerRank) +{ + tilexr_all_reduce_ipc_scatter_kernel<<>>( + commArgs, input, debug, elementsPerRank); +} + +void launch_tilexr_all_reduce_ipc_sum( + uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR output, GM_ADDR debug, int32_t elementsPerRank) +{ + tilexr_all_reduce_ipc_sum_kernel<<>>( + commArgs, output, debug, elementsPerRank); +} + void launch_tilexr_udma_registered_smoke( uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR local, GM_ADDR debug, uint32_t bytes, uint64_t signal) { diff --git a/tests/udma/run_tests.sh b/tests/udma/run_tests.sh index f020b58..255bf7f 100755 --- a/tests/udma/run_tests.sh +++ b/tests/udma/run_tests.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# 运行 UDMA 测试 +# Run UDMA tests. # set -e @@ -9,11 +9,13 @@ SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) TILEXR_ROOT="${SCRIPT_DIR}/../.." INSTALL_DIR="${SCRIPT_DIR}/install" -# 加载环境 source "${TILEXR_ROOT}/scripts/common_env.sh" -# 设置 LD_LIBRARY_PATH:优先使用当前仓库刚编译安装的库,避免被 /usr/local/lib 中的旧库覆盖 -export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${TILEXR_ROOT}/install/lib:/usr/local/lib:${LD_LIBRARY_PATH}" +if [ -x /usr/local/mpi/bin/mpirun ]; then + export PATH="/usr/local/mpi/bin:${PATH}" +fi + +export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib64:${TILEXR_ROOT}/install/lib:${TILEXR_ROOT}/install/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}" echo "==========================================" echo " Running UDMA Tests" @@ -26,7 +28,8 @@ detect_ok_npus() { local ids=() for id in $(seq 0 15); do local health - health=$(npu-smi info -t health -i "${id}" 2>/dev/null | awk -F: '/Health Status/ {gsub(/^[ \t]+|[ \t]+$/, "", $2); print $2; exit}') + health=$(npu-smi info -t health -i "${id}" 2>/dev/null | + awk -F: '/Health Status/ {gsub(/^[ \t]+|[ \t]+$/, "", $2); print $2; exit}') if [ "${health}" = "OK" ]; then ids+=("${id}") fi @@ -43,15 +46,23 @@ if [ -n "${TILEXR_TEST_DEVICES:-}" ]; then echo "TILEXR_TEST_DEVICES: ${TILEXR_TEST_DEVICES}" fi -# 检查测试二进制是否存在 -if [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma_transport_layout" ] || - [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma_registry" ] || - [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma" ]; then - echo "ERROR: Test binaries not found. Please run build.sh first." - exit 1 -fi +required_bins=( + test_tilexr_udma_transport_layout + test_tilexr_udma_registry + test_tilexr_udma_alltoall_layout + test_tilexr_udma_allreduce_layout + test_tilexr_chip_map_sources + test_tilexr_ipc_pid_mode_sources + test_tilexr_udma +) + +for bin in "${required_bins[@]}"; do + if [ ! -f "${INSTALL_DIR}/bin/${bin}" ]; then + echo "ERROR: ${INSTALL_DIR}/bin/${bin} not found. Please run build.sh first." + exit 1 + fi +done -# 测试 1: UDMA info layout 单元测试(host-only) echo "==========================================" echo "Test 1: TileXR UDMA Transport Layout Unit Test" echo "==========================================" @@ -59,7 +70,6 @@ echo "==========================================" TEST1_RESULT=$? echo "" -# 测试 2: TileXR UDMA registry 单元测试(host-only) echo "==========================================" echo "Test 2: TileXR UDMA Registry Unit Test" echo "==========================================" @@ -67,24 +77,48 @@ echo "==========================================" TEST2_RESULT=$? echo "" -# 测试 3: TileXR 集成测试(单进程,单卡) echo "==========================================" -echo "Test 3: TileXR Integration Tests (Single Process)" +echo "Test 3: TileXR UDMA All-To-All Layout Unit Test" +echo "==========================================" +"${INSTALL_DIR}/bin/test_tilexr_udma_alltoall_layout" +TEST3_RESULT=$? +echo "" + +echo "==========================================" +echo "Test 4: TileXR UDMA All-Reduce Layout Unit Test" +echo "==========================================" +"${INSTALL_DIR}/bin/test_tilexr_udma_allreduce_layout" +TEST4_RESULT=$? +echo "" + +echo "==========================================" +echo "Test 5: TileXR Chip Map Source Unit Test" +echo "==========================================" +"${INSTALL_DIR}/bin/test_tilexr_chip_map_sources" +TEST5_RESULT=$? +echo "" + +echo "==========================================" +echo "Test 6: TileXR IPC PID Mode Source Unit Test" +echo "==========================================" +"${INSTALL_DIR}/bin/test_tilexr_ipc_pid_mode_sources" +TEST6_RESULT=$? +echo "" + +echo "==========================================" +echo "Test 7: TileXR Integration Tests (Single Process)" echo "==========================================" export RANK=0 export RANK_SIZE=1 "${INSTALL_DIR}/bin/test_tilexr_udma" -TEST3_RESULT=$? +TEST7_RESULT=$? echo "" -# 测试 4: TileXR 多进程测试(需要 mpirun) echo "==========================================" -echo "Test 4: TileXR Multi-Process Tests (MPI)" +echo "Test 8: TileXR Multi-Process Tests (MPI)" echo "==========================================" -# 检查是否有 mpirun -if command -v mpirun &> /dev/null; then - # 检测可用的 NPU 数量 +if command -v mpirun >/dev/null 2>&1; then NPU_COUNT=${TILEXR_ASCEND_DEV_NUM:-0} echo "Detected ${NPU_COUNT} NPU(s)" @@ -98,30 +132,33 @@ if command -v mpirun &> /dev/null; then unset RANK unset RANK_SIZE mpirun -n 2 "${INSTALL_DIR}/bin/test_tilexr_udma" - TEST4_RESULT=$? + TEST8_RESULT=$? else echo "SKIP: Need at least 2 usable NPUs for multi-rank test" - TEST4_RESULT=0 + TEST8_RESULT=0 fi else echo "SKIP: mpirun not found, skipping multi-process tests" - TEST4_RESULT=0 + TEST8_RESULT=0 fi echo "" -# 汇总结果 echo "==========================================" echo " Test Results Summary" echo "==========================================" -echo "Test 1 (UDMA Layout): $([ $TEST1_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')" -echo "Test 2 (UDMA Registry): $([ $TEST2_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')" -echo "Test 3 (TileXR Single): $([ $TEST3_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')" -echo "Test 4 (TileXR Multi): $([ $TEST4_RESULT -eq 0 ] && echo 'PASS' || echo 'SKIP/FAIL')" +echo "Test 1 (UDMA Layout): $([ ${TEST1_RESULT} -eq 0 ] && echo 'PASS' || echo 'FAIL')" +echo "Test 2 (UDMA Registry): $([ ${TEST2_RESULT} -eq 0 ] && echo 'PASS' || echo 'FAIL')" +echo "Test 3 (AllToAll Layout): $([ ${TEST3_RESULT} -eq 0 ] && echo 'PASS' || echo 'FAIL')" +echo "Test 4 (AllReduce Layout): $([ ${TEST4_RESULT} -eq 0 ] && echo 'PASS' || echo 'FAIL')" +echo "Test 5 (Chip Map): $([ ${TEST5_RESULT} -eq 0 ] && echo 'PASS' || echo 'FAIL')" +echo "Test 6 (IPC PID Mode): $([ ${TEST6_RESULT} -eq 0 ] && echo 'PASS' || echo 'FAIL')" +echo "Test 7 (TileXR Single): $([ ${TEST7_RESULT} -eq 0 ] && echo 'PASS' || echo 'FAIL')" +echo "Test 8 (TileXR Multi): $([ ${TEST8_RESULT} -eq 0 ] && echo 'PASS' || echo 'SKIP/FAIL')" echo "==========================================" -# 返回失败状态 -if [ $TEST1_RESULT -ne 0 ] || [ $TEST2_RESULT -ne 0 ] || [ $TEST3_RESULT -ne 0 ] || - [ $TEST4_RESULT -ne 0 ]; then +if [ ${TEST1_RESULT} -ne 0 ] || [ ${TEST2_RESULT} -ne 0 ] || [ ${TEST3_RESULT} -ne 0 ] || + [ ${TEST4_RESULT} -ne 0 ] || [ ${TEST5_RESULT} -ne 0 ] || [ ${TEST6_RESULT} -ne 0 ] || + [ ${TEST7_RESULT} -ne 0 ] || [ ${TEST8_RESULT} -ne 0 ]; then exit 1 fi diff --git a/tests/udma/unit/test_tilexr_chip_map_sources.cpp b/tests/udma/unit/test_tilexr_chip_map_sources.cpp new file mode 100644 index 0000000..25d06cc --- /dev/null +++ b/tests/udma/unit/test_tilexr_chip_map_sources.cpp @@ -0,0 +1,45 @@ +#include +#include +#include +#include + +#ifndef TILEXR_SOURCE_ROOT +#define TILEXR_SOURCE_ROOT "." +#endif + +namespace { + +int g_failures = 0; + +#define CHECK_CONTAINS(text, needle) \ + do { \ + if ((text).find(needle) == std::string::npos) { \ + std::cerr << "CHECK_CONTAINS failed at line " << __LINE__ << ": " << needle << std::endl; \ + ++g_failures; \ + } \ + } while (0) + +std::string ReadFile(const std::string& path) +{ + std::ifstream in(path.c_str()); + std::ostringstream out; + out << in.rdbuf(); + return out.str(); +} + +} // namespace + +int main() +{ + const std::string internal = ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/comm/tilexr_internal.cpp"); + CHECK_CONTAINS(internal, "\"Ascend950DT_9592\""); + CHECK_CONTAINS(internal, "\"Ascend950PR_9599\""); + CHECK_CONTAINS(internal, "ChipName::CHIP_950"); + + if (g_failures != 0) { + std::cerr << g_failures << " chip map source checks failed" << std::endl; + return 1; + } + std::cout << "TileXR chip map source checks passed" << std::endl; + return 0; +} diff --git a/tests/udma/unit/test_tilexr_ipc_pid_mode_sources.cpp b/tests/udma/unit/test_tilexr_ipc_pid_mode_sources.cpp new file mode 100644 index 0000000..037647e --- /dev/null +++ b/tests/udma/unit/test_tilexr_ipc_pid_mode_sources.cpp @@ -0,0 +1,68 @@ +#include +#include +#include +#include + +#ifndef TILEXR_SOURCE_ROOT +#define TILEXR_SOURCE_ROOT "." +#endif + +namespace { + +int g_failures = 0; + +#define CHECK_CONTAINS(text, needle) \ + do { \ + if ((text).find(needle) == std::string::npos) { \ + std::cerr << "CHECK_CONTAINS failed at line " << __LINE__ << ": " << needle << std::endl; \ + ++g_failures; \ + } \ + } while (0) + +#define CHECK_NOT_CONTAINS(text, needle) \ + do { \ + if ((text).find(needle) != std::string::npos) { \ + std::cerr << "CHECK_NOT_CONTAINS failed at line " << __LINE__ << ": " << needle << std::endl; \ + ++g_failures; \ + } \ + } while (0) + +std::string ReadFile(const std::string& path) +{ + std::ifstream in(path.c_str()); + std::ostringstream out; + out << in.rdbuf(); + return out.str(); +} + +} // namespace + +int main() +{ + const std::string comm = ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/comm/tilexr_comm.cpp"); + const std::string cmake = ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/tests/udma/CMakeLists.txt"); + CHECK_CONTAINS(comm, "TILEXR_IPC_PID_MODE"); + CHECK_CONTAINS(comm, "physicalInfo_.chipName < ChipName::CHIP_950"); + CHECK_CONTAINS(comm, "rtSetIpcMemPid"); + CHECK_CONTAINS(comm, "rtSetIpcMemorySuperPodPid"); + CHECK_CONTAINS(comm, "fallback to rtSetIpcMemPid"); + CHECK_CONTAINS(comm, "OpenIpcMem failed after sdid setup, retry with pid setup"); + CHECK_CONTAINS(comm, "\"pid_retry\""); + CHECK_CONTAINS(comm, "SetMemoryName(retryName)"); + CHECK_CONTAINS(comm, "GetName(retryName, names)"); + CHECK_NOT_CONTAINS(comm, "do not support pcie > 2 rank"); + CHECK_CONTAINS(cmake, "TILEXR_UDMA_FORCE_ENABLE"); + + const std::string transport = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/comm/udma/tilexr_udma_transport.cpp"); + const std::string udma = ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/include/tilexr_udma.h"); + CHECK_CONTAINS(transport, "RaCtxRmemImport"); + CHECK_CONTAINS(udma, "UDMAQuietStatus"); + + if (g_failures != 0) { + std::cerr << g_failures << " IPC PID mode source checks failed" << std::endl; + return 1; + } + std::cout << "TileXR IPC PID mode source checks passed" << std::endl; + return 0; +} diff --git a/tests/udma/unit/test_tilexr_udma_allreduce_layout.cpp b/tests/udma/unit/test_tilexr_udma_allreduce_layout.cpp new file mode 100644 index 0000000..c0fcd39 --- /dev/null +++ b/tests/udma/unit/test_tilexr_udma_allreduce_layout.cpp @@ -0,0 +1,125 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "demo/tilexr_udma_allreduce_layout.h" + +#ifndef TILEXR_SOURCE_ROOT +#define TILEXR_SOURCE_ROOT "." +#endif + +namespace { + +int g_failures = 0; + +#define CHECK_EQ(lhs, rhs) \ + do { \ + auto lhsValue = (lhs); \ + auto rhsValue = (rhs); \ + if (lhsValue != rhsValue) { \ + std::cerr << "CHECK_EQ failed at line " << __LINE__ << ": " #lhs " != " #rhs \ + << " (" << lhsValue << " vs " << rhsValue << ")" << std::endl; \ + ++g_failures; \ + } \ + } while (0) + +#define CHECK_CONTAINS(text, needle) \ + do { \ + if ((text).find(needle) == std::string::npos) { \ + std::cerr << "CHECK_CONTAINS failed at line " << __LINE__ << ": " << needle << std::endl; \ + ++g_failures; \ + } \ + } while (0) + +std::string ReadFile(const std::string& path) +{ + std::ifstream in(path.c_str()); + std::ostringstream out; + out << in.rdbuf(); + return out.str(); +} + +void TestAllReduceInputPattern() +{ + constexpr int rank = 3; + constexpr int32_t elementsPerRank = 4; + std::vector input(elementsPerRank * 2, -1); + + TileXR::Demo::FillAllReduceInput(input, rank, elementsPerRank); + + for (int32_t i = 0; i < elementsPerRank; ++i) { + CHECK_EQ(input[static_cast(i)], TileXR::Demo::AllReduceValue(rank)); + } + CHECK_EQ(input[static_cast(elementsPerRank)], -1); +} + +void TestAllReduceExpectedEightRankSum() +{ + CHECK_EQ(TileXR::Demo::AllReduceExpectedSum(8), 8028); +} + +void TestAllReduceOutputValidation() +{ + constexpr int rankSize = 4; + constexpr int32_t elementsPerRank = 3; + std::vector output(elementsPerRank, TileXR::Demo::AllReduceExpectedSum(rankSize)); + + CHECK_EQ(TileXR::Demo::ValidateAllReduceOutput(output, rankSize, elementsPerRank), true); + output[1] = -1; + CHECK_EQ(TileXR::Demo::ValidateAllReduceOutput(output, rankSize, elementsPerRank), false); +} + +void TestBuildAllReduceOutput() +{ + constexpr int rankSize = 5; + constexpr int32_t elementsPerRank = 2; + std::vector allInputs(static_cast(rankSize) * elementsPerRank, -1); + + for (int rank = 0; rank < rankSize; ++rank) { + std::fill(allInputs.begin() + static_cast(rank) * elementsPerRank, + allInputs.begin() + static_cast(rank + 1) * elementsPerRank, + TileXR::Demo::AllReduceValue(rank)); + } + + std::vector output(elementsPerRank, -1); + TileXR::Demo::BuildAllReduceOutputFromInputs(allInputs, rankSize, elementsPerRank, output); + + CHECK_EQ(TileXR::Demo::ValidateAllReduceOutput(output, rankSize, elementsPerRank), true); +} + +void TestDemoAllReduceSourceHooks() +{ + const std::string demo = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/tests/udma/demo/tilexr_udma_demo.cpp"); + const std::string kernel = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/tests/udma/demo/tilexr_udma_demo_kernel.cpp"); + const std::string script = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/tests/udma/demo/run_tilexr_udma_demo.sh"); + + CHECK_CONTAINS(demo, "testType == 3"); + CHECK_CONTAINS(demo, "ValidateAllReduceData"); + CHECK_CONTAINS(kernel, "tilexr_all_reduce_ipc_scatter_kernel"); + CHECK_CONTAINS(kernel, "tilexr_all_reduce_ipc_sum_kernel"); + CHECK_CONTAINS(script, "3=all-reduce"); +} + +} // namespace + +int main() +{ + TestAllReduceInputPattern(); + TestAllReduceExpectedEightRankSum(); + TestAllReduceOutputValidation(); + TestBuildAllReduceOutput(); + TestDemoAllReduceSourceHooks(); + if (g_failures != 0) { + std::cerr << g_failures << " all-reduce layout checks failed" << std::endl; + return 1; + } + std::cout << "TileXR UDMA all-reduce layout checks passed" << std::endl; + return 0; +} diff --git a/tests/udma/unit/test_tilexr_udma_alltoall_layout.cpp b/tests/udma/unit/test_tilexr_udma_alltoall_layout.cpp new file mode 100644 index 0000000..0219f9a --- /dev/null +++ b/tests/udma/unit/test_tilexr_udma_alltoall_layout.cpp @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "demo/tilexr_udma_alltoall_layout.h" + +#ifndef TILEXR_SOURCE_ROOT +#define TILEXR_SOURCE_ROOT "." +#endif + +namespace { + +int g_failures = 0; + +#define CHECK_EQ(lhs, rhs) \ + do { \ + auto lhsValue = (lhs); \ + auto rhsValue = (rhs); \ + if (lhsValue != rhsValue) { \ + std::cerr << "CHECK_EQ failed at line " << __LINE__ << ": " #lhs " != " #rhs \ + << " (" << lhsValue << " vs " << rhsValue << ")" << std::endl; \ + ++g_failures; \ + } \ + } while (0) + +#define CHECK_CONTAINS(text, needle) \ + do { \ + if ((text).find(needle) == std::string::npos) { \ + std::cerr << "CHECK_CONTAINS failed at line " << __LINE__ << ": " << needle << std::endl; \ + ++g_failures; \ + } \ + } while (0) + +std::string ReadFile(const std::string& path) +{ + std::ifstream in(path.c_str()); + std::ostringstream out; + out << in.rdbuf(); + return out.str(); +} + +void TestAllToAllInputPattern() +{ + constexpr int rank = 2; + constexpr int rankSize = 4; + constexpr int32_t elementsPerPeer = 3; + std::vector input(static_cast(rankSize) * elementsPerPeer, -1); + + TileXR::Demo::FillAllToAllInput(input, rank, rankSize, elementsPerPeer); + + for (int dstRank = 0; dstRank < rankSize; ++dstRank) { + int32_t expected = TileXR::Demo::AllToAllValue(rank, dstRank); + for (int32_t elem = 0; elem < elementsPerPeer; ++elem) { + CHECK_EQ(input[static_cast(dstRank) * elementsPerPeer + elem], expected); + } + } +} + +void TestAllToAllOutputValidation() +{ + constexpr int rank = 1; + constexpr int rankSize = 3; + constexpr int32_t elementsPerPeer = 2; + std::vector output(static_cast(rankSize) * elementsPerPeer, -1); + + for (int srcRank = 0; srcRank < rankSize; ++srcRank) { + std::fill(output.begin() + static_cast(srcRank) * elementsPerPeer, + output.begin() + static_cast(srcRank + 1) * elementsPerPeer, + TileXR::Demo::AllToAllValue(srcRank, rank)); + } + + CHECK_EQ(TileXR::Demo::ValidateAllToAllOutput(output, rank, rankSize, elementsPerPeer), true); + output[static_cast(2) * elementsPerPeer + 1] = 123; + CHECK_EQ(TileXR::Demo::ValidateAllToAllOutput(output, rank, rankSize, elementsPerPeer), false); +} + +void TestBuildAllToAllOutput() +{ + constexpr int rankSize = 3; + constexpr int32_t elementsPerPeer = 2; + std::vector allInputs(static_cast(rankSize) * rankSize * elementsPerPeer, -1); + + for (int srcRank = 0; srcRank < rankSize; ++srcRank) { + std::vector oneInput(static_cast(rankSize) * elementsPerPeer, -1); + TileXR::Demo::FillAllToAllInput(oneInput, srcRank, rankSize, elementsPerPeer); + std::copy(oneInput.begin(), oneInput.end(), + allInputs.begin() + static_cast(srcRank) * rankSize * elementsPerPeer); + } + + std::vector output(static_cast(rankSize) * elementsPerPeer, -1); + TileXR::Demo::BuildAllToAllOutputFromInputs(allInputs, 2, rankSize, elementsPerPeer, output); + + CHECK_EQ(TileXR::Demo::ValidateAllToAllOutput(output, 2, rankSize, elementsPerPeer), true); +} + +void TestAllToAllMaxRank256With64MiBPerRank() +{ + constexpr int rankSize = 256; + constexpr size_t perRankBytes = 64ULL * 1024ULL * 1024ULL; + constexpr int32_t elementsPerPeer = + static_cast(perRankBytes / (sizeof(int32_t) * rankSize)); + CHECK_EQ(elementsPerPeer, 65536); + + std::vector buffer(static_cast(rankSize) * elementsPerPeer, -1); + const int sampleRanks[] = {0, 1, 127, 255}; + for (int rank : sampleRanks) { + TileXR::Demo::FillAllToAllInput(buffer, rank, rankSize, elementsPerPeer); + CHECK_EQ(buffer[0], TileXR::Demo::AllToAllValue(rank, 0)); + CHECK_EQ(buffer[static_cast(rankSize - 1) * elementsPerPeer], + TileXR::Demo::AllToAllValue(rank, rankSize - 1)); + CHECK_EQ(buffer[static_cast(rankSize) * elementsPerPeer - 1], + TileXR::Demo::AllToAllValue(rank, rankSize - 1)); + + for (int srcRank = 0; srcRank < rankSize; ++srcRank) { + std::fill(buffer.begin() + static_cast(srcRank) * elementsPerPeer, + buffer.begin() + static_cast(srcRank + 1) * elementsPerPeer, + TileXR::Demo::AllToAllValue(srcRank, rank)); + } + CHECK_EQ(TileXR::Demo::ValidateAllToAllOutput(buffer, rank, rankSize, elementsPerPeer), true); + buffer[static_cast(rankSize) * elementsPerPeer - 1] = -1; + CHECK_EQ(TileXR::Demo::ValidateAllToAllOutput(buffer, rank, rankSize, elementsPerPeer), false); + } +} + +void TestDemoDebugLayoutSource() +{ + const std::string demo = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/tests/udma/demo/tilexr_udma_demo.cpp"); + const std::string kernel = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/tests/udma/demo/tilexr_udma_demo_kernel.cpp"); + + CHECK_CONTAINS(demo, "kDebugUdmaStatusBase + TileXR::TILEXR_MAX_RANK_SIZE"); + CHECK_CONTAINS(demo, "kDebugIpcGather + 1"); + CHECK_CONTAINS(kernel, "TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + TileXR::TILEXR_MAX_RANK_SIZE"); + CHECK_CONTAINS(kernel, "TILEXR_UDMA_DEMO_DEBUG_UDMA_STATUS_BASE + peer"); +} + +void TestAllToAllDataAsFlagSource() +{ + const std::string demo = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/tests/udma/demo/tilexr_udma_demo.cpp"); + const std::string kernel = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/tests/udma/demo/tilexr_udma_demo_kernel.cpp"); + + CHECK_CONTAINS(demo, "useAllToAllDataAsFlagIpc"); + CHECK_CONTAINS(demo, "TILEXR_DEMO_ALLTOALL_USE_UDMA"); + CHECK_CONTAINS(demo, "skip TileXRUDMARegister for alltoall data-as-flag IPC path"); + CHECK_CONTAINS(demo, "forceAllToAllIpcFallback"); + CHECK_CONTAINS(demo, "strictAllToAllUdma"); + CHECK_CONTAINS(demo, "ERROR: strict alltoall UDMA registration failed"); + CHECK_CONTAINS(demo, "ERROR: strict alltoall UDMA CQ incomplete"); + CHECK_CONTAINS(demo, "TileXRUDMARegister failed; use alltoall data-as-flag IPC fallback"); + CHECK_CONTAINS(demo, "skip all-to-all UDMA kernel; use data-as-flag IPC fallback"); + CHECK_CONTAINS(kernel, "#include \"tilexr_data_as_flag.h\""); + CHECK_CONTAINS(kernel, "TILEXR_UDMA_DEMO_DATA_AS_FLAG_STAGING_OFFSET"); + CHECK_CONTAINS(kernel, "DataAsFlagBlockCountForPayloadBytes"); + CHECK_CONTAINS(kernel, "DataAsFlagInit"); + CHECK_CONTAINS(kernel, "DataAsFlagSend"); + CHECK_CONTAINS(kernel, "DataAsFlagCheckAndRecv"); +} + +} // namespace + +int main() +{ + TestAllToAllInputPattern(); + TestAllToAllOutputValidation(); + TestBuildAllToAllOutput(); + TestAllToAllMaxRank256With64MiBPerRank(); + TestDemoDebugLayoutSource(); + TestAllToAllDataAsFlagSource(); + if (g_failures != 0) { + std::cerr << g_failures << " all-to-all layout checks failed" << std::endl; + return 1; + } + std::cout << "TileXR UDMA all-to-all layout checks passed" << std::endl; + return 0; +} diff --git a/tests/udma/unit/test_tilexr_udma_registry.cpp b/tests/udma/unit/test_tilexr_udma_registry.cpp index 518c3a4..0c4ffc5 100644 --- a/tests/udma/unit/test_tilexr_udma_registry.cpp +++ b/tests/udma/unit/test_tilexr_udma_registry.cpp @@ -47,11 +47,17 @@ void TestRemoteAddressCalculation() static_cast(0x200040)); } +void TestRankScaleLimit() +{ + CHECK_EQ(TileXR::TILEXR_MAX_RANK_SIZE, 256); +} + } // namespace int main() { TestRemoteAddressCalculation(); + TestRankScaleLimit(); if (g_failures != 0) { std::cerr << g_failures << " registry checks failed" << std::endl; return 1; diff --git a/tests/udma/unit/test_tilexr_udma_transport_layout.cpp b/tests/udma/unit/test_tilexr_udma_transport_layout.cpp index cb14015..2fc3bfa 100644 --- a/tests/udma/unit/test_tilexr_udma_transport_layout.cpp +++ b/tests/udma/unit/test_tilexr_udma_transport_layout.cpp @@ -1,9 +1,16 @@ #include +#include #include +#include +#include #include #include "udma/tilexr_udma_layout.h" +#ifndef TILEXR_SOURCE_ROOT +#define TILEXR_SOURCE_ROOT "." +#endif + namespace { int g_failures = 0; @@ -27,6 +34,30 @@ int g_failures = 0; } \ } while (0) +#define CHECK_CONTAINS(text, needle) \ + do { \ + if ((text).find(needle) == std::string::npos) { \ + std::cerr << "CHECK_CONTAINS failed at line " << __LINE__ << ": " << needle << std::endl; \ + ++g_failures; \ + } \ + } while (0) + +#define CHECK_NOT_CONTAINS(text, needle) \ + do { \ + if ((text).find(needle) != std::string::npos) { \ + std::cerr << "CHECK_NOT_CONTAINS failed at line " << __LINE__ << ": " << needle << std::endl; \ + ++g_failures; \ + } \ + } while (0) + +std::string ReadFile(const std::string& path) +{ + std::ifstream in(path.c_str()); + std::ostringstream out; + out << in.rdbuf(); + return out.str(); +} + void TestHostLayoutUsesDeviceRelativePointers() { std::vector sq(2); @@ -85,12 +116,87 @@ void TestRejectsMismatchedArrays() CHECK_EQ(ret, TileXR::TILEXR_UDMA_LAYOUT_INVALID); } +void TestTransportUsesPerPeerQueues() +{ + const std::string transport = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/comm/udma/tilexr_udma_transport.cpp"); + CHECK_CONTAINS(transport, "struct PeerQueueState"); + CHECK_CONTAINS(transport, "std::map peerQueues"); + CHECK_CONTAINS(transport, "CreatePeerQueue("); + CHECK_CONTAINS(transport, "state.peerQueues[peer]"); + CHECK_CONTAINS(transport, "localPeerImports[peer].in.key"); + CHECK_CONTAINS(transport, "allImports[(peer * options_.rankSize + options_.rank)"); + CHECK_CONTAINS(transport, "queue.localWq"); + CHECK_CONTAINS(transport, "queue.localCq"); + CHECK_CONTAINS(transport, "queue.tpn"); + CHECK_NOT_CONTAINS(transport, "void* qpHandle = nullptr;\n CqInfoT cqInfo"); +} + +void TestRootInfoEidBytesSelectRuntimeContexts() +{ + const std::string transport = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/comm/udma/tilexr_udma_transport.cpp"); + + CHECK_CONTAINS(transport, "ReadTextFile(\"/etc/hccl_rootinfo.json\")"); + CHECK_CONTAINS(transport, "root.eidByLocalId[localId][eidIndex] = eid"); + CHECK_CONTAINS(transport, "root.portToEidByLocalId[localId][port] = eidIndex"); + CHECK_CONTAINS(transport, "localEidByEid_ = localEids->second"); + CHECK_CONTAINS(transport, "auto targetEidIt = localEidByEid_.find(eidIndex)"); + CHECK_CONTAINS(transport, + "matched = std::memcmp(infoList[i].eid.raw, targetEidIt->second.raw, sizeof(infoList[i].eid.raw)) == 0"); + CHECK_CONTAINS(transport, "attr.ub.eidIndex = infoList[i].eidIndex"); + CHECK_CONTAINS(transport, "ctxHandleByEid_[eidIndex] = ctxHandle"); +} + +void TestMemoryRegistrationUsesOfficialUbFlags() +{ + const std::string transport = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/comm/udma/tilexr_udma_transport.cpp"); + CHECK_CONTAINS(transport, "mrInfo.in.ub.flags.bs.cacheable = 0"); + CHECK_CONTAINS(transport, "mrInfo.in.ub.flags.bs.nonPin = 0"); + CHECK_CONTAINS(transport, "mrInfo.in.ub.flags.bs.userIova = 0"); + CHECK_CONTAINS(transport, "mrInfo.in.ub.flags.bs.tokenIdValid = 1"); +} + +void TestDeviceSgeUsesPerPeerLocalTokenId() +{ + const std::string types = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/include/tilexr_udma_types.h"); + const std::string device = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/include/tilexr_udma.h"); + const std::string transport = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/comm/udma/tilexr_udma_transport.cpp"); + + CHECK_CONTAINS(types, "uint32_t localTokenId"); + CHECK_CONTAINS(device, "sgeCtx->tokenId = qpCtxEntry->localTokenId"); + CHECK_NOT_CONTAINS(device, "sgeCtx->tokenId = 0;"); + CHECK_CONTAINS(transport, "queue.localWq.localTokenId = localMrIt->second.tokenId"); +} + +void TestDeviceSqeInitializesOfficialFields() +{ + const std::string device = + ReadFile(std::string(TILEXR_SOURCE_ROOT) + "/src/include/tilexr_udma.h"); + + CHECK_CONTAINS(device, "sqeCtx->sqeBbIdx = curHead % depth"); + CHECK_CONTAINS(device, "sqeCtx->rsv0 = 0"); + CHECK_CONTAINS(device, "sqeCtx->rsv1 = 0"); + CHECK_CONTAINS(device, "sqeCtx->rsv2 = 0"); + CHECK_CONTAINS(device, "sqeCtx->rsv3 = 0"); + CHECK_CONTAINS(device, "UDMAFillSqeCtx(sqeCtx, remoteAddr, remoteMemInfo, curHead, qpCtxEntry->depth"); +} + } // namespace int main() { TestHostLayoutUsesDeviceRelativePointers(); TestRejectsMismatchedArrays(); + TestTransportUsesPerPeerQueues(); + TestRootInfoEidBytesSelectRuntimeContexts(); + TestMemoryRegistrationUsesOfficialUbFlags(); + TestDeviceSgeUsesPerPeerLocalTokenId(); + TestDeviceSqeInitializesOfficialFields(); if (g_failures != 0) { std::cerr << g_failures << " UDMA transport layout checks failed" << std::endl; return 1;