diff --git a/.gitignore b/.gitignore
index 1c01658..2f30b5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,3 +87,9 @@ comm/Makefile
 comm/cmake_install.cmake
 install_manifest.txt
 patches/
+.tmp
+tests/udma/logs
+*.diff
+*.bz2
+tmp
+.kilo
\ No newline at end of file
diff --git a/docs/TileXR_UDMA_P2P_PERF_GUIDE.md b/docs/TileXR_UDMA_P2P_PERF_GUIDE.md
new file mode 100644
index 0000000..3b4d24e
--- /dev/null
+++ b/docs/TileXR_UDMA_P2P_PERF_GUIDE.md
@@ -0,0 +1,596 @@
+# TileXR UDMA 2-Card P2P Performance Test Guide
+
+## Background
+
+This guide designs a 2-card single-pair P2P performance test for the TileXR
+UDMA path, using commit `378724c583c417ab2388a3c5354bfdb0e57dc589` as the
+reference baseline. That commit already contains the UDMA demo infrastructure
+under `tests/udma/demo`, including local rank launch, TileXR communicator
+initialization, registered device memory, UDMA kernel launches, and per-rank logs.
+
+The P2P test should reuse that infrastructure and add a focused perf mode for
+two ranks only.
+
+## Goal
+
+Measure one directed UDMA P2P transfer at a time on 2 cards:
+
+- `rank0 -> rank1`
+- `rank1 -> rank0`
+
+The final output should include:
+
+- correctness status for each transfer size and direction;
+- latency in microseconds;
+- effective bandwidth in GB/s;
+- a CSV result file;
+- a bandwidth curve comparing `0->1` and `1->0`.
+
+## Test Model
+
+Use `rank_size=2`. Each run launches two local ranks, one rank per NPU.
+
+The P2P perf mode supports these user-facing transport modes:
+
+- `direct_urma`: registered-memory UDMA transfer. Internally this path uses
+  the parallel multi-jetty kernel; `block_dim=1` with one QP is the single-jetty
+  baseline, while `block_dim=N` with `TILEXR_UDMA_QP_NUM=N` uses up to `N`
+  QPs/jettys in parallel.
+- `memory`: peer-memory IPC comparison using Ascend C `DataCopyPad`.
+- `memory_consume`: peer-memory IPC comparison with a separate outer sync flag
+  and a receiver-side copy from the IPC window into the destination buffer.
+- `data_as_flag`: peer-memory IPC comparison where each 512B block carries
+  480B payload plus a 32B ready flag.
+- `data_as_flag_epoch_ordered`: data-as-flag comparison with per-batch commit
+  epochs to avoid cross-iteration ready flag reuse.
+
+The diagnostic transports `memory_segmented` and `memory_segmented_rotate`
+are intended for large-message root-cause experiments. They keep `block_dim=1`
+but split the `memory` copy helper into 16 MiB segments; the rotate variant
+keeps writing inside one 16 MiB destination span and skips payload validation.
+The trace variants `memory_segmented_trace` and
+`memory_segmented_rotate_trace` use 8 MiB segments and record block-0
+per-segment cycle sums in the debug buffer. Enable
+`TILEXR_P2P_DEBUG_SUMMARY=1` to print `traceSegments` and `seg0Cycles` through
+`seg7Cycles`.
+
+For `direct_urma`, for a selected direction:
+
+- the sender rank posts one UDMA write to the receiver's registered memory;
+- the sender calls `UDMAQuietStatus` and records the completion status;
+- the receiver does not send data in that measurement round;
+- after stream synchronization and a host barrier, the receiver copies its
+  destination buffer back and validates the expected byte pattern.
+
+For `memory`, for the same selected direction:
+
+- the sender rank launches `tilexr_memory_p2p_perf_kernel`;
+- only the sender rank performs the device-side copy;
+- the destination address is its peer IPC data window on the receiver;
+- the receiver validates by copying its local IPC data window back to host.
+
+For `data_as_flag`, for the same selected direction:
+
+- the sender rank launches `tilexr_data_as_flag_p2p_perf_kernel`;
+- each 512B block carries 480B payload plus a 32B ready flag in the receiver's
+  peer IPC memory window;
+- the receiver validates by copying its local IPC data window back to host.
+
+Run both directions independently so each result row has an unambiguous source
+and destination. For current comparison work, `0->1` is usually enough because
+`0->1` and `1->0` were observed to be effectively identical on the tested
+2-card setup.
+
+Important scope notes:
+
+- `direct_urma` measures the TileXR registered-memory URMA/UDMA path.
+- `memory` measures peer-memory IPC semantics implemented with AIV
+  `DataCopyPad`. It is a useful baseline, not the same hardware data path as
+  UDMA queue based direct URMA.
+- `memory` and `data_as_flag` are limited by the TileXR IPC data window. The
+  current wrapper rejects sizes above 100 MiB.
+- The current IPC comparison kernels use one AIV block in the runner. Large
+  messages therefore reflect single-block GM->UB->peer-GM copy throughput.
+
+## Metrics
+
+Recommended CSV fields:
+
+```text
+direction,src,dst,ranks,bytes,iters,avg_us,min_us,max_us,bw_GBps,status,errors,log_dir
+```
+
+Definitions:
+
+- `direction`: `0to1` or `1to0`.
+- `bytes`: transfer bytes per measured UDMA write.
+- `avg_us`: average measured time per transfer, excluding warmup.
+- `min_us` and `max_us`: per-transfer minimum and maximum from measured
+  iterations.
+- `bw_GBps`: `bytes / avg_us / 1000`.
+- `status`: UDMA completion status from `UDMAQuietStatus`; `0` means success.
+- `errors`: validation mismatch count on the receiver side.
+
+Do not include communicator initialization, memory registration, H2D/D2H copies,
+or TCP barriers in the measured interval.
+
+For both transports, timing is taken on the source rank and shared through
+per-rank status files before rank 0 writes the CSV row. This avoids reporting
+receiver-side empty-kernel timing for the reverse direction.
+
+## Recommended Sweep
+
+Use this default sweep first:
+
+```text
+rank_size     = 2
+npu_count     = 2
+first_npu     = 0
+directions    = 0->1, 1->0
+min_bytes     = 4096
+max_bytes     = 268435456
+step_factor   = 2
+warmup_iters  = 10
+iters         = 100
+check         = 1
+```
+
+If large messages are unstable during bring-up, start with:
+
+```text
+min_bytes     = 4096
+max_bytes     = 16777216
+warmup_iters  = 5
+iters         = 20
+```
+
+Then expand the range after both directions pass correctness.
+
+## Code Changes
+
+### 1. Add A P2P Perf Test Type
+
+Extend `tests/udma/demo/tilexr_udma_demo.cpp` and
+`tests/udma/demo/tilexr_udma_demo_kernel.cpp` with a new mode:
+
+```text
+test_type=4: 2-card directed UDMA P2P performance test
+```
+
+Suggested host options:
+
+```text
+--src-rank <0|1>
+--dst-rank <0|1>
+--min-bytes <bytes>
+--max-bytes <bytes>
+--step-factor <N>
+--warmup-iters <N>
+--iters <N>
+--csv <path>
+--check <0|1>
+```
+
+For compatibility with the existing positional demo style, a wrapper script can
+translate simple positional arguments into these options.
+
+### 2. Add A Device Kernel
+
+Add a kernel shaped like this:
+
+```cpp
+extern "C" __global__ __aicore__ void tilexr_udma_p2p_perf_kernel(
+    GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR debugGM,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern)
+{
+    auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM);
+    auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM);
+    auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM);
+
+    const int32_t rank = args->rank;
+    if (rank != srcRank) {
+        return;
+    }
+
+    TileXR::UDMAPutNbi<uint8_t>(args, dstRank, src, dstByteOffset, bytes);
+    uint32_t status = TileXR::UDMAQuietStatus(args, dstRank);
+    if (debug != nullptr) {
+        debug[0] = status;
+        debug[1] = bytes;
+        debug[2] = pattern;
+    }
+}
+```
+
+The source buffer should already contain the host-generated pattern before the
+measured launch. The receiver validates the destination buffer after the measured
+iteration batch.
+
+### 3. Allocate Registered Buffers
+
+For each rank, allocate one registered memory region with:
+
+```text
+src buffer
+dst buffer
+debug/status buffer
+```
+
+Round the total registered bytes to the existing 2 MiB UDMA registration
+alignment used by the demo.
+
+The remote destination offset passed to the kernel is the byte offset of the
+receiver's `dst buffer` inside that registered region.
+
+### 4. Measure The Transfer
+
+Use one stream and event timing around each measured kernel launch batch:
+
+```text
+warmup:
+  launch kernel
+  synchronize stream
+
+measured:
+  record start event
+  repeat iters:
+    launch kernel
+  record stop event
+  synchronize stop event
+```
+
+Compute:
+
+```text
+avg_us = elapsed_ms * 1000 / iters
+bw_GBps = bytes / avg_us / 1000
+```
+
+If per-iteration min and max are required, record events around each measured
+iteration. For initial bring-up, average timing is enough; min and max can be
+added once correctness is stable.
+
+### 5. Write CSV
+
+Only rank 0 should create or append to the CSV. For direction `1->0`, rank 0 is
+the receiver, so rank 0 can still write the result after copying back debug and
+validation state.
+
+Each size and direction emits one row.
+
+Example:
+
+```csv
+direction,src,dst,ranks,bytes,iters,avg_us,min_us,max_us,bw_GBps,status,errors,log_dir
+0to1,0,1,2,4096,100,8.31,0,0,0.493,0,0,logs/tilexr_udma_p2p_perf_20260622_120000
+1to0,1,0,2,4096,100,8.45,0,0,0.485,0,0,logs/tilexr_udma_p2p_perf_20260622_120000
+```
+
+When min and max are not implemented yet, write `0` for those fields and keep
+the columns stable.
+
+## Runner Script
+
+Add a script:
+
+```text
+tests/udma/demo/run_tilexr_udma_p2p_perf.sh
+```
+
+Suggested usage:
+
+```bash
+bash demo/run_tilexr_udma_p2p_perf.sh \
+  <src_rank> <dst_rank> <min_bytes> <max_bytes> <step_factor> \
+  <iters> <warmup_iters> <first_npu> <check> <transport>
+```
+
+Example:
+
+```bash
+cd /path/to/TileXR/tests/udma
+bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 direct_urma
+bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 memory
+bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 data_as_flag
+```
+
+Arguments:
+
+- `src_rank`, `dst_rank`: directed transfer, normally `0 1` or `1 0`.
+- `min_bytes`, `max_bytes`, `step_factor`: byte sweep definition.
+- `iters`, `warmup_iters`: measured and warmup launch counts.
+- `first_npu`: first physical NPU id used by local rank 0. For physical cards
+  2 and 3, pass `2`.
+- `check`: `1` validates destination bytes after each size.
+- `transport`: `direct_urma`, `memory`, or `data_as_flag`; default is `direct_urma`.
+
+The script should:
+
+- source `scripts/common_env.sh`;
+- set `TILEXR_COMM_ID` if it is not already set;
+- set `TILEXR_DEMO_NPUS=2`;
+- set `TILEXR_DEMO_FIRST_NPU`;
+- launch exactly two ranks;
+- write per-rank logs under
+  `tests/udma/logs/tilexr_udma_p2p_perf_*_<transport>_<direction>`;
+- write CSV under the same log directory.
+
+## Build
+
+Build TileXR core first:
+
+```bash
+cd /path/to/TileXR
+source scripts/common_env.sh
+mkdir -p build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=../install ..
+make -j$(nproc)
+make install
+```
+
+Build UDMA tests and demo:
+
+```bash
+cd /path/to/TileXR/tests/udma
+bash build.sh
+```
+
+Expected binary:
+
+```text
+tests/udma/install/bin/tilexr_udma_demo
+```
+
+## Run
+
+### Run direct URMA
+
+Run a short correctness-oriented sweep first:
+
+```bash
+cd /path/to/TileXR/tests/udma
+bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 16777216 2 20 5 2 1 direct_urma
+```
+
+Then run the larger sweep. On the tested `141.62.19.144` environment, 64 MiB
+was used as the stable upper bound because 128 MiB and above failed during
+UDMA registration due to the registered region size.
+
+```bash
+bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 direct_urma
+```
+
+### Run memory semantics
+
+Use the same sweep and only change the transport:
+
+```bash
+cd /path/to/TileXR/tests/udma
+bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 memory
+```
+
+For diagnosis around the observed large-message knee, run single-size points:
+
+```bash
+for bytes in 33554432 41943040 50331648 58720256 62914560 67108864 75497472 83886080 100663296; do
+  TILEXR_COMM_ID=127.0.0.1:$((12000 + bytes / 1048576)) \
+    bash demo/run_tilexr_udma_p2p_perf.sh 0 1 ${bytes} ${bytes} 2 50 5 2 1 memory
+done
+```
+
+### Run data-as-flag semantics
+
+Use the same sweep and only change the transport:
+
+```bash
+cd /path/to/TileXR/tests/udma
+bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 67108864 2 100 10 2 1 data_as_flag
+```
+
+Success criteria:
+
+- both rank processes exit with code 0;
+- every row has `status=0`;
+- every row has `errors=0`;
+- for `direct_urma`, per-rank logs include UDMA enabled in `CommArgs`;
+- for `memory` and `data_as_flag`, per-rank logs show non-null `peerMems[]`
+  for the tested ranks;
+- no rank log contains `MISMATCH`, `TileXR UDMA demo failed`, or `ERROR`.
+
+## Plot The Curve
+
+Use the plotting helper:
+
+```text
+tests/udma/demo/plot_tilexr_udma_p2p_perf.py
+```
+
+For the old direction comparison:
+
+```bash
+python3 demo/plot_tilexr_udma_p2p_perf.py \
+  logs/tilexr_udma_p2p_perf_*/p2p_perf.csv \
+  --output logs/tilexr_udma_p2p_perf_curve.png
+```
+
+For a transport comparison, pass both CSVs and label the
+series explicitly:
+
+```bash
+python3 demo/plot_tilexr_udma_p2p_perf.py \
+  logs/tilexr_udma_p2p_perf_20260622_173102_direct_urma_0to1/p2p_perf.csv \
+  logs/tilexr_udma_p2p_perf_20260622_173138_memory_0to1/p2p_perf.csv \
+  --direction 0to1 \
+  --labels direct_urma,memory \
+  --output logs/tilexr_udma_vs_memory_0to1_bandwidth.png \
+  --latency-output logs/tilexr_udma_vs_memory_0to1_latency.png \
+  --latency-max-bytes 1048576
+```
+
+The script generates:
+
+- bandwidth curve: all rows in the selected CSVs;
+- latency curve: rows up to `--latency-max-bytes`, default 1 MiB.
+
+The plot uses:
+
+- x-axis: `bytes`, log scale;
+- x tick labels: human-readable `KB`, `MB`, or `GB`;
+- y-axis: `bw_GBps` for bandwidth or `avg_us` for latency;
+- one line per direction or per explicit label.
+
+Use labels that match the compared transport set, for example
+`direct_urma,memory` or `direct_urma,data_as_flag`.
+
+If multiple CSV files are passed, merge rows by direction and bytes before
+plotting. When `--labels` is used, merge rows by label and bytes instead, which
+allows two `0to1` CSV files to appear as separate series.
+
+### Local CSV and plot workflow
+
+When the test runs on a remote Ascend server, pull only the CSVs to the local
+workspace and plot locally:
+
+```powershell
+$out = "D:\workspace\TileXR\tests\udma\logs\p2p_compare_20260622_173138_64MiB"
+New-Item -ItemType Directory -Force -Path $out | Out-Null
+
+scp -i C:\Users\h30059441\.ssh\id_ed25519 `
+  root@141.62.19.144:/home/h30059441/TileXR/tests/udma/logs/tilexr_udma_p2p_perf_20260622_173102_direct_urma_0to1/p2p_perf.csv `
+  "$out\p2p_perf_direct_urma_0to1.csv"
+
+scp -i C:\Users\h30059441\.ssh\id_ed25519 `
+  root@141.62.19.144:/home/h30059441/TileXR/tests/udma/logs/tilexr_udma_p2p_perf_20260622_173138_memory_0to1/p2p_perf.csv `
+  "$out\p2p_perf_memory_0to1.csv"
+
+python tests\udma\demo\plot_tilexr_udma_p2p_perf.py `
+  "$out\p2p_perf_direct_urma_0to1.csv" `
+  "$out\p2p_perf_memory_0to1.csv" `
+  --direction 0to1 `
+  --labels direct_urma,memory `
+  --output "$out\tilexr_udma_vs_memory_0to1_bandwidth.png" `
+  --latency-output "$out\tilexr_udma_vs_memory_0to1_latency.png" `
+  --latency-max-bytes 1048576
+```
+
+Example output files:
+
+```text
+tests/udma/logs/p2p_compare_20260622_173138_64MiB/p2p_perf_direct_urma_0to1.csv
+tests/udma/logs/p2p_compare_20260622_173138_64MiB/p2p_perf_memory_0to1.csv
+tests/udma/logs/p2p_compare_20260622_173138_64MiB/tilexr_udma_vs_memory_0to1_bandwidth.png
+tests/udma/logs/p2p_compare_20260622_173138_64MiB/tilexr_udma_vs_memory_0to1_latency.png
+```
+
+## Result Review
+
+Check the CSV first:
+
+```bash
+column -s, -t logs/tilexr_udma_p2p_perf_*/p2p_perf.csv | less -S
+```
+
+Look for:
+
+- `status != 0`: UDMA completion failure or CQ polling issue;
+- `errors != 0`: receiver data mismatch;
+- sharp bandwidth drop at a specific size: possible registration range,
+  alignment, CQ depth, or timeout issue;
+- large directional difference between `0->1` and `1->0`: possible topology or
+  route asymmetry.
+- `memory` bandwidth decreasing after 32 MiB: this is not necessarily a data
+  error. On the tested setup, single-size memory runs showed a gradual drop
+  from about 48 GB/s at 32 MiB to about 39 GB/s at 64 MiB and above. That shape
+  is consistent with the current single-AIV-block `DataCopyPad` IPC-window
+  baseline entering sustained GM->UB->peer-GM throughput, not with a 64 MiB
+  correctness boundary.
+- `data_as_flag` differs from `memory` because it spends part of each 512B block
+  on the embedded ready flag; compare it as a separate IPC baseline.
+
+Keep the raw logs with the CSV and curve. They are needed to confirm UDMA was
+enabled and to inspect `CommArgs`, registered memory offsets, and debug words.
+
+## Example Result From 141.62.19.144
+
+Environment:
+
+```text
+CANN: /usr/local/Ascend/ascend-toolkit
+MPI:  /usr/local/mpich
+NPUs: physical 2 and 3
+Direction: 0->1
+Sweep: 4096 -> 67108864, step_factor=2, iters=100, warmup=10
+```
+
+Selected rows:
+
+```text
+transport    bytes      avg_us    bw_GBps  status  errors
+direct_urma  4096       6.504     0.630    0       0
+direct_urma  1048576    26.040    40.267   0       0
+direct_urma  16777216   322.074   52.091   0       0
+direct_urma  67108864   1269.934  52.844   0       0
+memory       4096       4.059     1.009    0       0
+memory       1048576    23.484    44.651   0       0
+memory       16777216   334.924   50.093   0       0
+memory       67108864   1691.058  39.685   0       0
+```
+
+Memory-only diagnostic points:
+
+```text
+32MiB  48.16 GB/s
+40MiB  45.54 GB/s
+48MiB  42.09 GB/s
+56MiB  40.59 GB/s
+60MiB  40.16 GB/s
+64MiB  39.56 GB/s
+72MiB  39.21 GB/s
+80MiB  39.15 GB/s
+96MiB  39.10 GB/s
+```
+
+Interpretation:
+
+- direct URMA reaches about 52.8 GB/s at 64 MiB on this pair;
+- memory semantics is competitive for small and mid-sized messages, but the
+  current single-block DataCopyPad baseline settles near 39-40 GB/s for larger
+  messages;
+- both paths passed payload validation with `status=0` and `errors=0`.
+
+For the specific `memory block_dim=1` drop after 16 MiB, first compare
+`memory`, `memory_segmented`, and `memory_segmented_rotate` at `block_dim=1`,
+then repeat the rotate run at `block_dim=2` or higher. If splitting into
+16 MiB helper calls and rotating inside a 16 MiB destination window still
+follows `memory`, then helper-call size, destination address span, and
+host-side sweep shape are unlikely to be the root cause. Then run:
+
+```bash
+TILEXR_P2P_DEBUG_SUMMARY=1 bash demo/run_tilexr_udma_p2p_concurrency_sweep.sh \
+  16777216 67108864 2 20 5 0 1 \
+  memory_segmented_trace,memory_segmented_rotate_trace \
+  unidir 1,2
+```
+
+Interpret the trace as follows:
+
+- `block_dim=1` later 8 MiB segments get progressively more expensive while
+  `block_dim=2` stays flat: sustained single-stream peer IPC writes are
+  accumulating credit/backpressure or remote write-queue drain delay.
+- every `block_dim=1` segment is similarly slower than `block_dim=2`: the
+  bottleneck is a fixed single-stream bandwidth limit rather than segment-local
+  accumulation.
+- rotate trace follows normal trace: destination window span is still excluded.
+- rotate trace diverges sharply: re-check address mapping, aliasing, or overwrite
+  effects before using the trace as a bandwidth conclusion.
+
+## Minimal Acceptance Checklist
+
+- `test_type=4` exists and is documented in the UDMA demo README.
+- `run_tilexr_udma_p2p_perf.sh` can run `0->1` and `1->0` with `rank_size=2`.
+- CSV rows are produced for every requested size.
+- The receiver validates the copied payload for every measured size.
+- The plot helper generates a two-line bandwidth curve.
+- A short sweep and full sweep both pass with `status=0` and `errors=0`.
diff --git a/docs/UDMA_P2P_TRANSPORT_COMPARISON.md b/docs/UDMA_P2P_TRANSPORT_COMPARISON.md
new file mode 100644
index 0000000..2efe838
--- /dev/null
+++ b/docs/UDMA_P2P_TRANSPORT_COMPARISON.md
@@ -0,0 +1,219 @@
+# UDMA P2P 传输方式对比
+
+**主题：** `memory_consume` 与 `data_as_flag_epoch_ordered`
+
+## 摘要
+
+`data_as_flag_epoch_ordered` 已经消除了最严重的按块写 flag 瓶颈，但其 payload 路径仍比 `memory_consume` 重得多。在相同的有效 payload 带宽统计口径下，`~10 GB/s` 对 `~50 GB/s` 这样的结果与当前实现是相符的。
+
+总体来说：
+
+```text
+memory_consume:
+  连续 payload 传输 + 独立同步 flag
+
+data_as_flag_epoch_ordered:
+  data-as-flag 协议校验，提交信息内嵌
+  在 480B payload + 32B flag/gap 的块布局中
+```
+
+`memory_consume` 更接近高效的连续 payload 传输。`data_as_flag_epoch_ordered` 更接近一条协议路径——通过在数据窗口中内嵌就绪信息来证明数据已就绪。
+
+## `memory_consume` 数据路径
+
+`memory_consume` kernel 保持 payload 窗口连续：
+
+```text
+发送端:
+  连续 src payload
+    -> 对端 IPC 数据窗口
+  设置独立的外部同步 flag
+
+接收端:
+  等待独立的外部同步 flag
+  连续的本地对端 IPC 数据窗口
+    -> 连续的 dst payload
+```
+
+在代码中，发送端从 `srcGM + offset` 拷贝到 `peerBase + dstByteOffset + offset`，然后调用 `sync.SetOuterFlag(magic, step)`。接收端用 `sync.WaitOuterFlag(...)` 等待，再从 `localBase + dstByteOffset + offset` 拷贝到 `dst + offset`。
+
+两端的 payload 布局都保持连续。虽然 helper 内部仍会经过 UB 中转，但拷贝的 payload 是简单的连续 GM 到 GM 流。对于大消息，独立同步 flag 的开销会被大 payload 摊薄，因此这条路径可以达到高得多的带宽。
+
+相关文件：
+
+```text
+tests/udma/demo/tilexr_udma_demo_kernel.cpp
+  tilexr_memory_consume_p2p_perf_kernel
+  TileXRUdmaDemoCopyBytesGmToGm
+```
+
+## `data_as_flag_epoch_ordered` 数据路径
+
+`data_as_flag_epoch_ordered` kernel 使用 data-as-flag 块布局：
+
+```text
+512B 块:
+  480B payload
+   32B flag/gap
+```
+
+发送端路径：
+
+```text
+连续 src payload
+  -> UB 暂存
+  -> 打包成 512B 块:
+       480B payload + 32B flag/gap
+  -> 对端 data-as-flag IPC 窗口
+  -> 在该 batch 最后一个块的 flag 区写入 32B batch commit flag
+```
+
+接收端路径：
+
+```text
+轮询 batch commit flag
+  -> 从 512B data-as-flag 窗口读取
+  -> 解包/提取 480B payload 区域
+  -> 写入连续的 dst payload
+```
+
+epoch-ordered 变体相比旧的按块 data-as-flag 路径有所改进：每个 batch 只写一次 commit flag，而不是每个 480B payload 块写一次。这消除了破坏性最大的小写瓶颈，但该路径仍然存在额外的布局转换、轮询，以及每个 batch 一次的小 commit 写入。
+
+相关文件：
+
+```text
+src/include/tilexr_data_as_flag.h
+  DATA_AS_FLAG_BLOCK_BYTES = 512
+  DATA_AS_FLAG_PAYLOAD_BYTES = 480
+  DATA_AS_FLAG_FLAG_BYTES = 32
+  DataAsFlagSendEpochOrdered
+  DataAsFlagWriteBatchCommitFlag
+  DataAsFlagCheckAndRecvEpochOrdered
+  DataAsFlagCopyBatchToRecvGM
+
+tests/udma/demo/tilexr_udma_demo_kernel.cpp
+  tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel
+```
+
+## 差距的主要来源
+
+### 1. 布局开销
+
+`data_as_flag_epoch_ordered` 把每个 `480B` payload 存进一个 `512B` 块：
+
+```text
+512 / 480 = 1.067x
+```
+
+因此在考虑任何控制或解包开销之前，数据窗口就至少有约 `6.7%` 的额外空间和传输开销。
+
+该开销在 host 侧窗口计算中可见：
+
+```text
+DataAsFlagWindowBytes(payloadBytes)
+  = ceil(payloadBytes / 480) * 512
+```
+
+但带宽报告使用的是有效 payload 字节数，而不是展开后的 data-as-flag 窗口字节数。因此额外的块布局字节会拉低报告的有效带宽。
+
+相关文件：
+
+```text
+tests/udma/demo/tilexr_udma_p2p_perf_config.h
+  DataAsFlagWindowBytes
+  P2PEffectiveTransferBytes
+  FormatP2PPerfCsvRow
+```
+
+### 2. 经过 UB 的打包与解包
+
+`memory_consume` 是从连续 payload 拷贝到连续 payload。
+
+`data_as_flag_epoch_ordered` 必须对 payload 做重塑：
+
+```text
+发送端:
+  连续 src
+    -> UB
+    -> 512B 块中的 480B payload 分块
+    -> 对端 GM
+
+接收端:
+  512B 块/gap 窗口
+    -> UB
+    -> 提取 480B payload 分块
+    -> 连续 dst
+```
+
+这增加了连续 `memory_consume` 路径所不需要的额外 MTE 操作、barrier 以及跨步拷贝行为。
+
+### 3. Batch Commit 小写入
+
+epoch-ordered 路径不再为每个 `480B` payload 块写一次 flag。而是在一个 batch 的 payload 写完后，向该 batch 最后一个块的 flag 区写入一个 `32B` commit flag：
+
+```text
+payload batch 写入:
+  batchBlocks * 512B
+
+batch commit 写入:
+  32B commit flag
+```
+
+这就是“batch commit 小写入”。
+
+它比按块写 flag 好得多，但仍存在固定开销：
+
+```text
+准备 32B epoch flag
+S -> MTE3 同步
+MTE3 向 GM/对端窗口写入 32B commit flag
+MTE3 -> S 同步
+接收端轮询并读取 commit flag
+```
+
+对于大 batch，该开销会被摊薄，但并非免费。结合打包/解包和 512B 块布局，它仍会限制最终带宽。
+
+## 为什么 `0.7 GB/s` 能恢复到 `~10 GB/s`
+
+旧的 data-as-flag 路径实际上以极高频率付出小 flag 写入/检查开销——大约每个 `480B` payload 块一次。这使小写入和轮询主导了 payload 传输。
+
+epoch-ordered 路径将其降为每个 batch 一次 commit 写入：
+
+```text
+旧路径:
+  每个 480B payload 块都有 ready/flag 开销
+
+epoch-ordered:
+  每个 batch 只有一个最终的 32B commit flag
+```
+
+这就解释了为什么性能可以显著恢复，例如从约 `0.7 GB/s` 恢复到约 `10 GB/s`。
+
+但它并未把该路径变成连续 payload 传输。该协议仍要为 data-as-flag 布局付出代价，接收端仍需等待，再从内嵌布局窗口中解包。
+
+## 最终解读
+
+`memory_consume`：
+
+```text
+用独立的同步区来证明 payload 就绪。
+保持 payload 数据窗口连续。
+针对大块连续 payload 搬移做优化。
+```
+
+`data_as_flag_epoch_ordered`：
+
+```text
+用数据窗口内嵌的 commit flag 来证明 payload 就绪。
+以 480B + 32B 的 data-as-flag 块存放 payload。
+付出打包/解包、轮询和 batch commit 开销。
+```
+
+因此在当前实现和相同有效 payload 带宽统计口径下，出现如下差距：
+
+```text
+data_as_flag_epoch_ordered: ~10 GB/s
+memory_consume:            ~50 GB/s
+```
+
+是符合预期的。这并不一定说明 epoch-ordered 的修复失败；它说明该修复消除了最严重的按块 flag 瓶颈，而剩余的 data-as-flag 协议路径仍比带外部同步的连续 payload 传输重得多。
diff --git a/docs/superpowers/plans/2026-06-24-p2p-transport-trim.md b/docs/superpowers/plans/2026-06-24-p2p-transport-trim.md
new file mode 100644
index 0000000..fa998b4
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-24-p2p-transport-trim.md
@@ -0,0 +1,650 @@
+# P2P Transport Trim Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Trim the UDMA P2P performance demo to expose only `direct_urma`, `memory`, and `data_as_flag`, with `direct_urma` using the current parallel multi-jetty implementation internally.
+
+**Architecture:** Keep one UDMA benchmark path and two peer-memory comparison paths. The public transport enum/parser/CSV/docs will only know the three user-facing names, while the `direct_urma` host launcher calls the existing parallel multi-jetty kernel. Remove obsolete single-WQE, multi-WQE, serial multi-jetty, and fixed-WQE benchmark paths.
+
+**Tech Stack:** C++14, Ascend C kernels, Bash demo runners, host-only C++ unit tests, GitHub PR docs.
+
+## Global Constraints
+
+- User-facing transport names must be exactly `direct_urma`, `memory`, and `data_as_flag`. 
+- `direct_urma` must use the current parallel multi-jetty implementation internally.
+- `direct_urma` with `block_dim=1` and one QP must behave like the previous single-QP direct URMA path.
+- `direct_urma` with `block_dim=N` and `TILEXR_UDMA_QP_NUM=N` must use up to `N` QPs/jettys in parallel.
+- Keep QP-indexed device helpers and UDMA layout support.
+- Remove fixed-WQE-only window sizing and mismatch-check logic.
+- Do not touch unrelated untracked files such as `.kilo/`.
+- Source repository guidance from `CLAUDE.md` remains in effect.
+
+---
+
+## File Structure
+
+- `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`: host-only behavior tests for supported transports, rejected aliases, CSV formatting, window sizing, and aggregation.
+- `tests/udma/demo/tilexr_udma_p2p_perf_config.h`: transport enum, parsing, names, validation, window sizing, effective byte accounting, mismatch checking, CSV row formatting, and row aggregation.
+- `tests/udma/demo/tilexr_udma_demo.cpp`: host launch declarations and transport dispatch. This should dispatch `P2PTransport::DirectUrma` to the parallel multi-jetty launcher.
+- `tests/udma/demo/tilexr_udma_demo_kernel.cpp`: Ascend C benchmark kernels and launch wrappers. Keep the parallel multi-jetty kernel, rename its launcher to the public `launch_tilexr_udma_p2p_perf`, and remove obsolete benchmark kernels/wrappers.
+- `tests/udma/demo/run_tilexr_udma_p2p_perf.sh`: runner argument handling and `TILEXR_UDMA_QP_NUM` setup.
+- `tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh`: sweep defaults, already expected to remain `direct_urma,memory,data_as_flag`.
+- `docs/TileXR_UDMA_P2P_PERF_GUIDE.md` and `tests/udma/demo/README.md`: user-facing documentation.
+
+---
+
+### Task 1: Update Config Tests First
+
+**Files:**
+- Modify: `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`
+- Test: `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`
+
+**Interfaces:**
+- Consumes: current `TileXR::Demo::P2PTransport`, `ParseP2PTransport`, `P2PTransportName`, `P2PTransportWindowBytes`, `P2PEffectiveTransferBytes`, `CountP2PTransportMismatches`, `FormatP2PPerfCsvRow`, `ValidateP2PPerfOptions`
+- Produces: expected behavior that later implementation must satisfy:
+  - only `DirectUrma`, `Memory`, `DataAsFlag`, and `Invalid` transport enum values are referenced by tests
+  - obsolete transport strings parse to `Invalid`
+  - `direct_urma` CSV rows keep the name `direct_urma`
+  - direct URMA effective bytes are the payload bytes, not `payload * blockDim`
+
+- [ ] **Step 1: Replace transport name assertions**
+
+Replace the block after `DirectionName` so it only checks the three supported names:
+
+```cpp
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DirectUrma) == "direct_urma",
+        "direct_urma transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::Memory) == "memory",
+        "memory transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DataAsFlag) == "data_as_flag",
+        "data_as_flag transport name mismatch");
+```
+
+- [ ] **Step 2: Replace parse assertions**
+
+Keep the positive parse checks for `direct_urma`, `memory`, and `data_as_flag`, and add explicit rejection checks:
+
+```cpp
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma") == TileXR::Demo::P2PTransport::DirectUrma,
+        "direct_urma transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("udma") == TileXR::Demo::P2PTransport::DirectUrma,
+        "udma alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory") == TileXR::Demo::P2PTransport::Memory,
+        "memory transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("data_as_flag") == TileXR::Demo::P2PTransport::DataAsFlag,
+        "data_as_flag transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_wqe") == TileXR::Demo::P2PTransport::Invalid,
+        "direct_urma_multi_wqe must be rejected");
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty") == TileXR::Demo::P2PTransport::Invalid,
+        "direct_urma_multi_jetty must be rejected");
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty_parallel") == TileXR::Demo::P2PTransport::Invalid,
+        "direct_urma_multi_jetty_parallel must be rejected");
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty_parallel_fixed_wqe") ==
+            TileXR::Demo::P2PTransport::Invalid,
+        "direct_urma_multi_jetty_parallel_fixed_wqe must be rejected");
+```
+
+- [ ] **Step 3: Remove fixed-WQE helper assertions**
+
+Delete assertions that call:
+
+```cpp
+TileXR::Demo::P2PFixedWqeStrideBytes(...)
+TileXR::Demo::P2PFixedWqeWindowBytes(...)
+TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DirectUrmaMultiJettyParallelFixedWqe, ...)
+```
+
+Add this replacement assertion:
+
+```cpp
+    Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DirectUrma, 4096, 8) == 4096,
+        "direct_urma window must equal payload bytes");
+```
+
+- [ ] **Step 4: Replace validation assertions for removed transports**
+
+Remove validation checks for `DirectUrmaMultiWqe`, `DirectUrmaMultiJetty`, `DirectUrmaMultiJettyParallel`, and `DirectUrmaMultiJettyParallelFixedWqe`. Add this direct URMA validation check:
+
+```cpp
+    options.transport = TileXR::Demo::P2PTransport::DirectUrma;
+    options.traffic = TileXR::Demo::P2PTraffic::BiDir;
+    options.blockDim = 8;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "valid direct_urma multi-jetty options rejected");
+```
+
+- [ ] **Step 5: Replace mismatch-check assertions**
+
+Delete the `fixedWqeBytes` setup and the two fixed-WQE mismatch-check assertions. Add this simple direct URMA assertion:
+
+```cpp
+    Require(TileXR::Demo::CountP2PTransportMismatches(
+                bytes, pattern, 4096, TileXR::Demo::P2PTransport::DirectUrma, 8) == 1,
+        "direct_urma mismatch checker must validate payload bytes");
+```
+
+This assertion comes after `bytes[17] ^= 0xff;`.
+
+- [ ] **Step 6: Replace CSV assertions for removed transports**
+
+Delete the CSV assertions for `DirectUrmaMultiWqe`, `DirectUrmaMultiJetty`, `DirectUrmaMultiJettyParallel`, and `DirectUrmaMultiJettyParallelFixedWqe`. Add this direct URMA multi-block CSV assertion:
+
+```cpp
+    row.transport = TileXR::Demo::P2PTransport::DirectUrma;
+    row.traffic = TileXR::Demo::P2PTraffic::UniDir;
+    row.blockDim = 8;
+    const std::string directUrmaParallelCsv = TileXR::Demo::FormatP2PPerfCsvRow(row);
+    Require(directUrmaParallelCsv ==
+            "direct_urma,unidir,8,1to0,1,0,2,4096,20,8.000,0.000,0.000,0.512,0.512,0,0,logs/run\n",
+        "direct_urma parallel csv row mismatch");
+```
+
+- [ ] **Step 7: Run focused unit test and verify RED**
+
+Run:
+
+```powershell
+cmake --build tests/udma/build --target test_tilexr_udma_p2p_perf_config
+```
+
+If the build directory does not exist, run:
+
+```powershell
+cmake -S tests/udma -B tests/udma/build
+cmake --build tests/udma/build --target test_tilexr_udma_p2p_perf_config
+```
+
+Expected before production changes: build fails because enum members such as `DirectUrmaMultiWqe` still exist in production but tests now expect old strings to parse as `Invalid`, or the executable fails with one of the new rejection messages.
+
+- [ ] **Step 8: Commit failing tests only**
+
+Do not commit a failing test state. This step is intentionally a review gate: inspect `git diff tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`, then proceed to Task 2 without committing.
+
+---
+
+### Task 2: Trim Config Helpers
+
+**Files:**
+- Modify: `tests/udma/demo/tilexr_udma_p2p_perf_config.h`
+- Test: `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`
+
+**Interfaces:**
+- Consumes: failing tests from Task 1
+- Produces:
+  - `enum class P2PTransport { DirectUrma, Memory, DataAsFlag, Invalid }`
+  - `ParseP2PTransport("direct_urma"|"udma") -> DirectUrma`
+  - removed transport strings return `Invalid`
+  - `P2PTransportWindowBytes(DirectUrma, bytes, blockDim) -> bytes`
+  - `P2PEffectiveTransferBytes(DirectUrma, bytes, blockDim) -> bytes`
+
+- [ ] **Step 1: Trim `P2PTransport` enum**
+
+Change the enum to:
+
+```cpp
+enum class P2PTransport {
+    DirectUrma,
+    Memory,
+    DataAsFlag,
+    Invalid,
+};
+```
+
+- [ ] **Step 2: Trim `P2PTransportName`**
+
+Keep only:
+
+```cpp
+inline const char* P2PTransportName(P2PTransport transport)
+{
+    switch (transport) {
+        case P2PTransport::DirectUrma:
+            return "direct_urma";
+        case P2PTransport::Memory:
+            return "memory";
+        case P2PTransport::DataAsFlag:
+            return "data_as_flag";
+        default:
+            return "invalid";
+    }
+}
+```
+
+- [ ] **Step 3: Trim `ParseP2PTransport`**
+
+Keep only supported names and aliases:
+
+```cpp
+inline P2PTransport ParseP2PTransport(const std::string& name)
+{
+    if (name == "direct_urma" || name == "udma") {
+        return P2PTransport::DirectUrma;
+    }
+    if (name == "memory" || name == "ipc" || name == "datacopy") {
+        return P2PTransport::Memory;
+    }
+    if (name == "data_as_flag" || name == "data-as-flag" || name == "daf") {
+        return P2PTransport::DataAsFlag;
+    }
+    return P2PTransport::Invalid;
+}
+```
+
+- [ ] **Step 4: Remove fixed-WQE helpers**
+
+Delete:
+
+```cpp
+inline uint64_t P2PAlignUp(uint64_t value, uint64_t alignment)
+inline uint64_t P2PFixedWqeStrideBytes(uint64_t payloadBytes)
+inline uint64_t P2PFixedWqeWindowBytes(uint64_t payloadBytes, uint32_t blockDim)
+```
+
+Then simplify:
+
+```cpp
+inline uint64_t P2PTransportWindowBytes(P2PTransport transport, uint64_t payloadBytes, uint32_t blockDim)
+{
+    (void)blockDim;
+    return P2PTransportWindowBytes(transport, payloadBytes);
+}
+```
+
+- [ ] **Step 5: Simplify effective bytes**
+
+Replace `P2PEffectiveTransferBytes` with:
+
+```cpp
+inline uint64_t P2PEffectiveTransferBytes(P2PTransport transport, uint64_t payloadBytes, uint32_t blockDim)
+{
+    (void)transport;
+    (void)blockDim;
+    return payloadBytes;
+}
+```
+
+- [ ] **Step 6: Update validation message**
+
+Change the invalid transport message to:
+
+```cpp
+return fail("transport must be direct_urma, memory, or data_as_flag");
+```
+
+- [ ] **Step 7: Simplify mismatch checker**
+
+Replace `CountP2PTransportMismatches` with:
+
+```cpp
+inline uint64_t CountP2PTransportMismatches(
+    const std::vector<uint8_t>& data, uint32_t pattern, uint64_t payloadBytes,
+    P2PTransport transport, uint32_t blockDim)
+{
+    (void)transport;
+    (void)blockDim;
+    return CountP2PMismatches(data, pattern, payloadBytes);
+}
+```
+
+- [ ] **Step 8: Run focused unit test and verify GREEN**
+
+Run:
+
+```powershell
+cmake --build tests/udma/build --target test_tilexr_udma_p2p_perf_config
+ctest --test-dir tests/udma/build -R test_tilexr_udma_p2p_perf_config --output-on-failure
+```
+
+Expected: target builds and the config test passes.
+
+- [ ] **Step 9: Commit config test and helper changes**
+
+Run:
+
+```powershell
+git add tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp tests/udma/demo/tilexr_udma_p2p_perf_config.h
+git commit -m "test: trim P2P transport config modes"
+```
+
+---
+
+### Task 3: Route `direct_urma` To Parallel Multi-Jetty And Remove Obsolete Kernels
+
+**Files:**
+- Modify: `tests/udma/demo/tilexr_udma_demo.cpp`
+- Modify: `tests/udma/demo/tilexr_udma_demo_kernel.cpp`
+- Test: UDMA demo build target
+
+**Interfaces:**
+- Consumes:
+  - `P2PTransport::DirectUrma`
+  - `launch_tilexr_udma_p2p_perf(uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug, int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic)`
+- Produces:
+  - `launch_tilexr_udma_p2p_perf` launches the parallel multi-jetty kernel body
+  - no host references to removed enum values
+  - no benchmark wrappers named `launch_tilexr_udma_p2p_perf_multi_wqe`, `launch_tilexr_udma_p2p_perf_multi_jetty`, `launch_tilexr_udma_p2p_perf_multi_jetty_parallel`, or `launch_tilexr_udma_p2p_perf_multi_jetty_parallel_fixed_wqe`
+
+- [ ] **Step 1: Remove obsolete extern declarations from host**
+
+In `tests/udma/demo/tilexr_udma_demo.cpp`, delete declarations for:
+
+```cpp
+launch_tilexr_udma_p2p_perf_multi_wqe
+launch_tilexr_udma_p2p_perf_multi_jetty
+launch_tilexr_udma_p2p_perf_multi_jetty_parallel
+launch_tilexr_udma_p2p_perf_multi_jetty_parallel_fixed_wqe
+```
+
+Keep the existing `launch_tilexr_udma_p2p_perf` declaration.
+
+- [ ] **Step 2: Simplify host dispatch**
+
+In `LaunchP2PKernel`, remove branches for removed transports. The final structure should be:
+
+```cpp
+    if (options.transport == TileXR::Demo::P2PTransport::Memory) {
+        launch_tilexr_memory_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, debugDev,
+            options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic);
+        return;
+    }
+    if (options.transport == TileXR::Demo::P2PTransport::DataAsFlag) {
+        launch_tilexr_data_as_flag_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev,
+            options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic);
+        return;
+    }
+    launch_tilexr_udma_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, debugDev,
+        options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic);
+```
+
+- [ ] **Step 3: Remove fixed-WQE host window special cases**
+
+In `RunP2PPerfMode`, delete `useFixedWqeSharedWindow` and `useSeparateDebugBuffer`. Replace the offset calculations with:
+
+```cpp
+    const uint64_t dstOffset = useIpcTransport ? TileXR::IPC_DATA_OFFSET : dstWindowBytes;
+    const uint64_t localDstOffset = dstWindowBytes;
+    const uint64_t debugOffset = useIpcTransport ? localDstOffset + dstWindowBytes : dstOffset + dstWindowBytes;
+    const uint64_t payloadBytes = debugOffset + kP2PDebugWords * sizeof(uint32_t);
+    const uint64_t registeredPayloadBytes = payloadBytes;
+```
+
+Remove `debugMemory` allocation and cleanup if it is only used by the fixed-WQE path after this change.
+
+- [ ] **Step 4: Simplify host source/destination initialization**
+
+Replace:
+
+```cpp
+const bool initSrc = !useFixedWqeSharedWindow || IsP2PSourceRank(rank, options);
+const bool initDst = !useFixedWqeSharedWindow || IsP2PReceiveRank(rank, options);
+```
+
+with:
+
+```cpp
+const bool initSrc = true;
+const bool initDst = true;
+```
+
+or inline the copy conditions so both local source and destination buffers are initialized on every rank.
+
+- [ ] **Step 5: Simplify UDMA QP support warning**
+
+Near the CLI option handling in `tilexr_udma_demo.cpp`, replace conditions that check removed transports:
+
+```cpp
+p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrmaMultiWqe ||
+p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrmaMultiJetty ||
+p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrmaMultiJettyParallel ||
+p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrmaMultiJettyParallelFixedWqe
+```
+
+with:
+
+```cpp
+p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrma
+```
+
+- [ ] **Step 6: Rename the parallel multi-jetty kernel body**
+
+In `tests/udma/demo/tilexr_udma_demo_kernel.cpp`, delete the old `tilexr_udma_p2p_perf_kernel` body and rename:
+
+```cpp
+tilexr_udma_p2p_perf_multi_jetty_parallel_kernel
+```
+
+to:
+
+```cpp
+tilexr_udma_p2p_perf_kernel
+```
+
+The resulting `tilexr_udma_p2p_perf_kernel` must retain the parallel logic:
+
+```cpp
+uint32_t qpNum = enabled ? TileXR::GetUDMAInfo(args)->qpNum : 0;
+uint32_t jettyCount = blockNum < qpNum ? blockNum : qpNum;
+if (blockIdx >= jettyCount) {
+    TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+    return;
+}
+TileXRUdmaDemoWqeSlice(bytes, jettyCount, blockIdx, offset, sliceBytes);
+TileXR::UDMAPutNbiQp<uint8_t>(args, peer, blockIdx, src + offset, dstByteOffset + offset, sliceBytes);
+uint32_t status = TileXR::UDMAQuietStatusQp(args, peer, blockIdx);
+```
+
+- [ ] **Step 7: Delete obsolete benchmark kernels**
+
+Delete these kernel functions:
+
+```cpp
+tilexr_udma_p2p_perf_multi_wqe_kernel
+tilexr_udma_p2p_perf_multi_jetty_kernel
+tilexr_udma_p2p_perf_multi_jetty_parallel_fixed_wqe_kernel
+```
+
+Keep helper `TileXRUdmaDemoWqeSlice` because the renamed direct URMA kernel uses it. Delete `TileXRUdmaDemoFixedWqeSlice` if it becomes unused.
+
+- [ ] **Step 8: Delete obsolete launch wrappers**
+
+Delete wrappers:
+
+```cpp
+launch_tilexr_udma_p2p_perf_multi_wqe
+launch_tilexr_udma_p2p_perf_multi_jetty
+launch_tilexr_udma_p2p_perf_multi_jetty_parallel
+launch_tilexr_udma_p2p_perf_multi_jetty_parallel_fixed_wqe
+```
+
+Make existing `launch_tilexr_udma_p2p_perf` launch the renamed parallel kernel:
+
+```cpp
+void launch_tilexr_udma_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic)
+{
+    tilexr_udma_p2p_perf_kernel<<<blockDim, nullptr, stream>>>(
+        commArgs, src, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic);
+}
+```
+
+- [ ] **Step 9: Search for removed enum/function references**
+
+Run:
+
+```powershell
+rg -n "DirectUrmaMulti|direct_urma_multi|multi_wqe|multi_jetty_parallel_fixed_wqe|fixed_wqe|launch_tilexr_udma_p2p_perf_multi" tests src docs
+```
+
+Expected after Task 3 code changes but before docs cleanup: only docs or tests planned for Task 4 may still mention removed user-facing names.
+
+- [ ] **Step 10: Build demo/unit targets**
+
+Run:
+
+```powershell
+cmake --build tests/udma/build
+ctest --test-dir tests/udma/build -R "test_tilexr_udma_p2p_perf_config|test_tilexr_udma_transport_layout" --output-on-failure
+```
+
+Expected: build succeeds; focused host-only tests pass. If CANN/Ascend build dependencies are unavailable on this host, capture the exact failure and continue to docs cleanup.
+
+- [ ] **Step 11: Commit host/kernel trim**
+
+Run:
+
+```powershell
+git add tests/udma/demo/tilexr_udma_demo.cpp tests/udma/demo/tilexr_udma_demo_kernel.cpp
+git commit -m "feat: fold P2P direct URMA into parallel jetty path"
+```
+
+---
+
+### Task 4: Update Scripts, Documentation, And PR Description
+
+**Files:**
+- Modify: `tests/udma/demo/run_tilexr_udma_p2p_perf.sh`
+- Modify: `tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh`
+- Modify: `docs/TileXR_UDMA_P2P_PERF_GUIDE.md`
+- Modify: `tests/udma/demo/README.md`
+- External: GitHub PR #40 description
+
+**Interfaces:**
+- Consumes: supported transport names from Task 2
+- Produces: scripts and docs that mention only `direct_urma`, `memory`, and `data_as_flag`
+
+- [ ] **Step 1: Update runner QP setup**
+
+In `tests/udma/demo/run_tilexr_udma_p2p_perf.sh`, replace the transport conditional with:
+
+```bash
+if [ "${transport}" = "direct_urma" ] || [ "${transport}" = "udma" ]; then
+    export TILEXR_UDMA_QP_NUM="${block_dim}"
+else
+    export TILEXR_UDMA_QP_NUM="${TILEXR_UDMA_QP_NUM:-1}"
+fi
+```
+
+- [ ] **Step 2: Verify sweep defaults**
+
+Confirm `tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh` default remains:
+
+```bash
+transports_csv=${8:-direct_urma,memory,data_as_flag}
+```
+
+If it differs, change it to that exact line.
+
+- [ ] **Step 3: Update docs transport model**
+
+In `docs/TileXR_UDMA_P2P_PERF_GUIDE.md`, update the transport list to:
+
+```markdown
+The P2P perf mode supports three user-facing transport modes:
+
+- `direct_urma`: registered-memory UDMA transfer. Internally this path uses
+  the parallel multi-jetty kernel; `block_dim=1` with one QP is the single-jetty
+  baseline, while `block_dim=N` with `TILEXR_UDMA_QP_NUM=N` uses up to `N`
+  QPs/jettys in parallel.
+- `memory`: peer-memory IPC comparison using Ascend C `DataCopyPad`.
+- `data_as_flag`: peer-memory IPC comparison where each 512B block carries
+  480B payload plus a 32B ready flag.
+```
+
+Also update argument descriptions so `transport` says:
+
+```markdown
+- `transport`: `direct_urma`, `memory`, or `data_as_flag`; default is `direct_urma`.
+```
+
+- [ ] **Step 4: Remove obsolete docs wording**
+
+Run:
+
+```powershell
+rg -n "direct_urma_multi|multi-WQE|multi WQE|fixed-WQE|fixed_wqe|direct_urma_multi_jetty_parallel" docs tests/udma/demo
+```
+
+Edit `docs/TileXR_UDMA_P2P_PERF_GUIDE.md` and `tests/udma/demo/README.md` until no user-facing obsolete transport names remain. It is acceptable for the design spec and plan files under `docs/superpowers/` to mention removed names as historical scope.
+
+- [ ] **Step 5: Update PR #40 body with `gh`**
+
+Use the installed GitHub CLI full path if `gh` is not active in `PATH`:
+
+```powershell
+& 'C:\Users\h30059441\AppData\Local\GitHubCLI\bin\gh.exe' pr edit 40 --repo LingquLab/TileXR --body @"
+## Summary
+
+Extend the UDMA P2P performance demo with a trimmed three-mode transport matrix.
+
+This PR adds:
+
+- `test_type=4` UDMA P2P performance mode and usage documentation
+- `direct_urma` registered-memory UDMA transfer with built-in parallel multi-jetty support
+- `memory` peer-memory IPC comparison
+- `data_as_flag` peer-memory IPC comparison with embedded ready flags
+- unidirectional and bidirectional P2P traffic modes
+- CSV output, plotting helper, P2P runner script, and concurrency sweep script
+- host-side unit coverage for P2P config parsing/window sizing and UDMA transport layout metadata
+
+## Test plan
+
+- [x] `git diff --cached --check`
+- [ ] `bash tests/udma/run_tests.sh`
+- [ ] Run `tests/udma/demo/run_tilexr_udma_p2p_perf.sh` on supported Ascend hardware
+- [ ] Run `tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh` on supported Ascend hardware
+"@
+```
+
+- [ ] **Step 6: Run final searches**
+
+Run:
+
+```powershell
+rg -n "direct_urma_multi|multi-WQE|multi WQE|fixed-WQE|fixed_wqe|DirectUrmaMulti|launch_tilexr_udma_p2p_perf_multi" tests src docs --glob "!docs/superpowers/**"
+```
+
+Expected: no matches.
+
+- [ ] **Step 7: Run verification**
+
+Run:
+
+```powershell
+git diff --check
+ctest --test-dir tests/udma/build -R "test_tilexr_udma_p2p_perf_config|test_tilexr_udma_transport_layout" --output-on-failure
+```
+
+If `tests/udma/build` is unavailable, run:
+
+```powershell
+cmake -S tests/udma -B tests/udma/build
+cmake --build tests/udma/build
+ctest --test-dir tests/udma/build -R "test_tilexr_udma_p2p_perf_config|test_tilexr_udma_transport_layout" --output-on-failure
+```
+
+Expected: whitespace check passes; focused tests pass. If environment dependencies prevent building, record the exact command and failure.
+
+- [ ] **Step 8: Commit docs/script cleanup**
+
+Run:
+
+```powershell
+git add tests/udma/demo/run_tilexr_udma_p2p_perf.sh tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh docs/TileXR_UDMA_P2P_PERF_GUIDE.md tests/udma/demo/README.md
+git commit -m "docs: describe trimmed P2P transport modes"
+```
+
+---
+
+## Self-Review Checklist
+
+- Spec coverage:
+  - Three transport names covered in Tasks 1, 2, and 4.
+  - `direct_urma` folded into parallel multi-jetty path covered in Task 3.
+  - Removed modes covered in Tasks 1, 2, 3, and 4.
+  - Script QP behavior covered in Task 4.
+  - TDD red/green covered in Tasks 1 and 2.
+- Placeholder scan: no unfinished placeholder wording is intentional in executable steps.
+- Type consistency:
+  - `P2PTransport::DirectUrma`, `Memory`, `DataAsFlag`, and `Invalid` are the only public enum values used after Task 2.
+  - `launch_tilexr_udma_p2p_perf` remains the host-visible UDMA launcher name.
diff --git a/docs/superpowers/plans/2026-06-26-data-as-flag-epoch-ordered.md b/docs/superpowers/plans/2026-06-26-data-as-flag-epoch-ordered.md
new file mode 100644
index 0000000..7b93261
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-26-data-as-flag-epoch-ordered.md
@@ -0,0 +1,1215 @@
+# DataAsFlag Epoch-Ordered P2P Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Add a side-by-side `data_as_flag_epoch_ordered` P2P transport that uses per-launch epochs and ordered payload/flag MTE3 writes before later replacing legacy `data_as_flag`.
+
+**Architecture:** The implementation extends the existing UDMA P2P demo transport table, keeps the legacy data-as-flag helpers intact, and adds epoch-ordered helper APIs in `src/include/tilexr_data_as_flag.h`. The new P2P kernel reuses the existing data-as-flag role and slice logic, but sender writes payload and epoch flag in separate MTE3 stages while receiver waits on one batch commit flag.
+
+**Tech Stack:** C++14 host code, AscendC AICore device code, CMake UDMA test targets, remote Ascend950 validation through `docs/ASCEND_REMOTE_BUILD_RUNBOOK.md`.
+
+## Global Constraints
+
+- Keep the existing block layout: `512B = 480B payload + 32B flag`.
+- Keep ready information embedded in the data-as-flag window; do not move it to `SyncCollectives` or an independent flag region.
+- Use per-launch `magic` and `step` to build a 64-bit epoch.
+- Do not depend on payload and flag visibility within the same MTE3 operation.
+- Phase 1 must keep legacy `data_as_flag` selectable for A/B comparison.
+- `data_as_flag_epoch_ordered` must use the IPC peer data window and the same window byte calculation as legacy `data_as_flag`.
+- Remote validation targets the existing Ascend environment in `docs/ASCEND_REMOTE_BUILD_RUNBOOK.md`; prefer NPU6/7 if 4/5 or 2/3 are suspect.
+- Do not store the remote password in repo files or scripts.
+
+---
+
+## File Structure
+
+- Modify `tests/udma/demo/tilexr_udma_p2p_perf_config.h`
+  - Add the `DataAsFlagEpochOrdered` transport enum value and helpers.
+  - Keep legacy `DataAsFlag` behavior unchanged.
+- Modify `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`
+  - Add failing then passing config tests for the new transport.
+- Modify `src/include/tilexr_data_as_flag.h`
+  - Add epoch construction and epoch-ordered send/receive helpers.
+  - Keep existing `DataAsFlagSend` and `DataAsFlagCheckAndRecv` untouched for legacy mode.
+- Modify `tests/udma/demo/tilexr_udma_demo_kernel.cpp`
+  - Add the new epoch-ordered P2P kernel and launcher wrapper.
+  - Reuse existing data-as-flag role resolution and slicing helpers.
+- Modify `tests/udma/demo/tilexr_udma_demo.cpp`
+  - Add the launcher declaration and host dispatch.
+  - Pass `magic` and `step` to the new mode.
+  - Ensure legacy data-as-flag per-iteration clear barriers do not apply to the new mode.
+- Modify `tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp`
+  - Add source structure checks for the new helper, kernel, launcher, and host wiring.
+- Optionally modify `tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh`
+  - Include `data_as_flag_epoch_ordered` in default sweeps only after smoke validation passes.
+- Add CSV outputs under `run/csv_full/` only during validation, not as part of core implementation commits unless the user explicitly asks to keep results.
+
+---
+
+### Task 1: Add Transport Config and Unit Tests
+
+**Files:**
+- Modify: `tests/udma/demo/tilexr_udma_p2p_perf_config.h`
+- Modify: `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`
+
+**Interfaces:**
+- Consumes: Existing `P2PTransport`, `P2PTransportName`, `ParseP2PTransport`, `P2PTransportWindowBytes`, `P2PTransportUsesIpc`, `P2PTransportBothRanksActive`, `ValidateP2PPerfOptions`, `FormatP2PPerfCsvRow`.
+- Produces:
+  - `P2PTransport::DataAsFlagEpochOrdered`
+  - Transport name: `"data_as_flag_epoch_ordered"`
+  - Aliases: `"data-as-flag-epoch-ordered"`, `"daf_epoch_ordered"`
+
+- [ ] **Step 1: Write failing enum/name/parse tests**
+
+Add these checks near the existing `DataAsFlag` transport checks in `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`:
+
+```cpp
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered) ==
+            "data_as_flag_epoch_ordered",
+        "data_as_flag_epoch_ordered transport name mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("data_as_flag_epoch_ordered") ==
+            TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered,
+        "data_as_flag_epoch_ordered transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("data-as-flag-epoch-ordered") ==
+            TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered,
+        "data-as-flag-epoch-ordered alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("daf_epoch_ordered") ==
+            TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered,
+        "daf_epoch_ordered alias parse mismatch");
+```
+
+- [ ] **Step 2: Write failing layout/helper tests**
+
+Add these checks after the existing data-as-flag window checks:
+
+```cpp
+    Require(TileXR::Demo::P2PTransportWindowBytes(
+                TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 0) == 0,
+        "data_as_flag_epoch_ordered zero layout mismatch");
+    Require(TileXR::Demo::P2PTransportWindowBytes(
+                TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 480) == 512,
+        "data_as_flag_epoch_ordered 480B layout mismatch");
+    Require(TileXR::Demo::P2PTransportWindowBytes(
+                TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 481) == 1024,
+        "data_as_flag_epoch_ordered 481B layout mismatch");
+    Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered),
+        "data_as_flag_epoch_ordered must use IPC peer window");
+    Require(TileXR::Demo::P2PTransportBothRanksActive(
+                TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, TileXR::Demo::P2PTraffic::UniDir),
+        "data_as_flag_epoch_ordered unidir must keep receiver active");
+```
+
+- [ ] **Step 3: Write failing validation and CSV tests**
+
+Add these checks near existing `DataAsFlag` validation and CSV assertions:
+
+```cpp
+    options.transport = TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered;
+    options.traffic = TileXR::Demo::P2PTraffic::UniDir;
+    options.blockDim = 4;
+    options.maxBytes = 16384;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "valid data_as_flag_epoch_ordered options rejected");
+
+    row.transport = TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered;
+    row.traffic = TileXR::Demo::P2PTraffic::UniDir;
+    row.blockDim = 4;
+    const std::string epochOrderedCsv = TileXR::Demo::FormatP2PPerfCsvRow(row);
+    Require(epochOrderedCsv ==
+            "data_as_flag_epoch_ordered,unidir,4,1to0,1,0,2,4096,20,8.000,0.000,0.000,0.512,0.512,0,0,logs/run\n",
+        "data_as_flag_epoch_ordered csv row mismatch");
+```
+
+- [ ] **Step 4: Run the config test and confirm it fails**
+
+Run on a Linux/Ascend build environment:
+
+```bash
+cd /home/h30059441/TileXR
+source scripts/common_env.sh
+cd tests/udma
+bash build.sh
+./install/bin/test_tilexr_udma_p2p_perf_config
+```
+
+Expected before implementation: compile failure mentioning `DataAsFlagEpochOrdered` is not a member of `P2PTransport`.
+
+- [ ] **Step 5: Implement transport config**
+
+Modify `tests/udma/demo/tilexr_udma_p2p_perf_config.h`:
+
+```cpp
+enum class P2PTransport {
+    DirectUrma,
+    DirectUrmaPostOnly,
+    Memory,
+    MemoryConsume,
+    DataAsFlag,
+    DataAsFlagEpochOrdered,
+    Invalid,
+};
+```
+
+Add name mapping:
+
+```cpp
+        case P2PTransport::DataAsFlagEpochOrdered:
+            return "data_as_flag_epoch_ordered";
+```
+
+Add parser mapping:
+
+```cpp
+    if (name == "data_as_flag_epoch_ordered" ||
+        name == "data-as-flag-epoch-ordered" ||
+        name == "daf_epoch_ordered") {
+        return P2PTransport::DataAsFlagEpochOrdered;
+    }
+```
+
+Update window bytes:
+
+```cpp
+    return (transport == P2PTransport::DataAsFlag ||
+            transport == P2PTransport::DataAsFlagEpochOrdered) ?
+        DataAsFlagWindowBytes(payloadBytes) : payloadBytes;
+```
+
+Update IPC and active-rank helpers:
+
+```cpp
+inline bool P2PTransportUsesIpc(P2PTransport transport)
+{
+    return transport == P2PTransport::Memory ||
+        transport == P2PTransport::MemoryConsume ||
+        transport == P2PTransport::DataAsFlag ||
+        transport == P2PTransport::DataAsFlagEpochOrdered;
+}
+
+inline bool P2PTransportBothRanksActive(P2PTransport transport, P2PTraffic traffic)
+{
+    return traffic == P2PTraffic::BiDir ||
+        transport == P2PTransport::MemoryConsume ||
+        transport == P2PTransport::DataAsFlag ||
+        transport == P2PTransport::DataAsFlagEpochOrdered;
+}
+```
+
+Update the validation error string:
+
+```cpp
+return fail("transport must be direct_urma, direct_urma_post_only, memory, memory_consume, data_as_flag, or data_as_flag_epoch_ordered");
+```
+
+- [ ] **Step 6: Run the config test and confirm it passes**
+
+Run:
+
+```bash
+cd /home/h30059441/TileXR/tests/udma
+bash build.sh
+./install/bin/test_tilexr_udma_p2p_perf_config
+```
+
+Expected: exit code `0`.
+
+- [ ] **Step 7: Commit Task 1**
+
+```bash
+git add tests/udma/demo/tilexr_udma_p2p_perf_config.h \
+        tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp
+git commit -m "feat: add epoch ordered data-as-flag transport config"
+```
+
+---
+
+### Task 2: Add Epoch-Ordered Device Helpers
+
+**Files:**
+- Modify: `src/include/tilexr_data_as_flag.h`
+
+**Interfaces:**
+- Consumes: Existing constants `DATA_AS_FLAG_BLOCK_BYTES`, `DATA_AS_FLAG_PAYLOAD_BYTES`, `DATA_AS_FLAG_FLAG_BYTES`, `DATA_AS_FLAG_FLAG_OFFSET_BYTES`, `DataAsFlagBlockCountForPayloadBytes`, `DataAsFlagScratchBytes`, `DataAsFlagMaxRecvBlocks`, `DataAsFlagCopyPayloadToScratch`, `DataAsFlagCopyBatchToRecvGM`.
+- Produces:
+  - `uint64_t DataAsFlagEpoch(int32_t magic, int32_t step)`
+  - `uint32_t DataAsFlagMaxEpochOrderedSendBlocks(uint32_t scratchBytes)`
+  - `uint32_t DataAsFlagMaxEpochOrderedRecvBlocks(uint32_t scratchBytes)`
+  - `uint32_t DataAsFlagSendEpochOrdered(...)`
+  - `bool DataAsFlagCheckAndRecvEpochOrdered(...)`
+
+- [ ] **Step 1: Add source guard tests before implementation**
+
+This task's implementation will be guarded in Task 5, but first verify absence manually:
+
+```bash
+rg -n "DataAsFlagEpoch|DataAsFlagSendEpochOrdered|DataAsFlagCheckAndRecvEpochOrdered" src/include/tilexr_data_as_flag.h
+```
+
+Expected before implementation: no matches.
+
+- [ ] **Step 2: Add epoch helper**
+
+Add this helper in `src/include/tilexr_data_as_flag.h` outside the AscendC-only block so unit/source checks can find it:
+
+```cpp
+TILEXR_DATA_AS_FLAG_INLINE uint64_t DataAsFlagEpoch(int32_t magic, int32_t step)
+{
+    return (static_cast<uint64_t>(static_cast<uint32_t>(magic)) << 32) |
+        static_cast<uint32_t>(step);
+}
+```
+
+- [ ] **Step 3: Add epoch ordered batch capacity helpers**
+
+Inside `#if TILEXR_ASCENDC_AICORE_COMPILE`, add:
+
+```cpp
+__aicore__ inline uint32_t DataAsFlagMaxEpochOrderedSendBlocks(uint32_t scratchBytes)
+{
+    uint32_t blocks = scratchBytes / DATA_AS_FLAG_BLOCK_BYTES;
+    while (blocks > 0U) {
+        const uint64_t requiredBytes =
+            static_cast<uint64_t>(blocks) * DATA_AS_FLAG_BLOCK_BYTES +
+            static_cast<uint64_t>(blocks) * DATA_AS_FLAG_FLAG_BYTES;
+        if (static_cast<uint64_t>(scratchBytes) >= requiredBytes) {
+            return blocks;
+        }
+        --blocks;
+    }
+    return 0U;
+}
+
+__aicore__ inline uint32_t DataAsFlagMaxEpochOrderedRecvBlocks(uint32_t scratchBytes)
+{
+    return DataAsFlagMaxRecvBlocks(scratchBytes);
+}
+```
+
+This conservative v1 uses one scratch region for packed payload blocks and a second scratch region for epoch flags.
+
+- [ ] **Step 4: Add flag fill and flag copy helpers**
+
+Add:
+
+```cpp
+__aicore__ inline void DataAsFlagFillEpochFlags(
+    AscendC::LocalTensor<uint8_t>& flagScratch,
+    uint32_t blockCount,
+    uint64_t epoch)
+{
+    AscendC::LocalTensor<uint64_t> flagWords = flagScratch.template ReinterpretCast<uint64_t>();
+    const uint32_t words = blockCount * DATA_AS_FLAG_FLAG_BYTES / sizeof(uint64_t);
+    for (uint32_t i = 0; i < words; ++i) {
+        flagWords.SetValue(i, epoch);
+    }
+    AscendC::SetFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);
+    AscendC::WaitFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);
+}
+
+__aicore__ inline void DataAsFlagCopyEpochFlagsToGM(
+    __gm__ uint8_t* dstDataAsFlagGM,
+    uint32_t dstBlockOffset,
+    AscendC::LocalTensor<uint8_t>& flagScratch,
+    uint32_t batchBlocks)
+{
+    AscendC::GlobalTensor<uint8_t> flagGlobal;
+    flagGlobal.SetGlobalBuffer(
+        dstDataAsFlagGM + static_cast<uint64_t>(dstBlockOffset) * DATA_AS_FLAG_BLOCK_BYTES +
+        DATA_AS_FLAG_FLAG_OFFSET_BYTES);
+    AscendC::DataCopyExtParams flagParams {
+        static_cast<uint16_t>(batchBlocks),
+        DATA_AS_FLAG_FLAG_BYTES,
+        0U,
+        DATA_AS_FLAG_PAYLOAD_BYTES / DATA_AS_FLAG_ALIGN_BYTES,
+        0U};
+    AscendC::DataCopyPadExtParams<uint8_t> padParams {false, 0U, 0U, 0U};
+    AscendC::DataCopyPad(flagGlobal, flagScratch, flagParams, padParams);
+}
+```
+
+- [ ] **Step 5: Add `DataAsFlagSendEpochOrdered`**
+
+Add:
+
+```cpp
+__aicore__ inline uint32_t DataAsFlagSendEpochOrdered(
+    __gm__ uint8_t* dstDataAsFlagGM,
+    const __gm__ uint8_t* srcGM,
+    uint64_t dataBytes,
+    uint64_t epoch,
+    AscendC::LocalTensor<uint8_t>& scratch)
+{
+    if (dstDataAsFlagGM == nullptr || srcGM == nullptr || dataBytes == 0U) {
+        return 0U;
+    }
+
+    const uint32_t totalBlocks = DataAsFlagBlockCountForPayloadBytes(dataBytes);
+    const uint32_t batchCapacity = DataAsFlagMaxEpochOrderedSendBlocks(DataAsFlagScratchBytes(scratch));
+    if (batchCapacity == 0U) {
+        return 0U;
+    }
+
+    uint32_t sentBlocks = 0U;
+    uint64_t sentBytes = 0U;
+    while (sentBlocks < totalBlocks) {
+        const uint32_t remainingBlocks = totalBlocks - sentBlocks;
+        const uint32_t batchBlocks = remainingBlocks < batchCapacity ? remainingBlocks : batchCapacity;
+        const uint64_t maxBatchBytes = static_cast<uint64_t>(batchBlocks) * DATA_AS_FLAG_PAYLOAD_BYTES;
+        const uint64_t remainingBytes = dataBytes - sentBytes;
+        const uint32_t batchPayloadBytes = static_cast<uint32_t>(
+            remainingBytes < maxBatchBytes ? remainingBytes : maxBatchBytes);
+        const uint32_t fullBlocks = batchPayloadBytes / DATA_AS_FLAG_PAYLOAD_BYTES;
+        const uint32_t tailBytes = batchPayloadBytes % DATA_AS_FLAG_PAYLOAD_BYTES;
+
+        AscendC::LocalTensor<uint8_t> payloadScratch = scratch;
+        AscendC::LocalTensor<uint8_t> flagScratch =
+            scratch[static_cast<uint64_t>(batchBlocks) * DATA_AS_FLAG_BLOCK_BYTES];
+
+        AscendC::Duplicate<uint8_t>(payloadScratch, 0U, batchBlocks * DATA_AS_FLAG_BLOCK_BYTES);
+        AscendC::PipeBarrier<PIPE_ALL>();
+        DataAsFlagCopyPayloadToScratch(payloadScratch, srcGM, sentBytes, fullBlocks, tailBytes);
+        AscendC::SetFlag<AscendC::HardEvent::MTE2_MTE3>(EVENT_ID0);
+        AscendC::WaitFlag<AscendC::HardEvent::MTE2_MTE3>(EVENT_ID0);
+        DataAsFlagCopyScratchToDataAsFlagGM(dstDataAsFlagGM, sentBlocks, payloadScratch, batchBlocks);
+        AscendC::SetFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+        AscendC::WaitFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+
+        DataAsFlagFillEpochFlags(flagScratch, batchBlocks, epoch);
+        DataAsFlagCopyEpochFlagsToGM(dstDataAsFlagGM, sentBlocks, flagScratch, batchBlocks);
+        AscendC::SetFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+        AscendC::WaitFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+
+        sentBlocks += batchBlocks;
+        sentBytes += batchPayloadBytes;
+    }
+    AscendC::PipeBarrier<PIPE_ALL>();
+    return totalBlocks;
+}
+```
+
+Note: the first MTE3 writes packed 512B blocks with zero flags. The second MTE3 writes current epoch flags. This satisfies the ordered protocol while avoiding a risky first version of strided payload-only GM writes.
+
+- [ ] **Step 6: Add commit flag polling helper**
+
+Add:
+
+```cpp
+__aicore__ inline uint64_t DataAsFlagLoadEpochFlag(
+    const __gm__ uint8_t* dataAsFlagGM,
+    uint32_t blockIndex,
+    AscendC::LocalTensor<uint8_t>& scratch)
+{
+    AscendC::GlobalTensor<uint64_t> flagGlobal;
+    flagGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(
+        const_cast<__gm__ uint8_t*>(
+            dataAsFlagGM + static_cast<uint64_t>(blockIndex) * DATA_AS_FLAG_BLOCK_BYTES +
+            DATA_AS_FLAG_FLAG_OFFSET_BYTES)));
+    AscendC::LocalTensor<uint64_t> flagLocal = scratch.template ReinterpretCast<uint64_t>();
+    AscendC::DataCopyExtParams params {1U, sizeof(uint64_t), 0U, 0U, 0U};
+    AscendC::DataCopyPadExtParams<uint64_t> padParams {false, 0U, 0U, 0U};
+    AscendC::DataCopyPad(flagLocal, flagGlobal, params, padParams);
+    AscendC::SetFlag<AscendC::HardEvent::MTE2_S>(EVENT_ID0);
+    AscendC::WaitFlag<AscendC::HardEvent::MTE2_S>(EVENT_ID0);
+    return flagLocal.GetValue(0);
+}
+```
+
+- [ ] **Step 7: Add strict batch checker**
+
+Add:
+
+```cpp
+__aicore__ inline bool DataAsFlagCheckBatchEpochStrict(
+    const __gm__ uint8_t* dataAsFlagGM,
+    uint32_t blockOffset,
+    uint32_t batchBlocks,
+    uint64_t epoch,
+    AscendC::LocalTensor<uint8_t>& scratch)
+{
+    if (batchBlocks == 0U) {
+        return false;
+    }
+    for (uint32_t i = 0; i < batchBlocks; ++i) {
+        if (DataAsFlagLoadEpochFlag(dataAsFlagGM, blockOffset + i, scratch) != epoch) {
+            return false;
+        }
+    }
+    return true;
+}
+```
+
+- [ ] **Step 8: Add `DataAsFlagCheckAndRecvEpochOrdered`**
+
+Add:
+
+```cpp
+__aicore__ inline bool DataAsFlagCheckAndRecvEpochOrdered(
+    const __gm__ uint8_t* dataAsFlagGM,
+    uint64_t dataBytes,
+    __gm__ uint8_t* recvGM,
+    uint64_t epoch,
+    AscendC::LocalTensor<uint8_t>& recvScratch,
+    bool strict)
+{
+    if (dataAsFlagGM == nullptr || recvGM == nullptr) {
+        return false;
+    }
+    if (dataBytes == 0U) {
+        return true;
+    }
+
+    const uint32_t totalBlocks = DataAsFlagBlockCountForPayloadBytes(dataBytes);
+    const uint32_t batchCapacity = DataAsFlagMaxEpochOrderedRecvBlocks(DataAsFlagScratchBytes(recvScratch));
+    if (batchCapacity == 0U) {
+        return false;
+    }
+
+    uint32_t processedBlocks = 0U;
+    uint64_t processedBytes = 0U;
+    while (processedBlocks < totalBlocks) {
+        const uint32_t remainingBlocks = totalBlocks - processedBlocks;
+        const uint32_t batchBlocks = remainingBlocks < batchCapacity ? remainingBlocks : batchCapacity;
+        const uint32_t lastBlock = processedBlocks + batchBlocks - 1U;
+        while (DataAsFlagLoadEpochFlag(dataAsFlagGM, lastBlock, recvScratch) != epoch) {
+        }
+        if (strict && !DataAsFlagCheckBatchEpochStrict(dataAsFlagGM, processedBlocks, batchBlocks, epoch, recvScratch)) {
+            return false;
+        }
+
+        const uint64_t remainingBytes = dataBytes - processedBytes;
+        const uint64_t maxBatchBytes = static_cast<uint64_t>(batchBlocks) * DATA_AS_FLAG_PAYLOAD_BYTES;
+        const uint32_t batchBytes = static_cast<uint32_t>(
+            remainingBytes < maxBatchBytes ? remainingBytes : maxBatchBytes);
+        DataAsFlagCopyBatchToRecvGM(
+            dataAsFlagGM, processedBlocks, processedBytes, batchBytes, recvGM, recvScratch);
+        processedBlocks += batchBlocks;
+        processedBytes += batchBytes;
+    }
+    AscendC::PipeBarrier<PIPE_ALL>();
+    return true;
+}
+```
+
+- [ ] **Step 9: Build and fix compiler issues**
+
+Run:
+
+```bash
+cd /home/h30059441/TileXR
+source scripts/common_env.sh
+cd tests/udma
+bash build.sh
+```
+
+Expected: build succeeds. If AscendC rejects `Duplicate<uint8_t>` or `DataCopyPad` template parameters, adjust only the helper internals while preserving the same signatures and two-stage MTE3 protocol.
+
+- [ ] **Step 10: Commit Task 2**
+
+```bash
+git add src/include/tilexr_data_as_flag.h
+git commit -m "feat: add epoch ordered data-as-flag helpers"
+```
+
+---
+
+### Task 3: Add P2P Kernel and Launcher
+
+**Files:**
+- Modify: `tests/udma/demo/tilexr_udma_demo_kernel.cpp`
+
+**Interfaces:**
+- Consumes:
+  - `TileXRUdmaDemoResolveDataAsFlagRole`
+  - `TileXRUdmaDemoDataAsFlagSlice`
+  - `TileXR::DataAsFlagEpoch`
+  - `TileXR::DataAsFlagSendEpochOrdered`
+  - `TileXR::DataAsFlagCheckAndRecvEpochOrdered`
+- Produces:
+  - `tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel`
+  - `launch_tilexr_data_as_flag_epoch_ordered_p2p_perf`
+
+- [ ] **Step 1: Add failing source guard expectation**
+
+Temporarily add these checks to `tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp` in a new helper `TestDataAsFlagEpochOrderedSource()`:
+
+```cpp
+void TestDataAsFlagEpochOrderedSource()
+{
+    const std::string kernelPath = "tests/udma/demo/tilexr_udma_demo_kernel.cpp";
+    const std::string kernelText = ReadFile(kernelPath);
+    CheckContains(kernelPath, kernelText, "tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel");
+    CheckContains(kernelPath, kernelText, "launch_tilexr_data_as_flag_epoch_ordered_p2p_perf");
+    CheckContains(kernelPath, kernelText, "DataAsFlagEpoch(magic, step)");
+    CheckContains(kernelPath, kernelText, "DataAsFlagSendEpochOrdered");
+    CheckContains(kernelPath, kernelText, "DataAsFlagCheckAndRecvEpochOrdered");
+    CheckContains(kernelPath, kernelText, "int32_t magic, int32_t step");
+}
+```
+
+Call it from `main()`:
+
+```cpp
+    TestDataAsFlagEpochOrderedSource();
+```
+
+Run:
+
+```bash
+cd /home/h30059441/TileXR/tests/udma
+bash build.sh
+./install/bin/test_tilexr_udma_p2p_source_guard
+```
+
+Expected before kernel implementation: source guard failure listing missing strings.
+
+- [ ] **Step 2: Add the epoch-ordered kernel**
+
+Add this kernel beside `tilexr_data_as_flag_p2p_perf_kernel` in `tests/udma/demo/tilexr_udma_demo_kernel.cpp`:
+
+```cpp
+extern "C" __global__ __aicore__ void tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel(
+    GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR dstGM, GM_ADDR debugGM,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step)
+{
+    if constexpr (g_coreType == AscendC::AIV) {
+        auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM);
+        auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM);
+        auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM);
+        auto dst = reinterpret_cast<__gm__ uint8_t*>(dstGM);
+
+        int32_t rank = args->rank;
+        uint32_t blockIdx = AscendC::GetBlockIdx();
+        uint32_t blockNum = AscendC::GetBlockNum();
+        const uint64_t epoch = TileXR::DataAsFlagEpoch(magic, step);
+        if (debug != nullptr && blockIdx == 0) {
+            debug[0] = TILEXR_UDMA_DEMO_MAGIC;
+            debug[1] = rank;
+            debug[2] = 1;
+            debug[3] = bytes;
+            debug[4] = pattern;
+            debug[5] = 0xffffffffu;
+            debug[6] = blockNum;
+            debug[7] = static_cast<uint32_t>(epoch & 0xffffffffu);
+        }
+
+        bool isSender = false;
+        bool isReceiver = false;
+        int32_t peer = -1;
+        if (blockNum == 0 ||
+            !TileXRUdmaDemoResolveDataAsFlagRole(rank, srcRank, dstRank, traffic, isSender, isReceiver, peer) ||
+            peer < 0 || peer >= args->rankSize) {
+            return;
+        }
+
+        AscendC::GlobalTensor<GM_ADDR> peerMems;
+        peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE);
+        GM_ADDR peerBase = peerMems.GetValue(peer);
+        GM_ADDR localBase = peerMems.GetValue(rank);
+        if (peerBase == nullptr || localBase == nullptr || dst == nullptr || (isSender && src == nullptr)) {
+            uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2);
+            if (debug != nullptr && blockIdx == 0) {
+                debug[5] = status;
+            }
+            return;
+        }
+
+        uint32_t payloadOffset = 0;
+        uint32_t sliceBytes = 0;
+        uint32_t dataAsFlagOffset = 0;
+        TileXRUdmaDemoDataAsFlagSlice(bytes, blockNum, blockIdx, payloadOffset, sliceBytes, dataAsFlagOffset);
+        if (debug != nullptr && blockIdx < 8) {
+            debug[16 + blockIdx] = payloadOffset;
+            debug[24 + blockIdx] = sliceBytes;
+        }
+        if (sliceBytes == 0) {
+            TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+            return;
+        }
+
+        AscendC::TPipe pipe;
+        AscendC::TBuf<AscendC::QuePosition::VECCALC> tBuf;
+        pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES);
+        AscendC::LocalTensor<uint8_t> scratch = tBuf.Get<uint8_t>();
+
+        uint32_t status = 0;
+        if (isSender) {
+            uint32_t sentBlocks = TileXR::DataAsFlagSendEpochOrdered(
+                reinterpret_cast<__gm__ uint8_t*>(peerBase + dstByteOffset + dataAsFlagOffset),
+                src + payloadOffset, sliceBytes, epoch, scratch);
+            if (sentBlocks == 0U) {
+                status = 3;
+            }
+        }
+        if (isReceiver && status == 0) {
+            bool strict = false;
+            bool received = TileXR::DataAsFlagCheckAndRecvEpochOrdered(
+                reinterpret_cast<__gm__ uint8_t*>(localBase + dstByteOffset + dataAsFlagOffset),
+                sliceBytes, dst + payloadOffset, epoch, scratch, strict);
+            if (!received) {
+                status = 4;
+            }
+        }
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status);
+        if (debug != nullptr && blockIdx == 0) {
+            debug[5] = status;
+        }
+    }
+}
+```
+
+- [ ] **Step 3: Add launcher wrapper**
+
+Add near the existing `launch_tilexr_data_as_flag_p2p_perf`:
+
+```cpp
+void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step)
+{
+    tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel<<<blockDim, nullptr, stream>>>(
+        commArgs, src, dst, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic, magic, step);
+}
+```
+
+- [ ] **Step 4: Build and run source guard**
+
+Run:
+
+```bash
+cd /home/h30059441/TileXR
+source scripts/common_env.sh
+cd tests/udma
+bash build.sh
+./install/bin/test_tilexr_udma_p2p_source_guard
+```
+
+Expected: build succeeds and source guard exits `0`.
+
+- [ ] **Step 5: Commit Task 3**
+
+```bash
+git add tests/udma/demo/tilexr_udma_demo_kernel.cpp \
+        tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp
+git commit -m "feat: add epoch ordered data-as-flag p2p kernel"
+```
+
+---
+
+### Task 4: Wire Host Launch and Clear-Window Behavior
+
+**Files:**
+- Modify: `tests/udma/demo/tilexr_udma_demo.cpp`
+- Modify: `tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp`
+
+**Interfaces:**
+- Consumes:
+  - `P2PTransport::DataAsFlagEpochOrdered`
+  - `launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(...)`
+- Produces:
+  - Host dispatch from CLI transport `data_as_flag_epoch_ordered`.
+  - No per-iteration clear-window barrier for the new transport.
+
+- [ ] **Step 1: Extend source guard for host wiring**
+
+Add checks to `TestDataAsFlagEpochOrderedSource()`:
+
+```cpp
+    const std::string hostPath = "tests/udma/demo/tilexr_udma_demo.cpp";
+    const std::string hostText = ReadFile(hostPath);
+    CheckContains(hostPath, hostText, "launch_tilexr_data_as_flag_epoch_ordered_p2p_perf");
+    CheckContains(hostPath, hostText, "P2PTransport::DataAsFlagEpochOrdered");
+    CheckContains(hostPath, hostText, "magic, step");
+    CheckContains(hostPath, hostText, "useLegacyDataAsFlagTransport");
+```
+
+Run:
+
+```bash
+cd /home/h30059441/TileXR/tests/udma
+bash build.sh
+./install/bin/test_tilexr_udma_p2p_source_guard
+```
+
+Expected before host implementation: source guard failure.
+
+- [ ] **Step 2: Add launcher declaration**
+
+In `tests/udma/demo/tilexr_udma_demo.cpp`, add:
+
+```cpp
+extern void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step);
+```
+
+- [ ] **Step 3: Dispatch the new transport**
+
+In `LaunchP2PKernel`, add the new branch before the legacy `DataAsFlag` branch:
+
+```cpp
+    if (options.transport == TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered) {
+        launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(
+            options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev,
+            options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic, magic, step);
+        return;
+    }
+```
+
+- [ ] **Step 4: Split legacy clear behavior from new behavior**
+
+Replace:
+
+```cpp
+    const bool useDataAsFlagTransport = options.transport == TileXR::Demo::P2PTransport::DataAsFlag;
+```
+
+with:
+
+```cpp
+    const bool useLegacyDataAsFlagTransport = options.transport == TileXR::Demo::P2PTransport::DataAsFlag;
+```
+
+Then replace warmup/measured per-iteration data-as-flag clear branches to use `useLegacyDataAsFlagTransport` only:
+
+```cpp
+            if (useLegacyDataAsFlagTransport && IsP2PReceiveRank(rank, options) &&
+                !ClearLocalPeerWindow(rank, commArgsHost, dstOffset,
+                    TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes), "p2p data_as_flag warmup window")) {
+                ok = false;
+                break;
+            }
+            if (useLegacyDataAsFlagTransport &&
+                !DemoBarrierAll(rank, rankSize,
+                    "p2p data_as_flag warmup clear bytes=" + std::to_string(bytes) +
+                    " iter=" + std::to_string(i))) {
+                ok = false;
+                break;
+            }
+```
+
+And measured:
+
+```cpp
+        if (useLegacyDataAsFlagTransport && IsP2PReceiveRank(rank, options) &&
+            !ClearLocalPeerWindow(rank, commArgsHost, dstOffset,
+                TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes), "p2p data_as_flag measured window")) {
+            ok = false;
+            break;
+        }
+        if (useLegacyDataAsFlagTransport &&
+            !DemoBarrierAll(rank, rankSize,
+                "p2p data_as_flag measured clear bytes=" + std::to_string(bytes))) {
+            ok = false;
+            break;
+        }
+```
+
+Keep the existing once-per-size IPC destination clear:
+
+```cpp
+        if (useIpcTransport && IsP2PReceiveRank(rank, options)) {
+            const uint64_t clearBytes = TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes);
+            ...
+        }
+```
+
+This one-time clear remains valid for epoch-ordered startup hygiene.
+
+- [ ] **Step 5: Build and run unit/source tests**
+
+Run:
+
+```bash
+cd /home/h30059441/TileXR
+source scripts/common_env.sh
+cd tests/udma
+bash build.sh
+./install/bin/test_tilexr_udma_p2p_perf_config
+./install/bin/test_tilexr_udma_p2p_source_guard
+```
+
+Expected: both tests exit `0`.
+
+- [ ] **Step 6: Commit Task 4**
+
+```bash
+git add tests/udma/demo/tilexr_udma_demo.cpp \
+        tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp
+git commit -m "feat: wire epoch ordered data-as-flag p2p host path"
+```
+
+---
+
+### Task 5: Add Strict Mode Host Control
+
+**Files:**
+- Modify: `tests/udma/demo/tilexr_udma_demo_kernel.cpp`
+- Modify: `tests/udma/demo/tilexr_udma_demo.cpp`
+- Modify: `tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp`
+
+**Interfaces:**
+- Consumes: `DataAsFlagCheckAndRecvEpochOrdered(..., bool strict)`.
+- Produces:
+  - Environment variable `TILEXR_DATA_AS_FLAG_STRICT`.
+  - Launcher and kernel parameter `int32_t strict`.
+
+- [ ] **Step 1: Add source guard checks**
+
+Add checks:
+
+```cpp
+    CheckContains(hostPath, hostText, "TILEXR_DATA_AS_FLAG_STRICT");
+    CheckContains(kernelPath, kernelText, "int32_t strict");
+    CheckContains(kernelPath, kernelText, "strict != 0");
+```
+
+Run source guard and expect failure before implementation.
+
+- [ ] **Step 2: Extend launcher signatures**
+
+In `tests/udma/demo/tilexr_udma_demo.cpp`, change the extern declaration:
+
+```cpp
+extern void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step, int32_t strict);
+```
+
+In `tests/udma/demo/tilexr_udma_demo_kernel.cpp`, extend the launcher wrapper and kernel parameters with `int32_t strict`.
+
+- [ ] **Step 3: Pass strict from host**
+
+In `LaunchP2PKernel`, compute:
+
+```cpp
+    const int32_t dataAsFlagStrict = GetEnvFlag("TILEXR_DATA_AS_FLAG_STRICT", false) ? 1 : 0;
+```
+
+Pass it only to the epoch-ordered launcher:
+
+```cpp
+        launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(
+            options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev,
+            options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic,
+            magic, step, dataAsFlagStrict);
+```
+
+- [ ] **Step 4: Use strict in kernel**
+
+Replace:
+
+```cpp
+            bool strict = false;
+```
+
+with:
+
+```cpp
+            bool strictMode = strict != 0;
+```
+
+Pass `strictMode` to `DataAsFlagCheckAndRecvEpochOrdered`.
+
+- [ ] **Step 5: Build and run source guard**
+
+Run:
+
+```bash
+cd /home/h30059441/TileXR
+source scripts/common_env.sh
+cd tests/udma
+bash build.sh
+./install/bin/test_tilexr_udma_p2p_source_guard
+```
+
+Expected: source guard exits `0`.
+
+- [ ] **Step 6: Commit Task 5**
+
+```bash
+git add tests/udma/demo/tilexr_udma_demo.cpp \
+        tests/udma/demo/tilexr_udma_demo_kernel.cpp \
+        tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp
+git commit -m "feat: add strict flag checks for epoch ordered data-as-flag"
+```
+
+---
+
+### Task 6: Remote Smoke and Sweep Validation
+
+**Files:**
+- Read: `docs/ASCEND_REMOTE_BUILD_RUNBOOK.md`
+- May create local validation outputs under: `run/csv_full/`
+
+**Interfaces:**
+- Consumes: Built demo binary `tests/udma/install/bin/tilexr_udma_demo`.
+- Produces:
+  - Remote logs for focused smoke and sweep runs.
+  - Local CSV copies for old/new transport comparison.
+
+- [ ] **Step 1: Sync changed files to remote**
+
+Use Paramiko from local PowerShell. Keep the password in the environment only:
+
+```powershell
+@'
+import os
+import pathlib
+import paramiko
+
+host = "141.62.19.144"
+user = "root"
+password = os.environ["TILEXR_REMOTE_PASS"]
+local_root = pathlib.Path(r"D:\workspace\TileXR")
+remote_root = "/home/h30059441/TileXR"
+
+files = [
+    "src/include/tilexr_data_as_flag.h",
+    "tests/udma/demo/tilexr_udma_demo.cpp",
+    "tests/udma/demo/tilexr_udma_demo_kernel.cpp",
+    "tests/udma/demo/tilexr_udma_p2p_perf_config.h",
+    "tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp",
+    "tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp",
+]
+
+ssh = paramiko.SSHClient()
+ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ssh.connect(host, username=user, password=password, timeout=15,
+            banner_timeout=15, auth_timeout=15)
+sftp = ssh.open_sftp()
+for path in files:
+    sftp.put(str(local_root / path), remote_root + "/" + path)
+    print("uploaded", path)
+sftp.close()
+ssh.close()
+'@ | python -
+```
+
+- [ ] **Step 2: Build and run tests remotely**
+
+Run:
+
+```powershell
+@'
+import os
+import paramiko
+host = "141.62.19.144"
+user = "root"
+password = os.environ["TILEXR_REMOTE_PASS"]
+cmd = r'''
+cd /home/h30059441/TileXR
+pids=$(ps -eo pid=,comm= | awk '$2=="tilexr_udma_demo" {print $1}')
+[ -n "$pids" ] && kill -TERM $pids || true
+sleep 2
+pids=$(ps -eo pid=,comm= | awk '$2=="tilexr_udma_demo" {print $1}')
+[ -n "$pids" ] && kill -KILL $pids || true
+source scripts/common_env.sh
+cd tests/udma
+bash build.sh
+bash run_tests.sh
+'''
+ssh = paramiko.SSHClient()
+ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ssh.connect(host, username=user, password=password, timeout=15,
+            banner_timeout=15, auth_timeout=15)
+stdin, stdout, stderr = ssh.exec_command(cmd, get_pty=True, timeout=900)
+for line in iter(stdout.readline, ""):
+    print(line, end="")
+err = stderr.read().decode("utf-8", errors="replace")
+if err:
+    print(err, end="")
+rc = stdout.channel.recv_exit_status()
+ssh.close()
+raise SystemExit(rc)
+'@ | python -
+```
+
+Expected: build succeeds, `run_tests.sh` summary reports PASS for layout, registry, P2P perf config, source guard, and single-process integration.
+
+- [ ] **Step 3: Run focused fast-path smoke on NPU6/7**
+
+Run a 4KB to 1MB smoke:
+
+```bash
+cd /home/h30059441/TileXR
+source scripts/common_env.sh
+cd tests/udma
+export TILEXR_COMM_ID=127.0.0.1:10267
+export TILEXR_DEMO_NPUS=2
+export TILEXR_DEMO_FIRST_NPU=6
+export LD_LIBRARY_PATH="$PWD/install/lib:$PWD/install/lib64:/home/h30059441/TileXR/install/lib:/home/h30059441/TileXR/install/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}"
+log_dir="$PWD/logs/tilexr_daf_epoch_smoke_$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$log_dir"
+csv="$log_dir/p2p_perf.csv"
+bin="$PWD/install/bin/tilexr_udma_demo"
+for rank in 0 1; do
+  RANK=$rank RANK_SIZE=2 TILEXR_P2P_LOG_DIR="$log_dir" TILEXR_P2P_CSV="$csv" "$bin" \
+    2 "$rank" 4 0 2 6 \
+    0 1 4096 1048576 2 \
+    20 5 1 "$csv" "$log_dir" data_as_flag_epoch_ordered \
+    1 unidir >"$log_dir/rank_${rank}.log" 2>&1 &
+  pids[$rank]=$!
+done
+ret=0
+for pid in ${pids[0]} ${pids[1]}; do
+  wait "$pid" || ret=$?
+done
+echo "log_dir=$log_dir"
+cat "$csv"
+exit $ret
+```
+
+Expected: all CSV rows have `status=0` and `errors=0`.
+
+- [ ] **Step 4: Run strict smoke**
+
+Repeat Step 3 with:
+
+```bash
+export TILEXR_DATA_AS_FLAG_STRICT=1
+```
+
+Use a smaller range:
+
+```text
+8 65536 2
+```
+
+Expected: all CSV rows have `status=0` and `errors=0`.
+
+- [ ] **Step 5: Run full unidir sweep 8B to 64MB**
+
+Run with:
+
+```text
+min_bytes=8
+max_bytes=67108864
+step_factor=2
+iters=20
+warmup_iters=5
+transport=data_as_flag_epoch_ordered
+block_dim=1
+traffic=unidir
+```
+
+Expected: 24 data rows, all `status=0`, all `errors=0`.
+
+- [ ] **Step 6: Download CSVs**
+
+Download the new sweep:
+
+```powershell
+@'
+import os
+import pathlib
+import paramiko
+host = "141.62.19.144"
+user = "root"
+password = os.environ["TILEXR_REMOTE_PASS"]
+local_csv = pathlib.Path(r"D:\workspace\TileXR\run\csv_full\data_as_flag_epoch_ordered_unidir_bd1_8B_64MB.csv")
+ssh = paramiko.SSHClient()
+ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ssh.connect(host, username=user, password=password, timeout=15,
+            banner_timeout=15, auth_timeout=15)
+stdin, stdout, stderr = ssh.exec_command(
+    "ls -td /home/h30059441/TileXR/tests/udma/logs/tilexr_daf_epoch_* 2>/dev/null | head -1",
+    timeout=30)
+log_dir = stdout.read().decode("utf-8", errors="replace").strip()
+err = stderr.read().decode("utf-8", errors="replace").strip()
+if err:
+    raise RuntimeError(err)
+if not log_dir:
+    raise RuntimeError("no tilexr_daf_epoch log directory found")
+remote_csv = log_dir + "/p2p_perf.csv"
+sftp = ssh.open_sftp()
+sftp.get(remote_csv, str(local_csv))
+sftp.close()
+ssh.close()
+print("downloaded", local_csv)
+'@ | python -
+```
+
+- [ ] **Step 7: Confirm no stale demo processes**
+
+Run:
+
+```bash
+ps -eo pid=,comm=,args= | awk '$2=="tilexr_udma_demo" {print}'
+```
+
+Expected: no output.
+
+- [ ] **Step 8: Commit Task 6 if validation artifacts should be tracked**
+
+Only commit CSVs or plot changes if the user asks to keep them in the branch:
+
+```bash
+git add run/csv_full/data_as_flag_epoch_ordered_unidir_bd1_8B_64MB.csv
+git commit -m "test: add epoch ordered data-as-flag p2p sweep results"
+```
+
+If the user does not ask to track artifacts, do not commit this task.
+
+---
+
+### Task 7: Decide Migration to Public `data_as_flag`
+
+**Files:**
+- Modify later only after user approval:
+  - `tests/udma/demo/tilexr_udma_p2p_perf_config.h`
+  - `tests/udma/demo/tilexr_udma_demo.cpp`
+  - `tests/udma/demo/tilexr_udma_demo_kernel.cpp`
+  - `tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp`
+  - `tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp`
+  - plotting scripts under `run/csv_full/` if needed
+
+**Interfaces:**
+- Consumes: successful Task 6 validation evidence.
+- Produces: a follow-up plan or patch that maps `data_as_flag` to the epoch-ordered implementation.
+
+- [ ] **Step 1: Summarize validation**
+
+Prepare a short table:
+
+```text
+transport                       range        block_dim  strict  status/errors
+data_as_flag                    8B-64MB      1          no      ...
+data_as_flag_epoch_ordered      8B-64MB      1          no      ...
+data_as_flag_epoch_ordered      8B-64KB      1          yes     ...
+```
+
+- [ ] **Step 2: Ask for migration approval**
+
+Ask the user:
+
+```text
+Validation is clean. Should I now replace legacy data_as_flag with the epoch-ordered implementation and keep data_as_flag_epoch_ordered as a temporary alias?
+```
+
+- [ ] **Step 3: Stop if approval is not explicit**
+
+Do not remove legacy `data_as_flag` in this plan without explicit user approval after validation.
+
+---
+
+## Self-Review Checklist
+
+- Spec coverage:
+  - New transport config: Task 1.
+  - Per-launch epoch: Tasks 2, 3, 4.
+  - Embedded 512B block layout unchanged: Tasks 2 and 3.
+  - Separate ordered MTE3 payload/flag stages: Task 2.
+  - Fast receiver checks last flag: Task 2.
+  - Strict/debug path: Task 5 and Task 6 strict smoke.
+  - Side-by-side validation with old `data_as_flag`: Task 6 and Task 7.
+  - Later migration to public `data_as_flag`: Task 7.
+- Placeholder scan: no red-flag placeholder words or unnamed tests remain.
+- Type consistency:
+  - Enum name is consistently `DataAsFlagEpochOrdered`.
+  - Public transport string is consistently `data_as_flag_epoch_ordered`.
+  - Device epoch type is consistently `uint64_t`.
+  - Launcher carries `int32_t magic`, `int32_t step`, and later `int32_t strict`.
diff --git a/docs/superpowers/specs/2026-06-24-p2p-transport-trim-design.md b/docs/superpowers/specs/2026-06-24-p2p-transport-trim-design.md
new file mode 100644
index 0000000..18da592
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-24-p2p-transport-trim-design.md
@@ -0,0 +1,63 @@
+# P2P Transport Trim Design
+
+## Goal
+
+Reduce the UDMA P2P performance demo transport matrix to three user-facing modes:
+
+- `direct_urma`
+- `memory`
+- `data_as_flag`
+
+`direct_urma` will remain the only UDMA benchmark transport name, but it will use the current parallel multi-jetty implementation internally.
+
+## Behavior
+
+`direct_urma` represents direct registered-memory UDMA transfer. It supports both a single-jetty baseline and multi-jetty parallel transfer through the existing `block_dim` and `TILEXR_UDMA_QP_NUM` controls:
+
+- `block_dim=1` with one QP behaves like the previous single-QP direct URMA path.
+- `block_dim=N` with `TILEXR_UDMA_QP_NUM=N` uses up to `N` QPs/jettys in parallel.
+
+The removed user-facing variants are:
+
+- `direct_urma_multi_wqe`
+- `direct_urma_multi_jetty`
+- `direct_urma_multi_jetty_parallel`
+- `direct_urma_multi_jetty_parallel_fixed_wqe`
+
+The previous `direct_urma_multi_jetty_parallel` behavior is folded into `direct_urma`.
+
+## Code Changes
+
+Update the P2P transport enum, parser, formatter, validation messages, CSV output tests, runner scripts, and documentation so only the three supported names appear.
+
+Remove the obsolete benchmark kernels and host launch wrappers for:
+
+- single-WQE `direct_urma`
+- `multi_wqe`
+- serial `multi_jetty`
+- fixed-WQE parallel multi-jetty
+
+Keep the existing QP-indexed device helpers and UDMA transport layout support because the new `direct_urma` implementation depends on them.
+
+Remove fixed-WQE-only window sizing and mismatch-check logic from the P2P config helpers and host demo code.
+
+## Scripts And Docs
+
+The P2P runner should set `TILEXR_UDMA_QP_NUM` from `block_dim` when `transport=direct_urma`. The concurrency sweep defaults should cover:
+
+- `direct_urma`
+- `memory`
+- `data_as_flag`
+
+Documentation and PR wording should describe `direct_urma` as a scalable direct UDMA path rather than listing multiple UDMA variants.
+
+## Testing
+
+Use TDD for the implementation:
+
+1. First update host-only unit tests to expect the trimmed transport set and the new `direct_urma` CSV semantics.
+2. Run the unit target and confirm the tests fail before production changes.
+3. Apply the production changes.
+4. Re-run the focused UDMA unit tests and any available build/test script.
+
+Hardware P2P demo runs remain a manual verification item on supported Ascend hardware.
diff --git a/docs/superpowers/specs/2026-06-26-data-as-flag-epoch-ordered-design.md b/docs/superpowers/specs/2026-06-26-data-as-flag-epoch-ordered-design.md
new file mode 100644
index 0000000..69b6abf
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-26-data-as-flag-epoch-ordered-design.md
@@ -0,0 +1,355 @@
+# DataAsFlag Epoch-Ordered P2P Design
+
+Date: 2026-06-26
+
+## Background
+
+The current P2P `data_as_flag` transport uses a 512-byte block layout:
+
+```text
+[0, 480)     payload
+[480, 512)   ready flag area
+```
+
+The sender initializes the local UB scratch with ready flags and then writes the
+packed `payload + flag` block layout to the peer data-as-flag IPC data window.
+The receiver polls each batch by copying one flag value per block and using
+`Sum` to decide whether all blocks in the batch are ready.
+
+This has two important limitations in the P2P benchmark:
+
+- The host does not pass a per-iteration value into `data_as_flag`. Measured
+  iterations currently rely on clearing the receiver window. If the window is
+  cleared only once before a measured loop, later queued kernel launches can see
+  stale ready flags from earlier iterations.
+- A flag written as part of the same MTE3 transfer as the payload is not a
+  sufficient proof that the payload bytes from that same MTE3 transfer are
+  already visible. The correctness argument should depend on the order between
+  distinct MTE3 writes, not on byte-level visibility within a single MTE3 write.
+
+`memory_consume` already avoids stale synchronization by passing `magic` and
+`step` for each launch. `data_as_flag_epoch_ordered` brings the same
+per-iteration discipline to data-as-flag while keeping the flag embedded in the
+data window.
+
+## Goals
+
+- Add a new P2P transport named `data_as_flag_epoch_ordered`.
+- Keep the existing 512-byte data-as-flag block layout unchanged.
+- Keep ready information embedded in the data-as-flag window, not in an
+  independent sync region.
+- Use a per-launch epoch so receiver polling waits for the current iteration
+  and does not depend on clearing the window between launches.
+- Send payload and flag with ordered, separate MTE3 stages:
+  1. MTE3 payload write.
+  2. MTE3 flag write with the current epoch.
+- Reduce receiver overhead by making the default receive path poll only the last
+  flag in each batch as a batch commit marker.
+- Keep a strict/debug path that can validate all flags in a batch after the
+  commit flag is observed.
+- Validate the new transport side-by-side with the existing `data_as_flag`.
+
+## Non-Goals
+
+- Do not change the block layout or payload-per-block ratio.
+- Do not increase payload bytes per flag.
+- Do not move flags to `SyncCollectives` outer flags or any other independent
+  sync area.
+- Do not immediately remove the existing `data_as_flag` implementation. Removal
+  happens only after the new mode is validated.
+- Do not depend on same-MTE3 payload and flag visibility.
+
+## Transport Rollout
+
+The rollout has two phases.
+
+Phase 1 adds a new transport:
+
+```text
+data_as_flag_epoch_ordered
+```
+
+Suggested aliases:
+
+```text
+daf_epoch_ordered
+data-as-flag-epoch-ordered
+```
+
+The old `data_as_flag` remains available for A/B comparison.
+
+Phase 2 happens after remote validation shows correct data and stable
+performance. In that phase, remove or hide the legacy implementation and make
+the epoch-ordered implementation use the public `data_as_flag` transport name.
+
+## Epoch Format
+
+The P2P host already creates one `magic` per phase and size, then passes
+`step = iteration + 1` for each warmup or measured launch. The new transport
+uses the same inputs.
+
+Device-side data-as-flag epoch:
+
+```cpp
+uint64_t epoch = (static_cast<uint64_t>(static_cast<uint32_t>(magic)) << 32) |
+                 static_cast<uint32_t>(step);
+```
+
+The full 64-bit epoch is written into the 32-byte flag area. The default
+implementation should duplicate it four times as `uint64_t` values:
+
+```text
+flag area: epoch, epoch, epoch, epoch
+```
+
+The receiver fast path only needs to compare one `uint64_t`, preferably the
+first word in the flag area. The strict path can check all four words and all
+blocks in the batch.
+
+## Sender Protocol
+
+For each batch:
+
+```text
+MTE3 #1: payload blocks -> peer data-as-flag payload areas
+MTE3 #2: flag blocks    -> peer data-as-flag flag areas, value = epoch
+```
+
+The sender must not prove payload arrival using a flag from the same MTE3 write
+as the payload. The flag is a commit marker only because the flag MTE3 is issued
+after the payload MTE3.
+
+Suggested device helper:
+
+```cpp
+__aicore__ inline uint32_t DataAsFlagSendEpochOrdered(
+    __gm__ uint8_t* dstDataAsFlagGM,
+    const __gm__ uint8_t* srcGM,
+    uint64_t dataBytes,
+    uint64_t epoch,
+    AscendC::LocalTensor<uint8_t>& scratch);
+```
+
+Implementation shape:
+
+1. Compute `totalBlocks = ceil(dataBytes / 480)`.
+2. Split by UB capacity.
+3. Copy continuous payload from `srcGM` into a payload scratch area.
+4. MTE3-copy payload into block payload regions in `dstDataAsFlagGM`, using a
+   destination stride that skips each 32-byte flag area.
+5. Prepare a flag scratch area filled with the current epoch.
+6. MTE3-copy flags into the block flag regions.
+7. Advance to the next batch.
+
+If a direct strided `DataCopyPad` from continuous payload scratch to
+`480B payload + 32B gap` GM layout is not accepted by AscendC or performs badly,
+the conservative first implementation may keep the existing payload packing
+shape but must still issue the flag write as a separate MTE3 stage after the
+payload write.
+
+## Receiver Protocol
+
+For each batch:
+
+```text
+poll last block flag == epoch
+copy batch payload -> dstGM
+```
+
+Suggested device helper:
+
+```cpp
+__aicore__ inline bool DataAsFlagCheckAndRecvEpochOrdered(
+    const __gm__ uint8_t* dataAsFlagGM,
+    uint64_t dataBytes,
+    __gm__ uint8_t* recvGM,
+    uint64_t epoch,
+    AscendC::LocalTensor<uint8_t>& scratch,
+    bool strict);
+```
+
+Implementation shape:
+
+1. Compute `totalBlocks = ceil(dataBytes / 480)`.
+2. Split by receive UB capacity.
+3. For each batch, compute `lastBlock = processedBlocks + batchBlocks - 1`.
+4. Poll the selected word in `lastBlock`'s flag area until it equals `epoch`.
+5. If strict mode is enabled, validate the batch flags after the last flag is
+   observed.
+6. Copy the batch payload from data-as-flag layout into continuous `recvGM`.
+7. Advance to the next batch.
+
+The fast path removes the repeated copy of all flags and the vector `Sum` used
+by the legacy receiver. The strict path keeps a validation option for testing
+the MTE3 order assumption and diagnosing data errors.
+
+## Strict Mode
+
+Strict mode is not the default performance path.
+
+It should be enabled by a host option or environment flag, for example:
+
+```text
+TILEXR_DATA_AS_FLAG_STRICT=1
+```
+
+After the last block flag equals the current epoch, strict mode checks the whole
+batch. A simple first implementation may copy one `uint64_t` flag per block and
+compare each value to `epoch`. It does not need to use `Sum`, because the epoch
+is a 64-bit token rather than `float(1.0f)`.
+
+Strict mode failure should set a nonzero debug status so the host CSV row reports
+the issue through the existing P2P status path.
+
+## Host Changes
+
+The new launcher takes the same launch metadata style as `memory_consume`:
+
+```cpp
+void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(
+    uint32_t blockDim,
+    void* stream,
+    GM_ADDR commArgs,
+    GM_ADDR src,
+    GM_ADDR dst,
+    GM_ADDR debug,
+    int32_t srcRank,
+    int32_t dstRank,
+    uint64_t dstByteOffset,
+    uint32_t bytes,
+    uint32_t pattern,
+    int32_t traffic,
+    int32_t magic,
+    int32_t step);
+```
+
+`LaunchP2PKernel` passes `magic` and `step` to the new transport. Warmup and
+measured loops use the existing values:
+
+```text
+warmup:  magic = warmupMagic,  step = i + 1
+measure: magic = measuredMagic, step = i + 1
+```
+
+The new transport does not need per-iteration clear-window calls or the extra
+clear barriers currently used for legacy `data_as_flag`. A single initialization
+clear per size is still acceptable, mainly to reduce the chance that old data
+accidentally equals an epoch during early bring-up. The epoch value should be
+large enough that accidental matches are practically impossible.
+
+The transport helper predicates should treat `data_as_flag_epoch_ordered` like
+`data_as_flag`:
+
+- Uses IPC peer window.
+- Both ranks are active in unidirectional traffic.
+- Window size is still `ceil(payload / 480) * 512`.
+
+## Kernel Integration
+
+Add a new P2P kernel beside the legacy one:
+
+```cpp
+extern "C" __global__ __aicore__ void tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel(...);
+```
+
+The role resolution can reuse `TileXRUdmaDemoResolveDataAsFlagRole`.
+
+The slicing can reuse `TileXRUdmaDemoDataAsFlagSlice` because the layout and
+block ownership do not change.
+
+Sender path:
+
+```text
+DataAsFlagSendEpochOrdered(peerBase + dstByteOffset + dataAsFlagOffset,
+                           src + payloadOffset,
+                           sliceBytes,
+                           epoch,
+                           scratch)
+```
+
+Receiver path:
+
+```text
+DataAsFlagCheckAndRecvEpochOrdered(localBase + dstByteOffset + dataAsFlagOffset,
+                                   sliceBytes,
+                                   dst + payloadOffset,
+                                   epoch,
+                                   scratch,
+                                   strict)
+```
+
+Debug words should record enough information to distinguish the new mode from
+legacy `data_as_flag`, including block count, status, and at least the low 32
+bits of the epoch.
+
+## Correctness Argument
+
+Stale flags are avoided because every launch writes and waits for a unique epoch.
+The receiver no longer treats a generic ready value from an older launch as
+current data.
+
+Payload visibility depends on MTE3 ordering across distinct MTE3 operations:
+payload MTE3 is issued before flag MTE3. When the receiver sees the batch commit
+flag from the flag MTE3, it can treat the earlier payload MTE3 for the same
+batch as visible.
+
+The protocol does not assume that all bytes from a single MTE3 write become
+visible at the same time. In particular, it does not use a flag written by the
+same MTE3 operation as proof for that MTE3 operation's payload.
+
+The block semantic remains data-as-flag because the flag is still embedded in
+the 512-byte data window block.
+
+## Tests
+
+Unit tests:
+
+- `P2PTransportName` returns `data_as_flag_epoch_ordered`.
+- Parser accepts the new name and aliases.
+- Window size matches legacy `data_as_flag`.
+- `P2PTransportUsesIpc` and `P2PTransportBothRanksActive` include the new mode.
+- CSV formatting preserves the transport name.
+
+Source guard tests:
+
+- New launcher declaration exists.
+- New kernel exists.
+- New device helpers exist in `tilexr_data_as_flag.h`.
+- New helper signatures include `epoch`.
+- Host passes `magic` and `step` to the new launcher.
+- Legacy per-iteration clear-window branches do not apply to the new transport.
+
+Remote validation:
+
+- Build `tests/udma` on the Ascend environment.
+- Run unit tests.
+- Run focused P2P smoke tests for `data_as_flag_epoch_ordered` on 6/7 or other
+  healthy cards.
+- Run 8B to 64MB unidirectional sweep with `block_dim=1`.
+- Compare old `data_as_flag`, new `data_as_flag_epoch_ordered`, `memory`, and
+  `memory_consume` CSVs.
+- Verify all rows have `status=0` and `errors=0`.
+- Run strict mode on a smaller sweep to validate all batch flags.
+
+## Completion Criteria
+
+- `data_as_flag_epoch_ordered` is selectable from the P2P demo CLI.
+- The new mode uses per-launch epoch values and does not require per-iteration
+  clear-window barriers.
+- Sender emits payload and flag in separate ordered MTE3 stages.
+- Receiver fast path waits on one batch commit flag and then copies payload.
+- Strict mode can validate all flags for debug runs.
+- Remote build and tests pass.
+- Remote P2P smoke and sweep runs complete with `status=0` and `errors=0`.
+- CSV output allows direct A/B comparison with legacy `data_as_flag`.
+
+## Later Migration
+
+After validation, replace the public `data_as_flag` behavior with the
+epoch-ordered implementation:
+
+1. Remove or hide the legacy transport from sweeps.
+2. Make `data_as_flag` parse to the epoch-ordered implementation.
+3. Keep `data_as_flag_epoch_ordered` as a temporary alias if useful.
+4. Update docs and plots so the public name is again `data_as_flag`.
+5. Remove legacy clear-window workarounds from the data-as-flag host path.
diff --git a/src/comm/udma/tilexr_udma_layout.cpp b/src/comm/udma/tilexr_udma_layout.cpp
index 401b5f6..0a6d6d0 100644
--- a/src/comm/udma/tilexr_udma_layout.cpp
+++ b/src/comm/udma/tilexr_udma_layout.cpp
@@ -8,10 +8,6 @@
 #include <cstring>
 
 namespace TileXR {
-namespace {
-
-constexpr uint32_t TILEXR_UDMA_QP_NUM = 1;
-
 template <typename T>
 void CopyVector(std::vector<uint8_t>& dst, size_t offset, const std::vector<T>& src)
 {
@@ -20,10 +16,9 @@ void CopyVector(std::vector<uint8_t>& dst, size_t offset, const std::vector<T>&
     }
 }
 
-} // namespace
-
 int BuildUDMAInfoImage(
     uintptr_t deviceBase,
+    uint32_t qpNum,
     const std::vector<UDMAWQCtx>& sq,
     const std::vector<UDMAWQCtx>& rq,
     const std::vector<UDMACQCtx>& scq,
@@ -32,9 +27,9 @@ int BuildUDMAInfoImage(
     UDMAInfo& info,
     std::vector<uint8_t>& bytes)
 {
-    const size_t rankCount = sq.size();
-    if (rankCount == 0 || rq.size() != rankCount || scq.size() != rankCount ||
-        rcq.size() != rankCount || mem.size() != rankCount) {
+    if (qpNum == 0 || sq.empty() || sq.size() % qpNum != 0 ||
+        rq.size() != sq.size() || scq.size() != sq.size() ||
+        rcq.size() != sq.size() || mem.size() != sq.size()) {
         return TILEXR_UDMA_LAYOUT_INVALID;
     }
 
@@ -46,7 +41,7 @@ int BuildUDMAInfoImage(
     const size_t totalBytes = memOffset + mem.size() * sizeof(UDMAMemInfo);
 
     info = {};
-    info.qpNum = TILEXR_UDMA_QP_NUM;
+    info.qpNum = qpNum;
     info.sqPtr = deviceBase + sqOffset;
     info.rqPtr = deviceBase + rqOffset;
     info.scqPtr = deviceBase + scqOffset;
diff --git a/src/comm/udma/tilexr_udma_layout.h b/src/comm/udma/tilexr_udma_layout.h
index fefd49e..21cc138 100644
--- a/src/comm/udma/tilexr_udma_layout.h
+++ b/src/comm/udma/tilexr_udma_layout.h
@@ -18,6 +18,7 @@ constexpr int TILEXR_UDMA_LAYOUT_INVALID = -3;
 
 int BuildUDMAInfoImage(
     uintptr_t deviceBase,
+    uint32_t qpNum,
     const std::vector<UDMAWQCtx>& sq,
     const std::vector<UDMAWQCtx>& rq,
     const std::vector<UDMACQCtx>& scq,
diff --git a/src/comm/udma/tilexr_udma_transport.cpp b/src/comm/udma/tilexr_udma_transport.cpp
index 7bafa5f..0429a19 100644
--- a/src/comm/udma/tilexr_udma_transport.cpp
+++ b/src/comm/udma/tilexr_udma_transport.cpp
@@ -33,6 +33,20 @@ uint32_t Log2Uint64(uint64_t value)
     return result;
 }
 
+uint32_t GetEnvUint(const char* name, uint32_t defaultValue, uint32_t minValue, uint32_t maxValue)
+{
+    const char* value = std::getenv(name);
+    if (value == nullptr || value[0] == '\0') {
+        return defaultValue;
+    }
+    char* end = nullptr;
+    unsigned long parsed = std::strtoul(value, &end, 10);
+    if (end == value || *end != '\0' || parsed < minValue || parsed > maxValue) {
+        return defaultValue;
+    }
+    return static_cast<uint32_t>(parsed);
+}
+
 HccpEid SwapEidForDevice(const HccpEid& hccpEid)
 {
     HccpEid swapped {};
@@ -293,20 +307,20 @@ struct TileXRUDMATransport::PerEidState {
     void* ctxHandle = nullptr;
     void* tokenHandle = nullptr;
     void* chanHandle = nullptr;
-    void* cqHandle = nullptr;
-    void* qpHandle = nullptr;
-    CqInfoT cqInfo {};
-    QpCreateInfo qpInfo {};
-    std::vector<void*> remoteQpHandles;
-    std::vector<uint32_t> tpnList;
-    void* cqPiAddr = nullptr;
-    void* cqCiAddr = nullptr;
-    void* sqPiAddr = nullptr;
-    void* sqCiAddr = nullptr;
-    void* wqeCntAddr = nullptr;
-    void* amoAddr = nullptr;
-    UDMAWQCtx localWq {};
-    UDMACQCtx localCq {};
+    std::vector<void*> cqHandles;
+    std::vector<CqInfoT> cqInfos;
+    std::vector<void*> qpHandles;
+    std::vector<QpCreateInfo> qpInfos;
+    std::vector<std::vector<void*>> remoteQpHandlesByQp;
+    std::vector<std::vector<uint32_t>> tpnListByQp;
+    std::vector<void*> cqPiAddrs;
+    std::vector<void*> cqCiAddrs;
+    std::vector<void*> sqPiAddrs;
+    std::vector<void*> sqCiAddrs;
+    std::vector<void*> wqeCntAddrs;
+    std::vector<void*> amoAddrs;
+    std::vector<UDMAWQCtx> localWqs;
+    std::vector<UDMACQCtx> localCqs;
 };
 
 TileXRUDMATransport::TileXRUDMATransport() = default;
@@ -328,6 +342,7 @@ int TileXRUDMATransport::Init(const TileXRUDMATransportOptions& options)
         return TILEXR_ERROR_PARA_CHECK_FAIL;
     }
     options_ = options;
+    qpNum_ = GetEnvUint("TILEXR_UDMA_QP_NUM", 1, 1, 64);
 
     int ret = loader_.Load();
     if (ret != TILEXR_HCCP_LOADER_SUCCESS) {
@@ -586,8 +601,20 @@ int TileXRUDMATransport::CreateQueues()
         state.eidIndex = ctxEntry.first;
         state.ctxHandle = ctxEntry.second;
         state.tokenHandle = tokenHandleByEid_[ctxEntry.first];
-        state.remoteQpHandles.assign(options_.rankSize, nullptr);
-        state.tpnList.assign(options_.rankSize, 0);
+        state.qpHandles.assign(qpNum_, nullptr);
+        state.qpInfos.resize(qpNum_);
+        state.remoteQpHandlesByQp.assign(qpNum_, std::vector<void*>(options_.rankSize, nullptr));
+        state.tpnListByQp.assign(qpNum_, std::vector<uint32_t>(options_.rankSize, 0));
+        state.cqHandles.assign(qpNum_, nullptr);
+        state.cqInfos.resize(qpNum_);
+        state.cqPiAddrs.assign(qpNum_, nullptr);
+        state.cqCiAddrs.assign(qpNum_, nullptr);
+        state.sqPiAddrs.assign(qpNum_, nullptr);
+        state.sqCiAddrs.assign(qpNum_, nullptr);
+        state.wqeCntAddrs.assign(qpNum_, nullptr);
+        state.amoAddrs.assign(qpNum_, nullptr);
+        state.localWqs.resize(qpNum_);
+        state.localCqs.resize(qpNum_);
 
         ChanInfoT chanInfo {};
         chanInfo.in.dataPlaneFlag.bs.poolCqCstm = 1;
@@ -596,62 +623,66 @@ int TileXRUDMATransport::CreateQueues()
             return TILEXR_ERROR_INTERNAL;
         }
 
-        state.cqInfo.in.chanHandle = state.chanHandle;
-        state.cqInfo.in.depth = TILEXR_UDMA_CQ_DEPTH;
-        state.cqInfo.in.ub.mode = JFC_MODE_USER_CTL_NORMAL;
-        ret = loader_.RaCtxCqCreate(state.ctxHandle, &state.cqInfo, &state.cqHandle);
-        if (ret != 0) {
-            return TILEXR_ERROR_INTERNAL;
-        }
-        state.localCq.cqn = 0;
-        state.localCq.bufAddr = state.cqInfo.out.bufAddr;
-        state.localCq.baseBkShift = Log2Uint64(state.cqInfo.out.cqeSize);
-        state.localCq.depth = state.cqInfo.in.depth;
-        if (AllocDeviceScalar(&state.cqPiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS ||
-            AllocDeviceScalar(&state.cqCiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS) {
-            return TILEXR_ERROR_INTERNAL;
-        }
-        state.localCq.headAddr = reinterpret_cast<uintptr_t>(state.cqPiAddr);
-        state.localCq.tailAddr = reinterpret_cast<uintptr_t>(state.cqCiAddr);
-        state.localCq.dbMode = UDMADBMode::SW_DB;
-        state.localCq.dbAddr = state.cqInfo.out.swdbAddr;
-
-        QpCreateAttr qpAttr {};
-        qpAttr.scqHandle = state.cqHandle;
-        qpAttr.rcqHandle = state.cqHandle;
-        qpAttr.srqHandle = state.cqHandle;
-        qpAttr.sqDepth = TILEXR_UDMA_SQ_DEPTH;
-        qpAttr.rqDepth = TILEXR_UDMA_RQ_DEPTH_DEFAULT;
-        qpAttr.transportMode = CONN_RM;
-        qpAttr.ub.mode = JETTY_MODE_USER_CTL_NORMAL;
-        qpAttr.ub.flag.value = 1;
-        qpAttr.ub.jfsFlag.value = 2;
-        qpAttr.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE;
-        qpAttr.ub.rnrRetry = 7;
-        qpAttr.ub.extMode.piType = 0;
-        qpAttr.ub.extMode.cstmFlag.bs.sqCstm = 0;
-        qpAttr.ub.extMode.sqebbNum = TILEXR_UDMA_SQ_DEPTH;
-        qpAttr.ub.tokenIdHandle = state.tokenHandle;
-        ret = loader_.RaCtxQpCreate(state.ctxHandle, &qpAttr, &state.qpInfo, &state.qpHandle);
-        if (ret != 0) {
-            return TILEXR_ERROR_INTERNAL;
-        }
-        state.localWq.wqn = 0;
-        state.localWq.bufAddr = state.qpInfo.ub.sqBuffVa;
-        state.localWq.baseBkShift = Log2Uint64(state.qpInfo.ub.wqebbSize);
-        state.localWq.depth = TILEXR_UDMA_SQ_BB_COUNT;
-        if (AllocDeviceScalar(&state.sqPiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS ||
-            AllocDeviceScalar(&state.sqCiAddr, sizeof(uint32_t)) != TILEXR_SUCCESS ||
-            AllocDeviceScalar(&state.wqeCntAddr, sizeof(uint32_t)) != TILEXR_SUCCESS ||
-            AllocDeviceScalar(&state.amoAddr, sizeof(uint64_t)) != TILEXR_SUCCESS) {
-            return TILEXR_ERROR_INTERNAL;
+        for (uint32_t qpIdx = 0; qpIdx < qpNum_; ++qpIdx) {
+            state.cqInfos[qpIdx].in.chanHandle = state.chanHandle;
+            state.cqInfos[qpIdx].in.depth = TILEXR_UDMA_CQ_DEPTH;
+            state.cqInfos[qpIdx].in.ub.mode = JFC_MODE_USER_CTL_NORMAL;
+            ret = loader_.RaCtxCqCreate(state.ctxHandle, &state.cqInfos[qpIdx], &state.cqHandles[qpIdx]);
+            if (ret != 0) {
+                return TILEXR_ERROR_INTERNAL;
+            }
+            auto& localCq = state.localCqs[qpIdx];
+            localCq.cqn = qpIdx;
+            localCq.bufAddr = state.cqInfos[qpIdx].out.bufAddr;
+            localCq.baseBkShift = Log2Uint64(state.cqInfos[qpIdx].out.cqeSize);
+            localCq.depth = state.cqInfos[qpIdx].in.depth;
+            if (AllocDeviceScalar(&state.cqPiAddrs[qpIdx], sizeof(uint32_t)) != TILEXR_SUCCESS ||
+                AllocDeviceScalar(&state.cqCiAddrs[qpIdx], sizeof(uint32_t)) != TILEXR_SUCCESS) {
+                return TILEXR_ERROR_INTERNAL;
+            }
+            localCq.headAddr = reinterpret_cast<uintptr_t>(state.cqPiAddrs[qpIdx]);
+            localCq.tailAddr = reinterpret_cast<uintptr_t>(state.cqCiAddrs[qpIdx]);
+            localCq.dbMode = UDMADBMode::SW_DB;
+            localCq.dbAddr = state.cqInfos[qpIdx].out.swdbAddr;
+
+            QpCreateAttr qpAttr {};
+            qpAttr.scqHandle = state.cqHandles[qpIdx];
+            qpAttr.rcqHandle = state.cqHandles[qpIdx];
+            qpAttr.srqHandle = state.cqHandles[qpIdx];
+            qpAttr.sqDepth = TILEXR_UDMA_SQ_DEPTH;
+            qpAttr.rqDepth = TILEXR_UDMA_RQ_DEPTH_DEFAULT;
+            qpAttr.transportMode = CONN_RM;
+            qpAttr.ub.mode = JETTY_MODE_USER_CTL_NORMAL;
+            qpAttr.ub.flag.value = 1;
+            qpAttr.ub.jfsFlag.value = 2;
+            qpAttr.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE;
+            qpAttr.ub.rnrRetry = 7;
+            qpAttr.ub.extMode.piType = 0;
+            qpAttr.ub.extMode.cstmFlag.bs.sqCstm = 0;
+            qpAttr.ub.extMode.sqebbNum = TILEXR_UDMA_SQ_DEPTH;
+            qpAttr.ub.tokenIdHandle = state.tokenHandle;
+            ret = loader_.RaCtxQpCreate(state.ctxHandle, &qpAttr, &state.qpInfos[qpIdx], &state.qpHandles[qpIdx]);
+            if (ret != 0) {
+                return TILEXR_ERROR_INTERNAL;
+            }
+            auto& localWq = state.localWqs[qpIdx];
+            localWq.wqn = qpIdx;
+            localWq.bufAddr = state.qpInfos[qpIdx].ub.sqBuffVa;
+            localWq.baseBkShift = Log2Uint64(state.qpInfos[qpIdx].ub.wqebbSize);
+            localWq.depth = TILEXR_UDMA_SQ_BB_COUNT;
+            if (AllocDeviceScalar(&state.sqPiAddrs[qpIdx], sizeof(uint32_t)) != TILEXR_SUCCESS ||
+                AllocDeviceScalar(&state.sqCiAddrs[qpIdx], sizeof(uint32_t)) != TILEXR_SUCCESS ||
+                AllocDeviceScalar(&state.wqeCntAddrs[qpIdx], sizeof(uint32_t)) != TILEXR_SUCCESS ||
+                AllocDeviceScalar(&state.amoAddrs[qpIdx], sizeof(uint64_t)) != TILEXR_SUCCESS) {
+                return TILEXR_ERROR_INTERNAL;
+            }
+            localWq.headAddr = reinterpret_cast<uintptr_t>(state.sqPiAddrs[qpIdx]);
+            localWq.tailAddr = reinterpret_cast<uintptr_t>(state.sqCiAddrs[qpIdx]);
+            localWq.dbMode = UDMADBMode::SW_DB;
+            localWq.dbAddr = state.qpInfos[qpIdx].ub.dbAddr;
+            localWq.wqeCntAddr = reinterpret_cast<uintptr_t>(state.wqeCntAddrs[qpIdx]);
+            localWq.amoAddr = reinterpret_cast<uintptr_t>(state.amoAddrs[qpIdx]);
         }
-        state.localWq.headAddr = reinterpret_cast<uintptr_t>(state.sqPiAddr);
-        state.localWq.tailAddr = reinterpret_cast<uintptr_t>(state.sqCiAddr);
-        state.localWq.dbMode = UDMADBMode::SW_DB;
-        state.localWq.dbAddr = state.qpInfo.ub.dbAddr;
-        state.localWq.wqeCntAddr = reinterpret_cast<uintptr_t>(state.wqeCntAddr);
-        state.localWq.amoAddr = reinterpret_cast<uintptr_t>(state.amoAddr);
         states_[state.eidIndex] = state;
     }
     return states_.empty() ? TILEXR_ERROR_INTERNAL : TILEXR_SUCCESS;
@@ -659,28 +690,32 @@ int TileXRUDMATransport::CreateQueues()
 
 int TileXRUDMATransport::ImportQueues()
 {
-    std::vector<QpImportInfoT> localImports(eidCount_);
-    std::vector<QpKeyT> localKeys(eidCount_);
+    const size_t routeCount = static_cast<size_t>(eidCount_) * qpNum_;
+    std::vector<QpImportInfoT> localImports(routeCount);
+    std::vector<QpKeyT> localKeys(routeCount);
     for (const auto& stateEntry : states_) {
         const auto& state = stateEntry.second;
         if (state.eidIndex >= eidCount_) {
             return TILEXR_ERROR_INTERNAL;
         }
-        localImports[state.eidIndex].in.ub.mode = JETTY_IMPORT_MODE_NORMAL;
-        localImports[state.eidIndex].in.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE;
-        localImports[state.eidIndex].in.ub.policy = JETTY_GRP_POLICY_RR;
-        localImports[state.eidIndex].in.ub.type = TARGET_TYPE_JETTY;
-        localImports[state.eidIndex].in.ub.flag.bs.tokenPolicy = TOKEN_POLICY_PLAIN_TEXT;
-        localImports[state.eidIndex].in.ub.tpType = 1;
-        localKeys[state.eidIndex] = state.qpInfo.key;
+        for (uint32_t qpIdx = 0; qpIdx < qpNum_; ++qpIdx) {
+            const size_t localIndex = static_cast<size_t>(state.eidIndex) * qpNum_ + qpIdx;
+            localImports[localIndex].in.ub.mode = JETTY_IMPORT_MODE_NORMAL;
+            localImports[localIndex].in.ub.tokenValue = TILEXR_UDMA_TOKEN_VALUE;
+            localImports[localIndex].in.ub.policy = JETTY_GRP_POLICY_RR;
+            localImports[localIndex].in.ub.type = TARGET_TYPE_JETTY;
+            localImports[localIndex].in.ub.flag.bs.tokenPolicy = TOKEN_POLICY_PLAIN_TEXT;
+            localImports[localIndex].in.ub.tpType = 1;
+            localKeys[localIndex] = state.qpInfos[qpIdx].key;
+        }
     }
 
-    std::vector<QpImportInfoT> allImports(options_.rankSize * eidCount_);
+    std::vector<QpImportInfoT> allImports(options_.rankSize * routeCount);
     int ret = options_.exchange->AllGather(localImports.data(), localImports.size(), allImports.data());
     if (ret != TILEXR_SUCCESS) {
         return ret;
     }
-    std::vector<QpKeyT> allKeys(options_.rankSize * eidCount_);
+    std::vector<QpKeyT> allKeys(options_.rankSize * routeCount);
     ret = options_.exchange->AllGather(localKeys.data(), localKeys.size(), allKeys.data());
     if (ret != TILEXR_SUCCESS) {
         return ret;
@@ -700,13 +735,16 @@ int TileXRUDMATransport::ImportQueues()
             if (remoteEid >= eidCount_) {
                 return TILEXR_ERROR_INTERNAL;
             }
-            QpImportInfoT importInfo = allImports[peer * eidCount_ + remoteEid];
-            importInfo.in.key = allKeys[peer * eidCount_ + remoteEid];
-            ret = loader_.RaCtxQpImport(state.ctxHandle, &importInfo, &state.remoteQpHandles[peer]);
-            if (ret != 0) {
-                return TILEXR_ERROR_INTERNAL;
+            for (uint32_t qpIdx = 0; qpIdx < qpNum_; ++qpIdx) {
+                const size_t remoteIndex = (static_cast<size_t>(peer) * eidCount_ + remoteEid) * qpNum_ + qpIdx;
+                QpImportInfoT importInfo = allImports[remoteIndex];
+                importInfo.in.key = allKeys[remoteIndex];
+                ret = loader_.RaCtxQpImport(state.ctxHandle, &importInfo, &state.remoteQpHandlesByQp[qpIdx][peer]);
+                if (ret != 0) {
+                    return TILEXR_ERROR_INTERNAL;
+                }
+                state.tpnListByQp[qpIdx][peer] = importInfo.out.ub.tpn;
             }
-            state.tpnList[peer] = importInfo.out.ub.tpn;
         }
     }
     return TILEXR_SUCCESS;
@@ -768,11 +806,12 @@ int TileXRUDMATransport::RefreshUDMAInfo()
         return TILEXR_ERROR_INTERNAL;
     }
 
-    std::vector<UDMAWQCtx> sq(options_.rankSize);
-    std::vector<UDMAWQCtx> rq(options_.rankSize);
-    std::vector<UDMACQCtx> scq(options_.rankSize);
-    std::vector<UDMACQCtx> rcq(options_.rankSize);
-    std::vector<UDMAMemInfo> mem(options_.rankSize);
+    const size_t queueEntries = static_cast<size_t>(options_.rankSize) * qpNum_;
+    std::vector<UDMAWQCtx> sq(queueEntries);
+    std::vector<UDMAWQCtx> rq(queueEntries);
+    std::vector<UDMACQCtx> scq(queueEntries);
+    std::vector<UDMACQCtx> rcq(queueEntries);
+    std::vector<UDMAMemInfo> mem(queueEntries);
 
     for (int rank = 0; rank < options_.rankSize; ++rank) {
         uint32_t localEid = fallbackEid;
@@ -786,26 +825,29 @@ int TileXRUDMATransport::RefreshUDMAInfo()
             stateIt = fallbackIt;
         }
         const auto& state = stateIt->second;
-        sq[rank] = state.localWq;
-        rq[rank] = state.localWq;
-        scq[rank] = state.localCq;
-        rcq[rank] = state.localCq;
-        if (rank == options_.rank) {
-            const auto localMemIt = localMemInfoByEid_.find(localEid);
-            if (localMemIt != localMemInfoByEid_.end()) {
-                mem[rank] = localMemIt->second;
+        for (uint32_t qpIdx = 0; qpIdx < qpNum_; ++qpIdx) {
+            const size_t entryIndex = static_cast<size_t>(rank) * qpNum_ + qpIdx;
+            sq[entryIndex] = state.localWqs[qpIdx];
+            rq[entryIndex] = state.localWqs[qpIdx];
+            scq[entryIndex] = state.localCqs[qpIdx];
+            rcq[entryIndex] = state.localCqs[qpIdx];
+            if (rank == options_.rank) {
+                const auto localMemIt = localMemInfoByEid_.find(localEid);
+                if (localMemIt != localMemInfoByEid_.end()) {
+                    mem[entryIndex] = localMemIt->second;
+                }
+            } else {
+                mem[entryIndex] = allMem[rank * eidCount_ + remoteEid];
+                mem[entryIndex].tpn = state.tpnListByQp[qpIdx][rank];
             }
-        } else {
-            mem[rank] = allMem[rank * eidCount_ + remoteEid];
-            mem[rank].tpn = state.tpnList[rank];
+            mem[entryIndex].eidAddr = reinterpret_cast<uint64_t>(
+                eidTableDev_ + (rank * eidCount_ + remoteEid) * sizeof(HccpEid));
         }
-        mem[rank].eidAddr = reinterpret_cast<uint64_t>(
-            eidTableDev_ + (rank * eidCount_ + remoteEid) * sizeof(HccpEid));
     }
 
     if (udmaInfoDev_ == nullptr) {
         const size_t oneRankSize = 2 * sizeof(UDMAWQCtx) + 2 * sizeof(UDMACQCtx) + sizeof(UDMAMemInfo);
-        udmaInfoSize_ = static_cast<uint32_t>(sizeof(UDMAInfo) + oneRankSize * options_.rankSize);
+        udmaInfoSize_ = static_cast<uint32_t>(sizeof(UDMAInfo) + oneRankSize * options_.rankSize * qpNum_);
         ret = aclrtMalloc(reinterpret_cast<void**>(&udmaInfoDev_), udmaInfoSize_, ACL_MEM_MALLOC_HUGE_FIRST);
         if (ret != ACL_SUCCESS) {
             return TILEXR_ERROR_INTERNAL;
@@ -814,7 +856,7 @@ int TileXRUDMATransport::RefreshUDMAInfo()
 
     UDMAInfo info {};
     std::vector<uint8_t> image;
-    ret = BuildUDMAInfoImage(reinterpret_cast<uintptr_t>(udmaInfoDev_), sq, rq, scq, rcq, mem, info, image);
+    ret = BuildUDMAInfoImage(reinterpret_cast<uintptr_t>(udmaInfoDev_), qpNum_, sq, rq, scq, rcq, mem, info, image);
     if (ret != TILEXR_UDMA_LAYOUT_SUCCESS) {
         return TILEXR_ERROR_PARA_CHECK_FAIL;
     }
@@ -1002,26 +1044,44 @@ void TileXRUDMATransport::CleanupQueues()
 {
     for (auto& stateEntry : states_) {
         auto& state = stateEntry.second;
-        for (void* remoteQp : state.remoteQpHandles) {
-            if (remoteQp != nullptr && state.ctxHandle != nullptr) {
-                loader_.RaCtxQpUnimport(state.ctxHandle, remoteQp);
+        for (auto& remoteQpHandles : state.remoteQpHandlesByQp) {
+            for (void* remoteQp : remoteQpHandles) {
+                if (remoteQp != nullptr && state.ctxHandle != nullptr) {
+                    loader_.RaCtxQpUnimport(state.ctxHandle, remoteQp);
+                }
             }
         }
-        if (state.qpHandle != nullptr) {
-            loader_.RaCtxQpDestroy(state.qpHandle);
+        for (void* qpHandle : state.qpHandles) {
+            if (qpHandle != nullptr) {
+                loader_.RaCtxQpDestroy(qpHandle);
+            }
         }
-        if (state.cqHandle != nullptr && state.ctxHandle != nullptr) {
-            loader_.RaCtxCqDestroy(state.ctxHandle, state.cqHandle);
+        for (void* cqHandle : state.cqHandles) {
+            if (cqHandle != nullptr && state.ctxHandle != nullptr) {
+                loader_.RaCtxCqDestroy(state.ctxHandle, cqHandle);
+            }
         }
         if (state.chanHandle != nullptr && state.ctxHandle != nullptr) {
             loader_.RaCtxChanDestroy(state.ctxHandle, state.chanHandle);
         }
-        FreeDeviceScalar(state.cqPiAddr);
-        FreeDeviceScalar(state.cqCiAddr);
-        FreeDeviceScalar(state.sqPiAddr);
-        FreeDeviceScalar(state.sqCiAddr);
-        FreeDeviceScalar(state.wqeCntAddr);
-        FreeDeviceScalar(state.amoAddr);
+        for (void*& ptr : state.cqPiAddrs) {
+            FreeDeviceScalar(ptr);
+        }
+        for (void*& ptr : state.cqCiAddrs) {
+            FreeDeviceScalar(ptr);
+        }
+        for (void*& ptr : state.sqPiAddrs) {
+            FreeDeviceScalar(ptr);
+        }
+        for (void*& ptr : state.sqCiAddrs) {
+            FreeDeviceScalar(ptr);
+        }
+        for (void*& ptr : state.wqeCntAddrs) {
+            FreeDeviceScalar(ptr);
+        }
+        for (void*& ptr : state.amoAddrs) {
+            FreeDeviceScalar(ptr);
+        }
     }
     states_.clear();
 }
diff --git a/src/comm/udma/tilexr_udma_transport.h b/src/comm/udma/tilexr_udma_transport.h
index 0a787d1..8c5cd9a 100644
--- a/src/comm/udma/tilexr_udma_transport.h
+++ b/src/comm/udma/tilexr_udma_transport.h
@@ -73,6 +73,7 @@ class TileXRUDMATransport {
     uint32_t logicDevId_ = 0;
     uint32_t deviceIdOffset_ = 0;
     uint32_t eidCount_ = 0;
+    uint32_t qpNum_ = 1;
     std::map<uint32_t, void*> ctxHandleByEid_;
     std::map<uint32_t, void*> tokenHandleByEid_;
     std::map<int, uint32_t> peerLocalEid_;
diff --git a/src/include/tilexr_data_as_flag.h b/src/include/tilexr_data_as_flag.h
index 93c6460..21bf9d2 100644
--- a/src/include/tilexr_data_as_flag.h
+++ b/src/include/tilexr_data_as_flag.h
@@ -27,6 +27,7 @@ constexpr uint32_t DATA_AS_FLAG_ALIGN_BYTES = 32;
 constexpr uint32_t DATA_AS_FLAG_FLOAT_BYTES = sizeof(float);
 constexpr uint32_t DATA_AS_FLAG_FLAG_FLOATS = DATA_AS_FLAG_FLAG_BYTES / DATA_AS_FLAG_FLOAT_BYTES;
 constexpr uint32_t DATA_AS_FLAG_SUM_RESULT_BYTES = DATA_AS_FLAG_ALIGN_BYTES;
+constexpr uint64_t DATA_AS_FLAG_COMMIT_BIT = 0x80000000ULL;
 constexpr float DATA_AS_FLAG_READY_VALUE = 1.0f;
 
 static_assert(DATA_AS_FLAG_PAYLOAD_BYTES + DATA_AS_FLAG_FLAG_BYTES == DATA_AS_FLAG_BLOCK_BYTES,
@@ -52,6 +53,23 @@ TILEXR_DATA_AS_FLAG_INLINE uint64_t DataAsFlagAlignUp(uint64_t value, uint64_t a
     return remainder == 0U ? value : value + alignment - remainder;
 }
 
+TILEXR_DATA_AS_FLAG_INLINE uint64_t DataAsFlagEpoch(int32_t magic, int32_t step)
+{
+    return (static_cast<uint64_t>(static_cast<uint32_t>(magic)) << 32) |
+        static_cast<uint32_t>(step);
+}
+
+TILEXR_DATA_AS_FLAG_INLINE bool DataAsFlagEpochReady(uint64_t observed, uint64_t expected)
+{
+    constexpr uint64_t kMagicMask = 0xffffffff00000000ULL;
+    return (observed & kMagicMask) == (expected & kMagicMask) && observed >= expected;
+}
+
+TILEXR_DATA_AS_FLAG_INLINE uint64_t DataAsFlagCommitEpoch(uint64_t epoch)
+{
+    return epoch | DATA_AS_FLAG_COMMIT_BIT;
+}
+
 #if TILEXR_ASCENDC_AICORE_COMPILE
 
 __aicore__ inline uint32_t DataAsFlagScratchBytes(const AscendC::LocalTensor<uint8_t>& scratch)
@@ -141,6 +159,16 @@ __aicore__ inline uint32_t DataAsFlagMaxRecvBlocks(uint32_t scratchBytes)
     return 0U;
 }
 
+__aicore__ inline uint32_t DataAsFlagMaxEpochOrderedSendBlocks(uint32_t scratchBytes)
+{
+    return scratchBytes / DATA_AS_FLAG_BLOCK_BYTES;
+}
+
+__aicore__ inline uint32_t DataAsFlagMaxEpochOrderedRecvBlocks(uint32_t scratchBytes)
+{
+    return DataAsFlagMaxEpochOrderedSendBlocks(scratchBytes);
+}
+
 __aicore__ inline uint32_t DataAsFlagInit(AscendC::LocalTensor<uint8_t>& sendScratch)
 {
     const uint32_t sendBlocks = DataAsFlagScratchBytes(sendScratch) / DATA_AS_FLAG_BLOCK_BYTES;
@@ -248,6 +276,94 @@ __aicore__ inline uint32_t DataAsFlagSend(
     return totalBlocks;
 }
 
+__aicore__ inline void DataAsFlagFillEpochFlags(
+    AscendC::LocalTensor<uint8_t>& flagScratch,
+    uint32_t blockCount,
+    uint64_t epoch)
+{
+    AscendC::LocalTensor<int64_t> flagWords = flagScratch.template ReinterpretCast<int64_t>();
+    const uint32_t words = blockCount * DATA_AS_FLAG_FLAG_BYTES / sizeof(int64_t);
+    const int64_t signedEpoch = static_cast<int64_t>(epoch);
+    for (uint32_t i = 0; i < words; ++i) {
+        flagWords.SetValue(i, signedEpoch);
+    }
+    AscendC::SetFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);
+    AscendC::WaitFlag<AscendC::HardEvent::S_MTE3>(EVENT_ID0);
+}
+
+__aicore__ inline void DataAsFlagWriteBatchCommitFlag(
+    __gm__ uint8_t* dstDataAsFlagGM,
+    uint32_t dstBlockOffset,
+    AscendC::LocalTensor<uint8_t>& scratch,
+    uint32_t batchBlocks,
+    uint64_t commitEpoch)
+{
+    if (batchBlocks == 0U) {
+        return;
+    }
+    DataAsFlagFillEpochFlags(scratch, 1U, commitEpoch);
+    AscendC::GlobalTensor<int64_t> flagGlobal;
+    const uint32_t lastBlock = dstBlockOffset + batchBlocks - 1U;
+    flagGlobal.SetGlobalBuffer(
+        reinterpret_cast<__gm__ int64_t*>(
+            dstDataAsFlagGM + static_cast<uint64_t>(lastBlock) * DATA_AS_FLAG_BLOCK_BYTES +
+            DATA_AS_FLAG_FLAG_OFFSET_BYTES));
+    AscendC::LocalTensor<int64_t> flagWords = scratch.template ReinterpretCast<int64_t>();
+    constexpr uint32_t kFlagWords = DATA_AS_FLAG_FLAG_BYTES / sizeof(int64_t);
+    AscendC::DataCopy(flagGlobal, flagWords, kFlagWords);
+}
+
+__aicore__ inline uint32_t DataAsFlagSendEpochOrdered(
+    __gm__ uint8_t* dstDataAsFlagGM,
+    const __gm__ uint8_t* srcGM,
+    uint64_t dataBytes,
+    uint64_t epoch,
+    AscendC::LocalTensor<uint8_t>& scratch)
+{
+    if (dstDataAsFlagGM == nullptr || srcGM == nullptr || dataBytes == 0U) {
+        return 0U;
+    }
+
+    const uint32_t totalBlocks = DataAsFlagBlockCountForPayloadBytes(dataBytes);
+    const uint32_t batchCapacity = DataAsFlagMaxEpochOrderedSendBlocks(DataAsFlagScratchBytes(scratch));
+    if (batchCapacity == 0U) {
+        return 0U;
+    }
+
+    uint32_t sentBlocks = 0U;
+    uint64_t sentBytes = 0U;
+    while (sentBlocks < totalBlocks) {
+        const uint32_t remainingBlocks = totalBlocks - sentBlocks;
+        const uint32_t batchBlocks = remainingBlocks < batchCapacity ? remainingBlocks : batchCapacity;
+        const uint64_t maxBatchBytes = static_cast<uint64_t>(batchBlocks) * DATA_AS_FLAG_PAYLOAD_BYTES;
+        const uint64_t remainingBytes = dataBytes - sentBytes;
+        const uint32_t batchPayloadBytes = static_cast<uint32_t>(
+            remainingBytes < maxBatchBytes ? remainingBytes : maxBatchBytes);
+        const uint32_t fullBlocks = batchPayloadBytes / DATA_AS_FLAG_PAYLOAD_BYTES;
+        const uint32_t tailBytes = batchPayloadBytes % DATA_AS_FLAG_PAYLOAD_BYTES;
+
+        AscendC::LocalTensor<uint8_t> payloadScratch = scratch;
+
+        AscendC::Duplicate<uint8_t>(payloadScratch, 0U, batchBlocks * DATA_AS_FLAG_BLOCK_BYTES);
+        AscendC::PipeBarrier<PIPE_ALL>();
+        DataAsFlagCopyPayloadToScratch(payloadScratch, srcGM, sentBytes, fullBlocks, tailBytes);
+        AscendC::SetFlag<AscendC::HardEvent::MTE2_MTE3>(EVENT_ID0);
+        AscendC::WaitFlag<AscendC::HardEvent::MTE2_MTE3>(EVENT_ID0);
+        DataAsFlagCopyScratchToDataAsFlagGM(dstDataAsFlagGM, sentBlocks, payloadScratch, batchBlocks);
+        AscendC::SetFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+        AscendC::WaitFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+
+        DataAsFlagWriteBatchCommitFlag(dstDataAsFlagGM, sentBlocks, scratch, batchBlocks, DataAsFlagCommitEpoch(epoch));
+        AscendC::SetFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+        AscendC::WaitFlag<AscendC::HardEvent::MTE3_S>(EVENT_ID0);
+
+        sentBlocks += batchBlocks;
+        sentBytes += batchPayloadBytes;
+    }
+    AscendC::PipeBarrier<PIPE_ALL>();
+    return totalBlocks;
+}
+
 __aicore__ inline bool DataAsFlagCheckBatch(
     const __gm__ uint8_t* dataAsFlagGM,
     uint32_t blockOffset,
@@ -290,6 +406,40 @@ __aicore__ inline bool DataAsFlagCheckBatch(
     return sumOut.GetValue(0) == static_cast<float>(batchBlocks);
 }
 
+__aicore__ inline uint64_t DataAsFlagLoadEpochFlag(
+    const __gm__ uint8_t* dataAsFlagGM,
+    uint32_t blockIndex,
+    AscendC::LocalTensor<uint8_t>& scratch)
+{
+    AscendC::GlobalTensor<int64_t> flagGlobal;
+    flagGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(
+        const_cast<__gm__ uint8_t*>(
+            dataAsFlagGM + static_cast<uint64_t>(blockIndex) * DATA_AS_FLAG_BLOCK_BYTES +
+            DATA_AS_FLAG_FLAG_OFFSET_BYTES)));
+    AscendC::LocalTensor<int64_t> flagLocal = scratch.template ReinterpretCast<int64_t>();
+    AscendC::DataCopy(flagLocal, flagGlobal, DATA_AS_FLAG_FLAG_BYTES / sizeof(int64_t));
+    AscendC::SetFlag<AscendC::HardEvent::MTE2_S>(EVENT_ID0);
+    AscendC::WaitFlag<AscendC::HardEvent::MTE2_S>(EVENT_ID0);
+    return static_cast<uint64_t>(flagLocal.GetValue(0));
+}
+
+__aicore__ inline bool DataAsFlagCheckBatchEpochStrict(
+    const __gm__ uint8_t* dataAsFlagGM,
+    uint32_t blockOffset,
+    uint32_t batchBlocks,
+    uint64_t commitEpoch,
+    AscendC::LocalTensor<uint8_t>& scratch)
+{
+    if (batchBlocks == 0U) {
+        return false;
+    }
+    const uint32_t lastBlock = blockOffset + batchBlocks - 1U;
+    if (!DataAsFlagEpochReady(DataAsFlagLoadEpochFlag(dataAsFlagGM, lastBlock, scratch), commitEpoch)) {
+        return false;
+    }
+    return true;
+}
+
 __aicore__ inline bool DataAsFlagCheck(
     const __gm__ uint8_t* dataAsFlagGM,
     uint64_t dataBytes,
@@ -363,6 +513,54 @@ __aicore__ inline void DataAsFlagCopyBatchToRecvGM(
     AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
 }
 
+__aicore__ inline bool DataAsFlagCheckAndRecvEpochOrdered(
+    const __gm__ uint8_t* dataAsFlagGM,
+    uint64_t dataBytes,
+    __gm__ uint8_t* recvGM,
+    uint64_t epoch,
+    AscendC::LocalTensor<uint8_t>& recvScratch,
+    bool strict)
+{
+    if (dataAsFlagGM == nullptr || recvGM == nullptr) {
+        return false;
+    }
+    if (dataBytes == 0U) {
+        return true;
+    }
+
+    const uint32_t totalBlocks = DataAsFlagBlockCountForPayloadBytes(dataBytes);
+    const uint32_t batchCapacity = DataAsFlagMaxEpochOrderedRecvBlocks(DataAsFlagScratchBytes(recvScratch));
+    if (batchCapacity == 0U) {
+        return false;
+    }
+
+    uint32_t processedBlocks = 0U;
+    uint64_t processedBytes = 0U;
+    const uint64_t commitEpoch = DataAsFlagCommitEpoch(epoch);
+    while (processedBlocks < totalBlocks) {
+        const uint32_t remainingBlocks = totalBlocks - processedBlocks;
+        const uint32_t batchBlocks = remainingBlocks < batchCapacity ? remainingBlocks : batchCapacity;
+        const uint32_t lastBlock = processedBlocks + batchBlocks - 1U;
+        while (!DataAsFlagEpochReady(DataAsFlagLoadEpochFlag(dataAsFlagGM, lastBlock, recvScratch), commitEpoch)) {
+        }
+        if (strict &&
+            !DataAsFlagCheckBatchEpochStrict(dataAsFlagGM, processedBlocks, batchBlocks, commitEpoch, recvScratch)) {
+            return false;
+        }
+
+        const uint64_t remainingBytes = dataBytes - processedBytes;
+        const uint64_t maxBatchBytes = static_cast<uint64_t>(batchBlocks) * DATA_AS_FLAG_PAYLOAD_BYTES;
+        const uint32_t batchBytes = static_cast<uint32_t>(
+            remainingBytes < maxBatchBytes ? remainingBytes : maxBatchBytes);
+        DataAsFlagCopyBatchToRecvGM(
+            dataAsFlagGM, processedBlocks, processedBytes, batchBytes, recvGM, recvScratch);
+        processedBlocks += batchBlocks;
+        processedBytes += batchBytes;
+    }
+    AscendC::PipeBarrier<PIPE_ALL>();
+    return true;
+}
+
 __aicore__ inline bool DataAsFlagCheckAndRecv(
     const __gm__ uint8_t* dataAsFlagGM,
     uint64_t dataBytes,
diff --git a/src/include/tilexr_udma.h b/src/include/tilexr_udma.h
index 43f15c0..e8d4b33 100644
--- a/src/include/tilexr_udma.h
+++ b/src/include/tilexr_udma.h
@@ -105,9 +105,10 @@ __aicore__ inline __gm__ UDMACQCtx* UDMAGetSCQCtx(__gm__ UDMAInfo* udmaInfo, uin
     return reinterpret_cast<__gm__ UDMACQCtx*>(udmaInfo->scqPtr + (pe * qpNum + qpIdx) * sizeof(UDMACQCtx));
 }
 
-__aicore__ inline __gm__ UDMAMemInfo* UDMAGetRemoteMemInfo(__gm__ UDMAInfo* udmaInfo, uint32_t pe)
+__aicore__ inline __gm__ UDMAMemInfo* UDMAGetRemoteMemInfo(__gm__ UDMAInfo* udmaInfo, uint32_t pe, uint32_t qpIdx)
 {
-    return reinterpret_cast<__gm__ UDMAMemInfo*>(udmaInfo->memPtr + sizeof(UDMAMemInfo) * pe);
+    uint32_t qpNum = udmaInfo->qpNum;
+    return reinterpret_cast<__gm__ UDMAMemInfo*>(udmaInfo->memPtr + sizeof(UDMAMemInfo) * (pe * qpNum + qpIdx));
 }
 
 __aicore__ inline void UDMAPollCQUpdateInfo(
@@ -244,7 +245,7 @@ __aicore__ inline void UDMAPostSend(
     uint32_t wqeCnt = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->wqeCntAddr), 0);
     UDMAPollCQWhenSQOverflow(udmaInfo, qpCtxEntry, wqeCnt, pe, qpIdx);
 
-    __gm__ UDMAMemInfo* remoteMemInfo = UDMAGetRemoteMemInfo(udmaInfo, pe);
+    __gm__ UDMAMemInfo* remoteMemInfo = UDMAGetRemoteMemInfo(udmaInfo, pe, qpIdx);
     __gm__ uint8_t* wqeAddr =
         reinterpret_cast<__gm__ uint8_t*>(qpCtxEntry->bufAddr + wqeSize * (curHead % TILEXR_UDMA_SQ_BB_COUNT));
     __gm__ UDMASqeCtx* sqeCtx = reinterpret_cast<__gm__ UDMASqeCtx*>(wqeAddr);
@@ -289,17 +290,27 @@ __aicore__ inline void UDMAWriteNotify(
 }
 
 template <typename T>
-__aicore__ inline void UDMAPutNbi(
-    const __gm__ CommArgs* args, int targetRank, const __gm__ T* localSrc, uint64_t byteOffset, uint32_t byteCount)
+__aicore__ inline void UDMAPutNbiQp(
+    const __gm__ CommArgs* args, int targetRank, uint32_t qpIdx,
+    const __gm__ T* localSrc, uint64_t byteOffset, uint32_t byteCount)
 {
     if (!UDMARegistryEnabled(args)) return;
+    __gm__ UDMAInfo* udmaInfo = GetUDMAInfo(args);
+    if (qpIdx >= udmaInfo->qpNum) return;
 
     auto registry = GetUDMARegistry(args);
     if (!UDMARegisteredRangeValid(registry, targetRank, byteOffset, byteCount)) return;
 
     auto remoteAddr = UDMARegisteredRemoteAddr(registry, targetRank, byteOffset);
     UDMAWrite(args, remoteAddr, reinterpret_cast<__gm__ uint8_t*>(const_cast<__gm__ T*>(localSrc)),
-              targetRank, 0, byteCount);
+              targetRank, qpIdx, byteCount);
+}
+
+template <typename T>
+__aicore__ inline void UDMAPutNbi(
+    const __gm__ CommArgs* args, int targetRank, const __gm__ T* localSrc, uint64_t byteOffset, uint32_t byteCount)
+{
+    UDMAPutNbiQp<T>(args, targetRank, 0, localSrc, byteOffset, byteCount);
 }
 
 template <typename T>
@@ -310,16 +321,26 @@ __aicore__ inline void UDMAPutRegisteredNbi(
 }
 
 template <typename T>
-__aicore__ inline void UDMAGetNbi(
-    const __gm__ CommArgs* args, int sourceRank, __gm__ T* localDst, uint64_t byteOffset, uint32_t byteCount)
+__aicore__ inline void UDMAGetNbiQp(
+    const __gm__ CommArgs* args, int sourceRank, uint32_t qpIdx,
+    __gm__ T* localDst, uint64_t byteOffset, uint32_t byteCount)
 {
     if (!UDMARegistryEnabled(args)) return;
+    __gm__ UDMAInfo* udmaInfo = GetUDMAInfo(args);
+    if (qpIdx >= udmaInfo->qpNum) return;
 
     auto registry = GetUDMARegistry(args);
     if (!UDMARegisteredRangeValid(registry, sourceRank, byteOffset, byteCount)) return;
 
     auto remoteAddr = UDMARegisteredRemoteAddr(registry, sourceRank, byteOffset);
-    UDMARead(args, reinterpret_cast<__gm__ uint8_t*>(localDst), remoteAddr, sourceRank, 0, byteCount);
+    UDMARead(args, reinterpret_cast<__gm__ uint8_t*>(localDst), remoteAddr, sourceRank, qpIdx, byteCount);
+}
+
+template <typename T>
+__aicore__ inline void UDMAGetNbi(
+    const __gm__ CommArgs* args, int sourceRank, __gm__ T* localDst, uint64_t byteOffset, uint32_t byteCount)
+{
+    UDMAGetNbiQp<T>(args, sourceRank, 0, localDst, byteOffset, byteCount);
 }
 
 template <typename T>
@@ -359,13 +380,24 @@ __aicore__ inline void UDMAPutRegisteredSignalNbi(
     UDMAPutSignalNbi<T>(args, targetRank, localSrc, byteOffset, byteCount, signalByteOffset, signal);
 }
 
-__aicore__ inline void UDMAQuiet(const __gm__ CommArgs* args, int targetRank)
+__aicore__ inline uint32_t UDMAQuietStatusQp(const __gm__ CommArgs* args, int targetRank, uint32_t qpIdx)
 {
-    if (!UDMAEnabled(args)) return;
+    if (!UDMAEnabled(args)) return 0xFFFFFFFFU;
     __gm__ UDMAInfo* udmaInfo = GetUDMAInfo(args);
-    __gm__ UDMAWQCtx* qpCtxEntry = UDMAGetWQCtx(udmaInfo, targetRank, 0);
+    if (qpIdx >= udmaInfo->qpNum) return 0xFFFFFFFFU;
+    __gm__ UDMAWQCtx* qpCtxEntry = UDMAGetWQCtx(udmaInfo, targetRank, qpIdx);
     uint32_t wqeCnt = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->wqeCntAddr), 0);
-    (void)UDMAPollCQ(udmaInfo, targetRank, 0, wqeCnt);
+    return UDMAPollCQ(udmaInfo, targetRank, qpIdx, wqeCnt);
+}
+
+__aicore__ inline uint32_t UDMAQuietStatus(const __gm__ CommArgs* args, int targetRank)
+{
+    return UDMAQuietStatusQp(args, targetRank, 0);
+}
+
+__aicore__ inline void UDMAQuiet(const __gm__ CommArgs* args, int targetRank)
+{
+    (void)UDMAQuietStatus(args, targetRank);
 }
 
 } // namespace TileXR
diff --git a/tests/udma/CMakeLists.txt b/tests/udma/CMakeLists.txt
index 6c1e41a..7ac0edd 100644
--- a/tests/udma/CMakeLists.txt
+++ b/tests/udma/CMakeLists.txt
@@ -43,7 +43,7 @@ message(STATUS "TILEXR_ROOT: ${TILEXR_ROOT}")
 
 # 查找库
 find_library(TILEXR_LIB tile-comm
-    HINTS "${TILEXR_ROOT}/install/lib" "${TILEXR_ROOT}/build/src/comm"
+    HINTS "${TILEXR_ROOT}/install/lib" "${TILEXR_ROOT}/install/lib64" "${TILEXR_ROOT}/build/src/comm"
     REQUIRED)
 
 message(STATUS "Found tile-comm: ${TILEXR_LIB}")
@@ -52,6 +52,7 @@ message(STATUS "Found tile-comm: ${TILEXR_LIB}")
 include_directories(
     ${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/
     ${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/runtime/
+    ${ASCEND_HOME_PATH}/${ARCH}-linux/asc/include/
     ${ASCEND_HOME_PATH}/${ARCH}-linux/include/
     ${ASCEND_DRIVER_PATH}/kernel/inc
     ${TILEXR_ROOT}/3rdparty
@@ -83,6 +84,22 @@ target_include_directories(test_tilexr_udma_transport_layout PRIVATE
     ${TILEXR_ROOT}/src/comm
 )
 
+add_executable(test_tilexr_udma_p2p_perf_config
+    unit/test_tilexr_udma_p2p_perf_config.cpp
+)
+
+target_include_directories(test_tilexr_udma_p2p_perf_config PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+)
+
+add_executable(test_tilexr_udma_p2p_source_guard
+    unit/test_tilexr_udma_p2p_source_guard.cpp
+)
+
+target_compile_definitions(test_tilexr_udma_p2p_source_guard PRIVATE
+    TILEXR_SOURCE_ROOT="${TILEXR_ROOT}"
+)
+
 # 集成测试：TileXR UDMA
 add_executable(test_tilexr_udma
     integration/test_tilexr_udma.cpp
@@ -99,6 +116,8 @@ set(INSTALL_TARGETS
     test_tilexr_udma
     test_tilexr_udma_registry
     test_tilexr_udma_transport_layout
+    test_tilexr_udma_p2p_perf_config
+    test_tilexr_udma_p2p_source_guard
 )
 
 if(BUILD_TILEXR_UDMA_DEMO)
@@ -161,6 +180,7 @@ if(BUILD_TILEXR_UDMA_DEMO)
             -I${ASCEND_HOME_PATH}/${ARCH}-linux/tikcpp/tikcfw/interface
             -I${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/
             -I${ASCEND_HOME_PATH}/${ARCH}-linux/pkg_inc/runtime/
+            -I${ASCEND_HOME_PATH}/${ARCH}-linux/asc/include/
             -I${ASCEND_HOME_PATH}/${ARCH}-linux/include/
             -I${ASCEND_DRIVER_PATH}/kernel/inc
             -I${TILEXR_ROOT}/3rdparty
@@ -175,6 +195,7 @@ if(BUILD_TILEXR_UDMA_DEMO)
                 -shared
                 ${TILEXR_UDMA_KERNEL_LINK_OPTIONS}
                 -DCATLASS_ARCH=${TILEXR_UDMA_CATLASS_ARCH}
+                -DTILEXR_UDMA_FORCE_ENABLE=1
                 ${TILEXR_UDMA_DEMO_KERNEL_INCLUDES}
                 "${CMAKE_CURRENT_SOURCE_DIR}/demo/tilexr_udma_demo_kernel.cpp"
                 -L${ASCEND_DRIVER_PATH}/lib64/driver
@@ -193,6 +214,7 @@ if(BUILD_TILEXR_UDMA_DEMO)
                 -o "${TILEXR_UDMA_DEMO_KERNEL_SO}"
             DEPENDS
                 "${CMAKE_CURRENT_SOURCE_DIR}/demo/tilexr_udma_demo_kernel.cpp"
+                "${CMAKE_CURRENT_SOURCE_DIR}/demo/tilexr_udma_p2p_perf_config.h"
                 "${TILEXR_ROOT}/src/include/tilexr_udma.h"
             VERBATIM
             COMMENT "Building TileXR UDMA demo kernel with bisheng"
diff --git a/tests/udma/demo/README.md b/tests/udma/demo/README.md
index 5894707..b6273bd 100644
--- a/tests/udma/demo/README.md
+++ b/tests/udma/demo/README.md
@@ -17,6 +17,10 @@ The demo target requires `bisheng`. If `bisheng` is not available, `build.sh` st
 cd /path/to/TileXR/tests/udma
 bash demo/run_tilexr_udma_demo.sh 0 2 16 2 0
 bash demo/run_tilexr_udma_demo.sh 1 2 16 2 0
+bash demo/run_tilexr_udma_p2p_perf.sh 0 1 4096 16777216 2 20 5 0
+bash demo/run_tilexr_udma_p2p_concurrency_sweep.sh 4096 16777216 2 20 5 0 1 direct_urma,memory,data_as_flag unidir,bidir 1,2,4,8
+bash demo/run_tilexr_udma_p2p_concurrency_sweep.sh 16777216 67108864 2 20 5 0 1 memory,memory_segmented,memory_segmented_rotate unidir 1
+TILEXR_P2P_DEBUG_SUMMARY=1 bash demo/run_tilexr_udma_p2p_concurrency_sweep.sh 16777216 67108864 2 20 5 0 1 memory_segmented_trace,memory_segmented_rotate_trace unidir 1,2
 ```
 
 Arguments:
@@ -27,12 +31,48 @@ run_tilexr_udma_demo.sh <test_type> <rank_size> <elements_per_rank> <npu_count>
 
 - `test_type=0`: all-gather style UDMA put.
 - `test_type=1`: UDMA put with signal.
+- `test_type=4`: directed 2-card P2P performance mode. Use
+  `demo/run_tilexr_udma_p2p_perf.sh` instead of calling the binary directly.
 - `rank_size`: number of local ranks to launch.
 - `elements_per_rank`: `int32_t` elements in each rank segment.
 - `npu_count`: number of NPUs available to this run.
 - `first_npu`: first physical NPU id to use.
 
 Each run writes per-rank logs under `tests/udma/logs/tilexr_udma_demo_*`.
+P2P performance runs write logs under `tests/udma/logs/tilexr_udma_p2p_perf_*`
+and append CSV rows to `p2p_perf.csv`.
+
+The P2P performance scripts expose these user-facing transports:
+`direct_urma`, `memory`, `memory_consume`, `data_as_flag`, and
+`data_as_flag_epoch_ordered`. `direct_urma` uses the current parallel
+multi-jetty implementation internally; `block_dim=1` with one QP matches the
+previous single-QP direct URMA baseline, while `block_dim=N` with
+`TILEXR_UDMA_QP_NUM=N` uses up to `N` QPs/jettys in parallel.
+
+Two additional diagnostic IPC transports help isolate large-message `memory`
+throughput drops:
+
+- `memory_segmented`: one `block_dim=1` kernel still transfers the full
+  payload, but calls the internal memory copy helper in 16 MiB segments while
+  writing the normal continuous peer window.
+- `memory_segmented_rotate`: transfers the full payload but rotates destination
+  writes inside one 16 MiB peer-window span. This is a performance diagnostic
+  only; payload validation is skipped for this transport because later segments
+  overwrite earlier destination bytes.
+- `memory_segmented_trace`: same as `memory_segmented`, but uses 8 MiB
+  segments and records per-segment copy cycles for block 0 in the debug
+  summary. Set `TILEXR_P2P_DEBUG_SUMMARY=1` to print `seg0Cycles` through
+  `seg7Cycles`.
+- `memory_segmented_rotate_trace`: same as `memory_segmented_rotate`, with the
+  same 8 MiB per-segment cycle trace. Compare it with
+  `memory_segmented_trace` to keep destination address span separate from
+  sustained single-stream write behavior.
+
+For the `memory` large-message knee, use the trace variants as a root-cause
+probe. If `block_dim=1` later segments cost more cycles while `block_dim=2`
+stays flat, that supports accumulated peer IPC write backpressure or remote
+write-queue drain limits. If all `block_dim=1` segments are uniformly slower
+than `block_dim=2`, the result points to a fixed single-stream bandwidth cap.
 
 Run this demo only on A5 / Ascend950 / 950 hardware. Builds or smoke tests on other Ascend chips are not valid UDMA runtime validation.
 
diff --git a/tests/udma/demo/plot_tilexr_udma_p2p_perf.py b/tests/udma/demo/plot_tilexr_udma_p2p_perf.py
new file mode 100644
index 0000000..52e049e
--- /dev/null
+++ b/tests/udma/demo/plot_tilexr_udma_p2p_perf.py
@@ -0,0 +1,183 @@
+#!/usr/bin/env python3
+"""Plot TileXR UDMA P2P performance CSV files."""
+
+import argparse
+import csv
+import math
+from collections import defaultdict
+from pathlib import Path
+from glob import glob
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("csv", nargs="+", help="p2p_perf.csv files to merge")
+    parser.add_argument("--output", default="tilexr_udma_p2p_perf_curve.png", help="bandwidth PNG path")
+    parser.add_argument("--latency-output", default=None, help="latency PNG path")
+    parser.add_argument("--latency-max-bytes", type=int, default=1024 * 1024,
+        help="only plot latency rows up to this size; default: 1048576")
+    parser.add_argument("--direction", default=None, help="only plot one direction, for example 0to1")
+    parser.add_argument("--labels", default=None,
+        help="comma-separated labels matching input CSV files, for example direct_urma,memory")
+    parser.add_argument("--series-by", default="auto", choices=["auto", "label", "transport", "traffic", "block_dim"],
+        help="series grouping for new CSVs; default uses transport+traffic+block_dim")
+    parser.add_argument("--metric", default="bw_GBps", choices=["bw_GBps", "per_flow_bw_GBps"],
+        help="bandwidth metric to plot; default plots aggregate bandwidth")
+    return parser.parse_args()
+
+
+def infer_label(path):
+    text = str(path)
+    if "direct_urma" in text:
+        return "direct_urma"
+    if "memory" in text:
+        return "memory"
+    return Path(path).parent.name or Path(path).stem
+
+
+def expand_paths(paths):
+    expanded = []
+    for path in paths:
+        matches = sorted(glob(path))
+        expanded.extend(matches if matches else [path])
+    return expanded
+
+
+def row_value(row, key, default):
+    value = row.get(key)
+    return default if value is None or value == "" else value
+
+
+def build_series(row, path, fallback_label, series_by):
+    transport = row_value(row, "transport", fallback_label)
+    traffic = row_value(row, "traffic", "unidir")
+    block_dim = row_value(row, "block_dim", "1")
+    if series_by == "label":
+        return fallback_label
+    if series_by == "transport":
+        return transport
+    if series_by == "traffic":
+        return traffic
+    if series_by == "block_dim":
+        return f"bd={block_dim}"
+    return f"{transport} {traffic} bd={block_dim}"
+
+
+def load_rows(paths, labels=None, direction_filter=None, series_by="auto"):
+    if labels is not None and len(labels) != len(paths):
+        raise SystemExit("--labels count must match input CSV count")
+    merged = {}
+    for index, path in enumerate(paths):
+        label = labels[index] if labels is not None else infer_label(path)
+        with open(path, newline="") as handle:
+            reader = csv.DictReader(handle)
+            for row in reader:
+                direction = row_value(row, "direction", "")
+                if direction_filter is not None and direction != direction_filter:
+                    continue
+                series = label if labels is not None else build_series(row, path, label, series_by)
+                key = (series, int(row["bytes"]))
+                merged[key] = {
+                    "series": series,
+                    "direction": direction,
+                    "transport": row_value(row, "transport", label),
+                    "traffic": row_value(row, "traffic", "unidir"),
+                    "block_dim": int(row_value(row, "block_dim", "1")),
+                    "bytes": int(row["bytes"]),
+                    "avg_us": float(row["avg_us"]),
+                    "bw_GBps": float(row["bw_GBps"]),
+                    "per_flow_bw_GBps": float(row_value(row, "per_flow_bw_GBps", row["bw_GBps"])),
+                    "src": row_value(row, "src", ""),
+                    "dst": row_value(row, "dst", ""),
+                    "ranks": row_value(row, "ranks", ""),
+                    "status": int(row["status"]),
+                    "errors": int(row["errors"]),
+                }
+    grouped = defaultdict(list)
+    for row in merged.values():
+        grouped[row["series"]].append(row)
+    for rows in grouped.values():
+        rows.sort(key=lambda item: item["bytes"])
+    return grouped
+
+
+def default_latency_output(output):
+    path = Path(output)
+    suffix = path.suffix or ".png"
+    return str(path.with_name(path.stem + "_latency" + suffix))
+
+
+def format_bytes(value):
+    units = [("GB", 1024 ** 3), ("MB", 1024 ** 2), ("KB", 1024)]
+    for suffix, scale in units:
+        if value >= scale and value % scale == 0:
+            return f"{value // scale}{suffix}"
+    return f"{value}B"
+
+
+def byte_ticks(grouped):
+    values = sorted({row["bytes"] for rows in grouped.values() for row in rows})
+    return values, [format_bytes(value) for value in values]
+
+
+def filter_by_max_bytes(grouped, max_bytes):
+    filtered = defaultdict(list)
+    for direction, rows in grouped.items():
+        filtered[direction] = [row for row in rows if row["bytes"] <= max_bytes]
+    return {direction: rows for direction, rows in filtered.items() if rows}
+
+
+def plot_metric(grouped, metric, ylabel, title, output):
+    import matplotlib.pyplot as plt
+
+    plt.figure(figsize=(9, 5.2))
+    for direction in sorted(grouped):
+        rows = grouped[direction]
+        plt.plot([row["bytes"] for row in rows], [row[metric] for row in rows], marker="o", label=direction)
+
+    plt.xscale("log", base=2)
+    ticks, labels = byte_ticks(grouped)
+    plt.xticks(ticks, labels, rotation=35, ha="right")
+    plt.xlabel("message size")
+    plt.ylabel(ylabel)
+    plt.title(title)
+    plt.grid(True, which="both", linestyle="--", alpha=0.35)
+    plt.legend()
+    plt.tight_layout()
+    plt.savefig(output, dpi=160)
+    plt.close()
+    print(output)
+
+
+def main():
+    args = parse_args()
+    try:
+        import matplotlib  # noqa: F401
+    except ImportError as exc:
+        raise SystemExit("matplotlib is required to plot the curve: python3 -m pip install matplotlib") from exc
+
+    paths = expand_paths(args.csv)
+    labels = args.labels.split(",") if args.labels else None
+    grouped = load_rows(paths, labels=labels, direction_filter=args.direction, series_by=args.series_by)
+    if not grouped:
+        raise SystemExit("no rows found")
+
+    bad_rows = [
+        row for rows in grouped.values() for row in rows
+        if row["status"] != 0 or row["errors"] != 0
+    ]
+    if bad_rows:
+        raise SystemExit("refuse to plot rows with nonzero status/errors")
+
+    metric_label = "aggregate bw_GBps" if args.metric == "bw_GBps" else "per-flow bw_GBps"
+    plot_metric(grouped, args.metric, metric_label, "TileXR P2P bandwidth, rank_size=2", args.output)
+    latency_grouped = filter_by_max_bytes(grouped, args.latency_max_bytes)
+    if not latency_grouped:
+        raise SystemExit(f"no rows found for latency <= {args.latency_max_bytes} bytes")
+    plot_metric(latency_grouped, "avg_us", "avg_us",
+        f"TileXR UDMA P2P latency, rank_size=2, <= {format_bytes(args.latency_max_bytes)}",
+        args.latency_output or default_latency_output(args.output))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh b/tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh
new file mode 100644
index 0000000..652f81f
--- /dev/null
+++ b/tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+#
+# Run TileXR P2P perf across transports, traffic modes, and blockDim values.
+#
+
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+UDMA_DIR=$(cd "${SCRIPT_DIR}/.." && pwd)
+
+min_bytes=${1:-4096}
+max_bytes=${2:-16777216}
+step_factor=${3:-2}
+iters=${4:-20}
+warmup_iters=${5:-5}
+first_npu=${6:-2}
+check=${7:-1}
+transports_csv=${8:-direct_urma,memory,memory_consume,data_as_flag}
+traffic_csv=${9:-unidir,bidir}
+block_dims_csv=${10:-1,2,4,8}
+
+IFS=',' read -r -a transports <<<"${transports_csv}"
+IFS=',' read -r -a traffics <<<"${traffic_csv}"
+IFS=',' read -r -a block_dims <<<"${block_dims_csv}"
+
+base_port=${TILEXR_P2P_SWEEP_BASE_PORT:-13600}
+run_index=0
+
+echo "=========================================="
+echo "  TileXR P2P Concurrency Sweep"
+echo "=========================================="
+echo "Range:       ${min_bytes} -> ${max_bytes}, step=${step_factor}"
+echo "Iters:       ${iters}, warmup=${warmup_iters}"
+echo "First NPU:   ${first_npu}"
+echo "Check:       ${check}"
+echo "Transports:  ${transports_csv}"
+echo "Traffic:     ${traffic_csv}"
+echo "Block dims:  ${block_dims_csv}"
+echo "=========================================="
+
+cd "${UDMA_DIR}"
+for transport in "${transports[@]}"; do
+    for traffic in "${traffics[@]}"; do
+        for block_dim in "${block_dims[@]}"; do
+            port=$((base_port + run_index))
+            export TILEXR_COMM_ID="127.0.0.1:${port}"
+            echo "==== transport=${transport} traffic=${traffic} block_dim=${block_dim} port=${port} ===="
+            bash demo/run_tilexr_udma_p2p_perf.sh \
+                0 1 "${min_bytes}" "${max_bytes}" "${step_factor}" "${iters}" "${warmup_iters}" \
+                "${first_npu}" "${check}" "${transport}" "${block_dim}" "${traffic}"
+            run_index=$((run_index + 1))
+        done
+    done
+done
diff --git a/tests/udma/demo/run_tilexr_udma_p2p_perf.sh b/tests/udma/demo/run_tilexr_udma_p2p_perf.sh
new file mode 100755
index 0000000..0ffd9a3
--- /dev/null
+++ b/tests/udma/demo/run_tilexr_udma_p2p_perf.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+#
+# Run the TileXR 2-card directed UDMA P2P performance demo.
+#
+
+set -euo pipefail
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+UDMA_DIR=$(cd "${SCRIPT_DIR}/.." && pwd)
+TILEXR_ROOT=$(cd "${UDMA_DIR}/../.." && pwd)
+INSTALL_DIR="${UDMA_DIR}/install"
+
+src_rank=${1:-0}
+dst_rank=${2:-1}
+min_bytes=${3:-4096}
+max_bytes=${4:-16777216}
+step_factor=${5:-2}
+iters=${6:-20}
+warmup_iters=${7:-5}
+first_npu=${8:-0}
+check=${9:-1}
+transport=${10:-direct_urma}
+block_dim=${11:-1}
+traffic=${12:-unidir}
+
+source "${TILEXR_ROOT}/scripts/common_env.sh"
+
+export TILEXR_COMM_ID=${TILEXR_COMM_ID:-127.0.0.1:10067}
+export TILEXR_DEMO_NPUS=2
+export TILEXR_DEMO_FIRST_NPU=${first_npu}
+if [ "${transport}" = "direct_urma" ] || [ "${transport}" = "udma" ]; then
+    export TILEXR_UDMA_QP_NUM="${block_dim}"
+else
+    export TILEXR_UDMA_QP_NUM="${TILEXR_UDMA_QP_NUM:-1}"
+fi
+export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib64:${TILEXR_ROOT}/install/lib:${TILEXR_ROOT}/install/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}"
+
+bin="${INSTALL_DIR}/bin/tilexr_udma_demo"
+if [ ! -x "${bin}" ]; then
+    echo "ERROR: ${bin} not found. Run: cd ${UDMA_DIR} && bash build.sh"
+    exit 1
+fi
+
+log_dir="${UDMA_DIR}/logs/tilexr_udma_p2p_perf_$(date +%Y%m%d_%H%M%S)_${transport}_${traffic}_bd${block_dim}_${src_rank}to${dst_rank}"
+csv_path="${log_dir}/p2p_perf.csv"
+mkdir -p "${log_dir}"
+
+echo "=========================================="
+echo "  TileXR UDMA P2P Performance Demo"
+echo "=========================================="
+echo "Binary:        ${bin}"
+echo "Direction:     ${src_rank}->${dst_rank}"
+echo "Rank size:     2"
+echo "Min bytes:     ${min_bytes}"
+echo "Max bytes:     ${max_bytes}"
+echo "Step factor:   ${step_factor}"
+echo "Iters:         ${iters}"
+echo "Warmup iters:  ${warmup_iters}"
+echo "First NPU:     ${first_npu}"
+echo "Check:         ${check}"
+echo "Transport:     ${transport}"
+echo "Block dim:     ${block_dim}"
+echo "Traffic:       ${traffic}"
+echo "UDMA QP num:   ${TILEXR_UDMA_QP_NUM}"
+echo "TILEXR_COMM_ID:${TILEXR_COMM_ID}"
+echo "Log dir:       ${log_dir}"
+echo "CSV:           ${csv_path}"
+echo "=========================================="
+
+pids=()
+for rank in 0 1; do
+    log_file="${log_dir}/rank_${rank}.log"
+    echo "Starting rank ${rank}, log=${log_file}"
+    RANK=${rank} RANK_SIZE=2 TILEXR_P2P_LOG_DIR="${log_dir}" TILEXR_P2P_CSV="${csv_path}" "${bin}" \
+        2 "${rank}" 4 0 2 "${first_npu}" \
+        "${src_rank}" "${dst_rank}" "${min_bytes}" "${max_bytes}" "${step_factor}" \
+        "${iters}" "${warmup_iters}" "${check}" "${csv_path}" "${log_dir}" "${transport}" \
+        "${block_dim}" "${traffic}" \
+        >"${log_file}" 2>&1 &
+    pids+=("$!")
+done
+
+ret=0
+for idx in "${!pids[@]}"; do
+    pid=${pids[$idx]}
+    rank=${idx}
+    if wait "${pid}"; then
+        echo "rank ${rank} finished successfully"
+    else
+        r=$?
+        echo "rank ${rank} failed with exit code ${r}"
+        ret=${r}
+    fi
+done
+
+echo "=========================================="
+echo "  Rank Log Tails"
+echo "=========================================="
+for rank in 0 1; do
+    log_file="${log_dir}/rank_${rank}.log"
+    echo "----- rank ${rank}: ${log_file} -----"
+    tail -n 80 "${log_file}" || true
+done
+
+if [ -f "${csv_path}" ]; then
+    echo "=========================================="
+    echo "  CSV"
+    echo "=========================================="
+    cat "${csv_path}"
+fi
+
+exit "${ret}"
diff --git a/tests/udma/demo/tilexr_udma_demo.cpp b/tests/udma/demo/tilexr_udma_demo.cpp
index eabe662..f57d76b 100644
--- a/tests/udma/demo/tilexr_udma_demo.cpp
+++ b/tests/udma/demo/tilexr_udma_demo.cpp
@@ -9,7 +9,9 @@
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
+#include <fstream>
 #include <iostream>
+#include <sstream>
 #include <string>
 #include <thread>
 #include <vector>
@@ -21,12 +23,37 @@
 #include "acl/acl.h"
 #include "tilexr_api.h"
 #include "tilexr_types.h"
+#include "tilexr_udma_p2p_perf_config.h"
 
 extern void launch_tilexr_udma_all_gather(
     uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR data, GM_ADDR debug, int32_t elementsPerRank);
 extern void launch_tilexr_udma_put_signal(
     uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR data, GM_ADDR signals, GM_ADDR debug,
     int32_t elementsPerRank, uint64_t signal);
+extern void launch_tilexr_udma_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic);
+extern void launch_tilexr_udma_p2p_post_only_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic);
+extern void launch_tilexr_memory_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic);
+extern void launch_tilexr_memory_segmented_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern,
+    int32_t traffic, uint32_t segmentBytes, int32_t rotateWindow, uint32_t traceSegments);
+extern void launch_tilexr_memory_consume_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step);
+extern void launch_tilexr_data_as_flag_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic);
+extern void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step, int32_t strict);
 
 namespace {
 constexpr int32_t kDefaultElementsPerRank = 16;
@@ -37,6 +64,9 @@ constexpr int kDemoBarrierPortOffset = 97;
 constexpr size_t kUdmaRegistrationAlignment = 2 * 1024 * 1024;
 constexpr int kConnectRetryCount = 500;
 constexpr int kConnectRetrySleepMs = 10;
+constexpr size_t kP2PDebugWords = 64;
+constexpr uint32_t kA5CycleToUsDivisor = 1000;
+constexpr int32_t kP2PMagicBase = 0x5444554d; // "TDUM"
 
 struct BarrierEndpoint {
     uint16_t port;
@@ -48,6 +78,21 @@ int GetEnvInt(const char* name, int defaultValue)
     return value == nullptr ? defaultValue : std::atoi(value);
 }
 
+const char* GetEnvString(const char* name, const char* defaultValue)
+{
+    const char* value = std::getenv(name);
+    return value == nullptr || value[0] == '\0' ? defaultValue : value;
+}
+
+bool GetEnvFlag(const char* name, bool defaultValue)
+{
+    const char* value = std::getenv(name);
+    if (value == nullptr || value[0] == '\0') {
+        return defaultValue;
+    }
+    return std::atoi(value) != 0;
+}
+
 int GetDeviceIdFromEnv(int rank, int npuCount, int firstNpu)
 {
     const char* devices = std::getenv("TILEXR_DEMO_DEVICES");
@@ -359,6 +404,515 @@ bool ValidateSignals(int rank, int rankSize, const std::vector<uint64_t>& signal
     return ok;
 }
 
+uint64_t RoundUpToAlignment(uint64_t value, uint64_t alignment)
+{
+    return ((value + alignment - 1) / alignment) * alignment;
+}
+
+bool WriteTextFile(const std::string& path, const std::string& text)
+{
+    std::ofstream out(path.c_str(), std::ios::out | std::ios::trunc);
+    if (!out) {
+        return false;
+    }
+    out << text;
+    return static_cast<bool>(out);
+}
+
+bool ReadP2PStatusFile(const std::string& path, TileXR::Demo::P2PRankStatus* status)
+{
+    std::ifstream in(path.c_str());
+    if (!in) {
+        return false;
+    }
+    in >> status->status >> status->errors;
+    if (!in) {
+        return false;
+    }
+    if (in >> status->elapsedMs) {
+        if (in >> status->avgUsOverride) {
+            return true;
+        }
+        status->avgUsOverride = 0.0;
+        return true;
+    }
+    return in.eof();
+}
+
+bool AppendP2PPerfCsvRow(const TileXR::Demo::P2PPerfOptions& options, const TileXR::Demo::P2PPerfRow& row)
+{
+    if (options.csvPath.empty()) {
+        return true;
+    }
+    std::ifstream check(options.csvPath.c_str());
+    bool exists = static_cast<bool>(check);
+    check.close();
+    std::ofstream out(options.csvPath.c_str(), std::ios::out | std::ios::app);
+    if (!out) {
+        return false;
+    }
+    if (!exists) {
+        out << TileXR::Demo::P2PPerfCsvHeader();
+    }
+    out << TileXR::Demo::FormatP2PPerfCsvRow(row);
+    return static_cast<bool>(out);
+}
+
+bool CheckPeerMemsReady(int rank, int rankSize, const TileXR::CommArgs& args)
+{
+    for (int peer = 0; peer < rankSize; ++peer) {
+        if (args.peerMems[peer] == nullptr) {
+            std::cerr << "[rank " << rank << "] ERROR: peerMems[" << peer << "] is null" << std::endl;
+            return false;
+        }
+    }
+    return true;
+}
+
+bool IsP2PSourceRank(int rank, const TileXR::Demo::P2PPerfOptions& options)
+{
+    return options.traffic == TileXR::Demo::P2PTraffic::BiDir ?
+        (rank == options.srcRank || rank == options.dstRank) : rank == options.srcRank;
+}
+
+bool IsP2PReceiveRank(int rank, const TileXR::Demo::P2PPerfOptions& options)
+{
+    return options.traffic == TileXR::Demo::P2PTraffic::BiDir ?
+        (rank == options.srcRank || rank == options.dstRank) : rank == options.dstRank;
+}
+
+uint32_t FoldP2PDebugStatus(const std::vector<uint32_t>& debug, uint32_t blockDim)
+{
+    uint32_t status = debug.size() > 5 ? debug[5] : 0xffffffffu;
+    const uint32_t limit = std::min<uint32_t>(blockDim, 16U);
+    for (uint32_t i = 0; i < limit && 8U + i < debug.size(); ++i) {
+        status |= debug[8U + i];
+    }
+    return status;
+}
+
+void PrintP2PDebugSummary(int rank, const std::vector<uint32_t>& debug, uint32_t blockDim)
+{
+    std::ostringstream out;
+    out << "p2p debug:"
+        << " magic=0x" << std::hex << (debug.size() > 0 ? debug[0] : 0)
+        << std::dec
+        << " kernelRank=" << (debug.size() > 1 ? debug[1] : 0)
+        << " enabled=" << (debug.size() > 2 ? debug[2] : 0)
+        << " bytes=" << (debug.size() > 3 ? debug[3] : 0)
+        << " status=" << (debug.size() > 5 ? debug[5] : 0xffffffffu)
+        << " blockNum=" << (debug.size() > 6 ? debug[6] : 0)
+        << " qpNum=" << (debug.size() > 7 ? debug[7] : 0)
+        << " jettyCount=" << (debug.size() > 15 ? debug[15] : 0)
+        << " postCycles=" << (debug.size() > 29 ?
+            ((static_cast<uint64_t>(debug[29]) << 32) | debug[28]) : 0)
+        << " postCycleSum=" << (debug.size() > 31 ?
+            ((static_cast<uint64_t>(debug[31]) << 32) | debug[30]) : 0)
+        << " validateSum=" << (debug.size() > 33 ?
+            ((static_cast<uint64_t>(debug[33]) << 32) | debug[32]) : 0)
+        << " ctxSum=" << (debug.size() > 35 ?
+            ((static_cast<uint64_t>(debug[35]) << 32) | debug[34]) : 0)
+        << " sqeSum=" << (debug.size() > 37 ?
+            ((static_cast<uint64_t>(debug[37]) << 32) | debug[36]) : 0)
+        << " sgeSum=" << (debug.size() > 39 ?
+            ((static_cast<uint64_t>(debug[39]) << 32) | debug[38]) : 0)
+        << " cleanSum=" << (debug.size() > 41 ?
+            ((static_cast<uint64_t>(debug[41]) << 32) | debug[40]) : 0)
+        << " doorbellSum=" << (debug.size() > 43 ?
+            ((static_cast<uint64_t>(debug[43]) << 32) | debug[42]) : 0)
+        << " quietSum=" << (debug.size() > 45 ?
+            ((static_cast<uint64_t>(debug[45]) << 32) | debug[44]) : 0);
+    const uint32_t traceSegmentCount = debug.size() > 46 ? debug[46] : 0;
+    if (traceSegmentCount != 0) {
+        out << " traceSegments=" << traceSegmentCount;
+        const uint32_t traceLimit = std::min<uint32_t>(traceSegmentCount, 8U);
+        for (uint32_t i = 0; i < traceLimit; ++i) {
+            const uint32_t lowIdx = 48U + i * 2U;
+            const uint64_t cycles = lowIdx + 1U < debug.size() ?
+                ((static_cast<uint64_t>(debug[lowIdx + 1U]) << 32) | debug[lowIdx]) : 0U;
+            out << " seg" << i << "Cycles=" << cycles;
+        }
+    }
+    const uint32_t limit = std::min<uint32_t>(blockDim, 8U);
+    for (uint32_t i = 0; i < limit; ++i) {
+        out << " b" << i
+            << "{status=" << (8U + i < debug.size() ? debug[8U + i] : 0xffffffffu)
+            << ",offset=" << (16U + i < debug.size() ? debug[16U + i] : 0)
+            << ",bytes=" << (24U + i < debug.size() ? debug[24U + i] : 0)
+            << "}";
+    }
+    PrintStatus(rank, out.str());
+}
+
+bool ClearLocalPeerWindow(
+    int rank, const TileXR::CommArgs& commArgsHost, uint64_t offset, uint64_t bytes, const std::string& name)
+{
+    if (bytes == 0) {
+        return true;
+    }
+    void* localWindow = reinterpret_cast<void*>(commArgsHost.peerMems[rank] + offset);
+    return CheckAcl(rank, "aclrtMemset " + name, aclrtMemset(localWindow, static_cast<size_t>(bytes), 0,
+        static_cast<size_t>(bytes)));
+}
+
+void LaunchP2PKernel(
+    aclrtStream stream, GM_ADDR commArgsDev, GM_ADDR srcDev, GM_ADDR dstDev, GM_ADDR debugDev,
+    const TileXR::Demo::P2PPerfOptions& options, uint64_t dstOffset,
+    uint32_t transferBytes, uint32_t pattern, int32_t magic, int32_t step)
+{
+    const int32_t traffic = options.traffic == TileXR::Demo::P2PTraffic::BiDir ? 1 : 0;
+    const int32_t dataAsFlagStrict = GetEnvFlag("TILEXR_DATA_AS_FLAG_STRICT", false) ? 1 : 0;
+    if (options.transport == TileXR::Demo::P2PTransport::Memory) {
+        launch_tilexr_memory_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, debugDev,
+            options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic);
+        return;
+    }
+    if (options.transport == TileXR::Demo::P2PTransport::MemorySegmented ||
+        options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotate ||
+        options.transport == TileXR::Demo::P2PTransport::MemorySegmentedTrace ||
+        options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace) {
+        const bool rotateWindow =
+            options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotate ||
+            options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace;
+        const bool traceSegments =
+            options.transport == TileXR::Demo::P2PTransport::MemorySegmentedTrace ||
+            options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace;
+        launch_tilexr_memory_segmented_p2p_perf(
+            options.blockDim, stream, commArgsDev, srcDev, debugDev, options.srcRank, options.dstRank,
+            dstOffset, transferBytes, pattern, traffic,
+            static_cast<uint32_t>(traceSegments ? TileXR::Demo::kP2PMemoryTraceSegmentBytes :
+                TileXR::Demo::kP2PMemorySegmentBytes),
+            rotateWindow ? 1 : 0, traceSegments ? 8U : 0U);
+        return;
+    }
+    if (options.transport == TileXR::Demo::P2PTransport::MemoryConsume) {
+        launch_tilexr_memory_consume_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev,
+            options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic, magic, step);
+        return;
+    }
+    if (options.transport == TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered) {
+        launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(
+            options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev,
+            options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic, magic, step,
+            dataAsFlagStrict);
+        return;
+    }
+    if (options.transport == TileXR::Demo::P2PTransport::DataAsFlag) {
+        launch_tilexr_data_as_flag_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, dstDev, debugDev,
+            options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic);
+        return;
+    }
+    if (options.transport == TileXR::Demo::P2PTransport::DirectUrmaPostOnly) {
+        launch_tilexr_udma_p2p_post_only_perf(options.blockDim, stream, commArgsDev, srcDev, debugDev,
+            options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic);
+        return;
+    }
+    launch_tilexr_udma_p2p_perf(options.blockDim, stream, commArgsDev, srcDev, debugDev,
+        options.srcRank, options.dstRank, dstOffset, transferBytes, pattern, traffic);
+}
+
+bool RunP2PPerfMode(
+    int rank, int rankSize, TileXRCommPtr comm, const TileXR::CommArgs& commArgsHost,
+    GM_ADDR commArgsDev, aclrtStream stream,
+    const TileXR::Demo::P2PPerfOptions& options)
+{
+    std::string error;
+    if (!TileXR::Demo::ValidateP2PPerfOptions(options, rankSize, &error)) {
+        std::cerr << "[rank " << rank << "] ERROR: " << error << std::endl;
+        return false;
+    }
+
+    void* registeredMemory = nullptr;
+    uint32_t* debug = nullptr;
+    TileXRUDMAMemHandle udmaHandle = 0;
+    bool udmaRegistered = false;
+    aclrtEvent startEvent = nullptr;
+    aclrtEvent stopEvent = nullptr;
+
+    const bool useMemoryTransport = options.transport == TileXR::Demo::P2PTransport::Memory;
+    const bool validatePeerWindow =
+        options.transport == TileXR::Demo::P2PTransport::Memory ||
+        options.transport == TileXR::Demo::P2PTransport::MemorySegmented ||
+        options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotate ||
+        options.transport == TileXR::Demo::P2PTransport::MemorySegmentedTrace ||
+        options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace;
+    const bool useLegacyDataAsFlagTransport = options.transport == TileXR::Demo::P2PTransport::DataAsFlag;
+    const bool useIpcTransport = TileXR::Demo::P2PTransportUsesIpc(options.transport);
+    const bool bothRanksActive = TileXR::Demo::P2PTransportBothRanksActive(options.transport, options.traffic);
+    const uint64_t srcOffset = 0;
+    const uint64_t dstWindowBytes =
+        TileXR::Demo::P2PTransportWindowBytes(options.transport, options.maxBytes, options.blockDim);
+    const uint64_t dstOffset = useIpcTransport ? TileXR::IPC_DATA_OFFSET : dstWindowBytes;
+    const uint64_t localDstOffset = dstWindowBytes;
+    const uint64_t debugOffset = useIpcTransport ? localDstOffset + dstWindowBytes : dstOffset + dstWindowBytes;
+    const uint64_t payloadBytes = debugOffset + kP2PDebugWords * sizeof(uint32_t);
+    const uint64_t registeredPayloadBytes = payloadBytes;
+    const uint64_t registeredBytes = RoundUpToAlignment(registeredPayloadBytes, kUdmaRegistrationAlignment);
+
+    auto cleanup = [&]() {
+        if (startEvent != nullptr) {
+            CheckAcl(rank, "aclrtDestroyEvent start", aclrtDestroyEvent(startEvent));
+        }
+        if (stopEvent != nullptr) {
+            CheckAcl(rank, "aclrtDestroyEvent stop", aclrtDestroyEvent(stopEvent));
+        }
+        if (udmaRegistered) {
+            CheckTileXR(rank, "TileXRUDMAUnregister", TileXRUDMAUnregister(comm, udmaHandle));
+        }
+        if (registeredMemory != nullptr) {
+            PrintStatus(rank, "aclrtFree p2p registered memory");
+            aclrtFree(registeredMemory);
+        }
+    };
+
+    const std::string memoryName = useMemoryTransport ? "p2p memory local scratch" : "p2p registered memory";
+    if (!CheckAcl(rank, "aclrtMalloc " + memoryName,
+            aclrtMalloc(&registeredMemory, registeredBytes, ACL_MEM_MALLOC_HUGE_FIRST))) {
+        cleanup();
+        return false;
+    }
+    auto base = static_cast<uint8_t*>(registeredMemory);
+    auto srcDev = base + srcOffset;
+    auto dstDev = base + localDstOffset;
+    debug = reinterpret_cast<uint32_t*>(base + debugOffset);
+
+    if (!useIpcTransport) {
+        if (!CheckTileXR(rank, "TileXRUDMARegister p2p",
+                TileXRUDMARegister(comm, static_cast<GM_ADDR>(registeredMemory), registeredBytes, &udmaHandle))) {
+            cleanup();
+            return false;
+        }
+        udmaRegistered = true;
+    }
+    PrintStatus(rank, std::string("p2p transport=") + TileXR::Demo::P2PTransportName(options.transport) +
+        " traffic=" + TileXR::Demo::P2PTrafficName(options.traffic) +
+        " blockDim=" + std::to_string(options.blockDim) +
+        " memory base=" + std::to_string(reinterpret_cast<uintptr_t>(registeredMemory)) +
+        " bytes=" + std::to_string(registeredBytes) +
+        " srcOffset=" + std::to_string(srcOffset) +
+        " dstOffset=" + std::to_string(dstOffset) +
+        " debugOffset=" + std::to_string(debugOffset));
+
+    if (!CheckAcl(rank, "aclrtCreateEvent start", aclrtCreateEvent(&startEvent)) ||
+        !CheckAcl(rank, "aclrtCreateEvent stop", aclrtCreateEvent(&stopEvent))) {
+        cleanup();
+        return false;
+    }
+
+    bool ok = true;
+    int32_t launchMagic = kP2PMagicBase;
+    for (uint64_t bytes : TileXR::Demo::BuildP2PPerfSizeSweep(options)) {
+        const uint32_t transferBytes = static_cast<uint32_t>(bytes);
+        const uint32_t pattern = TileXR::Demo::P2PPattern(options.srcRank, options.dstRank, bytes);
+        const uint64_t transferWindowBytes =
+            TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes, options.blockDim);
+        std::vector<uint8_t> hostSrc(static_cast<size_t>(transferWindowBytes), 0);
+        std::vector<uint8_t> hostDst(static_cast<size_t>(transferWindowBytes), 0);
+        std::vector<uint32_t> hostDebug(kP2PDebugWords, 0);
+        TileXR::Demo::FillP2PPattern(hostSrc, pattern);
+
+        const bool initSrc = true;
+        const bool initDst = true;
+        if ((initSrc && !CopyHostToDevice(rank, srcDev, static_cast<size_t>(transferWindowBytes), hostSrc.data(),
+                static_cast<size_t>(transferWindowBytes), "p2p src")) ||
+            (initDst && !CopyHostToDevice(rank, dstDev, static_cast<size_t>(transferWindowBytes), hostDst.data(),
+                static_cast<size_t>(transferWindowBytes), "p2p dst")) ||
+            !CopyHostToDevice(rank, debug, hostDebug.size() * sizeof(uint32_t), hostDebug.data(),
+                hostDebug.size() * sizeof(uint32_t), "p2p debug")) {
+            ok = false;
+            break;
+        }
+        if (useIpcTransport && IsP2PReceiveRank(rank, options)) {
+            const uint64_t clearBytes = TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes);
+            if (!ClearLocalPeerWindow(rank, commArgsHost, dstOffset, clearBytes, "p2p ipc dst window")) {
+                ok = false;
+                break;
+            }
+        }
+        if (!DemoBarrierAll(rank, rankSize, "p2p initialized bytes=" + std::to_string(bytes))) {
+            ok = false;
+            break;
+        }
+
+        const int32_t warmupMagic = ++launchMagic;
+        for (int i = 0; i < options.warmupIters; ++i) {
+            if (useLegacyDataAsFlagTransport && IsP2PReceiveRank(rank, options) &&
+                !ClearLocalPeerWindow(rank, commArgsHost, dstOffset,
+                    TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes), "p2p data_as_flag warmup window")) {
+                ok = false;
+                break;
+            }
+            if (useLegacyDataAsFlagTransport &&
+                !DemoBarrierAll(rank, rankSize,
+                    "p2p data_as_flag warmup clear bytes=" + std::to_string(bytes) +
+                    " iter=" + std::to_string(i))) {
+                ok = false;
+                break;
+            }
+            LaunchP2PKernel(stream, commArgsDev, reinterpret_cast<GM_ADDR>(srcDev), reinterpret_cast<GM_ADDR>(dstDev),
+                reinterpret_cast<GM_ADDR>(debug), options, dstOffset, transferBytes, pattern, warmupMagic, i + 1);
+            if (!CheckAcl(rank, "aclrtSynchronizeStream p2p warmup", aclrtSynchronizeStream(stream))) {
+                ok = false;
+                break;
+            }
+        }
+        if (!ok || !DemoBarrierAll(rank, rankSize, "p2p warmup done bytes=" + std::to_string(bytes))) {
+            ok = false;
+            break;
+        }
+
+        if (useLegacyDataAsFlagTransport && IsP2PReceiveRank(rank, options) &&
+            !ClearLocalPeerWindow(rank, commArgsHost, dstOffset,
+                TileXR::Demo::P2PTransportWindowBytes(options.transport, bytes), "p2p data_as_flag measured window")) {
+            ok = false;
+            break;
+        }
+        if (useLegacyDataAsFlagTransport &&
+            !DemoBarrierAll(rank, rankSize,
+                "p2p data_as_flag measured clear bytes=" + std::to_string(bytes))) {
+            ok = false;
+            break;
+        }
+        const bool resetDebugBeforeMeasured =
+            options.transport == TileXR::Demo::P2PTransport::DirectUrmaPostOnly ||
+            options.transport == TileXR::Demo::P2PTransport::MemorySegmentedTrace ||
+            options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace;
+        if (resetDebugBeforeMeasured &&
+            !CopyHostToDevice(rank, debug, hostDebug.size() * sizeof(uint32_t), hostDebug.data(),
+                hostDebug.size() * sizeof(uint32_t), "p2p post-only debug reset")) {
+            ok = false;
+            break;
+        }
+
+        if (!CheckAcl(rank, "aclrtRecordEvent start", aclrtRecordEvent(startEvent, stream))) {
+            ok = false;
+            break;
+        }
+        const int32_t measuredMagic = ++launchMagic;
+        for (int i = 0; i < options.iters; ++i) {
+            LaunchP2PKernel(stream, commArgsDev, reinterpret_cast<GM_ADDR>(srcDev), reinterpret_cast<GM_ADDR>(dstDev),
+                reinterpret_cast<GM_ADDR>(debug), options, dstOffset, transferBytes, pattern, measuredMagic, i + 1);
+        }
+        if (!ok ||
+            !CheckAcl(rank, "aclrtRecordEvent stop", aclrtRecordEvent(stopEvent, stream)) ||
+            !CheckAcl(rank, "aclrtSynchronizeStream p2p measured", aclrtSynchronizeStream(stream))) {
+            ok = false;
+            break;
+        }
+
+        float elapsedMs = 0.0f;
+        if (!CheckAcl(rank, "aclrtEventElapsedTime p2p", aclrtEventElapsedTime(&elapsedMs, startEvent, stopEvent))) {
+            ok = false;
+            break;
+        }
+        if (!DemoBarrierAll(rank, rankSize, "p2p measured done bytes=" + std::to_string(bytes))) {
+            ok = false;
+            break;
+        }
+
+        uint64_t errors = 0;
+        const bool skipPayloadCheck =
+            options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotate ||
+            options.transport == TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace;
+        if (IsP2PReceiveRank(rank, options) && options.check && !skipPayloadCheck) {
+            const void* validateSrc = validatePeerWindow ?
+                reinterpret_cast<void*>(commArgsHost.peerMems[rank] + TileXR::IPC_DATA_OFFSET) :
+                static_cast<const void*>(dstDev);
+            if (!CopyDeviceToHost(rank, hostDst.data(), static_cast<size_t>(transferWindowBytes), validateSrc,
+                    static_cast<size_t>(transferWindowBytes), "p2p dst")) {
+                ok = false;
+                break;
+            }
+            errors = TileXR::Demo::CountP2PTransportMismatches(
+                hostDst, pattern, bytes, options.transport, options.blockDim);
+        }
+        if (IsP2PSourceRank(rank, options) || bothRanksActive) {
+            if (!CopyDeviceToHost(rank, hostDebug.data(), hostDebug.size() * sizeof(uint32_t), debug,
+                    hostDebug.size() * sizeof(uint32_t), "p2p debug")) {
+                ok = false;
+                break;
+            }
+        }
+
+        TileXR::Demo::P2PRankStatus localStatus;
+        localStatus.status = (IsP2PSourceRank(rank, options) || bothRanksActive) ?
+            FoldP2PDebugStatus(hostDebug, options.blockDim) : 0;
+        localStatus.errors = IsP2PReceiveRank(rank, options) ? errors : 0;
+        localStatus.elapsedMs = elapsedMs;
+        if (options.transport == TileXR::Demo::P2PTransport::DirectUrmaPostOnly &&
+            IsP2PSourceRank(rank, options) && hostDebug.size() > 31) {
+            const uint64_t postCycleSum = (static_cast<uint64_t>(hostDebug[31]) << 32) | hostDebug[30];
+            localStatus.avgUsOverride = options.iters > 0 ?
+                static_cast<double>(postCycleSum) /
+                    static_cast<double>(options.iters) /
+                    static_cast<double>(kA5CycleToUsDivisor) :
+                0.0;
+        }
+        if (IsP2PSourceRank(rank, options) &&
+            (GetEnvFlag("TILEXR_P2P_DEBUG_SUMMARY", false) || localStatus.status != 0 || errors != 0)) {
+            PrintP2PDebugSummary(rank, hostDebug, options.blockDim);
+        }
+        if (!options.logDir.empty()) {
+            std::string statusPath = options.logDir + "/p2p_rank_" + std::to_string(rank) +
+                "_" + std::to_string(bytes) + ".status";
+            std::ostringstream statusText;
+            statusText << localStatus.status << ' ' << localStatus.errors << ' '
+                       << localStatus.elapsedMs << ' ' << localStatus.avgUsOverride << '\n';
+            if (!WriteTextFile(statusPath, statusText.str())) {
+                std::cerr << "[rank " << rank << "] ERROR: failed to write " << statusPath << std::endl;
+                ok = false;
+                break;
+            }
+        }
+        if (!DemoBarrierAll(rank, rankSize, "p2p status files bytes=" + std::to_string(bytes))) {
+            ok = false;
+            break;
+        }
+
+        if (rank == 0) {
+            TileXR::Demo::P2PRankStatus srcStatus = localStatus;
+            TileXR::Demo::P2PRankStatus dstStatus = localStatus;
+            if (options.srcRank != 0) {
+                std::string srcPath = options.logDir + "/p2p_rank_" + std::to_string(options.srcRank) +
+                    "_" + std::to_string(bytes) + ".status";
+                if (!ReadP2PStatusFile(srcPath, &srcStatus)) {
+                    std::cerr << "[rank " << rank << "] ERROR: failed to read " << srcPath << std::endl;
+                    ok = false;
+                    break;
+                }
+            }
+            if (options.dstRank != 0) {
+                std::string dstPath = options.logDir + "/p2p_rank_" + std::to_string(options.dstRank) +
+                    "_" + std::to_string(bytes) + ".status";
+                if (!ReadP2PStatusFile(dstPath, &dstStatus)) {
+                    std::cerr << "[rank " << rank << "] ERROR: failed to read " << dstPath << std::endl;
+                    ok = false;
+                    break;
+                }
+            }
+            TileXR::Demo::P2PPerfRow row =
+                TileXR::Demo::BuildP2PPerfRow(options, rankSize, bytes, srcStatus, dstStatus);
+            if (!AppendP2PPerfCsvRow(options, row)) {
+                std::cerr << "[rank " << rank << "] ERROR: failed to append CSV " << options.csvPath << std::endl;
+                ok = false;
+                break;
+            }
+            std::cout << "[rank " << rank << "] p2p row: "
+                      << TileXR::Demo::FormatP2PPerfCsvRow(row);
+            if (row.status != 0 || row.errors != 0) {
+                ok = false;
+            }
+        }
+        if (!DemoBarrierAll(rank, rankSize, "p2p csv written bytes=" + std::to_string(bytes))) {
+            ok = false;
+            break;
+        }
+    }
+
+    cleanup();
+    return ok;
+}
+
 void Cleanup(
     TileXRCommPtr comm, aclrtStream stream, void* registeredMemory, int32_t* debug, int rank, int deviceId)
 {
@@ -390,6 +944,29 @@ int main(int argc, char** argv)
     int32_t elementsPerRank = argc > argIndex ? std::atoi(argv[argIndex++]) : kDefaultElementsPerRank;
     int npuCount = argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_DEMO_NPUS", 8);
     int firstNpu = argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_DEMO_FIRST_NPU", 0);
+    TileXR::Demo::P2PPerfOptions p2pOptions;
+    if (testType == 4) {
+        p2pOptions.srcRank = argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_P2P_SRC_RANK", 0);
+        p2pOptions.dstRank = argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_P2P_DST_RANK", 1);
+        p2pOptions.minBytes = argc > argIndex ? std::strtoull(argv[argIndex++], nullptr, 10) :
+            static_cast<uint64_t>(GetEnvInt("TILEXR_P2P_MIN_BYTES", 4096));
+        p2pOptions.maxBytes = argc > argIndex ? std::strtoull(argv[argIndex++], nullptr, 10) :
+            static_cast<uint64_t>(GetEnvInt("TILEXR_P2P_MAX_BYTES", 4096));
+        p2pOptions.stepFactor = argc > argIndex ? std::strtoull(argv[argIndex++], nullptr, 10) :
+            static_cast<uint64_t>(GetEnvInt("TILEXR_P2P_STEP_FACTOR", 2));
+        p2pOptions.iters = argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_P2P_ITERS", 100);
+        p2pOptions.warmupIters = argc > argIndex ? std::atoi(argv[argIndex++]) :
+            GetEnvInt("TILEXR_P2P_WARMUP_ITERS", 10);
+        p2pOptions.check = (argc > argIndex ? std::atoi(argv[argIndex++]) : GetEnvInt("TILEXR_P2P_CHECK", 1)) != 0;
+        p2pOptions.csvPath = argc > argIndex ? argv[argIndex++] : GetEnvString("TILEXR_P2P_CSV", "");
+        p2pOptions.logDir = argc > argIndex ? argv[argIndex++] : GetEnvString("TILEXR_P2P_LOG_DIR", "");
+        std::string transportName = argc > argIndex ? argv[argIndex++] : GetEnvString("TILEXR_P2P_TRANSPORT", "direct_urma");
+        p2pOptions.transport = TileXR::Demo::ParseP2PTransport(transportName);
+        p2pOptions.blockDim = argc > argIndex ? static_cast<uint32_t>(std::strtoul(argv[argIndex++], nullptr, 10)) :
+            static_cast<uint32_t>(GetEnvInt("TILEXR_P2P_BLOCK_DIM", 1));
+        std::string trafficName = argc > argIndex ? argv[argIndex++] : GetEnvString("TILEXR_P2P_TRAFFIC", "unidir");
+        p2pOptions.traffic = TileXR::Demo::ParseP2PTraffic(trafficName);
+    }
     int deviceId = GetDeviceIdFromEnv(rank, npuCount, firstNpu);
 
     std::cout << "========================================" << std::endl;
@@ -398,6 +975,21 @@ int main(int argc, char** argv)
     std::cout << "[rank " << rank << "] argv: rankSize=" << rankSize << " rank=" << rank
               << " testType=" << testType << " elementsPerRank=" << elementsPerRank
               << " npuCount=" << npuCount << " firstNpu=" << firstNpu << std::endl;
+    if (testType == 4) {
+        std::cout << "[rank " << rank << "] p2p: src=" << p2pOptions.srcRank
+                  << " dst=" << p2pOptions.dstRank
+                  << " minBytes=" << p2pOptions.minBytes
+                  << " maxBytes=" << p2pOptions.maxBytes
+                  << " stepFactor=" << p2pOptions.stepFactor
+                  << " iters=" << p2pOptions.iters
+                  << " warmupIters=" << p2pOptions.warmupIters
+                  << " check=" << (p2pOptions.check ? 1 : 0)
+                  << " transport=" << TileXR::Demo::P2PTransportName(p2pOptions.transport)
+                  << " blockDim=" << p2pOptions.blockDim
+                  << " traffic=" << TileXR::Demo::P2PTrafficName(p2pOptions.traffic)
+                  << " csv=" << p2pOptions.csvPath
+                  << " logDir=" << p2pOptions.logDir << std::endl;
+    }
     std::cout << "[rank " << rank << "] PID=" << getpid()
               << " TILEXR_COMM_ID=" << (std::getenv("TILEXR_COMM_ID") ? std::getenv("TILEXR_COMM_ID") : "<unset>")
               << " LD_LIBRARY_PATH=" << (std::getenv("LD_LIBRARY_PATH") ? std::getenv("LD_LIBRARY_PATH") : "<unset>")
@@ -439,13 +1031,33 @@ int main(int argc, char** argv)
     }
     PrintCommArgs(rank, *commArgsHost, commArgsDev);
 
-    if ((commArgsHost->extraFlag & TileXR::ExtraFlag::UDMA) == 0 || commArgsHost->udmaInfoPtr == nullptr) {
+    if (testType == 4 &&
+        TileXR::Demo::P2PTransportUsesIpc(p2pOptions.transport) &&
+        !CheckPeerMemsReady(rank, rankSize, *commArgsHost)) {
+        Cleanup(comm, stream, registeredMemory, debug, rank, deviceId);
+        return 1;
+    }
+
+    if ((testType != 4 ||
+            p2pOptions.transport == TileXR::Demo::P2PTransport::DirectUrma) &&
+        ((commArgsHost->extraFlag & TileXR::ExtraFlag::UDMA) == 0 || commArgsHost->udmaInfoPtr == nullptr)) {
         std::cerr << "[rank " << rank << "] ERROR: TileXR UDMA is not enabled. "
                   << "Check A5/Ascend950 hardware support, CANN/driver setup, and LD_LIBRARY_PATH." << std::endl;
         Cleanup(comm, stream, registeredMemory, debug, rank, deviceId);
         return 1;
     }
 
+    if (testType == 4) {
+        bool ok = RunP2PPerfMode(rank, rankSize, comm, *commArgsHost, commArgsDev, stream, p2pOptions);
+        Cleanup(comm, stream, registeredMemory, debug, rank, deviceId);
+        if (!ok) {
+            std::cerr << "[rank " << rank << "] TileXR UDMA P2P perf failed" << std::endl;
+            return 1;
+        }
+        std::cout << "[rank " << rank << "] TileXR UDMA P2P perf success" << std::endl;
+        return 0;
+    }
+
     size_t dataCount = static_cast<size_t>(rankSize) * elementsPerRank;
     size_t dataBytes = dataCount * sizeof(int32_t);
     size_t signalBytes = static_cast<size_t>(rankSize) * sizeof(uint64_t);
diff --git a/tests/udma/demo/tilexr_udma_demo_kernel.cpp b/tests/udma/demo/tilexr_udma_demo_kernel.cpp
index a49ca3c..b452260 100644
--- a/tests/udma/demo/tilexr_udma_demo_kernel.cpp
+++ b/tests/udma/demo/tilexr_udma_demo_kernel.cpp
@@ -4,9 +4,223 @@
  */
 
 #include "kernel_operator.h"
+#include "tilexr_data_as_flag.h"
+#include "tilexr_sync.h"
 #include "tilexr_udma.h"
 
 constexpr int32_t TILEXR_UDMA_DEMO_MAGIC = 0x5444554d; // "TDUM"
+constexpr uint32_t TILEXR_UDMA_DEMO_P2P_UB_BYTES = 64 * 1024;
+constexpr uint32_t TILEXR_UDMA_DEMO_P2P_SYNC_UB_BYTES = 4 * 1024;
+constexpr uint32_t TILEXR_UDMA_DEMO_P2P_COPY_TILE_BYTES =
+    TILEXR_UDMA_DEMO_P2P_UB_BYTES - TILEXR_UDMA_DEMO_P2P_SYNC_UB_BYTES;
+constexpr uint32_t TILEXR_UDMA_DEMO_P2P_MAX_DEBUG_BLOCKS = 16;
+
+__aicore__ inline uint32_t TileXRUdmaDemoCeilDiv(uint32_t value, uint32_t divisor)
+{
+    return divisor == 0 ? 0 : (value + divisor - 1) / divisor;
+}
+
+__aicore__ inline void TileXRUdmaDemoBlockSlice(
+    uint32_t total, uint32_t blockNum, uint32_t blockIdx, uint32_t& offset, uint32_t& bytes)
+{
+    uint32_t perBlock = TileXRUdmaDemoCeilDiv(total, blockNum);
+    perBlock = ((perBlock + TileXR::BLOCK_UNIT_BYTE - 1) / TileXR::BLOCK_UNIT_BYTE) * TileXR::BLOCK_UNIT_BYTE;
+    offset = blockIdx * perBlock;
+    if (offset >= total) {
+        offset = total;
+        bytes = 0;
+        return;
+    }
+    bytes = total - offset;
+    if (bytes > perBlock) {
+        bytes = perBlock;
+    }
+}
+
+__aicore__ inline void TileXRUdmaDemoWqeSlice(
+    uint32_t total, uint32_t wqeCount, uint32_t wqeIdx, uint32_t& offset, uint32_t& bytes)
+{
+    uint32_t perWqe = TileXRUdmaDemoCeilDiv(total, wqeCount);
+    perWqe = ((perWqe + TileXR::BLOCK_UNIT_BYTE - 1) / TileXR::BLOCK_UNIT_BYTE) * TileXR::BLOCK_UNIT_BYTE;
+    offset = wqeIdx * perWqe;
+    if (wqeCount == 0 || offset >= total) {
+        offset = total;
+        bytes = 0;
+        return;
+    }
+    bytes = total - offset;
+    if (bytes > perWqe) {
+        bytes = perWqe;
+    }
+}
+
+__aicore__ inline bool TileXRUdmaDemoResolvePeer(
+    int32_t rank, int32_t srcRank, int32_t dstRank, int32_t traffic, int32_t& peer)
+{
+    const bool bidir = traffic == 1;
+    if (bidir) {
+        if (rank == srcRank) {
+            peer = dstRank;
+            return true;
+        }
+        if (rank == dstRank) {
+            peer = srcRank;
+            return true;
+        }
+        peer = -1;
+        return false;
+    }
+    peer = dstRank;
+    return rank == srcRank;
+}
+
+__aicore__ inline bool TileXRUdmaDemoResolveDataAsFlagRole(
+    int32_t rank, int32_t srcRank, int32_t dstRank, int32_t traffic,
+    bool& isSender, bool& isReceiver, int32_t& peer)
+{
+    const bool bidir = traffic == 1;
+    isSender = false;
+    isReceiver = false;
+    peer = -1;
+    if (bidir) {
+        if (rank == srcRank) {
+            isSender = true;
+            isReceiver = true;
+            peer = dstRank;
+            return true;
+        }
+        if (rank == dstRank) {
+            isSender = true;
+            isReceiver = true;
+            peer = srcRank;
+            return true;
+        }
+        return false;
+    }
+    if (rank == srcRank) {
+        isSender = true;
+        peer = dstRank;
+        return true;
+    }
+    if (rank == dstRank) {
+        isReceiver = true;
+        peer = srcRank;
+        return true;
+    }
+    return false;
+}
+
+__aicore__ inline bool TileXRUdmaDemoResolveMemoryConsumeRole(
+    int32_t rank, int32_t srcRank, int32_t dstRank, int32_t traffic,
+    bool& isSender, bool& isReceiver, int32_t& peer)
+{
+    const bool bidir = traffic == 1;
+    isSender = false;
+    isReceiver = false;
+    peer = -1;
+    if (bidir) {
+        if (rank == srcRank) {
+            isSender = true;
+            isReceiver = true;
+            peer = dstRank;
+            return true;
+        }
+        if (rank == dstRank) {
+            isSender = true;
+            isReceiver = true;
+            peer = srcRank;
+            return true;
+        }
+        return false;
+    }
+    if (rank == srcRank) {
+        isSender = true;
+        peer = dstRank;
+        return true;
+    }
+    if (rank == dstRank) {
+        isReceiver = true;
+        peer = srcRank;
+        return true;
+    }
+    return false;
+}
+
+__aicore__ inline uint32_t TileXRUdmaDemoFoldDebugStatus(
+    __gm__ uint32_t* debug, uint32_t blockIdx, uint32_t status)
+{
+    if (debug != nullptr && blockIdx < TILEXR_UDMA_DEMO_P2P_MAX_DEBUG_BLOCKS) {
+        debug[8 + blockIdx] = status;
+    }
+    return status;
+}
+
+__aicore__ inline void TileXRUdmaDemoAddCycleSum(__gm__ uint32_t* debug, uint32_t lowIdx, uint64_t cycles)
+{
+    if (debug == nullptr) {
+        return;
+    }
+    uint64_t sum = (static_cast<uint64_t>(debug[lowIdx + 1]) << 32) | debug[lowIdx];
+    sum += cycles;
+    debug[lowIdx] = static_cast<uint32_t>(sum & 0xffffffffu);
+    debug[lowIdx + 1] = static_cast<uint32_t>(sum >> 32);
+}
+
+__aicore__ inline void TileXRUdmaDemoDataAsFlagSlice(
+    uint32_t bytes, uint32_t blockNum, uint32_t blockIdx,
+    uint32_t& payloadOffset, uint32_t& sliceBytes, uint32_t& dataAsFlagOffset)
+{
+    uint32_t totalBlocks = TileXR::DataAsFlagBlockCountForPayloadBytes(bytes);
+    uint32_t perBlock = TileXRUdmaDemoCeilDiv(totalBlocks, blockNum);
+    uint32_t startBlock = blockIdx * perBlock;
+    if (blockNum == 0 || perBlock == 0 || startBlock >= totalBlocks) {
+        payloadOffset = bytes;
+        sliceBytes = 0;
+        dataAsFlagOffset = totalBlocks * TileXR::DATA_AS_FLAG_BLOCK_BYTES;
+        return;
+    }
+    uint32_t blockCount = totalBlocks - startBlock;
+    if (blockCount > perBlock) {
+        blockCount = perBlock;
+    }
+    payloadOffset = startBlock * TileXR::DATA_AS_FLAG_PAYLOAD_BYTES;
+    uint32_t maxPayloadBytes = blockCount * TileXR::DATA_AS_FLAG_PAYLOAD_BYTES;
+    sliceBytes = bytes - payloadOffset;
+    if (sliceBytes > maxPayloadBytes) {
+        sliceBytes = maxPayloadBytes;
+    }
+    dataAsFlagOffset = startBlock * TileXR::DATA_AS_FLAG_BLOCK_BYTES;
+}
+
+__aicore__ inline void TileXRUdmaDemoCopyBytesGmToGm(
+    GM_ADDR dstGM, GM_ADDR srcGM, AscendC::TBuf<AscendC::QuePosition::VECCALC>& tBuf, uint32_t bytes)
+{
+    if (dstGM == nullptr || srcGM == nullptr || bytes == 0) {
+        return;
+    }
+
+    AscendC::LocalTensor<uint8_t> local =
+        tBuf.GetWithOffset<uint8_t>(TILEXR_UDMA_DEMO_P2P_COPY_TILE_BYTES, TILEXR_UDMA_DEMO_P2P_SYNC_UB_BYTES);
+    AscendC::GlobalTensor<uint8_t> src;
+    AscendC::GlobalTensor<uint8_t> dst;
+    src.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t*>(srcGM), bytes);
+    dst.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t*>(dstGM), bytes);
+
+    for (uint32_t copied = 0; copied < bytes; copied += TILEXR_UDMA_DEMO_P2P_COPY_TILE_BYTES) {
+        uint32_t tileBytes = bytes - copied;
+        if (tileBytes > TILEXR_UDMA_DEMO_P2P_COPY_TILE_BYTES) {
+            tileBytes = TILEXR_UDMA_DEMO_P2P_COPY_TILE_BYTES;
+        }
+        AscendC::DataCopyPadParams padParams {false, 0, 0, 0};
+        AscendC::DataCopyParams copyParams {1, static_cast<uint16_t>(tileBytes), 0, 0};
+        AscendC::DataCopyPad(local, src[copied], copyParams, padParams);
+        AscendC::SetFlag<AscendC::HardEvent::MTE2_MTE3>(EVENT_ID0);
+        AscendC::WaitFlag<AscendC::HardEvent::MTE2_MTE3>(EVENT_ID0);
+        AscendC::DataCopyPad(dst[copied], local, copyParams);
+        AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
+        AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
+    }
+}
 
 extern "C" __global__ __aicore__ void tilexr_udma_all_gather_kernel(
     GM_ADDR commArgsGM, GM_ADDR dataGM, GM_ADDR debugGM, int32_t elementsPerRank)
@@ -110,6 +324,561 @@ extern "C" __global__ __aicore__ void tilexr_udma_registered_smoke_kernel(
     }
 }
 
+extern "C" __global__ __aicore__ void tilexr_udma_p2p_perf_kernel(
+    GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR debugGM,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic)
+{
+    auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM);
+    auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM);
+    auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM);
+
+    int32_t rank = args->rank;
+    uint32_t blockIdx = AscendC::GetBlockIdx();
+    uint32_t blockNum = AscendC::GetBlockNum();
+    bool enabled = TileXR::UDMARegistryEnabled(args);
+    uint32_t qpNum = enabled ? TileXR::GetUDMAInfo(args)->qpNum : 0;
+    if (debug != nullptr && blockIdx == 0) {
+        debug[0] = TILEXR_UDMA_DEMO_MAGIC;
+        debug[1] = rank;
+        debug[2] = enabled ? 1 : 0;
+        debug[3] = bytes;
+        debug[4] = pattern;
+        debug[5] = 0xffffffffu;
+        debug[6] = blockNum;
+        debug[7] = qpNum;
+    }
+
+    int32_t peer = -1;
+    if (!enabled || blockNum == 0 || qpNum == 0 ||
+        !TileXRUdmaDemoResolvePeer(rank, srcRank, dstRank, traffic, peer)) {
+        return;
+    }
+    uint32_t jettyCount = blockNum < qpNum ? blockNum : qpNum;
+    if (blockIdx >= jettyCount) {
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+        return;
+    }
+
+    uint32_t offset = 0;
+    uint32_t sliceBytes = 0;
+    TileXRUdmaDemoWqeSlice(bytes, jettyCount, blockIdx, offset, sliceBytes);
+    if (debug != nullptr && blockIdx < 8) {
+        debug[16 + blockIdx] = offset;
+        debug[24 + blockIdx] = sliceBytes;
+    }
+    if (sliceBytes == 0) {
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+        return;
+    }
+    TileXR::UDMAPutNbiQp<uint8_t>(args, peer, blockIdx, src + offset, dstByteOffset + offset, sliceBytes);
+    uint32_t status = TileXR::UDMAQuietStatusQp(args, peer, blockIdx);
+    TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status);
+    if (debug != nullptr && blockIdx == 0) {
+        debug[5] = status;
+        debug[15] = jettyCount;
+    }
+}
+
+extern "C" __global__ __aicore__ void tilexr_udma_p2p_post_only_perf_kernel(
+    GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR debugGM,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic)
+{
+    auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM);
+    auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM);
+    auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM);
+
+    int32_t rank = args->rank;
+    uint32_t blockIdx = AscendC::GetBlockIdx();
+    uint32_t blockNum = AscendC::GetBlockNum();
+    bool enabled = TileXR::UDMARegistryEnabled(args);
+    uint32_t qpNum = enabled ? TileXR::GetUDMAInfo(args)->qpNum : 0;
+    if (debug != nullptr && blockIdx == 0) {
+        debug[0] = TILEXR_UDMA_DEMO_MAGIC;
+        debug[1] = rank;
+        debug[2] = enabled ? 1 : 0;
+        debug[3] = bytes;
+        debug[4] = pattern;
+        debug[5] = 0xffffffffu;
+        debug[6] = blockNum;
+        debug[7] = qpNum;
+    }
+
+    int32_t peer = -1;
+    if (!enabled || blockNum == 0 || qpNum == 0 ||
+        !TileXRUdmaDemoResolvePeer(rank, srcRank, dstRank, traffic, peer)) {
+        return;
+    }
+    uint32_t jettyCount = blockNum < qpNum ? blockNum : qpNum;
+    if (blockIdx >= jettyCount) {
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+        return;
+    }
+
+    uint32_t offset = 0;
+    uint32_t sliceBytes = 0;
+    TileXRUdmaDemoWqeSlice(bytes, jettyCount, blockIdx, offset, sliceBytes);
+    if (debug != nullptr && blockIdx < 8) {
+        debug[16 + blockIdx] = offset;
+        debug[24 + blockIdx] = sliceBytes;
+    }
+    if (sliceBytes == 0) {
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+        return;
+    }
+
+    uint64_t startCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
+    auto udmaInfo = TileXR::GetUDMAInfo(args);
+    auto registry = TileXR::GetUDMARegistry(args);
+    if (!TileXR::UDMARegisteredRangeValid(registry, peer, dstByteOffset + offset, sliceBytes)) {
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0xffffffffu);
+        return;
+    }
+    uint64_t validateCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
+    __gm__ TileXR::UDMAWQCtx* qpCtxEntry = TileXR::UDMAGetWQCtx(udmaInfo, peer, blockIdx);
+    uint32_t wqeSize = 1U << qpCtxEntry->baseBkShift;
+    uint32_t curHead = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->headAddr), 0);
+    uint32_t wqeCnt = ld_dev(reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->wqeCntAddr), 0);
+    TileXR::UDMAPollCQWhenSQOverflow(udmaInfo, qpCtxEntry, wqeCnt, peer, blockIdx);
+    __gm__ TileXR::UDMAMemInfo* remoteMemInfo = TileXR::UDMAGetRemoteMemInfo(udmaInfo, peer, blockIdx);
+    __gm__ uint8_t* remoteAddr = TileXR::UDMARegisteredRemoteAddr(registry, peer, dstByteOffset + offset);
+    __gm__ uint8_t* wqeAddr =
+        reinterpret_cast<__gm__ uint8_t*>(qpCtxEntry->bufAddr + wqeSize * (curHead % TileXR::TILEXR_UDMA_SQ_BB_COUNT));
+    __gm__ TileXR::UDMASqeCtx* sqeCtx = reinterpret_cast<__gm__ TileXR::UDMASqeCtx*>(wqeAddr);
+    uint64_t ctxCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
+    TileXR::UDMAFillSqeCtx(sqeCtx, remoteAddr, remoteMemInfo, curHead, TileXR::UDMAOpcode::WRITE, nullptr);
+    uint64_t sqeCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
+    __gm__ TileXR::UDMASgeCtx* sgeCtx =
+        reinterpret_cast<__gm__ TileXR::UDMASgeCtx*>(TileXR::UDMAGetSgeCtxAddr(wqeAddr, TileXR::UDMAOpcode::WRITE));
+    TileXR::UDMAFillSgeCtx(sgeCtx, sliceBytes, src + offset);
+    uint64_t sgeCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
+    uint32_t wqeBbCnt = TileXR::UDMAWqeBBCnt(TileXR::UDMAOpcode::WRITE);
+    TileXR::UDMACleanCacheLines(wqeAddr, wqeSize * wqeBbCnt);
+    uint64_t cleanCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
+    curHead += wqeBbCnt;
+    TileXR::UDMAPostSendUpdateInfo(curHead, qpCtxEntry);
+    ++wqeCnt;
+    st_dev(wqeCnt, reinterpret_cast<__gm__ uint32_t*>(qpCtxEntry->wqeCntAddr), 0);
+    uint64_t endCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
+    uint32_t status = TileXR::UDMAQuietStatusQp(args, peer, blockIdx);
+    uint64_t quietCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
+    TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status);
+    if (debug != nullptr && blockIdx == 0) {
+        uint64_t delta = endCycle - startCycle;
+        TileXRUdmaDemoAddCycleSum(debug, 30, delta);
+        TileXRUdmaDemoAddCycleSum(debug, 32, validateCycle - startCycle);
+        TileXRUdmaDemoAddCycleSum(debug, 34, ctxCycle - validateCycle);
+        TileXRUdmaDemoAddCycleSum(debug, 36, sqeCycle - ctxCycle);
+        TileXRUdmaDemoAddCycleSum(debug, 38, sgeCycle - sqeCycle);
+        TileXRUdmaDemoAddCycleSum(debug, 40, cleanCycle - sgeCycle);
+        TileXRUdmaDemoAddCycleSum(debug, 42, endCycle - cleanCycle);
+        TileXRUdmaDemoAddCycleSum(debug, 44, quietCycle - endCycle);
+        debug[5] = status;
+        debug[15] = jettyCount;
+        debug[28] = static_cast<uint32_t>(delta & 0xffffffffu);
+        debug[29] = static_cast<uint32_t>(delta >> 32);
+    }
+}
+
+extern "C" __global__ __aicore__ void tilexr_memory_p2p_perf_kernel(
+    GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR debugGM,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic)
+{
+    if constexpr (g_coreType == AscendC::AIV) {
+        auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM);
+        auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM);
+
+        int32_t rank = args->rank;
+        uint32_t blockIdx = AscendC::GetBlockIdx();
+        uint32_t blockNum = AscendC::GetBlockNum();
+        if (debug != nullptr && blockIdx == 0) {
+            debug[0] = TILEXR_UDMA_DEMO_MAGIC;
+            debug[1] = rank;
+            debug[2] = 1;
+            debug[3] = bytes;
+            debug[4] = pattern;
+            debug[5] = 0xffffffffu;
+        }
+
+        int32_t peer = -1;
+        if (blockNum == 0 || !TileXRUdmaDemoResolvePeer(rank, srcRank, dstRank, traffic, peer) ||
+            peer < 0 || peer >= args->rankSize) {
+            return;
+        }
+
+        AscendC::GlobalTensor<GM_ADDR> peerMems;
+        peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE);
+        GM_ADDR dstBase = peerMems.GetValue(peer);
+        if (dstBase == nullptr) {
+            uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2);
+            if (debug != nullptr && blockIdx == 0) {
+                debug[5] = status;
+            }
+            return;
+        }
+
+        uint32_t offset = 0;
+        uint32_t sliceBytes = 0;
+        TileXRUdmaDemoBlockSlice(bytes, blockNum, blockIdx, offset, sliceBytes);
+        AscendC::TPipe pipe;
+        AscendC::TBuf<AscendC::QuePosition::VECCALC> tBuf;
+        pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES);
+        TileXRUdmaDemoCopyBytesGmToGm(
+            dstBase + dstByteOffset + offset, srcGM + offset, tBuf, sliceBytes);
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+        if (debug != nullptr && blockIdx == 0) {
+            debug[5] = 0;
+        }
+    }
+}
+
+extern "C" __global__ __aicore__ void tilexr_memory_segmented_p2p_perf_kernel(
+    GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR debugGM,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, uint32_t segmentBytes,
+    int32_t rotateWindow, uint32_t traceSegments)
+{
+    if constexpr (g_coreType == AscendC::AIV) {
+        auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM);
+        auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM);
+
+        int32_t rank = args->rank;
+        uint32_t blockIdx = AscendC::GetBlockIdx();
+        uint32_t blockNum = AscendC::GetBlockNum();
+        if (debug != nullptr && blockIdx == 0) {
+            debug[0] = TILEXR_UDMA_DEMO_MAGIC;
+            debug[1] = rank;
+            debug[2] = 1;
+            debug[3] = bytes;
+            debug[4] = pattern;
+            debug[5] = 0xffffffffu;
+            debug[6] = segmentBytes;
+            debug[7] = static_cast<uint32_t>(rotateWindow != 0);
+            debug[46] = traceSegments;
+        }
+
+        int32_t peer = -1;
+        if (blockNum == 0 || segmentBytes == 0U ||
+            !TileXRUdmaDemoResolvePeer(rank, srcRank, dstRank, traffic, peer) ||
+            peer < 0 || peer >= args->rankSize) {
+            return;
+        }
+
+        AscendC::GlobalTensor<GM_ADDR> peerMems;
+        peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE);
+        GM_ADDR dstBase = peerMems.GetValue(peer);
+        if (dstBase == nullptr) {
+            uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2);
+            if (debug != nullptr && blockIdx == 0) {
+                debug[5] = status;
+            }
+            return;
+        }
+
+        uint32_t offset = 0;
+        uint32_t sliceBytes = 0;
+        TileXRUdmaDemoBlockSlice(bytes, blockNum, blockIdx, offset, sliceBytes);
+        AscendC::TPipe pipe;
+        AscendC::TBuf<AscendC::QuePosition::VECCALC> tBuf;
+        pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES);
+
+        uint32_t copiedInSlice = 0;
+        uint32_t traceIndex = 0;
+        while (copiedInSlice < sliceBytes) {
+            uint32_t chunkBytes = sliceBytes - copiedInSlice;
+            if (chunkBytes > segmentBytes) {
+                chunkBytes = segmentBytes;
+            }
+            const uint32_t srcOffset = offset + copiedInSlice;
+            uint32_t dstInWindow = srcOffset;
+            if (rotateWindow != 0) {
+                dstInWindow = srcOffset % segmentBytes;
+            }
+            const bool recordTrace = debug != nullptr && blockIdx == 0 && traceIndex < traceSegments && traceIndex < 8U;
+            uint64_t startCycle = 0U;
+            if (recordTrace) {
+                startCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
+            }
+            TileXRUdmaDemoCopyBytesGmToGm(
+                dstBase + dstByteOffset + dstInWindow, srcGM + srcOffset, tBuf, chunkBytes);
+            if (recordTrace) {
+                const uint64_t endCycle = static_cast<uint64_t>(AscendC::GetSystemCycle());
+                const uint32_t traceLowIdx = 48U + traceIndex * 2U;
+                TileXRUdmaDemoAddCycleSum(debug, traceLowIdx, endCycle - startCycle);
+            }
+            copiedInSlice += chunkBytes;
+            ++traceIndex;
+        }
+
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+        if (debug != nullptr && blockIdx == 0) {
+            debug[5] = 0;
+        }
+    }
+}
+
+extern "C" __global__ __aicore__ void tilexr_memory_consume_p2p_perf_kernel(
+    GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR dstGM, GM_ADDR debugGM,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step)
+{
+    if constexpr (g_coreType == AscendC::AIV) {
+        auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM);
+        auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM);
+        auto dst = reinterpret_cast<__gm__ uint8_t*>(dstGM);
+
+        int32_t rank = args->rank;
+        uint32_t blockIdx = AscendC::GetBlockIdx();
+        uint32_t blockNum = AscendC::GetBlockNum();
+        if (debug != nullptr && blockIdx == 0) {
+            debug[0] = TILEXR_UDMA_DEMO_MAGIC;
+            debug[1] = rank;
+            debug[2] = 1;
+            debug[3] = bytes;
+            debug[4] = pattern;
+            debug[5] = 0xffffffffu;
+            debug[6] = blockNum;
+            debug[7] = static_cast<uint32_t>(magic);
+        }
+
+        bool isSender = false;
+        bool isReceiver = false;
+        int32_t peer = -1;
+        if (blockNum == 0 ||
+            !TileXRUdmaDemoResolveMemoryConsumeRole(rank, srcRank, dstRank, traffic, isSender, isReceiver, peer) ||
+            peer < 0 || peer >= args->rankSize) {
+            return;
+        }
+
+        AscendC::GlobalTensor<GM_ADDR> peerMems;
+        peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE);
+        GM_ADDR shareAddrs[TileXR::TILEXR_MAX_RANK_SIZE];
+        for (int32_t i = 0; i < args->rankSize; ++i) {
+            shareAddrs[i] = peerMems.GetValue(i);
+        }
+        GM_ADDR peerBase = shareAddrs[peer];
+        GM_ADDR localBase = shareAddrs[rank];
+        if (peerBase == nullptr || localBase == nullptr || (isReceiver && dst == nullptr) ||
+            (isSender && srcGM == nullptr)) {
+            uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2);
+            if (debug != nullptr && blockIdx == 0) {
+                debug[5] = status;
+            }
+            return;
+        }
+
+        uint32_t offset = 0;
+        uint32_t sliceBytes = 0;
+        TileXRUdmaDemoBlockSlice(bytes, blockNum, blockIdx, offset, sliceBytes);
+        if (debug != nullptr && blockIdx < 8) {
+            debug[16 + blockIdx] = offset;
+            debug[24 + blockIdx] = sliceBytes;
+        }
+        if (sliceBytes == 0) {
+            TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+            return;
+        }
+
+        AscendC::TPipe pipe;
+        AscendC::TBuf<AscendC::QuePosition::VECCALC> tBuf;
+        pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES);
+        SyncCollectives sync;
+        sync.Init(rank, args->rankSize, shareAddrs, tBuf);
+
+        uint32_t status = 0;
+        if (isSender) {
+            TileXRUdmaDemoCopyBytesGmToGm(
+                peerBase + dstByteOffset + offset, srcGM + offset, tBuf, sliceBytes);
+            sync.SetOuterFlag(magic, step);
+        }
+        if (isReceiver && status == 0) {
+            sync.WaitOuterFlag(magic, step, peer, blockIdx);
+            TileXRUdmaDemoCopyBytesGmToGm(
+                reinterpret_cast<GM_ADDR>(dst + offset), localBase + dstByteOffset + offset, tBuf, sliceBytes);
+        }
+
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status);
+        if (debug != nullptr && blockIdx == 0) {
+            debug[5] = status;
+        }
+    }
+}
+
+extern "C" __global__ __aicore__ void tilexr_data_as_flag_p2p_perf_kernel(
+    GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR dstGM, GM_ADDR debugGM,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic)
+{
+    if constexpr (g_coreType == AscendC::AIV) {
+        auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM);
+        auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM);
+        auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM);
+        auto dst = reinterpret_cast<__gm__ uint8_t*>(dstGM);
+
+        int32_t rank = args->rank;
+        uint32_t blockIdx = AscendC::GetBlockIdx();
+        uint32_t blockNum = AscendC::GetBlockNum();
+        if (debug != nullptr && blockIdx == 0) {
+            debug[0] = TILEXR_UDMA_DEMO_MAGIC;
+            debug[1] = rank;
+            debug[2] = 1;
+            debug[3] = bytes;
+            debug[4] = pattern;
+            debug[5] = 0xffffffffu;
+        }
+
+        bool isSender = false;
+        bool isReceiver = false;
+        int32_t peer = -1;
+        if (blockNum == 0 ||
+            !TileXRUdmaDemoResolveDataAsFlagRole(rank, srcRank, dstRank, traffic, isSender, isReceiver, peer) ||
+            peer < 0 || peer >= args->rankSize) {
+            return;
+        }
+
+        AscendC::GlobalTensor<GM_ADDR> peerMems;
+        peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE);
+        GM_ADDR peerBase = peerMems.GetValue(peer);
+        GM_ADDR localBase = peerMems.GetValue(rank);
+        if (peerBase == nullptr || localBase == nullptr || dst == nullptr) {
+            uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2);
+            if (debug != nullptr && blockIdx == 0) {
+                debug[5] = status;
+            }
+            return;
+        }
+
+        uint32_t payloadOffset = 0;
+        uint32_t sliceBytes = 0;
+        uint32_t dataAsFlagOffset = 0;
+        TileXRUdmaDemoDataAsFlagSlice(bytes, blockNum, blockIdx, payloadOffset, sliceBytes, dataAsFlagOffset);
+        if (sliceBytes == 0) {
+            TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+            return;
+        }
+
+        AscendC::TPipe pipe;
+        AscendC::TBuf<AscendC::QuePosition::VECCALC> tBuf;
+        pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES);
+        AscendC::LocalTensor<uint8_t> scratch = tBuf.Get<uint8_t>();
+
+        uint32_t status = 0;
+        if (isSender) {
+            uint32_t initBlocks = TileXR::DataAsFlagInit(scratch);
+            uint32_t sentBlocks = TileXR::DataAsFlagSend(
+                reinterpret_cast<__gm__ uint8_t*>(peerBase + dstByteOffset + dataAsFlagOffset),
+                src + payloadOffset, sliceBytes, scratch);
+            if (initBlocks == 0 || sentBlocks == 0) {
+                status = 3;
+            }
+        }
+        if (isReceiver && status == 0) {
+            bool received = TileXR::DataAsFlagCheckAndRecv(
+                reinterpret_cast<__gm__ uint8_t*>(localBase + dstByteOffset + dataAsFlagOffset),
+                sliceBytes, dst + payloadOffset, scratch);
+            if (!received) {
+                status = 4;
+            }
+        }
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status);
+        if (debug != nullptr && blockIdx == 0) {
+            debug[5] = status;
+        }
+    }
+}
+
+extern "C" __global__ __aicore__ void tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel(
+    GM_ADDR commArgsGM, GM_ADDR srcGM, GM_ADDR dstGM, GM_ADDR debugGM,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step, int32_t strict)
+{
+    if constexpr (g_coreType == AscendC::AIV) {
+        auto args = reinterpret_cast<__gm__ TileXR::CommArgs*>(commArgsGM);
+        auto debug = reinterpret_cast<__gm__ uint32_t*>(debugGM);
+        auto src = reinterpret_cast<__gm__ uint8_t*>(srcGM);
+        auto dst = reinterpret_cast<__gm__ uint8_t*>(dstGM);
+
+        int32_t rank = args->rank;
+        uint32_t blockIdx = AscendC::GetBlockIdx();
+        uint32_t blockNum = AscendC::GetBlockNum();
+        const uint64_t epoch = TileXR::DataAsFlagEpoch(magic, step);
+        if (debug != nullptr && blockIdx == 0) {
+            debug[0] = TILEXR_UDMA_DEMO_MAGIC;
+            debug[1] = rank;
+            debug[2] = 1;
+            debug[3] = bytes;
+            debug[4] = pattern;
+            debug[5] = 0xffffffffu;
+            debug[6] = blockNum;
+            debug[7] = static_cast<uint32_t>(epoch & 0xffffffffu);
+        }
+
+        bool isSender = false;
+        bool isReceiver = false;
+        int32_t peer = -1;
+        if (blockNum == 0 ||
+            !TileXRUdmaDemoResolveDataAsFlagRole(rank, srcRank, dstRank, traffic, isSender, isReceiver, peer) ||
+            peer < 0 || peer >= args->rankSize) {
+            return;
+        }
+
+        AscendC::GlobalTensor<GM_ADDR> peerMems;
+        peerMems.SetGlobalBuffer(&(args->peerMems[0]), TileXR::TILEXR_MAX_RANK_SIZE);
+        GM_ADDR peerBase = peerMems.GetValue(peer);
+        GM_ADDR localBase = peerMems.GetValue(rank);
+        if (peerBase == nullptr || localBase == nullptr || dst == nullptr || (isSender && src == nullptr)) {
+            uint32_t status = TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 2);
+            if (debug != nullptr && blockIdx == 0) {
+                debug[5] = status;
+            }
+            return;
+        }
+
+        uint32_t payloadOffset = 0;
+        uint32_t sliceBytes = 0;
+        uint32_t dataAsFlagOffset = 0;
+        TileXRUdmaDemoDataAsFlagSlice(bytes, blockNum, blockIdx, payloadOffset, sliceBytes, dataAsFlagOffset);
+        if (debug != nullptr && blockIdx < 8) {
+            debug[16 + blockIdx] = payloadOffset;
+            debug[24 + blockIdx] = sliceBytes;
+        }
+        if (sliceBytes == 0) {
+            TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, 0);
+            return;
+        }
+
+        AscendC::TPipe pipe;
+        AscendC::TBuf<AscendC::QuePosition::VECCALC> tBuf;
+        pipe.InitBuffer(tBuf, TILEXR_UDMA_DEMO_P2P_UB_BYTES);
+        AscendC::LocalTensor<uint8_t> scratch = tBuf.Get<uint8_t>();
+
+        uint32_t status = 0;
+        if (isSender) {
+            uint32_t sentBlocks = TileXR::DataAsFlagSendEpochOrdered(
+                reinterpret_cast<__gm__ uint8_t*>(peerBase + dstByteOffset + dataAsFlagOffset),
+                src + payloadOffset, sliceBytes, epoch, scratch);
+            if (sentBlocks == 0U) {
+                status = 3;
+            }
+        }
+        if (isReceiver && status == 0) {
+            bool strictMode = strict != 0;
+            bool received = TileXR::DataAsFlagCheckAndRecvEpochOrdered(
+                reinterpret_cast<__gm__ uint8_t*>(localBase + dstByteOffset + dataAsFlagOffset),
+                sliceBytes, dst + payloadOffset, epoch, scratch, strictMode);
+            if (!received) {
+                status = 4;
+            }
+        }
+        TileXRUdmaDemoFoldDebugStatus(debug, blockIdx, status);
+        if (debug != nullptr && blockIdx == 0) {
+            debug[5] = status;
+        }
+    }
+}
+
 void launch_tilexr_udma_all_gather(
     uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR data, GM_ADDR debug, int32_t elementsPerRank)
 {
@@ -131,3 +900,63 @@ void launch_tilexr_udma_registered_smoke(
     tilexr_udma_registered_smoke_kernel<<<blockDim, nullptr, stream>>>(
         commArgs, local, debug, bytes, signal);
 }
+
+void launch_tilexr_udma_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic)
+{
+    tilexr_udma_p2p_perf_kernel<<<blockDim, nullptr, stream>>>(
+        commArgs, src, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic);
+}
+
+void launch_tilexr_udma_p2p_post_only_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic)
+{
+    tilexr_udma_p2p_post_only_perf_kernel<<<blockDim, nullptr, stream>>>(
+        commArgs, src, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic);
+}
+
+void launch_tilexr_memory_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic)
+{
+    tilexr_memory_p2p_perf_kernel<<<blockDim, nullptr, stream>>>(
+        commArgs, src, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic);
+}
+
+void launch_tilexr_memory_segmented_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern,
+    int32_t traffic, uint32_t segmentBytes, int32_t rotateWindow, uint32_t traceSegments)
+{
+    tilexr_memory_segmented_p2p_perf_kernel<<<blockDim, nullptr, stream>>>(
+        commArgs, src, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic,
+        segmentBytes, rotateWindow, traceSegments);
+}
+
+void launch_tilexr_memory_consume_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step)
+{
+    tilexr_memory_consume_p2p_perf_kernel<<<blockDim, nullptr, stream>>>(
+        commArgs, src, dst, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic, magic, step);
+}
+
+void launch_tilexr_data_as_flag_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset, uint32_t bytes, uint32_t pattern, int32_t traffic)
+{
+    tilexr_data_as_flag_p2p_perf_kernel<<<blockDim, nullptr, stream>>>(
+        commArgs, src, dst, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic);
+}
+
+void launch_tilexr_data_as_flag_epoch_ordered_p2p_perf(
+    uint32_t blockDim, void* stream, GM_ADDR commArgs, GM_ADDR src, GM_ADDR dst, GM_ADDR debug,
+    int32_t srcRank, int32_t dstRank, uint64_t dstByteOffset,
+    uint32_t bytes, uint32_t pattern, int32_t traffic, int32_t magic, int32_t step, int32_t strict)
+{
+    tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel<<<blockDim, nullptr, stream>>>(
+        commArgs, src, dst, debug, srcRank, dstRank, dstByteOffset, bytes, pattern, traffic, magic, step, strict);
+}
diff --git a/tests/udma/demo/tilexr_udma_p2p_perf_config.h b/tests/udma/demo/tilexr_udma_p2p_perf_config.h
new file mode 100644
index 0000000..36dc98c
--- /dev/null
+++ b/tests/udma/demo/tilexr_udma_p2p_perf_config.h
@@ -0,0 +1,401 @@
+#ifndef TILEXR_UDMA_P2P_PERF_CONFIG_H
+#define TILEXR_UDMA_P2P_PERF_CONFIG_H
+
+#include <cstdint>
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace TileXR {
+namespace Demo {
+
+constexpr uint64_t kP2PMemoryMaxBytes = 100ULL * 1024ULL * 1024ULL;
+constexpr uint64_t kP2PMemorySegmentBytes = 16ULL * 1024ULL * 1024ULL;
+constexpr uint64_t kP2PMemoryTraceSegmentBytes = 8ULL * 1024ULL * 1024ULL;
+
+enum class P2PTransport {
+    DirectUrma,
+    DirectUrmaPostOnly,
+    Memory,
+    MemorySegmented,
+    MemorySegmentedRotate,
+    MemorySegmentedTrace,
+    MemorySegmentedRotateTrace,
+    MemoryConsume,
+    DataAsFlag,
+    DataAsFlagEpochOrdered,
+    Invalid,
+};
+
+enum class P2PTraffic {
+    UniDir,
+    BiDir,
+    Invalid,
+};
+
+inline const char* P2PTransportName(P2PTransport transport)
+{
+    switch (transport) {
+        case P2PTransport::DirectUrma:
+            return "direct_urma";
+        case P2PTransport::DirectUrmaPostOnly:
+            return "direct_urma_post_only";
+        case P2PTransport::Memory:
+            return "memory";
+        case P2PTransport::MemorySegmented:
+            return "memory_segmented";
+        case P2PTransport::MemorySegmentedRotate:
+            return "memory_segmented_rotate";
+        case P2PTransport::MemorySegmentedTrace:
+            return "memory_segmented_trace";
+        case P2PTransport::MemorySegmentedRotateTrace:
+            return "memory_segmented_rotate_trace";
+        case P2PTransport::MemoryConsume:
+            return "memory_consume";
+        case P2PTransport::DataAsFlag:
+            return "data_as_flag";
+        case P2PTransport::DataAsFlagEpochOrdered:
+            return "data_as_flag_epoch_ordered";
+        default:
+            return "invalid";
+    }
+}
+
+inline const char* P2PTrafficName(P2PTraffic traffic)
+{
+    switch (traffic) {
+        case P2PTraffic::UniDir:
+            return "unidir";
+        case P2PTraffic::BiDir:
+            return "bidir";
+        default:
+            return "invalid";
+    }
+}
+
+inline P2PTransport ParseP2PTransport(const std::string& name)
+{
+    if (name == "direct_urma" || name == "udma") {
+        return P2PTransport::DirectUrma;
+    }
+    if (name == "direct_urma_post_only" || name == "udma_post_only" || name == "post_only") {
+        return P2PTransport::DirectUrmaPostOnly;
+    }
+    if (name == "memory" || name == "ipc" || name == "datacopy") {
+        return P2PTransport::Memory;
+    }
+    if (name == "memory_segmented" || name == "memory-segmented" || name == "ipc_segmented") {
+        return P2PTransport::MemorySegmented;
+    }
+    if (name == "memory_segmented_rotate" ||
+        name == "memory-segmented-rotate" ||
+        name == "ipc_segmented_rotate") {
+        return P2PTransport::MemorySegmentedRotate;
+    }
+    if (name == "memory_segmented_trace" ||
+        name == "memory-segmented-trace" ||
+        name == "ipc_segmented_trace") {
+        return P2PTransport::MemorySegmentedTrace;
+    }
+    if (name == "memory_segmented_rotate_trace" ||
+        name == "memory-segmented-rotate-trace" ||
+        name == "ipc_segmented_rotate_trace") {
+        return P2PTransport::MemorySegmentedRotateTrace;
+    }
+    if (name == "memory_consume" || name == "memory-consume" || name == "mem_consume") {
+        return P2PTransport::MemoryConsume;
+    }
+    if (name == "data_as_flag" || name == "data-as-flag" || name == "daf") {
+        return P2PTransport::DataAsFlag;
+    }
+    if (name == "data_as_flag_epoch_ordered" ||
+        name == "data-as-flag-epoch-ordered" ||
+        name == "daf_epoch_ordered") {
+        return P2PTransport::DataAsFlagEpochOrdered;
+    }
+    return P2PTransport::Invalid;
+}
+
+inline P2PTraffic ParseP2PTraffic(const std::string& name)
+{
+    if (name == "unidir" || name == "uni" || name == "single") {
+        return P2PTraffic::UniDir;
+    }
+    if (name == "bidir" || name == "bi" || name == "duplex" || name == "full_duplex") {
+        return P2PTraffic::BiDir;
+    }
+    return P2PTraffic::Invalid;
+}
+
+struct P2PPerfOptions {
+    int srcRank = 0;
+    int dstRank = 1;
+    uint64_t minBytes = 4096;
+    uint64_t maxBytes = 4096;
+    uint64_t stepFactor = 2;
+    int iters = 100;
+    int warmupIters = 10;
+    bool check = true;
+    std::string csvPath;
+    std::string logDir;
+    P2PTransport transport = P2PTransport::DirectUrma;
+    P2PTraffic traffic = P2PTraffic::UniDir;
+    uint32_t blockDim = 1;
+};
+
+struct P2PPerfRow {
+    P2PTransport transport = P2PTransport::DirectUrma;
+    P2PTraffic traffic = P2PTraffic::UniDir;
+    uint32_t blockDim = 1;
+    int srcRank = 0;
+    int dstRank = 1;
+    int rankSize = 2;
+    uint64_t bytes = 0;
+    int iters = 0;
+    double avgUs = 0.0;
+    double minUs = 0.0;
+    double maxUs = 0.0;
+    uint32_t status = 0;
+    uint64_t errors = 0;
+    std::string logDir;
+};
+
+struct P2PRankStatus {
+    uint32_t status = 0;
+    uint64_t errors = 0;
+    float elapsedMs = 0.0f;
+    double avgUsOverride = 0.0;
+};
+
+inline std::string DirectionName(int srcRank, int dstRank)
+{
+    return std::to_string(srcRank) + "to" + std::to_string(dstRank);
+}
+
+inline std::string TrafficDirectionName(int srcRank, int dstRank, P2PTraffic traffic)
+{
+    if (traffic == P2PTraffic::BiDir) {
+        return DirectionName(srcRank, dstRank) + "+" + DirectionName(dstRank, srcRank);
+    }
+    return DirectionName(srcRank, dstRank);
+}
+
+inline uint64_t DataAsFlagWindowBytes(uint64_t payloadBytes)
+{
+    return ((payloadBytes + 479ULL) / 480ULL) * 512ULL;
+}
+
+inline uint64_t P2PTransportWindowBytes(P2PTransport transport, uint64_t payloadBytes)
+{
+    return (transport == P2PTransport::DataAsFlag ||
+            transport == P2PTransport::DataAsFlagEpochOrdered) ?
+        DataAsFlagWindowBytes(payloadBytes) : payloadBytes;
+}
+
+inline uint64_t P2PTransportWindowBytes(P2PTransport transport, uint64_t payloadBytes, uint32_t blockDim)
+{
+    (void)blockDim;
+    return P2PTransportWindowBytes(transport, payloadBytes);
+}
+
+inline bool P2PTransportUsesIpc(P2PTransport transport)
+{
+    return transport == P2PTransport::Memory ||
+        transport == P2PTransport::MemorySegmented ||
+        transport == P2PTransport::MemorySegmentedRotate ||
+        transport == P2PTransport::MemorySegmentedTrace ||
+        transport == P2PTransport::MemorySegmentedRotateTrace ||
+        transport == P2PTransport::MemoryConsume ||
+        transport == P2PTransport::DataAsFlag ||
+        transport == P2PTransport::DataAsFlagEpochOrdered;
+}
+
+inline bool P2PTransportBothRanksActive(P2PTransport transport, P2PTraffic traffic)
+{
+    return traffic == P2PTraffic::BiDir ||
+        transport == P2PTransport::MemoryConsume ||
+        transport == P2PTransport::DataAsFlag ||
+        transport == P2PTransport::DataAsFlagEpochOrdered;
+}
+
+inline int ActiveP2PFlowCount(P2PTraffic traffic)
+{
+    return traffic == P2PTraffic::BiDir ? 2 : 1;
+}
+
+inline uint64_t P2PEffectiveTransferBytes(P2PTransport transport, uint64_t payloadBytes, uint32_t blockDim)
+{
+    (void)transport;
+    (void)blockDim;
+    return payloadBytes;
+}
+
+inline bool ValidateP2PPerfOptions(const P2PPerfOptions& options, int rankSize, std::string* error)
+{
+    auto fail = [error](const std::string& message) {
+        if (error != nullptr) {
+            *error = message;
+        }
+        return false;
+    };
+    if (rankSize != 2) {
+        return fail("test_type=4 requires rank_size=2");
+    }
+    if (options.srcRank < 0 || options.srcRank >= rankSize ||
+        options.dstRank < 0 || options.dstRank >= rankSize) {
+        return fail("src_rank and dst_rank must be in [0, rank_size)");
+    }
+    if (options.srcRank == options.dstRank) {
+        return fail("src_rank and dst_rank must be different");
+    }
+    if (options.minBytes == 0 || options.maxBytes == 0 || options.minBytes > options.maxBytes) {
+        return fail("byte range must be nonzero and min_bytes <= max_bytes");
+    }
+    if (options.stepFactor < 2) {
+        return fail("step_factor must be at least 2");
+    }
+    if (options.iters <= 0 || options.warmupIters < 0) {
+        return fail("iters must be positive and warmup_iters must be nonnegative");
+    }
+    if (options.blockDim == 0U || options.blockDim > 64U) {
+        return fail("block_dim must be in [1, 64]");
+    }
+    if (options.transport == P2PTransport::Invalid) {
+        return fail(
+            "transport must be direct_urma, direct_urma_post_only, memory, memory_segmented, memory_segmented_rotate, memory_segmented_trace, memory_segmented_rotate_trace, memory_consume, data_as_flag, or data_as_flag_epoch_ordered");
+    }
+    if (options.traffic == P2PTraffic::Invalid) {
+        return fail("traffic must be unidir or bidir");
+    }
+    if (P2PTransportUsesIpc(options.transport) &&
+        P2PTransportWindowBytes(options.transport, options.maxBytes, options.blockDim) > kP2PMemoryMaxBytes) {
+        return fail("memory/memory_segmented/memory_consume/data_as_flag transport max_bytes must fit in the TileXR IPC data window");
+    }
+    return true;
+}
+
+inline std::vector<uint64_t> BuildP2PPerfSizeSweep(const P2PPerfOptions& options)
+{
+    std::vector<uint64_t> sizes;
+    for (uint64_t bytes = options.minBytes; bytes <= options.maxBytes;) {
+        sizes.push_back(bytes);
+        if (bytes > options.maxBytes / options.stepFactor) {
+            break;
+        }
+        bytes *= options.stepFactor;
+    }
+    if (sizes.empty() || sizes.back() != options.maxBytes) {
+        sizes.push_back(options.maxBytes);
+    }
+    return sizes;
+}
+
+inline uint32_t P2PPattern(int srcRank, int dstRank, uint64_t bytes)
+{
+    uint32_t pattern = 0x5a000000u;
+    pattern ^= static_cast<uint32_t>((srcRank & 0xff) << 16);
+    pattern ^= static_cast<uint32_t>((dstRank & 0xff) << 8);
+    pattern ^= static_cast<uint32_t>(bytes & 0xffu);
+    return pattern;
+}
+
+inline uint8_t P2PPatternByte(uint32_t pattern, uint64_t index)
+{
+    return static_cast<uint8_t>((pattern >> ((index & 3u) * 8u)) & 0xffu);
+}
+
+inline void FillP2PPattern(std::vector<uint8_t>& data, uint32_t pattern)
+{
+    for (uint64_t i = 0; i < data.size(); ++i) {
+        data[static_cast<size_t>(i)] = P2PPatternByte(pattern, i);
+    }
+}
+
+inline uint64_t CountP2PMismatches(const std::vector<uint8_t>& data, uint32_t pattern, uint64_t bytes)
+{
+    uint64_t errors = 0;
+    const uint64_t limit = bytes < data.size() ? bytes : data.size();
+    for (uint64_t i = 0; i < limit; ++i) {
+        if (data[static_cast<size_t>(i)] != P2PPatternByte(pattern, i)) {
+            ++errors;
+        }
+    }
+    return errors;
+}
+
+inline uint64_t CountP2PTransportMismatches(
+    const std::vector<uint8_t>& data, uint32_t pattern, uint64_t payloadBytes,
+    P2PTransport transport, uint32_t blockDim)
+{
+    (void)transport;
+    (void)blockDim;
+    return CountP2PMismatches(data, pattern, payloadBytes);
+}
+
+inline std::string P2PPerfCsvHeader()
+{
+    return "transport,traffic,block_dim,direction,src,dst,ranks,bytes,iters,avg_us,min_us,max_us,"
+        "bw_GBps,per_flow_bw_GBps,status,errors,log_dir\n";
+}
+
+inline std::string FormatP2PPerfCsvRow(const P2PPerfRow& row)
+{
+    const uint64_t effectiveBytes = P2PEffectiveTransferBytes(row.transport, row.bytes, row.blockDim);
+    const double perFlowBwGBps = row.avgUs > 0.0 ? static_cast<double>(effectiveBytes) / row.avgUs / 1000.0 : 0.0;
+    const double bwGBps = perFlowBwGBps * static_cast<double>(ActiveP2PFlowCount(row.traffic));
+    std::ostringstream out;
+    out << P2PTransportName(row.transport) << ','
+        << P2PTrafficName(row.traffic) << ','
+        << row.blockDim << ','
+        << TrafficDirectionName(row.srcRank, row.dstRank, row.traffic) << ','
+        << row.srcRank << ','
+        << row.dstRank << ','
+        << row.rankSize << ','
+        << row.bytes << ','
+        << row.iters << ','
+        << std::fixed << std::setprecision(3)
+        << row.avgUs << ','
+        << row.minUs << ','
+        << row.maxUs << ','
+        << bwGBps << ','
+        << perFlowBwGBps << ','
+        << row.status << ','
+        << row.errors << ','
+        << row.logDir << '\n';
+    return out.str();
+}
+
+inline P2PPerfRow BuildP2PPerfRow(
+    const P2PPerfOptions& options,
+    int rankSize,
+    uint64_t bytes,
+    const P2PRankStatus& srcStatus,
+    const P2PRankStatus& dstStatus)
+{
+    P2PPerfRow row;
+    row.transport = options.transport;
+    row.traffic = options.traffic;
+    row.blockDim = options.blockDim;
+    row.srcRank = options.srcRank;
+    row.dstRank = options.dstRank;
+    row.rankSize = rankSize;
+    row.bytes = bytes;
+    row.iters = options.iters;
+    const bool bothRanksActive = P2PTransportBothRanksActive(options.transport, options.traffic);
+    const float elapsedMs = bothRanksActive && dstStatus.elapsedMs > srcStatus.elapsedMs ?
+        dstStatus.elapsedMs : srcStatus.elapsedMs;
+    row.avgUs = options.iters > 0 ? static_cast<double>(elapsedMs) * 1000.0 / static_cast<double>(options.iters) : 0.0;
+    if (options.transport == P2PTransport::DirectUrmaPostOnly && srcStatus.avgUsOverride > 0.0) {
+        row.avgUs = srcStatus.avgUsOverride;
+    }
+    row.status = bothRanksActive ? (srcStatus.status | dstStatus.status) : srcStatus.status;
+    row.errors = bothRanksActive ? (srcStatus.errors + dstStatus.errors) : dstStatus.errors;
+    row.logDir = options.logDir;
+    return row;
+}
+
+} // namespace Demo
+} // namespace TileXR
+
+#endif // TILEXR_UDMA_P2P_PERF_CONFIG_H
diff --git a/tests/udma/run_tests.sh b/tests/udma/run_tests.sh
index f020b58..f4f37da 100755
--- a/tests/udma/run_tests.sh
+++ b/tests/udma/run_tests.sh
@@ -13,7 +13,7 @@ INSTALL_DIR="${SCRIPT_DIR}/install"
 source "${TILEXR_ROOT}/scripts/common_env.sh"
 
 # 设置 LD_LIBRARY_PATH：优先使用当前仓库刚编译安装的库，避免被 /usr/local/lib 中的旧库覆盖
-export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${TILEXR_ROOT}/install/lib:/usr/local/lib:${LD_LIBRARY_PATH}"
+export LD_LIBRARY_PATH="${INSTALL_DIR}/lib:${INSTALL_DIR}/lib64:${TILEXR_ROOT}/install/lib:${TILEXR_ROOT}/install/lib64:/usr/local/lib:${LD_LIBRARY_PATH:-}"
 
 echo "=========================================="
 echo "  Running UDMA Tests"
@@ -46,6 +46,7 @@ fi
 # 检查测试二进制是否存在
 if [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma_transport_layout" ] ||
    [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma_registry" ] ||
+   [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma_p2p_perf_config" ] ||
    [ ! -f "${INSTALL_DIR}/bin/test_tilexr_udma" ]; then
     echo "ERROR: Test binaries not found. Please run build.sh first."
     exit 1
@@ -67,19 +68,27 @@ echo "=========================================="
 TEST2_RESULT=$?
 echo ""
 
-# 测试 3: TileXR 集成测试（单进程，单卡）
+# 测试 3: P2P perf 配置单元测试（host-only）
 echo "=========================================="
-echo "Test 3: TileXR Integration Tests (Single Process)"
+echo "Test 3: TileXR UDMA P2P Perf Config Unit Test"
+echo "=========================================="
+"${INSTALL_DIR}/bin/test_tilexr_udma_p2p_perf_config"
+TEST3_RESULT=$?
+echo ""
+
+# 测试 4: TileXR 集成测试（单进程，单卡）
+echo "=========================================="
+echo "Test 4: TileXR Integration Tests (Single Process)"
 echo "=========================================="
 export RANK=0
 export RANK_SIZE=1
 "${INSTALL_DIR}/bin/test_tilexr_udma"
-TEST3_RESULT=$?
+TEST4_RESULT=$?
 echo ""
 
-# 测试 4: TileXR 多进程测试（需要 mpirun）
+# 测试 5: TileXR 多进程测试（需要 mpirun）
 echo "=========================================="
-echo "Test 4: TileXR Multi-Process Tests (MPI)"
+echo "Test 5: TileXR Multi-Process Tests (MPI)"
 echo "=========================================="
 
 # 检查是否有 mpirun
@@ -98,14 +107,14 @@ if command -v mpirun &> /dev/null; then
         unset RANK
         unset RANK_SIZE
         mpirun -n 2 "${INSTALL_DIR}/bin/test_tilexr_udma"
-        TEST4_RESULT=$?
+        TEST5_RESULT=$?
     else
         echo "SKIP: Need at least 2 usable NPUs for multi-rank test"
-        TEST4_RESULT=0
+        TEST5_RESULT=0
     fi
 else
     echo "SKIP: mpirun not found, skipping multi-process tests"
-    TEST4_RESULT=0
+    TEST5_RESULT=0
 fi
 echo ""
 
@@ -115,13 +124,14 @@ echo "  Test Results Summary"
 echo "=========================================="
 echo "Test 1 (UDMA Layout):     $([ $TEST1_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')"
 echo "Test 2 (UDMA Registry):   $([ $TEST2_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')"
-echo "Test 3 (TileXR Single):   $([ $TEST3_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')"
-echo "Test 4 (TileXR Multi):    $([ $TEST4_RESULT -eq 0 ] && echo 'PASS' || echo 'SKIP/FAIL')"
+echo "Test 3 (P2P Perf Config): $([ $TEST3_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')"
+echo "Test 4 (TileXR Single):   $([ $TEST4_RESULT -eq 0 ] && echo 'PASS' || echo 'FAIL')"
+echo "Test 5 (TileXR Multi):    $([ $TEST5_RESULT -eq 0 ] && echo 'PASS' || echo 'SKIP/FAIL')"
 echo "=========================================="
 
 # 返回失败状态
 if [ $TEST1_RESULT -ne 0 ] || [ $TEST2_RESULT -ne 0 ] || [ $TEST3_RESULT -ne 0 ] ||
-   [ $TEST4_RESULT -ne 0 ]; then
+   [ $TEST4_RESULT -ne 0 ] || [ $TEST5_RESULT -ne 0 ]; then
     exit 1
 fi
 
diff --git a/tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp b/tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp
new file mode 100644
index 0000000..6c828e5
--- /dev/null
+++ b/tests/udma/unit/test_tilexr_udma_p2p_perf_config.cpp
@@ -0,0 +1,335 @@
+#include <cmath>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "demo/tilexr_udma_p2p_perf_config.h"
+
+namespace {
+
+void Require(bool condition, const char* message)
+{
+    if (!condition) {
+        throw message;
+    }
+}
+
+} // namespace
+
+int main()
+{
+    TileXR::Demo::P2PPerfOptions options;
+    options.srcRank = 1;
+    options.dstRank = 0;
+    options.minBytes = 4096;
+    options.maxBytes = 16384;
+    options.stepFactor = 2;
+    options.iters = 20;
+    options.warmupIters = 5;
+    options.csvPath = "logs/p2p_perf.csv";
+
+    std::string error;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), "valid options rejected");
+    Require(TileXR::Demo::DirectionName(0, 1) == "0to1", "direction name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DirectUrma) == "direct_urma",
+        "direct transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DirectUrmaPostOnly) ==
+            "direct_urma_post_only",
+        "direct post-only transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::Memory) == "memory",
+        "memory transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::MemorySegmented) == "memory_segmented",
+        "memory_segmented transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::MemorySegmentedRotate) ==
+            "memory_segmented_rotate",
+        "memory_segmented_rotate transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::MemorySegmentedTrace) ==
+            "memory_segmented_trace",
+        "memory_segmented_trace transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace) ==
+            "memory_segmented_rotate_trace",
+        "memory_segmented_rotate_trace transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::MemoryConsume) == "memory_consume",
+        "memory_consume transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DataAsFlag) == "data_as_flag",
+        "data_as_flag transport name mismatch");
+    Require(TileXR::Demo::P2PTransportName(TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered) ==
+            "data_as_flag_epoch_ordered",
+        "data_as_flag_epoch_ordered transport name mismatch");
+    Require(TileXR::Demo::P2PTrafficName(TileXR::Demo::P2PTraffic::UniDir) == "unidir",
+        "unidir traffic name mismatch");
+    Require(TileXR::Demo::P2PTrafficName(TileXR::Demo::P2PTraffic::BiDir) == "bidir",
+        "bidir traffic name mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma") == TileXR::Demo::P2PTransport::DirectUrma,
+        "direct transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("udma") == TileXR::Demo::P2PTransport::DirectUrma,
+        "udma alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma_post_only") ==
+            TileXR::Demo::P2PTransport::DirectUrmaPostOnly,
+        "direct_urma_post_only transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("post_only") == TileXR::Demo::P2PTransport::DirectUrmaPostOnly,
+        "post_only alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory") == TileXR::Demo::P2PTransport::Memory,
+        "memory transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory_segmented") == TileXR::Demo::P2PTransport::MemorySegmented,
+        "memory_segmented transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory-segmented") == TileXR::Demo::P2PTransport::MemorySegmented,
+        "memory-segmented alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory_segmented_rotate") ==
+            TileXR::Demo::P2PTransport::MemorySegmentedRotate,
+        "memory_segmented_rotate transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory-segmented-rotate") ==
+            TileXR::Demo::P2PTransport::MemorySegmentedRotate,
+        "memory-segmented-rotate alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory_segmented_trace") ==
+            TileXR::Demo::P2PTransport::MemorySegmentedTrace,
+        "memory_segmented_trace transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory-segmented-trace") ==
+            TileXR::Demo::P2PTransport::MemorySegmentedTrace,
+        "memory-segmented-trace alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory_segmented_rotate_trace") ==
+            TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace,
+        "memory_segmented_rotate_trace transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory-segmented-rotate-trace") ==
+            TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace,
+        "memory-segmented-rotate-trace alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory_consume") == TileXR::Demo::P2PTransport::MemoryConsume,
+        "memory_consume transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("memory-consume") == TileXR::Demo::P2PTransport::MemoryConsume,
+        "memory-consume alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("mem_consume") == TileXR::Demo::P2PTransport::MemoryConsume,
+        "mem_consume alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("data_as_flag") == TileXR::Demo::P2PTransport::DataAsFlag,
+        "data_as_flag transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("data_as_flag_epoch_ordered") ==
+            TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered,
+        "data_as_flag_epoch_ordered transport parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("data-as-flag-epoch-ordered") ==
+            TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered,
+        "data-as-flag-epoch-ordered alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("daf_epoch_ordered") ==
+            TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered,
+        "daf_epoch_ordered alias parse mismatch");
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_wqe") == TileXR::Demo::P2PTransport::Invalid,
+        "direct_urma_multi_wqe must be rejected");
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty") == TileXR::Demo::P2PTransport::Invalid,
+        "direct_urma_multi_jetty must be rejected");
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty_parallel") ==
+            TileXR::Demo::P2PTransport::Invalid,
+        "direct_urma_multi_jetty_parallel must be rejected");
+    Require(TileXR::Demo::ParseP2PTransport("direct_urma_multi_jetty_parallel_fixed_wqe") ==
+            TileXR::Demo::P2PTransport::Invalid,
+        "direct_urma_multi_jetty_parallel_fixed_wqe must be rejected");
+    Require(TileXR::Demo::ParseP2PTraffic("unidir") == TileXR::Demo::P2PTraffic::UniDir,
+        "unidir traffic parse mismatch");
+    Require(TileXR::Demo::ParseP2PTraffic("bidir") == TileXR::Demo::P2PTraffic::BiDir,
+        "bidir traffic parse mismatch");
+    Require(TileXR::Demo::TrafficDirectionName(0, 1, TileXR::Demo::P2PTraffic::BiDir) == "0to1+1to0",
+        "bidir direction name mismatch");
+    Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DataAsFlag, 0) == 0,
+        "data_as_flag zero layout mismatch");
+    Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DataAsFlag, 480) == 512,
+        "data_as_flag 480B layout mismatch");
+    Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DataAsFlag, 481) == 1024,
+        "data_as_flag 481B layout mismatch");
+    Require(TileXR::Demo::P2PTransportWindowBytes(
+                TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 0) == 0,
+        "data_as_flag_epoch_ordered zero layout mismatch");
+    Require(TileXR::Demo::P2PTransportWindowBytes(
+                TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 480) == 512,
+        "data_as_flag_epoch_ordered 480B layout mismatch");
+    Require(TileXR::Demo::P2PTransportWindowBytes(
+                TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, 481) == 1024,
+        "data_as_flag_epoch_ordered 481B layout mismatch");
+    Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::DirectUrma, 4096, 8) == 4096,
+        "direct_urma window must equal payload bytes");
+    Require(TileXR::Demo::P2PTransportWindowBytes(TileXR::Demo::P2PTransport::MemoryConsume, 4096, 4) == 4096,
+        "memory_consume window must equal payload bytes");
+    Require(TileXR::Demo::P2PTransportWindowBytes(
+                TileXR::Demo::P2PTransport::MemorySegmented, 64ULL * 1024ULL * 1024ULL, 1) ==
+            64ULL * 1024ULL * 1024ULL,
+        "memory_segmented window must equal payload bytes");
+    Require(TileXR::Demo::P2PTransportWindowBytes(
+                TileXR::Demo::P2PTransport::MemorySegmentedRotate, 64ULL * 1024ULL * 1024ULL, 1) ==
+            64ULL * 1024ULL * 1024ULL,
+        "memory_segmented_rotate allocation window must preserve source payload bytes");
+    Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::MemorySegmented),
+        "memory_segmented must use IPC peer window");
+    Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::MemorySegmentedRotate),
+        "memory_segmented_rotate must use IPC peer window");
+    Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::MemorySegmentedTrace),
+        "memory_segmented_trace must use IPC peer window");
+    Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace),
+        "memory_segmented_rotate_trace must use IPC peer window");
+    Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::MemoryConsume),
+        "memory_consume must use IPC peer window");
+    Require(TileXR::Demo::P2PTransportUsesIpc(TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered),
+        "data_as_flag_epoch_ordered must use IPC peer window");
+    Require(TileXR::Demo::P2PTransportBothRanksActive(
+                TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered, TileXR::Demo::P2PTraffic::UniDir),
+        "data_as_flag_epoch_ordered unidir must keep receiver active");
+    Require(TileXR::Demo::ActiveP2PFlowCount(TileXR::Demo::P2PTraffic::UniDir) == 1,
+        "unidir active flow count mismatch");
+    Require(TileXR::Demo::ActiveP2PFlowCount(TileXR::Demo::P2PTraffic::BiDir) == 2,
+        "bidir active flow count mismatch");
+    options.transport = TileXR::Demo::P2PTransport::Memory;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), "memory transport options rejected");
+    options.transport = TileXR::Demo::P2PTransport::MemorySegmented;
+    options.maxBytes = 64ULL * 1024ULL * 1024ULL;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "valid memory_segmented options rejected");
+    options.transport = TileXR::Demo::P2PTransport::MemorySegmentedRotate;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "valid memory_segmented_rotate options rejected");
+    options.transport = TileXR::Demo::P2PTransport::MemorySegmentedTrace;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "valid memory_segmented_trace options rejected");
+    options.transport = TileXR::Demo::P2PTransport::MemorySegmentedRotateTrace;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "valid memory_segmented_rotate_trace options rejected");
+    options.maxBytes = 16384;
+    options.transport = TileXR::Demo::P2PTransport::DirectUrma;
+    options.traffic = TileXR::Demo::P2PTraffic::BiDir;
+    options.blockDim = 8;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "valid direct_urma multi-jetty options rejected");
+    options.transport = TileXR::Demo::P2PTransport::DataAsFlag;
+    options.traffic = TileXR::Demo::P2PTraffic::BiDir;
+    options.blockDim = 4;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "valid data_as_flag bidir options rejected");
+    options.transport = TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered;
+    options.traffic = TileXR::Demo::P2PTraffic::UniDir;
+    options.blockDim = 4;
+    options.maxBytes = 16384;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "valid data_as_flag_epoch_ordered options rejected");
+    options.transport = TileXR::Demo::P2PTransport::MemoryConsume;
+    options.traffic = TileXR::Demo::P2PTraffic::UniDir;
+    options.blockDim = 4;
+    options.maxBytes = 16384;
+    Require(TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "valid memory_consume options rejected");
+    options.maxBytes = TileXR::Demo::kP2PMemoryMaxBytes + 1;
+    Require(!TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error),
+        "oversized memory_consume transport accepted");
+    options.maxBytes = 16384;
+    options.blockDim = 0;
+    Require(!TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), "block_dim=0 accepted");
+    options.blockDim = 4;
+    options.maxBytes = TileXR::Demo::kP2PMemoryMaxBytes + 1;
+    Require(!TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), "oversized ipc transport accepted");
+    options.maxBytes = 16384;
+    options.transport = TileXR::Demo::P2PTransport::Memory;
+    options.traffic = TileXR::Demo::P2PTraffic::UniDir;
+    options.blockDim = 4;
+
+    const std::vector<uint64_t> sizes = TileXR::Demo::BuildP2PPerfSizeSweep(options);
+    Require(sizes.size() == 3, "size sweep count mismatch");
+    Require(sizes[0] == 4096 && sizes[1] == 8192 && sizes[2] == 16384, "size sweep values mismatch");
+
+    const uint32_t pattern = TileXR::Demo::P2PPattern(1, 0, 4096);
+    std::vector<uint8_t> bytes(4096);
+    TileXR::Demo::FillP2PPattern(bytes, pattern);
+    Require(TileXR::Demo::CountP2PMismatches(bytes, pattern, 4096) == 0, "pattern validation failed");
+    bytes[17] ^= 0xff;
+    Require(TileXR::Demo::CountP2PMismatches(bytes, pattern, 4096) == 1, "mismatch count failed");
+    Require(TileXR::Demo::CountP2PTransportMismatches(
+                bytes, pattern, 4096, TileXR::Demo::P2PTransport::DirectUrma, 8) == 1,
+        "direct_urma mismatch checker must validate payload bytes");
+
+    TileXR::Demo::P2PPerfRow row;
+    row.srcRank = 1;
+    row.dstRank = 0;
+    row.rankSize = 2;
+    row.bytes = 4096;
+    row.iters = 20;
+    row.avgUs = 8.0;
+    row.status = 0;
+    row.errors = 0;
+    row.logDir = "logs/run";
+    const std::string csv = TileXR::Demo::FormatP2PPerfCsvRow(row);
+    Require(csv == "direct_urma,unidir,1,1to0,1,0,2,4096,20,8.000,0.000,0.000,0.512,0.512,0,0,logs/run\n",
+        "csv row mismatch");
+    row.traffic = TileXR::Demo::P2PTraffic::BiDir;
+    row.blockDim = 4;
+    row.transport = TileXR::Demo::P2PTransport::DataAsFlag;
+    const std::string bidirCsv = TileXR::Demo::FormatP2PPerfCsvRow(row);
+    Require(bidirCsv ==
+            "data_as_flag,bidir,4,1to0+0to1,1,0,2,4096,20,8.000,0.000,0.000,1.024,0.512,0,0,logs/run\n",
+        "bidir csv row mismatch");
+    row.transport = TileXR::Demo::P2PTransport::DataAsFlagEpochOrdered;
+    row.traffic = TileXR::Demo::P2PTraffic::UniDir;
+    row.blockDim = 4;
+    const std::string epochOrderedCsv = TileXR::Demo::FormatP2PPerfCsvRow(row);
+    Require(epochOrderedCsv ==
+            "data_as_flag_epoch_ordered,unidir,4,1to0,1,0,2,4096,20,8.000,0.000,0.000,0.512,0.512,0,0,logs/run\n",
+        "data_as_flag_epoch_ordered csv row mismatch");
+    row.transport = TileXR::Demo::P2PTransport::DirectUrma;
+    row.traffic = TileXR::Demo::P2PTraffic::UniDir;
+    row.blockDim = 8;
+    const std::string directUrmaParallelCsv = TileXR::Demo::FormatP2PPerfCsvRow(row);
+    Require(directUrmaParallelCsv ==
+            "direct_urma,unidir,8,1to0,1,0,2,4096,20,8.000,0.000,0.000,0.512,0.512,0,0,logs/run\n",
+        "direct_urma parallel csv row mismatch");
+
+    TileXR::Demo::P2PRankStatus srcSample;
+    srcSample.status = 0;
+    srcSample.errors = 0;
+    srcSample.elapsedMs = 6.4f;
+    TileXR::Demo::P2PRankStatus dstSample;
+    dstSample.status = 0xffffffffu;
+    dstSample.errors = 0;
+    dstSample.elapsedMs = 0.1f;
+
+    const TileXR::Demo::P2PPerfRow aggregated =
+        TileXR::Demo::BuildP2PPerfRow(options, 2, 4096, srcSample, dstSample);
+    Require(std::fabs(aggregated.avgUs - 320.0) < 0.001,
+        "p2p row must use src rank elapsed time");
+    Require(aggregated.status == 0, "p2p row must use src rank status");
+    Require(aggregated.errors == 0, "p2p row must use dst rank errors");
+    Require(aggregated.transport == TileXR::Demo::P2PTransport::Memory,
+        "p2p row must preserve transport");
+    Require(aggregated.traffic == TileXR::Demo::P2PTraffic::UniDir,
+        "p2p row must preserve traffic");
+    Require(aggregated.blockDim == 4, "p2p row must preserve block_dim");
+
+    options.traffic = TileXR::Demo::P2PTraffic::BiDir;
+    dstSample.status = 4;
+    dstSample.errors = 7;
+    dstSample.elapsedMs = 10.0f;
+    const TileXR::Demo::P2PPerfRow bidirAggregated =
+        TileXR::Demo::BuildP2PPerfRow(options, 2, 4096, srcSample, dstSample);
+    Require(std::fabs(bidirAggregated.avgUs - 500.0) < 0.001,
+        "bidir row must use max rank elapsed time");
+    Require(bidirAggregated.status == 4, "bidir row must combine rank status");
+    Require(bidirAggregated.errors == 7, "bidir row must sum rank errors");
+
+    options.transport = TileXR::Demo::P2PTransport::MemoryConsume;
+    options.traffic = TileXR::Demo::P2PTraffic::UniDir;
+    options.blockDim = 4;
+    options.srcRank = 1;
+    options.dstRank = 0;
+    srcSample.status = 1;
+    srcSample.errors = 2;
+    srcSample.elapsedMs = 6.4f;
+    dstSample.status = 4;
+    dstSample.errors = 7;
+    dstSample.elapsedMs = 10.0f;
+    const TileXR::Demo::P2PPerfRow memoryConsumeAggregated =
+        TileXR::Demo::BuildP2PPerfRow(options, 2, 4096, srcSample, dstSample);
+    Require(std::fabs(memoryConsumeAggregated.avgUs - 500.0) < 0.001,
+        "memory_consume unidir row must use max rank elapsed time");
+    Require(memoryConsumeAggregated.status == 5,
+        "memory_consume unidir row must combine rank status");
+    Require(memoryConsumeAggregated.errors == 9,
+        "memory_consume unidir row must sum rank errors");
+    const std::string memoryConsumeCsv = TileXR::Demo::FormatP2PPerfCsvRow(memoryConsumeAggregated);
+    Require(memoryConsumeCsv ==
+            "memory_consume,unidir,4,1to0,1,0,2,4096,20,500.000,0.000,0.000,0.008,0.008,5,9,\n",
+        "memory_consume csv row mismatch");
+
+    options.dstRank = 1;
+    Require(!TileXR::Demo::ValidateP2PPerfOptions(options, 2, &error), "same src/dst accepted");
+    return 0;
+}
diff --git a/tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp b/tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp
new file mode 100644
index 0000000..e0d9430
--- /dev/null
+++ b/tests/udma/unit/test_tilexr_udma_p2p_source_guard.cpp
@@ -0,0 +1,165 @@
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+namespace {
+
+int g_failures = 0;
+
+std::string RepoPath(const std::string& path)
+{
+#ifdef TILEXR_SOURCE_ROOT
+    return std::string(TILEXR_SOURCE_ROOT) + "/" + path;
+#else
+    return path;
+#endif
+}
+
+std::string ReadFile(const std::string& path)
+{
+    const std::string fullPath = RepoPath(path);
+    std::ifstream input(fullPath.c_str());
+    if (!input.is_open()) {
+        std::cerr << "failed to open " << fullPath << std::endl;
+        ++g_failures;
+        return {};
+    }
+    std::ostringstream buffer;
+    buffer << input.rdbuf();
+    return buffer.str();
+}
+
+void CheckContains(const std::string& path, const std::string& text, const std::string& needle)
+{
+    if (text.find(needle) == std::string::npos) {
+        std::cerr << path << " does not contain required text: " << needle << std::endl;
+        ++g_failures;
+    }
+}
+
+void CheckNotContains(const std::string& path, const std::string& text, const std::string& needle)
+{
+    if (text.find(needle) != std::string::npos) {
+        std::cerr << path << " contains forbidden text: " << needle << std::endl;
+        ++g_failures;
+    }
+}
+
+void TestMemoryConsumeKernelUsesSyncCollectives()
+{
+    const std::string path = "tests/udma/demo/tilexr_udma_demo_kernel.cpp";
+    const std::string text = ReadFile(path);
+    CheckContains(path, text, "#include \"tilexr_sync.h\"");
+    CheckContains(path, text, "tilexr_memory_consume_p2p_perf_kernel");
+    CheckContains(path, text, "launch_tilexr_memory_consume_p2p_perf");
+    CheckContains(path, text, "SyncCollectives sync");
+    CheckContains(path, text, "int32_t magic, int32_t step");
+    CheckContains(path, text, "sync.SetOuterFlag(magic, step);");
+    CheckContains(path, text, "sync.WaitOuterFlag(magic, step, peer, blockIdx);");
+}
+
+void TestMemorySegmentedDiagnosticTransport()
+{
+    const std::string kernelPath = "tests/udma/demo/tilexr_udma_demo_kernel.cpp";
+    const std::string kernelText = ReadFile(kernelPath);
+    CheckContains(kernelPath, kernelText, "tilexr_memory_segmented_p2p_perf_kernel");
+    CheckContains(kernelPath, kernelText, "launch_tilexr_memory_segmented_p2p_perf");
+    CheckContains(kernelPath, kernelText, "uint32_t segmentBytes, int32_t rotateWindow");
+    CheckContains(kernelPath, kernelText, "dstInWindow = srcOffset % segmentBytes");
+    CheckContains(kernelPath, kernelText, "uint32_t traceSegments");
+    CheckContains(kernelPath, kernelText, "AscendC::GetSystemCycle()");
+    CheckContains(kernelPath, kernelText, "TileXRUdmaDemoAddCycleSum(debug, traceLowIdx");
+
+    const std::string configPath = "tests/udma/demo/tilexr_udma_p2p_perf_config.h";
+    const std::string configText = ReadFile(configPath);
+    CheckContains(configPath, configText, "kP2PMemorySegmentBytes = 16ULL * 1024ULL * 1024ULL");
+    CheckContains(configPath, configText, "kP2PMemoryTraceSegmentBytes = 8ULL * 1024ULL * 1024ULL");
+    CheckContains(configPath, configText, "MemorySegmented");
+    CheckContains(configPath, configText, "MemorySegmentedRotate");
+    CheckContains(configPath, configText, "MemorySegmentedTrace");
+    CheckContains(configPath, configText, "MemorySegmentedRotateTrace");
+    CheckContains(configPath, configText, "memory_segmented_rotate");
+    CheckContains(configPath, configText, "memory_segmented_rotate_trace");
+
+    const std::string hostPath = "tests/udma/demo/tilexr_udma_demo.cpp";
+    const std::string hostText = ReadFile(hostPath);
+    CheckContains(hostPath, hostText, "launch_tilexr_memory_segmented_p2p_perf");
+    CheckContains(hostPath, hostText, "P2PTransport::MemorySegmentedRotate");
+    CheckContains(hostPath, hostText, "P2PTransport::MemorySegmentedRotateTrace");
+    CheckContains(hostPath, hostText, "traceSegments ? 8U : 0U");
+    CheckContains(hostPath, hostText, "skipPayloadCheck");
+}
+
+void TestMemoryConsumeHostWiring()
+{
+    const std::string path = "tests/udma/demo/tilexr_udma_demo.cpp";
+    const std::string text = ReadFile(path);
+    CheckContains(path, text, "launch_tilexr_memory_consume_p2p_perf");
+    CheckContains(path, text, "P2PTransport::MemoryConsume");
+    CheckContains(path, text, "P2PTransportUsesIpc");
+    CheckContains(path, text, "P2PTransportBothRanksActive");
+    CheckContains(path, text, "constexpr int32_t kP2PMagicBase = 0x5444554d");
+    CheckContains(path, text, "int32_t launchMagic = kP2PMagicBase");
+    CheckContains(path, text, "const int32_t warmupMagic = ++launchMagic");
+    CheckContains(path, text, "const int32_t measuredMagic = ++launchMagic");
+    CheckContains(path, text, "warmupMagic, i + 1");
+    CheckContains(path, text, "measuredMagic, i + 1");
+}
+
+void TestMemoryConsumeSweepDefault()
+{
+    const std::string path = "tests/udma/demo/run_tilexr_udma_p2p_concurrency_sweep.sh";
+    const std::string text = ReadFile(path);
+    CheckContains(path, text, "direct_urma,memory,memory_consume,data_as_flag");
+}
+
+void TestDataAsFlagEpochOrderedSource()
+{
+    const std::string kernelPath = "tests/udma/demo/tilexr_udma_demo_kernel.cpp";
+    const std::string kernelText = ReadFile(kernelPath);
+    CheckContains(kernelPath, kernelText, "tilexr_data_as_flag_epoch_ordered_p2p_perf_kernel");
+    CheckContains(kernelPath, kernelText, "launch_tilexr_data_as_flag_epoch_ordered_p2p_perf");
+    CheckContains(kernelPath, kernelText, "DataAsFlagEpoch(magic, step)");
+    CheckContains(kernelPath, kernelText, "DataAsFlagSendEpochOrdered");
+    CheckContains(kernelPath, kernelText, "DataAsFlagCheckAndRecvEpochOrdered");
+    CheckContains(kernelPath, kernelText, "int32_t magic, int32_t step");
+    CheckContains(kernelPath, kernelText, "int32_t strict");
+    CheckContains(kernelPath, kernelText, "strict != 0");
+
+    const std::string headerPath = "src/include/tilexr_data_as_flag.h";
+    const std::string headerText = ReadFile(headerPath);
+    CheckContains(headerPath, headerText, "DataAsFlagEpochReady");
+    CheckContains(headerPath, headerText, "DataAsFlagCommitEpoch");
+    CheckContains(headerPath, headerText, "DataAsFlagWriteBatchCommitFlag");
+    CheckContains(headerPath, headerText, "DATA_AS_FLAG_COMMIT_BIT");
+    CheckContains(headerPath, headerText, "const uint64_t commitEpoch = DataAsFlagCommitEpoch(epoch)");
+    CheckContains(headerPath, headerText, "DataAsFlagEpochReady(DataAsFlagLoadEpochFlag(dataAsFlagGM, lastBlock, recvScratch), commitEpoch)");
+    CheckNotContains(headerPath, headerText, "DataAsFlagCopyEpochFlagsToGM");
+    CheckNotContains(headerPath, headerText, "return DataAsFlagMaxRecvBlocks(scratchBytes);");
+
+    const std::string hostPath = "tests/udma/demo/tilexr_udma_demo.cpp";
+    const std::string hostText = ReadFile(hostPath);
+    CheckContains(hostPath, hostText, "launch_tilexr_data_as_flag_epoch_ordered_p2p_perf");
+    CheckContains(hostPath, hostText, "P2PTransport::DataAsFlagEpochOrdered");
+    CheckContains(hostPath, hostText, "magic, step");
+    CheckContains(hostPath, hostText, "useLegacyDataAsFlagTransport");
+    CheckContains(hostPath, hostText, "TILEXR_DATA_AS_FLAG_STRICT");
+}
+
+} // namespace
+
+int main()
+{
+    TestMemoryConsumeKernelUsesSyncCollectives();
+    TestMemorySegmentedDiagnosticTransport();
+    TestMemoryConsumeHostWiring();
+    TestMemoryConsumeSweepDefault();
+    TestDataAsFlagEpochOrderedSource();
+    if (g_failures != 0) {
+        std::cerr << g_failures << " TileXR UDMA P2P source guard checks failed" << std::endl;
+        return 1;
+    }
+    std::cout << "TileXR UDMA P2P source guard checks passed" << std::endl;
+    return 0;
+}
diff --git a/tests/udma/unit/test_tilexr_udma_transport_layout.cpp b/tests/udma/unit/test_tilexr_udma_transport_layout.cpp
index cb14015..089f411 100644
--- a/tests/udma/unit/test_tilexr_udma_transport_layout.cpp
+++ b/tests/udma/unit/test_tilexr_udma_transport_layout.cpp
@@ -45,7 +45,7 @@ void TestHostLayoutUsesDeviceRelativePointers()
     constexpr uintptr_t deviceBase = 0x100000000ULL;
     TileXR::UDMAInfo info = {};
     std::vector<uint8_t> bytes;
-    const int ret = TileXR::BuildUDMAInfoImage(deviceBase, sq, rq, scq, rcq, mem, info, bytes);
+    const int ret = TileXR::BuildUDMAInfoImage(deviceBase, 1, sq, rq, scq, rcq, mem, info, bytes);
 
     CHECK_EQ(ret, TileXR::TILEXR_UDMA_LAYOUT_SUCCESS);
     CHECK_EQ(info.qpNum, 1U);
@@ -81,16 +81,43 @@ void TestRejectsMismatchedArrays()
 
     TileXR::UDMAInfo info = {};
     std::vector<uint8_t> bytes;
-    const int ret = TileXR::BuildUDMAInfoImage(0x1000, sq, rq, scq, rcq, mem, info, bytes);
+    const int ret = TileXR::BuildUDMAInfoImage(0x1000, 1, sq, rq, scq, rcq, mem, info, bytes);
     CHECK_EQ(ret, TileXR::TILEXR_UDMA_LAYOUT_INVALID);
 }
 
+void TestHostLayoutSupportsMultipleQps()
+{
+    std::vector<TileXR::UDMAWQCtx> sq(4);
+    std::vector<TileXR::UDMAWQCtx> rq(4);
+    std::vector<TileXR::UDMACQCtx> scq(4);
+    std::vector<TileXR::UDMACQCtx> rcq(4);
+    std::vector<TileXR::UDMAMemInfo> mem(4);
+
+    sq[3].bufAddr = 0x8000;
+    mem[3].tpn = 17;
+
+    constexpr uintptr_t deviceBase = 0x200000000ULL;
+    TileXR::UDMAInfo info = {};
+    std::vector<uint8_t> bytes;
+    const int ret = TileXR::BuildUDMAInfoImage(deviceBase, 2, sq, rq, scq, rcq, mem, info, bytes);
+
+    CHECK_EQ(ret, TileXR::TILEXR_UDMA_LAYOUT_SUCCESS);
+    CHECK_EQ(info.qpNum, 2U);
+    const auto* imageSq = reinterpret_cast<const TileXR::UDMAWQCtx*>(
+        bytes.data() + (info.sqPtr - deviceBase));
+    const auto* imageMem = reinterpret_cast<const TileXR::UDMAMemInfo*>(
+        bytes.data() + (info.memPtr - deviceBase));
+    CHECK_EQ(imageSq[3].bufAddr, static_cast<uint64_t>(0x8000));
+    CHECK_EQ(imageMem[3].tpn, 17U);
+}
+
 } // namespace
 
 int main()
 {
     TestHostLayoutUsesDeviceRelativePointers();
     TestRejectsMismatchedArrays();
+    TestHostLayoutSupportsMultipleQps();
     if (g_failures != 0) {
         std::cerr << g_failures << " UDMA transport layout checks failed" << std::endl;
         return 1;