diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
index 13867ce7e..f63bba614 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi325x.sh
@@ -24,9 +24,10 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
-# following AMD andy's recipe 
+# Following the recipe posted by Andy from AMD:
 # https://www.linkedin.com/posts/andyluo77_day-0-support-of-minimax-25-on-amd-gpu-activity-7428151527309025280-hXR8/
 export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -52,6 +53,7 @@ $EP \
 --max-model-len $MAX_MODEL_LEN \
 --block-size=32 \
 --no-enable-prefix-caching \
+--attention-backend "ROCM_AITER_FA" \
 --trust-remote-code > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 684d40dcc..2ea351f76 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3068,3 +3068,9 @@
   description:
     - "Bump image to rocm/sgl-dev:rocm720-mi35x-8c3b5aa-20260521-DSv4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1548
+
+- config-keys:
+    - minimaxm2.5-fp8-mi325x-vllm
+  description:
+    - "Add VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1 + --attention-backend ROCM_AITER_FA (match AMD-recommended AITER recipe pattern used on mi355x)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1549