diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
index 4cacc2710f10..64b285a0dc1b 100755
--- a/.buildkite/scripts/hardware_ci/run-amd-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -326,8 +326,7 @@ apply_rocm_test_overrides() {
   if [[ $cmds == *" kernels/moe"* ]]; then
     cmds="${cmds} \
     --ignore=kernels/moe/test_moe.py \
-    --ignore=kernels/moe/test_cutlass_moe.py \
-    --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+    --ignore=kernels/moe/test_cutlass_moe.py"
   fi
 
   # --- Entrypoint ignores ---
diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
index 6ec6ab94ff08..1def2c4682b1 100755
--- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
+++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -127,7 +127,7 @@ run_and_track_test() {
 
 # --- Actual Test Execution ---
 run_and_track_test 1 "test_struct_output_generate.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 2 "test_moe_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 3 "test_lora.py" \
diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index 82e97bfbb1b2..1fd3d0e2488d 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -39,8 +39,7 @@
 #####################################################################################################################################
 #                                                                                                                                   #
 # IMPORTANT:                                                                                                                        #
-#   * Currently AMD CI has MI300 agents, MI325 agents, and MI355 agents. Of those, AMD is using mostly MI325 and MI355. AMD team    #
-#     is actively working on enabling more MI300 machines. All upcoming feature improvements are tracked in:                        #
+#   * Currently AMD CI has MI250 agents, MI325 agents, and MI355 agents. All upcoming feature improvements are tracked in:          #
 #         https://github.com/vllm-project/vllm/issues/34994                                                                         #
 #                                                                                                                                   #
 #-----------------------------------------------------------------------------------------------------------------------------------#
@@ -49,13 +48,15 @@
 #   * [Pytorch Nightly Dependency Override Check]: if this test fails, it means the nightly torch version is not compatible with    #
 #                                                  some of the dependencies. Please check the error message and add the package to  #
 #                                                  whitelist in `/vllm/tools/pre_commit/generate_nightly_torch_test.py`.            #
-#   * [Entrypoints Integration Test (LLM)]:                                                                                         #
+#   * [Entrypoints Integration (LLM)]:                                                                                              #
 #     - {`pytest -v -s entrypoints/llm/test_generate.py`}: It needs a clean process                                                 #
 #     - {`pytest -v -s entrypoints/offline_mode`}: Needs to avoid interference with other tests                                     #
-#   * [V1 Test e2e + engine]: The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. See discussion here:        #
-#                             https://github.com/vllm-project/vllm/pull/31040                                                       #
-#   * [V1 others]:                                                                                                                  #
-#     - Split the tests to avoid interference                                                                                       #
+#   * [Engine / Engine (1 GPU) / e2e Scheduling / e2e Core / V1 e2e / Spec Decode / V1 Sample + Logits / V1 Core + KV + Metrics]:   #
+#     - Previously a single "V1 Test e2e + engine" step, now split across multiple groups.                                          #
+#     - V1 e2e (2/4 GPUs) uses 4 GPUs but is scheduled on 8-GPU machines for stability. See:                                        #
+#       https://github.com/vllm-project/vllm/pull/31040                                                                             #
+#   * [V1 Sample + Logits / V1 Core + KV + Metrics / V1 others (CPU)]:                                                              #
+#     - Previously a single "V1 others" step, now split to avoid interference.                                                      #
 #     - Integration test for streaming correctness (requires special branch for __harness__ lib).                                   #
 #   * [V1 others (CPU)]: Split the tests to avoid interference                                                                      #
 #   * [PyTorch Compilation Unit Tests]: Run unit tests defined directly under `compile/`, not including subdirectories, which       #
@@ -83,9 +84,9 @@
 #                                          run plamo2 model in vLLM.                                                                #
 #   * [Language Models Test (Extended Generation)]: Install fast path packages for testing against transformers (mamba, conv1d)     #
 #                                                   and to run plamo2 model in vLLM.                                                #
-#   * [Multi-Modal Models (Standard)]:                                                                                              #
+#   * [Multi-Modal Models (Standard) 1-4]:                                                                                          #
 #     - Do NOT remove `VLLM_WORKER_MULTIPROC_METHOD=spawn` setting as ROCm requires this for certain models to function.            #
-#   * [Transformers Nightly Models Test]: Whisper needs `VLLM_WORKER_MULTIPROC_METHOD=spawn` to avoid deadlock.                     #
+#   * [Transformers Nightly Models]: Whisper needs `VLLM_WORKER_MULTIPROC_METHOD=spawn` to avoid deadlock.                          #
 #   * [Plugin Tests (2 GPUs)]:                                                                                                      #
 #     - {`pytest -v -s entrypoints/openai/test_oot_registration.py`}: It needs a clean process                                      #
 #     - {`pytest -v -s models/test_oot_registration.py`}: It needs a clean process                                                  #
@@ -94,11 +95,11 @@
 #     - There is some Tensor Parallelism related processing logic in LoRA that requires multi-GPU testing for validation.           #
 #     - {`pytest -v -s -x lora/test_gptoss_tp.py`}: Disabled for now because MXFP4 backend on non-cuda platform doesn't support     #
 #                                                   LoRA yet.                                                                       #
-#   * [Distributed Tests (GPU_TAG)]: Don't test llama model here, it seems hf implementation is buggy. See:                         #
-#                                    https://github.com/vllm-project/vllm/pull/5689                                                 #
-#   * [Distributed Tests (GPU_TAG)]: Some old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 in          #
-#                                    favor of new tests in fusions_e2e. We avoid replicating the new jobs in                        #
-#                                    this file as it's deprecated.                                                                  #
+#   * [Distributed Tests (NxGPUs)(HW-TAG)]: Don't test llama model here, it seems hf implementation is buggy. See:                  #
+#                                           https://github.com/vllm-project/vllm/pull/5689                                          #
+#   * [Distributed Tests (NxGPUs)(HW-TAG)]: Some old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293      #
+#                                           in favor of new tests in fusions_e2e. We avoid replicating the new jobs in              #
+#                                           this file as it's deprecated.                                                           #
 #                                                                                                                                   #
 #####################################################################################################################################
 
@@ -223,7 +224,7 @@ steps:
   - vllm/platforms/rocm.py
   commands:
   - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 
 
 - label: Entrypoints Integration (LLM) # TBD
@@ -254,11 +255,11 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/rpc
-  - tests/entrypoints/instrumentator
+  - tests/entrypoints/serve/instrumentator
   - tests/tool_use
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/instrumentator
+  - pytest -v -s entrypoints/serve/instrumentator
   - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
   - pytest -v -s tool_use
 
@@ -483,19 +484,6 @@ steps:
     - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"
 
 
-- label: Entrypoints V1 # TBD
-  timeout_in_minutes: 180
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
-  agent_pool: mi250_1
-  optional: true
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/
-  - tests/v1
-  commands:
-  - pytest -v -s v1/entrypoints
-
-
 - label: V1 Sample + Logits # TBD
   timeout_in_minutes: 60
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
@@ -1173,14 +1161,14 @@ steps:
   - vllm/v1/engine/
   - vllm/v1/worker/
   - tests/v1/distributed
-  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/entrypoints/openai/test_multi_api_servers.py
   - vllm/platforms/rocm.py
   commands:
   - export TORCH_NCCL_BLOCKING_WAIT=1
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py
 
 
 - label: Distributed Compile + RPC Tests (2 GPUs) # TBD
@@ -1402,7 +1390,7 @@ steps:
 - label: Distributed Tests (2 GPUs)(H100-MI250) # TBD
   timeout_in_minutes: 180
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
-  agent_pool: mi250_2
+  agent_pool: mi325_2
   num_gpus: 2
   working_dir: "/vllm-workspace/"
   source_file_dependencies:
@@ -1412,7 +1400,6 @@ steps:
   - vllm/v1/attention/backends/
   - vllm/v1/attention/selector.py
   - tests/distributed/test_context_parallel.py
-  - tests/v1/distributed/test_dbo.py
   - examples/offline_inference/data_parallel.py
   - vllm/_aiter_ops.py
   - vllm/platforms/rocm.py
@@ -1420,7 +1407,6 @@ steps:
   - export TORCH_NCCL_BLOCKING_WAIT=1
   - pytest -v -s tests/distributed/test_context_parallel.py
   - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
-  - pytest -v -s tests/v1/distributed/test_dbo.py
 
 
 #####################################################################################################################################
@@ -1477,11 +1463,11 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/rpc
-  - tests/entrypoints/instrumentator
+  - tests/entrypoints/serve/instrumentator
   - tests/tool_use
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/instrumentator
+  - pytest -v -s entrypoints/serve/instrumentator
   - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
   - pytest -v -s tool_use
 
@@ -1511,8 +1497,6 @@ steps:
   - vllm/distributed/
   - tests/distributed/test_torchrun_example.py
   - tests/distributed/test_torchrun_example_moe.py
-  - examples/offline_inference/rlhf.py
-  - examples/offline_inference/rlhf_colocate.py
   - examples/rl/
   - tests/examples/offline_inference/data_parallel.py
   - vllm/platforms/rocm.py
@@ -1762,6 +1746,7 @@ steps:
   timeout_in_minutes: 106
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_4
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -1770,19 +1755,6 @@ steps:
     - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"
 
 
-- label: Entrypoints V1 # 25.7m
-  timeout_in_minutes: 45
-  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
-  agent_pool: mi325_1
-  optional: true
-  working_dir: "/vllm-workspace/tests"
-  source_file_dependencies:
-  - vllm/
-  - tests/v1
-  commands:
-  - pytest -v -s v1/entrypoints
-
-
 - label: V1 Spec Decode # TBD
   timeout_in_minutes: 40
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
@@ -2395,14 +2367,14 @@ steps:
   - vllm/v1/engine/
   - vllm/v1/worker/
   - tests/v1/distributed
-  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/entrypoints/openai/test_multi_api_servers.py
   - vllm/platforms/rocm.py
   commands:
   - export TORCH_NCCL_BLOCKING_WAIT=1
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py
 
 
 - label: Distributed Compile + RPC Tests (2 GPUs) # 56.1m
@@ -2580,6 +2552,7 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_4
   num_gpus: 4
+  optional: true
   working_dir: "/vllm-workspace/tests"
   source_file_dependencies:
   - vllm/
@@ -2596,21 +2569,16 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_2
   num_gpus: 2
-  optional: true
   working_dir: "/vllm-workspace/"
   source_file_dependencies:
   - vllm/distributed/
   - vllm/v1/distributed/
   - vllm/model_executor/layers/fused_moe/
-  - tests/distributed/test_context_parallel.py
   - tests/v1/distributed/test_dbo.py
-  - examples/offline_inference/data_parallel.py
   - vllm/_aiter_ops.py
   - vllm/platforms/rocm.py
   commands:
   - export TORCH_NCCL_BLOCKING_WAIT=1
-  - pytest -v -s tests/distributed/test_context_parallel.py
-  - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
   - pytest -v -s tests/v1/distributed/test_dbo.py
 
 
@@ -2669,7 +2637,7 @@ steps:
   - vllm/_aiter_ops.py
   - vllm/platforms/rocm.py
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8.txt
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt
 
 
 - label: LM Eval Large Models (H200-MI325) # TBD
@@ -2700,6 +2668,7 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_4
   num_gpus: 4
+  optional: true
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - csrc/
@@ -2720,6 +2689,7 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_4
   num_gpus: 4
+  optional: true
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - csrc/
@@ -2785,6 +2755,7 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_4
   num_gpus: 4
+  optional: true
   working_dir: "/vllm-workspace"
   source_file_dependencies:
   - vllm/model_executor/models/
@@ -2827,6 +2798,7 @@ steps:
   timeout_in_minutes: 11
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_4
+  optional: true
   working_dir: "/vllm-workspace"
   source_file_dependencies:
   - vllm/model_executor/models/
@@ -2848,6 +2820,7 @@ steps:
   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
   agent_pool: mi325_4
   num_gpus: 4
+  optional: true
   working_dir: "/vllm-workspace"
   source_file_dependencies:
   - vllm/model_executor/models/
@@ -2990,11 +2963,11 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/rpc
-  - tests/entrypoints/instrumentator
+  - tests/entrypoints/serve/instrumentator
   - tests/tool_use
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/instrumentator
+  - pytest -v -s entrypoints/serve/instrumentator
   - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
   - pytest -v -s tool_use
 
@@ -3599,7 +3572,7 @@ steps:
   - vllm/_aiter_ops.py
   - vllm/platforms/rocm.py
   commands:
-  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8.txt
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt
 
 
 - label: LM Eval Large Models (4 GPUs)(FP8) # TBD
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
index 6cf8b43f57c4..0b76c0223f93 100644
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -27,14 +27,14 @@ steps:
   - vllm/v1/engine/
   - vllm/v1/worker/
   - tests/v1/distributed
-  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/entrypoints/openai/test_multi_api_servers.py
   commands:
   # https://github.com/NVIDIA/nccl/issues/1838
   - export NCCL_CUMEM_HOST_ENABLE=0
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py
 
 - label: Distributed Compile + RPC Tests (2 GPUs)
   timeout_in_minutes: 20
@@ -88,7 +88,6 @@ steps:
   - vllm/distributed/
   - tests/distributed/test_torchrun_example.py
   - tests/distributed/test_torchrun_example_moe.py
-  - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - examples/rl/
   - tests/examples/offline_inference/data_parallel.py
diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
index be83bab8fa29..ed0df3e4d879 100644
--- a/.buildkite/test_areas/engine.yaml
+++ b/.buildkite/test_areas/engine.yaml
@@ -70,3 +70,15 @@ steps:
       device: mi325_4
       depends_on:
       - image-build-amd
+
+- label: V1 e2e (4xH100)
+  timeout_in_minutes: 60
+  device: h100
+  num_devices: 4
+  optional: true
+  source_file_dependencies:
+    - vllm/v1/attention/backends/utils.py
+    - vllm/v1/worker/gpu_model_runner.py
+    - tests/v1/e2e/test_hybrid_chunked_prefill.py
+  commands:
+    - pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
index ac6be8e141f2..25c22c4ded9d 100644
--- a/.buildkite/test_areas/entrypoints.yaml
+++ b/.buildkite/test_areas/entrypoints.yaml
@@ -10,7 +10,7 @@ steps:
   - tests/entrypoints/
   commands:
   - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 
 - label: Entrypoints Integration (LLM)
   timeout_in_minutes: 40
@@ -34,7 +34,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/test_chat_utils.py
   mirror:
     amd:
@@ -48,11 +48,11 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/rpc
-  - tests/entrypoints/instrumentator
+  - tests/entrypoints/serve/instrumentator
   - tests/tool_use
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/instrumentator
+  - pytest -v -s entrypoints/serve/instrumentator
   - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
   - pytest -v -s tool_use
 
@@ -75,19 +75,6 @@ steps:
   commands:
   - pytest -v -s entrypoints/openai/responses
 
-- label: Entrypoints V1
-  timeout_in_minutes: 50
-  source_file_dependencies:
-    - vllm/
-    - tests/v1
-  commands:
-    - pytest -v -s v1/entrypoints
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-
 - label: OpenAI API Correctness
   timeout_in_minutes: 30
   source_file_dependencies:
diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
index 3e2610e70a31..29f8cb3bc6c1 100644
--- a/.buildkite/test_areas/lm_eval.yaml
+++ b/.buildkite/test_areas/lm_eval.yaml
@@ -45,6 +45,22 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
 
+- label: LM Eval Qwen3.5 Models (B200)
+  timeout_in_minutes: 120
+  device: b200
+  optional: true
+  num_devices: 2
+  source_file_dependencies:
+  - vllm/model_executor/models/qwen3_5.py
+  - vllm/model_executor/models/qwen3_5_mtp.py
+  - vllm/transformers_utils/configs/qwen3_5.py
+  - vllm/transformers_utils/configs/qwen3_5_moe.py
+  - vllm/model_executor/models/qwen3_next.py
+  - vllm/model_executor/models/qwen3_next_mtp.py
+  - vllm/model_executor/layers/fla/ops/
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
+
 - label: LM Eval Large Models (H200)
   timeout_in_minutes: 60
   device: h200
diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml
index 85421399d1b8..238d5956a025 100644
--- a/.buildkite/test_areas/model_runner_v2.yaml
+++ b/.buildkite/test_areas/model_runner_v2.yaml
@@ -11,7 +11,7 @@ steps:
   - vllm/v1/attention/
   - tests/v1/engine/test_llm_engine.py
   - tests/v1/e2e/
-  - tests/v1/entrypoints/llm/test_struct_output_generate.py
+  - tests/entrypoints/llm/test_struct_output_generate.py
   commands:
   - set -x
   - export VLLM_USE_V2_MODEL_RUNNER=1
@@ -22,7 +22,7 @@ steps:
   - pytest -v -s v1/e2e/general/test_context_length.py
   - pytest -v -s v1/e2e/general/test_min_tokens.py
   # Temporary hack filter to exclude ngram spec decoding based tests.
-  - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
+  - pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
 
 - label: Model Runner V2 Examples
   timeout_in_minutes: 45
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index b0e49432775f..c0ceae044d25 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -75,7 +75,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
+/tests/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
 /tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
 /tests/weight_loading @mgoin @youkaichao @yewentao256
diff --git a/.github/mergify.yml b/.github/mergify.yml
index 1c6837277831..eace1f479035 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -260,7 +260,7 @@ pull_request_rules:
       - files=examples/offline_inference/structured_outputs.py
       - files=examples/online_serving/structured_outputs/structured_outputs.py
       - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
+      - files=tests/entrypoints/llm/test_struct_output_generate.py
       - files~=^vllm/v1/structured_output/
   actions:
     label:
diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu
index 442b20e41de5..60e10e53391a 100644
--- a/csrc/rocm/skinny_gemms.cu
+++ b/csrc/rocm/skinny_gemms.cu
@@ -26,6 +26,16 @@
   #define __HIP__GFX9__
 #endif
 
+#if defined(__HIPCC__) &&                                                    \
+    (defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1150__) || \
+     defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__))
+  #define __HIP__GFX1X__
+#endif
+
+#if defined(__HIPCC__) && (defined(__gfx1200__) || defined(__gfx1201__))
+  #define __HIP__GFX12__
+#endif
+
 #if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__))
   #define __HIP__MI3XX__
 #endif
@@ -37,15 +47,31 @@
 #endif
 
 int get_lds_size() {
-  static bool is_cached = false;
-  static int result;
-  if (is_cached == false) {
-    auto dprops = at::cuda::getCurrentDeviceProperties();
-    std::string device_arch = dprops->gcnArchName;
-    size_t substring = device_arch.find("gfx95");
-    result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024);
-    is_cached = true;
-  }
+  static const int result = [] {  // set once: gfx95 -> 160*1024, else 64*1024
+    const auto* dprops = at::cuda::getCurrentDeviceProperties();
+    const std::string device_arch = dprops->gcnArchName;
+    return device_arch.find("gfx95") == std::string::npos ? 64 * 1024
+                                                          : 160 * 1024;
+  }();
+  return result;
+}
+
+bool on_gfx1x() {  // true iff gcnArchName contains "gfx11" or "gfx12"
+  static const bool result = [] {  // initialized on first call only
+    const auto* dprops = at::cuda::getCurrentDeviceProperties();
+    const std::string device_arch = dprops->gcnArchName;
+    return device_arch.find("gfx11") != std::string::npos ||
+           device_arch.find("gfx12") != std::string::npos;
+  }();
+  return result;  // NOTE(review): never re-queried if the device changes
+}
+
+bool on_gfx12() {  // true iff gcnArchName contains "gfx12"
+  static const bool result = [] {  // initialized on first call only
+    const auto* dprops = at::cuda::getCurrentDeviceProperties();
+    const std::string device_arch = dprops->gcnArchName;
+    return device_arch.find("gfx12") != std::string::npos;
+  }();
   return result;
 }
 
@@ -286,21 +312,35 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b,
   return out_c;
 }
 
-#define DOT2C(V0, V2, V3)                                                     \
-  if constexpr (std::is_same_v<scalar_t, half>) {                             \
-    asm("v_dot2c_f32_f16 %0, %2, %3" : "=v"(V0) : "0"(V0), "v"(V2), "v"(V3)); \
-  } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {            \
-    float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) *             \
-               __bfloat1622float2(*((__hip_bfloat162*)(&(V3))));              \
-    V0 += (s.x + s.y);                                                        \
-  }
+#if defined(__HIP__GFX9__) && !defined(__HIP__GFX1X__)  // gfx9: v_dot2c_f32_f16
+  #define DOT2C(V0, V2, V3)                                          \
+    if constexpr (std::is_same_v<scalar_t, half>) {                  \
+      asm("v_dot2c_f32_f16 %0, %2, %3"                               \
+          : "=v"(V0)                                                 \
+          : "0"(V0), "v"(V2), "v"(V3));                              \
+    } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) { \
+      float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) *  \
+                 __bfloat1622float2(*((__hip_bfloat162*)(&(V3))));   \
+      V0 += (s.x + s.y);                                             \
+    }
+#elif defined(__HIP__GFX1X__)
+  // gfx1x: v_dot2_f32_f16 (VOP3-P, dot10-insts, available on gfx11+gfx12)
+  #define DOT2C(V0, V2, V3)                                               \
+    if constexpr (std::is_same_v<scalar_t, half>) {                       \
+      asm("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(V0) : "v"(V2), "v"(V3)); \
+    } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {      \
+      float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) *       \
+                 __bfloat1622float2(*((__hip_bfloat162*)(&(V3))));        \
+      V0 += (s.x + s.y);                                                  \
+    }
+#endif  // DOT2C is left undefined when neither arch macro is set
 
 // To avoid LLVM silently upcasting to double
 __device__ inline unsigned int min__(uint32_t a, uint32_t b) {
   return min(a, b);
 }
 
-#if defined(__HIP__GFX9__)  // TODO: Add NAVI support
+#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
 // This version targets cases where A[] fits LDS capacity
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
@@ -442,14 +482,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
                                                 1);  // row_shr2
           sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
                                                 1);  // row_shr1
+  #if defined(__HIP__GFX9__)
           sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
                                                 1);  // ROW_BCAST15
           sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
                                                 1);  // ROW_BCAST31
+  #else
+          sum[n][y] += __shfl_xor(sum[n][y], 16);
+  #endif
         }
       }
 
-      if (threadIdx.x == 63) {
+      if (threadIdx.x == (THRDS - 1)) {
         scalar_t biases[N][YTILE] = {};
         if (BIAS)
           for (int n = 0; n < N; n++) {
@@ -469,9 +513,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         }
       }
     } else {
-  #pragma unroll
+  #ifdef __HIP__GFX9__
+    #pragma unroll
       for (int n = 0; n < N; n++) {
-  #pragma unroll
+    #pragma unroll
         for (int y = 0; y < YTILE; y++) {
           /*float accm1 = 0;
            for (int i=0; i<64; i++)
@@ -498,7 +543,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
           sum4[n][y][0] = accm;
         }
       }
-      if (threadIdx.x == 63) {
+      if (threadIdx.x == (THRDS - 1)) {
         scalar_t biases[N][YTILE] = {};
         if (BIAS)
           for (int n = 0; n < N; n++) {
@@ -513,11 +558,12 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
           }
         }
       }
+  #endif  // __HIP__GFX9__ (MFMA path)
     }
     m += CuCount * _WvPrGrp * YTILE;
   }
 }
-#else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
+#else   // !defined(__HIP__GFX9__) && !defined(__HIP__GFX1X__)
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap,
@@ -528,9 +574,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap,
                                  const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
+#endif  // defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
 
-#if defined(__HIP__GFX9__)  // TODO: Add NAVI support
+#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
 // This version targets cases where A[] marginally exceeds LDS capacity
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
@@ -657,14 +703,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
                                                 1);  // row_shr2
           sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
                                                 1);  // row_shr1
+  #if defined(__HIP__GFX9__)
           sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
                                                 1);  // ROW_BCAST15
           sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
                                                 1);  // ROW_BCAST31
+  #else
+          sum[n][y] += __shfl_xor(sum[n][y], 16);
+  #endif
         }
       }
 
-      if (threadIdx.x == 63) {
+      if (threadIdx.x == (THRDS - 1)) {
         scalar_t biases[N][YTILE] = {};
         if (BIAS)
           for (int n = 0; n < N; n++) {
@@ -686,9 +736,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         }
       }
     } else {
-  #pragma unroll
+  #ifdef __HIP__GFX9__
+    #pragma unroll
       for (int n = 0; n < N; n++) {
-  #pragma unroll
+    #pragma unroll
         for (int y = 0; y < YTILE; y++) {
           // float accm1 = 0;
           // for (int i=0; i<64; i++)
@@ -713,7 +764,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
           sum4[n][y][0] = accm;
         }
       }
-      if (threadIdx.x == 63) {
+      if (threadIdx.x == (THRDS - 1)) {
         scalar_t biases[N][YTILE] = {};
         if (BIAS)
           for (int n = 0; n < N; n++) {
@@ -730,6 +781,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
           }
         }
       }
+  #endif  // __HIP__GFX9__ (MFMA path)
     }
 
     m += CuCount * _WvPrGrp * YTILE;
@@ -746,7 +798,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   }
 }
 
-#else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
+#else   // !defined(__HIP__GFX9__) && !defined(__HIP__GFX1X__)
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap,
@@ -756,9 +808,9 @@ __global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap,
                              const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
+#endif  // defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
 
-#if defined(__HIP__GFX9__)  // TODO: Add NAVI support
+#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
 // This version targets big A[] cases, where it is much larger than LDS capacity
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
@@ -1004,14 +1056,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
                                                 1);  // row_shr2
           sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf,
                                                 1);  // row_shr1
+  #if defined(__HIP__GFX9__)
           sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf,
                                                 1);  // ROW_BCAST15
           sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf,
                                                 1);  // ROW_BCAST31
+  #else
+          sum[n][y] += __shfl_xor(sum[n][y], 16);
+  #endif
         }
       }
 
-      if (threadIdx.x == 63) {
+      if (threadIdx.x == (THRDS - 1)) {
         scalar_t biases[N][YTILE] = {};
         if (BIAS)
           for (int n = 0; n < N; n++) {
@@ -1033,9 +1089,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         }
       }
     } else {
-  #pragma unroll
+  #ifdef __HIP__GFX9__
+    #pragma unroll
       for (int n = 0; n < N; n++) {
-  #pragma unroll
+    #pragma unroll
         for (int y = 0; y < YTILE; y++) {
           float accm = sum4[n][y][0];
           accm += __builtin_amdgcn_mov_dpp(sum4[n][y][1], 0x101, 0xf, 0xf,
@@ -1057,7 +1114,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
           sum4[n][y][0] = accm;
         }
       }
-      if (threadIdx.x == 63) {
+      if (threadIdx.x == (THRDS - 1)) {
         scalar_t biases[N][YTILE] = {};
         if (BIAS)
           for (int n = 0; n < N; n++) {
@@ -1074,6 +1131,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
           }
         }
       }
+  #endif  // __HIP__GFX9__ (MFMA path)
     }
 
     m += CuCount * _WvPrGrp * YTILE;
@@ -1090,7 +1148,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     }
   }
 }
-#else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
+#else   // !defined(__HIP__GFX9__) && !defined(__HIP__GFX1X__)
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N>
 __global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap,
@@ -1101,7 +1159,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap,
                                  const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
+#endif  // defined(__HIP__GFX9__) || defined(__HIP__GFX1X__)
 
 // Find the min val of div2 that doesn't increase N/(div1*div2)
 int mindiv(int N, int div1, int div2) {
@@ -1148,40 +1206,40 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   const int max_lds_len = get_lds_size() / 2;
 
-#define WVSPLITK(_YTILE, _UNRL, _N)                                           \
+#define WVSPLITK_CFG(_THRDS, _WVPRGRP, _YTILE, _UNRL, _N)                     \
   {                                                                           \
-    dim3 block(64, 16);                                                       \
-    int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16);                       \
+    dim3 block(_THRDS, _WVPRGRP);                                             \
+    int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, _WVPRGRP);                 \
     if ((Kbp_in * N_in <= max_lds_len) && (M_in % _YTILE == 0))               \
-      wvSplitK_hf_sml_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                  \
+      wvSplitK_hf_sml_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N>        \
           <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,     \
                                        By_in, af4, bf4, biasf4, c, __wvPrGrp, \
                                        CuCount);                              \
     else if (Kbp_in * N_in <= max_lds_len * 1.2)                              \
-      wvSplitK_hf_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                      \
+      wvSplitK_hf_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N>            \
           <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,     \
                                        By_in, af4, bf4, biasf4, c, __wvPrGrp, \
                                        CuCount);                              \
     else                                                                      \
-      wvSplitK_hf_big_<fptype, 64, _YTILE, 16, 8, _UNRL, _N>                  \
+      wvSplitK_hf_big_<fptype, _THRDS, _YTILE, _WVPRGRP, 8, _UNRL, _N>        \
           <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,     \
                                        By_in, af4, bf4, biasf4, c, __wvPrGrp, \
                                        CuCount);                              \
   }
 
-#define WVSPLIT_TILE(_sYT, __N)                           \
+#define WVSPLIT_TILE_CFG(_THRDS, _WVPRGRP, _sYT, __N)     \
   {                                                       \
     bool fit_lds = (Kbp_in * N_in <= max_lds_len);        \
     if (_sYT <= 1)                                        \
-      WVSPLITK(1, 4, __N)                                 \
+      WVSPLITK_CFG(_THRDS, _WVPRGRP, 1, 4, __N)           \
     else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \
-      WVSPLITK(2, 2, __N)                                 \
+      WVSPLITK_CFG(_THRDS, _WVPRGRP, 2, 2, __N)           \
     else if (_sYT <= 4 * 3)                               \
-      WVSPLITK(3, 2, __N)                                 \
+      WVSPLITK_CFG(_THRDS, _WVPRGRP, 3, 2, __N)           \
     else if (__N == 4)                                    \
-      WVSPLITK(4, 1, __N)                                 \
+      WVSPLITK_CFG(_THRDS, _WVPRGRP, 4, 1, __N)           \
     else                                                  \
-      WVSPLITK(4, 2, __N)                                 \
+      WVSPLITK_CFG(_THRDS, _WVPRGRP, 4, 2, __N)           \
   }
 
   AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] {
@@ -1198,18 +1256,31 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b,
     // then cut the active waves to balance their distribution...
     int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4);
 
+    const bool use_wave32 = on_gfx1x();
     switch (N_in) {
       case 1:
-        WVSPLIT_TILE(sYT, 1)
+        if (use_wave32)
+          WVSPLIT_TILE_CFG(32, 16, sYT, 1)
+        else
+          WVSPLIT_TILE_CFG(64, 16, sYT, 1)
         break;
       case 2:
-        WVSPLIT_TILE(sYT, 2)
+        if (use_wave32)
+          WVSPLIT_TILE_CFG(32, 16, sYT, 2)
+        else
+          WVSPLIT_TILE_CFG(64, 16, sYT, 2)
         break;
       case 3:
-        WVSPLIT_TILE(sYT, 3)
+        if (use_wave32)
+          WVSPLIT_TILE_CFG(32, 16, sYT, 3)
+        else
+          WVSPLIT_TILE_CFG(64, 16, sYT, 3)
         break;
       case 4:
-        WVSPLIT_TILE(sYT, 4)
+        if (use_wave32)
+          WVSPLIT_TILE_CFG(32, 16, sYT, 4)
+        else
+          WVSPLIT_TILE_CFG(64, 16, sYT, 4)
         break;
       default:
         throw std::runtime_error(
@@ -1653,7 +1724,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   #endif
   }
 }
-#else   // !defined(__HIP__GFX9__) TODO: Add NAVI support
+#else
 template <typename scalar_t, int THRDS, int YTILE, int WvPrGrp, int A_CHUNK,
           int UNRL, int N, int GrpsShrB, int CHUNKK, int DTRMNSTC>
 __global__ void wvSplitKrc_(const int actlN, const int K, const int Kap,
@@ -1688,6 +1759,8 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
   TORCH_CHECK(in_a.dtype() == torch::kFloat16 ||
               in_a.dtype() == torch::kBFloat16);
 
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
+
   auto out_c = torch::empty(
       {N_in, M_in},
       torch::TensorOptions().dtype(in_a.dtype()).device(in_a.device()));
@@ -1696,7 +1769,6 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
 
   dim3 grid(CuCount);
 
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // const int max_lds_len = get_lds_size() / 2;
 
@@ -1773,7 +1845,7 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b,
   return out_c;
 }
 
-#if defined(__HIP__MI3XX__)  // TODO: Add NAVI support
+#if defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
@@ -1817,12 +1889,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 
   uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
 
-  using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
   float sA = *s_A;
   float sB = *s_B;
 
   while (m < M) {
+  #ifdef __HIP__GFX12__
+    // gfx12: per-lane scalar accumulation via v_dot4_f32_fp8_fp8
+    float sum[N][YTILE] = {};
+  #else
+    // gfx9: MFMA accumulation
     scalar8 sum[N][YTILE] = {};
+  #endif
     for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
       bigType bigA[N][UNRL] = {};
       bigType bigB[YTILE][UNRL];
@@ -1854,6 +1931,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   #pragma unroll
       for (uint32_t k2 = 0; k2 < UNRL; k2++) {
         for (uint32_t n = 0; n < N; n++) {
+  #ifdef __HIP__GFX12__
+          // gfx12: 4 x dot4 per A_CHUNK=16 bytes (4 FP8 per dot4)
+          for (int y = 0; y < YTILE; ++y) {
+    #pragma unroll
+            for (int i = 0; i < A_CHUNK / 4; i++) {
+              sum[n][y] = __builtin_amdgcn_dot4_f32_fp8_fp8(
+                  bigA[n][k2].i[i], bigB[y][k2].i[i], sum[n][y]);
+            }
+          }
+  #else
+          // gfx9: MFMA path
           for (int i = 0; i < A_CHUNK; i += 8) {
             for (int y = 0; y < YTILE; ++y) {
               sum[n][y] = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
@@ -1861,11 +1949,33 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
                   0);
             }
           }
+  #endif
         }
       }
     }
 
     // Final reduction
+  #ifdef __HIP__GFX12__
+    // gfx12 wave32: DPP row_shr within 16-lane rows + cross-row shuffle
+    for (int n = 0; n < N; n++) {
+      for (int y = 0; y < YTILE; y++) {
+        asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
+            : "=v"(sum[n][y])
+            : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+        asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
+            : "=v"(sum[n][y])
+            : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+        asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
+            : "=v"(sum[n][y])
+            : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+        asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:1 bound_ctrl:0 "
+            : "=v"(sum[n][y])
+            : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+        sum[n][y] += __shfl_xor(sum[n][y], 16);
+      }
+    }
+  #else
+    // gfx9 MFMA reduction
     for (int n = 0; n < N; n++) {
       for (int y = 0; y < YTILE; y++) {
         float accm0 = sum[n][y][0];
@@ -1880,8 +1990,15 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         sum[n][y][0] = accm0;
       }
     }
+  #endif
 
-    if (threadIdx.x == 0) {
+    const bool writeback_lane =
+  #ifdef __HIP__GFX12__
+        threadIdx.x == (THRDS - 1);
+  #else
+        threadIdx.x == 0;
+  #endif
+    if (writeback_lane) {
       scalar_t biases[N][YTILE] = {};
       if (BIAS)
         for (int n = 0; n < N; n++) {
@@ -1892,13 +2009,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
       for (int n = 0; n < N; n++) {
         for (int y = 0; y < YTILE; y++) {
           if (y + m >= M) break;  // To avoid mem access fault.
-          sum[n][y][0] *= sA * sB;
+  #ifdef __HIP__GFX12__
+          float result = sum[n][y] * sA * sB;
+  #else
+          float result = sum[n][y][0] * sA * sB;
+  #endif
           if constexpr (std::is_same_v<scalar_t, half>) {
-            sum[n][y][0] += __half2float(biases[n][y]);
+            result += __half2float(biases[n][y]);
           } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-            sum[n][y][0] += __bfloat162float(biases[n][y]);
+            result += __bfloat162float(biases[n][y]);
           }
-          C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
+          C[m + y + n * M] = __float2s<scalar_t>(result);
         }
       }
     }
@@ -1906,7 +2027,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     m += CuCount * _WvPrGrp * YTILE;
   }
 }
-#else   // !defined(__HIP__MI3XX__) TODO: Add NAVI support
+#else   // !defined(__HIP__MI3XX__) && !defined(__HIP__GFX12__)
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp,
@@ -1918,9 +2039,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp,
                                   const int _WvPrGrp, const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI3XX__) TODO: Add NAVI support
+#endif  // defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
 
-#if defined(__HIP__MI3XX__)  // TODO: Add NAVI support
+#if defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void __launch_bounds__(WvPrGrp* THRDS)
@@ -1963,12 +2084,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
 
   uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE;
 
-  using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float;
   float sA = *s_A;
   float sB = *s_B;
 
   while (m < M) {
+  #ifdef __HIP__GFX12__
+    // gfx12: per-lane scalar accumulation via v_dot4_f32_fp8_fp8
+    float sum[N][YTILE] = {};
+  #else
+    // gfx9: MFMA accumulation
     scalar8 sum[N][YTILE] = {};
+  #endif
     for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) {
       bigType bigA[N][UNRL] = {};
       bigType bigB[YTILE][UNRL];
@@ -2002,6 +2128,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
   #pragma unroll
       for (uint32_t k2 = 0; k2 < UNRL; k2++) {
         for (uint32_t n = 0; n < N; n++) {
+  #ifdef __HIP__GFX12__
+          // gfx12: 4 x dot4 per A_CHUNK=16 bytes (4 FP8 per dot4)
+          for (int y = 0; y < YTILE; ++y) {
+    #pragma unroll
+            for (int i = 0; i < A_CHUNK / 4; i++) {
+              sum[n][y] = __builtin_amdgcn_dot4_f32_fp8_fp8(
+                  bigA[n][k2].i[i], bigB[y][k2].i[i], sum[n][y]);
+            }
+          }
+  #else
+          // gfx9: MFMA path
           for (int i = 0; i < A_CHUNK; i += 8) {
             for (int y = 0; y < YTILE; ++y) {
               sum[n][y] = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(
@@ -2009,11 +2146,33 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
                   0);
             }
           }
+  #endif
         }
       }
     }
 
     // Final reduction
+  #ifdef __HIP__GFX12__
+    // gfx12 wave32: DPP row_shr within 16-lane rows + cross-row shuffle
+    for (int n = 0; n < N; n++) {
+      for (int y = 0; y < YTILE; y++) {
+        asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 "
+            : "=v"(sum[n][y])
+            : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+        asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 "
+            : "=v"(sum[n][y])
+            : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+        asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 "
+            : "=v"(sum[n][y])
+            : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+        asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:1 bound_ctrl:0 "
+            : "=v"(sum[n][y])
+            : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y]));
+        sum[n][y] += __shfl_xor(sum[n][y], 16);
+      }
+    }
+  #else
+    // gfx9 MFMA reduction
     for (int n = 0; n < N; n++) {
       for (int y = 0; y < YTILE; y++) {
         float accm0 = sum[n][y][0];
@@ -2028,8 +2187,15 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
         sum[n][y][0] = accm0;
       }
     }
+  #endif
 
-    if (threadIdx.x == 0) {
+    const bool writeback_lane =
+  #ifdef __HIP__GFX12__
+        threadIdx.x == (THRDS - 1);
+  #else
+        threadIdx.x == 0;
+  #endif
+    if (writeback_lane) {
       scalar_t biases[N][YTILE] = {};
       if (BIAS)
         for (int n = 0; n < N; n++) {
@@ -2040,13 +2206,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
       for (int n = 0; n < N; n++) {
         for (int y = 0; y < YTILE; y++) {
           if (y + m >= M) break;  // To avoid mem access fault.
-          sum[n][y][0] *= sA * sB;
+  #ifdef __HIP__GFX12__
+          float result = sum[n][y] * sA * sB;
+  #else
+          float result = sum[n][y][0] * sA * sB;
+  #endif
           if constexpr (std::is_same_v<scalar_t, half>) {
-            sum[n][y][0] += __half2float(biases[n][y]);
+            result += __half2float(biases[n][y]);
           } else if constexpr (std::is_same_v<scalar_t, __hip_bfloat16>) {
-            sum[n][y][0] += __bfloat162float(biases[n][y]);
+            result += __bfloat162float(biases[n][y]);
           }
-          C[m + y + n * M] = __float2s<scalar_t>(sum[n][y][0]);
+          C[m + y + n * M] = __float2s<scalar_t>(result);
         }
       }
     }
@@ -2054,7 +2224,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS)
     m += CuCount * _WvPrGrp * YTILE;
   }
 }
-#else   // !defined(__HIP__MI3XX__) TODO: Add NAVI support
+#else   // !defined(__HIP__MI3XX__) && !defined(__HIP__GFX12__)
 template <typename scalar_t, typename fp8_t, int THRDS, int YTILE, int WvPrGrp,
           int A_CHUNK, int UNRL, int N>
 __global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp,
@@ -2066,7 +2236,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp,
                               const int CuCount) {
   UNREACHABLE_CODE
 }
-#endif  // defined(__HIP__MI3XX__) TODO: Add NAVI support
+#endif  // defined(__HIP__MI3XX__) || defined(__HIP__GFX12__)
 
 void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
                const std::optional<at::Tensor>& in_bias, at::Tensor& out_c,
@@ -2099,24 +2269,30 @@ void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   const int max_lds_len = get_lds_size();
 
-#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N)             \
-  {                                                                           \
-    dim3 block(64, _WvPrGrp);                                                 \
-    if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {            \
-      int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16));     \
-      wvSplitKQ_hf_sml_<fptype, fp8_t, 64, _YTILEs, _WvPrGrp, 16, _UNRLs, _N> \
-          <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,     \
-                                       By_in, b_ptr, a_ptr, bias_ptr, c_ptr,  \
-                                       s_a, s_b, __wvPrGrp, CuCount);         \
-    } else {                                                                  \
-      int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16));     \
-      wvSplitKQ_hf_<fptype, fp8_t, 64, _YTILEm, _WvPrGrp, 16, _UNRLm, _N>     \
-          <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,     \
-                                       By_in, b_ptr, a_ptr, bias_ptr, c_ptr,  \
-                                       s_a, s_b, __wvPrGrp, CuCount);         \
-    }                                                                         \
+#define WVSPLITKQ_IMPL(_THRDS, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
+  {                                                                            \
+    dim3 block(_THRDS, _WvPrGrp);                                              \
+    if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) {             \
+      int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16));      \
+      wvSplitKQ_hf_sml_<fptype, fp8_t, _THRDS, _YTILEs, _WvPrGrp, 16, _UNRLs,  \
+                        _N><<<grid, block, 0, stream>>>(                       \
+          K_in, Kap_in, Kbp_in, M_in, Bx_in, By_in, b_ptr, a_ptr, bias_ptr,    \
+          c_ptr, s_a, s_b, __wvPrGrp, CuCount);                                \
+    } else {                                                                   \
+      int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16));      \
+      wvSplitKQ_hf_<fptype, fp8_t, _THRDS, _YTILEm, _WvPrGrp, 16, _UNRLm, _N>  \
+          <<<grid, block, 0, stream>>>(K_in, Kap_in, Kbp_in, M_in, Bx_in,      \
+                                       By_in, b_ptr, a_ptr, bias_ptr, c_ptr,   \
+                                       s_a, s_b, __wvPrGrp, CuCount);          \
+    }                                                                          \
   }
 
+#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N)      \
+  if (on_gfx12())                                                      \
+    WVSPLITKQ_IMPL(32, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \
+  else                                                                 \
+    WVSPLITKQ_IMPL(64, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N)
+
   AT_DISPATCH_REDUCED_FLOATING_TYPES(out_c.scalar_type(), "wvSplitKQ", [&] {
     using fptype = typename scalar<scalar_t>::type;
     auto c_ptr = reinterpret_cast<fptype*>(out_c.data_ptr());
@@ -2136,10 +2312,10 @@ void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a,
           WVSPLITKQ(16, 2, 2, 2, 2, 2)
           break;
         case 3:
-          WVSPLITKQ(16, 2, 2, 2, 2, 3)
+          WVSPLITKQ(16, 2, 2, 1, 1, 3)
           break;
         case 4:
-          WVSPLITKQ(16, 2, 2, 2, 2, 4)
+          WVSPLITKQ(16, 2, 2, 1, 1, 4)
           break;
         default:
           throw std::runtime_error(
diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base
index c6e972e89d00..e5a216c77ba6 100644
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@@ -44,7 +44,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 
 # Install Python and other dependencies
 RUN apt-get update -y \
-    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev \
+    && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev liblzma-dev pkg-config \
     && for i in 1 2 3; do \
         add-apt-repository -y ppa:deadsnakes/ppa && break || \
         { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md
index e4bb0b696727..1d12d63549a0 100644
--- a/docs/contributing/profiling.md
+++ b/docs/contributing/profiling.md
@@ -3,6 +3,10 @@
 !!! warning
     Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference.
 
+!!! tip "Choosing a profiler"
+    - Use **Nsight Systems** for low-overhead, performance-critical profiling.
+    - Use **PyTorch Profiler** for medium-overhead profiling with richer debugging information (e.g., stack traces, memory, shapes). Note that enabling these features adds overhead and is not recommended for benchmarking.
+
 ## Profile with PyTorch Profiler
 
 We support tracing vLLM workers using different profilers. You can enable profiling by setting the `--profiler-config` flag when launching the server.
diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md
index 3d2e02e9d165..6045a4014209 100644
--- a/docs/design/moe_kernel_features.md
+++ b/docs/design/moe_kernel_features.md
@@ -88,8 +88,8 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k
 | flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
 | gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
 | marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
-| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
-| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] |
+| trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],</br>[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],</br>[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],</br>[`TrtLlmNvFp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] |
+| rocm aiter moe | standard | mxfp4,</br>fp8 | G(32),G(128),A,T | silu, gelu,</br>swigluoai | Y | N | `rocm_aiter_fused_experts`,</br>`AiterExperts` |
 | cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] |
 | naive batched<sup>4</sup> | batched | int8,</br>fp8 | G,A,T | silu, gelu | <sup>6</sup> | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] |
 
diff --git a/docs/models/pooling_models/README.md b/docs/models/pooling_models/README.md
index b34cc1efe6ae..02e2c82cf009 100644
--- a/docs/models/pooling_models/README.md
+++ b/docs/models/pooling_models/README.md
@@ -31,28 +31,29 @@ Of course, we also have "plugin" tasks that allow users to customize input and o
 
 ### Pooling Tasks
 
-| Pooling Tasks      | Granularity   | Outputs                                         |
-|--------------------|---------------|-------------------------------------------------|
-| `classify`         | Sequence-wise | probability vector of classes for each sequence |
-| `score` (see note) | Sequence-wise | reranker score for each sequence                |
-| `embed`            | Sequence-wise | vector representations for each sequence        |
-| `token_classify`   | Token-wise    | probability vector of classes for each token    |
-| `token_embed`      | Token-wise    | vector representations for each token           |
+| Pooling Tasks         | Granularity   | Outputs                                         |
+|-----------------------|---------------|-------------------------------------------------|
+| `classify` (see note) | Sequence-wise | probability vector of classes for each sequence |
+| `embed`               | Sequence-wise | vector representations for each sequence        |
+| `token_classify`      | Token-wise    | probability vector of classes for each token    |
+| `token_embed`         | Token-wise    | vector representations for each token           |
 
 !!! note
     Within classification tasks, there is a specialized subcategory: Cross-encoder (aka reranker) models. These models are a subset of classification models that accept two prompts as input and output num_labels equal to 1.
 
 ### Score Types
 
-| Pooling Tasks      | Granularity   | Outputs                                         | Score Types        | scoring function         |
-|--------------------|---------------|-------------------------------------------------|--------------------|--------------------------|
-| `classify`         | Sequence-wise | probability vector of classes for each sequence | nan                | nan                      |
-| `score` (see note) | Sequence-wise | reranker score for each sequence                | `cross-encoder`    | linear classifier        |
-| `embed`            | Sequence-wise | vector representations for each sequence        | `bi-encoder`       | cosine similarity        |
-| `token_classify`   | Token-wise    | probability vector of classes for each token    | nan                | nan                      |
-| `token_embed`      | Token-wise    | vector representations for each token           | `late-interaction` | late interaction(MaxSim) |
+Scoring models are designed to compute similarity scores between two input prompts. They support three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`.
 
-The score models is designed to compute similarity scores between two input prompts. It supports three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`.
+| Pooling Tasks         | Granularity   | Outputs                                      | Score Types        | scoring function         |
+|-----------------------|---------------|----------------------------------------------|--------------------|--------------------------|
+| `classify` (see note) | Sequence-wise | reranker score for each sequence             | `cross-encoder`    | linear classifier        |
+| `embed`               | Sequence-wise | vector representations for each sequence     | `bi-encoder`       | cosine similarity        |
+| `token_classify`      | Token-wise    | probability vector of classes for each token | nan                | nan                      |
+| `token_embed`         | Token-wise    | vector representations for each token        | `late-interaction` | late interaction(MaxSim) |
+
+!!! note
+    Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
 
 ### Pooling Usages
 
@@ -85,14 +86,16 @@ enabling the corresponding APIs.
 
 ### Offline APIs corresponding to pooling tasks
 
-| Task             | APIs                                                                       |
-|------------------|----------------------------------------------------------------------------|
-| `embed`          | `LLM.embed(...)`,`LLM.encode(..., pooling_task="embed")`, `LLM.score(...)` |
-| `classify`       | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")`            |
-| `score`          | `LLM.score(...)`                                                           |
-| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")`        |
-| `token_embed`    | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)`            |
-| `plugin`         | `LLM.encode(..., pooling_task="plugin")`                                   |
+| Task             | APIs                                                                                  |
+|------------------|---------------------------------------------------------------------------------------|
+| `embed`          | `LLM.embed(...)`, `LLM.encode(..., pooling_task="embed")`, `LLM.score(...)`(see note) |
+| `classify`       | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")`, `LLM.score(...)`     |
+| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")`                   |
+| `token_embed`    | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)`                       |
+| `plugin`         | `LLM.encode(..., pooling_task="plugin")`                                              |
+
+!!! note
+    Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
 
 ### `LLM.classify`
 
@@ -206,11 +209,11 @@ If `--runner pooling` has been set (manually or automatically) but the model doe
 vLLM will attempt to automatically convert the model according to the architecture names
 shown in the table below.
 
-| Architecture                                    | `--convert` | Supported pooling tasks               |
-| ----------------------------------------------- | ----------- | ------------------------------------- |
-| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed`     | `token_embed`, `embed`                |
-| `*ForRewardModeling`, `*RewardModel`            | `embed`     | `token_embed`, `embed`                |
-| `*For*Classification`, `*ClassificationModel`   | `classify`  | `token_classify`, `classify`, `score` |
+| Architecture                                    | `--convert` | Supported pooling tasks      |
+|-------------------------------------------------|-------------|------------------------------|
+| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed`     | `token_embed`, `embed`       |
+| `*ForRewardModeling`, `*RewardModel`            | `embed`     | `token_embed`, `embed`       |
+| `*For*Classification`, `*ClassificationModel`   | `classify`  | `token_classify`, `classify` |
 
 !!! tip
     You can explicitly set `--convert <type>` to specify how to convert the model.
@@ -251,3 +254,7 @@ Pooling models now default support all pooling, you can use it without any setti
 
 - Extracting hidden states prefers using `token_embed` task.
 - Named Entity Recognition (NER) and reward models prefers using `token_classify` task.
+
+### Score task
+
+The `score` task is deprecated and will be removed in v0.20. Please use `classify` instead. Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
diff --git a/docs/models/pooling_models/classify.md b/docs/models/pooling_models/classify.md
index 10d7892b5361..1247bb4a0bbc 100644
--- a/docs/models/pooling_models/classify.md
+++ b/docs/models/pooling_models/classify.md
@@ -17,6 +17,8 @@ The key distinction between (sequence) classification and token classification l
 
 Many classification models support both (sequence) classification and token classification. For further details on token classification, please refer to [this page](token_classify.md).
 
+Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled. For more information, please refer to [this page](scoring.md).
+
 ## Typical Use Cases
 
 ### Classification
@@ -54,7 +56,7 @@ If your model is not in the above list, we will try to automatically convert the
 
 Cross-encoder (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. Most classification models can also be used as [cross-encoder models](scoring.md#cross-encoder-models). For more information on cross-encoder models, please refer to [this page](scoring.md).
 
---8<-- "docs/models/pooling_models/scoring.md:supported-score-models"
+--8<-- "docs/models/pooling_models/scoring.md:supported-cross-encoder-models"
 
 ### Reward Models
 
diff --git a/docs/models/pooling_models/scoring.md b/docs/models/pooling_models/scoring.md
index 6227b689acb0..ac94a0cd76bc 100644
--- a/docs/models/pooling_models/scoring.md
+++ b/docs/models/pooling_models/scoring.md
@@ -10,11 +10,11 @@ The score models is designed to compute similarity scores between two input prom
 - Model Usage: Scoring
 - Pooling Task:
 
-| Score Types        | Pooling Tasks | scoring function         |
-|--------------------|---------------|--------------------------|
-| `cross-encoder`    | `score`       | linear classifier        |
-| `late-interaction` | `token_embed` | late interaction(MaxSim) |
-| `bi-encoder`       | `embed`       | cosine similarity        |
+| Score Types        | Pooling Tasks         | scoring function         |
+|--------------------|-----------------------|--------------------------|
+| `cross-encoder`    | `classify` (see note) | linear classifier        |
+| `late-interaction` | `token_embed`         | late interaction(MaxSim) |
+| `bi-encoder`       | `embed`               | cosine similarity        |
 
 - Offline APIs:
     - `LLM.score`
@@ -22,13 +22,16 @@ The score models is designed to compute similarity scores between two input prom
     - [Score API](scoring.md#score-api) (`/score`)
     - [Rerank API](scoring.md#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
 
+!!! note
+    Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled.
+
 ## Supported Models
 
 ### Cross-encoder models
 
 [Cross-encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1.
 
---8<-- [start:supported-score-models]
+--8<-- [start:supported-cross-encoder-models]
 
 #### Text-only Models
 
@@ -99,7 +102,7 @@ The score models is designed to compute similarity scores between two input prom
     vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
     ```
 
---8<-- [end:supported-score-models]
+--8<-- [end:supported-cross-encoder-models]
 
 ### Late-interaction models
 
diff --git a/docs/models/pooling_models/specific_models.md b/docs/models/pooling_models/specific_models.md
index 4b0027a3dd4b..0d908c1aa1a3 100644
--- a/docs/models/pooling_models/specific_models.md
+++ b/docs/models/pooling_models/specific_models.md
@@ -11,6 +11,7 @@ vLLM supports ColBERT models with multiple encoder backbones:
 | `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` |
 | `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` |
 | `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` |
+| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` |
 
 **BERT-based ColBERT** models work out of the box:
 
@@ -29,6 +30,10 @@ vllm serve lightonai/GTE-ModernColBERT-v1 \
 vllm serve jinaai/jina-colbert-v2 \
     --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \
     --trust-remote-code
+
+# LFM2 backbone
+vllm serve LiquidAI/LFM2-ColBERT-350M \
+    --hf-overrides '{"architectures": ["ColBERTLfm2Model"]}'
 ```
 
 Then you can use the rerank API:
diff --git a/docs/models/pooling_models/token_embed.md b/docs/models/pooling_models/token_embed.md
index c950d2e99376..e847fb09bcbb 100644
--- a/docs/models/pooling_models/token_embed.md
+++ b/docs/models/pooling_models/token_embed.md
@@ -39,6 +39,7 @@ Models of any architecture can be converted into embedding models using `--conve
 
 | Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) |
 | ------------ | ------ | ----------------- | -------------------- | ------------------------- |
+| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | | |
 | `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | |
 | `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | |
 | `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | |
diff --git a/requirements/test.in b/requirements/test.in
index 8bd00514435b..be4c2e5795f4 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test
 peft>=0.15.0 # required for phi-4-mm test
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
+resampy # required for audio tests
 sentence-transformers>=5.2.0 # required for embedding tests
 soundfile # required for audio tests
 jiwer # required for audio tests
diff --git a/requirements/test.txt b/requirements/test.txt
index e2f9040beecc..7d3a988a729d 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -544,6 +544,7 @@ numba==0.61.2
     # via
     #   -r requirements/test.in
     #   librosa
+    #   resampy
 numpy==2.2.6
     # via
     #   -r requirements/test.in
@@ -584,6 +585,7 @@ numpy==2.2.6
     #   pyogrio
     #   pywavelets
     #   rasterio
+    #   resampy
     #   rioxarray
     #   rouge-score
     #   runai-model-streamer
@@ -995,6 +997,8 @@ requests==2.32.3
     #   tiktoken
     #   transformers
     #   wandb
+resampy==0.4.3
+    # via -r requirements/test.in
 responses==0.25.3
     # via genai-perf
 rfc3339-validator==0.1.4
diff --git a/setup.py b/setup.py
index 7b5c49e98b6b..2f251a6a296d 100644
--- a/setup.py
+++ b/setup.py
@@ -987,11 +987,11 @@ def _read_requirements(filename: str) -> list[str]:
         "instanttensor": ["instanttensor >= 0.1.5"],
         "runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
         "audio": [
-            "librosa",
+            "av",
+            "resampy",
             "scipy",
             "soundfile",
             "mistral_common[audio]",
-            "av",
         ],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         "flashinfer": [],  # Kept for backwards compatibility
diff --git a/tests/compile/fusions_e2e/conftest.py b/tests/compile/fusions_e2e/conftest.py
index 5716c95bb241..7cd2acdf56c2 100644
--- a/tests/compile/fusions_e2e/conftest.py
+++ b/tests/compile/fusions_e2e/conftest.py
@@ -84,7 +84,10 @@ def run(
 
         # TODO: remove this after finishing migration from envs to model kwargs
         if model_name == "openai/gpt-oss-20b":
-            monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
+            from .common import is_blackwell
+
+            if is_blackwell():
+                monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1")
 
         # Disable, compile cache to make sure custom passes run.
         # Otherwise, we can't verify fusion happened through the logs.
diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py
index 9f6a1a13e8ea..8a5191ed226c 100644
--- a/tests/compile/test_aot_compile.py
+++ b/tests/compile/test_aot_compile.py
@@ -14,6 +14,7 @@
 import pytest
 import torch
 
+import vllm.envs as envs
 import vllm.model_executor.layers.activation
 from vllm.compilation.backends import VllmBackend
 from vllm.compilation.caching import (
@@ -162,6 +163,9 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch):
 
 @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
 def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch):
+    from torch._subclasses import FakeTensorMode
+    from torch.fx.experimental.symbolic_shapes import ShapeEnv
+
     def foo(x: torch.Tensor):
         return x[slice(0, x.shape[0])]
 
@@ -172,12 +176,13 @@ def foo(x: torch.Tensor):
     gm = torch.fx.symbolic_trace(foo)
     assert "getitem_1 = x[slice(0, getitem, None)]" in gm.code
     with use_vllm_config(vllm_config):
-        payload = VllmSerializableFunction.serialize_compile_artifacts(
-            VllmSerializableFunction(gm, (example_input,), "", foo)
+        payload = VllmSerializableFunction.serialize_graph_module(gm)
+        fake_mode = FakeTensorMode(shape_env=ShapeEnv())
+        loaded_gm = VllmSerializableFunction.deserialize_graph_module(
+            payload, fake_mode
         )
-        fn = VllmSerializableFunction.deserialize_compile_artifacts(payload)
 
-    assert gm.code == fn.graph_module.code
+    assert gm.code == loaded_gm.code
 
 
 @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10")
@@ -725,6 +730,10 @@ def test_deduplication(self):
         ]:
             assert cache.get(submod, shape) == shared_data
 
+    @pytest.mark.skipif(
+        envs.VLLM_USE_MEGA_AOT_ARTIFACT,
+        reason="There's no AOT Autograd run with mega artifact",
+    )
     def test_functorch_config(self):
         vllm_config = make_vllm_config()
         example_inputs = (torch.randn(10, 10),)
diff --git a/tests/compile/test_startup.py b/tests/compile/test_startup.py
index 545299565c16..32a586011590 100644
--- a/tests/compile/test_startup.py
+++ b/tests/compile/test_startup.py
@@ -9,11 +9,15 @@
 
 import multiprocessing as mp
 
+import pytest
 from torch._dynamo.utils import counters
 
+import vllm.envs as envs
 from vllm.compilation.counter import compilation_counter
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
 
+from ..utils import fork_new_process_for_each_test
+
 MODEL = "microsoft/Phi-tiny-MoE-instruct"
 
 
@@ -45,8 +49,11 @@ def _cold_start(vllm_runner):
     assert counters["aot_autograd"]["autograd_cache_hit"] == 0
 
 
-def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
+@fork_new_process_for_each_test
+@pytest.mark.parametrize("mega_aot_artifact", ["0", "1"])
+def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact):
     monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+    monkeypatch.setenv("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact)
 
     # Cold start in a forked child (must fork before CUDA init).
     # This model has 32 identical transformer layers which produce
@@ -64,7 +71,12 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache):
         num_compiled_artifacts_saved=0,
     ):
         _run_vllm(vllm_runner)
-    assert counters["aot_autograd"]["total"] == 30
+    if envs.VLLM_USE_MEGA_AOT_ARTIFACT:
+        # MEGA_AOT_ARTIFACT is enabled, so we expect no aot_autograd running on
+        # subgraphs.
+        assert counters["aot_autograd"]["total"] == 0
+    else:
+        assert counters["aot_autograd"]["total"] == 30
     assert counters["aot_autograd"]["autograd_cache_miss"] == 0
     assert (
         counters["aot_autograd"]["autograd_cache_hit"] == 0
diff --git a/tests/conftest.py b/tests/conftest.py
index 719bfa5ed1f0..f3b22d898903 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,9 +6,6 @@
 
 from tblib import pickling_support
 
-# Import fixture
-from tests.v1.entrypoints.conftest import sample_json_schema  # noqa
-
 # ruff: noqa
 
 # Install support for pickling exceptions so that we can nicely propagate
@@ -81,6 +78,55 @@
 
 logger = init_logger(__name__)
 
+
+@pytest.fixture
+def sample_json_schema():
+    return {
+        "type": "object",
+        "properties": {
+            "name": {"type": "string"},
+            "age": {"type": "integer"},
+            "skills": {
+                "type": "array",
+                "items": {
+                    "type": "string",
+                },
+            },
+            "grade": {
+                "type": "string",
+                "pattern": "^[A-D]$",
+            },
+            "email": {
+                "type": "string",
+                "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
+            },
+            "work_history": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "company": {"type": "string"},
+                        "duration": {
+                            "type": "number",
+                            "minimum": 0.0,
+                            "maximum": 100.0,
+                        },
+                        "position": {"type": "string"},
+                    },
+                    "required": ["company", "duration", "position"],
+                    "additionalProperties": False,
+                },
+                "minItems": 0,
+                "maxItems": 3,
+            },
+        },
+        "required": ["name", "age", "skills", "grade", "email", "work_history"],
+        "additionalProperties": False,
+        "minProperties": 1,
+        "maxProperties": 10,
+    }
+
+
 _TEST_DIR = os.path.dirname(__file__)
 _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/entrypoints/llm/test_struct_output_generate.py
similarity index 91%
rename from tests/v1/entrypoints/llm/test_struct_output_generate.py
rename to tests/entrypoints/llm/test_struct_output_generate.py
index 70c6d250bc1b..3ece27234368 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/entrypoints/llm/test_struct_output_generate.py
@@ -24,6 +24,108 @@
     StructuredOutputsParams,
 )
 
+SAMPLE_REGEX = (
+    r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+    r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
+)
+
+# Note: Ensure this only uses attributes compatible with xgrammar
+SAMPLE_JSON_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "name": {"type": "string"},
+        "age": {"type": "integer"},
+        "skills": {
+            "type": "array",
+            "items": {
+                "type": "string",
+            },
+        },
+        "grade": {
+            "type": "string",
+            "pattern": "^[A-D]$",  # Regex pattern
+        },
+        "email": {
+            "type": "string",
+            "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
+        },
+        "work_history": {
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "company": {"type": "string"},
+                    "duration": {
+                        "type": "number",
+                        "minimum": 0.0,
+                        "maximum": 100.0,  # Numeric range
+                    },
+                    "position": {"type": "string"},
+                },
+                "required": ["company", "duration", "position"],
+                "additionalProperties": False,
+            },
+            "minItems": 0,
+            "maxItems": 3,
+        },
+    },
+    "required": ["name", "age", "skills", "grade", "email", "work_history"],
+    "additionalProperties": False,
+    "minProperties": 1,
+    "maxProperties": 10,
+}
+
+# A schema unsupported by xgrammar
+UNSUPPORTED_JSON_SCHEMA = {
+    "type": "object",
+    "properties": {
+        "score": {
+            "type": "integer",
+            "multipleOf": 5,  # Numeric multiple
+        },
+        "tags": {
+            "type": "array",
+            "items": {"type": "string", "minLength": 10, "maxLength": 20},
+        },
+    },
+    "required": ["score", "tags"],
+    "additionalProperties": False,
+    "patternProperties": {
+        "^score$": {"type": "integer"},
+    },
+}
+
+SAMPLE_STRUCTURED_OUTPUTS_CHOICES = [
+    "Python",
+    "Java",
+    "JavaScript",
+    "C++",
+    "C#",
+    "PHP",
+    "TypeScript",
+    "Ruby",
+    "Swift",
+    "Kotlin",
+]
+
+SAMPLE_SQL_EBNF = """
+root ::= select_statement
+select_statement ::= "SELECT" column "from" table "where" condition
+column ::= "col_1" | "col_2"
+table ::= "table_1" | "table_2"
+condition ::= column "=" number
+number ::= "1" | "2"
+"""
+
+SAMPLE_SQL_LARK = """
+start: select_statement
+select_statement: "SELECT" column "from" table "where" condition
+column: "col_1" | "col_2"
+table: "table_1" | "table_2"
+condition: column "=" number
+number: "1" | "2"
+"""
+
 NGRAM_SPEC_CONFIG = {
     "model": "[ngram]",
     "num_speculative_tokens": 5,
@@ -110,17 +212,17 @@ class CarDescription(BaseModel):
     PARAMS_MODELS_BACKENDS_TOKENIZER_MODE,
 )
 def test_structured_output(
-    sample_json_schema: dict[str, Any],
-    unsupported_json_schema: dict[str, Any],
-    sample_sql_ebnf: str,
-    sample_sql_lark: str,
-    sample_regex: str,
-    sample_structured_outputs_choices: str,
     backend: str,
     tokenizer_mode: str,
     model_name: str,
     speculative_config: dict[str, Any],
 ):
+    sample_json_schema = SAMPLE_JSON_SCHEMA
+    unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA
+    sample_sql_ebnf = SAMPLE_SQL_EBNF
+    sample_sql_lark = SAMPLE_SQL_LARK
+    sample_regex = SAMPLE_REGEX
+    sample_structured_outputs_choices = SAMPLE_STRUCTURED_OUTPUTS_CHOICES
     if current_platform.is_tpu() and speculative_config:
         pytest.skip("TPU does not support speculative decoding")
 
@@ -702,10 +804,10 @@ def test_structured_output_with_reasoning_matrices(
 
 @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE)
 def test_structured_output_auto_mode(
-    unsupported_json_schema: dict[str, Any],
     model_name: str,
     tokenizer_mode: str,
 ):
+    unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA
     llm = LLM(
         model=model_name,
         max_model_len=1024,
@@ -808,9 +910,9 @@ def generate_with_backend(backend):
 
 @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"])
 def test_structured_output_batched_with_non_structured_outputs_requests(
-    sample_json_schema: dict[str, Any],
     backend: str,
 ):
+    sample_json_schema = SAMPLE_JSON_SCHEMA
     # Don't use eager execution on TPUs because we want to test for no
     # recompilation at runtime
     enforce_eager = bool(not current_platform.is_tpu())
diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/entrypoints/openai/chat_completion/test_chat_completion.py
similarity index 100%
rename from tests/v1/entrypoints/openai/test_chat_completion.py
rename to tests/entrypoints/openai/chat_completion/test_chat_completion.py
diff --git a/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py b/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py
index 704598a5708b..965b21351302 100644
--- a/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py
+++ b/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py
@@ -231,13 +231,14 @@ def k2_server():
         "--gpu-memory-utilization",
         "0.4",
     ] + ROCM_EXTRA_ARGS
-    # hack to test kimi_k2 tool use tool_id format.
-    # avoid error in is_deepseek_mla check by setting kv_lora_rank=null
+    # Test kimi_k2 tool use tool_id format by overriding model_type.
+    # is_deepseek_mla safely returns False via getattr when kv_lora_rank
+    # is absent from the underlying config.
     with RemoteOpenAIServer(
         MODEL_NAME,
         args,
         env_dict=ROCM_ENV_OVERRIDES,
-        override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None},
+        override_hf_configs={"model_type": "kimi_k2"},
     ) as remote_server:
         yield remote_server
 
diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/entrypoints/openai/chat_completion/test_completion_with_image_embeds.py
similarity index 100%
rename from tests/v1/entrypoints/openai/test_completion_with_image_embeds.py
rename to tests/entrypoints/openai/chat_completion/test_completion_with_image_embeds.py
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/completion/test_completion.py
similarity index 100%
rename from tests/v1/entrypoints/openai/test_completion.py
rename to tests/entrypoints/openai/completion/test_completion.py
diff --git a/tests/entrypoints/openai/cpu/__init__.py b/tests/entrypoints/openai/cpu/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py b/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
index e9bde638d4a3..4ac48699a022 100644
--- a/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
+++ b/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py
@@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name)
             model_name,
             foscolo,
             language="it",
-            expected_text="ove il mio corpo fanciulletto giacque",
+            expected_text="ove il mio corpo fanciulletto",
         )
diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/entrypoints/openai/test_multi_api_servers.py
similarity index 100%
rename from tests/v1/entrypoints/openai/test_multi_api_servers.py
rename to tests/entrypoints/openai/test_multi_api_servers.py
diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py
index cf7e2a7b0c07..bf670105bbc4 100644
--- a/tests/entrypoints/openai/test_run_batch.py
+++ b/tests/entrypoints/openai/test_run_batch.py
@@ -275,7 +275,7 @@
     ]
 )
 
-MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA="
+MINIMAL_WAV_BASE64 = "UklGRigAAABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YQQAAAAAAP9/"
 INPUT_TRANSCRIPTION_BATCH = (
     json.dumps(
         {
diff --git a/tests/entrypoints/instrumentator/__init__.py b/tests/entrypoints/serve/instrumentator/__init__.py
similarity index 100%
rename from tests/entrypoints/instrumentator/__init__.py
rename to tests/entrypoints/serve/instrumentator/__init__.py
diff --git a/tests/entrypoints/instrumentator/test_basic.py b/tests/entrypoints/serve/instrumentator/test_basic.py
similarity index 99%
rename from tests/entrypoints/instrumentator/test_basic.py
rename to tests/entrypoints/serve/instrumentator/test_basic.py
index 5f48fb266efb..1ab963dc1801 100644
--- a/tests/entrypoints/instrumentator/test_basic.py
+++ b/tests/entrypoints/serve/instrumentator/test_basic.py
@@ -11,11 +11,10 @@
 import requests
 from fastapi import Request
 
+from tests.utils import RemoteOpenAIServer
 from vllm.v1.engine.exceptions import EngineDeadError
 from vllm.version import __version__ as VLLM_VERSION
 
-from ...utils import RemoteOpenAIServer
-
 MODEL_NAME = "Qwen/Qwen3-0.6B"
 
 
diff --git a/tests/entrypoints/instrumentator/test_metrics.py b/tests/entrypoints/serve/instrumentator/test_metrics.py
similarity index 100%
rename from tests/entrypoints/instrumentator/test_metrics.py
rename to tests/entrypoints/serve/instrumentator/test_metrics.py
diff --git a/tests/entrypoints/instrumentator/test_optional_middleware.py b/tests/entrypoints/serve/instrumentator/test_optional_middleware.py
similarity index 98%
rename from tests/entrypoints/instrumentator/test_optional_middleware.py
rename to tests/entrypoints/serve/instrumentator/test_optional_middleware.py
index c2c7fbdb0114..fef10cdc0cdf 100644
--- a/tests/entrypoints/instrumentator/test_optional_middleware.py
+++ b/tests/entrypoints/serve/instrumentator/test_optional_middleware.py
@@ -10,7 +10,7 @@
 import pytest
 import requests
 
-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer
 
 # Use a small embeddings model for faster startup and smaller memory footprint.
 # Since we are not testing any chat functionality,
diff --git a/tests/entrypoints/instrumentator/test_orca_metrics.py b/tests/entrypoints/serve/instrumentator/test_orca_metrics.py
similarity index 98%
rename from tests/entrypoints/instrumentator/test_orca_metrics.py
rename to tests/entrypoints/serve/instrumentator/test_orca_metrics.py
index 1ce043df0cd8..923951367767 100644
--- a/tests/entrypoints/instrumentator/test_orca_metrics.py
+++ b/tests/entrypoints/serve/instrumentator/test_orca_metrics.py
@@ -5,7 +5,7 @@
 import pytest
 import pytest_asyncio
 
-from ...utils import RemoteOpenAIServer
+from tests.utils import RemoteOpenAIServer
 
 # any model with a chat template should work here
 MODEL_NAME = "Qwen/Qwen3-0.6B"
diff --git a/tests/entrypoints/instrumentator/test_sleep.py b/tests/entrypoints/serve/instrumentator/test_sleep.py
similarity index 100%
rename from tests/entrypoints/instrumentator/test_sleep.py
rename to tests/entrypoints/serve/instrumentator/test_sleep.py
diff --git a/tests/entrypoints/openai/cpu/test_render.py b/tests/entrypoints/serve/render/test_render.py
similarity index 100%
rename from tests/entrypoints/openai/cpu/test_render.py
rename to tests/entrypoints/serve/render/test_render.py
diff --git a/tests/entrypoints/openai/cpu/test_render_multimodal.py b/tests/entrypoints/serve/render/test_render_multimodal.py
similarity index 100%
rename from tests/entrypoints/openai/cpu/test_render_multimodal.py
rename to tests/entrypoints/serve/render/test_render_multimodal.py
diff --git a/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml
new file mode 100644
index 000000000000..62be504e2c52
--- /dev/null
+++ b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml
@@ -0,0 +1,8 @@
+model_name: "Qwen/Qwen3.5-35B-A3B"
+accuracy_threshold: 0.86
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --max-model-len 4096
+  --data-parallel-size 2
+  --enable-expert-parallel
diff --git a/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml
new file mode 100644
index 000000000000..9380e0b25803
--- /dev/null
+++ b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml
@@ -0,0 +1,9 @@
+model_name: "Qwen/Qwen3.5-35B-A3B-FP8"
+accuracy_threshold: 0.86
+num_questions: 1319
+num_fewshot: 5
+server_args: >-
+  --max-model-len 4096
+  --data-parallel-size 2
+  --enable-expert-parallel
+  --kv-cache-dtype fp8
diff --git a/tests/evals/gsm8k/configs/models-mi3xx-quantized.txt b/tests/evals/gsm8k/configs/models-mi3xx-fp8-and-mixed.txt
similarity index 100%
rename from tests/evals/gsm8k/configs/models-mi3xx-quantized.txt
rename to tests/evals/gsm8k/configs/models-mi3xx-fp8-and-mixed.txt
diff --git a/tests/evals/gsm8k/configs/models-qwen35-blackwell.txt b/tests/evals/gsm8k/configs/models-qwen35-blackwell.txt
new file mode 100644
index 000000000000..4e7af71c7f4a
--- /dev/null
+++ b/tests/evals/gsm8k/configs/models-qwen35-blackwell.txt
@@ -0,0 +1 @@
+Qwen3.5-35B-A3B-DEP2.yaml
diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py
index 347205755c68..3ebf9cc3713a 100644
--- a/tests/kernels/attention/test_attention_selector.py
+++ b/tests/kernels/attention/test_attention_selector.py
@@ -14,8 +14,19 @@
 )
 from vllm.platforms import current_platform
 from vllm.platforms.cpu import CpuPlatform
-from vllm.platforms.cuda import CudaPlatform
-from vllm.platforms.rocm import RocmPlatform
+
+# CudaPlatform and RocmPlatform import their respective compiled C extensions
+# at module level, raising ModuleNotFoundError on incompatible builds.
+try:
+    from vllm.platforms.cuda import CudaPlatform
+except (ImportError, ModuleNotFoundError):
+    CudaPlatform = None
+
+try:
+    from vllm.platforms.rocm import RocmPlatform
+except (ImportError, ModuleNotFoundError):
+    RocmPlatform = None
+
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend
 
@@ -101,6 +112,8 @@ def test_backend_selection(
             assert backend.get_name() == "CPU_ATTN"
 
         elif device == "hip":
+            if RocmPlatform is None:
+                pytest.skip("RocmPlatform not available")
             with patch("vllm.platforms.current_platform", RocmPlatform()):
                 if use_mla:
                     # ROCm MLA backend logic:
@@ -126,6 +139,8 @@ def test_backend_selection(
                     assert backend.get_name() == expected
 
         elif device == "cuda":
+            if CudaPlatform is None:
+                pytest.skip("CudaPlatform not available")
             with patch("vllm.platforms.current_platform", CudaPlatform()):
                 capability = torch.cuda.get_device_capability()
                 if use_mla:
@@ -214,7 +229,7 @@ def test_backend_selection(
                     assert backend.get_name() == expected
 
 
-@pytest.mark.parametrize("device", ["cpu", "cuda"])
+@pytest.mark.parametrize("device", ["cpu", "cuda", "hip"])
 def test_fp32_fallback(device: str):
     """Test attention backend selection with fp32."""
     # Use default config (no backend specified)
@@ -227,10 +242,25 @@ def test_fp32_fallback(device: str):
             assert backend.get_name() == "CPU_ATTN"
 
         elif device == "cuda":
+            if CudaPlatform is None:
+                pytest.skip("CudaPlatform not available")
             with patch("vllm.platforms.current_platform", CudaPlatform()):
                 backend = get_attn_backend(16, torch.float32, None)
             assert backend.get_name() == "FLEX_ATTENTION"
 
+        elif device == "hip":
+            if RocmPlatform is None:
+                pytest.skip("RocmPlatform not available")
+            # ROCm backends do not support head_size=16 (minimum is 32).
+            # No known HuggingFace transformer model uses head_size=16.
+            # Revisit if a real model with this head size is identified
+            # and accuracy-tested.
+            with (
+                patch("vllm.platforms.current_platform", RocmPlatform()),
+                pytest.raises(ValueError, match="No valid attention backend"),
+            ):
+                get_attn_backend(16, torch.float32, None)
+
 
 def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
     """Test FlashAttn validation."""
@@ -367,6 +397,8 @@ def test_per_head_quant_scales_backend_selection(
         attention_config=attention_config, cache_config=cache_config
     )
 
+    if CudaPlatform is None:
+        pytest.skip("CudaPlatform not available")
     with (
         set_current_vllm_config(vllm_config),
         patch("vllm.platforms.current_platform", CudaPlatform()),
diff --git a/tests/kernels/moe/test_cutedsl_moe.py b/tests/kernels/moe/test_cutedsl_moe.py
index 66a97b48bdc3..bca3eba0f91c 100644
--- a/tests/kernels/moe/test_cutedsl_moe.py
+++ b/tests/kernels/moe/test_cutedsl_moe.py
@@ -17,7 +17,7 @@
 from torch.nn import functional as F
 
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import (
+from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutedsl_moe import (
     flashinfer_cutedsl_moe_masked,
 )
 from vllm.utils.flashinfer import (
diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py
index 630ea2e3fe9d..1b2067148bd8 100644
--- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py
+++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py
@@ -6,6 +6,7 @@
 import torch
 import torch.nn.functional as F
 
+from vllm.platforms import current_platform
 from vllm.utils.import_utils import has_triton_kernels
 
 if not has_triton_kernels():
@@ -14,6 +15,7 @@
         allow_module_level=True,
     )
 
+import triton_kernels.matmul_ogs_details.opt_flags as opt_flags
 import triton_kernels.swiglu
 from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
 from triton_kernels.numerics import InFlexData
@@ -21,12 +23,16 @@
 from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
 from triton_kernels.tensor_details import layout
 from triton_kernels.testing import assert_close
+from triton_kernels.topk import topk as topk_fn
 
 from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config
 from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
+    legacy_routing,
+    make_routing_data,
     triton_kernel_moe_forward,
 )
 from vllm.utils.math_utils import round_up
+from vllm.utils.torch_utils import set_random_seed
 
 from .utils import shuffle_weight
 
@@ -299,6 +305,12 @@ def test_equiv(num_token, a_dtype, w_dtype, tp, workspace_init):
         pc2,
     ) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=8)
 
+    if current_platform.is_device_capability_family(100):
+        constraints = {
+            "is_persistent": True,
+        }
+        opt_flags.update_opt_flags_constraints(constraints)
+
     if a_dtype == "bf16" and w_dtype == "mx4":
         quant_config = mxfp4_w4a16_moe_quant_config(
             w1_scale=pc1,
@@ -355,3 +367,43 @@ def test_unit_shuffle():
     )
 
     assert_close(ref=out_ref, tri=out)
+
+
+@pytest.mark.parametrize("num_tokens", [2, 8, 64])
+@pytest.mark.parametrize("num_experts", [32, 128])
+@pytest.mark.parametrize("topk", [1, 4])
+@pytest.mark.parametrize("renormalize", [True, False])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+def test_legacy_routing(
+    num_tokens: int, num_experts: int, topk: int, renormalize: bool, dtype: torch.dtype
+):
+    set_random_seed(0)
+    gating_output = torch.randn(num_tokens, num_experts, device="cuda", dtype=dtype)
+
+    sm_first = not renormalize
+    logits = gating_output
+    if sm_first:
+        logits = torch.softmax(logits, dim=-1)
+    sparse_logits = topk_fn(logits, topk, apply_softmax=not sm_first)
+    topk_ids = sparse_logits.indx.to(torch.long)
+    topk_weights = sparse_logits.vals
+    routing_data_ref, gather_indx_ref, scatter_indx_ref = make_routing_data(
+        topk_ids, topk_weights, num_experts
+    )
+
+    routing_data, gather_indx, scatter_indx = legacy_routing(
+        gating_output, topk, sm_first=sm_first
+    )
+
+    assert_close(
+        ref=gather_indx_ref.src_indx, tri=gather_indx.src_indx, maxtol=0, rmstol=0
+    )
+    assert_close(
+        ref=gather_indx_ref.dst_indx, tri=gather_indx.dst_indx, maxtol=0, rmstol=0
+    )
+    assert_close(
+        ref=scatter_indx_ref.src_indx, tri=scatter_indx.src_indx, maxtol=0, rmstol=0
+    )
+    assert_close(
+        ref=scatter_indx_ref.dst_indx, tri=scatter_indx.dst_indx, maxtol=0, rmstol=0
+    )
diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py
index cf9021663809..e54e7a9cd18e 100644
--- a/tests/kernels/moe/test_ocp_mx_moe.py
+++ b/tests/kernels/moe/test_ocp_mx_moe.py
@@ -82,7 +82,7 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase):
         model_case.model_id,
         tensor_parallel_size=model_case.tp,
         load_format="dummy",
-        cudagraph_capture_sizes=[16],
+        compilation_config={"cudagraph_capture_sizes": [16]},
     ) as llm:
         # Disabled as check_model is broken: https://github.com/vllm-project/vllm/pull/18465#issuecomment-3329880562
         # def check_model(model):
diff --git a/tests/kernels/quantization/test_mxfp4_triton_ep.py b/tests/kernels/quantization/test_mxfp4_triton_ep.py
index d4eb91058906..6c8aebe42c07 100644
--- a/tests/kernels/quantization/test_mxfp4_triton_ep.py
+++ b/tests/kernels/quantization/test_mxfp4_triton_ep.py
@@ -17,89 +17,6 @@
 import pytest
 import torch
 
-from vllm.model_executor.layers.quantization.mxfp4 import (
-    Mxfp4Backend,
-    Mxfp4MoEMethod,
-)
-
-
-def _make_mock_moe_config(ep_size: int = 1) -> MagicMock:
-    """Create a mock FusedMoEConfig with the given EP size."""
-    parallel_config = MagicMock()
-    parallel_config.ep_size = ep_size
-
-    moe_config = MagicMock()
-    moe_config.ep_size = ep_size
-    moe_config.is_lora_enabled = False
-    moe_config.moe_parallel_config = parallel_config
-    return moe_config
-
-
-class TestMxfp4TritonIsMonolithic:
-    """Verify that is_monolithic is always True for the TRITON backend,
-    regardless of EP size, since triton_kernel_moe_forward now handles
-    expert_map remapping internally."""
-
-    @pytest.mark.parametrize(
-        "backend,ep_size,expected_monolithic",
-        [
-            # TRITON is always monolithic (handles EP via expert_map remapping)
-            (Mxfp4Backend.TRITON, 1, True),
-            (Mxfp4Backend.TRITON, 2, True),
-            (Mxfp4Backend.TRITON, 4, True),
-            # SM100 backends are always monolithic
-            (Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM, 1, True),
-            (Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM, 2, True),
-            (Mxfp4Backend.SM100_FI_MXFP4_BF16, 1, True),
-            (Mxfp4Backend.SM100_FI_MXFP4_BF16, 2, True),
-            # MARLIN is never monolithic
-            (Mxfp4Backend.MARLIN, 1, False),
-            (Mxfp4Backend.MARLIN, 2, False),
-        ],
-        ids=[
-            "triton-no-ep",
-            "triton-ep2",
-            "triton-ep4",
-            "sm100-trtllm-no-ep",
-            "sm100-trtllm-ep2",
-            "sm100-bf16-no-ep",
-            "sm100-bf16-ep2",
-            "marlin-no-ep",
-            "marlin-ep2",
-        ],
-    )
-    @patch(
-        "vllm.model_executor.layers.quantization.mxfp4.get_mxfp4_backend",
-    )
-    @patch(
-        "vllm.model_executor.layers.quantization.mxfp4.get_current_vllm_config",
-    )
-    def test_is_monolithic(
-        self,
-        mock_get_config,
-        mock_get_backend,
-        backend,
-        ep_size,
-        expected_monolithic,
-    ):
-        """is_monolithic should be True for TRITON regardless of EP size."""
-        mock_get_backend.return_value = backend
-
-        mock_compilation_config = MagicMock()
-        mock_compilation_config.max_cudagraph_capture_size = 1024
-        mock_vllm_config = MagicMock()
-        mock_vllm_config.compilation_config = mock_compilation_config
-        mock_get_config.return_value = mock_vllm_config
-
-        moe_config = _make_mock_moe_config(ep_size=ep_size)
-        method = Mxfp4MoEMethod(moe_config)
-
-        assert method.is_monolithic == expected_monolithic, (
-            f"Expected is_monolithic={expected_monolithic} for "
-            f"backend={backend.name}, ep_size={ep_size}, "
-            f"but got {method.is_monolithic}."
-        )
-
 
 class TestTritonMoeForwardExpertMap:
     """Test that triton_kernel_moe_forward applies expert_map remapping
diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py
index 91b774c47464..d2123db2e8da 100644
--- a/tests/kernels/quantization/test_rocm_skinny_gemms.py
+++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py
@@ -160,6 +160,8 @@ def test_rocm_wvsplitkrc_kernel(xnorm, n, k, m, dtype, seed, padded_a, bias_mode
         BIAS = torch.rand(m, dtype=dtype, device="cuda") * 2 - 1
     elif bias_mode == 2:
         BIAS = torch.rand(n, m, dtype=dtype, device="cuda") * 2 - 1
+    elif bias_mode == 3:
+        BIAS = torch.rand(1, m, dtype=dtype, device="cuda") * 2 - 1
 
     ref_out = torch.nn.functional.linear(A, B, BIAS)
     out = ops.wvSplitKrc(A, B, cu_count, BIAS)
@@ -224,10 +226,9 @@ def test_rocm_wvsplitk_kernel(
     ref_out = torch.nn.functional.linear(A, B, BIAS)
     out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS)
 
-    if xnorm:
-        assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-8)
-    else:
-        assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-2)
+    # Accumulation error in fp16 GEMM scales with sqrt(K)
+    atol = torch.finfo(dtype).eps * math.sqrt(k)
+    torch.testing.assert_close(out, ref_out, atol=atol, rtol=1e-2)
 
 
 @pytest.mark.parametrize("xnorm", [False, True])
diff --git a/tests/model_executor/layers/test_rocm_unquantized_gemm.py b/tests/model_executor/layers/test_rocm_unquantized_gemm.py
new file mode 100644
index 000000000000..c435a6e72422
--- /dev/null
+++ b/tests/model_executor/layers/test_rocm_unquantized_gemm.py
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from unittest.mock import MagicMock
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+
+if current_platform.is_cuda():
+    pytest.skip(
+        "ROCm skinny GEMM tests are not supported on CUDA.",
+        allow_module_level=True,
+    )
+
+from vllm.model_executor.layers import utils
+
+
+def test_rocm_unquantized_gemm_gfx1x_wvsplitk_path(monkeypatch):
+    x = torch.randn(1, 64, dtype=torch.float16)
+    weight = torch.randn(128, 64, dtype=torch.float16)
+
+    monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False)
+    monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: True)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: False)
+    monkeypatch.setattr(utils, "get_cu_count", lambda: 120)
+
+    wvsplitk_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock)
+    llmm1_mock = MagicMock(side_effect=lambda w, x_view, _: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "LLMM1", llmm1_mock)
+
+    out = utils.rocm_unquantized_gemm_impl(x, weight, None)
+    ref = torch.nn.functional.linear(x, weight, None)
+
+    wvsplitk_mock.assert_called_once()
+    llmm1_mock.assert_not_called()
+    assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)
+
+
+def test_rocm_unquantized_gemm_gfx1x_n_gt_4_falls_back(monkeypatch):
+    x = torch.randn(5, 64, dtype=torch.float16)
+    weight = torch.randn(128, 64, dtype=torch.float16)
+
+    monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False)
+    monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: True)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: False)
+    monkeypatch.setattr(utils, "get_cu_count", lambda: 120)
+
+    wvsplitk_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock)
+    llmm1_mock = MagicMock(side_effect=lambda w, x_view, _: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "LLMM1", llmm1_mock)
+
+    out = utils.rocm_unquantized_gemm_impl(x, weight, None)
+    ref = torch.nn.functional.linear(x, weight, None)
+
+    wvsplitk_mock.assert_not_called()
+    llmm1_mock.assert_not_called()
+    assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)
+
+
+def test_rocm_unquantized_gemm_gfx950_wvsplitkrc_path(monkeypatch):
+    x = torch.randn(16, 1024, dtype=torch.float16)
+    weight = torch.randn(256, 1024, dtype=torch.float16)
+
+    monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False)
+    monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: False)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False)
+    monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: True)
+    monkeypatch.setattr(utils, "get_cu_count", lambda: 120)
+
+    wvsplitkrc_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "wvSplitKrc", wvsplitkrc_mock)
+    wvsplitk_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t())
+    monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock)
+
+    out = utils.rocm_unquantized_gemm_impl(x, weight, None)
+    ref = torch.nn.functional.linear(x, weight, None)
+
+    wvsplitkrc_mock.assert_called_once()
+    wvsplitk_mock.assert_not_called()
+    assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)
diff --git a/tests/models/language/pooling/test_colbert.py b/tests/models/language/pooling/test_colbert.py
index 6edd9c28c519..a245f879ba2b 100644
--- a/tests/models/language/pooling/test_colbert.py
+++ b/tests/models/language/pooling/test_colbert.py
@@ -59,6 +59,22 @@
             "model_cls": "AutoModel",
         },
     },
+    "lfm2": {
+        "model": "LiquidAI/LFM2-ColBERT-350M",
+        "colbert_dim": 128,
+        "max_model_len": 511,
+        "extra_kwargs": {
+            "hf_overrides": {
+                "architectures": ["ColBERTLfm2Model"],
+            },
+        },
+        "hf_comparison": {
+            "weights_file": "1_Dense/model.safetensors",
+            "weights_key": "linear.weight",
+            "trust_remote_code": False,
+            "model_cls": "AutoModel",
+        },
+    },
 }
 
 
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index c16efd065e1b..1404d9628faa 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -220,7 +220,10 @@
         vllm_runner_kwargs={
             "model_impl": "transformers",
         },
-        marks=[pytest.mark.core_model],
+        marks=[
+            pytest.mark.core_model,
+            *([large_gpu_mark(min_gb=80)] if current_platform.is_rocm() else []),
+        ],
     ),
     "idefics3-transformers": VLMTestInfo(
         models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
@@ -542,8 +545,12 @@
         auto_cls=AutoModelForImageTextToText,
     ),
     "isaac": VLMTestInfo(
+        # NOTE: PerceptronAI/Isaac-0.1 removed because the upstream HF
+        # repo has a stale model.safetensors.index.json that references
+        # shard files which no longer exist (consolidated into a single
+        # model.safetensors on 2026-03-20). Re-add once upstream fixes
+        # the index file.
         models=[
-            "PerceptronAI/Isaac-0.1",
             "PerceptronAI/Isaac-0.2-2B-Preview",
         ],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py
index 1519a50c1a0c..f0650d4c234d 100644
--- a/tests/models/multimodal/generation/test_granite_speech.py
+++ b/tests/models/multimodal/generation/test_granite_speech.py
@@ -39,7 +39,11 @@ def vllm_to_hf_output(
 def granite_speech_attention_config():
     """Return attention config for Granite Speech tests on ROCm."""
     if current_platform.is_rocm():
-        return {"backend": "ROCM_AITER_FA"}
+        from vllm.platforms.rocm import on_mi3xx
+
+        if on_mi3xx():
+            return {"backend": "ROCM_AITER_FA"}
+        return {"backend": "TRITON_ATTN"}
     return None
 
 
diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py
index 47852453c058..1b7e2347be2f 100644
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info(
         test_info.audio_idx_to_prompt,
         test_info.prompt_formatter,
     )
-    resampler = AudioResampler(
-        target_sr=16000,
-        method="librosa",
-    )
+    resampler = AudioResampler(target_sr=16000)
     audios = [asset.audio_and_sample_rate for asset in audio_assets]
     resampled_audios = [
         (
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index 9bdedb3c5c25..0a692387cffc 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -24,6 +24,7 @@
     GenerationConfig,
     GenerationMixin,
 )
+from transformers.masking_utils import create_causal_mask
 from transformers.video_utils import VideoMetadata
 
 from vllm.logprobs import SampleLogprobs
@@ -680,10 +681,14 @@ def patched_forward(
         sin = sin.to(inputs_embeds.dtype)
 
         # Prepare attention mask
-        if attention_mask is not None:
-            attention_mask = self._update_causal_mask(
-                attention_mask, inputs_embeds, cache_position, past_key_values, False
-            )
+        attention_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+            cache_position=cache_position,
+        )
 
         # Initialize and collect hidden states
         hidden_states = inputs_embeds
diff --git a/tests/models/multimodal/pooling/test_llama_nemotron_vl.py b/tests/models/multimodal/pooling/test_llama_nemotron_vl.py
index 4c92d41c31db..6bea808152f6 100644
--- a/tests/models/multimodal/pooling/test_llama_nemotron_vl.py
+++ b/tests/models/multimodal/pooling/test_llama_nemotron_vl.py
@@ -22,8 +22,10 @@
     ChatCompletionContentPartTextParam,
 )
 from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
+from vllm.platforms import current_platform
 
 from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....utils import ROCM_ENGINE_KWARGS
 from ...utils import check_embeddings_close
 
 # Prefixes used by the model API
@@ -70,6 +72,7 @@ def _run_test(
         max_model_len=2048,
         enforce_eager=True,
         trust_remote_code=True,
+        **ROCM_ENGINE_KWARGS,
     ) as vllm_model:
         vllm_outputs = vllm_model.embed(input_texts, images=input_images)
 
@@ -250,6 +253,7 @@ def _run_vllm_reranker(
         max_model_len=2048,
         enforce_eager=True,
         trust_remote_code=True,
+        **ROCM_ENGINE_KWARGS,
     ) as vllm_model:
         has_images = any(img is not None for _, img in docs)
 
@@ -322,8 +326,11 @@ def _run_reranker_test(
     assert len(hf_scores) == len(vllm_scores), (
         f"Output length mismatch: HF={len(hf_scores)}, vLLM={len(vllm_scores)}"
     )
+    # NOTE: ROCm shows slightly higher numerical variance due to the different
+    # attention backends used by vLLM and HF; use a marginally looser tolerance.
+    rel_tol = 0.022 if current_platform.is_rocm() else 0.02
     for i, (hf_score, vllm_score) in enumerate(zip(hf_scores, vllm_scores)):
-        assert hf_score == pytest.approx(vllm_score, rel=0.02), (
+        assert hf_score == pytest.approx(vllm_score, rel=rel_tol), (
             f"Score mismatch at index {i}: HF={hf_score:.4f}, vLLM={vllm_score:.4f}"
         )
 
diff --git a/tests/models/registry.py b/tests/models/registry.py
index aac707a9065b..ff997706ccf2 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -628,6 +628,11 @@ def check_available_online(
         trust_remote_code=True,
         hf_overrides={"architectures": ["ColBERTJinaRobertaModel"]},
     ),
+    "ColBERTLfm2Model": _HfExamplesInfo(
+        "LiquidAI/LFM2-ColBERT-350M",
+        trust_remote_code=True,
+        hf_overrides={"architectures": ["ColBERTLfm2Model"]},
+    ),
     # [Multimodal]
     "ColModernVBertForRetrieval": _HfExamplesInfo(
         "ModernVBERT/colmodernvbert-merged",
diff --git a/tests/multimodal/media/test_audio.py b/tests/multimodal/media/test_audio.py
index 18f142008c31..4361066ab885 100644
--- a/tests/multimodal/media/test_audio.py
+++ b/tests/multimodal/media/test_audio.py
@@ -10,6 +10,8 @@
 
 from vllm.multimodal.media import AudioMediaIO
 
+from ...conftest import AudioTestAssets
+
 pytestmark = pytest.mark.cpu_test
 
 ASSETS_DIR = Path(__file__).parent.parent / "assets"
@@ -22,40 +24,32 @@ def dummy_audio():
 
 
 @pytest.fixture
-def dummy_audio_bytes():
-    return b"FAKEAUDIOBYTES"
+def dummy_audio_bytes(audio_assets: AudioTestAssets):
+    with open(audio_assets[0].get_local_path(), "rb") as f:
+        return f.read()
 
 
 def test_audio_media_io_load_bytes(dummy_audio_bytes):
     audio_io = AudioMediaIO()
-    with patch("librosa.load") as mock_load:
-        mock_load.return_value = (np.array([0.1, 0.2]), 16000)
-        out = audio_io.load_bytes(dummy_audio_bytes)
-        mock_load.assert_called_once()
-        assert isinstance(out[0], np.ndarray)
-        assert out[1] == 16000
+    out = audio_io.load_bytes(dummy_audio_bytes)
+    assert isinstance(out[0], np.ndarray)
+    assert out[1] == 16000
 
 
 def test_audio_media_io_load_base64(dummy_audio_bytes):
     audio_io = AudioMediaIO()
     encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
-    with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
-        mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
-        out = audio_io.load_base64("audio/wav", encoded)
-        mock_load_bytes.assert_called_once()
-        assert isinstance(out[0], np.ndarray)
-        assert out[1] == 16000
+    out = audio_io.load_base64("audio/wav", encoded)
+    assert isinstance(out[0], np.ndarray)
+    assert out[1] == 16000
 
 
-def test_audio_media_io_load_file():
+def test_audio_media_io_load_file(audio_assets: AudioTestAssets):
     audio_io = AudioMediaIO()
-    path = Path("/fake/path.wav")
-    with patch("librosa.load") as mock_load:
-        mock_load.return_value = (np.array([0.1, 0.2]), 16000)
-        out = audio_io.load_file(path)
-        mock_load.assert_called_once_with(path, sr=None)
-        assert isinstance(out[0], np.ndarray)
-        assert out[1] == 16000
+    path = audio_assets[0].get_local_path()
+    out = audio_io.load_file(path)
+    assert isinstance(out[0], np.ndarray)
+    assert out[1] == 16000
 
 
 def test_audio_media_io_encode_base64(dummy_audio):
diff --git a/tests/multimodal/test_audio.py b/tests/multimodal/test_audio.py
index 3cc6bcadbec4..0bc8988452f0 100644
--- a/tests/multimodal/test_audio.py
+++ b/tests/multimodal/test_audio.py
@@ -14,7 +14,7 @@
     AudioSpec,
     ChannelReduction,
     normalize_audio,
-    resample_audio_librosa,
+    resample_audio_pyav,
     resample_audio_scipy,
     split_audio,
 )
@@ -25,14 +25,14 @@ def dummy_audio():
     return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)
 
 
-def test_resample_audio_librosa(dummy_audio):
-    with patch("vllm.multimodal.audio.librosa.resample") as mock_resample:
-        mock_resample.return_value = dummy_audio * 2
-        out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050)
-        mock_resample.assert_called_once_with(
-            dummy_audio, orig_sr=44100, target_sr=22050
-        )
-        assert np.all(out == dummy_audio * 2)
+def test_resample_audio_pyav(dummy_audio):
+    out_down = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2)
+    out_up = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4)
+    out_same = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4)
+
+    assert len(out_down) == 3
+    assert len(out_up) == 10
+    assert np.all(out_same == dummy_audio)
 
 
 def test_resample_audio_scipy(dummy_audio):
@@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio):
     assert np.isfinite(out).all()
 
 
-def test_audio_resampler_librosa_calls_resample(dummy_audio):
-    resampler = AudioResampler(target_sr=22050, method="librosa")
-    with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample:
+def test_audio_resampler_pyav_calls_resample(dummy_audio):
+    resampler = AudioResampler(target_sr=22050, method="pyav")
+    with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample:
         mock_resample.return_value = dummy_audio
         out = resampler.resample(dummy_audio, orig_sr=44100)
         mock_resample.assert_called_once_with(
@@ -423,13 +423,13 @@ def test_soundfile_format_normalized_to_mono_e2e(self):
         # Verify channel averaging: mean of [0.5, -0.5] = 0.0
         np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5)
 
-    def test_librosa_mono_passthrough_e2e(self):
-        """Full pipeline: librosa mono format → preserved as mono."""
+    def test_pyav_mono_passthrough_e2e(self):
+        """Full pipeline: pyav mono format → preserved as mono."""
         from vllm.multimodal.parse import MultiModalDataParser
 
-        # Simulate librosa output: already mono (time,) format
-        mono_librosa = np.random.randn(16000).astype(np.float32)
-        assert mono_librosa.shape == (16000,)
+        # Simulate pyav output: already mono (time,) format
+        mono_pyav = np.random.randn(16000).astype(np.float32)
+        assert mono_pyav.shape == (16000,)
 
         # Create parser with mono normalization
         parser = MultiModalDataParser(
@@ -438,7 +438,7 @@ def test_librosa_mono_passthrough_e2e(self):
         )
 
         # Process audio through the parser
-        result = parser._parse_audio_data((mono_librosa, 16000))
+        result = parser._parse_audio_data((mono_pyav, 16000))
         audio_output = result.get(0)
 
         # Verify output is still mono 1D
@@ -446,7 +446,7 @@ def test_librosa_mono_passthrough_e2e(self):
         assert audio_output.shape == (16000,)
 
         # Verify audio content is preserved
-        np.testing.assert_array_almost_equal(audio_output, mono_librosa)
+        np.testing.assert_array_almost_equal(audio_output, mono_pyav)
 
     def test_multichannel_5_1_surround_to_mono_e2e(self):
         """Full pipeline: 5.1 surround (6 channels) → mono output."""
diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py
deleted file mode 100644
index 6858062b9183..000000000000
--- a/tests/quantization/test_ptpc_fp8.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Tests whether PTPC w8a8 FP8 computation is enabled correctly.
-
-Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
-"""
-
-import pytest
-
-from tests.quantization.utils import is_quant_method_supported
-from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod
-from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod
-from vllm.platforms import current_platform
-
-
-@pytest.fixture(scope="function", autouse=True)
-def enable_pickle(monkeypatch):
-    """`LLM.apply_model` requires pickling a function."""
-    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
-
-
-@pytest.mark.skipif(
-    not is_quant_method_supported("ptpc_fp8"),
-    reason="PTPC FP8 is not supported on this GPU type.",
-)
-@pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.")
-@pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
-def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
-    llm = vllm_runner(
-        "facebook/opt-125m",
-        dtype=dtype,
-        quantization="ptpc_fp8",
-        enforce_eager=True,
-        kv_cache_dtype=kv_cache_dtype,
-        allow_deprecated_quantization=True,
-    )
-
-    with llm:
-
-        def check_model(model):
-            fc1 = model.model.decoder.layers[0].fc1
-            assert isinstance(fc1.quant_method, PTPCFp8LinearMethod)
-            if kv_cache_dtype == "ptpc_fp8":
-                attn = model.model.decoder.layers[0].self_attn.attn
-                assert isinstance(attn.quant_method, Fp8KVCacheMethod)
-                assert attn._k_scale == 1.0
-                assert attn._v_scale == 1.0
-
-            # For GPUs with hardware support, we keep weights in fp8
-            if current_platform.has_device_capability(94):
-                assert fc1.weight.dtype == current_platform.fp8_dtype()
-
-        llm.apply_model(check_model)
-
-        output = llm.generate_greedy("Hello my name is", max_tokens=4)
-        assert output
diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py
index 54a577d2bf84..6cf2a82d2ff1 100644
--- a/tests/test_pooling_params.py
+++ b/tests/test_pooling_params.py
@@ -74,7 +74,7 @@ def test_embed_dimensions(model_info: EmbedModelInfo):
         pooling_params.verify(model_config)
 
 
-@pytest.mark.parametrize("task", ["score", "classify"])
+@pytest.mark.parametrize("task", ["classify"])
 def test_classify(task):
     model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS"))
 
diff --git a/tests/test_regression.py b/tests/test_regression.py
index 978e0783919d..a38b4428dea5 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -12,6 +12,7 @@
 import pytest
 import torch
 
+from tests.utils import large_gpu_mark
 from vllm import LLM, SamplingParams
 from vllm.platforms import current_platform
 
@@ -32,10 +33,21 @@ def test_duplicated_ignored_sequence_group():
     assert len(prompts) == len(outputs)
 
 
-def test_max_tokens_none():
+@pytest.mark.parametrize(
+    "model",
+    [
+        pytest.param(
+            "distilbert/distilgpt2",
+            marks=[
+                *([large_gpu_mark(min_gb=80)] if current_platform.is_rocm() else []),
+            ],
+        ),
+    ],
+)
+def test_max_tokens_none(model):
     sampling_params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None)
     llm = LLM(
-        model="distilbert/distilgpt2",
+        model=model,
         max_num_batched_tokens=4096,
         tensor_parallel_size=1,
     )
diff --git a/tests/v1/attention/test_batch_reordering.py b/tests/v1/attention/test_batch_reordering.py
index 6265e12f9a7d..f59740238da7 100644
--- a/tests/v1/attention/test_batch_reordering.py
+++ b/tests/v1/attention/test_batch_reordering.py
@@ -10,9 +10,10 @@
 
 
 class MockInputBatch:
-    def __init__(self, req_ids, num_computed_tokens_cpu):
+    def __init__(self, req_ids, num_computed_tokens_cpu, num_prompt_tokens):
         self.req_ids = req_ids
         self.num_computed_tokens_cpu = num_computed_tokens_cpu
+        self.num_prompt_tokens = num_prompt_tokens
 
     def swap_states(self, i, j):
         self.req_ids[i], self.req_ids[j] = self.req_ids[j], self.req_ids[i]
@@ -20,6 +21,10 @@ def swap_states(self, i, j):
             self.num_computed_tokens_cpu[j],
             self.num_computed_tokens_cpu[i],
         )
+        self.num_prompt_tokens[i], self.num_prompt_tokens[j] = (
+            self.num_prompt_tokens[j],
+            self.num_prompt_tokens[i],
+        )
 
 
 class MockSchedulerOutput:
@@ -29,96 +34,139 @@ def __init__(self, num_scheduled_tokens):
 
 @dataclass
 class ReorderTestCase:
-    requests: list[tuple[int, int]]  # (num_scheduled_tokens, num_computed_tokens)
+    # (num_scheduled_tokens, num_computed_tokens, num_prompt_tokens)
+    requests: list[tuple[int, int, int]]
     expected_order: list[int]
     expected_modified: bool
     decode_threshold: int = 1
 
 
 # Test cases for batch reordering
+# Format: (num_scheduled, num_computed, num_prompt)
 REORDER_TEST_CASES = {
     "all_decodes": ReorderTestCase(
-        requests=[(1, 10), (1, 20), (1, 30)],
+        requests=[(1, 10, 10), (1, 20, 20), (1, 30, 30)],
         expected_order=[0, 1, 2],
         expected_modified=False,
     ),
-    "all_prefills": ReorderTestCase(
-        requests=[(100, 100), (200, 200), (300, 300)],
+    "all_long_extends": ReorderTestCase(
+        requests=[(100, 100, 100), (200, 200, 200), (300, 300, 300)],
         expected_order=[0, 1, 2],
         expected_modified=False,
     ),
-    "mixed_interleaved": ReorderTestCase(
-        requests=[(100, 100), (1, 10), (200, 200), (1, 20)],
-        expected_order=[3, 1, 2, 0],  # Only swap 0↔3, keep 1 and 2 in place
+    "mixed_decodes_long_extends": ReorderTestCase(
+        requests=[(100, 100, 100), (1, 10, 10), (200, 200, 200), (1, 20, 20)],
+        expected_order=[3, 1, 2, 0],
         expected_modified=True,
     ),
     "already_ordered": ReorderTestCase(
-        requests=[(1, 10), (1, 20), (100, 100), (200, 0)],
+        requests=[(1, 10, 10), (1, 20, 20), (100, 100, 100), (200, 0, 200)],
         expected_order=[0, 1, 2, 3],
         expected_modified=False,
     ),
     "single_request": ReorderTestCase(
-        requests=[(1, 10)],
+        requests=[(1, 10, 10)],
         expected_order=[0],
         expected_modified=False,
     ),
     "higher_threshold": ReorderTestCase(
-        requests=[(2, 10), (3, 20), (5, 30), (6, 40)],
+        requests=[(2, 10, 10), (3, 20, 20), (5, 30, 30), (6, 40, 40)],
         expected_order=[0, 1, 2, 3],
         expected_modified=False,
         decode_threshold=4,
     ),
     "decodes_at_end": ReorderTestCase(
-        requests=[(100, 100), (200, 200), (1, 10), (1, 20)],
+        requests=[(100, 100, 100), (200, 200, 200), (1, 10, 10), (1, 20, 20)],
         expected_order=[2, 3, 0, 1],
         expected_modified=True,
     ),
-    "decode_extend_prefill": ReorderTestCase(
-        requests=[(100, 0), (10, 50), (1, 10)],
+    "decode_long_extend_prefill": ReorderTestCase(
+        requests=[(100, 0, 100), (10, 50, 50), (1, 10, 10)],
         expected_order=[2, 1, 0],
         expected_modified=True,
     ),
-    "extend_prefill_only": ReorderTestCase(
-        requests=[(100, 0), (10, 50), (200, 0), (20, 75)],
-        expected_order=[3, 1, 2, 0],  # Only swap 0↔3, keep 1 and 2 in place
+    "long_extend_prefill_only": ReorderTestCase(
+        requests=[(100, 0, 100), (10, 50, 50), (200, 0, 200), (20, 75, 75)],
+        expected_order=[3, 1, 2, 0],
         expected_modified=True,
     ),
-    "complicated_mixed_interleaved": ReorderTestCase(
+    "complicated_mixed": ReorderTestCase(
         requests=[
-            (1, 20),
-            (1, 50),
-            (374, 0),
-            (300, 20),
-            (1, 20),
-            (256, 0),
-            (1, 5),
-            (27, 0),
-            (1, 4),
+            (1, 20, 20),  # decode
+            (1, 50, 50),  # decode
+            (374, 0, 374),  # prefill
+            (300, 20, 20),  # long_extend
+            (1, 20, 20),  # decode
+            (256, 0, 256),  # prefill
+            (1, 5, 5),  # decode
+            (27, 0, 27),  # prefill
+            (1, 4, 4),  # decode
         ],
         expected_order=[0, 1, 6, 8, 4, 3, 2, 7, 5],
         expected_modified=True,
     ),
     "new_request_single_token_prefill": ReorderTestCase(
         requests=[
-            (100, 0),
-            (1, 0),  # New request with only 1 token (STILL prefill)
-            (50, 100),
-            (1, 10),
+            (100, 0, 100),  # prefill
+            (1, 0, 1),  # prefill (single token, still prefill)
+            (50, 100, 100),  # long_extend
+            (1, 10, 10),  # decode
         ],
-        # Only index 3 is a true decode (has num_computed_tokens > 0)
         expected_order=[3, 2, 0, 1],
         expected_modified=True,
     ),
     "multiple_new_requests_single_token_prefill": ReorderTestCase(
         requests=[
-            (1, 0),  # New prefill (1 token, no computed)
-            (1, 0),  # New prefill (1 token, no computed)
-            (1, 50),
-            (200, 0),
+            (1, 0, 1),  # prefill
+            (1, 0, 1),  # prefill
+            (1, 50, 50),  # decode
+            (200, 0, 200),  # prefill
         ],
         expected_order=[2, 1, 0, 3],
         expected_modified=True,
     ),
+    "four_way_already_ordered": ReorderTestCase(
+        requests=[
+            (1, 100, 100),  # decode
+            (1, 50, 100),  # short_extend
+            (10, 50, 100),  # long_extend
+            (100, 0, 100),  # prefill
+        ],
+        expected_order=[0, 1, 2, 3],
+        expected_modified=False,
+    ),
+    "four_way_needs_reorder": ReorderTestCase(
+        requests=[
+            (100, 0, 100),  # prefill
+            (1, 50, 100),  # short_extend
+            (1, 100, 100),  # decode
+            (10, 50, 100),  # long_extend
+        ],
+        expected_order=[2, 1, 3, 0],
+        expected_modified=True,
+    ),
+    "four_way_multiple_short_extends": ReorderTestCase(
+        requests=[
+            (2, 100, 100),  # decode
+            (2, 50, 200),  # short_extend
+            (2, 75, 150),  # short_extend
+            (2, 200, 200),  # decode
+        ],
+        expected_order=[0, 3, 2, 1],
+        expected_modified=True,
+        decode_threshold=2,
+    ),
+    "four_way_spec_decode_threshold": ReorderTestCase(
+        requests=[
+            (5, 100, 100),  # decode
+            (5, 50, 100),  # short_extend
+            (5, 0, 100),  # prefill
+            (10, 50, 100),  # long_extend
+        ],
+        expected_order=[0, 1, 3, 2],
+        expected_modified=True,
+        decode_threshold=5,
+    ),
 }
 
 
@@ -129,8 +177,9 @@ def test_reorder_batch_to_split_decodes_and_prefills(test_case: ReorderTestCase)
     req_ids = [f"r{i}" for i in range(len(test_case.requests))]
     num_computed_tokens = np.array([r[1] for r in test_case.requests], dtype=np.int32)
     num_scheduled_tokens = {f"r{i}": r[0] for i, r in enumerate(test_case.requests)}
+    num_prompt_tokens = np.array([r[2] for r in test_case.requests], dtype=np.int32)
 
-    input_batch = MockInputBatch(req_ids, num_computed_tokens)
+    input_batch = MockInputBatch(req_ids, num_computed_tokens, num_prompt_tokens)
     scheduler_output = MockSchedulerOutput(num_scheduled_tokens)
 
     modified = reorder_batch_to_split_decodes_and_prefills(
diff --git a/tests/v1/e2e/test_hybrid_chunked_prefill.py b/tests/v1/e2e/test_hybrid_chunked_prefill.py
index 030081a38af3..1790343ca836 100644
--- a/tests/v1/e2e/test_hybrid_chunked_prefill.py
+++ b/tests/v1/e2e/test_hybrid_chunked_prefill.py
@@ -43,7 +43,7 @@
         pytest.param("Qwen/Qwen3.5-4B", marks=[large_gpu_mark(min_gb=40)]),
         pytest.param(
             "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8",
-            marks=[large_gpu_mark(min_gb=80)] + multi_gpu_marks(num_gpus=2),
+            marks=[large_gpu_mark(min_gb=80)] + multi_gpu_marks(num_gpus=4),
         ),
     ],
 )
@@ -68,7 +68,7 @@ def test_mtp_speculative_mixed_batch_short_prefill(
         max_num_batched_tokens=chunk_size,
         max_model_len=512,
         enforce_eager=True,
-        tensor_parallel_size=2,
+        tensor_parallel_size=4,
         trust_remote_code=True,
         enable_chunked_prefill=True,
         enable_prefix_caching=enable_prefix_caching,
diff --git a/tests/v1/entrypoints/__init__.py b/tests/v1/entrypoints/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py
deleted file mode 100644
index bc9674ee86cf..000000000000
--- a/tests/v1/entrypoints/conftest.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-
-
-@pytest.fixture
-def sample_prompts():
-    return [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-
-
-@pytest.fixture
-def sample_token_ids():
-    return [
-        [0],
-        [0, 1],
-        [0, 2, 1],
-        [0, 3, 1, 2],
-    ]
-
-
-@pytest.fixture
-def sample_regex():
-    return (
-        r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
-        r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)"
-    )
-
-
-# Note: Ensure this only uses attributes compatible with xgrammar
-@pytest.fixture
-def sample_json_schema():
-    return {
-        "type": "object",
-        "properties": {
-            "name": {"type": "string"},
-            "age": {"type": "integer"},
-            "skills": {
-                "type": "array",
-                "items": {
-                    "type": "string",
-                },
-            },
-            "grade": {
-                "type": "string",
-                "pattern": "^[A-D]$",  # Regex pattern
-            },
-            "email": {
-                "type": "string",
-                "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$",
-            },
-            "work_history": {
-                "type": "array",
-                "items": {
-                    "type": "object",
-                    "properties": {
-                        "company": {"type": "string"},
-                        "duration": {
-                            "type": "number",
-                            "minimum": 0.0,
-                            "maximum": 100.0,  # Numeric range
-                        },
-                        "position": {"type": "string"},
-                    },
-                    "required": ["company", "duration", "position"],
-                    "additionalProperties": False,
-                },
-                "minItems": 0,
-                "maxItems": 3,
-            },
-        },
-        "required": ["name", "age", "skills", "grade", "email", "work_history"],
-        "additionalProperties": False,
-        "minProperties": 1,
-        "maxProperties": 10,
-    }
-
-
-# A schema unsupported by xgrammar
-@pytest.fixture
-def unsupported_json_schema():
-    return {
-        "type": "object",
-        "properties": {
-            "score": {
-                "type": "integer",
-                "multipleOf": 5,  # Numeric multiple
-            },
-            "tags": {
-                "type": "array",
-                "items": {"type": "string", "minLength": 10, "maxLength": 20},
-            },
-        },
-        "required": ["score", "tags"],
-        "additionalProperties": False,
-        "patternProperties": {
-            "^score$": {"type": "integer"},
-        },
-    }
-
-
-@pytest.fixture
-def sample_definition_json_schema():
-    return {
-        "$defs": {
-            "Step": {
-                "properties": {
-                    "explanation": {"title": "Explanation", "type": "string"},
-                    "output": {"title": "Output", "type": "string"},
-                },
-                "required": ["explanation", "output"],
-                "title": "Step",
-                "type": "object",
-            }
-        },
-        "properties": {
-            "steps": {
-                "items": {"$ref": "#/$defs/Step"},
-                "title": "Steps",
-                "type": "array",
-            },
-            "final_answer": {"title": "Final Answer", "type": "string"},
-        },
-        "required": ["steps", "final_answer"],
-        "title": "MathReasoning",
-        "type": "object",
-        "additionalProperties": False,
-    }
-
-
-@pytest.fixture
-def sample_structured_outputs_choices():
-    return [
-        "Python",
-        "Java",
-        "JavaScript",
-        "C++",
-        "C#",
-        "PHP",
-        "TypeScript",
-        "Ruby",
-        "Swift",
-        "Kotlin",
-    ]
-
-
-@pytest.fixture
-def sample_sql_ebnf():
-    return """
-root ::= select_statement
-select_statement ::= "SELECT" column "from" table "where" condition
-column ::= "col_1" | "col_2"
-table ::= "table_1" | "table_2"
-condition ::= column "=" number
-number ::= "1" | "2"
-"""
-
-
-@pytest.fixture
-def sample_sql_lark():
-    return """
-start: select_statement
-select_statement: "SELECT" column "from" table "where" condition
-column: "col_1" | "col_2"
-table: "table_1" | "table_2"
-condition: column "=" number
-number: "1" | "2"
-"""
diff --git a/tests/v1/entrypoints/llm/__init__.py b/tests/v1/entrypoints/llm/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/v1/streaming_input/test_gpu_model_runner_v2_streaming.py b/tests/v1/streaming_input/test_gpu_model_runner_v2_streaming.py
new file mode 100644
index 000000000000..8fde0f117ca2
--- /dev/null
+++ b/tests/v1/streaming_input/test_gpu_model_runner_v2_streaming.py
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Unit tests for MRv2 GPUModelRunner.add_requests streaming input support."""
+
+from unittest.mock import Mock
+
+import pytest
+import torch
+
+from vllm.v1.core.sched.output import (
+    CachedRequestData,
+    NewRequestData,
+    SchedulerOutput,
+)
+from vllm.v1.worker.gpu.model_runner import GPUModelRunner
+from vllm.v1.worker.gpu.states import RequestState
+
+pytestmark = pytest.mark.cpu_test
+
+
+@pytest.fixture
+def mock_model_runner_with_req_states():
+    """Create a mock MRv2 GPUModelRunner with a real RequestState."""
+
+    runner = Mock(spec=GPUModelRunner)
+    runner.req_states = RequestState(
+        max_num_reqs=10,
+        max_model_len=1024,
+        max_num_batched_tokens=1024,
+        num_speculative_steps=0,
+        vocab_size=32000,
+        device=torch.device("cpu"),
+        model_dtype=torch.float32,
+        cache_draft_logits=False,
+    )
+    runner.encoder_cache = None
+    runner.model_state = Mock()
+    runner.block_tables = Mock()
+    runner.lora_state = Mock()
+    runner.sampler = None
+    runner.prompt_logprobs_worker = None
+    runner.is_last_pp_rank = False
+
+    # Mock staged writes — they use Triton kernels that require GPU
+    runner.req_states.apply_staged_writes = Mock()
+
+    # Bind the real methods to our mock
+    runner._remove_request = GPUModelRunner._remove_request.__get__(runner)
+    runner.add_requests = GPUModelRunner.add_requests.__get__(runner)
+    return runner
+
+
+def _make_scheduler_output(new_reqs):
+    return SchedulerOutput(
+        scheduled_new_reqs=new_reqs,
+        scheduled_cached_reqs=CachedRequestData.make_empty(),
+        num_scheduled_tokens={},
+        total_num_scheduled_tokens=0,
+        scheduled_spec_decode_tokens={},
+        scheduled_encoder_inputs={},
+        num_common_prefix_blocks=[],
+        finished_req_ids=set(),
+        free_encoder_mm_hashes=[],
+    )
+
+
+def test_e2e_streaming_request_update_basic_flow(
+    mock_model_runner_with_req_states,
+):
+    """Test that streaming sessions are updated correctly.
+
+    This test validates that when a streaming session is updated with new
+    prompt tokens:
+    1. The old request state is removed (no free_indices leak)
+    2. The new state is written with updated prefill_token_ids
+    3. model_state and block_tables are re-registered for the new state
+    """
+    runner = mock_model_runner_with_req_states
+    req_states = runner.req_states
+    req_id = "streaming_req_0"
+    initial_free = len(req_states.free_indices)
+
+    # Step 1: Add initial request with 3 prompt tokens, all computed
+    initial_req_data = NewRequestData(
+        req_id=req_id,
+        prompt_token_ids=[1, 2, 3],
+        prefill_token_ids=[1, 2, 3],
+        mm_features=[],
+        sampling_params=None,
+        pooling_params=None,
+        block_ids=([0],),
+        num_computed_tokens=3,
+        lora_request=None,
+    )
+    runner.add_requests(_make_scheduler_output([initial_req_data]))
+    assert req_id in req_states.req_id_to_index
+    assert len(req_states.free_indices) == initial_free - 1
+
+    # Step 2: Create streaming update with extended prompt
+    # The scheduler has already set prefill_token_ids to the full sequence
+    # (original prompt + intermediate output + new prompt tokens)
+    updated_req_data = NewRequestData(
+        req_id=req_id,
+        prompt_token_ids=[1, 2, 3],
+        prefill_token_ids=[1, 2, 3, 10, 4, 5],
+        mm_features=[],
+        sampling_params=None,
+        pooling_params=None,
+        block_ids=([0, 1],),
+        num_computed_tokens=4,  # 3 original prompt + 1 intermediate output
+        lora_request=None,
+    )
+    runner.add_requests(_make_scheduler_output([updated_req_data]))
+
+    # Step 3: Verify no free_indices leak (old slot recycled)
+    assert len(req_states.free_indices) == initial_free - 1
+
+    # Verify the request is still tracked with exactly one index
+    assert req_id in req_states.req_id_to_index
+    assert sum(1 for v in req_states.index_to_req_id.values() if v == req_id) == 1
+
+    # Verify state was updated with new values
+    new_idx = req_states.req_id_to_index[req_id]
+    assert req_states.prompt_len.np[new_idx] == 3
+    assert req_states.prefill_len.np[new_idx] == 6
+    assert req_states.num_computed_prefill_tokens[new_idx] == 4
+
+    # Verify model_state and block_tables were re-registered
+    runner.model_state.add_request.assert_called_with(new_idx, updated_req_data)
+    runner.block_tables.append_block_ids.assert_called_with(
+        new_idx, ([0, 1],), overwrite=True
+    )
+
+
+def test_e2e_streaming_with_multimodal_features(
+    mock_model_runner_with_req_states,
+):
+    """Test that streaming sessions with multimodal features are updated.
+
+    This test validates that when a streaming session with mm features
+    is updated:
+    1. The old request state is removed (no free_indices leak)
+    2. encoder_cache is cleaned up and re-registered with new mm_features
+    3. model_state is re-registered (recomputes M-RoPE positions etc.)
+    """
+    runner = mock_model_runner_with_req_states
+    req_states = runner.req_states
+    req_id = "streaming_mm_req_0"
+    initial_free = len(req_states.free_indices)
+
+    # Enable encoder_cache for multimodal
+    runner.encoder_cache = Mock()
+
+    # Step 1: Add initial request with one audio feature
+    mm_feature_1 = Mock()
+    initial_req_data = NewRequestData(
+        req_id=req_id,
+        prompt_token_ids=[1, 2] + [0] * 10 + [3, 4],
+        prefill_token_ids=[1, 2] + [0] * 10 + [3, 4],
+        mm_features=[mm_feature_1],
+        sampling_params=None,
+        pooling_params=None,
+        block_ids=([0],),
+        num_computed_tokens=14,
+        lora_request=None,
+    )
+    runner.add_requests(_make_scheduler_output([initial_req_data]))
+    assert req_id in req_states.req_id_to_index
+
+    # Reset mocks to track only the streaming update calls
+    runner.encoder_cache.reset_mock()
+    runner.model_state.reset_mock()
+
+    # Step 2: Create streaming update with additional multimodal feature
+    # The scheduler has folded the intermediate output (100) into
+    # prefill_token_ids and added a new audio chunk
+    mm_feature_2 = Mock()
+    updated_req_data = NewRequestData(
+        req_id=req_id,
+        prompt_token_ids=[1, 2] + [0] * 10 + [3, 4],
+        prefill_token_ids=[1, 2] + [0] * 10 + [3, 4, 100] + [0] * 5 + [5],
+        mm_features=[mm_feature_1, mm_feature_2],
+        sampling_params=None,
+        pooling_params=None,
+        block_ids=([0, 1],),
+        num_computed_tokens=14,
+        lora_request=None,
+    )
+    runner.add_requests(_make_scheduler_output([updated_req_data]))
+
+    # Step 3: Verify no free_indices leak
+    assert len(req_states.free_indices) == initial_free - 1
+    assert sum(1 for v in req_states.index_to_req_id.values() if v == req_id) == 1
+
+    # Verify encoder_cache was cleaned up and re-registered
+    runner.encoder_cache.remove_request.assert_called_once_with(req_id)
+    runner.encoder_cache.add_request.assert_called_once_with(
+        req_id, [mm_feature_1, mm_feature_2]
+    )
+
+    # Verify model_state was re-registered with new data
+    new_idx = req_states.req_id_to_index[req_id]
+    runner.model_state.add_request.assert_called_once_with(new_idx, updated_req_data)
+
+    # Verify updated prefill length
+    assert req_states.prefill_len.np[new_idx] == 21
diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py
index a5dc1773d477..4ed8724e60fb 100644
--- a/tests/v1/test_serial_utils.py
+++ b/tests/v1/test_serial_utils.py
@@ -278,3 +278,148 @@ def test_custom_class_serialization_disallowed_without_pickle():
     with pytest.raises(TypeError):
         # Attempt to encode the custom class
         encoder.encode(obj)
+
+
+@dataclass
+class RequestWithTensor:
+    """Mock request with non-multimodal tensor field like EngineCoreRequest."""
+
+    prompt_embeds: torch.Tensor | None
+    data: str
+
+
+def test_non_multimodal_tensor_with_ipc():
+    """Test that non-multimodal tensor fields work correctly with IPC enabled.
+
+    This reproduces the bug where fields like prompt_embeds: torch.Tensor | None
+    would fail to decode when IPC is enabled because _decode_tensor expected a
+    raw tensor tuple but received a msgpack-decoded TensorIpcHandle list.
+    """
+    import torch.multiprocessing as torch_mp
+
+    from vllm.v1.engine.tensor_ipc import TensorIpcReceiver, TensorIpcSender
+
+    # Create tensor queues for IPC
+    tensor_queues = [torch_mp.Queue()]
+
+    # Create encoder with IPC sender
+    sender = TensorIpcSender(tensor_queues[0])
+    encoder = MsgpackEncoder(oob_tensor_consumer=sender)
+
+    # Create decoder with IPC receiver
+    receiver = TensorIpcReceiver(tensor_queues[0])
+    decoder = MsgpackDecoder(RequestWithTensor, oob_tensor_provider=receiver)
+
+    # Create a request with a non-multimodal tensor
+    original_tensor = torch.randn(5, 10, dtype=torch.float32)
+    request = RequestWithTensor(prompt_embeds=original_tensor, data="test_data")
+
+    # Encode the request - this should send the tensor via IPC
+    encoded = encoder.encode(request)
+
+    # Verify encoding succeeded
+    assert len(encoded) > 0
+
+    # Decode the request - this should retrieve the tensor from IPC queue
+    # Previously this would fail because the decoder tried to unpack the
+    # handle list as raw tensor bytes metadata.
+    decoded = decoder.decode(encoded)
+
+    # Verify the decoded request matches the original
+    assert isinstance(decoded, RequestWithTensor)
+    assert decoded.data == "test_data"
+    assert decoded.prompt_embeds is not None
+    assert torch.allclose(decoded.prompt_embeds, original_tensor), (
+        "Decoded tensor does not match the original tensor."
+    )
+
+
+def test_non_multimodal_tensor_with_ipc_none_value():
+    """Test that None values for tensor fields work correctly with IPC enabled."""
+    import torch.multiprocessing as torch_mp
+
+    from vllm.v1.engine.tensor_ipc import TensorIpcReceiver, TensorIpcSender
+
+    # Create tensor queues for IPC
+    tensor_queues = [torch_mp.Queue()]
+
+    # Create encoder with IPC sender
+    sender = TensorIpcSender(tensor_queues[0])
+    encoder = MsgpackEncoder(oob_tensor_consumer=sender)
+
+    # Create decoder with IPC receiver
+    receiver = TensorIpcReceiver(tensor_queues[0])
+    decoder = MsgpackDecoder(RequestWithTensor, oob_tensor_provider=receiver)
+
+    # Create a request with None for the tensor field
+    request = RequestWithTensor(prompt_embeds=None, data="test_data_with_none")
+
+    # Encode and decode the request
+    encoded = encoder.encode(request)
+    decoded = decoder.decode(encoded)
+
+    # Verify the decoded request matches the original
+    assert isinstance(decoded, RequestWithTensor)
+    assert decoded.data == "test_data_with_none"
+    assert decoded.prompt_embeds is None
+
+
+def test_multiple_senders_single_receiver_ipc():
+    """Test N senders sharing a queue with a single receiver via msgpack.
+
+    Simulates the real vLLM topology where multiple API server frontends
+    each have their own MsgpackEncoder + TensorIpcSender, all putting
+    tensors onto the same torch.mp queue, and a single engine core
+    decodes them with one MsgpackDecoder + TensorIpcReceiver.
+    """
+    import torch.multiprocessing as torch_mp
+
+    from vllm.v1.engine.tensor_ipc import TensorIpcReceiver, TensorIpcSender
+
+    num_senders = 3
+    num_messages_per_sender = 2
+    tensor_queue = torch_mp.Queue()
+
+    # Create N independent senders (each gets its own uuid-based sender_id)
+    senders = []
+    encoders = []
+    for _ in range(num_senders):
+        s = TensorIpcSender(tensor_queue)
+        senders.append(s)
+        encoders.append(MsgpackEncoder(oob_tensor_consumer=s))
+
+    # Single receiver
+    receiver = TensorIpcReceiver(tensor_queue)
+    decoder = MsgpackDecoder(RequestWithTensor, oob_tensor_provider=receiver)
+
+    # Encode messages from all senders, interleaving the order
+    # so that tensors from different senders land on the queue interleaved.
+    encoded_payloads: list[tuple[int, int, torch.Tensor, list]] = []
+    for msg_idx in range(num_messages_per_sender):
+        for sender_idx in range(num_senders):
+            tensor = torch.full(
+                (sender_idx + 1, msg_idx + 2),
+                float(sender_idx * 100 + msg_idx),
+                dtype=torch.float32,
+            )
+            req = RequestWithTensor(
+                prompt_embeds=tensor,
+                data=f"s{sender_idx}_m{msg_idx}",
+            )
+            encoded = encoders[sender_idx].encode(req)
+            encoded_payloads.append((sender_idx, msg_idx, tensor, encoded))
+
+    # Decode all messages — the receiver must correctly match each
+    # tensor handle to the right TensorIpcData from the shared queue.
+    for sender_idx, msg_idx, original_tensor, encoded in encoded_payloads:
+        decoded = decoder.decode(encoded)
+        assert isinstance(decoded, RequestWithTensor)
+        assert decoded.data == f"s{sender_idx}_m{msg_idx}"
+        assert decoded.prompt_embeds is not None
+        assert decoded.prompt_embeds.shape == original_tensor.shape, (
+            f"Shape mismatch for sender {sender_idx} msg {msg_idx}: "
+            f"{decoded.prompt_embeds.shape} != {original_tensor.shape}"
+        )
+        assert torch.allclose(decoded.prompt_embeds, original_tensor), (
+            f"Value mismatch for sender {sender_idx} msg {msg_idx}"
+        )
diff --git a/tests/v1/test_tensor_ipc_queue.py b/tests/v1/test_tensor_ipc_queue.py
new file mode 100644
index 000000000000..a3fcb97ca171
--- /dev/null
+++ b/tests/v1/test_tensor_ipc_queue.py
@@ -0,0 +1,943 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Tests for tensor IPC queue functionality."""
+
+import contextlib
+import multiprocessing as mp
+from dataclasses import dataclass
+from multiprocessing.synchronize import Barrier as BarrierType
+from multiprocessing.synchronize import Event as EventType
+from typing import Any
+
+import pytest
+import torch
+import torch.multiprocessing as torch_mp
+
+from vllm.v1.engine.tensor_ipc import (
+    TensorIpcData,
+    TensorIpcReceiver,
+    TensorIpcSender,
+)
+from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
+
+
+@pytest.fixture(scope="module", autouse=True)
+def setup_multiprocessing():
+    """Force the 'spawn' start method (module-scoped, autouse) for compatibility."""
+    with contextlib.suppress(RuntimeError):
+        # A RuntimeError here is read as "start method already set" and ignored
+        torch_mp.set_start_method("spawn", force=True)
+    yield
+
+
+# Use a typed container so the test covers the real vLLM path where tensor IPC
+# handles are encoded and decoded as fields nested inside larger msgpack payloads.
+@dataclass
+class TensorEnvelope:
+    tensor: torch.Tensor
+    label: str
+
+
+def encoder_process(
+    tensor_queue: torch_mp.Queue,
+    payload_queue: mp.Queue,
+    result_queue: mp.Queue,
+    tensor_data: dict[str, Any],
+    ready_event: EventType,
+    retrieval_done: EventType,
+):
+    """Process that msgpack-encodes and sends tensors via IPC."""
+    try:
+        sender = TensorIpcSender(tensor_queue)
+        encoder = MsgpackEncoder(oob_tensor_consumer=sender)
+
+        if torch.cuda.is_available():
+            device = "cuda:0"
+            tensor = torch.randn(
+                *tensor_data["shape"], dtype=tensor_data["dtype"], device=device
+            )
+        else:
+            # Fall back to CPU for testing
+            device = "cpu"
+            tensor = torch.randn(*tensor_data["shape"], dtype=tensor_data["dtype"])
+
+        message = TensorEnvelope(tensor=tensor, label="cuda-msgpack")
+        encoded = encoder.encode(message)
+        payload_queue.put(encoded, timeout=10.0)
+
+        ready_event.set()
+
+        result_queue.put(
+            {
+                "success": True,
+                "encoded_length": len(encoded),
+                "device": str(device),
+                "tensor_shape": tuple(tensor.shape),
+            }
+        )
+        retrieval_done.wait(timeout=30.0)
+    except Exception as e:
+        import traceback
+
+        ready_event.set()
+        retrieval_done.set()
+        result_queue.put(
+            {"success": False, "error": str(e), "traceback": traceback.format_exc()}
+        )
+
+
+def decoder_process(
+    tensor_queue: torch_mp.Queue,
+    payload_queue: mp.Queue,
+    result_queue: mp.Queue,
+    expected_shape: tuple,
+    encoder_ready: EventType,
+    retrieval_done: EventType,
+):
+    """Process that msgpack-decodes tensors received via IPC."""
+    try:
+        if not encoder_ready.wait(timeout=10.0):
+            raise TimeoutError("Encoder did not signal ready")
+
+        encoded = payload_queue.get(timeout=5.0)
+        receiver = TensorIpcReceiver(tensor_queue)
+        decoder = MsgpackDecoder(TensorEnvelope, oob_tensor_provider=receiver)
+        decoded = decoder.decode(encoded)
+
+        result_queue.put(
+            {
+                "success": True,
+                "tensor_shape": tuple(decoded.tensor.shape),
+                "device": str(decoded.tensor.device),
+                "label": decoded.label,
+                "matches_expected": tuple(decoded.tensor.shape) == expected_shape,
+            }
+        )
+    except Exception as e:
+        import traceback
+
+        retrieval_done.set()
+        result_queue.put(
+            {"success": False, "error": str(e), "traceback": traceback.format_exc()}
+        )
+    else:
+        retrieval_done.set()
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_cuda_tensor_queue_basic():
+    """Test CUDA tensor IPC through the msgpack encoder/decoder path."""
+    tensor_queue = torch_mp.Queue()
+    payload_queue: mp.Queue = mp.Queue()
+    result_queue: mp.Queue = mp.Queue()
+    encoder_ready = mp.Event()
+    retrieval_done = mp.Event()
+
+    tensor_shape = (4, 8, 16)
+    tensor_dtype = torch.float32
+
+    encoder_proc = mp.Process(
+        target=encoder_process,
+        args=(
+            tensor_queue,
+            payload_queue,
+            result_queue,
+            {"shape": tensor_shape, "dtype": tensor_dtype},
+            encoder_ready,
+            retrieval_done,
+        ),
+    )
+    encoder_proc.start()
+
+    decoder_proc = mp.Process(
+        target=decoder_process,
+        args=(
+            tensor_queue,
+            payload_queue,
+            result_queue,
+            tensor_shape,
+            encoder_ready,
+            retrieval_done,
+        ),
+    )
+    decoder_proc.start()
+
+    encoder_result = result_queue.get(timeout=10.0)
+    decoder_result = result_queue.get(timeout=10.0)
+
+    encoder_proc.join(timeout=5.0)
+    decoder_proc.join(timeout=5.0)
+
+    # Verify results
+    assert encoder_result["success"], (
+        f"Encoder failed: {encoder_result.get('error')}\n"
+        f"{encoder_result.get('traceback', '')}"
+    )
+    assert decoder_result["success"], (
+        f"Decoder failed: {decoder_result.get('error')}\n"
+        f"{decoder_result.get('traceback', '')}"
+    )
+    assert decoder_result["matches_expected"], "Tensor shape mismatch"
+    assert "cuda" in decoder_result["device"], "Tensor not on CUDA device"
+    assert decoder_result["label"] == "cuda-msgpack"
+
+
+def test_cpu_tensor_fallback():
+    """Test that CPU tensors use standard serialization path."""
+    encoder = MsgpackEncoder()
+
+    # Create a CPU tensor
+    tensor = torch.randn(3, 4, dtype=torch.float32)
+
+    # Encode the tensor (should use standard path, not queue)
+    encoded = encoder.encode({"test_tensor": tensor})
+
+    # Verify encoding succeeded
+    assert len(encoded) > 0
+    assert isinstance(encoded, (list, tuple))
+
+    # Basic check: no queue should be used, so tensor goes through standard path
+    # This is mainly to ensure no exceptions are raised
+
+
+def test_msgpack_encoder_decoder_with_ipc():
+    """Test the full msgpack + tensor IPC path in one process."""
+    tensor_queue = torch_mp.Queue()
+    sender = TensorIpcSender(tensor_queue)
+    encoder = MsgpackEncoder(oob_tensor_consumer=sender)
+    receiver = TensorIpcReceiver(tensor_queue)
+    decoder = MsgpackDecoder(TensorEnvelope, oob_tensor_provider=receiver)
+
+    # Use CPU here to exercise the msgpack + sender/receiver integration
+    # without relying on same-process CUDA IPC behavior.
+    tensor = torch.randn(2, 3)
+
+    message = TensorEnvelope(tensor=tensor, label="test")
+    encoded = encoder.encode(message)
+    assert len(encoded) > 0
+
+    decoded = decoder.decode(encoded)
+    assert isinstance(decoded, TensorEnvelope)
+    assert decoded.label == "test"
+    assert torch.allclose(decoded.tensor, tensor)
+
+
+def test_decoder_buffer_management():
+    """Test receiver's tensor buffer management when draining queue."""
+    tensor_queue = torch_mp.Queue()
+
+    sender_id = "test_sender"
+    message_id = 1
+
+    # Put multiple tensors in queue using TensorIpcData
+    tensors_data = [
+        (0, torch.randn(2, 3)),
+        (1, torch.randn(4, 5)),
+        (2, torch.randn(6, 7)),
+    ]
+
+    for tensor_id, tensor in tensors_data:
+        ipc_data = TensorIpcData(
+            sender_id=sender_id,
+            message_id=message_id,
+            tensor_id=tensor_id,
+            tensor=tensor,
+        )
+        tensor_queue.put(ipc_data)
+
+    # Create receiver directly
+    receiver = TensorIpcReceiver(tensor_queue)
+
+    # Request tensor_id=2 (should buffer tensor_id=0 and tensor_id=1)
+    handle = {"sender_id": sender_id, "message_id": message_id, "tensor_id": 2}
+
+    result = receiver("float32", (6, 7), handle)
+    assert result.shape == (6, 7)
+
+    # Verify buffer has tensor_id 0 and 1
+    sender = receiver._tensor_buffers[sender_id]
+    tensors = sender.tensors.get(message_id, {})
+    assert 0 in tensors
+    assert 1 in tensors
+
+    # Request buffered tensor
+    handle2 = {"sender_id": sender_id, "message_id": message_id, "tensor_id": 0}
+
+    result2 = receiver("float32", (2, 3), handle2)
+    assert result2.shape == (2, 3)
+    # tensor_id 0 should be removed from buffer
+    sender = receiver._tensor_buffers[sender_id]
+    tensors = sender.tensors.get(message_id, {})
+    assert 0 not in tensors
+
+
+def api_server_worker(
+    server_id: int,
+    tensor_queue: torch_mp.Queue,
+    result_queue: mp.Queue,
+    barrier: BarrierType,
+    retrieval_done: EventType,
+):
+    """Worker simulating an API server: sends one tensor, then reports status."""
+    try:
+        # Each server sends a unique tensor
+        tensor = torch.ones(server_id + 1, server_id + 2) * server_id
+        sender_id = f"server_{server_id}"
+
+        # Wait for all servers; time out instead of hanging if a peer never arrives
+        barrier.wait(timeout=10.0)
+
+        # Send tensor using TensorIpcData
+        ipc_data = TensorIpcData(
+            sender_id=sender_id,
+            message_id=0,
+            tensor_id=0,
+            tensor=tensor,
+        )
+        tensor_queue.put(ipc_data)
+
+        result_queue.put({"server_id": server_id, "success": True})
+
+        # Keep process alive until main process has retrieved all tensors
+        # This prevents shared memory handles from being invalidated
+        retrieval_done.wait(timeout=30.0)
+    except Exception as e:
+        import traceback
+
+        result_queue.put(
+            {
+                "server_id": server_id,
+                "success": False,
+                "error": str(e),
+                "traceback": traceback.format_exc(),
+            }
+        )
+
+
+def test_multiple_api_servers_to_engine():
+    """Test multiple API servers sending to one engine core via multiprocessing."""
+    num_api_servers = 3
+    tensor_queue = torch_mp.Queue()
+    result_queue: mp.Queue = mp.Queue()
+    barrier = mp.Barrier(num_api_servers)
+    retrieval_done = mp.Event()
+
+    # Start multiple API server processes
+    processes = []
+    for server_id in range(num_api_servers):
+        proc = mp.Process(
+            target=api_server_worker,
+            args=(server_id, tensor_queue, result_queue, barrier, retrieval_done),
+        )
+        proc.start()
+        processes.append(proc)
+
+    try:
+        # Collect results from all servers
+        results = [result_queue.get(timeout=10.0) for _ in range(num_api_servers)]
+
+        # Verify all servers succeeded
+        for result in results:
+            assert result["success"], (
+                f"Server {result['server_id']} failed: {result.get('error')}"
+            )
+
+        # Verify all tensors are in queue
+        received_tensors = []
+        for _ in range(num_api_servers):
+            ipc_data = tensor_queue.get(timeout=1.0)
+            received_tensors.append((ipc_data.sender_id, ipc_data.tensor))
+
+        assert len(received_tensors) == num_api_servers
+
+        # Verify tensor content (order may vary with multiprocessing)
+        tensor_by_sender = {sid: t for sid, t in received_tensors}
+        for server_id in range(num_api_servers):
+            expected_id = f"server_{server_id}"
+            assert expected_id in tensor_by_sender, (
+                f"Missing tensor from server {server_id}"
+            )
+            expected_tensor = torch.ones(server_id + 1, server_id + 2) * server_id
+            assert torch.allclose(tensor_by_sender[expected_id], expected_tensor)
+    finally:
+        # Signal workers that retrieval is complete. Done in `finally` so a
+        # failed assertion above cannot leave them blocked in
+        # retrieval_done.wait() for the full 30 s timeout.
+        retrieval_done.set()
+
+        # Wait for all processes to complete
+        for proc in processes:
+            proc.join(timeout=5.0)
+
+
+def mixed_tensor_encoder_process(
+    tensor_queue: torch_mp.Queue,
+    result_queue: mp.Queue,
+    ready_event: EventType,
+    retrieval_done: EventType,
+):
+    """Process that puts one CUDA tensor on the IPC queue as raw TensorIpcData."""
+    try:
+        sender = TensorIpcSender(tensor_queue)
+        _encoder = MsgpackEncoder(oob_tensor_consumer=sender)
+
+        # Only a CUDA tensor is sent. The encoder above is constructed but unused:
+        # the tensor is wrapped in TensorIpcData and queued by hand below.
+        cuda_tensor = torch.randn(4, 5, device="cuda:0")
+
+        # Manually send via IPC to test the mechanism
+        cuda_tensor_shared = cuda_tensor.share_memory_()
+
+        ipc_data = TensorIpcData(
+            sender_id="mixed_encoder",
+            message_id=0,
+            tensor_id=0,
+            tensor=cuda_tensor_shared,
+        )
+        tensor_queue.put(ipc_data, timeout=10.0)
+
+        ready_event.set()
+
+        result_queue.put({"success": True, "sent_cuda": True})
+
+        # Keep process alive until decoder has retrieved the tensor
+        retrieval_done.wait(timeout=30.0)
+    except Exception as e:
+        import traceback
+
+        ready_event.set()
+        result_queue.put(
+            {"success": False, "error": str(e), "traceback": traceback.format_exc()}
+        )
+
+
+def mixed_tensor_decoder_process(
+    tensor_queue: torch_mp.Queue,
+    result_queue: mp.Queue,
+    encoder_ready: EventType,
+    retrieval_done: EventType,
+):
+    """Process that retrieves mixed tensors from queue."""
+    try:
+        # Wait for encoder to finish
+        if not encoder_ready.wait(timeout=10.0):
+            raise TimeoutError("Encoder did not signal ready")
+
+        # Try to get CUDA tensor from queue
+        ipc_data = tensor_queue.get(timeout=5.0)
+
+        result_queue.put(
+            {
+                "success": True,
+                "is_cuda": ipc_data.tensor.is_cuda,
+                "shape": tuple(ipc_data.tensor.shape),
+            }
+        )
+
+        # Signal that retrieval is complete
+        retrieval_done.set()
+    except Exception as e:
+        import traceback
+
+        retrieval_done.set()  # Signal even on failure
+        result_queue.put(
+            {"success": False, "error": str(e), "traceback": traceback.format_exc()}
+        )
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_mixed_cpu_cuda_tensors():
+    """Test encoding with mixed CPU and CUDA tensors using multiprocessing."""
+    tensor_queue = torch_mp.Queue()
+    result_queue: mp.Queue = mp.Queue()
+    encoder_ready = mp.Event()
+    retrieval_done = mp.Event()
+
+    # Start encoder process
+    encoder_proc = mp.Process(
+        target=mixed_tensor_encoder_process,
+        args=(tensor_queue, result_queue, encoder_ready, retrieval_done),
+    )
+    encoder_proc.start()
+
+    # Start decoder process
+    decoder_proc = mp.Process(
+        target=mixed_tensor_decoder_process,
+        args=(tensor_queue, result_queue, encoder_ready, retrieval_done),
+    )
+    decoder_proc.start()
+
+    # Get results
+    encoder_result = result_queue.get(timeout=10.0)
+    decoder_result = result_queue.get(timeout=10.0)
+
+    encoder_proc.join(timeout=5.0)
+    decoder_proc.join(timeout=5.0)
+
+    # Verify encoder succeeded
+    assert encoder_result["success"], (
+        f"Encoder failed: {encoder_result.get('error')}\n"
+        f"{encoder_result.get('traceback', '')}"
+    )
+
+    # Verify decoder succeeded and got CUDA tensor
+    assert decoder_result["success"], (
+        f"Decoder failed: {decoder_result.get('error')}\n"
+        f"{decoder_result.get('traceback', '')}"
+    )
+    assert decoder_result["is_cuda"], "Retrieved tensor is not on CUDA"
+    assert decoder_result["shape"] == (4, 5), (
+        f"Unexpected shape: {decoder_result['shape']}"
+    )
+
+
+def cpu_tensor_ipc_encoder_process(
+    tensor_queue: torch_mp.Queue,
+    result_queue: mp.Queue,
+    tensor_shape: tuple,
+    ready_event: EventType,
+    retrieval_done: EventType,
+):
+    """Process that encodes and sends CPU tensors via IPC queue."""
+    try:
+        # Create encoder with IPC enabled for all tensors
+        sender = TensorIpcSender(tensor_queue)
+        encoder = MsgpackEncoder(oob_tensor_consumer=sender)
+
+        # Create a CPU tensor
+        tensor = torch.randn(*tensor_shape, dtype=torch.float32)
+
+        # Encode the tensor (should use IPC queue, not standard serialization)
+        encoded = encoder.encode({"test_tensor": tensor})
+
+        # Signal that encoding is complete
+        ready_event.set()
+
+        result_queue.put(
+            {
+                "success": True,
+                "encoded_length": len(encoded),
+                "device": str(tensor.device),
+                "tensor_shape": tuple(tensor.shape),
+            }
+        )
+
+        # Keep process alive until decoder has retrieved the tensor
+        # This is necessary for CPU tensor shared memory to remain valid
+        retrieval_done.wait(timeout=30.0)
+    except Exception as e:
+        import traceback
+
+        ready_event.set()
+        result_queue.put(
+            {"success": False, "error": str(e), "traceback": traceback.format_exc()}
+        )
+
+
+def cpu_tensor_ipc_decoder_process(
+    tensor_queue: torch_mp.Queue,
+    result_queue: mp.Queue,
+    expected_shape: tuple,
+    encoder_ready: EventType,
+    retrieval_done: EventType,
+):
+    """Process that decodes and receives CPU tensors from IPC queue."""
+    try:
+        # Wait for encoder to finish sending
+        if not encoder_ready.wait(timeout=10.0):
+            raise TimeoutError("Encoder did not signal ready")
+
+        # Get tensor from queue
+        ipc_data = tensor_queue.get(timeout=5.0)
+
+        result_queue.put(
+            {
+                "success": True,
+                "tensor_id": ipc_data.tensor_id,
+                "tensor_shape": tuple(ipc_data.tensor.shape),
+                "device": str(ipc_data.tensor.device),
+                "matches_expected": tuple(ipc_data.tensor.shape) == expected_shape,
+                "is_cpu": ipc_data.tensor.device.type == "cpu",
+            }
+        )
+
+        # Signal that retrieval is complete
+        retrieval_done.set()
+    except Exception as e:
+        import traceback
+
+        retrieval_done.set()  # Signal even on failure
+        result_queue.put(
+            {"success": False, "error": str(e), "traceback": traceback.format_exc()}
+        )
+
+
+def test_cpu_tensor_ipc():
+    """Test CPU tensor sharing via IPC queue when mm_tensor_ipc is enabled."""
+    # Set up single queue and synchronization
+    tensor_queue = torch_mp.Queue()
+    result_queue: mp.Queue = mp.Queue()
+    encoder_ready = mp.Event()
+    retrieval_done = mp.Event()
+
+    tensor_shape = (3, 5, 7)
+
+    # Start encoder process
+    encoder_proc = mp.Process(
+        target=cpu_tensor_ipc_encoder_process,
+        args=(
+            tensor_queue,
+            result_queue,
+            tensor_shape,
+            encoder_ready,
+            retrieval_done,
+        ),
+    )
+    encoder_proc.start()
+
+    # Start decoder process
+    decoder_proc = mp.Process(
+        target=cpu_tensor_ipc_decoder_process,
+        args=(
+            tensor_queue,
+            result_queue,
+            tensor_shape,
+            encoder_ready,
+            retrieval_done,
+        ),
+    )
+    decoder_proc.start()
+
+    # Wait for processes and collect results
+    encoder_result = result_queue.get(timeout=10.0)
+    decoder_result = result_queue.get(timeout=10.0)
+
+    encoder_proc.join(timeout=5.0)
+    decoder_proc.join(timeout=5.0)
+
+    # Verify results
+    assert encoder_result["success"], (
+        f"Encoder failed: {encoder_result.get('error')}\n"
+        f"{encoder_result.get('traceback', '')}"
+    )
+    assert decoder_result["success"], (
+        f"Decoder failed: {decoder_result.get('error')}\n"
+        f"{decoder_result.get('traceback', '')}"
+    )
+    assert decoder_result["matches_expected"], "Tensor shape mismatch"
+    assert decoder_result["is_cpu"], "Tensor not on CPU device"
+
+
+def test_ipc_disabled_mode():
+    """Test that IPC is disabled when no sender is provided."""
+    tensor_queues = [torch_mp.Queue()]
+
+    # Create encoder without IPC sender (IPC disabled)
+    encoder = MsgpackEncoder()
+
+    # Create a CPU tensor
+    cpu_tensor = torch.randn(2, 3, dtype=torch.float32)
+
+    # Encode the tensor (should use standard serialization, not IPC)
+    encoded = encoder.encode({"test_tensor": cpu_tensor})
+
+    # Verify encoding succeeded
+    assert len(encoded) > 0
+    assert isinstance(encoded, (list, tuple))
+
+    # Verify queue is empty (no IPC was used)
+    assert tensor_queues[0].empty(), "Tensor queue should be empty when IPC is disabled"
+
+    # If CUDA is available, test with CUDA tensor too
+    if torch.cuda.is_available():
+        cuda_tensor = torch.randn(4, 5, device="cuda:0")
+        encoded_cuda = encoder.encode({"cuda_tensor": cuda_tensor})
+        assert len(encoded_cuda) > 0
+        assert tensor_queues[0].empty(), (
+            "Tensor queue should be empty for CUDA tensor when IPC is disabled"
+        )
+
+
+@dataclass
+class MultiTensorMessage:
+    """Message with multiple tensors to test multi-tensor IPC."""
+
+    t1: torch.Tensor
+    t2: torch.Tensor
+    sender_label: str
+
+
+def concurrent_sender_process(
+    tensor_queue: torch_mp.Queue,
+    payload_queue: mp.Queue,
+    result_queue: mp.Queue,
+    sender_index: int,
+    num_messages: int,
+    barrier: BarrierType,
+    retrieval_done: EventType,
+):
+    """Process that acts as one of N concurrent senders."""
+    try:
+        sender = TensorIpcSender(tensor_queue)
+        encoder = MsgpackEncoder(oob_tensor_consumer=sender)
+
+        # Wait for all senders to be ready before sending
+        barrier.wait(timeout=10.0)
+
+        encoded_payloads = []
+        for msg_idx in range(num_messages):
+            # Each sender creates uniquely-shaped tensors so we can
+            # verify correct routing on the receiver side.
+            t1 = torch.full((sender_index + 1, 3), float(msg_idx), dtype=torch.float32)
+            t2 = torch.full(
+                (2, sender_index + 2), float(msg_idx + 100), dtype=torch.float64
+            )
+            msg = MultiTensorMessage(
+                t1=t1,
+                t2=t2,
+                sender_label=f"sender_{sender_index}_msg_{msg_idx}",
+            )
+            encoded = encoder.encode(msg)
+            encoded_payloads.append(encoded)
+
+        # Send all encoded payloads via the regular (non-tensor) queue
+        for encoded in encoded_payloads:
+            payload_queue.put(encoded, timeout=10.0)
+
+        result_queue.put(
+            {
+                "success": True,
+                "sender_index": sender_index,
+                "num_sent": num_messages,
+            }
+        )
+
+        # Keep alive so shared-memory handles remain valid
+        retrieval_done.wait(timeout=30.0)
+    except Exception as e:
+        import traceback
+
+        result_queue.put(
+            {
+                "success": False,
+                "sender_index": sender_index,
+                "error": str(e),
+                "traceback": traceback.format_exc(),
+            }
+        )
+
+
+def test_concurrent_senders_single_receiver():
+    """Test N concurrent senders sharing one queue with a single receiver.
+
+    Each sender encodes multiple messages (each containing two tensors) via
+    its own MsgpackEncoder + TensorIpcSender.  A single TensorIpcReceiver
+    on the receiving side must correctly drain-and-buffer interleaved
+    TensorIpcData items from the shared queue and match them back to the
+    right message handles during decode.
+    """
+    num_senders = 4
+    num_messages_per_sender = 3
+    tensor_queue = torch_mp.Queue()
+    payload_queue: mp.Queue = mp.Queue()
+    result_queue: mp.Queue = mp.Queue()
+    barrier = mp.Barrier(num_senders)
+    retrieval_done = mp.Event()
+
+    # Launch sender processes
+    processes = []
+    for i in range(num_senders):
+        proc = mp.Process(
+            target=concurrent_sender_process,
+            args=(
+                tensor_queue,
+                payload_queue,
+                result_queue,
+                i,
+                num_messages_per_sender,
+                barrier,
+                retrieval_done,
+            ),
+        )
+        proc.start()
+        processes.append(proc)
+
+    decoded_messages: list[MultiTensorMessage] = []
+    try:
+        # Collect send confirmations
+        send_results = [result_queue.get(timeout=15.0) for _ in range(num_senders)]
+        for r in send_results:
+            assert r["success"], (
+                f"Sender {r['sender_index']} failed: {r.get('error')}\n"
+                f"{r.get('traceback', '')}"
+            )
+
+        # Now decode all messages from the main process using a single receiver
+        receiver = TensorIpcReceiver(tensor_queue)
+        decoder = MsgpackDecoder(MultiTensorMessage, oob_tensor_provider=receiver)
+
+        total = num_senders * num_messages_per_sender
+        for _ in range(total):
+            encoded = payload_queue.get(timeout=10.0)
+            decoded = decoder.decode(encoded)
+            assert isinstance(decoded, MultiTensorMessage)
+            decoded_messages.append(decoded)
+    finally:
+        # Signal senders they can exit, even if a check above failed, so they
+        # do not sit in retrieval_done.wait() until its timeout expires.
+        retrieval_done.set()
+
+    # Group by sender_label prefix to verify all messages arrived
+    by_sender: dict[int, list[MultiTensorMessage]] = {}
+    for msg in decoded_messages:
+        # label format: "sender_{i}_msg_{j}"
+        parts = msg.sender_label.split("_")
+        sender_idx = int(parts[1])
+        by_sender.setdefault(sender_idx, []).append(msg)
+
+    assert len(by_sender) == num_senders, (
+        f"Expected {num_senders} senders, got {len(by_sender)}"
+    )
+
+    for sender_idx in range(num_senders):
+        msgs = sorted(by_sender[sender_idx], key=lambda m: m.sender_label)
+        assert len(msgs) == num_messages_per_sender, (
+            f"Sender {sender_idx}: expected {num_messages_per_sender} "
+            f"messages, got {len(msgs)}"
+        )
+        for msg_idx, msg in enumerate(msgs):
+            assert msg.sender_label == f"sender_{sender_idx}_msg_{msg_idx}"
+            # Verify tensor shapes match what the sender created
+            assert msg.t1.shape == (sender_idx + 1, 3)
+            assert msg.t2.shape == (2, sender_idx + 2)
+            # Verify tensor values
+            assert torch.allclose(msg.t1, torch.full_like(msg.t1, float(msg_idx)))
+            assert torch.allclose(msg.t2, torch.full_like(msg.t2, float(msg_idx + 100)))
+
+    for proc in processes:
+        proc.join(timeout=5.0)
+
+
+def test_concurrent_senders_interleaved_buffer():
+    """Test receiver buffering when tensors from multiple senders interleave.
+
+    Manually enqueue TensorIpcData from two senders in an interleaved order
+    and verify the receiver correctly buffers and retrieves each tensor by
+    its (sender_id, message_id, tensor_id) handle.
+    """
+    tensor_queue = torch_mp.Queue()
+
+    # Sender A: 2 tensors for message 1
+    a_t0 = torch.randn(2, 3)
+    a_t1 = torch.randn(4, 5)
+    # Sender B: 2 tensors for message 1
+    b_t0 = torch.randn(6, 7)
+    b_t1 = torch.randn(8, 9)
+
+    # Interleave: B_t0, A_t0, B_t1, A_t1
+    for sid, mid, tid, t in [
+        ("B", 1, 0, b_t0),
+        ("A", 1, 0, a_t0),
+        ("B", 1, 1, b_t1),
+        ("A", 1, 1, a_t1),
+    ]:
+        tensor_queue.put(
+            TensorIpcData(sender_id=sid, message_id=mid, tensor_id=tid, tensor=t)
+        )
+
+    receiver = TensorIpcReceiver(tensor_queue)
+
+    # Request A_t1 first — receiver must drain and buffer B_t0, A_t0, B_t1
+    result = receiver(
+        "float32", a_t1.shape, {"sender_id": "A", "message_id": 1, "tensor_id": 1}
+    )
+    assert torch.equal(result, a_t1)
+
+    # Now request B_t0 from buffer
+    result = receiver(
+        "float32", b_t0.shape, {"sender_id": "B", "message_id": 1, "tensor_id": 0}
+    )
+    assert torch.equal(result, b_t0)
+
+    # Request A_t0 from buffer
+    result = receiver(
+        "float32", a_t0.shape, {"sender_id": "A", "message_id": 1, "tensor_id": 0}
+    )
+    assert torch.equal(result, a_t0)
+
+    # Request B_t1 from buffer (float32, like every tensor enqueued above)
+    result = receiver(
+        "float32", b_t1.shape, {"sender_id": "B", "message_id": 1, "tensor_id": 1}
+    )
+    assert torch.equal(result, b_t1)
+
+    # All buffers should be drained
+    for sid in ("A", "B"):
+        tensors = receiver._tensor_buffers[sid].tensors.get(1, {})
+        assert len(tensors) == 0, f"Sender {sid} buffer not empty: {tensors}"
+
+
+def test_mixed_cpu_cuda_with_ipc_enabled():
+    """Test that encoder is configured correctly for IPC with all tensor types."""
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+    tensor_queue = torch_mp.Queue()
+
+    # Create sender and encoder with IPC enabled
+    sender = TensorIpcSender(tensor_queue)
+    encoder = MsgpackEncoder(oob_tensor_consumer=sender)
+
+    # Verify sender configuration
+    assert encoder.oob_tensor_consumer is not None, "Consumer should be set"
+
+    # Note: Actual IPC transfer only works across processes
+    # (tested in test_cpu_tensor_ipc)
+    # This test just verifies the configuration is correct
+
+
+def test_tensor_cleanup_after_decode():
+    """Test that tensors are removed from tracking after successful decode."""
+    # Create a tensor queue
+    tensor_queue = torch_mp.Queue()
+
+    # Create and encode a tensor
+    tensor = torch.randn(5, 5)
+    # Move to shared memory for IPC
+    if not tensor.is_shared():
+        tensor.share_memory_()
+
+    # Manually create a TensorIpcData and put it in the queue
+    sender_id = "test_sender"
+    message_id = 0
+    tensor_id = 0
+    ipc_data = TensorIpcData(
+        sender_id=sender_id,
+        message_id=message_id,
+        tensor_id=tensor_id,
+        tensor=tensor,
+    )
+    tensor_queue.put(ipc_data)
+
+    # Create receiver directly
+    receiver = TensorIpcReceiver(tensor_queue)
+
+    handle = {
+        "sender_id": sender_id,
+        "message_id": message_id,
+        "tensor_id": tensor_id,
+    }
+
+    # Receive the tensor - this should retrieve it from the queue
+    decoded_tensor = receiver(
+        str(tensor.dtype).removeprefix("torch."), tensor.shape, handle
+    )
+
+    # Verify the tensor was decoded
+    assert decoded_tensor.shape == tensor.shape, "Decoded tensor should match shape"
+
+    # Verify the tensor was removed from buffer after decode
+    sender = receiver._tensor_buffers[sender_id]
+    tensors = sender.tensors.get(message_id, {})
+    assert tensor_id not in tensors, "Tensor should be removed from buffer"
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 6ea65c6944b0..c4a55c8370e0 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -378,3 +378,65 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, swap_list: lis
     ref_input_batch.refresh_metadata()
 
     _compare_objs(input_batch, ref_input_batch)
+
+
+def _construct_pooling_request(req_id_suffix: int):
+    """Build a CachedRequestState for a pooling ("classify") request."""
+    from vllm.pooling_params import PoolingParams
+
+    # Draw the prompt length first, then that many random token ids.
+    num_prompt_tokens = np.random.randint(10, MAX_PROMPT_SIZE)
+    token_ids = [np.random.randint(0, VOCAB_SIZE) for _ in range(num_prompt_tokens)]
+    return CachedRequestState(
+        req_id=f"pool_req_{req_id_suffix}",
+        prompt_token_ids=token_ids,
+        sampling_params=None,
+        pooling_params=PoolingParams(task="classify"),
+        mm_features=[],
+        block_ids=([],),
+        generator=None,
+        num_computed_tokens=0,
+        output_token_ids=[],
+    )
+
+
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+def test_pooling_prompt_lens_not_aliased(device: str):
+    """Verify that prompt_lens in PoolingMetadata does not share memory
+    with the internal num_prompt_tokens pinned buffer. Guards against possible
+    non-determinism in pooling metadata due to mutations to the internal buffer.
+    """
+    batch_size = 4
+    input_batch = InputBatch(
+        max_num_reqs=batch_size * 2,
+        max_model_len=MAX_PROMPT_SIZE + NUM_OUTPUT_TOKENS,
+        max_num_batched_tokens=batch_size * (MAX_PROMPT_SIZE + NUM_OUTPUT_TOKENS),
+        device=torch.device(device),
+        pin_memory=is_pin_memory_available(),
+        vocab_size=VOCAB_SIZE,
+        block_sizes=[16],
+        kernel_block_sizes=[16],
+        is_pooling_model=True,
+    )
+
+    reqs = []
+    # Add batch_size pooling requests (reqs is not read again in this test)
+    for i in range(batch_size):
+        req = _construct_pooling_request(i)
+        input_batch.add_request(req)
+        reqs.append(req)
+    input_batch.refresh_metadata()
+
+    # Take the metadata, then clone prompt_lens as the expected (pre-mutation) value
+    metadata = input_batch.get_pooling_metadata()
+    prompt_lens_snapshot = metadata.prompt_lens.clone()
+
+    # Mutate the internal buffer (simulates next batch adding new requests)
+    input_batch.num_prompt_tokens_cpu_tensor.fill_(999)
+
+    # If prompt_lens aliased the buffer it would now read 999 and differ from the clone
+    assert torch.equal(metadata.prompt_lens, prompt_lens_snapshot), (
+        "prompt_lens shares memory with internal pinned buffer; "
+        "mutations to num_prompt_tokens_cpu_tensor corrupted prompt_lens. "
+        f"Expected {prompt_lens_snapshot}, got {metadata.prompt_lens}"
+    )
diff --git a/tests/v1/worker/test_mamba_utils.py b/tests/v1/worker/test_mamba_utils.py
index df3b7de9b4c9..c5d0661476e3 100644
--- a/tests/v1/worker/test_mamba_utils.py
+++ b/tests/v1/worker/test_mamba_utils.py
@@ -36,6 +36,7 @@ def test_resumed_req_ids_cleared_from_mamba_state_idx():
     spec = MagicMock(block_size=64, num_speculative_blocks=0)
     cache_config = MagicMock(enable_prefix_caching=True)
     input_batch = MagicMock(req_ids=[])
+    copy_bufs = MagicMock(mamba_group_ids=[0], mamba_spec=spec)
 
     mamba_state_idx = {
         "finished": 1,
@@ -62,7 +63,7 @@ def test_resumed_req_ids_cleared_from_mamba_state_idx():
             {},
             {},
             (),
-            MagicMock(),
+            copy_bufs,
         )
 
     assert mamba_state_idx == {"keep": 99}
diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py
index b527ffcf9b18..24a5b9bee3f5 100644
--- a/vllm/assets/audio.py
+++ b/vllm/assets/audio.py
@@ -8,15 +8,10 @@
 
 import numpy.typing as npt
 
-from vllm.utils.import_utils import PlaceholderModule
+from vllm.multimodal.media.audio import load_audio
 
 from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets
 
-try:
-    import librosa
-except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
-
 ASSET_DIR = "multimodal_asset"
 
 AudioAssetName = Literal["winning_call", "mary_had_lamb"]
@@ -33,7 +28,7 @@ def filename(self) -> str:
     @property
     def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]:
         audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
-        return librosa.load(audio_path, sr=None)
+        return load_audio(audio_path, sr=None)
 
     def get_local_path(self) -> Path:
         return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR)
diff --git a/vllm/assets/video.py b/vllm/assets/video.py
index d025368cbd43..f5e443db978f 100644
--- a/vllm/assets/video.py
+++ b/vllm/assets/video.py
@@ -10,15 +10,10 @@
 from huggingface_hub import hf_hub_download
 from PIL import Image
 
-from vllm.utils.import_utils import PlaceholderModule
+from vllm.multimodal.media.audio import load_audio_pyav
 
 from .base import get_cache_dir
 
-try:
-    import librosa
-except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
-
 
 @lru_cache
 def download_video_asset(filename: str) -> str:
@@ -146,4 +141,4 @@ def get_audio(self, sampling_rate: float | None = None) -> npt.NDArray:
 
         See also: examples/offline_inference/qwen2_5_omni/only_thinker.py
         """
-        return librosa.load(self.video_path, sr=sampling_rate)[0]
+        return load_audio_pyav(self.video_path, sr=sampling_rate)[0]
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index 1e0a63dd6eb3..8304e8703b55 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -38,6 +38,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.audio import get_audio_duration
 from vllm.multimodal.image import convert_image_mode
 from vllm.tokenizers import TokenizerLike
 from vllm.utils.argparse_utils import FlexibleArgumentParser
@@ -54,10 +55,6 @@
 except ImportError:
     pd = PlaceholderModule("pandas")
 
-try:
-    import librosa
-except ImportError:
-    librosa = PlaceholderModule("librosa")
 
 logger = logging.getLogger(__name__)
 
@@ -3253,7 +3250,7 @@ def sample(
                 break
             audio = item["audio"]
             y, sr = audio["array"], audio["sampling_rate"]
-            duration_s = librosa.get_duration(y=y, sr=sr)
+            duration_s = get_audio_duration(y=y, sr=sr)
             if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec:
                 skipped += 1
                 continue
diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py
index 2b667344ff37..c089f02a37ff 100644
--- a/vllm/compilation/caching.py
+++ b/vllm/compilation/caching.py
@@ -11,10 +11,13 @@
 from unittest.mock import patch
 
 import torch
+from torch._subclasses import FakeTensorMode
+from torch.fx._graph_pickler import GraphPickler, Options
 from torch.utils import _pytree as pytree
 
 import vllm.envs as envs
 from vllm.compilation.compiler_interface import get_inductor_factors
+from vllm.compilation.counter import compilation_counter
 from vllm.config import VllmConfig, get_current_vllm_config
 from vllm.config.utils import hash_factors
 from vllm.logger import init_logger
@@ -59,6 +62,7 @@ def insert(self, submod_name: str, shape: str, entry: bytes) -> None:
         self.submodule_bytes[f"{submod_name}_{shape}"] = hex_digest
         if hex_digest not in self.submodule_bytes_store:
             self.submodule_bytes_store[hex_digest] = entry
+            compilation_counter.num_compiled_artifacts_saved += 1
             logger.debug(
                 "inserting new artifact for submod %s with shape %s "
                 "(%s bytes) at hash %s",
@@ -122,6 +126,7 @@ def load_all(self) -> None:
 
         def _load_entry(entry_bytes: bytes) -> AOTCompiledArtifact:
             entry = pickle.loads(entry_bytes)
+            compilation_counter.num_compiled_artifacts_loaded += 1
             return AOTCompiledArtifact.deserialize(entry)
 
         with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -206,26 +211,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any:
         return self.optimized_call(*args, **kwargs)
 
     @classmethod
-    def serialize_compile_artifacts(
-        cls, compiled_fn: "VllmSerializableFunction"
-    ) -> bytes:
+    def serialize_graph_module(cls, graph_module: torch.fx.GraphModule) -> bytes:
         import sympy
-        from torch._subclasses import FakeTensorMode
-        from torch.fx._graph_pickler import GraphPickler, Options
-
-        state = compiled_fn.__dict__.copy()
-        state.pop("optimized_call")
-        state.pop("shape_env")
-        state.pop("vllm_backend", None)
-        state.pop("_fake_mode", None)
-        for node in state["graph_module"].graph.nodes:
-            node.meta.pop("source_fn_stack", None)
-            node.meta.pop("nn_module_stack", None)
-        for name, submod in state["graph_module"].named_children():
-            if hasattr(submod, "graph"):
-                for node in submod.graph.nodes:
-                    node.meta.pop("source_fn_stack", None)
-                    node.meta.pop("nn_module_stack", None)
 
         graph_reducer_override = GraphPickler.reducer_override
 
@@ -242,6 +229,37 @@ def _graph_reducer_override(
                 return type(None), ()
             return graph_reducer_override(self, obj)
 
+        with (
+            patch.object(GraphPickler, "reducer_override", _graph_reducer_override),
+            patch_pytree_map_over_slice(),
+        ):
+            return GraphPickler.dumps(graph_module, Options(ops_filter=None))
+
+    @classmethod
+    def deserialize_graph_module(
+        cls, data: bytes, fake_mode: FakeTensorMode
+    ) -> torch.fx.GraphModule:
+        with patch_pytree_map_over_slice():  # same patch serialize_graph_module uses
+            return GraphPickler.loads(data, fake_mode)
+
+    @classmethod
+    def serialize_compile_artifacts(
+        cls, compiled_fn: "VllmSerializableFunction"
+    ) -> bytes:
+        state = compiled_fn.__dict__.copy()
+        state.pop("optimized_call")
+        state.pop("shape_env")
+        state.pop("vllm_backend", None)
+        state.pop("_fake_mode", None)
+        for node in state["graph_module"].graph.nodes:
+            node.meta.pop("source_fn_stack", None)
+            node.meta.pop("nn_module_stack", None)
+        for name, submod in state["graph_module"].named_children():
+            if hasattr(submod, "graph"):
+                for node in submod.graph.nodes:
+                    node.meta.pop("source_fn_stack", None)
+                    node.meta.pop("nn_module_stack", None)
+
         if state.get("sym_tensor_indices"):
             # put tensor inputs on meta device since their data
             # isn't needed, yet we need the meta for make_copy_and_call
@@ -257,14 +275,9 @@ def _graph_reducer_override(
                 lambda inp: torch.empty_like(inp, device="meta"),
                 state["example_inputs"],
             )
-        with (
-            patch.object(GraphPickler, "reducer_override", _graph_reducer_override),
-            patch_pytree_map_over_slice(),
-        ):
-            state["graph_module"] = GraphPickler.dumps(
-                state["graph_module"], Options(ops_filter=None)
-            )
-            state["example_inputs"] = GraphPickler.dumps(state["example_inputs"])
+
+        state["graph_module"] = cls.serialize_graph_module(state["graph_module"])
+        state["example_inputs"] = GraphPickler.dumps(state["example_inputs"])
 
         if compiled_fn.vllm_backend:
             (
@@ -280,14 +293,14 @@ def _graph_reducer_override(
     @classmethod
     def deserialize_compile_artifacts(cls, data: bytes) -> "VllmSerializableFunction":
         from torch._guards import TracingContext, tracing
-        from torch._subclasses import FakeTensorMode
-        from torch.fx._graph_pickler import GraphPickler
         from torch.fx.experimental.symbolic_shapes import ShapeEnv
 
         state = pickle.loads(data)
         fake_mode = FakeTensorMode(shape_env=ShapeEnv())
-        with patch_pytree_map_over_slice():
-            state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode)
+
+        state["graph_module"] = cls.deserialize_graph_module(
+            state["graph_module"], fake_mode
+        )
         state["graph_module"].recompile()
         state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode)
 
diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
index f141a7c171f7..d55b305992e9 100644
--- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
+++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py
@@ -62,6 +62,11 @@
         4: 32,  # 32MB
         8: 1,  # 1MB
     },
+    103: {
+        2: 64,  # 64MB
+        4: 64,  # 64MB
+        8: 2,  # 2MB
+    },
 }
 
 # Max size of the input tensor per world size per device capability
@@ -78,6 +83,11 @@
         4: 4,  # 4MB
         8: 1,  # 1MB
     },
+    103: {
+        2: 32,  # 32MB
+        4: 4,  # 4MB
+        8: 2,  # 2MB
+    },
 }
 
 
@@ -86,8 +96,6 @@
         destroy_fi_ar_workspace,
         get_fi_ar_quant_workspace,
         get_fi_ar_workspace,
-        initialize_fi_ar_quant_workspace,
-        initialize_fi_ar_workspace,
     )
 
     ar_fusion_patterns = flashinfer_comm.AllReduceFusionPattern
@@ -133,15 +141,23 @@ def call_trtllm_fused_allreduce_norm(
 
         # Select workspace based on pattern: quant patterns use the
         # trtllm quant workspace, non-quant patterns use the primary workspace.
-        if pattern_code in (
+        is_quant_pattern = pattern_code in (
             ar_fusion_patterns.kARResidualRMSNormFP8Quant,
             ar_fusion_patterns.kARResidualRMSNormFP4Quant,
-        ):
-            workspace = get_fi_ar_quant_workspace()
-        else:
-            workspace = get_fi_ar_workspace()
+        )
+        get_workspace_fn = (
+            get_fi_ar_quant_workspace if is_quant_pattern else get_fi_ar_workspace
+        )
+        workspace = get_workspace_fn(
+            world_size=world_size,
+            rank=get_tensor_model_parallel_rank(),
+            max_token_num=max_token_num,
+            hidden_dim=hidden_size,
+            dtype=allreduce_in.dtype,
+            group=get_tp_group().device_group,
+        )
         assert workspace is not None, (
-            "Flashinfer workspace must be initialized when using flashinfer"
+            "Flashinfer allreduce workspace must be initialized when using flashinfer"
         )
         assert flashinfer_comm is not None
         if norm_out is None:
@@ -753,35 +769,29 @@ def __init__(self, config: VllmConfig) -> None:
             scope="global",
         )
 
-        for workspace_init_fn in [
-            initialize_fi_ar_workspace,
-            initialize_fi_ar_quant_workspace,
-        ]:
-            try:
-                workspace_init_fn(
-                    world_size=self.tp_size,
-                    rank=rank,
-                    max_token_num=self.max_token_num,
-                    hidden_dim=self.hidden_dim,
-                    dtype=self.model_dtype,
-                    group=self.group,
-                )
-            except Exception as e:
-                if "multicast" in str(e).lower():
-                    logger.warning(
-                        "AllReduce fusion pass is disabled: flashinfer workspace "
-                        "creation failed: %s. This is expected on GPUs without "
-                        "NVSwitch (e.g., NVLink bridge-only or PCIe topologies). "
-                        "Falling back to non-fused allreduce.",
-                        str(e),
-                    )
-                else:
-                    logger.warning(
-                        "Failed to initialize FlashInfer All Reduce workspace: %s. "
-                        "AllReduce fusion pass will be disabled.",
-                        e,
-                    )
-                return
+        workspace_kwargs = dict(
+            world_size=self.tp_size,
+            rank=rank,
+            max_token_num=self.max_token_num,
+            hidden_dim=self.hidden_dim,
+            dtype=self.model_dtype,
+            group=self.group,
+        )
+        if get_fi_ar_workspace(**workspace_kwargs) is None:
+            logger.warning_once(
+                "Failed to initialize Flashinfer allreduce workspace. "
+                "Flashinfer allreduce-norm fusion will be disabled."
+            )
+            return
+
+        self.supports_quant_fusion = (
+            get_fi_ar_quant_workspace(**workspace_kwargs) is not None
+        )
+        if not self.supports_quant_fusion:
+            logger.warning_once(
+                "Failed to initialize Flashinfer allreduce workspace. "
+                "Flashinfer allreduce-norm-quant fusion will be disabled."
+            )
 
         self.allreduce_params = FlashInferFusedAllReduceParams(
             world_size=self.tp_size,
@@ -793,9 +803,8 @@ def __init__(self, config: VllmConfig) -> None:
 
     @enable_fake_mode
     def register_patterns(self) -> None:
-        supports_quantization = get_fi_ar_quant_workspace() is not None
         for epsilon in [1e-5, 1e-6]:
-            if supports_quantization:
+            if self.supports_quant_fusion:
                 AllReduceFusedRMSNormStaticQuantFP8Pattern(
                     epsilon,
                     self.model_dtype,
diff --git a/vllm/config/model.py b/vllm/config/model.py
index b12202f9c712..19787f80b50f 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -14,7 +14,12 @@
 from vllm.config.model_arch import (
     ModelArchitectureConfig,
 )
-from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig
+from vllm.config.multimodal import (
+    MMCacheType,
+    MMEncoderTPMode,
+    MMTensorIPC,
+    MultiModalConfig,
+)
 from vllm.config.pooler import PoolerConfig
 from vllm.config.scheduler import RunnerType
 from vllm.config.utils import config, getattr_iter
@@ -310,6 +315,7 @@ class ModelConfig:
     interleave_mm_strings: InitVar[bool | None] = None
     skip_mm_profiling: InitVar[bool | None] = None
     video_pruning_rate: InitVar[float | None] = None
+    mm_tensor_ipc: InitVar[MMTensorIPC] = None
 
     def compute_hash(self) -> str:
         """
@@ -430,6 +436,7 @@ def __post_init__(
         interleave_mm_strings: bool | None,
         skip_mm_profiling: bool | None,
         video_pruning_rate: float | None,
+        mm_tensor_ipc: MMTensorIPC,
     ) -> None:
         # Keep set served_model_name before maybe_model_redirect(self.model)
         self.served_model_name = get_served_model_name(
@@ -612,6 +619,7 @@ def __post_init__(
                 interleave_mm_strings=interleave_mm_strings,
                 skip_mm_profiling=skip_mm_profiling,
                 video_pruning_rate=video_pruning_rate,
+                mm_tensor_ipc=mm_tensor_ipc,
             )
 
             mm_config_kwargs = {
@@ -1112,6 +1120,22 @@ def verify_with_parallel_config(
                 f"({parallel_config.decode_context_parallel_size})."
             )
 
+        # torch_shm uses a single IPC queue to rank 0; DP>1 is
+        # incompatible because API servers can't know which
+        # CoreEngine the scheduler will assign work to. TP>1 is also
+        # not supported because it would require broadcasting MM
+        # tensors between all TP ranks. PP>1 is rejected as well.
+        if (
+            self.multimodal_config is not None
+            and self.multimodal_config.mm_tensor_ipc == "torch_shm"
+            and parallel_config.world_size_across_dp > 1
+        ):
+            raise ValueError(
+                "mm_tensor_ipc='torch_shm' is not supported with "
+                "data_parallel_size > 1 or tensor_parallel_size > 1 "
+                "or pipeline_parallel_size > 1."
+            )
+
     def get_sliding_window(self) -> int | None:
         """Get the sliding window size from the HF text config if present."""
         return getattr(self.hf_text_config, "sliding_window", None)
@@ -1435,10 +1459,10 @@ def requires_raw_input_tokens(self) -> bool:
     @property
     def score_type(self) -> ScoreType:
         """
-        Score API handles score/rerank for:
-        - "score" task (score_type: cross-encoder models)
-        - "embed" task (score_type: bi-encoder models)
-        - "token_embed" task (score_type: late interaction models)
+        Scoring API handles score/rerank for:\n
+        - "classify" task (score_type: cross-encoder models)\n
+        - "embed" task (score_type: bi-encoder models)\n
+        - "token_embed" task (score_type: late interaction models)\n
         """
         # fixme: self._model_info.score_type is the score type before
         #  as_seq_cls_model, which is "bi-encoder", rather than the
diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py
index f95a2e140c67..1c9bc43b01ca 100644
--- a/vllm/config/multimodal.py
+++ b/vllm/config/multimodal.py
@@ -59,6 +59,7 @@ class MultiModalDummyOptionsBuiltins(TypedDict, total=False):
 
 MMEncoderTPMode = Literal["weights", "data"]
 MMCacheType = Literal["shm", "lru"]
+MMTensorIPC = Literal["direct_rpc", "torch_shm"]
 MMDummyOptions: TypeAlias = dict[str, BaseDummyOptions]
 """
 A dictionary containing an entry for each modality type of dummy data.
@@ -172,6 +173,11 @@ class MultiModalConfig:
     Value sits in range [0;1) and determines fraction of media tokens
     from each video to be pruned.
     """
+    mm_tensor_ipc: MMTensorIPC = "direct_rpc"
+    """IPC (inter-process communication) method for multimodal tensors.
+    - "direct_rpc": Use msgspec serialization via RPC
+    - "torch_shm": Use torch.multiprocessing shared memory for zero-copy IPC
+    Defaults to "direct_rpc". """
 
     @field_validator("limit_per_prompt", mode="before")
     @classmethod
diff --git a/vllm/config/profiler.py b/vllm/config/profiler.py
index 6a40b9daddc0..e79e213106db 100644
--- a/vllm/config/profiler.py
+++ b/vllm/config/profiler.py
@@ -45,10 +45,10 @@ class ProfilerConfig:
     worker's traces (CPU & GPU) will be saved under this directory. Note that
     it must be an absolute path."""
 
-    torch_profiler_with_stack: bool = False
-    """If `True`, enables stack tracing in the torch profiler. Disabled by default
-    to reduce overhead. Can be enabled via VLLM_TORCH_PROFILER_WITH_STACK=1 env var
-    or --profiler-config.torch_profiler_with_stack=true CLI flag."""
+    torch_profiler_with_stack: bool = True
+    """If `True`, enables stack tracing in the torch profiler. Enabled by default
+    as it is useful for debugging. Can be disabled via 
+    --profiler-config.torch_profiler_with_stack=false CLI flag."""
 
     torch_profiler_with_flops: bool = False
     """If `True`, enables FLOPS counting in the torch profiler. Disabled by default."""
diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py
index f525ac871c3e..a178a8f54bc2 100644
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -120,7 +120,7 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool:
         and current_platform.is_cuda()
         and has_flashinfer()
         and (
-            current_platform.is_device_capability(100)
+            current_platform.is_device_capability_family(100)
             or current_platform.is_device_capability(90)
         )
         # tp-dp combination broken:
@@ -766,6 +766,17 @@ def __post_init__(self):
             else:
                 self.parallel_config.disable_nccl_for_dp_synchronization = False
 
+        if (
+            self.model_config is not None
+            and self.model_config.multimodal_config is not None
+            and self.model_config.multimodal_config.mm_tensor_ipc == "torch_shm"
+            and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"
+        ):
+            raise ValueError(
+                "torch_shm is known to fail without "
+                "VLLM_WORKER_MULTIPROC_METHOD set to spawn"
+            )
+
         from vllm.platforms import current_platform
 
         if (
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index 0cdff90320da..075f4e0859e4 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -10,6 +10,7 @@
 from vllm.distributed import get_dp_group, get_ep_group
 from vllm.forward_context import get_forward_context
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.utils.flashinfer import (
     has_flashinfer_nvlink_one_sided,
     has_flashinfer_nvlink_two_sided,
@@ -325,14 +326,20 @@ def _make_all2all_kwargs(self) -> dict[Any, Any]:
 
         assert num_rdma_bytes is not None
         assert num_qps_per_rank is not None
-        return dict(
+        # TODO: remove platform-specific logic
+        # once ROCm DeepEP is updated with the latest APIs.
+        kwargs = dict(
             group=self.cpu_group,
             num_nvl_bytes=num_nvl_bytes,
             num_rdma_bytes=num_rdma_bytes,
             low_latency_mode=False,
             num_qps_per_rank=num_qps_per_rank,
-            explicitly_destroy=True,
         )
+        if not current_platform.is_rocm():
+            kwargs.update(
+                explicitly_destroy=True,
+            )
+        return kwargs
 
     def get_handle(self, kwargs):
         assert len(kwargs) == 0, (
@@ -397,16 +404,22 @@ def _make_all2all_kwargs(
         )
 
         assert num_rdma_bytes is not None
-        return dict(
+        # TODO: remove platform-specific logic
+        # once ROCm DeepEP is updated with the latest APIs.
+        kwargs = dict(
             group=self.cpu_group,
             num_nvl_bytes=num_nvl_bytes,
             num_rdma_bytes=num_rdma_bytes,
             low_latency_mode=True,
             num_qps_per_rank=num_qps_per_rank,
-            allow_nvlink_for_low_latency_mode=True,
-            allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL,
-            explicitly_destroy=True,
         )
+        if not current_platform.is_rocm():
+            kwargs.update(
+                allow_nvlink_for_low_latency_mode=True,
+                allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL,
+                explicitly_destroy=True,
+            )
+        return kwargs
 
     def get_handle(self, kwargs):
         """
diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py
index 3c347ef756d4..9777be5aa7f8 100644
--- a/vllm/distributed/device_communicators/all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/all_reduce_utils.py
@@ -44,6 +44,12 @@
         6: 1 * MiB,  # 1 MB
         8: 1 * MiB,  # 1 MB
     },
+    "10.3": {
+        2: 4 * MiB,  # 4 MB
+        4: 4 * MiB,  # 4 MB
+        6: 8 * MiB,  # 8 MB
+        8: 4 * MiB,  # 4 MB
+    },
 }
 
 SYMM_MEM_ALL_REDUCE_MAX_SIZES = {
@@ -59,6 +65,12 @@
         6: 128 * MiB,  # 128 MB
         8: 128 * MiB,  # 128 MB
     },
+    "10.3": {
+        2: 4 * MiB,  # 4 MB
+        4: 32 * MiB,  # 32 MB
+        6: 32 * MiB,  # 32 MB
+        8: 64 * MiB,  # 64 MB
+    },
 }
 
 # NCCL symmetric memory allreduce configuration based on H100 and GB200 benchmarks.
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index bd5741e8dc72..4550bdb25629 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -338,6 +338,7 @@ def broadcast(self, tensor: torch.Tensor, src: int = 0) -> torch.Tensor:
 
     def destroy(self):
         if self.pynccl_comm is not None:
+            self.pynccl_comm.destroy()
             self.pynccl_comm = None
         if self.ca_comm is not None:
             self.ca_comm = None
diff --git a/vllm/distributed/device_communicators/flashinfer_all_reduce.py b/vllm/distributed/device_communicators/flashinfer_all_reduce.py
index 66e089182869..b2edfc15d731 100644
--- a/vllm/distributed/device_communicators/flashinfer_all_reduce.py
+++ b/vllm/distributed/device_communicators/flashinfer_all_reduce.py
@@ -29,50 +29,27 @@
 except ImportError:
     pass
 
-# Global workspace for standalone allreduce and non-quant ar+rms fusion
+# Workspace for standalone allreduce and non-quant ar+rms fusion
 _fi_ar_workspace = None
 # Extra workspace for quant fusion patterns (only supported by trtllm backend)
-# Only created if primary workspace is not already trtllm
 _fi_ar_quant_workspace = None
 
 
-def get_fi_ar_workspace():
-    return _fi_ar_workspace
-
-
-def get_fi_ar_quant_workspace():
-    return _fi_ar_quant_workspace
-
-
-def initialize_fi_ar_workspace(
+def _create_workspace(
+    backend: str,
     world_size: int,
     rank: int,
     max_token_num: int,
     hidden_dim: int,
     dtype: torch.dtype,
     group: ProcessGroup,
-) -> None:
-    """
-    Initialize the workspace if not already initialized.
-
-    Currently, this function is called by either the AllReduceFusionPass
-    or the FlashInferAllReduce backend for standalone allreduce.
-    If the fusion pass is enabled via
-    --compilation-config.pass_config.fuse_allreduce_rms=true,
-    it will create the workspace first, and the standalone backend
-    will reuse the workspace. Otherwise, the standalone backend will
-    create the workspace.
-    """
-    global _fi_ar_workspace
-    if _fi_ar_workspace is not None:
-        return
-
-    backend = envs.VLLM_FLASHINFER_ALLREDUCE_BACKEND
+):
+    """Create a flashinfer allreduce workspace, returning None on failure."""
     comm_backend = TorchDistBackend(group=group)
     rng_state = random.getstate()
     try:
         random.seed(int.from_bytes(os.urandom(16), byteorder="big"))
-        _fi_ar_workspace = flashinfer_comm.create_allreduce_fusion_workspace(
+        workspace = flashinfer_comm.create_allreduce_fusion_workspace(
             backend=backend,
             world_size=world_size,
             rank=rank,
@@ -81,9 +58,22 @@ def initialize_fi_ar_workspace(
             dtype=dtype,
             comm_backend=comm_backend,
         )
+    except Exception as e:
+        if "multicast" in str(e).lower():
+            logger.warning_once(
+                "Failed to initialize FlashInfer All Reduce workspace: %s. "
+                "This is expected on GPUs without NVSwitch (e.g., NVLink "
+                "bridge-only or PCIe topologies).",
+                e,
+            )
+        else:
+            logger.warning_once(
+                "Failed to initialize FlashInfer All Reduce workspace: %s.",
+                e,
+            )
+        return None
     finally:
         random.setstate(rng_state)
-    assert _fi_ar_workspace is not None
     logger.debug(
         "Initialized FlashInfer All Reduce workspace: backend=%s, "
         "world_size=%d, rank=%d, max_token_num=%d, hidden_dim=%d, dtype=%s",
@@ -94,70 +84,84 @@ def initialize_fi_ar_workspace(
         hidden_dim,
         dtype,
     )
+    return workspace
+
+
+def get_fi_ar_workspace(
+    world_size: int,
+    rank: int,
+    max_token_num: int,
+    hidden_dim: int,
+    dtype: torch.dtype,
+    group: ProcessGroup,
+):
+    """
+    Return the allreduce workspace for non-quant patterns, initializing if needed.
+
+    Used by AllReduceFusionPass (non-quant patterns) and FlashInferAllReduce
+    for standalone allreduce. Backend is controlled by
+    VLLM_FLASHINFER_ALLREDUCE_BACKEND env var.
+    """
+    global _fi_ar_workspace
+    if _fi_ar_workspace is not None:
+        return _fi_ar_workspace
+
+    backend = envs.VLLM_FLASHINFER_ALLREDUCE_BACKEND
+
+    # Reuse the quant workspace if it was already created with the same backend
+    if _fi_ar_quant_workspace is not None and _fi_ar_quant_workspace.backend == backend:
+        _fi_ar_workspace = _fi_ar_quant_workspace
+        return _fi_ar_workspace
+
+    _fi_ar_workspace = _create_workspace(
+        backend, world_size, rank, max_token_num, hidden_dim, dtype, group
+    )
+    return _fi_ar_workspace
 
 
-def initialize_fi_ar_quant_workspace(
+def get_fi_ar_quant_workspace(
     world_size: int,
     rank: int,
     max_token_num: int,
     hidden_dim: int,
     dtype: torch.dtype,
     group: ProcessGroup,
-) -> None:
+):
     """
-    Initialize the workspace used by quantization fusion patterns.
+    Return the allreduce workspace for quant patterns, initializing if needed.
 
-    Currently this always creates a workspace for trtllm backend as only it
-    supports quantization fusion (FP8/FP4). If the primary workspace
-    is already trtllm, the quant workspace aliases to it.
+    Always uses trtllm backend as it is the only one supporting quantization
+    fusion (FP8/FP4).
     """
     global _fi_ar_quant_workspace
     if _fi_ar_quant_workspace is not None:
-        return
+        return _fi_ar_quant_workspace
 
-    # If primary workspace is already trtllm, reuse it
+    # Reuse the non-quant workspace if it was already created with trtllm
     if _fi_ar_workspace is not None and _fi_ar_workspace.backend == "trtllm":
         _fi_ar_quant_workspace = _fi_ar_workspace
-        return
+        return _fi_ar_quant_workspace
 
-    comm_backend = TorchDistBackend(group=group)
-    _fi_ar_quant_workspace = flashinfer_comm.create_allreduce_fusion_workspace(
-        backend="trtllm",
-        world_size=world_size,
-        rank=rank,
-        max_token_num=max_token_num,
-        hidden_dim=hidden_dim,
-        dtype=dtype,
-        comm_backend=comm_backend,
-    )
-    assert _fi_ar_quant_workspace is not None
-    logger.debug(
-        "Initialized FlashInfer All Reduce workspace: backend=trtllm, "
-        "world_size=%d, rank=%d, max_token_num=%d, hidden_dim=%d, dtype=%s",
-        world_size,
-        rank,
-        max_token_num,
-        hidden_dim,
-        dtype,
+    _fi_ar_quant_workspace = _create_workspace(
+        "trtllm", world_size, rank, max_token_num, hidden_dim, dtype, group
     )
+    return _fi_ar_quant_workspace
 
 
 _fi_ar_workspace_lock = threading.Lock()
 
 
 def destroy_fi_ar_workspace():
-    global _fi_ar_workspace
-    global _fi_ar_quant_workspace
+    global _fi_ar_workspace, _fi_ar_quant_workspace
     with _fi_ar_workspace_lock:
-        if (
-            _fi_ar_quant_workspace is not None
-            and _fi_ar_quant_workspace is not _fi_ar_workspace
-        ):
-            _fi_ar_quant_workspace.destroy()
-        _fi_ar_quant_workspace = None
+        is_alias = _fi_ar_workspace is _fi_ar_quant_workspace
+
         if _fi_ar_workspace is not None:
             _fi_ar_workspace.destroy()
-            _fi_ar_workspace = None
+        if _fi_ar_quant_workspace is not None and not is_alias:
+            _fi_ar_quant_workspace.destroy()
+
+        _fi_ar_workspace = _fi_ar_quant_workspace = None
 
 
 atexit.register(destroy_fi_ar_workspace)
@@ -209,29 +213,21 @@ def __init__(
 
     def _ensure_workspace(self, hidden_dim: int, dtype: torch.dtype) -> bool:
         """Ensure the all reduce workspace is initialized."""
-        if get_fi_ar_workspace() is not None:
-            return True
         if self.max_num_tokens == 0:
             element_size = torch.tensor([], dtype=dtype, device="cpu").element_size()
             self.max_num_tokens = self.max_workspace_size // (hidden_dim * element_size)
-        try:
-            initialize_fi_ar_workspace(
-                world_size=self.world_size,
-                rank=self.rank,
-                max_token_num=self.max_num_tokens,
-                hidden_dim=hidden_dim,
-                dtype=dtype,
-                group=self.group,
-            )
-            return True
-        except Exception as e:
-            logger.warning(
-                "Failed to initialize FlashInfer All Reduce workspace: %s. "
-                "FlashInfer All Reduce will be disabled.",
-                e,
-            )
+        workspace = get_fi_ar_workspace(
+            world_size=self.world_size,
+            rank=self.rank,
+            max_token_num=self.max_num_tokens,
+            hidden_dim=hidden_dim,
+            dtype=dtype,
+            group=self.group,
+        )
+        if workspace is None:
             self.disabled = True
             return False
+        return True
 
     def should_use_fi_ar(self, input_tensor: torch.Tensor) -> bool:
         if self.disabled:
@@ -257,7 +253,15 @@ def should_use_fi_ar(self, input_tensor: torch.Tensor) -> bool:
         return self._ensure_workspace(hidden_dim, input_tensor.dtype)
 
     def all_reduce(self, input_tensor: torch.Tensor) -> torch.Tensor:
-        workspace = get_fi_ar_workspace()
+        _, hidden_dim = input_tensor.shape
+        workspace = get_fi_ar_workspace(
+            world_size=self.world_size,
+            rank=self.rank,
+            max_token_num=self.max_num_tokens,
+            hidden_dim=hidden_dim,
+            dtype=input_tensor.dtype,
+            group=self.group,
+        )
         return flashinfer_comm.allreduce_fusion(
             input=input_tensor,
             workspace=workspace,
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 84a032541015..6ac3b9ea3c7c 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -145,6 +145,13 @@ def __init__(
             stream.synchronize()
             del data
 
+    def destroy(self):
+        if self.available and not self.disabled:
+            with torch.accelerator.device_index(self.device.index):
+                self.nccl.ncclCommDestroy(self.comm)
+            self.available = False
+            self.disabled = True
+
     def all_reduce(
         self,
         in_tensor: torch.Tensor,
diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py
index 98c7ac20a171..c25ff8cf1a8d 100644
--- a/vllm/distributed/device_communicators/symm_mem.py
+++ b/vllm/distributed/device_communicators/symm_mem.py
@@ -28,6 +28,7 @@ class SymmMemCommunicator:
     _WORLD_SIZES_MULTIMEM = {
         "9.0": [4, 6, 8],
         "10.0": [6, 8],
+        "10.3": [6, 8],
     }
 
     def __init__(
diff --git a/vllm/distributed/elastic_ep/elastic_execute.py b/vllm/distributed/elastic_ep/elastic_execute.py
index 00ac6d84b425..8b05c58eaec5 100644
--- a/vllm/distributed/elastic_ep/elastic_execute.py
+++ b/vllm/distributed/elastic_ep/elastic_execute.py
@@ -145,11 +145,37 @@ def execute(self, execute_method: str, *args, **kwargs):
             raise ValueError(f"Unknown execute method: {execute_method}")
         return method(*args, **kwargs)
 
+    def _set_eplb_suppressed(self, suppressed: bool) -> None:
+        self.worker.model_runner.eep_eplb_suppressed = suppressed
+        ep_group = get_standby_ep_group() or get_ep_group()
+        if ep_group.rank == 0:
+            logger.info(
+                "[Elastic EP] EPLB %s elastic scaling transition",
+                "disabled during" if suppressed else "re-enabled after",
+            )
+
+    def load_model(self) -> None:
+        (
+            expanded_physical_to_logical,
+            num_logical_experts,
+            old_num_physical_experts,
+        ) = self.receive_expert_mapping()
+        num_physical_experts = expanded_physical_to_logical.shape[1]
+        self.worker.parallel_config.eplb_config.num_redundant_experts = (
+            num_physical_experts - num_logical_experts
+        )
+        self.worker.load_model(load_dummy_weights=True)
+        self.worker.model_runner.setup_eplb_from_mapping(
+            expanded_physical_to_logical, old_num_physical_experts
+        )
+        self._set_eplb_suppressed(True)
+
     def create_standby_groups(
         self, reconfig_request: ReconfigureDistributedRequest
     ) -> None:
         self.reconfig_request = reconfig_request
         new_dp_size = reconfig_request.new_data_parallel_size
+        old_dp_size = get_dp_group().world_size
         world_size = self.worker.vllm_config.parallel_config.world_size
         new_world_size_across_dp = world_size * new_dp_size
         updated_config = copy.copy(self.worker.vllm_config)
@@ -165,11 +191,8 @@ def create_standby_groups(
                 coord_store_port=reconfig_request.coord_store_port,
                 enable_eplb=updated_config.parallel_config.enable_eplb,
             )
-        self.worker.model_runner.eep_eplb_suppressed = True
-        standby_ep_group = get_standby_ep_group()
-        assert standby_ep_group is not None
-        if standby_ep_group.rank == 0:
-            logger.info("[Elastic EP] EPLB disabled during elastic scaling transition")
+        if new_dp_size > old_dp_size:
+            self._set_eplb_suppressed(True)
 
     def transfer_weights(self, old_dp_size: int, new_dp_size: int) -> None:
         standby_dp_group = get_standby_dp_group()
@@ -237,13 +260,31 @@ def broadcast_expert_mapping(self) -> None:
             device=self.worker.device,
         )
 
+    def _release_cuda_graphs(self) -> None:
+        if isinstance(self.worker.model_runner.model, CUDAGraphWrapper):
+            wrapper = self.worker.model_runner.model
+            wrapper.concrete_cudagraph_entries = {}
+
+        elif isinstance(self.worker.model_runner.model, UBatchWrapper):
+            raise RuntimeError("DBO is not yet supported in elastic EP")
+
+        torch.compiler.reset()
+        with set_current_vllm_config(self.worker.vllm_config):
+            reset_compile_wrapper(self.worker.model_runner.get_model())
+
+        gc.collect()
+        torch.accelerator.synchronize()
+        torch.accelerator.empty_cache()
+
     def switch_and_remove(self) -> None:
+        self._release_cuda_graphs()
         _replace_active_groups(world=None, dp=None, ep=None, eplb=None, node_count=None)
 
     def switch_and_prepare(self) -> None:
         old_dp_size = get_dp_group().world_size
         old_ep_size = get_ep_group().world_size
 
+        self._release_cuda_graphs()
         _replace_active_groups(**pop_standby_groups())
 
         parallel_config = self.worker.vllm_config.parallel_config
@@ -384,13 +425,6 @@ def switch_and_prepare(self) -> None:
             compilation_counter.stock_torch_compile_count += 1
             self.worker.model_runner.model.compile(fullgraph=True, backend=backend)
 
-        # release all previously captured CUDA graphs
-        if isinstance(self.worker.model_runner.model, CUDAGraphWrapper):
-            wrapper = self.worker.model_runner.model
-            wrapper.concrete_cudagraph_entries = {}
-        elif isinstance(self.worker.model_runner.model, UBatchWrapper):
-            raise RuntimeError("DBO is not yet supported in elastic EP")
-
         multi_block_table = self.worker.model_runner.input_batch.block_table
         saved_block_tables: list[tuple[torch.Tensor, torch.Tensor]] = []
         for bt in multi_block_table.block_tables:
@@ -399,14 +433,6 @@ def switch_and_prepare(self) -> None:
             )
         multi_block_table.clear()
 
-        # reset the compile wrapper
-        torch.compiler.reset()
-        with set_current_vllm_config(self.worker.vllm_config):
-            reset_compile_wrapper(self.worker.model_runner.get_model())
-
-        gc.collect()
-        torch.accelerator.synchronize()
-        torch.accelerator.empty_cache()
         unlock_workspace()
         self.worker.compile_or_warm_up_model()
         lock_workspace()
@@ -416,8 +442,12 @@ def switch_and_prepare(self) -> None:
         ):
             bt.block_table.gpu.copy_(saved_gpu)
             bt.block_table.cpu.copy_(saved_cpu)
+        if new_dp_size < old_dp_size:
+            self._set_eplb_suppressed(False)
 
-    def perform_eplb_reshuffle(self, new_dp_size: int | None = None) -> None:
+    def _perform_eplb_reshuffle(
+        self, rank_mapping: dict[int, int] | None = None
+    ) -> None:
         if get_ep_group().rank == 0:
             logger.info("[Elastic EP] Starting expert resharding...")
 
@@ -428,20 +458,9 @@ def perform_eplb_reshuffle(self, new_dp_size: int | None = None) -> None:
         eplb_model_state = eplb_state.model_states[model_config.compute_hash()]
         is_async_enabled = eplb_state.is_async
         eplb_state.is_async = False
-        if new_dp_size is None:
+        if rank_mapping is None:
             eplb_state.rearrange()
         else:
-            # scale down
-            parallel_config = self.worker.vllm_config.parallel_config
-            tp_size = parallel_config.tensor_parallel_size
-            old_ep_size = parallel_config.data_parallel_size * tp_size
-            new_ep_size = new_dp_size * tp_size
-
-            rank_mapping = {
-                old_ep_rank: old_ep_rank if old_ep_rank < new_ep_size else -1
-                for old_ep_rank in range(old_ep_size)
-            }
-
             eplb_state.rearrange(rank_mapping=rank_mapping)
         # NOTE(yongji): check whether we need to synchronize here
         torch.accelerator.synchronize()
@@ -451,10 +470,25 @@ def perform_eplb_reshuffle(self, new_dp_size: int | None = None) -> None:
             eplb_model_state.physical_to_logical_map.shape[1]
         )
         eplb_state.is_async = is_async_enabled
-        self.worker.model_runner.eep_eplb_suppressed = False
         if get_ep_group().rank == 0:
             logger.info("[Elastic EP] Expert resharding completed")
 
+    def perform_eplb_reshuffle(self) -> None:
+        self._perform_eplb_reshuffle()
+        self._set_eplb_suppressed(False)
+
+    def perform_scale_down_eplb_reshuffle(self, new_dp_size: int) -> None:
+        self._set_eplb_suppressed(True)
+        parallel_config = self.worker.vllm_config.parallel_config
+        tp_size = parallel_config.tensor_parallel_size
+        old_ep_size = parallel_config.data_parallel_size * tp_size
+        new_ep_size = new_dp_size * tp_size
+        rank_mapping = {
+            old_ep_rank: old_ep_rank if old_ep_rank < new_ep_size else -1
+            for old_ep_rank in range(old_ep_size)
+        }
+        self._perform_eplb_reshuffle(rank_mapping=rank_mapping)
+
     def receive_weights(self) -> None:
         dp_group = get_dp_group()
         assert isinstance(dp_group, StatelessGroupCoordinator)
diff --git a/vllm/distributed/elastic_ep/elastic_state.py b/vllm/distributed/elastic_ep/elastic_state.py
index cd989a49a2b8..bace771a2ab6 100644
--- a/vllm/distributed/elastic_ep/elastic_state.py
+++ b/vllm/distributed/elastic_ep/elastic_state.py
@@ -43,9 +43,10 @@ class ScaleUpExistingEngineState(enum.IntEnum):
 
 
 class ScaleUpNewEngineState(enum.IntEnum):
-    PREPARE = 0
-    EPLB_RESHUFFLE = 1
-    COMPLETE = 2
+    PRE_KV_INIT = 0
+    PREPARE = 1
+    EPLB_RESHUFFLE = 2
+    COMPLETE = 3
 
 
 class ScaleDownRemainingEngineState(enum.IntEnum):
@@ -104,7 +105,7 @@ def __init__(
         self.state: EngineState
         if scale_type == "scale_up":
             self.state = (
-                ScaleUpNewEngineState.PREPARE
+                ScaleUpNewEngineState.PRE_KV_INIT
                 if worker_type == "new"
                 else ScaleUpExistingEngineState.WAIT_NEW_CORE_ENGINES_INIT
             )
@@ -142,6 +143,12 @@ def progress(self) -> bool:
             else self._progress_remaining_engine()
         )
 
+    def run_pre_kv_init_states(self) -> None:  # new scale-up engines only
+        assert self.scale_type == "scale_up" and self.worker_type == "new"
+        assert self.state == ScaleUpNewEngineState.PRE_KV_INIT
+        progressed = self.progress()  # not inside assert: -O would skip the call
+        assert progressed and self.state == ScaleUpNewEngineState.PREPARE
+
     def _execute_tcp_store_barrier(
         self, dp_store, group_rank, group_size, barrier_id, timeout=None
     ):
@@ -303,7 +310,23 @@ def _progress_new_engine(self) -> bool:
         state = self.state
         assert self.new_dp_group is not None and self.new_dp_store is not None
 
-        if state == ScaleUpNewEngineState.PREPARE:
+        if state == ScaleUpNewEngineState.PRE_KV_INIT:
+            self.engine_core._eep_send_engine_core_notification(
+                EEPNotificationType.NEW_CORE_ENGINES_WEIGHTS_INIT_READY
+            )
+            self.model_executor.collective_rpc(
+                "elastic_ep_execute", args=("receive_weights",)
+            )
+            self.engine_core.available_gpu_memory_for_kv_cache = (
+                ParallelConfig.sync_kv_cache_memory_size(self.new_dp_group, -1)
+            )
+            self.model_executor.collective_rpc(
+                "elastic_ep_execute", args=("prepare_new_worker",)
+            )
+            self.state = ScaleUpNewEngineState.PREPARE
+            return True
+
+        elif state == ScaleUpNewEngineState.PREPARE:
             tensor = torch.tensor([0, 0, 0], dtype=torch.int32, device="cpu")
             torch.distributed.all_reduce(
                 tensor,
@@ -403,7 +426,6 @@ def _progress_removing_engine(self) -> bool:
             self.engine_core._eep_send_engine_core_notification(
                 EEPNotificationType.SHUTDOWN_COMPLETE
             )
-            self.engine_core.shutdown()
             return True
 
         else:
@@ -525,7 +547,7 @@ def _eplb_reshuffle_before_scale_down(self):
         self.model_executor.collective_rpc(
             "elastic_ep_execute",
             args=(
-                "perform_eplb_reshuffle",
+                "perform_scale_down_eplb_reshuffle",
                 self.reconfig_request.new_data_parallel_size,
             ),
         )
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
index db77d41c487f..faaffd72eca3 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py
@@ -126,28 +126,17 @@ def __init__(
         self._labelnames = labelnames
         self.per_engine_labelvalues = per_engine_labelvalues
 
-    def make_per_engine(self, metric: PromMetric) -> dict[int, PromMetric]:
-        """
-        Create a per-engine child of a prometheus_client.Metric with
-        the appropriate labels set. The parent metric must be created
-        using the labelnames list.
-        """
-        return {
-            idx: metric.labels(*labelvalues)
-            for idx, labelvalues in self.per_engine_labelvalues.items()
-        }
-
     def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
         """
         Record the supplied transfer statistics to Prometheus metrics. These
         statistics are engine-specific, and should be recorded to a metric
         with the appropriate 'engine' label. These metric instances can be
-        created using the make_per_engine() helper method.
+        created using the create_metric_per_engine() helper method.
         """
         raise NotImplementedError
 
 
-class KVConnectorPrometheus:
+class KVConnectorProm:
     """
     Support for registering per-connector Prometheus metrics, and
     recording transfer statistics to those metrics. Uses
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
index 1861c9e8e3d0..dcde7665f344 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py
@@ -1396,9 +1396,6 @@ def _write_blocks_for_req(self, req_id: ReqId, meta: ReqMeta, layer_name, kv_lay
             remote_ip=meta.remote_host,
         )
 
-    def _is_last_layer(self, layer_name):
-        return layer_name == list(self.kv_caches.keys())[-1]
-
     def merge_contiguous_blocks(
         self,
         offsets_local: list[int],
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index ed53c35c9ed9..a86a52a6a6fb 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -65,6 +65,7 @@
     SlidingWindowSpec,
     UniformTypeKVCacheSpecs,
 )
+from vllm.v1.metrics.utils import create_metric_per_engine
 from vllm.v1.worker.block_table import BlockTable
 from vllm.v1.worker.utils import select_common_block_size
 
@@ -3057,7 +3058,9 @@ def __init__(
             buckets=buckets[1:],
             labelnames=labelnames,
         )
-        self.nixl_histogram_xfer_time = self.make_per_engine(nixl_histogram_xfer_time)
+        self.nixl_histogram_xfer_time = create_metric_per_engine(
+            nixl_histogram_xfer_time, self.per_engine_labelvalues
+        )
         nixl_histogram_post_time = self._histogram_cls(
             name="vllm:nixl_post_time_seconds",
             documentation="Histogram of transfer post time for NIXL KV"
@@ -3065,7 +3068,9 @@ def __init__(
             buckets=buckets,
             labelnames=labelnames,
         )
-        self.nixl_histogram_post_time = self.make_per_engine(nixl_histogram_post_time)
+        self.nixl_histogram_post_time = create_metric_per_engine(
+            nixl_histogram_post_time, self.per_engine_labelvalues
+        )
         # uniform 2kb to 16gb range
         buckets = [2 ** (10 + i) for i in range(1, 25, 2)]
         nixl_histogram_bytes_transferred = self._histogram_cls(
@@ -3074,8 +3079,8 @@ def __init__(
             buckets=buckets,
             labelnames=labelnames,
         )
-        self.nixl_histogram_bytes_transferred = self.make_per_engine(
-            nixl_histogram_bytes_transferred
+        self.nixl_histogram_bytes_transferred = create_metric_per_engine(
+            nixl_histogram_bytes_transferred, self.per_engine_labelvalues
         )
         buckets = [
             10,
@@ -3100,24 +3105,24 @@ def __init__(
             buckets=buckets,
             labelnames=labelnames,
         )
-        self.nixl_histogram_num_descriptors = self.make_per_engine(
-            nixl_histogram_num_descriptors
+        self.nixl_histogram_num_descriptors = create_metric_per_engine(
+            nixl_histogram_num_descriptors, self.per_engine_labelvalues
         )
         counter_nixl_num_failed_transfers = self._counter_cls(
             name="vllm:nixl_num_failed_transfers",
             documentation="Number of failed NIXL KV Cache transfers.",
             labelnames=labelnames,
         )
-        self.counter_nixl_num_failed_transfers = self.make_per_engine(
-            counter_nixl_num_failed_transfers
+        self.counter_nixl_num_failed_transfers = create_metric_per_engine(
+            counter_nixl_num_failed_transfers, self.per_engine_labelvalues
         )
         counter_nixl_num_failed_notifications = self._counter_cls(
             name="vllm:nixl_num_failed_notifications",
             documentation="Number of failed NIXL KV Cache notifications.",
             labelnames=labelnames,
         )
-        self.counter_nixl_num_failed_notifications = self.make_per_engine(
-            counter_nixl_num_failed_notifications
+        self.counter_nixl_num_failed_notifications = create_metric_per_engine(
+            counter_nixl_num_failed_notifications, self.per_engine_labelvalues
         )
 
         counter_nixl_num_kv_expired_reqs = self._counter_cls(
@@ -3126,8 +3131,8 @@ def __init__(
             "NOTE: This metric is tracked on the P instance.",
             labelnames=labelnames,
         )
-        self.counter_nixl_num_kv_expired_reqs = self.make_per_engine(
-            counter_nixl_num_kv_expired_reqs
+        self.counter_nixl_num_kv_expired_reqs = create_metric_per_engine(
+            counter_nixl_num_kv_expired_reqs, self.per_engine_labelvalues
         )
 
     def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0):
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 730641a184fc..e344bae26402 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -79,7 +79,7 @@
     RunnerOption,
     TokenizerMode,
 )
-from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
+from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MMTensorIPC
 from vllm.config.observability import DetailedTraceModules
 from vllm.config.parallel import (
     All2AllBackend,
@@ -509,6 +509,7 @@ class EngineArgs:
     io_processor_plugin: str | None = None
     skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
     video_pruning_rate: float | None = MultiModalConfig.video_pruning_rate
+    mm_tensor_ipc: MMTensorIPC = MultiModalConfig.mm_tensor_ipc
     # LoRA fields
     enable_lora: bool = False
     max_loras: int = LoRAConfig.max_loras
@@ -1097,6 +1098,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         multimodal_group.add_argument(
             "--video-pruning-rate", **multimodal_kwargs["video_pruning_rate"]
         )
+        multimodal_group.add_argument(
+            "--mm-tensor-ipc", **multimodal_kwargs["mm_tensor_ipc"]
+        )
 
         # LoRA related configs
         lora_kwargs = get_kwargs(LoRAConfig)
@@ -1423,6 +1427,7 @@ def create_model_config(self) -> ModelConfig:
             override_attention_dtype=self.override_attention_dtype,
             logits_processors=self.logits_processors,
             video_pruning_rate=self.video_pruning_rate,
+            mm_tensor_ipc=self.mm_tensor_ipc,
             io_processor_plugin=self.io_processor_plugin,
         )
 
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 195b945bcbce..65e31b829833 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -290,7 +290,7 @@ def signal_handler(signum, frame):
 
     with launch_core_engines(
         vllm_config, executor_class, log_stats, addresses, num_api_servers
-    ) as (local_engine_manager, coordinator, addresses):
+    ) as (local_engine_manager, coordinator, addresses, tensor_queue):
         # Construct common args for the APIServerProcessManager up-front.
         api_server_manager_kwargs = dict(
             target_server_fn=run_api_server_worker_proc,
@@ -303,6 +303,7 @@ def signal_handler(signum, frame):
             stats_update_address=coordinator.get_stats_publish_address()
             if coordinator
             else None,
+            tensor_queue=tensor_queue,
         )
 
         # For dp ranks > 0 in external/hybrid DP LB modes, we must delay the
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 5909b3043007..4b617333c02f 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1477,9 +1477,9 @@ def _cross_encoding_score(
             data_1 = data_1 * len(data_2)
 
         if pooling_params is None:
-            pooling_params = PoolingParams(task="score")
+            pooling_params = PoolingParams(task="classify")
         elif pooling_params.task is None:
-            pooling_params.task = "score"
+            pooling_params.task = "classify"
 
         pooling_params_list = list[PoolingParams]()
 
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 4d5c5eae8de0..95e831b51ec0 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -22,7 +22,7 @@
 from starlette.datastructures import State
 
 import vllm.envs as envs
-from vllm.config import VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.protocol import EngineClient
 from vllm.entrypoints.chat_utils import load_chat_template
@@ -155,7 +155,9 @@ async def build_async_engine_client_from_engine_args(
 
 
 def build_app(
-    args: Namespace, supported_tasks: tuple["SupportedTask", ...] | None = None
+    args: Namespace,
+    supported_tasks: tuple["SupportedTask", ...] | None = None,
+    model_config: ModelConfig | None = None,
 ) -> FastAPI:
     if supported_tasks is None:
         warnings.warn(
@@ -191,7 +193,7 @@ def build_app(
         attach_router as register_sagemaker_api_router,
     )
 
-    register_sagemaker_api_router(app, supported_tasks)
+    register_sagemaker_api_router(app, supported_tasks, model_config)
 
     if "generate" in supported_tasks:
         from vllm.entrypoints.openai.generate.api_router import (
@@ -242,7 +244,7 @@ def build_app(
     if any(task in POOLING_TASKS for task in supported_tasks):
         from vllm.entrypoints.pooling import register_pooling_api_routers
 
-        register_pooling_api_routers(app, supported_tasks)
+        register_pooling_api_routers(app, supported_tasks, model_config)
 
     app.root_path = args.root_path
     app.add_middleware(
@@ -583,8 +585,10 @@ async def build_and_serve(
         uvicorn_kwargs["log_config"] = log_config
 
     supported_tasks = await engine_client.get_supported_tasks()
+    model_config = engine_client.model_config
+
     logger.info("Supported tasks: %s", supported_tasks)
-    app = build_app(args, supported_tasks)
+    app = build_app(args, supported_tasks, model_config)
     await init_app_state(engine_client, app.state, args, supported_tasks)
 
     logger.info("Starting vLLM server on %s", listen_address)
diff --git a/vllm/entrypoints/openai/responses/context.py b/vllm/entrypoints/openai/responses/context.py
index bab59e0aa1ec..a4c55c23c588 100644
--- a/vllm/entrypoints/openai/responses/context.py
+++ b/vllm/entrypoints/openai/responses/context.py
@@ -9,7 +9,7 @@
 from collections.abc import Callable
 from contextlib import AsyncExitStack
 from dataclasses import replace
-from typing import TYPE_CHECKING, Final, Union
+from typing import TYPE_CHECKING, Any, Final, Union
 
 from openai.types.responses.response_function_tool_call_output_item import (
     ResponseFunctionToolCallOutputItem,
@@ -182,6 +182,7 @@ def __init__(self):
         self.all_turn_metrics = []
 
         self.input_messages: list[ResponseRawMessageAndToken] = []
+        self.kv_transfer_params: dict[str, Any] | None = None
 
     def append_output(self, output) -> None:
         self.last_output = output
@@ -190,6 +191,8 @@ def append_output(self, output) -> None:
         self.num_prompt_tokens = len(output.prompt_token_ids or [])
         self.num_cached_tokens = output.num_cached_tokens or 0
         self.num_output_tokens += len(output.outputs[0].token_ids or [])
+        if output.kv_transfer_params is not None:
+            self.kv_transfer_params = output.kv_transfer_params
 
         # Accumulate text, token_ids, and logprobs for streaming mode
         delta_output = output.outputs[0]
@@ -308,11 +311,14 @@ def __init__(
         self.input_messages: list[ResponseRawMessageAndToken] = []
         self.output_messages: list[ResponseRawMessageAndToken] = []
         self._accumulated_token_ids: list[int] = []
+        self.kv_transfer_params: dict[str, Any] | None = None
 
     def append_output(self, output: RequestOutput) -> None:
         self.num_prompt_tokens = len(output.prompt_token_ids or [])
         self.num_cached_tokens = output.num_cached_tokens or 0
         self.num_output_tokens += len(output.outputs[0].token_ids or [])
+        if output.kv_transfer_params is not None:
+            self.kv_transfer_params = output.kv_transfer_params
         self.parser.process(output.outputs[0])
         output_token_ids = output.outputs[0].token_ids or []
         self._accumulated_token_ids.extend(output_token_ids)
@@ -538,6 +544,7 @@ def __init__(
         self.all_turn_metrics: list[TurnMetrics] = []
         self.is_first_turn = True
         self.first_tok_of_message = True  # For streaming support
+        self.kv_transfer_params: dict[str, Any] | None = None
 
     def _update_num_reasoning_tokens(self):
         channel = self.parser.current_channel
@@ -557,6 +564,8 @@ def append_output(self, output: RequestOutput) -> None:
             self._update_num_reasoning_tokens()
         self._update_prefill_token_usage(output)
         self._update_decode_token_usage(output)
+        if output.kv_transfer_params is not None:
+            self.kv_transfer_params = output.kv_transfer_params
         # Append current turn to all turn list for next turn's calculations
         self.all_turn_metrics.append(self.current_turn_metrics.copy())
         self.current_turn_metrics.reset()
@@ -868,6 +877,8 @@ def append_output(self, output: RequestOutput) -> None:
         if last_delta_text:
             self.last_content_delta = last_delta_text
         self._update_decode_token_usage(output)
+        if output.kv_transfer_params is not None:
+            self.kv_transfer_params = output.kv_transfer_params
 
         # For streaming, update previous turn when message is complete
         if output.finished:
diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py
index a5f62bdd8c39..43fbba1dd43f 100644
--- a/vllm/entrypoints/openai/responses/protocol.py
+++ b/vllm/entrypoints/openai/responses/protocol.py
@@ -252,6 +252,10 @@ class ResponsesRequest(OpenAIBaseModel):
             "numeric values, used by custom extensions."
         ),
     )
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None,
+        description="KVTransfer parameters used for disaggregated serving.",
+    )
     # --8<-- [end:responses-extra-params]
 
     def build_chat_params(
@@ -351,6 +355,10 @@ def to_sampling_params(
         if isinstance(stop, str):
             stop = [stop]
 
+        extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {}
+        if self.kv_transfer_params:
+            extra_args["kv_transfer_params"] = self.kv_transfer_params
+
         return SamplingParams.from_optional(
             temperature=temperature,
             top_p=top_p,
@@ -367,7 +375,7 @@ def to_sampling_params(
             ),
             structured_outputs=structured_outputs,
             logit_bias=self.logit_bias,
-            extra_args=self.vllm_xargs or {},
+            extra_args=extra_args,
             skip_clone=True,  # Created fresh per request, safe to skip clone
             skip_special_tokens=self.skip_special_tokens,
             include_stop_str_in_output=self.include_stop_str_in_output,
@@ -488,6 +496,11 @@ class ResponsesResponse(OpenAIBaseModel):
     usage: ResponseUsage | None = None
     user: str | None = None
 
+    # vLLM-specific fields that are not part of the OpenAI spec
+    kv_transfer_params: dict[str, Any] | None = Field(
+        default=None, description="KVTransfer parameters."
+    )
+
     # --8<-- [start:responses-response-extra-params]
     # These are populated when enable_response_messages is set to True
     # NOTE: custom serialization is needed
@@ -531,6 +544,7 @@ def from_request(
         usage: ResponseUsage | None = None,
         input_messages: ResponseInputOutputMessage | None = None,
         output_messages: ResponseInputOutputMessage | None = None,
+        kv_transfer_params: dict[str, Any] | None = None,
     ) -> "ResponsesResponse":
         incomplete_details: IncompleteDetails | None = None
         if status == "incomplete":
@@ -566,6 +580,7 @@ def from_request(
             truncation=request.truncation,
             user=request.user,
             usage=usage,
+            kv_transfer_params=kv_transfer_params,
         )
 
 
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
index 574282c4cdc6..53c28693ade7 100644
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -873,6 +873,7 @@ async def responses_full_generator(
             output=output,
             status=status,
             usage=usage,
+            kv_transfer_params=context.kv_transfer_params,
         )
 
         if request.store:
diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
index 4a6030d71b63..bf58273f7504 100644
--- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
+++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py
@@ -42,32 +42,13 @@
 from vllm.logger import init_logger
 from vllm.logprobs import FlatLogprobs, Logprob
 from vllm.model_executor.models import SupportsTranscription
-from vllm.multimodal.audio import split_audio
-from vllm.multimodal.media.audio import extract_audio_from_video_bytes
+from vllm.multimodal.audio import get_audio_duration, split_audio
+from vllm.multimodal.media.audio import load_audio
 from vllm.outputs import RequestOutput
 from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt
 from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt
 from vllm.sampling_params import BeamSearchParams, SamplingParams
 from vllm.tokenizers import get_tokenizer
-from vllm.utils.import_utils import PlaceholderModule
-
-try:
-    import librosa
-except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
-
-try:
-    import soundfile as sf
-except ImportError:
-    sf = PlaceholderModule("soundfile")  # type: ignore[assignment]
-
-# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
-# being librosa's main backend. Used to validate if an audio loading error is due to a
-# server error vs a client error (invalid audio file).
-# 1 = unrecognised format      (file is not a supported audio container)
-# 3 = malformed file           (corrupt or structurally invalid audio)
-# 4 = unsupported encoding     (codec not supported by this libsndfile build)
-_BAD_SF_CODES = {1, 3, 4}
 
 SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse
 SpeechToTextResponseVerbose: TypeAlias = (
@@ -214,32 +195,13 @@ async def _preprocess_speech_to_text(
         # pre-requisite for chunking, as it assumes Whisper SR.
         try:
             with io.BytesIO(audio_data) as buf:
-                y, sr = librosa.load(buf, sr=self.asr_config.sample_rate)  # type: ignore[return-value]
-        except sf.LibsndfileError as exc:
-            # Only fall back for known format-detection failures.
-            # Re-raise anything else (e.g. corrupt but recognised format).
-            if exc.code not in _BAD_SF_CODES:
-                raise
-            logger.debug(
-                "librosa/soundfile could not decode audio from BytesIO "
-                "(code=%s: %s); falling back to pyav in-process decode",
-                exc.code,
-                exc,
-            )
-            try:
-                native_y, native_sr = extract_audio_from_video_bytes(audio_data)
-                sr = self.asr_config.sample_rate
-                y = librosa.resample(native_y, orig_sr=native_sr, target_sr=sr)
-            except Exception as pyav_exc:
-                logger.debug(
-                    "pyAV fallback also failed: %s",
-                    pyav_exc,
-                )
-                raise ValueError("Invalid or unsupported audio file.") from pyav_exc
+                y, sr = load_audio(buf, sr=self.asr_config.sample_rate)
+        except Exception as exc:
+            raise ValueError("Invalid or unsupported audio file.") from exc
 
-        duration = librosa.get_duration(y=y, sr=sr)
-        do_split_audio = (
-            self.asr_config.allow_audio_chunking
+        duration = get_audio_duration(y=y, sr=sr)
+        do_split_audio = self.asr_config.allow_audio_chunking and (
+            self.asr_config.max_audio_clip_s is not None
             and duration > self.asr_config.max_audio_clip_s
         )
 
diff --git a/vllm/entrypoints/pooling/__init__.py b/vllm/entrypoints/pooling/__init__.py
index d2baea8959d2..e115b710ceeb 100644
--- a/vllm/entrypoints/pooling/__init__.py
+++ b/vllm/entrypoints/pooling/__init__.py
@@ -5,6 +5,9 @@
 
 from fastapi import FastAPI
 
+from vllm.config import ModelConfig
+from vllm.logger import init_logger
+
 if TYPE_CHECKING:
     from argparse import Namespace
 
@@ -17,9 +20,30 @@
     RequestLogger = object
     SupportedTask = object
 
+logger = init_logger(__name__)
+
+
+def enable_scoring_api(
+    supported_tasks: tuple["SupportedTask", ...],
+    model_config: ModelConfig | None = None,
+) -> bool:
+    if any(t in supported_tasks for t in ("embed", "token_embed")):
+        return True
+
+    if model_config is not None and "classify" in supported_tasks:
+        num_labels = getattr(model_config.hf_config, "num_labels", 0)
+        if num_labels != 1:
+            logger.debug_once("Score API is only enabled for num_labels == 1.")
+            return False
+        return True
+
+    return False
+
 
 def register_pooling_api_routers(
-    app: FastAPI, supported_tasks: tuple["SupportedTask", ...]
+    app: FastAPI,
+    supported_tasks: tuple["SupportedTask", ...],
+    model_config: ModelConfig | None = None,
 ):
     from vllm.entrypoints.pooling.pooling.api_router import router as pooling_router
 
@@ -37,11 +61,7 @@ def register_pooling_api_routers(
 
         app.include_router(embed_router)
 
-    # Score API handles score/rerank for:
-    # - "score" task (score_type: cross-encoder models)
-    # - "embed" task (score_type: bi-encoder models)
-    # - "token_embed" task (score_type: late interaction models)
-    if any(t in supported_tasks for t in ("score", "embed", "token_embed")):
+    if enable_scoring_api(supported_tasks, model_config):
         from vllm.entrypoints.pooling.score.api_router import router as score_router
 
         app.include_router(score_router)
@@ -61,6 +81,8 @@ def init_pooling_state(
     from vllm.entrypoints.pooling.score.serving import ServingScores
     from vllm.tasks import POOLING_TASKS
 
+    model_config = engine_client.model_config
+
     resolved_chat_template = load_chat_template(args.chat_template)
 
     state.serving_pooling = (
@@ -102,10 +124,6 @@ def init_pooling_state(
         if "classify" in supported_tasks
         else None
     )
-    # Score API handles score/rerank for:
-    # - "score" task (score_type: cross-encoder models)
-    # - "embed" task (score_type: bi-encoder models)
-    # - "token_embed" task (score_type: late interaction models)
     state.serving_scores = (
         ServingScores(
             engine_client,
@@ -114,6 +132,6 @@ def init_pooling_state(
             score_template=resolved_chat_template,
             log_error_stack=args.log_error_stack,
         )
-        if any(t in supported_tasks for t in ("embed", "score", "token_embed"))
+        if enable_scoring_api(supported_tasks, model_config)
         else None
     )
diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py
index 2aea1bd7b27a..bb633fc28b3c 100644
--- a/vllm/entrypoints/pooling/score/protocol.py
+++ b/vllm/entrypoints/pooling/score/protocol.py
@@ -35,7 +35,7 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
             max_total_tokens_param="max_model_len",
         )
 
-    def to_pooling_params(self, task: PoolingTask = "score"):
+    def to_pooling_params(self, task: PoolingTask = "classify"):
         return PoolingParams(
             task=task,
             use_activation=self.use_activation,
@@ -111,7 +111,7 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
             max_total_tokens_param="max_model_len",
         )
 
-    def to_pooling_params(self, task: PoolingTask = "score"):
+    def to_pooling_params(self, task: PoolingTask = "classify"):
         return PoolingParams(
             task=task,
             use_activation=self.use_activation,
diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py
index c58fe6d36c07..d8cbff99d068 100644
--- a/vllm/entrypoints/pooling/score/serving.py
+++ b/vllm/entrypoints/pooling/score/serving.py
@@ -413,7 +413,7 @@ async def _cross_encoding_score(
         # Schedule the request and get the result generator.
         generators: list[AsyncGenerator[PoolingRequestOutput, None]] = []
 
-        default_pooling_params = request.to_pooling_params("score")
+        default_pooling_params = request.to_pooling_params("classify")
 
         for i, engine_prompt in enumerate(engine_prompts):
             request_id_item = f"{request_id}-{i}"
diff --git a/vllm/entrypoints/sagemaker/api_router.py b/vllm/entrypoints/sagemaker/api_router.py
index 32faaa02e681..e8c48d1c6d53 100644
--- a/vllm/entrypoints/sagemaker/api_router.py
+++ b/vllm/entrypoints/sagemaker/api_router.py
@@ -10,9 +10,11 @@
 from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request
 from fastapi.responses import JSONResponse, Response
 
+from vllm.config import ModelConfig
 from vllm.entrypoints.openai.engine.protocol import ErrorResponse
 from vllm.entrypoints.openai.engine.serving import OpenAIServing
 from vllm.entrypoints.openai.utils import validate_json_request
+from vllm.entrypoints.pooling import enable_scoring_api
 from vllm.entrypoints.pooling.base.serving import PoolingServing
 from vllm.entrypoints.serve.instrumentator.basic import base
 from vllm.entrypoints.serve.instrumentator.health import health
@@ -25,7 +27,10 @@
 EndpointFn = Callable[[RequestType, Request], Awaitable[Any]]
 
 
-def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]):
+def get_invocation_types(
+    supported_tasks: tuple["SupportedTask", ...],
+    model_config: ModelConfig | None = None,
+):
     # NOTE: Items defined earlier take higher priority
     INVOCATION_TYPES: list[tuple[RequestType, tuple[GetHandlerFn, EndpointFn]]] = []
 
@@ -70,7 +75,7 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]):
             (ClassificationRequest, (classify, create_classify)),
         ]
 
-    if "score" in supported_tasks:
+    if enable_scoring_api(supported_tasks, model_config):
         from vllm.entrypoints.pooling.score.api_router import do_rerank, rerank
         from vllm.entrypoints.pooling.score.protocol import RerankRequest
 
@@ -78,7 +83,6 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]):
             (RerankRequest, (rerank, do_rerank)),
         ]
 
-    if "score" in supported_tasks or "embed" in supported_tasks:
         from vllm.entrypoints.pooling.score.api_router import create_score, score
         from vllm.entrypoints.pooling.score.protocol import ScoreRequest
 
@@ -97,11 +101,15 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]):
     return INVOCATION_TYPES
 
 
-def attach_router(app: FastAPI, supported_tasks: tuple["SupportedTask", ...]):
+def attach_router(
+    app: FastAPI,
+    supported_tasks: tuple["SupportedTask", ...],
+    model_config: ModelConfig | None = None,
+):
     router = APIRouter()
 
     # NOTE: Construct the TypeAdapters only once
-    INVOCATION_TYPES = get_invocation_types(supported_tasks)
+    INVOCATION_TYPES = get_invocation_types(supported_tasks, model_config)
     INVOCATION_VALIDATORS = [
         (pydantic.TypeAdapter(request_type), (get_handler, endpoint))
         for request_type, (get_handler, endpoint) in INVOCATION_TYPES
diff --git a/vllm/model_executor/kernels/linear/mixed_precision/conch.py b/vllm/model_executor/kernels/linear/mixed_precision/conch.py
index e98676e01754..82dd32da19a0 100644
--- a/vllm/model_executor/kernels/linear/mixed_precision/conch.py
+++ b/vllm/model_executor/kernels/linear/mixed_precision/conch.py
@@ -124,6 +124,14 @@ def apply_weights(
 
         w_q, w_s, w_zp, _ = self._get_weight_params(layer)
 
+        # Map channelwise group_size=-1 to the actual input dimension K.
+        # The conch kernel computes stride_mul = block_k / group_size;
+        # passing -1 produces a negative stride that reads out-of-bounds
+        # scale values for all K-blocks after the first.
+        group_size = self.config.group_size
+        if group_size == -1:
+            group_size = x.shape[-1]
+
         output = mixed_precision_gemm(
             x=x,
             w_q_packed=w_q.data,
@@ -131,7 +139,7 @@ def apply_weights(
             w_zp=w_zp.data if w_zp is not None else None,
             weight_size_bits=self.config.weight_type.size_bits,
             weight_bias=self.config.weight_type.bias,
-            group_size=self.config.group_size,
+            group_size=group_size,
         )
 
         if bias is not None:
diff --git a/vllm/model_executor/kernels/linear/mixed_precision/exllama.py b/vllm/model_executor/kernels/linear/mixed_precision/exllama.py
index 537a8e278a39..3ad43a225fa8 100644
--- a/vllm/model_executor/kernels/linear/mixed_precision/exllama.py
+++ b/vllm/model_executor/kernels/linear/mixed_precision/exllama.py
@@ -59,6 +59,13 @@ def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
                 f"{cls.SUPPORTED_QUANT_TYPES}",
             )
 
+        if c.group_size <= 0:
+            return (
+                False,
+                f"Group size ({c.group_size}) must be positive, "
+                "Exllama does not support channelwise quantization",
+            )
+
         if c.full_weight_shape[0] % c.group_size != 0:
             return (
                 False,
diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py
index 2eb0f49217d8..f4e3ed8e055c 100644
--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -346,7 +346,7 @@ def g2_alphas(self) -> torch.Tensor | None:
 
     @property
     def use_fp8_w8a8(self) -> bool:
-        return self.quant_dtype == torch.float8_e4m3fn
+        return self.quant_dtype == current_platform.fp8_dtype()
 
     @property
     def use_int8_w8a8(self) -> bool:
@@ -566,7 +566,7 @@ def fp8_w8a8_moe_quant_config(
     Construct a quant config for fp8 activations and fp8 weights.
     """
     return FusedMoEQuantConfig.make(
-        torch.float8_e4m3fn,
+        current_platform.fp8_dtype(),
         w1_scale=w1_scale,
         g1_alphas=g1_alphas,
         w2_scale=w2_scale,
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
index a22b89415364..a3266f5e847b 100644
--- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py
@@ -16,6 +16,7 @@
     moe_kernel_quantize_input,
     normalize_batched_scales_shape,
 )
+from vllm.platforms import current_platform
 from vllm.v1.worker.ubatching import (
     dbo_current_ubatch_id,
     dbo_enabled,
@@ -158,11 +159,6 @@ def _map_global_to_physical_ids(self, topk_ids: torch.Tensor) -> torch.Tensor:
             return topk_ids
         return self.global_to_physical[topk_ids]
 
-    def _map_local_to_global_ids(self, expert_topk_ids: torch.Tensor) -> torch.Tensor:
-        if self.local_expert_global_ids is None:
-            return expert_topk_ids
-        return self.local_expert_global_ids[expert_topk_ids]
-
     def _do_quant(
         self,
         x: torch.Tensor | tuple[torch.Tensor, torch.Tensor],
@@ -295,23 +291,46 @@ def prepare_async(
 
         # Dispatch
         dispatch_topk_ids = self._map_global_to_physical_ids(topk_ids)
-        expert_x, expert_num_tokens, handle, _, hook = self.buffer.low_latency_dispatch(
-            a1,
-            dispatch_topk_ids,
-            self.max_tokens_per_rank,
-            num_experts,
-            use_fp8=self.use_fp8_dispatch,
-            round_scale=self.use_ue8m0_dispatch,
-            use_ue8m0=self.use_ue8m0_dispatch,
-            **(dict(use_nvfp4=True) if use_nvfp4 else dict()),
-            **(
-                dict(x_global_scale=qc_a1_gscale_or_scale)
-                if qc_a1_gscale_or_scale is not None
-                else dict()
-            ),
-            async_finish=False,
-            return_recv_hook=True,
-        )
+        if current_platform.is_rocm():
+            (
+                expert_x,
+                expert_num_tokens,
+                handle,
+                _,
+                hook,
+            ) = self.buffer.low_latency_dispatch(
+                a1,
+                dispatch_topk_ids,
+                self.max_tokens_per_rank,
+                num_experts,
+                use_fp8=self.use_fp8_dispatch,
+                async_finish=False,
+                return_recv_hook=True,
+            )
+        else:
+            (
+                expert_x,
+                expert_num_tokens,
+                handle,
+                _,
+                hook,
+            ) = self.buffer.low_latency_dispatch(
+                a1,
+                dispatch_topk_ids,
+                self.max_tokens_per_rank,
+                num_experts,
+                use_fp8=self.use_fp8_dispatch,
+                round_scale=self.use_ue8m0_dispatch,
+                use_ue8m0=self.use_ue8m0_dispatch,
+                **(dict(use_nvfp4=True) if use_nvfp4 else dict()),
+                **(
+                    dict(x_global_scale=qc_a1_gscale_or_scale)
+                    if qc_a1_gscale_or_scale is not None
+                    else dict()
+                ),
+                async_finish=False,
+                return_recv_hook=True,
+            )
         self.handles[a2a_idx] = handle
 
         return (
diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/experts/flashinfer_cutedsl_moe.py
similarity index 100%
rename from vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py
rename to vllm/model_executor/layers/fused_moe/experts/flashinfer_cutedsl_moe.py
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
index 501c10ab0cf4..f57a05dc6ecc 100644
--- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py
@@ -269,9 +269,16 @@ def _supports_routing_method(
         weight_key: QuantKey | None,
         activation_key: QuantKey | None,
     ) -> bool:
-        """Monolithic kernels need to express router support."""
+        """Monolithic kernels need to express router support.
+        Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
+        internal routing for these methods produces output uncorrelated
+        with the modular kernel's output and with the Triton kernel's output
+        for Qwen3.5-35B-A3B-FP8.
+        See: https://github.com/vllm-project/vllm/issues/37591
+        """
         # NOTE(dbari): TopK routing could also be enabled, but need to validate models
         # NOTE(dbari): Default is not implemented and should not be enabled until it is
+
         if (weight_key, activation_key) in [
             (kFp8Static128BlockSym, kFp8Dynamic128Sym),
             (kMxfp8Static, kMxfp8Dynamic),
@@ -279,16 +286,12 @@ def _supports_routing_method(
             # NOTE(rob): potentially allow others here. This is a conservative list.
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
-                RoutingMethodType.Renormalize,
-                RoutingMethodType.RenormalizeNaive,
             ]
         elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
             # NOTE(dbari): as above, potentially allow others here.
             return routing_method in [
                 RoutingMethodType.DeepSeekV3,
                 RoutingMethodType.Llama4,
-                RoutingMethodType.Renormalize,
-                RoutingMethodType.RenormalizeNaive,
             ]
         else:
             raise ValueError("Unsupported quantization scheme.")
diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py
new file mode 100644
index 000000000000..d084283360c4
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py
@@ -0,0 +1,352 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
+    FusedMoEQuantConfig,
+    RoutingMethodType,
+)
+from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
+    TopKWeightAndReduceNoOP,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey,
+    kMxfp4Static,
+    kMxfp8Dynamic,
+)
+from vllm.platforms import current_platform
+from vllm.utils.flashinfer import has_flashinfer
+
+
+class TrtLlmMxfp4ExpertsBase:
+    """
+    MXFP4 TRTLLM-Gen MoE kernels. Shared base for modular and monolithic.
+    """
+
+    def __init__(
+        self,
+        moe_config: FusedMoEConfig,
+        quant_config: FusedMoEQuantConfig,
+    ):
+        # NOTE: FusedMoEExperts.__init__ is called by the concrete subclass
+        # (Monolithic/Modular) via MRO, not here, to avoid mypy issues with
+        # multiple inheritance. This matches the NvFP4 expert pattern.
+        self.moe_config = moe_config
+        self.quant_config = quant_config
+
+        self.routing_method_type = moe_config.routing_method
+        self.topk = moe_config.experts_per_token
+        self.intermediate_size_per_partition = (
+            moe_config.intermediate_size_per_partition
+        )
+        self.hidden_dim = moe_config.hidden_dim
+        self.local_num_experts = moe_config.num_local_experts
+        self.ep_rank = moe_config.moe_parallel_config.ep_rank
+
+        # MXFP4-specific TRTLLM parameters
+        device = torch.accelerator.current_device_index()
+        self.gemm1_alpha = torch.tensor(
+            [1.702] * self.local_num_experts,
+            dtype=torch.float32,
+            device=device,
+        )
+        self.gemm1_beta = torch.tensor(
+            [1.0] * self.local_num_experts,
+            dtype=torch.float32,
+            device=device,
+        )
+        self.gemm1_clamp_limit = torch.tensor(
+            [7.0] * self.local_num_experts,
+            dtype=torch.float32,
+            device=device,
+        )
+
+        from vllm.config import get_current_vllm_config
+
+        self.max_capture_size = (
+            get_current_vllm_config().compilation_config.max_cudagraph_capture_size
+        )
+
+        # Use the public quant_dtype property rather than reading the private _a1.
+        self.use_mxfp8_input = quant_config.quant_dtype == "mxfp8"
+
+    @staticmethod
+    def _supports_current_device() -> bool:
+        p = current_platform
+        return p.is_cuda() and p.is_device_capability_family(100) and has_flashinfer()
+
+    @staticmethod
+    def _supports_no_act_and_mul() -> bool:
+        return False
+
+    @staticmethod
+    def _supports_quant_scheme(
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+    ) -> bool:
+        SUPPORTED_W_A = [
+            (kMxfp4Static, None),
+            (kMxfp4Static, kMxfp8Dynamic),
+        ]
+        return (weight_key, activation_key) in SUPPORTED_W_A
+
+    @staticmethod
+    def _supports_activation(activation: MoEActivation) -> bool:
+        return activation == MoEActivation.SWIGLUOAI
+
+    @staticmethod
+    def activation_format() -> mk.FusedMoEActivationFormat:
+        return mk.FusedMoEActivationFormat.Standard
+
+    def supports_chunking(self) -> bool:
+        return False
+
+    def supports_expert_map(self) -> bool:
+        return False
+
+    @property
+    def expects_unquantized_inputs(self) -> bool:
+        # Expert handles MXFP8 quantization internally if needed
+        return True
+
+
+class TrtLlmMxfp4ExpertsMonolithic(
+    TrtLlmMxfp4ExpertsBase, mk.FusedMoEExpertsMonolithic
+):
+    """
+    Monolithic version of the MXFP4 TRTLLM kernel (router + experts).
+    Wraps flashinfer.trtllm_fp4_block_scale_moe().
+    """
+
+    @staticmethod
+    def _supports_parallel_config(
+        moe_parallel_config: FusedMoEParallelConfig,
+    ) -> bool:
+        return (
+            not moe_parallel_config.use_all2all_kernels
+            and not moe_parallel_config.enable_eplb
+            and moe_parallel_config.dp_size <= 1
+        )
+
+    @staticmethod
+    def _supports_routing_method(
+        routing_method: RoutingMethodType,
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+    ) -> bool:
+        return routing_method in [
+            RoutingMethodType.Renormalize,
+            RoutingMethodType.RenormalizeNaive,
+        ]
+
+    @staticmethod
+    def _supports_router_logits_dtype(
+        router_logits_dtype: torch.dtype | None,
+        routing_method: RoutingMethodType,
+    ) -> bool:
+        # Any dtype is accepted: apply() casts router_logits to bfloat16 itself.
+        return True
+
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        router_logits: torch.Tensor,
+        activation: MoEActivation,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        apply_router_weight_on_input: bool,
+        # grouped topk + fused topk bias parameters
+        num_expert_group: int | None = None,
+        e_score_correction_bias: torch.Tensor | None = None,
+        routed_scaling_factor: float | None = None,
+        topk_group: int | None = None,
+    ) -> torch.Tensor:
+        """Run the MoE layer through ``flashinfer.trtllm_fp4_block_scale_moe``.
+
+        The router logits are passed to the kernel (cast to bfloat16) together
+        with the expert weights. When ``self.use_mxfp8_input`` is set the
+        activations are first quantized with ``mxfp8_quantize``; otherwise
+        they must already be bfloat16 and are passed through unchanged.
+
+        ``activation``, ``expert_map``, ``a1q_scale``,
+        ``apply_router_weight_on_input`` and the grouped-topk arguments are
+        accepted but not read by this method.
+
+        Returns:
+            The first element of the value returned by the kernel call.
+        """
+        from flashinfer import trtllm_fp4_block_scale_moe
+
+        if not self.use_mxfp8_input:
+            # Unquantized path: bfloat16 activations, no scale tensor.
+            assert hidden_states.dtype == torch.bfloat16
+            activations, activation_scales = hidden_states, None
+        else:
+            from flashinfer import mxfp8_quantize
+
+            activations, raw_scales = mxfp8_quantize(
+                hidden_states,
+                is_sf_swizzled_layout=False,
+                alignment=256,
+            )
+            # Reinterpret the scales as float8_e4m3fn and give them the same
+            # leading dimensions as the input.
+            leading_dims = hidden_states.shape[:-1]
+            activation_scales = raw_scales.view(torch.float8_e4m3fn).reshape(
+                *leading_dims, -1
+            )
+
+        result_buffer = torch.empty_like(hidden_states)
+
+        kernel_outputs = trtllm_fp4_block_scale_moe(
+            routing_logits=router_logits.to(torch.bfloat16),
+            routing_bias=None,
+            hidden_states=activations,
+            hidden_states_scale=activation_scales,
+            gemm1_weights=w1,
+            gemm1_weights_scale=self.w1_scale,
+            gemm1_bias=self.w1_bias,
+            gemm1_alpha=self.gemm1_alpha,
+            gemm1_beta=self.gemm1_beta,
+            gemm1_clamp_limit=self.gemm1_clamp_limit,
+            gemm2_weights=w2,
+            gemm2_weights_scale=self.w2_scale,
+            gemm2_bias=self.w2_bias,
+            output1_scale_scalar=None,
+            output1_scale_gate_scalar=None,
+            output2_scale_scalar=None,
+            num_experts=global_num_experts,
+            top_k=self.topk,
+            n_group=None,
+            topk_group=None,
+            intermediate_size=self.intermediate_size_per_partition,
+            local_expert_offset=self.ep_rank * self.local_num_experts,
+            local_num_experts=self.local_num_experts,
+            routed_scaling_factor=None,
+            routing_method_type=self.routing_method_type,
+            do_finalize=True,
+            tune_max_num_tokens=max(self.max_capture_size, 1),
+            output=result_buffer,
+        )
+        return kernel_outputs[0]
+
+
+class TrtLlmMxfp4ExpertsModular(TrtLlmMxfp4ExpertsBase, mk.FusedMoEExpertsModular):
+    """
+    Modular version of the MXFP4 TRTLLM kernel (just the experts).
+    Wraps flashinfer.trtllm_fp4_block_scale_routed_moe().
+    Moved from trtllm_moe.py.
+    """
+
+    @property
+    def expects_unquantized_inputs(self) -> bool:
+        # Always True; when MXFP8 input is enabled, ``apply`` quantizes the
+        # hidden states itself.
+        return True
+
+    @staticmethod
+    def _supports_parallel_config(
+        moe_parallel_config: FusedMoEParallelConfig,
+    ) -> bool:
+        # Every parallel configuration is accepted; the argument is not read.
+        return True
+
+    def supports_expert_map(self) -> bool:
+        return True
+
+    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
+        # No-op reduction; presumably because the kernel is invoked with
+        # do_finalize=True in ``apply`` -- TODO confirm.
+        return TopKWeightAndReduceNoOP()
+
+    def workspace_shapes(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        topk: int,
+        global_num_experts: int,
+        local_num_experts: int,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        activation: MoEActivation,
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
+        """Return ``(workspace1, workspace2, output)`` shapes.
+
+        Both workspaces are reported as ``(0,)`` and the output as ``(M, K)``;
+        the remaining arguments are not read.
+        """
+        # The workspaces for this implementation are managed by flashinfer.
+        workspace1 = (0,)
+        workspace2 = (0,)
+        output = (M, K)
+        return (workspace1, workspace2, output)
+
+    def apply(
+        self,
+        output: torch.Tensor,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
+        activation: MoEActivation,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        a2_scale: torch.Tensor | None,
+        workspace13: torch.Tensor,
+        workspace2: torch.Tensor,
+        expert_tokens_meta: mk.ExpertTokensMetadata | None,
+        apply_router_weight_on_input: bool,
+    ):
+        """Run the experts with pre-computed routing and write into ``output``.
+
+        ``topk_ids`` and ``topk_weights`` are packed into a single tensor and
+        handed to ``flashinfer.trtllm_fp4_block_scale_routed_moe`` together
+        with the expert weights. ``activation``, ``expert_map``, the scale and
+        workspace arguments, ``expert_tokens_meta`` and
+        ``apply_router_weight_on_input`` are not read by this method.
+
+        Returns:
+            The ``output`` tensor that was passed in.
+        """
+        # Sizes come from the tensor shapes rather than from stored config.
+        topk = topk_ids.size(-1)
+        local_num_experts = w1.size(0)
+        intermediate_size = w2.size(1)
+        local_expert_offset = self.moe_config.ep_rank * local_num_experts
+
+        # Handle input quantization
+        if self.use_mxfp8_input:
+            from flashinfer import mxfp8_quantize
+
+            x_quant, x_scale = mxfp8_quantize(
+                hidden_states,
+                is_sf_swizzled_layout=False,
+                alignment=256,
+            )
+            # Reinterpret the scales as float8_e4m3fn and give them the same
+            # leading dimensions as the input.
+            x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+                *hidden_states.shape[:-1], -1
+            )
+        else:
+            assert hidden_states.dtype == torch.bfloat16
+            x_quant = hidden_states
+            x_scale = None
+
+        # Pack each (expert id, weight) pair into one integer: the id is
+        # shifted into the upper bits and the bfloat16 weight's bit pattern
+        # (viewed as int16) is OR-ed into the lower bits.
+        # NOTE(review): the int16 operand is promoted for the OR; a weight
+        # with its sign bit set would presumably sign-extend into the id
+        # bits -- confirm weights are always non-negative here.
+        packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to(
+            torch.bfloat16
+        ).view(torch.int16)
+
+        assert self.w1_scale is not None
+        assert self.w2_scale is not None
+        kwargs = {
+            "topk_ids": packed_tensor,
+            "routing_bias": None,
+            "hidden_states": x_quant,
+            "hidden_states_scale": x_scale,
+            "gemm1_weights": w1,
+            "gemm1_weights_scale": self.w1_scale,
+            "gemm1_bias": self.w1_bias,
+            "gemm1_alpha": self.gemm1_alpha,
+            "gemm1_beta": self.gemm1_beta,
+            "gemm1_clamp_limit": self.gemm1_clamp_limit,
+            "gemm2_weights": w2,
+            "gemm2_weights_scale": self.w2_scale,
+            "gemm2_bias": self.w2_bias,
+            "output1_scale_scalar": None,
+            "output1_scale_gate_scalar": None,
+            "output2_scale_scalar": None,
+            "num_experts": global_num_experts,
+            "top_k": topk,
+            "n_group": None,
+            "topk_group": None,
+            "intermediate_size": intermediate_size,
+            "local_expert_offset": local_expert_offset,
+            "local_num_experts": local_num_experts,
+            "routed_scaling_factor": None,
+            "routing_method_type": self.routing_method_type,
+            "do_finalize": True,
+            "output": output,
+            "tune_max_num_tokens": max(self.max_capture_size, 1),
+        }
+
+        from flashinfer import trtllm_fp4_block_scale_routed_moe
+
+        from vllm.utils.flashinfer import autotune
+
+        with autotune(False):
+            # Enable autotune when,
+            # https://github.com/flashinfer-ai/flashinfer/issues/2023 is
+            # resolved.
+            trtllm_fp4_block_scale_routed_moe(**kwargs)
+
+        return output
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index 9df94b72d246..e2b5a8f6764e 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -1017,6 +1017,7 @@ def apply(
             torch.float16,
             torch.bfloat16,
             torch.float8_e4m3fn,
+            torch.float8_e4m3fnuz,
         ]
         assert expert_tokens_meta is not None
 
@@ -1046,7 +1047,7 @@ def apply(
             compute_type = tl.float16
         elif hidden_states.dtype == torch.float32:
             compute_type = tl.float32
-        elif hidden_states.dtype == torch.float8_e4m3fn:
+        elif hidden_states.dtype == current_platform.fp8_dtype():
             compute_type = tl.bfloat16
         else:
             raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}")
diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
index 45575ab09c40..136a8188d6a0 100644
--- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py
@@ -40,6 +40,7 @@
     kFp8Static128BlockSym,
     kFp8StaticChannelSym,
     kFp8StaticTensorSym,
+    kMxfp4Static,
     kNvfp4Static,
 )
 from vllm.platforms import current_platform
@@ -574,12 +575,13 @@ def _supports_quant_scheme(
         weight_key: QuantKey | None,
         activation_key: QuantKey | None,
     ) -> bool:
-        # TODO(rob): add int4, mxfp4, int8 as integrations
+        # TODO(rob): add int4, int8 as integrations
         # are migrated to use the oracle one-by-one.
         SUPPORTED_W = [
             kFp8Static128BlockSym,
             kFp8StaticChannelSym,
             kFp8StaticTensorSym,
+            kMxfp4Static,
             kNvfp4Static,
         ]
         return weight_key in SUPPORTED_W
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 03ca8ba119c0..d5b8feb3c9b9 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -1616,7 +1616,7 @@ def _get_config_quant_dtype(
     fused_experts_impl.
     """
     if use_fp8_w8a8:
-        return torch.float8_e4m3fn
+        return current_platform.fp8_dtype()
     elif use_int8_w8a8:
         return torch.int8
     elif ocp_mx_scheme == "w_mxfp4_a_mxfp4":
diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
index 82b0a21cba93..5862abe20518 100644
--- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
@@ -11,8 +11,10 @@
 from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.model_executor.layers.fused_moe.config import (
     FUSED_MOE_UNQUANTIZED_CONFIG,
+    FusedMoEConfig,
     FusedMoEParallelConfig,
     FusedMoEQuantConfig,
+    RoutingMethodType,
 )
 from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
     TopKWeightAndReduceNoOP,
@@ -20,6 +22,7 @@
 from vllm.model_executor.layers.fused_moe.utils import _resize_cache
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
+    kMxfp4Static,
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
@@ -142,6 +145,33 @@ def legacy_routing_from_bitmatrix(
     return routing_data, gather_idx, scatter_idx
 
 
+def legacy_routing_from_sparsematrix(
+    sparse_logits: "SparseMatrix",
+    n_expts_tot: int,
+    n_expts_act: int,
+) -> tuple["RoutingData", "GatherIndx", "ScatterIndx"]:
+    """Build the legacy routing triple from a ``SparseMatrix``.
+
+    Args:
+        sparse_logits: Sparse matrix whose ``vals`` and ``mask_metadata``
+            (``row_sorted_indx``, ``col_sorted_indx``, ``col_sum``) are read.
+        n_expts_tot: Passed through to ``RoutingData``.
+        n_expts_act: Passed through to ``RoutingData``.
+
+    Returns:
+        ``(routing_data, gather_idx, scatter_idx)``.
+    """
+    meta = sparse_logits.mask_metadata
+    row_order = meta.row_sorted_indx
+    col_order = meta.col_sorted_indx
+
+    ragged_meta = make_ragged_tensor_metadata(meta.col_sum, row_order.shape[0])
+    # Flattened values, reordered by the column-sorted index.
+    gate_values = sparse_logits.vals.flatten()[col_order]
+
+    routing = RoutingData(
+        gate_values,
+        ragged_meta.block_sizes,
+        n_expts_tot,
+        n_expts_act,
+        ragged_meta,
+    )
+    return (
+        routing,
+        GatherIndx(col_order, row_order),
+        ScatterIndx(row_order, col_order),
+    )
+
+
 def legacy_routing(
     logits: torch.Tensor,
     n_expts_act: int,
@@ -158,10 +188,8 @@ def legacy_routing(
     if sm_first:
         logits = torch.softmax(logits, dim=-1)
     sparse_logits = topk(logits, n_expts_act, apply_softmax=not sm_first)
-    return legacy_routing_from_bitmatrix(
-        sparse_logits.mask,
-        sparse_logits.vals,
-        sparse_logits.indx,
+    return legacy_routing_from_sparsematrix(
+        sparse_logits,
         logits.shape[-1],
         n_expts_act,
     )
@@ -512,43 +540,43 @@ def make_routing_data(
 
 
 class BaseOAITritonExperts(mk.FusedMoEExpertsModular):
+    @property
+    def expects_unquantized_inputs(self) -> bool:
+        return True
+
     @staticmethod
     def _supports_current_device() -> bool:
-        raise NotImplementedError(
-            "OAITritonExperts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
+        p = current_platform
+        if not p.is_cuda_alike():
+            return False
+        cap = p.get_device_capability()
+        if cap is None:
+            return False
+        # (9,0) <= cap < (11,0) covers CUDA SM90 (Hopper), SM100+ (Blackwell)
+        # and ROCm gfx942/gfx950 (which map to 9.4/9.5).
+        return (9, 0) <= (cap.major, cap.minor) < (11, 0)
 
     @staticmethod
     def _supports_no_act_and_mul() -> bool:
-        raise NotImplementedError(
-            "OAITritonExperts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
+        return False
 
     @staticmethod
     def _supports_quant_scheme(
         weight_key: QuantKey | None,
         activation_key: QuantKey | None,
     ) -> bool:
-        raise NotImplementedError(
-            "OAITritonExperts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
+        SUPPORTED_W_A = [
+            (kMxfp4Static, None),
+        ]
+        return (weight_key, activation_key) in SUPPORTED_W_A
 
     @staticmethod
     def _supports_activation(activation: MoEActivation) -> bool:
-        raise NotImplementedError(
-            "OAITritonExperts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
+        raise NotImplementedError
 
     @staticmethod
     def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
-        raise NotImplementedError(
-            "OAITritonExperts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
+        return True
 
     def supports_expert_map(self) -> bool:
         return True
@@ -605,6 +633,10 @@ def _make_routing_data(
 class OAITritonExperts(BaseOAITritonExperts):
     """OAI Triton-based fused MoE expert implementation."""
 
+    @staticmethod
+    def _supports_activation(activation: MoEActivation) -> bool:
+        """Report support for the SWIGLUOAI activation only."""
+        is_swigluoai = activation == MoEActivation.SWIGLUOAI
+        return is_swigluoai
+
     @staticmethod
     def activation_format() -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
@@ -689,6 +721,15 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts):
     One use case for it is to inject LoRA modules on the activation and moe_sum.
     """
 
+    @staticmethod
+    def _supports_activation(activation: MoEActivation) -> bool:
+        """Report whether ``activation`` is SILU, GELU, SWIGLUOAI or SWIGLUSTEP."""
+        supported_activations = [
+            MoEActivation.SILU,
+            MoEActivation.GELU,
+            MoEActivation.SWIGLUOAI,
+            MoEActivation.SWIGLUSTEP,
+        ]
+        return activation in supported_activations
+
     @staticmethod
     def activation_format() -> mk.FusedMoEActivationFormat:
         return mk.FusedMoEActivationFormat.Standard
@@ -814,3 +855,118 @@ def apply(
         )
 
         self.moe_sum(intermediate_cache3.view(-1, topk, K), output)
+
+
+class OAITritonMxfp4ExpertsMonolithic(mk.FusedMoEExpertsMonolithic):
+    """Monolithic Triton MXFP4 expert. Wraps triton_kernel_moe_forward().
+
+    ``apply`` receives raw router logits and forwards them, with the expert
+    weights, to ``triton_kernel_moe_forward``.
+    """
+
+    def __init__(
+        self,
+        moe_config: FusedMoEConfig,
+        quant_config: FusedMoEQuantConfig,
+    ):
+        super().__init__(moe_config, quant_config)
+        renormalizing_methods = (
+            RoutingMethodType.Renormalize,
+            RoutingMethodType.RenormalizeNaive,
+        )
+        self.topk = moe_config.experts_per_token
+        # True only for the two renormalizing routing methods.
+        self.renormalize = moe_config.routing_method in renormalizing_methods
+
+    @staticmethod
+    def activation_format() -> mk.FusedMoEActivationFormat:
+        """Return the Standard activation format."""
+        return mk.FusedMoEActivationFormat.Standard
+
+    @staticmethod
+    def _supports_current_device() -> bool:
+        """Return True on CUDA-alike devices with 9.0 <= capability < 11.0."""
+        platform = current_platform
+        capability = (
+            platform.get_device_capability() if platform.is_cuda_alike() else None
+        )
+        if capability is None:
+            return False
+        # (9,0) <= cap < (11,0) covers CUDA SM90 (Hopper), SM100+ (Blackwell)
+        # and ROCm gfx942/gfx950 (which map to 9.4/9.5).
+        return (9, 0) <= (capability.major, capability.minor) < (11, 0)
+
+    @staticmethod
+    def _supports_no_act_and_mul() -> bool:
+        """Always False."""
+        return False
+
+    @staticmethod
+    def _supports_quant_scheme(
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+    ) -> bool:
+        """Accept only MXFP4 static weights with no activation quant key."""
+        supported_pairs = ((kMxfp4Static, None),)
+        return (weight_key, activation_key) in supported_pairs
+
+    @staticmethod
+    def _supports_activation(activation: MoEActivation) -> bool:
+        """Report support for the SWIGLUOAI activation only."""
+        is_swigluoai = activation == MoEActivation.SWIGLUOAI
+        return is_swigluoai
+
+    @staticmethod
+    def _supports_parallel_config(
+        moe_parallel_config: FusedMoEParallelConfig,
+    ) -> bool:
+        """Reject all2all kernels, EPLB, and data-parallel sizes above 1."""
+        cfg = moe_parallel_config
+        return not (cfg.use_all2all_kernels or cfg.enable_eplb) and cfg.dp_size <= 1
+
+    @staticmethod
+    def _supports_routing_method(
+        routing_method: RoutingMethodType,
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+    ) -> bool:
+        """Accept Renormalize and RenormalizeNaive; the keys are not read."""
+        accepted_methods = (
+            RoutingMethodType.Renormalize,
+            RoutingMethodType.RenormalizeNaive,
+        )
+        return routing_method in accepted_methods
+
+    @staticmethod
+    def _supports_router_logits_dtype(
+        router_logits_dtype: torch.dtype | None,
+        routing_method: RoutingMethodType,
+    ) -> bool:
+        """Accept every router-logits dtype; neither argument is read."""
+        return True
+
+    def supports_expert_map(self) -> bool:
+        """Always True."""
+        return True
+
+    @property
+    def expects_unquantized_inputs(self) -> bool:
+        """Always True."""
+        return True
+
+    def apply(
+        self,
+        hidden_states: torch.Tensor,
+        w1: torch.Tensor,
+        w2: torch.Tensor,
+        router_logits: torch.Tensor,
+        activation: MoEActivation,
+        global_num_experts: int,
+        expert_map: torch.Tensor | None,
+        a1q_scale: torch.Tensor | None,
+        apply_router_weight_on_input: bool,
+        # grouped topk + fused topk bias parameters
+        num_expert_group: int | None = None,
+        e_score_correction_bias: torch.Tensor | None = None,
+        routed_scaling_factor: float | None = None,
+        topk_group: int | None = None,
+    ) -> torch.Tensor:
+        """Forward the layer through ``triton_kernel_moe_forward``.
+
+        ``activation``, ``a1q_scale`` and the grouped-topk arguments are
+        accepted but not read by this method.
+
+        Returns:
+            Whatever ``triton_kernel_moe_forward`` returns.
+        """
+        layer_output = triton_kernel_moe_forward(
+            hidden_states=hidden_states,
+            w1=w1,
+            w2=w2,
+            gating_output=router_logits,
+            topk=self.topk,
+            renormalize=self.renormalize,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            quant_config=self.quant_config,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+        )
+        return layer_output
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 2f704569209c..85fd1813a363 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -52,7 +52,6 @@
     QuantizationConfig,
 )
 from vllm.platforms import current_platform
-from vllm.utils.math_utils import round_up
 
 logger = init_logger(__name__)
 
@@ -218,7 +217,6 @@ def maybe_roundup_hidden_size(
     moe_parallel_config: FusedMoEParallelConfig,
     is_lora_enabled: bool,
     model_type: str | None,
-    is_mxfp4_quant: bool,
 ) -> int:
     """
     Given layer hidden size and MoE configurations, round up hidden_size
@@ -232,7 +230,6 @@ def maybe_roundup_hidden_size(
             is used in the case of mxfp4 quantization in selecting the
             MxFP4Backend.
         model_type: for checking if gpt-oss
-        is_mxfp4_quant: whether the layer is quantized with mxfp4
 
     Return:
         Rounded up hidden_size if rounding up is required based on the configs.
@@ -246,28 +243,6 @@ def maybe_roundup_hidden_size(
         hidden_size, act_dtype, moe_parallel_config
     )
 
-    # we are padding globally so EP buffer allocation works
-    if model_type == "gpt_oss" and is_mxfp4_quant:
-        from vllm.model_executor.layers.quantization.mxfp4 import (
-            Mxfp4Backend,
-            get_mxfp4_backend,
-        )
-
-        current_mxfp4_backend = get_mxfp4_backend(is_lora_enabled)
-
-        if (
-            current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
-            or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
-        ):
-            hidden_size = round_up(hidden_size, 128)
-        elif (
-            current_platform.is_rocm()
-            or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-            or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
-            or current_mxfp4_backend == Mxfp4Backend.MARLIN
-        ):
-            hidden_size = round_up(hidden_size, 256)
-
     return hidden_size
 
 
@@ -540,9 +515,6 @@ def __init__(
             moe_parallel_config=self.moe_parallel_config,
             is_lora_enabled=vllm_config.lora_config is not None,
             model_type=self.model_type,
-            is_mxfp4_quant=(
-                quant_config is not None and quant_config.is_mxfp4_quant(prefix, self)
-            ),
         )
         self.hidden_size = hidden_size
 
diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
new file mode 100644
index 000000000000..ddc6588dc517
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py
@@ -0,0 +1,847 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from enum import Enum
+from typing import Union
+
+import torch
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from vllm import envs
+from vllm.logger import init_logger
+from vllm.model_executor.layers.fused_moe import (
+    FusedMoEConfig,
+)
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig,
+    mxfp4_mxfp8_moe_quant_config,
+    mxfp4_w4a16_moe_quant_config,
+    ocp_mx_moe_quant_config,
+)
+from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
+    _swizzle_mxfp4,
+    get_padding_alignment,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey,
+    kMxfp4Static,
+    kMxfp8Dynamic,
+)
+from vllm.platforms import current_platform
+from vllm.utils.import_utils import has_triton_kernels
+from vllm.utils.math_utils import round_up
+
+logger = init_logger(__name__)
+
+# PrecisionConfig is imported only when has_triton_kernels() reports the
+# optional package; an import failure is logged instead of raised, which
+# leaves the name PrecisionConfig undefined in this module.
+if has_triton_kernels():
+    try:
+        from triton_kernels.matmul_ogs import PrecisionConfig
+    except (ImportError, AttributeError) as e:
+        logger.error(
+            "Failed to import Triton kernels. Please make sure your triton "
+            "version is compatible. Error: %s",
+            e,
+        )
+
+
+class Mxfp4MoeBackend(Enum):
+    """Identifiers for the MXFP4 MoE backends handled by this module.
+
+    Each member's value is its own name as a string, except ``NONE`` whose
+    value is ``"None"``.
+    """
+
+    NONE = "None"
+    # FlashInfer TRTLLM backends
+    FLASHINFER_TRTLLM_MXFP4_MXFP8 = "FLASHINFER_TRTLLM_MXFP4_MXFP8"
+    FLASHINFER_TRTLLM_MXFP4_BF16 = "FLASHINFER_TRTLLM_MXFP4_BF16"
+    # FlashInfer CUTLASS backends
+    FLASHINFER_CUTLASS_MXFP4_MXFP8 = "FLASHINFER_CUTLASS_MXFP4_MXFP8"
+    FLASHINFER_CUTLASS_MXFP4_BF16 = "FLASHINFER_CUTLASS_MXFP4_BF16"
+    # Marlin
+    BATCHED_MARLIN = "BATCHED_MARLIN"
+    MARLIN = "MARLIN"
+    # ROCm AITER (CK)
+    CK = "CK"
+    # Triton
+    TRITON = "TRITON"
+    TRITON_UNFUSED = "TRITON_UNFUSED"
+    # XPU
+    XPU = "XPU"
+
+
+# Backends that share the same TRTLLM weight format
+TRTLLM_BACKENDS = (
+    Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16,
+    Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
+)
+
+# The two Triton backends (TRITON and TRITON_UNFUSED), grouped for
+# membership checks.
+TRITON_BACKENDS = (
+    Mxfp4MoeBackend.TRITON,
+    Mxfp4MoeBackend.TRITON_UNFUSED,
+)
+
+
+def backend_to_kernel_cls(
+    backend: Mxfp4MoeBackend,
+) -> list[type[mk.FusedMoEExperts]]:
+    """Return the experts classes that implement ``backend``, preferred first.
+
+    Kernel modules are imported lazily, only for the backend requested.
+
+    Raises:
+        NotImplementedError: For the XPU backend.
+        ValueError: For any backend not handled here (including NONE).
+    """
+    if backend in TRTLLM_BACKENDS:
+        from vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe import (
+            TrtLlmMxfp4ExpertsModular,
+            TrtLlmMxfp4ExpertsMonolithic,
+        )
+
+        # NOTE: prefer Monolithic > Modular, so return Monolithic first.
+        return [TrtLlmMxfp4ExpertsMonolithic, TrtLlmMxfp4ExpertsModular]
+
+    flashinfer_cutlass_backends = (
+        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
+        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
+    )
+    if backend in flashinfer_cutlass_backends:
+        from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
+            FlashInferExperts,
+        )
+
+        return [FlashInferExperts]
+
+    if backend == Mxfp4MoeBackend.TRITON:
+        from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
+            OAITritonExperts,
+            OAITritonMxfp4ExpertsMonolithic,
+        )
+
+        # NOTE: prefer Monolithic > Modular, so return Monolithic first.
+        return [OAITritonMxfp4ExpertsMonolithic, OAITritonExperts]
+
+    if backend == Mxfp4MoeBackend.TRITON_UNFUSED:
+        from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
+            UnfusedOAITritonExperts,
+        )
+
+        return [UnfusedOAITritonExperts]
+
+    if backend == Mxfp4MoeBackend.MARLIN:
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+            MarlinExperts,
+        )
+
+        return [MarlinExperts]
+
+    if backend == Mxfp4MoeBackend.BATCHED_MARLIN:
+        from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
+            BatchedMarlinExperts,
+        )
+
+        return [BatchedMarlinExperts]
+
+    if backend == Mxfp4MoeBackend.CK:
+        from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
+            AiterExperts,
+        )
+
+        return [AiterExperts]
+
+    if backend == Mxfp4MoeBackend.XPU:
+        raise NotImplementedError("XPU backend uses XpuMxfp4MoEMethod directly.")
+
+    raise ValueError(f"Unknown MXFP4 MoE backend: {backend.value}")
+
+
+def map_mxfp4_backend(runner_backend: str) -> Mxfp4MoeBackend:
+    """Translate a user-supplied ``moe_backend`` string into a backend enum.
+
+    Raises:
+        ValueError: If ``runner_backend`` is not one of the known names; the
+            message lists the accepted names.
+    """
+    name_to_backend: dict[str, Mxfp4MoeBackend] = {
+        "flashinfer_trtllm": Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16,
+        "flashinfer_trtllm_afp8": Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
+        "flashinfer_cutlass": Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
+        "flashinfer_cutlass_afp8": Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
+        "triton": Mxfp4MoeBackend.TRITON,
+        "marlin": Mxfp4MoeBackend.MARLIN,
+        "ck": Mxfp4MoeBackend.CK,
+    }
+    selected = name_to_backend.get(runner_backend)
+    if selected:
+        return selected
+    raise ValueError(
+        f"moe_backend='{runner_backend}' is not supported for MXFP4 MoE. "
+        f"Expected one of {list(name_to_backend.keys())}."
+    )
+
+
+def _get_priority_backends() -> list[Mxfp4MoeBackend]:
+    """
+    Return the candidate backends in priority order (highest first).
+    Only includes BF16 backends. MXFP8 backends are selected via env vars.
+    A new list is built on every call, so the caller may modify it freely.
+    """
+    return [
+        Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16,
+        Mxfp4MoeBackend.CK,
+        Mxfp4MoeBackend.TRITON,
+        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
+        Mxfp4MoeBackend.TRITON_UNFUSED,
+        Mxfp4MoeBackend.MARLIN,
+        Mxfp4MoeBackend.BATCHED_MARLIN,
+    ]
+
+
+def _backend_activation_key(backend: Mxfp4MoeBackend) -> QuantKey | None:
+    """Return kMxfp8Dynamic for the two MXFP4_MXFP8 backends, otherwise None."""
+    mxfp8_activation_backends = (
+        Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
+        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
+    )
+    return kMxfp8Dynamic if backend in mxfp8_activation_backends else None
+
+
+def select_mxfp4_moe_backend(
+    config: FusedMoEConfig,
+) -> tuple[Mxfp4MoeBackend, type[mk.FusedMoEExperts] | None]:
+    """
+    Select the primary MXFP4 MoE backend.
+    Note: Shape-specific fallbacks may still occur at runtime.
+
+    Selection order:
+      1. LoRA enabled: TRITON_UNFUSED or MARLIN (CUDA only).
+      2. ``config.moe_backend`` other than "auto": that backend, or an error
+         if none of its kernel classes supports the configuration.
+      3. Explicit ``VLLM_*`` environment overrides.
+      4. The first supported entry of ``_get_priority_backends()``.
+      5. XPU on XPU platforms; NONE on other non-CUDA/ROCm platforms.
+
+    Returns:
+        ``(backend, experts_cls)``; ``experts_cls`` is None for XPU and NONE.
+
+    Raises:
+        NotImplementedError: LoRA on a non-CUDA platform, or no backend
+            supports the configuration on CUDA/ROCm.
+        ValueError: An explicitly requested backend is unknown or does not
+            support the configuration.
+    """
+    # A platform may report no device capability (None). Treat that as
+    # "Triton kernels unsupported" instead of comparing None with a tuple,
+    # which would raise TypeError before any fallback below could run.
+    triton_kernels_supported = False
+    if has_triton_kernels():
+        device_capability = current_platform.get_device_capability()
+        triton_kernels_supported = (
+            device_capability is not None and (9, 0) <= device_capability < (11, 0)
+        )
+
+    # LoRA: separate experts backend path
+    if config.is_lora_enabled:
+        if not current_platform.is_cuda():
+            raise NotImplementedError("Mxfp4 LoRA only supported on CUDA Platform.")
+        if envs.VLLM_MXFP4_USE_MARLIN is False and triton_kernels_supported:
+            logger.info_once("Using Triton backend for mxfp4 lora")
+            return Mxfp4MoeBackend.TRITON_UNFUSED, backend_to_kernel_cls(
+                Mxfp4MoeBackend.TRITON_UNFUSED
+            )[0]
+        logger.info_once("Using Marlin backend for mxfp4 lora")
+        return Mxfp4MoeBackend.MARLIN, backend_to_kernel_cls(Mxfp4MoeBackend.MARLIN)[0]
+
+    activation_format = (
+        mk.FusedMoEActivationFormat.BatchedExperts
+        if config.moe_parallel_config.use_batched_activation_format
+        else mk.FusedMoEActivationFormat.Standard
+    )
+
+    def _make_log_backend(backend: Mxfp4MoeBackend):
+        return f"Using '{backend.value}' Mxfp4 MoE backend."
+
+    def _make_log_unsupported(backend: Mxfp4MoeBackend, reason: str | None) -> str:
+        if reason:
+            return (
+                f"Mxfp4 MoE backend '{backend.value}' does not support the "
+                f"deployment configuration since {reason}."
+            )
+        return (
+            f"Mxfp4 MoE backend '{backend.value}' does not support the "
+            "deployment configuration."
+        )
+
+    def _return_or_raise(
+        backend: Mxfp4MoeBackend,
+        config: FusedMoEConfig,
+        weight_key: QuantKey | None,
+        activation_key: QuantKey | None,
+        activation_format: mk.FusedMoEActivationFormat,
+    ) -> tuple[Mxfp4MoeBackend, type[mk.FusedMoEExperts]]:
+        # Return the first kernel class of `backend` that supports the
+        # configuration; otherwise raise with the last reported reason.
+        reason: str | None = None
+        for k_cls in backend_to_kernel_cls(backend):
+            supported, reason = k_cls.is_supported_config(
+                k_cls, config, weight_key, activation_key, activation_format
+            )
+            if supported:
+                logger.info_once(_make_log_backend(backend), scope="local")
+                return backend, k_cls
+        raise ValueError(_make_log_unsupported(backend, reason))
+
+    runner_backend = config.moe_backend
+    if runner_backend != "auto":
+        requested_backend = map_mxfp4_backend(runner_backend)
+        # With the batched activation format, a Marlin request is served by
+        # the batched Marlin backend.
+        if (
+            activation_format == mk.FusedMoEActivationFormat.BatchedExperts
+            and requested_backend == Mxfp4MoeBackend.MARLIN
+        ):
+            requested_backend = Mxfp4MoeBackend.BATCHED_MARLIN
+        return _return_or_raise(
+            requested_backend,
+            config,
+            kMxfp4Static,
+            _backend_activation_key(requested_backend),
+            activation_format,
+        )
+
+    # Select kernels in order of backend.
+    # _get_priority_backends() builds a new list per call, so the removals
+    # below only affect this invocation.
+    AVAILABLE_BACKENDS = _get_priority_backends()
+
+    # Handle explicit FlashInfer MXFP4 BF16 configuration.
+    if envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"):
+        if not envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16:
+            AVAILABLE_BACKENDS.remove(Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16)
+            AVAILABLE_BACKENDS.remove(Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16)
+        else:
+            if current_platform.is_device_capability(90):
+                return _return_or_raise(
+                    Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
+                    config,
+                    kMxfp4Static,
+                    None,
+                    activation_format,
+                )
+            if current_platform.is_device_capability_family(100):
+                return _return_or_raise(
+                    Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16,
+                    config,
+                    kMxfp4Static,
+                    None,
+                    activation_format,
+                )
+            raise ValueError(
+                "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16=1 is set but the "
+                "current device capability is not supported. "
+                "Only SM90 (CUTLASS) and SM100+ (TRTLLM) are supported."
+            )
+
+    # Handle explicit FlashInfer MXFP4 MXFP8 TRTLLM configuration.
+    if (
+        envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8")
+        and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
+    ):
+        return _return_or_raise(
+            Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
+            config,
+            kMxfp4Static,
+            kMxfp8Dynamic,
+            activation_format,
+        )
+
+    # Handle explicit FlashInfer MXFP4 MXFP8 CUTLASS configuration.
+    if (
+        envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS")
+        and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
+    ):
+        return _return_or_raise(
+            Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
+            config,
+            kMxfp4Static,
+            kMxfp8Dynamic,
+            activation_format,
+        )
+
+    # Handle explicit Marlin MXFP4 configuration.
+    if envs.is_set("VLLM_MXFP4_USE_MARLIN") and envs.VLLM_MXFP4_USE_MARLIN:
+        return _return_or_raise(
+            Mxfp4MoeBackend.MARLIN,
+            config,
+            kMxfp4Static,
+            None,
+            activation_format,
+        )
+
+    # No explicit choice: take the first backend/kernel pair that reports
+    # support, logging each rejection at debug level.
+    for backend in AVAILABLE_BACKENDS:
+        activation_key = _backend_activation_key(backend)
+        for k_cls in backend_to_kernel_cls(backend):
+            supported, reason = k_cls.is_supported_config(
+                k_cls, config, kMxfp4Static, activation_key, activation_format
+            )
+            if supported:
+                logger.info_once(_make_log_backend(backend), scope="local")
+                return backend, k_cls
+            else:
+                logger.debug_once(_make_log_unsupported(backend, reason), scope="local")
+
+    if current_platform.is_xpu():
+        backend = Mxfp4MoeBackend.XPU
+        logger.info_once(_make_log_backend(backend))
+        return backend, None
+
+    if current_platform.is_cuda() or current_platform.is_rocm():
+        raise NotImplementedError(
+            "No MXFP4 MoE backend supports the deployment configuration."
+        )
+
+    return Mxfp4MoeBackend.NONE, None
+
+
+def mxfp4_round_up_hidden_size_and_intermediate_size(
+    backend: Mxfp4MoeBackend, hidden_size: int, intermediate_size: int
+) -> tuple[int, int]:
+    """Pad hidden_size and intermediate_size to the multiples `backend` requires."""
+    if backend in (Mxfp4MoeBackend.MARLIN, Mxfp4MoeBackend.BATCHED_MARLIN):
+        # Marlin pads hidden_size to 128 on XPU and to 256 on every other platform.
+        padded_intermediate = round_up(intermediate_size, 128)
+        hidden_multiple = 128 if current_platform.is_xpu() else 256
+        return round_up(hidden_size, hidden_multiple), padded_intermediate
+    if backend in TRTLLM_BACKENDS:
+        padded_intermediate = round_up(intermediate_size, 256)
+        return round_up(hidden_size, 256), padded_intermediate
+    if backend in (
+        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
+        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
+    ):
+        padded_intermediate = round_up(intermediate_size, 128)
+        return round_up(hidden_size, 128), padded_intermediate
+    if current_platform.is_rocm():
+        # Remaining backends on ROCm share one alignment for both dimensions.
+        alignment = get_padding_alignment()
+        padded_intermediate = round_up(intermediate_size, alignment)
+        return round_up(hidden_size, alignment), padded_intermediate
+    # No backend- or platform-specific rule applies: hidden_size is returned
+    # unchanged and only intermediate_size is padded, to a multiple of 64.
+    return hidden_size, round_up(intermediate_size, 64)
+
+
+def convert_to_mxfp4_moe_kernel_format(
+    mxfp4_backend: Mxfp4MoeBackend,
+    layer: torch.nn.Module,
+    w13_weight: torch.Tensor,
+    w2_weight: torch.Tensor,
+    w13_weight_scale: torch.Tensor,
+    w2_weight_scale: torch.Tensor,
+    w13_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
+    _cache_permute_indices: dict[torch.Size, torch.Tensor] | None = None,
+) -> tuple[
+    torch.Tensor,
+    torch.Tensor,
+    Union[torch.Tensor, "PrecisionConfig"],
+    Union[torch.Tensor, "PrecisionConfig"],
+    torch.Tensor | None,
+    torch.Tensor | None,
+]:
+    """Convert loaded weights into backend-specific kernel format."""
+    # All sizes below are read off the shape of w13_weight.
+    num_experts = w13_weight.shape[0]
+    intermediate_size = w13_weight.shape[1] // 2
+    hidden_size = w13_weight.shape[2] * 2
+
+    sf_block_size = 32  # mxfp4 block size
+    # Marlin backends: conversion is delegated to the Marlin FP4 helper.
+    if mxfp4_backend in (Mxfp4MoeBackend.MARLIN, Mxfp4MoeBackend.BATCHED_MARLIN):
+        from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
+            prepare_moe_mxfp4_layer_for_marlin,
+        )
+
+        return prepare_moe_mxfp4_layer_for_marlin(
+            layer,
+            w13_weight,
+            w2_weight,
+            w13_weight_scale,
+            w2_weight_scale,
+            w13_bias,
+            w2_bias,
+        )
+
+    elif mxfp4_backend in TRTLLM_BACKENDS:
+        assert _cache_permute_indices is not None
+        from flashinfer.fp4_quantization import nvfp4_block_scale_interleave
+        from flashinfer.fused_moe.core import get_w2_permute_indices_with_cache
+
+        # gemm1_alpha/beta/clamp_limit are created by the expert class
+        # (TrtLlmMxfp4ExpertsBase), not on the layer.
+
+        w13_weight = w13_weight.data
+        w2_weight = w2_weight.data
+        w13_weight_scale = w13_weight_scale.data
+        w2_weight_scale = w2_weight_scale.data
+        assert w13_bias is not None and w2_bias is not None
+        w13_bias = w13_bias.data.to(torch.float32)
+        w2_bias = w2_bias.data.to(torch.float32)
+        # swap_every_two_rows exchanges each adjacent pair of entries along `axis`.
+        # Swap w1 and w3 as the definition of swiglu is different in trtllm-gen
+        def swap_every_two_rows(x, axis=-1):
+            shape = x.shape
+            if axis < 0:
+                axis = len(shape) + axis
+            new_shape = list(shape)
+            new_shape[axis] = shape[axis] // 2
+            new_shape.insert(axis + 1, 2)
+            x = x.reshape(*new_shape)
+            x = x.flip(axis + 1)
+            new_shape = list(shape)
+            return x.reshape(*new_shape)
+
+        w13_weight_scale = swap_every_two_rows(w13_weight_scale, -2)
+        w13_weight = swap_every_two_rows(w13_weight, -2)
+        w13_bias = swap_every_two_rows(w13_bias, -1)
+
+        # Shuffle weights and scaling factors for transposed mma output
+        gemm1_weights_shuffled = []
+        gemm1_scales_shuffled = []
+        gemm2_weights_shuffled = []
+        gemm2_scales_shuffled = []
+        gemm1_bias_shuffled = []
+        gemm2_bias_shuffled = []
+        epilogue_tile_m = 128
+        for i in range(num_experts):
+            # w13 weight
+            permute_indices = get_w2_permute_indices_with_cache(
+                _cache_permute_indices,
+                w13_weight[i].view(torch.uint8),
+                epilogue_tile_m,
+            )
+            gemm1_weights_shuffled.append(
+                w13_weight[i]
+                .view(torch.uint8)[permute_indices.to(w13_weight.device)]
+                .contiguous()
+            )
+            # w13 scale
+            permute_sf_indices = get_w2_permute_indices_with_cache(
+                _cache_permute_indices,
+                w13_weight_scale[i].view(torch.uint8),
+                epilogue_tile_m,
+                num_elts_per_sf=16,
+            )
+            gemm1_scales_shuffled.append(
+                nvfp4_block_scale_interleave(
+                    w13_weight_scale[i]
+                    .view(torch.uint8)[permute_sf_indices.to(w13_weight_scale.device)]
+                    .contiguous()
+                )
+            )
+            # w13 bias
+            permute_bias_indices = get_w2_permute_indices_with_cache(
+                _cache_permute_indices,
+                w13_bias[i].clone().reshape(-1, 1),
+                epilogue_tile_m,
+            )
+            gemm1_bias_shuffled.append(
+                w13_bias[i]
+                .clone()
+                .reshape(-1, 1)[permute_bias_indices.to(w13_bias.device)]
+                .contiguous()
+            )
+            # w2 weight
+            permute_indices = get_w2_permute_indices_with_cache(
+                _cache_permute_indices,
+                w2_weight[i].view(torch.uint8),
+                epilogue_tile_m,
+            )
+            gemm2_weights_shuffled.append(
+                w2_weight[i]
+                .view(torch.uint8)[permute_indices.to(w2_weight.device)]
+                .contiguous()
+            )
+            # w2 scale
+            permute_sf_indices = get_w2_permute_indices_with_cache(
+                _cache_permute_indices,
+                w2_weight_scale[i].view(torch.uint8),
+                epilogue_tile_m,
+                num_elts_per_sf=16,
+            )
+            gemm2_scales_shuffled.append(
+                nvfp4_block_scale_interleave(
+                    w2_weight_scale[i]
+                    .view(torch.uint8)[permute_sf_indices.to(w2_weight_scale.device)]
+                    .contiguous()
+                )
+            )
+            # w2 bias
+            permute_indices = get_w2_permute_indices_with_cache(
+                _cache_permute_indices,
+                w2_bias[i].clone().reshape(-1, 1),
+                epilogue_tile_m,
+            )
+            gemm2_bias_shuffled.append(
+                w2_bias[i]
+                .clone()
+                .reshape(-1, 1)[permute_indices.to(w2_bias.device)]
+                .contiguous()
+            )
+        # Stack the per-expert results back into single tensors.
+        w13_weight = torch.stack(gemm1_weights_shuffled)
+        w13_weight_scale = (
+            torch.stack(gemm1_scales_shuffled)
+            .reshape(num_experts, 2 * intermediate_size, hidden_size // sf_block_size)
+            .view(torch.float8_e4m3fn)
+        )
+        w2_weight = torch.stack(gemm2_weights_shuffled)
+        w2_weight_scale = (
+            torch.stack(gemm2_scales_shuffled)
+            .reshape(num_experts, hidden_size, intermediate_size // sf_block_size)
+            .view(torch.float8_e4m3fn)
+        )
+        w13_bias = torch.stack(gemm1_bias_shuffled).reshape(num_experts, -1)
+        w2_bias = torch.stack(gemm2_bias_shuffled).reshape(num_experts, -1)
+
+        return (
+            w13_weight,
+            w2_weight,
+            w13_weight_scale,
+            w2_weight_scale,
+            w13_bias,
+            w2_bias,
+        )
+
+    elif mxfp4_backend in (
+        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
+        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
+    ):
+        # De-interleave and swap for w13 weight, bias, and scales
+        w13_w = w13_weight.data
+        gate_w, up_w = w13_w[:, ::2, :], w13_w[:, 1::2, :]
+        deinterleaved_w13_w = torch.cat([gate_w, up_w], dim=1)
+        w1_w, w3_w = torch.chunk(deinterleaved_w13_w, 2, dim=1)
+        w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1)
+
+        assert w13_bias is not None and w2_bias is not None
+        w13_b = w13_bias.data.to(torch.float32)
+        gate_b, up_b = w13_b[:, ::2], w13_b[:, 1::2]
+        deinterleaved_w13_b = torch.cat([gate_b, up_b], dim=1)
+        b1, b3 = torch.chunk(deinterleaved_w13_b, 2, dim=-1)
+        w13_bias_swapped = torch.cat([b3, b1], dim=-1).to(torch.bfloat16)
+
+        w13_s = w13_weight_scale.data
+        gate_s, up_s = w13_s[:, ::2, :], w13_s[:, 1::2, :]
+        deinterleaved_w13_s = torch.cat([gate_s, up_s], dim=1)
+        s1, s3 = torch.chunk(deinterleaved_w13_s, 2, dim=1)
+        w13_scale_swapped = torch.cat([s3, s1], dim=1)
+        # The scale layout differs between the MXFP8 and BF16 CUTLASS variants.
+        if mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8:
+            from flashinfer import block_scale_interleave
+
+            orig_shape = w13_scale_swapped.shape
+            w13_scale_interleaved = block_scale_interleave(
+                w13_scale_swapped.view(torch.uint8)
+            ).reshape(orig_shape)
+
+            w2_s = w2_weight_scale.data
+            orig_shape = w2_s.shape
+            w2_scale_interleaved = block_scale_interleave(
+                w2_s.view(torch.uint8)
+            ).reshape(orig_shape)
+
+            return (
+                w13_weight_swapped,
+                w2_weight,
+                w13_scale_interleaved,
+                w2_scale_interleaved,
+                w13_bias_swapped,
+                w2_bias,
+            )
+
+        else:
+            assert mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16
+            # Helper: regroups a (E, R, C) tensor into shape (E, C // 4, R * 4).
+            def _interleave_mxfp4_cutlass_sm90(w):
+                w_shape = w.shape
+                w_interleaved = w.reshape(w_shape[0], w_shape[1], (w_shape[2] // 4), 4)
+                w_interleaved = w_interleaved.permute(0, 2, 1, 3)
+                w_interleaved = w_interleaved.reshape(
+                    w_shape[0], w_shape[2] // 4, w_shape[1] * 4
+                )
+                return w_interleaved
+
+            w31_scales = w13_scale_swapped.to(torch.uint8)
+            w31_scales_interleaved = _interleave_mxfp4_cutlass_sm90(w31_scales)
+
+            w2_scale = w2_weight_scale.data.to(torch.uint8)
+            w2_scale_interleaved = _interleave_mxfp4_cutlass_sm90(w2_scale)
+
+            return (
+                w13_weight_swapped,
+                w2_weight,
+                w31_scales_interleaved,
+                w2_scale_interleaved,
+                w13_bias_swapped,
+                w2_bias,
+            )
+
+    elif mxfp4_backend == Mxfp4MoeBackend.CK:
+        from vllm._aiter_ops import rocm_aiter_ops
+        # Biases, when present, are converted to float32.
+        if w13_bias is not None:
+            w13_bias = w13_bias.data.to(torch.float32)
+        if w2_bias is not None:
+            w2_bias = w2_bias.data.to(torch.float32)
+
+        e, n, k = w13_weight.shape
+
+        # De-interleave w13 rows: gate/up pairs -> contiguous gate, up blocks
+        w13_weight.view(torch.uint8).copy_(
+            w13_weight.data.view(torch.uint8)
+            .view(e, n // 2, 2, k)
+            .permute(0, 2, 1, 3)
+            .contiguous()
+            .view(e, n, k)
+        )
+        w13_weight_scale.data = (
+            w13_weight_scale.data.view(e, n // 2, 2, -1)
+            .permute(0, 2, 1, 3)
+            .contiguous()
+            .view(e, n, -1)
+        )
+
+        # View as native FP4 dtype for AITER shuffle
+        w13_weight.data = w13_weight.data.view(torch.float4_e2m1fn_x2)
+        w2_weight.data = w2_weight.data.view(torch.float4_e2m1fn_x2)
+
+        # Shuffle weights and scales for AITER CK kernel layout
+        w13_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(w13_weight, 16, True)
+        shuffled_w13_scale = rocm_aiter_ops.shuffle_scale_a16w4(
+            w13_weight_scale.view(-1, w13_weight_scale.shape[-1]),
+            num_experts,
+            True,
+        )
+
+        w2_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(w2_weight, 16, False)
+        shuffled_w2_scale = rocm_aiter_ops.shuffle_scale_a16w4(
+            w2_weight_scale.view(-1, w2_weight_scale.shape[-1]),
+            num_experts,
+            False,
+        )
+
+        # Permute bias to match de-interleaved weight layout
+        if w13_bias is not None:
+            w13_bias = (
+                w13_bias.data.view(-1, n // 2, 2)
+                .permute(0, 2, 1)
+                .contiguous()
+                .view(-1, n)
+            )
+
+        return (
+            w13_weight,
+            w2_weight,
+            shuffled_w13_scale,
+            shuffled_w2_scale,
+            w13_bias,
+            w2_bias,
+        )
+
+    elif mxfp4_backend in TRITON_BACKENDS:
+        from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
+
+        assert w13_bias is not None and w2_bias is not None
+        w13_bias = w13_bias.to(torch.float32)
+        w2_bias = w2_bias.to(torch.float32)
+        # _swizzle_mxfp4 yields a (weight, flex data, scale) triple for w13 and w2.
+        w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(
+            w13_weight,
+            w13_weight_scale,
+        )
+        w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(
+            w2_weight,
+            w2_weight_scale,
+        )
+
+        w13_precision_config = PrecisionConfig(
+            weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex)
+        )
+        w2_precision_config = PrecisionConfig(
+            weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex)
+        )
+        # Remove the layer's original weight attributes; swizzled ones are returned.
+        del layer.w13_weight
+        del layer.w2_weight
+
+        return (
+            w13_weight,
+            w2_weight,
+            w13_precision_config,
+            w2_precision_config,
+            w13_bias,
+            w2_bias,
+        )
+    else:
+        raise ValueError(
+            f"Unsupported mxfp4_backend: {mxfp4_backend}: "
+            f"should be one of: {list(Mxfp4MoeBackend)}."
+        )
+
+
+def make_mxfp4_moe_quant_config(
+    mxfp4_backend: Mxfp4MoeBackend,
+    w1_scale: Union[torch.Tensor, "PrecisionConfig"],
+    w2_scale: Union[torch.Tensor, "PrecisionConfig"],
+    w1_bias: torch.Tensor | None = None,
+    w2_bias: torch.Tensor | None = None,
+) -> FusedMoEQuantConfig | None:
+    """Build the FusedMoEQuantConfig that corresponds to `mxfp4_backend`.
+
+    The two *_MXFP4_MXFP8 backends use the mxfp4_mxfp8 config builder; the
+    Marlin, Triton, FlashInfer *_MXFP4_BF16 and CK backends use mxfp4_w4a16.
+    """
+    mxfp8_config_backends = (
+        Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8,
+        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8,
+    )
+    w4a16_config_backends = (
+        Mxfp4MoeBackend.MARLIN,
+        Mxfp4MoeBackend.BATCHED_MARLIN,
+        Mxfp4MoeBackend.TRITON,
+        Mxfp4MoeBackend.TRITON_UNFUSED,
+        Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16,
+        Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16,
+        Mxfp4MoeBackend.CK,
+    )
+    if mxfp4_backend in mxfp8_config_backends:
+        return mxfp4_mxfp8_moe_quant_config(
+            w1_bias=w1_bias, w2_bias=w2_bias, w1_scale=w1_scale, w2_scale=w2_scale
+        )
+    if mxfp4_backend in w4a16_config_backends:
+        return mxfp4_w4a16_moe_quant_config(
+            w1_bias=w1_bias, w2_bias=w2_bias, w1_scale=w1_scale, w2_scale=w2_scale
+        )
+    # Every other backend falls back to the OCP MX config with quant_dtype "mxfp4".
+    return ocp_mx_moe_quant_config(
+        quant_dtype="mxfp4",
+        w1_bias=w1_bias,
+        w2_bias=w2_bias,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+    )
+
+
+def make_mxfp4_moe_kernel(
+    moe_quant_config: FusedMoEQuantConfig,
+    moe_config: FusedMoEConfig,
+    experts_cls: type[mk.FusedMoEExperts],
+    mxfp4_backend: Mxfp4MoeBackend,
+    routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
+    shared_experts: torch.nn.Module | None = None,
+) -> mk.FusedMoEKernel:
+    """Assemble the FusedMoEKernel for the given MXFP4 backend.
+
+    A prepare/finalize object is built first; its activation format decides
+    which constructor arguments `experts_cls` receives. `shared_experts` is
+    passed on to the kernel only when `use_deepep_ll_kernels` is set, and
+    in-place execution is never enabled for TRTLLM backends.
+    """
+    # Prepare/finalize stage; monolithic expert classes are flagged as such.
+    prep_fin = maybe_make_prepare_finalize(
+        moe=moe_config,
+        quant_config=moe_quant_config,
+        routing_tables=routing_tables,
+        allow_new_interface=True,
+        use_monolithic=issubclass(experts_cls, mk.FusedMoEExpertsMonolithic),
+    )
+    assert prep_fin is not None
+    logger.info_once("Using %s", prep_fin.__class__.__name__, scope="local")
+
+    # Experts stage.
+    batched_format = mk.FusedMoEActivationFormat.BatchedExperts
+    if prep_fin.activation_format != batched_format:
+        experts = experts_cls(moe_config=moe_config, quant_config=moe_quant_config)
+    else:
+        # Batched experts also take the per-rank token cap and dispatcher count.
+        tokens_per_rank = prep_fin.max_num_tokens_per_rank()
+        assert tokens_per_rank is not None
+        experts = experts_cls(
+            moe_config=moe_config,
+            quant_config=moe_quant_config,
+            max_num_tokens=tokens_per_rank,
+            num_dispatchers=prep_fin.num_dispatchers(),
+        )
+
+    # Kernel assembly.
+    parallel_config = moe_config.moe_parallel_config
+    kernel_shared_experts = (
+        shared_experts if parallel_config.use_deepep_ll_kernels else None
+    )
+    inplace = not moe_config.disable_inplace and mxfp4_backend not in TRTLLM_BACKENDS
+    return mk.FusedMoEKernel(
+        prep_fin,
+        experts,
+        shared_experts=kernel_shared_experts,
+        moe_parallel_config=parallel_config,
+        inplace=inplace,
+    )
diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
index 8a224cb39e7c..35451e87dd7d 100644
--- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
+++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py
@@ -14,7 +14,6 @@
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
-    mxfp4_w4a16_moe_quant_config,
     nvfp4_moe_quant_config,
     nvfp4_w4a16_moe_quant_config,
 )
@@ -87,7 +86,7 @@ def backend_to_kernel_cls(
         return [FlashInferExperts]
 
     elif backend == NvFp4MoeBackend.FLASHINFER_CUTEDSL:
-        from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import (
+        from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutedsl_moe import (  # noqa: E501
             FlashInferCuteDSLExperts,
         )
 
@@ -347,16 +346,6 @@ def convert_to_nvfp4_moe_kernel_format(
     )
 
 
-def make_mxfp4_moe_quant_config(
-    w13_scale: torch.Tensor,
-    w2_scale: torch.Tensor,
-) -> FusedMoEQuantConfig:
-    return mxfp4_w4a16_moe_quant_config(
-        w1_scale=w13_scale,
-        w2_scale=w2_scale,
-    )
-
-
 def make_nvfp4_moe_quant_config(
     backend: NvFp4MoeBackend,
     w13_scale: torch.Tensor,
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index b9f161ae88ec..98af53fcec58 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -24,6 +24,7 @@
     kFp8Static128BlockSym,
     kFp8StaticChannelSym,
     kFp8StaticTensorSym,
+    kMxfp4Static,
 )
 
 
@@ -201,6 +202,8 @@ def rocm_aiter_fused_experts(
         activation_method = ActivationMethod.SILU
     elif activation == MoEActivation.GELU:
         activation_method = ActivationMethod.GELU
+    elif activation == MoEActivation.SWIGLUOAI:
+        activation_method = rocm_aiter_ops.get_aiter_activation_type("swiglu")
     else:
         raise ValueError(f"Unsupported activation: {activation}")
 
@@ -247,8 +250,8 @@ def rocm_aiter_fused_experts(
 
     else:
         quant_method = QuantMethod.NO.value
-        # quark moe for mxfp4 w_dtype mxfp4 a_dtype
-        if quant_config.use_mxfp4_w4a4:
+        # mxfp4: both w4a4 (quark) and w4a16 (oracle CK) use BLOCK_1X32
+        if quant_config.use_mxfp4_w4a4 or quant_config.use_mxfp4_w4a16:
             quant_method = QuantMethod.BLOCK_1X32.value
         # w8a8 block-scaled
         if quant_config.block_shape is not None and quant_config.use_fp8_w8a8:
@@ -289,6 +292,8 @@ def rocm_aiter_fused_experts(
             doweight_stage1=apply_router_weight_on_input,
             num_local_tokens=num_local_tokens,
             output_dtype=output_dtype,
+            bias1=quant_config.w1_bias if quant_config.use_mxfp4_w4a16 else None,
+            bias2=quant_config.w2_bias if quant_config.use_mxfp4_w4a16 else None,
         )
 
 
@@ -319,21 +324,23 @@ def _supports_quant_scheme(
         weight_key: QuantKey | None,
         activation_key: QuantKey | None,
     ) -> bool:
-        # TODO(rob): AITER also supports MXFP4, which is not
-        # yet supported via an Oracle. Once it is, we will add
-        # MXFP4 to this list.
         SUPPORTED_W_A = [
             (None, None),
             (kFp8Static128BlockSym, kFp8Dynamic128Sym),
             (kFp8StaticTensorSym, kFp8StaticTensorSym),
             (kFp8StaticTensorSym, kFp8DynamicTensorSym),
             (kFp8StaticChannelSym, kFp8DynamicTokenSym),
+            (kMxfp4Static, None),
         ]
         return (weight_key, activation_key) in SUPPORTED_W_A
 
     @staticmethod
     def _supports_activation(activation: MoEActivation) -> bool:
-        return activation in [MoEActivation.SILU, MoEActivation.GELU]
+        return activation in [
+            MoEActivation.SILU,
+            MoEActivation.GELU,
+            MoEActivation.SWIGLUOAI,
+        ]
 
     @staticmethod
     def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py
deleted file mode 100644
index 30ed77a8b64b..000000000000
--- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import torch
-
-import vllm.model_executor.layers.fused_moe.modular_kernel as mk
-from vllm.model_executor.layers.fused_moe.activation import MoEActivation
-from vllm.model_executor.layers.fused_moe.config import (
-    FusedMoEConfig,
-    FusedMoEParallelConfig,
-    FusedMoEQuantConfig,
-)
-from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import (
-    TopKWeightAndReduceNoOP,
-)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    QuantKey,
-)
-
-
-class TrtLlmGenExperts(mk.FusedMoEExpertsModular):
-    """TensorRT-LLM-based fused MoE expert implementation."""
-
-    def __init__(
-        self,
-        moe_config: FusedMoEConfig,
-        quant_config: FusedMoEQuantConfig,
-        max_capture_size,
-    ):
-        super().__init__(moe_config, quant_config)
-        self.device = torch.accelerator.current_device_index()
-        self.num_experts = moe_config.num_local_experts
-        self.gemm1_alpha = torch.tensor(
-            [1.702] * self.num_experts, dtype=torch.float32, device=self.device
-        )
-        self.gemm1_beta = torch.tensor(
-            [1.0] * self.num_experts, dtype=torch.float32, device=self.device
-        )
-        self.gemm1_clamp_limit = torch.tensor(
-            [7.0] * self.num_experts, dtype=torch.float32, device=self.device
-        )
-        self.max_capture_size = max_capture_size
-
-    @staticmethod
-    def activation_format() -> mk.FusedMoEActivationFormat:
-        return mk.FusedMoEActivationFormat.Standard
-
-    @staticmethod
-    def _supports_current_device() -> bool:
-        raise NotImplementedError(
-            "TrtLlmGenExperts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
-
-    @staticmethod
-    def _supports_no_act_and_mul() -> bool:
-        raise NotImplementedError(
-            "TrtLlmGenExperts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
-
-    @staticmethod
-    def _supports_quant_scheme(
-        weight_key: QuantKey | None,
-        activation_key: QuantKey | None,
-    ) -> bool:
-        raise NotImplementedError(
-            "TrtLlmGenExperts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
-
-    @staticmethod
-    def _supports_activation(activation: MoEActivation) -> bool:
-        raise NotImplementedError(
-            "TrtLlmGenExperts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
-
-    @staticmethod
-    def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool:
-        raise NotImplementedError(
-            "TrtLlmGenExperts is not yet used by an Oracle. "
-            "This method should not be called."
-        )
-
-    def supports_expert_map(self) -> bool:
-        return True
-
-    def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce:
-        return TopKWeightAndReduceNoOP()
-
-    def workspace_shapes(
-        self,
-        M: int,
-        N: int,
-        K: int,
-        topk: int,
-        global_num_experts: int,
-        local_num_experts: int,
-        expert_tokens_meta: mk.ExpertTokensMetadata | None,
-        activation: MoEActivation,
-    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
-        # The workspaces for this implementation are managed by flashinfer.
-        workspace1 = (0,)
-        workspace2 = (0,)
-        output = (M, K)
-        return (workspace1, workspace2, output)
-
-    def apply(
-        self,
-        output: torch.Tensor,
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        activation: MoEActivation,
-        global_num_experts: int,
-        expert_map: torch.Tensor | None,
-        a1q_scale: torch.Tensor | None,
-        a2_scale: torch.Tensor | None,
-        workspace13: torch.Tensor,
-        workspace2: torch.Tensor,
-        expert_tokens_meta: mk.ExpertTokensMetadata | None,
-        apply_router_weight_on_input: bool,
-    ):
-        topk = topk_ids.size(-1)
-        local_num_experts = w1.size(0)
-        intermediate_size = w2.size(1)
-        local_expert_offset = self.moe_config.ep_rank * local_num_experts
-
-        x_quant = hidden_states
-        x_scale = a1q_scale
-        if x_scale is not None:
-            x_scale = x_scale.view(torch.float8_e4m3fn).reshape(*x_quant.shape[:-1], -1)
-
-        packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to(
-            torch.bfloat16
-        ).view(torch.int16)
-
-        assert self.w1_scale is not None
-        assert self.w2_scale is not None
-        kwargs = {
-            "topk_ids": packed_tensor,
-            "routing_bias": None,
-            "hidden_states": x_quant,
-            "hidden_states_scale": x_scale,
-            "gemm1_weights": w1,
-            "gemm1_weights_scale": self.w1_scale,
-            "gemm1_bias": self.w1_bias,
-            "gemm1_alpha": self.gemm1_alpha,
-            "gemm1_beta": self.gemm1_beta,
-            "gemm1_clamp_limit": self.gemm1_clamp_limit,
-            "gemm2_weights": w2,
-            "gemm2_weights_scale": self.w2_scale,
-            "gemm2_bias": self.w2_bias,
-            "output1_scale_scalar": None,
-            "output1_scale_gate_scalar": None,
-            "output2_scale_scalar": None,
-            "num_experts": global_num_experts,
-            "top_k": topk,
-            "n_group": None,
-            "topk_group": None,
-            "intermediate_size": intermediate_size,
-            "local_expert_offset": local_expert_offset,
-            "local_num_experts": local_num_experts,
-            "routed_scaling_factor": None,
-            "routing_method_type": 1,
-            "do_finalize": True,
-            "output": output,
-            "tune_max_num_tokens": max(self.max_capture_size, 1),
-        }
-
-        from flashinfer import trtllm_fp4_block_scale_routed_moe
-
-        from vllm.utils.flashinfer import autotune
-
-        with autotune(False):
-            # Enable autotune when,
-            # https://github.com/flashinfer-ai/flashinfer/issues/2023 is
-            # resolved.
-            trtllm_fp4_block_scale_routed_moe(**kwargs)
-
-        return output
diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py
index 4adb7f1cfa0e..ba4494f6cdc3 100644
--- a/vllm/model_executor/layers/fused_moe/utils.py
+++ b/vllm/model_executor/layers/fused_moe/utils.py
@@ -25,6 +25,7 @@
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     per_tensor_dequantize,
 )
+from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
 from vllm.utils.math_utils import cdiv
 from vllm.utils.torch_utils import is_torch_equal_or_newer
@@ -265,7 +266,7 @@ def moe_kernel_quantize_input(
         # weights are already dequantized, and we proceed with normal
         # activation quantization below.
 
-    if quant_dtype == torch.float8_e4m3fn:
+    if quant_dtype == current_platform.fp8_dtype():
         return _fp8_quantize(A, A_scale, per_act_token_quant, block_shape)
     elif quant_dtype == torch.int8:
         return _int8_quantize(A, A_scale, per_act_token_quant, block_shape)
@@ -316,27 +317,6 @@ def normalize_batched_scales_shape(
     return scales
 
 
-def _validate_scale_shape(
-    a: torch.Tensor,
-    a_scale: torch.Tensor | None,
-    per_act_token_quant: bool,
-    block_shape: list[int] | None,
-) -> None:
-    if a_scale is None:
-        return
-
-    if not per_act_token_quant and block_shape is None:
-        assert a_scale.numel() == 1, f"{a_scale.shape}"
-    elif per_act_token_quant:
-        assert a_scale.shape[0] == a.shape[0] and a_scale.shape[1] == 1, (
-            f"{a_scale.shape[0]} == {a.shape[0]} and {a_scale.shape[1]} == 1"
-        )
-    else:
-        assert block_shape is not None
-        expected = (a.shape[0], cdiv(a.shape[1], block_shape[1]))
-        assert a_scale.shape == expected, f"{a_scale.shape} == {expected}"
-
-
 # Torch custom ops can't deal with outputs aliasing inputs so we need to
 # disable inplace for torch >= 2.9.
 # See https://github.com/vllm-project/vllm/issues/26378
diff --git a/vllm/model_executor/layers/pooler/activations.py b/vllm/model_executor/layers/pooler/activations.py
index b57e6ba68b94..4213ee7b85cb 100644
--- a/vllm/model_executor/layers/pooler/activations.py
+++ b/vllm/model_executor/layers/pooler/activations.py
@@ -16,25 +16,22 @@
 logger = init_logger(__name__)
 
 
-def get_classification_act_fn(
+def get_act_fn(
     config: PretrainedConfig,
+    static_num_labels: bool = True,
 ) -> "PoolerActivation":
+    # get classification act_fn
     # Implement alignment with transformers ForSequenceClassificationLoss
     # https://github.com/huggingface/transformers/blob/57bb6db6ee4cfaccc45b8d474dfad5a17811ca60/src/transformers/loss/loss_utils.py#L92
     problem_type = getattr(config, "problem_type", "")
     if problem_type == "regression":
         return PoolerIdentity()
     if problem_type == "single_label_classification":
-        return PoolerClassify()
+        return PoolerClassify(static_num_labels=static_num_labels)
     if problem_type == "multi_label_classification":
         return PoolerMultiLabelClassify()
 
-    return PoolerClassify()
-
-
-def get_cross_encoder_act_fn(
-    config: PretrainedConfig,
-) -> "PoolerActivation":
+    # get cross_encoder act_fn
     function_name: str | None = None
     if (
         hasattr(config, "sentence_transformers")
@@ -55,24 +52,16 @@ def get_cross_encoder_act_fn(
         fn = resolve_obj_by_qualname(function_name)()
         return PoolerActivation.wraps(fn)
 
-    return PoolerClassify()
+    return PoolerClassify(static_num_labels=static_num_labels)
 
 
 def resolve_classifier_act_fn(
     model_config: ModelConfig,
     static_num_labels: bool = True,
-    act_fn: "PoolerActivation | str | None" = None,
+    act_fn: "PoolerActivation | None" = None,
 ):
-    if isinstance(act_fn, str):
-        if act_fn == "classify":
-            return get_classification_act_fn(model_config.hf_config)
-        if act_fn == "score":
-            return get_cross_encoder_act_fn(model_config.hf_config)
-
-        raise ValueError(f"act_fn [{act_fn=}] not supported.")
-
     if act_fn is None:
-        return PoolerClassify(static_num_labels=static_num_labels)
+        return get_act_fn(model_config.hf_config, static_num_labels)
 
     assert callable(act_fn)
     return act_fn
@@ -97,9 +86,8 @@ def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor:
 
     def forward(self, pooled_data: _T) -> _T:
         # shape:
-        # classify (& score) -> (batch_size, num_classes)
-        # embed -> (batch_size, embedding_dim) or list(embedding_dim)
-        #          (batch_size, dimensions) or list(dimensions) if using MRL
+        # classify -> (batch_size, num_classes)
+        # embed -> (batch_size, embedding_size) or list(embedding_size)
         if isinstance(pooled_data, list):
             return [self.forward_chunk(data) for data in pooled_data]
 
diff --git a/vllm/model_executor/layers/pooler/seqwise/heads.py b/vllm/model_executor/layers/pooler/seqwise/heads.py
index 42059284e5cd..31a961223927 100644
--- a/vllm/model_executor/layers/pooler/seqwise/heads.py
+++ b/vllm/model_executor/layers/pooler/seqwise/heads.py
@@ -56,29 +56,31 @@ def forward(
 
         if isinstance(pooled_data, list):
             pooled_data = torch.stack(pooled_data)
-        # pooled_data shape: [batchsize, hidden_dimension]
+        # pooled_data shape: [batchsize, hidden_size]
 
         if self.head_dtype is not None:
             pooled_data = pooled_data.to(self.head_dtype)
 
         # Apply ST projector
         if self.projector is not None:
-            pooled_data = self.projector(pooled_data)
-        # pooled_data shape: [batchsize, embedding_dimension]
+            embeddings = self.projector(pooled_data)
+        else:
+            embeddings = pooled_data
+        # embeddings shape: [batchsize, embedding_size]
 
         # for matryoshka representation
         dimensions_list = [pooling_param.dimensions for pooling_param in pooling_params]
         if any(d is not None for d in dimensions_list):
             # change the output dimension
-            assert len(pooled_data) == len(dimensions_list)
-            if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list):
+            assert len(embeddings) == len(dimensions_list)
+            if len(set(dimensions_list)) == 1 and not isinstance(embeddings, list):
                 # if all dimensions are the same
                 d = dimensions_list[0]
-                pooled_data = pooled_data[..., :d]
+                embeddings = embeddings[..., :d]
             else:
-                pooled_data = [
+                embeddings = [
                     vecs if d is None else vecs[..., :d]
-                    for vecs, d in zip(pooled_data, dimensions_list)
+                    for vecs, d in zip(embeddings, dimensions_list)
                 ]
 
         # for normalize
@@ -86,15 +88,15 @@ def forward(
             flags = [p.use_activation for p in pooling_params]
             if len(set(flags)) == 1:
                 if flags[0]:
-                    pooled_data = self.activation(pooled_data)
+                    embeddings = self.activation(embeddings)
             else:
-                pooled_data = [
+                embeddings = [
                     self.activation(vecs) if f else vecs
-                    for vecs, f in zip(pooled_data, flags)
+                    for vecs, f in zip(embeddings, flags)
                 ]
 
-        # pooled_data shape: [batchsize, embedding_dimension]
-        return pooled_data
+        # embeddings shape: [batchsize, embedding_size]
+        return embeddings
 
 
 class ClassifierPoolerHead(SequencePoolerHead):
@@ -113,7 +115,7 @@ def __init__(
         self.activation = activation
 
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"classify", "score"}
+        return {"classify"}
 
     def forward(
         self,
@@ -131,21 +133,23 @@ def forward(
             pooled_data = pooled_data.to(self.head_dtype)
 
         if self.classifier is not None:
-            pooled_data = self.classifier(pooled_data)
-        # pooled_data shape: [batchsize, num_labels]
+            logits = self.classifier(pooled_data)
+        else:
+            logits = pooled_data
 
+        # logits shape: [batchsize, num_labels]
         if self.logit_bias is not None:
-            pooled_data -= self.logit_bias
+            logits -= self.logit_bias
 
         if self.activation is not None:
             flags = [p.use_activation for p in pooling_params]
             if len(set(flags)) == 1:
-                pooled_data = self.activation(pooled_data) if flags[0] else pooled_data
+                logits = self.activation(logits) if flags[0] else logits
             else:
-                pooled_data = [
+                logits = [
                     self.activation(vecs) if f else vecs
-                    for vecs, f in zip(pooled_data, flags)
+                    for vecs, f in zip(logits, flags)
                 ]
 
-        # pooled_data shape: [batchsize, num_labels]
-        return pooled_data
+        # logits shape: [batchsize, num_labels]
+        return logits
diff --git a/vllm/model_executor/layers/pooler/seqwise/methods.py b/vllm/model_executor/layers/pooler/seqwise/methods.py
index 5d8551095096..f3c7f29d6092 100644
--- a/vllm/model_executor/layers/pooler/seqwise/methods.py
+++ b/vllm/model_executor/layers/pooler/seqwise/methods.py
@@ -17,7 +17,7 @@
 
 class SequencePoolingMethod(nn.Module, ABC):
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"token_embed", "token_classify", "embed", "classify", "score"}
+        return {"token_embed", "token_classify", "embed", "classify"}
 
     def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
         return PoolingParamsUpdate()
diff --git a/vllm/model_executor/layers/pooler/seqwise/poolers.py b/vllm/model_executor/layers/pooler/seqwise/poolers.py
index 8bf3e25e66b6..f46834a7c3f2 100644
--- a/vllm/model_executor/layers/pooler/seqwise/poolers.py
+++ b/vllm/model_executor/layers/pooler/seqwise/poolers.py
@@ -108,7 +108,7 @@ def pooler_for_classify(
     *,
     pooling: SequencePoolingMethod | SequencePoolingFn | None = None,
     classifier: ClassifierFn | None = None,
-    act_fn: PoolerActivation | str | None = None,
+    act_fn: PoolerActivation | None = None,
 ):
     if pooling is None:
         pooling = get_seq_pooling_method(pooler_config.get_seq_pooling_type())
diff --git a/vllm/model_executor/layers/pooler/special.py b/vllm/model_executor/layers/pooler/special.py
index 5e0f9ec75597..686072632685 100644
--- a/vllm/model_executor/layers/pooler/special.py
+++ b/vllm/model_executor/layers/pooler/special.py
@@ -52,13 +52,6 @@ def for_seq_cls(
                     pooler_config,
                     pooling=pooling,
                     classifier=classifier,
-                    act_fn="classify",
-                ),
-                "score": pooler_for_classify(
-                    pooler_config,
-                    pooling=pooling,
-                    classifier=classifier,
-                    act_fn="score",
                 ),
             }
         )
@@ -115,7 +108,7 @@ def extra_repr(self) -> str:
 
 class IdentityPooler(Pooler):
     def get_supported_tasks(self) -> Set[PoolingTask]:
-        return {"plugin", "score"}
+        return {"plugin"}
 
     def forward(
         self,
diff --git a/vllm/model_executor/layers/pooler/tokwise/heads.py b/vllm/model_executor/layers/pooler/tokwise/heads.py
index 4183f5b1ba25..80c5c831fa08 100644
--- a/vllm/model_executor/layers/pooler/tokwise/heads.py
+++ b/vllm/model_executor/layers/pooler/tokwise/heads.py
@@ -68,22 +68,24 @@ def forward_chunk(
 
         if self.head_dtype is not None:
             pooled_data = pooled_data.to(self.head_dtype)
-        # pooled_data shape: [n_tokens, hidden_dimension]
+        # pooled_data shape: [n_tokens, hidden_size]
 
         # Apply ST projector
         if self.projector is not None:
-            pooled_data = self.projector(pooled_data)
-        # pooled_data shape: [n_tokens, embedding_dimension]
+            embeddings = self.projector(pooled_data)
+        else:
+            embeddings = pooled_data
+        # embeddings shape: [n_tokens, embedding_size]
 
         # for matryoshka representation
-        pooled_data = pooled_data[..., : pooling_param.dimensions]
+        embeddings = embeddings[..., : pooling_param.dimensions]
 
         # for normalize
         if self.activation is not None and pooling_param.use_activation:
-            pooled_data = self.activation(pooled_data)
+            embeddings = self.activation(embeddings)
 
-        # pooled_data shape: [n_tokens, embedding_dimension]
-        return pooled_data
+        # embeddings shape: [n_tokens, embedding_size]
+        return embeddings
 
 
 class TokenClassifierPoolerHead(TokenPoolerHead):
@@ -118,16 +120,16 @@ def forward_chunk(
         # hidden_states shape: [n_token, hidden_size]
 
         if self.classifier is not None:
-            scores = self.classifier(pooled_data)
+            logits = self.classifier(pooled_data)
         else:
-            scores = pooled_data
-        # scores shape: [n_token, num_labels]
+            logits = pooled_data
+        # logits shape: [n_token, num_labels]
 
         if self.logit_bias is not None:
-            scores -= self.logit_bias
+            logits -= self.logit_bias
 
         if self.activation is not None and pooling_param.use_activation:
-            scores = self.activation(scores)
+            logits = self.activation(logits)
 
-        # scores shape: [n_token, num_labels]
-        return scores
+        # logits shape: [n_token, num_labels]
+        return logits
diff --git a/vllm/model_executor/layers/pooler/tokwise/poolers.py b/vllm/model_executor/layers/pooler/tokwise/poolers.py
index 996f20d98cc9..c56970fcabaa 100644
--- a/vllm/model_executor/layers/pooler/tokwise/poolers.py
+++ b/vllm/model_executor/layers/pooler/tokwise/poolers.py
@@ -116,7 +116,7 @@ def pooler_for_token_classify(
     *,
     pooling: TokenPoolingMethod | TokenPoolingFn | None = None,
     classifier: ClassifierFn | None = None,
-    act_fn: PoolerActivation | str | None = None,
+    act_fn: PoolerActivation | None = None,
 ):
     if pooling is None:
         pooling = get_tok_pooling_method(pooler_config.get_tok_pooling_type())
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index e08a6456aba7..9aceb3be054d 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -12,7 +12,6 @@
 QuantizationMethods = Literal[
     "awq",
     "fp8",
-    "ptpc_fp8",
     "fbgemm_fp8",
     "fp_quant",
     "modelopt",
@@ -39,7 +38,6 @@
 
 DEPRECATED_QUANTIZATION_METHODS = [
     "tpu_int8",
-    "ptpc_fp8",
     "fbgemm_fp8",
     "fp_quant",
     "experts_int8",
@@ -132,7 +130,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
     from .mxfp4 import Mxfp4Config
     from .mxfp8 import Mxfp8Config
     from .petit import PetitNvFp4Config
-    from .ptpc_fp8 import PTPCFp8Config
     from .torchao import TorchAOConfig
 
     method_to_config: dict[str, type[QuantizationConfig]] = {
@@ -150,7 +147,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "gptq": GPTQConfig,
         "compressed-tensors": CompressedTensorsConfig,
         "bitsandbytes": BitsAndBytesConfig,
-        "ptpc_fp8": PTPCFp8Config,
         "experts_int8": ExpertsInt8Config,
         "quark": QuarkConfig,
         "moe_wna16": MoeWNA16Config,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 29115fbbc255..5e14d1712aec 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -45,11 +45,14 @@
     make_fp8_moe_quant_config,
     select_fp8_moe_backend,
 )
+from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import (
+    Mxfp4MoeBackend,
+    make_mxfp4_moe_kernel,
+    make_mxfp4_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import (
-    NvFp4MoeBackend,
     convert_to_nvfp4_moe_kernel_format,
     is_global_sf_supported_for_nvfp4_backend,
-    make_mxfp4_moe_quant_config,
     make_nvfp4_moe_kernel,
     make_nvfp4_moe_quant_config,
     select_nvfp4_moe_backend,
@@ -235,7 +238,7 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod):
     def __init__(self, moe):
         super().__init__(moe)
         self.group_size = 32
-        self.mxfp4_backend = NvFp4MoeBackend.MARLIN
+        self.mxfp4_backend = Mxfp4MoeBackend.MARLIN
         self.experts_cls = MarlinExperts
 
     def create_weights(
@@ -310,7 +313,9 @@ def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
         return make_mxfp4_moe_quant_config(
-            w13_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale
+            mxfp4_backend=self.mxfp4_backend,
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
         )
 
     def process_weights_after_loading(self, layer: FusedMoE) -> None:
@@ -334,10 +339,11 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None:
 
         self.moe_quant_config = self.get_fused_moe_quant_config(layer)
         if self.moe_quant_config is not None:
-            self.moe_kernel = make_nvfp4_moe_kernel(
+            self.moe_kernel = make_mxfp4_moe_kernel(
                 moe_quant_config=self.moe_quant_config,
                 moe_config=self.moe,
                 experts_cls=self.experts_cls,
+                mxfp4_backend=self.mxfp4_backend,
                 shared_experts=layer.shared_experts,
                 routing_tables=layer._maybe_init_expert_routing_tables(),
             )
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index f992d0f86c4e..22077be8a44b 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -1,12 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from enum import Enum
 
 import torch
-from torch.nn.parameter import Parameter
 
-from vllm import envs
-from vllm._aiter_ops import rocm_aiter_ops
 from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention import Attention
@@ -17,173 +13,31 @@
     MoEActivation,
 )
 from vllm.model_executor.layers.fused_moe import modular_kernel as mk
-from vllm.model_executor.layers.fused_moe.all2all_utils import (
-    maybe_make_prepare_finalize,
-)
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEQuantConfig,
-    mxfp4_mxfp8_moe_quant_config,
-    mxfp4_w4a16_moe_quant_config,
-    ocp_mx_moe_quant_config,
-)
-from vllm.model_executor.layers.fused_moe.fused_marlin_moe import (
-    BatchedMarlinExperts,
-    MarlinExperts,
 )
-from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (
-    OAITritonExperts,
-    UnfusedOAITritonExperts,
+from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import (
+    TRITON_BACKENDS,
+    Mxfp4MoeBackend,
+    convert_to_mxfp4_moe_kernel_format,
+    make_mxfp4_moe_kernel,
+    make_mxfp4_moe_quant_config,
+    mxfp4_round_up_hidden_size_and_intermediate_size,
+    select_mxfp4_moe_backend,
 )
-from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts
 from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from vllm.model_executor.layers.quantization.utils.marlin_utils import (
-    get_marlin_input_dtype,
-)
-from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
-    prepare_moe_fp4_layer_for_marlin,
-)
-from vllm.model_executor.layers.quantization.utils.mxfp4_utils import (
-    CK_MXFP4_MOE_DIM_ALIGNMENT,
-    _can_support_mxfp4,
-    _swizzle_mxfp4,
-    get_padding_alignment,
-)
 from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped
-from vllm.model_executor.utils import set_weight_attrs
+from vllm.model_executor.utils import replace_parameter, set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils.flashinfer import has_flashinfer
-from vllm.utils.import_utils import has_triton_kernels
-from vllm.utils.math_utils import round_up
 
 logger = init_logger(__name__)
 
 
-# enum for mxfp4 backend
-class Mxfp4Backend(Enum):
-    NONE = 0
-
-    # FlashInfer Backend
-    SM100_FI_MXFP4_MXFP8_TRTLLM = 1
-    SM100_FI_MXFP4_MXFP8_CUTLASS = 2
-    SM100_FI_MXFP4_BF16 = 3
-    SM90_FI_MXFP4_BF16 = 4
-
-    # Marlin Backend
-    MARLIN = 5
-
-    # Triton Backend
-    TRITON = 6
-
-    CK = 7
-
-
-def get_mxfp4_backend_with_lora() -> Mxfp4Backend:
-    """
-    Not all MXFP4 backends support LoRA. Select backends that are known to
-    have LoRA support.
-    """
-    if not current_platform.is_cuda():
-        return Mxfp4Backend.NONE
-
-    # If FlashInfer is not available, try either Marlin or Triton
-    triton_kernels_supported = (
-        has_triton_kernels()
-        # NOTE: triton_kernels are only confirmed to work on SM90 and SM100
-        # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317
-        # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498
-        and (9, 0) <= current_platform.get_device_capability() < (11, 0)
-    )
-    if envs.VLLM_MXFP4_USE_MARLIN is False and triton_kernels_supported:
-        logger.info_once("[get_mxfp4_backend_with_lora] Using Triton backend")
-        return Mxfp4Backend.TRITON
-
-    logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend")
-    return Mxfp4Backend.MARLIN
-
-
-def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend:
-    # Backend Selection
-
-    if with_lora_support:
-        return get_mxfp4_backend_with_lora()
-
-    if current_platform.is_cuda():
-        if (
-            current_platform.is_device_capability(90)
-            and has_flashinfer()
-            and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16
-        ):
-            logger.info_once("Using FlashInfer MXFP4 BF16 backend for SM90")
-            return Mxfp4Backend.SM90_FI_MXFP4_BF16
-        elif (
-            current_platform.is_device_capability_family(100)
-            and has_flashinfer()
-            and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS
-        ):
-            logger.info_once("Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100")
-            return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
-        elif (
-            current_platform.is_device_capability_family(100)
-            and has_flashinfer()
-            and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8
-        ):
-            logger.info_once(
-                "Using FlashInfer MXFP4 MXFP8 TRTLLM backend for SM100", scope="local"
-            )
-            return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-        elif current_platform.is_device_capability_family(100) and has_flashinfer():
-            logger.info_once(
-                "Using FlashInfer MXFP4 BF16 backend for SM100, "
-                "For faster performance on SM100, consider setting "
-                "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, though this may impact "
-                "accuracy."
-            )
-            return Mxfp4Backend.SM100_FI_MXFP4_BF16
-        elif (
-            current_platform.is_device_capability_family(100)
-            or current_platform.is_device_capability(90)
-        ) and not has_flashinfer():
-            logger.warning_once(
-                "MXFP4 MoE is enabled on Hopper/Blackwell but FlashInfer "
-                "is not available. This may result in degraded performance. "
-                "Please `pip install vllm[flashinfer]` for best results."
-            )
-
-        # If FlashInfer is not available, try either Marlin or Triton
-        triton_kernels_supported = (
-            has_triton_kernels()
-            # NOTE: triton_kernels are only confirmed to work on SM90 and SM100
-            # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317
-            # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498
-            and (9, 0) <= current_platform.get_device_capability() < (11, 0)
-        )
-        if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported:
-            logger.info_once("Using Marlin backend")
-            return Mxfp4Backend.MARLIN
-        else:
-            logger.info_once("Using Triton backend")
-            return Mxfp4Backend.TRITON
-    elif current_platform.is_xpu():
-        logger.info_once("Using xpu backend on XPU")
-        return Mxfp4Backend.MARLIN
-    elif current_platform.is_rocm():
-        from vllm.platforms.rocm import on_gfx950
-
-        if rocm_aiter_ops.is_enabled() and on_gfx950():
-            logger.info_once("Using CK MXFP4 MoE backend (Aiter ROCm)")
-            return Mxfp4Backend.CK
-        elif has_triton_kernels():
-            logger.info_once("Using Triton backend")
-            return Mxfp4Backend.TRITON
-
-    return Mxfp4Backend.NONE
-
-
 class Mxfp4Config(QuantizationConfig):
     def __init__(self, ignored_layers: list[str] | None = None):
         super().__init__()
@@ -219,9 +73,6 @@ def get_quant_method(
                 fused_mapping=self.packed_modules_mapping,
             ):
                 return UnquantizedLinearMethod()
-            # TODO: Add support for MXFP4 Linear Method.
-            # MXFP4 LinearMethod is available in AMD-Quark, refer to that implementation
-            # if you are interested in enabling MXFP4 here.
             logger.debug_once(
                 "MXFP4 linear layer is not implemented - falling back to "
                 "UnquantizedLinearMethod.",
@@ -232,10 +83,8 @@ def get_quant_method(
             if current_platform.is_xpu():
                 return XpuMxfp4MoEMethod(layer.moe_config)
             else:
-                quant_method = Mxfp4MoEMethod(layer.moe_config)
-                return quant_method
+                return Mxfp4MoEMethod(layer.moe_config)
         elif isinstance(layer, Attention):
-            # TODO: Add support for MXFP4 Attention.
             logger.debug_once(
                 "MXFP4 attention layer is not implemented. "
                 "Skipping quantization for this layer.",
@@ -254,51 +103,36 @@ class Mxfp4MoEMethod(FusedMoEMethodBase):
     def __init__(self, moe: FusedMoEConfig):
         super().__init__(moe)
         self.weight_dtype = "mxfp4"
-        self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
+        self.mxfp4_backend, self.experts_cls = select_mxfp4_moe_backend(moe)
 
         self.max_capture_size = (
             get_current_vllm_config().compilation_config.max_cudagraph_capture_size
         )
 
-        # CK's pre-compiled MXFP4 MoE GEMM kernel instances have dimension
-        # alignment requirements. Fall back to Triton when not met.
-        if (
-            self.mxfp4_backend == Mxfp4Backend.CK
-            and moe.intermediate_size_per_partition % CK_MXFP4_MOE_DIM_ALIGNMENT != 0
-        ):
-            if has_triton_kernels():
-                logger.warning_once(
-                    "CK MXFP4 MoE GEMM does not support "
-                    "intermediate_size_per_partition=%d (not a multiple of "
-                    "%d). Falling back to Triton backend.",
-                    moe.intermediate_size_per_partition,
-                    CK_MXFP4_MOE_DIM_ALIGNMENT,
-                )
-                self.mxfp4_backend = Mxfp4Backend.TRITON
-            else:
-                raise ValueError(
-                    f"CK MXFP4 MoE GEMM does not support "
-                    f"intermediate_size_per_partition="
-                    f"{moe.intermediate_size_per_partition} (not a multiple "
-                    f"of {CK_MXFP4_MOE_DIM_ALIGNMENT}) and no Triton "
-                    f"fallback is available. Use a compatible "
-                    f"tensor_parallel_size."
-                )
-
-        assert self.mxfp4_backend != Mxfp4Backend.NONE, (
-            f"get_mxfp4_backend(with_lora_support={moe.is_lora_enabled}) found"
-            "no compatible MXFP4 MoE backend (FlashInfer/Marlin/Triton)."
-            "Please check your environment and try again."
-        )
         self._cache_permute_indices: dict[torch.Size, torch.Tensor] = {}
-        # Initialized in process_weights_after_loading for CUTLASS/SM90 backends
         self.moe_kernel: mk.FusedMoEKernel | None = None
 
+        # Round up dims once based on backend. This mutates the shared
+        # FusedMoEConfig in-place so that create_weights() and all
+        # downstream code see the padded dimensions. This must happen
+        # before create_weights() is called.
+        self.moe.hidden_dim, self.moe.intermediate_size_per_partition = (
+            mxfp4_round_up_hidden_size_and_intermediate_size(
+                self.mxfp4_backend,
+                self.moe.hidden_dim,
+                self.moe.intermediate_size_per_partition,
+            )
+        )
+
+        # Used for triton kernel precision configs
+        self.w13_precision_config = None
+        self.w2_precision_config = None
+
     @property
     def skip_forward_padding(self) -> bool:
         # SM100_FI_MXFP4_MXFP8_TRTLLM supports padding with mxfp8 quant
         # so can skip the padding in the forward before applying the moe method
-        return self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
+        return self.mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8
 
     def create_weights(
         self,
@@ -312,77 +146,14 @@ def create_weights(
         self.num_experts = num_experts
         weight_dtype = torch.uint8
         scale_dtype = torch.uint8
-
-        # FIXME (zyongye): ship after torch and safetensors support mxfp4
-        # is_torch_mxfp4_available = (
-        #     hasattr(torch, "float4_e2m1fn_x2") and
-        #     hasattr(torch, "float8_e8m0fnu"))
-        # if is_torch_mxfp4_available:
-        #     weight_dtype = torch.float4_e2m1fn_x2
-        #     scale_dtype = torch.float8_e8m0fnu
-
         mxfp4_block = 32
 
-        intermediate_size_per_partition_after_pad = intermediate_size_per_partition
-        if self.mxfp4_backend == Mxfp4Backend.MARLIN:
-            # The moe marlin kernel requires that for each linear
-            # n % 256 == 0 and k % 128 == 0.
-            # In gate_up_proj:
-            #    n = 2 * intermediate_size_per_partition_after_pad
-            #    k = hidden_size
-            # In down_proj
-            #    n = hidden_size
-            #    k = intermediate_size_per_partition_after_pad
-            intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, 128
-            )
-            if current_platform.is_xpu():
-                hidden_size = round_up(hidden_size, 128)
-            else:
-                hidden_size = round_up(hidden_size, 256)
-
-            layer.params_dtype = params_dtype
-            layer.num_experts = num_experts
-            layer.hidden_size = hidden_size
-            layer.intermediate_size_per_partition = (
-                intermediate_size_per_partition_after_pad
-            )
-        elif (
-            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-            or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
-        ):
-            # pad the intermediate size to be a multiple of 2 * mxfp4_block
-            # for to hold non-uniform sharded tensor as well as swizzling
-            # other padding to increase performance
-            intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, 256
-            )
-            hidden_size = round_up(hidden_size, 256)
-        elif (
-            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
-            or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
-        ):
-            intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, 128
-            )
-            hidden_size = round_up(hidden_size, 128)
-        elif current_platform.is_rocm():
-            pad_align = get_padding_alignment()
-            intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, pad_align
-            )
-            hidden_size = round_up(hidden_size, pad_align)
-        else:
-            intermediate_size_per_partition_after_pad = round_up(
-                intermediate_size_per_partition, 64
-            )
-
-        self.intermediate_size = intermediate_size_per_partition_after_pad
-        self.hidden_size = hidden_size
-        self.hidden_pad = extra_weight_attrs.get("hidden_pad", 0)
-        self.intermediate_pad = (
-            intermediate_size_per_partition_after_pad - intermediate_size_per_partition
+        # Use pre-rounded sizes from config
+        self.intermediate_size = intermediate_size_per_partition_after_pad = (
+            self.moe.intermediate_size_per_partition
         )
+        self.hidden_size = hidden_size = self.moe.hidden_dim
+
         # Fused gate_up_proj (column parallel)
         w13_weight = torch.nn.Parameter(
             torch.zeros(
@@ -408,17 +179,6 @@ def create_weights(
         layer.register_parameter("w13_weight_scale", w13_weight_scale)
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
 
-        w13_bias = torch.nn.Parameter(
-            torch.zeros(
-                num_experts,
-                2 * intermediate_size_per_partition_after_pad,
-                dtype=torch.bfloat16,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w13_bias", w13_bias)
-        set_weight_attrs(w13_bias, extra_weight_attrs)
-
         # down_proj (row parallel)
         w2_weight = torch.nn.Parameter(
             torch.zeros(
@@ -444,604 +204,170 @@ def create_weights(
         layer.register_parameter("w2_weight_scale", w2_weight_scale)
         set_weight_attrs(w2_weight_scale, extra_weight_attrs)
 
-        w2_bias = torch.nn.Parameter(
-            torch.zeros(
-                num_experts,
-                hidden_size,
-                dtype=torch.bfloat16,
-            ),
-            requires_grad=False,
-        )
-        layer.register_parameter("w2_bias", w2_bias)
-        set_weight_attrs(w2_bias, extra_weight_attrs)
-
-    def process_weights_after_loading(self, layer):
-        if self.mxfp4_backend == Mxfp4Backend.MARLIN:
-            prepare_moe_fp4_layer_for_marlin(
-                layer, input_dtype=get_marlin_input_dtype()
-            )
-
-            self.moe_quant_config = self.get_fused_moe_quant_config(layer)
-            assert self.moe_quant_config is not None
-
-            prepare_finalize = maybe_make_prepare_finalize(
-                moe=self.moe,
-                quant_config=self.moe_quant_config,
-                routing_tables=layer._maybe_init_expert_routing_tables(),
-                allow_new_interface=True,
-            )
-            assert prepare_finalize is not None
-
-            self.moe_kernel = mk.FusedMoEKernel(
-                prepare_finalize,
-                MarlinExperts(
-                    self.moe,
-                    self.moe_quant_config,
+        if self.moe.has_bias:
+            w13_bias = torch.nn.Parameter(
+                torch.zeros(
+                    num_experts,
+                    2 * intermediate_size_per_partition_after_pad,
+                    dtype=torch.bfloat16,
                 ),
-                inplace=not self.moe.disable_inplace,
-                shared_experts=None,
-            )
-        elif (
-            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-            or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
-        ):
-            from flashinfer.fp4_quantization import nvfp4_block_scale_interleave
-            from flashinfer.fused_moe.core import get_w2_permute_indices_with_cache
-
-            layer.gemm1_alpha = Parameter(
-                torch.tensor([1.702] * self.num_experts, dtype=torch.float32).cuda(),
                 requires_grad=False,
             )
-            layer.gemm1_beta = Parameter(
-                torch.tensor([1.0] * self.num_experts, dtype=torch.float32).cuda(),
-                requires_grad=False,
-            )
-            layer.gemm1_clamp_limit = Parameter(
-                torch.tensor([7.0] * self.num_experts, dtype=torch.float32).cuda(),
-                requires_grad=False,
-            )
-            sf_block_size = 32  # mxfp4 block size
-
-            assert (
-                layer.w13_weight.dim() == 3
-                and layer.w13_weight.shape[0] == self.num_experts
-                and layer.w13_weight.shape[1] == self.intermediate_size * 2
-                and layer.w13_weight.shape[2] == self.hidden_size // 2
-            )
-            assert (
-                layer.w13_weight_scale.dim() == 3
-                and layer.w13_weight_scale.shape[0] == self.num_experts
-                and layer.w13_weight_scale.shape[1] == self.intermediate_size * 2
-                and layer.w13_weight_scale.shape[2] == self.hidden_size // sf_block_size
-            )
-            assert (
-                layer.w2_weight.dim() == 3
-                and layer.w2_weight.shape[0] == self.num_experts
-                and layer.w2_weight.shape[1] == self.hidden_size
-                and layer.w2_weight.shape[2] == self.intermediate_size // 2
-            )
-            assert (
-                layer.w2_weight_scale.dim() == 3
-                and layer.w2_weight_scale.shape[1] == self.hidden_size
-                and layer.w2_weight_scale.shape[2]
-                == self.intermediate_size // sf_block_size
-            )
-            assert (
-                layer.w13_bias.dim() == 2
-                and layer.w13_bias.shape[0] == self.num_experts
-                and layer.w13_bias.shape[1] == self.intermediate_size * 2
-            )
-            assert (
-                layer.w2_bias.dim() == 2
-                and layer.w2_bias.shape[0] == self.num_experts
-                and layer.w2_bias.shape[1] == self.hidden_size
-            )
-
-            w13_weight_scale = layer.w13_weight_scale.data
-            w2_weight_scale = layer.w2_weight_scale.data
-            w13_weight = layer.w13_weight.data
-            w2_weight = layer.w2_weight.data
-            w13_bias = layer.w13_bias.data.to(torch.float32)
-            w2_bias = layer.w2_bias.data.to(torch.float32)
-
-            # Swap w1 and w3 as the definition of
-            # swiglu is different in the trtllm-gen
-            def swap_every_two_rows(x, axis=-1):
-                shape = x.shape
-                if axis < 0:
-                    axis = len(shape) + axis
-
-                # Create a new shape with pairs swapped along specified axis
-                new_shape = list(shape)
-                new_shape[axis] = shape[axis] // 2
-                new_shape.insert(axis + 1, 2)
+            layer.register_parameter("w13_bias", w13_bias)
+            set_weight_attrs(w13_bias, extra_weight_attrs)
 
-                # Reshape to expose pairs, swap them, and reshape back
-                x = x.reshape(*new_shape)
-                x = x.flip(axis + 1)
-                new_shape = list(shape)
-                return x.reshape(*new_shape)
-
-            w13_weight_scale = swap_every_two_rows(w13_weight_scale, -2)
-            w13_weight = swap_every_two_rows(w13_weight, -2)
-            w13_bias = swap_every_two_rows(w13_bias, -1)
-
-            # Do not interleave as the checkpoint is already interleaved
-
-            # Shuffle weights and scaling factors for transposed mma output
-            gemm1_weights_mxfp4_shuffled = []
-            gemm1_scales_mxfp4_shuffled = []
-            gemm2_weights_mxfp4_shuffled = []
-            gemm2_scales_mxfp4_shuffled = []
-            gemm1_bias_shuffled = []
-            gemm2_bias_shuffled = []
-            epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
-            for i in range(self.num_experts):
-                # w13 weight shuffling
-                permute_indices = get_w2_permute_indices_with_cache(
-                    self._cache_permute_indices,
-                    w13_weight[i].view(torch.uint8),
-                    epilogue_tile_m,
-                )
-                gemm1_weights_mxfp4_shuffled.append(
-                    w13_weight[i]
-                    .view(torch.uint8)[permute_indices.to(w13_weight.device)]
-                    .contiguous()
-                )
-                # w13 scale shuffling
-                permute_sf_indices = get_w2_permute_indices_with_cache(
-                    self._cache_permute_indices,
-                    w13_weight_scale[i].view(torch.uint8),
-                    epilogue_tile_m,
-                    num_elts_per_sf=16,
-                )
-                gemm1_scales_mxfp4_shuffled.append(
-                    nvfp4_block_scale_interleave(
-                        w13_weight_scale[i]
-                        .view(torch.uint8)[
-                            permute_sf_indices.to(w13_weight_scale.device)
-                        ]
-                        .contiguous()
-                    )
-                )
-                # w13 bias shuffling
-                permute_bias_indices = get_w2_permute_indices_with_cache(
-                    self._cache_permute_indices,
-                    w13_bias[i].clone().reshape(-1, 1),
-                    epilogue_tile_m,
-                )
-                gemm1_bias_shuffled.append(
-                    w13_bias[i]
-                    .clone()
-                    .reshape(-1, 1)[permute_bias_indices.to(w13_bias.device)]
-                    .contiguous()
-                )
-                # w2 weight shuffling
-                permute_indices = get_w2_permute_indices_with_cache(
-                    self._cache_permute_indices,
-                    w2_weight[i].view(torch.uint8),
-                    epilogue_tile_m,
-                )
-                gemm2_weights_mxfp4_shuffled.append(
-                    w2_weight[i]
-                    .view(torch.uint8)[permute_indices.to(w2_weight.device)]
-                    .contiguous()
-                )
-                # w2 scale shuffling
-                permute_sf_indices = get_w2_permute_indices_with_cache(
-                    self._cache_permute_indices,
-                    w2_weight_scale[i].view(torch.uint8),
-                    epilogue_tile_m,
-                    num_elts_per_sf=16,
-                )
-                gemm2_scales_mxfp4_shuffled.append(
-                    nvfp4_block_scale_interleave(
-                        w2_weight_scale[i]
-                        .view(torch.uint8)[
-                            permute_sf_indices.to(w2_weight_scale.device)
-                        ]
-                        .contiguous()
-                    )
-                )
-                # w2 bias shuffling
-                permute_indices = get_w2_permute_indices_with_cache(
-                    self._cache_permute_indices,
-                    w2_bias[i].clone().reshape(-1, 1),
-                    epilogue_tile_m,
-                )
-                gemm2_bias_shuffled.append(
-                    w2_bias[i]
-                    .clone()
-                    .reshape(-1, 1)[permute_indices.to(w2_bias.device)]
-                    .contiguous()
-                )
-
-            w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled)
-            w13_weight_scale = (
-                torch.stack(gemm1_scales_mxfp4_shuffled)
-                .reshape(
-                    self.num_experts,
-                    2 * self.intermediate_size,
-                    self.hidden_size // sf_block_size,
-                )
-                .view(torch.float8_e4m3fn)
-            )
-
-            w2_weight = torch.stack(gemm2_weights_mxfp4_shuffled)
-            w2_weight_scale = (
-                torch.stack(gemm2_scales_mxfp4_shuffled)
-                .reshape(
-                    self.num_experts,
-                    self.hidden_size,
-                    self.intermediate_size // sf_block_size,
-                )
-                .view(torch.float8_e4m3fn)
-            )
-
-            layer.w13_weight = Parameter(w13_weight, requires_grad=False)
-            layer.w13_weight_scale = Parameter(w13_weight_scale, requires_grad=False)
-            layer.w2_weight = Parameter(w2_weight, requires_grad=False)
-            layer.w2_weight_scale = Parameter(w2_weight_scale, requires_grad=False)
-            layer.w13_bias = Parameter(
-                torch.stack(gemm1_bias_shuffled).reshape(self.num_experts, -1),
-                requires_grad=False,
-            )
-            layer.w2_bias = Parameter(
-                torch.stack(gemm2_bias_shuffled).reshape(self.num_experts, -1),
+            w2_bias = torch.nn.Parameter(
+                torch.zeros(
+                    num_experts,
+                    hidden_size,
+                    dtype=torch.bfloat16,
+                ),
                 requires_grad=False,
             )
-        elif (
-            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
-            or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
-        ):
-            sf_block_size = 32  # mxfp4 block size
+            layer.register_parameter("w2_bias", w2_bias)
+            set_weight_attrs(w2_bias, extra_weight_attrs)
 
-            # Common shape assertions
-            assert (
-                layer.w13_weight.dim() == 3
-                and layer.w13_weight.shape[0] == self.num_experts
-                and layer.w13_weight.shape[1] == self.intermediate_size * 2
-                and layer.w13_weight.shape[2] == self.hidden_size // 2
-            )
-            assert (
-                layer.w13_weight_scale.dim() == 3
-                and layer.w13_weight_scale.shape[0] == self.num_experts
-                and layer.w13_weight_scale.shape[1] == self.intermediate_size * 2
-                and layer.w13_weight_scale.shape[2] == self.hidden_size // sf_block_size
-            )
-            assert (
-                layer.w2_weight.dim() == 3
-                and layer.w2_weight.shape[0] == self.num_experts
-                and layer.w2_weight.shape[1] == self.hidden_size
-                and layer.w2_weight.shape[2] == self.intermediate_size // 2
-            )
-            assert (
-                layer.w2_weight_scale.dim() == 3
-                and layer.w2_weight_scale.shape[1] == self.hidden_size
-                and layer.w2_weight_scale.shape[2]
-                == self.intermediate_size // sf_block_size
-            )
+    def _setup_kernel(
+        self,
+        layer: FusedMoE,
+        w13: torch.Tensor,
+        w2: torch.Tensor,
+        w13_scale: torch.Tensor,
+        w2_scale: torch.Tensor,
+        w13_bias: torch.Tensor | None = None,
+        w2_bias: torch.Tensor | None = None,
+    ) -> None:
+        num_experts = self.num_experts
+        intermediate_size = self.intermediate_size
+        hidden_size = self.hidden_size
+        sf_block_size = 32
+
+        # Shape assertions
+        assert (
+            w13.dim() == 3
+            and w13.shape[0] == num_experts
+            and w13.shape[1] == intermediate_size * 2
+            and w13.shape[2] == hidden_size // 2
+        )
+        assert (
+            w13_scale.dim() == 3
+            and w13_scale.shape[0] == num_experts
+            and w13_scale.shape[1] == intermediate_size * 2
+            and w13_scale.shape[2] == hidden_size // sf_block_size
+        )
+        assert (
+            w2.dim() == 3
+            and w2.shape[0] == num_experts
+            and w2.shape[1] == hidden_size
+            and w2.shape[2] == intermediate_size // 2
+        )
+        assert (
+            w2_scale.dim() == 3
+            and w2_scale.shape[1] == hidden_size
+            and w2_scale.shape[2] == intermediate_size // sf_block_size
+        )
+        if w13_bias is not None:
             assert (
-                layer.w13_bias.dim() == 2
-                and layer.w13_bias.shape[0] == self.num_experts
-                and layer.w13_bias.shape[1] == self.intermediate_size * 2
+                w13_bias.dim() == 2
+                and w13_bias.shape[0] == num_experts
+                and w13_bias.shape[1] == intermediate_size * 2
             )
+        if w2_bias is not None:
             assert (
-                layer.w2_bias.dim() == 2
-                and layer.w2_bias.shape[0] == self.num_experts
-                and layer.w2_bias.shape[1] == self.hidden_size
-            )
-
-            # De-interleave and swap for w13 weight, bias, and scales
-            w13_w = layer.w13_weight.data
-            gate_w, up_w = w13_w[:, ::2, :], w13_w[:, 1::2, :]
-            deinterleaved_w13_w = torch.cat([gate_w, up_w], dim=1)
-            w1_w, w3_w = torch.chunk(deinterleaved_w13_w, 2, dim=1)
-            w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1)
-
-            w13_b = layer.w13_bias.data.to(torch.float32)
-            gate_b, up_b = w13_b[:, ::2], w13_b[:, 1::2]
-            deinterleaved_w13_b = torch.cat([gate_b, up_b], dim=1)
-            b1, b3 = torch.chunk(deinterleaved_w13_b, 2, dim=-1)
-            w13_bias_swapped = torch.cat([b3, b1], dim=-1).to(torch.bfloat16)
-
-            w13_s = layer.w13_weight_scale.data
-            gate_s, up_s = w13_s[:, ::2, :], w13_s[:, 1::2, :]
-            deinterleaved_w13_s = torch.cat([gate_s, up_s], dim=1)
-            s1, s3 = torch.chunk(deinterleaved_w13_s, 2, dim=1)
-            w13_scale_swapped = torch.cat([s3, s1], dim=1)
-
-            if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS:
-                from flashinfer import block_scale_interleave
-
-                orig_shape = w13_scale_swapped.shape
-                w13_scale_interleaved = block_scale_interleave(
-                    w13_scale_swapped.view(torch.uint8)
-                ).reshape(orig_shape)
-
-                w2_s = layer.w2_weight_scale.data
-                orig_shape = w2_s.shape
-                w2_scale_interleaved = block_scale_interleave(
-                    w2_s.view(torch.uint8)
-                ).reshape(orig_shape)
-
-                layer.w13_weight = Parameter(w13_weight_swapped, requires_grad=False)
-                layer.w13_weight_scale = Parameter(
-                    w13_scale_interleaved, requires_grad=False
-                )
-                layer.w13_bias = Parameter(w13_bias_swapped, requires_grad=False)
-                layer.w2_weight_scale = Parameter(
-                    w2_scale_interleaved, requires_grad=False
-                )
-            elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16:
-
-                def _interleave_mxfp4_cutlass_sm90(w):
-                    w_shape = w.shape
-                    w_interleaved = w.reshape(
-                        w_shape[0], w_shape[1], (w_shape[2] // 4), 4
-                    )
-                    w_interleaved = w_interleaved.permute(0, 2, 1, 3)
-                    w_interleaved = w_interleaved.reshape(
-                        w_shape[0], w_shape[2] // 4, w_shape[1] * 4
-                    )
-                    return w_interleaved
-
-                w31_scales = w13_scale_swapped.to(torch.uint8).view(torch.uint8)
-                w31_scales_interleaved = _interleave_mxfp4_cutlass_sm90(w31_scales)
-
-                w2_weight_scale = layer.w2_weight_scale.data
-                w2_scales = w2_weight_scale.to(torch.uint8).view(torch.uint8)
-                w2_scales_interleaved = _interleave_mxfp4_cutlass_sm90(w2_scales)
-
-                layer.w13_weight = torch.nn.Parameter(
-                    torch.cat([w3_w, w1_w], dim=1), requires_grad=False
-                )
-                layer.w13_bias = torch.nn.Parameter(
-                    w13_bias_swapped, requires_grad=False
-                )
-                layer.w13_weight_scale = torch.nn.Parameter(
-                    w31_scales_interleaved, requires_grad=False
-                )
-                layer.w2_weight_scale = torch.nn.Parameter(
-                    w2_scales_interleaved, requires_grad=False
-                )
-
-            # theses two kernels go through the `flashinfer_cutlass_fused_moe` path
-            from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
-                FlashInferExperts,
+                w2_bias.dim() == 2
+                and w2_bias.shape[0] == num_experts
+                and w2_bias.shape[1] == hidden_size
+            )
+
+        # Convert weights to kernel format
+        w13, w2, w13_scale, w2_scale, w13_bias, w2_bias = (
+            convert_to_mxfp4_moe_kernel_format(
+                mxfp4_backend=self.mxfp4_backend,
+                layer=layer,
+                w13_weight=w13,
+                w2_weight=w2,
+                w13_weight_scale=w13_scale,
+                w2_weight_scale=w2_scale,
+                w13_bias=w13_bias,
+                w2_bias=w2_bias,
+                _cache_permute_indices=self._cache_permute_indices,
             )
+        )
 
-            self.moe_quant_config = self.get_fused_moe_quant_config(layer)
-            assert self.moe_quant_config is not None
-            prepare_finalize = maybe_make_prepare_finalize(
-                moe=self.moe,
-                quant_config=self.moe_quant_config,
+        # TRITON backends yield wrapped triton_kernels tensors that don't support
+        # .detach(), so they are assigned directly rather than via replace_parameter.
+        if self.mxfp4_backend not in TRITON_BACKENDS:
+            replace_parameter(layer, "w13_weight", w13)
+            replace_parameter(layer, "w2_weight", w2)
+            replace_parameter(layer, "w13_weight_scale", w13_scale)
+            replace_parameter(layer, "w2_weight_scale", w2_scale)
+        else:
+            layer.w13_weight = w13
+            layer.w2_weight = w2
+            self.w13_precision_config = w13_scale
+            self.w2_precision_config = w2_scale
+
+        if w13_bias is not None and w2_bias is not None:
+            replace_parameter(layer, "w13_bias", w13_bias)
+            replace_parameter(layer, "w2_bias", w2_bias)
+
+        # Build quant config
+        self.moe_quant_config = self.get_fused_moe_quant_config(layer)
+
+        # Build kernel (modular or monolithic)
+        if self.moe_quant_config is not None and self.experts_cls is not None:
+            self.moe_kernel = make_mxfp4_moe_kernel(
+                moe_quant_config=self.moe_quant_config,
+                moe_config=self.moe,
+                mxfp4_backend=self.mxfp4_backend,
+                experts_cls=self.experts_cls,
                 routing_tables=layer._maybe_init_expert_routing_tables(),
-                allow_new_interface=True,
-            )
-            assert prepare_finalize is not None
-
-            self.moe_kernel = mk.FusedMoEKernel(
-                prepare_finalize,
-                FlashInferExperts(
-                    moe_config=self.moe,
-                    quant_config=self.moe_quant_config,
-                ),
-                shared_experts=None,
-            )
-        elif self.mxfp4_backend == Mxfp4Backend.CK:
-            if layer.w13_bias is not None:
-                layer.w13_bias.data = layer.w13_bias.data.to(torch.float32)
-            if layer.w2_bias.data is not None:
-                layer.w2_bias.data = layer.w2_bias.data.to(torch.float32)
-
-            e, n, k = layer.w13_weight.shape
-            layer.w13_weight.view(torch.uint8).copy_(
-                layer.w13_weight.data.view(torch.uint8)
-                .view(e, n // 2, 2, k)
-                .permute(0, 2, 1, 3)
-                .contiguous()
-                .view(e, n, k)
-            )
-            layer.w13_weight_scale.data = (
-                layer.w13_weight_scale.data.view(e, n // 2, 2, -1)
-                .permute(0, 2, 1, 3)
-                .contiguous()
-                .view(e, n, -1)
-            )
-            layer.w13_weight.data = layer.w13_weight.data.view(torch.float4_e2m1fn_x2)
-            layer.w2_weight.data = layer.w2_weight.data.view(torch.float4_e2m1fn_x2)
-
-            layer.w13_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(
-                layer.w13_weight, 16, True
-            )
-            shuffled_w13_scale = rocm_aiter_ops.shuffle_scale_a16w4(
-                layer.w13_weight_scale.view(-1, layer.w13_weight_scale.shape[-1]),
-                self.num_experts,
-                True,
-            )
-
-            layer.w2_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(
-                layer.w2_weight, 16, False
-            )
-            shuffled_w2_scale = rocm_aiter_ops.shuffle_scale_a16w4(
-                layer.w2_weight_scale.view(-1, layer.w2_weight_scale.shape[-1]),
-                self.num_experts,
-                False,
+                shared_experts=layer.shared_experts,
             )
 
-            layer.w13_bias.data = (
-                layer.w13_bias.data.view(-1, n // 2, 2)
-                .permute(0, 2, 1)
-                .contiguous()
-                .view(-1, n)
-            )
-
-            layer.w13_weight_scale = torch.nn.Parameter(
-                shuffled_w13_scale, requires_grad=False
-            )
-            layer.w2_weight_scale = torch.nn.Parameter(
-                shuffled_w2_scale, requires_grad=False
-            )
-            # replace_parameter(layer, "w13_bias", w13_bias)
-            # replace_parameter(layer, "w13_weight_scale", w13_weight_scale)
-            # replace_parameter(layer, "w2_weight_scale", w2_weight_scale)
-            # replace_parameter(layer, "w13_weight", w13_weight)
-            # replace_parameter(layer, "w2_weight", w2_weight)
-
-        elif self.mxfp4_backend == Mxfp4Backend.TRITON:
-            from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig
-
-            w13_bias = layer.w13_bias.to(torch.float32)
-            w2_bias = layer.w2_bias.to(torch.float32)
-
-            layer.w13_bias = Parameter(w13_bias, requires_grad=False)
-            layer.w2_bias = Parameter(w2_bias, requires_grad=False)
-            # Ideally we'd use FusedMoEModularKernel.prepare_finalize object
-            # (stored in self.fused_experts) to determine if the MoE has a
-            # batched activation format. As self.fused_experts is not
-            # initialized at this point, we resort to checking the MoE config
-            # directly.
-            is_batched_moe = (
-                self.moe.use_deepep_ll_kernels or self.moe.use_nixl_ep_kernels
-            )
-            if is_batched_moe:
-                num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8
-            else:
-                num_warps = 8
-            w13_weight, w13_flex, w13_scale = _swizzle_mxfp4(
-                layer.w13_weight, layer.w13_weight_scale, num_warps
-            )
-            w2_weight, w2_flex, w2_scale = _swizzle_mxfp4(
-                layer.w2_weight, layer.w2_weight_scale, num_warps
-            )
+    def process_weights_after_loading(self, layer):
+        w13 = layer.w13_weight
+        w2 = layer.w2_weight
+        w13_scale = layer.w13_weight_scale
+        w2_scale = layer.w2_weight_scale
+        w13_bias = getattr(layer, "w13_bias", None)
+        w2_bias = getattr(layer, "w2_bias", None)
 
-            self.w13_precision_config = PrecisionConfig(
-                weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex)
-            )
-            self.w2_precision_config = PrecisionConfig(
-                weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex)
-            )
-            self.w13_weight = w13_weight
-            self.w2_weight = w2_weight
-            del layer.w13_weight
-            del layer.w2_weight
-            layer.w13_weight = w13_weight
-            layer.w2_weight = w2_weight
+        if self.mxfp4_backend == Mxfp4MoeBackend.NONE:
+            return
 
-        else:
-            raise ValueError(
-                f"Unsupported mxfp4_backend: {self.mxfp4_backend}: "
-                f"should be one of: {list(Mxfp4Backend)}."
-            )
+        self._setup_kernel(layer, w13, w2, w13_scale, w2_scale, w13_bias, w2_bias)
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
     ) -> FusedMoEQuantConfig | None:
-        if self.mxfp4_backend == Mxfp4Backend.MARLIN:
-            return mxfp4_w4a16_moe_quant_config(
-                w1_bias=layer.w13_bias,
-                w2_bias=layer.w2_bias,
-                w1_scale=layer.w13_weight_scale,
-                w2_scale=layer.w2_weight_scale,
-            )
-        elif self.mxfp4_backend == Mxfp4Backend.TRITON:
+        w1_scale = layer.w13_weight_scale
+        w2_scale = layer.w2_weight_scale
+        w1_bias = getattr(layer, "w13_bias", None)
+        w2_bias = getattr(layer, "w2_bias", None)
+
+        if self.mxfp4_backend in TRITON_BACKENDS:
+            assert self.w13_precision_config is not None
+            assert self.w2_precision_config is not None
             w1_scale = self.w13_precision_config
             w2_scale = self.w2_precision_config
-            return mxfp4_w4a16_moe_quant_config(
-                w1_bias=layer.w13_bias,
-                w2_bias=layer.w2_bias,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-            )
-        elif self.mxfp4_backend in [
-            Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM,
-            Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS,
-        ]:
-            return mxfp4_mxfp8_moe_quant_config(
-                w1_bias=layer.w13_bias,
-                w2_bias=layer.w2_bias,
-                w1_scale=layer.w13_weight_scale,
-                w2_scale=layer.w2_weight_scale,
-            )
-        elif self.mxfp4_backend in [
-            Mxfp4Backend.SM100_FI_MXFP4_BF16,
-            Mxfp4Backend.SM90_FI_MXFP4_BF16,
-            Mxfp4Backend.CK,
-        ]:
-            return mxfp4_w4a16_moe_quant_config(
-                w1_bias=layer.w13_bias,
-                w2_bias=layer.w2_bias,
-                w1_scale=layer.w13_weight_scale,
-                w2_scale=layer.w2_weight_scale,
-            )
-        else:
-            w1_scale = layer.w13_weight_scale
-            w2_scale = layer.w2_weight_scale
-            return ocp_mx_moe_quant_config(
-                quant_dtype="mxfp4",
-                w1_bias=layer.w13_bias,
-                w2_bias=layer.w2_bias,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-            )
+
+        return make_mxfp4_moe_quant_config(
+            mxfp4_backend=self.mxfp4_backend,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            w1_bias=w1_bias,
+            w2_bias=w2_bias,
+        )
 
     def select_gemm_impl(
         self,
-        prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular,
+        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
         layer: torch.nn.Module,
     ) -> mk.FusedMoEExpertsModular:
-        if (
-            prepare_finalize.activation_format
-            == mk.FusedMoEActivationFormat.BatchedExperts
-        ):
-            if self.mxfp4_backend == Mxfp4Backend.MARLIN:
-                max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
-                assert max_num_tokens_per_rank is not None
-                assert self.moe_quant_config is not None
-                return BatchedMarlinExperts(
-                    max_num_tokens=max_num_tokens_per_rank,
-                    num_dispatchers=prepare_finalize.num_dispatchers(),
-                    quant_config=self.moe_quant_config,
-                    moe_config=self.moe,
-                )
-            else:
-                raise NotImplementedError(
-                    f"Incompatible Mxfp4 backend ({self.mxfp4_backend}) for "
-                    "EP batched experts format"
-                )
-        else:
-            assert self.moe_quant_config is not None
-            if (
-                self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-                or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
-            ):
-                # B200 code-path
-                kwargs = {
-                    # TODO(bnell): part of quant_config
-                    "max_capture_size": self.max_capture_size,
-                }
-                return TrtLlmGenExperts(self.moe, self.moe_quant_config, **kwargs)
-            elif self.mxfp4_backend == Mxfp4Backend.MARLIN:
-                return MarlinExperts(self.moe, self.moe_quant_config)
-            elif self.mxfp4_backend == Mxfp4Backend.TRITON:
-                if self.moe.is_lora_enabled:
-                    return UnfusedOAITritonExperts(self.moe, self.moe_quant_config)
-                return OAITritonExperts(self.moe, self.moe_quant_config)
-            else:
-                raise NotImplementedError(
-                    f"Incompatible Mxfp4 backend ({self.mxfp4_backend}) for EP"
-                )
-
-    @property
-    def is_monolithic(self) -> bool:
-        if self.moe.is_lora_enabled:
-            return False
-        return (
-            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-            or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
-            or self.mxfp4_backend == Mxfp4Backend.TRITON
-            or self.mxfp4_backend == Mxfp4Backend.CK
+        raise ValueError(
+            f"{self.__class__.__name__} uses the new modular kernel "
+            "initialization logic. This function should not be called."
         )
 
     def apply(
@@ -1053,30 +379,6 @@ def apply(
         shared_experts_input: torch.Tensor | None,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert not self.is_monolithic
-        if layer.enable_eplb:
-            raise NotImplementedError("EPLB is not supported for mxfp4")
-
-        assert _can_support_mxfp4(
-            layer.use_grouped_topk,
-            layer.topk_group,
-            layer.num_expert_group,
-            layer.expert_map,
-            layer.custom_routing_function,
-            layer.e_score_correction_bias,
-            layer.apply_router_weight_on_input,
-            layer.scoring_func,
-            layer.activation,
-            layer.eplb_state.expert_load_view,
-            layer.eplb_state.logical_to_physical_map,
-            layer.eplb_state.logical_replica_count,
-        ), "MXFP4 are not supported with this configuration."
-
-        assert (
-            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS
-            or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16
-            or self.mxfp4_backend == Mxfp4Backend.MARLIN
-        )
-
         assert self.moe_kernel is not None
         return self.moe_kernel.apply(
             hidden_states=x,
@@ -1098,126 +400,17 @@ def apply_monolithic(
         router_logits: torch.Tensor,
     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         assert self.is_monolithic
-
-        if layer.enable_eplb:
-            raise NotImplementedError("EPLB is not supported for mxfp4")
-
-        assert _can_support_mxfp4(
-            layer.use_grouped_topk,
-            layer.topk_group,
-            layer.num_expert_group,
-            layer.expert_map,
-            layer.custom_routing_function,
-            layer.e_score_correction_bias,
-            layer.apply_router_weight_on_input,
-            layer.scoring_func,
-            layer.activation,
-            layer.eplb_state.expert_load_view,
-            layer.eplb_state.logical_to_physical_map,
-            layer.eplb_state.logical_replica_count,
-        ), "MXFP4 are not supported with this configuration."
-
-        # Apply routing simulation strategy if specified.
-        # This applies to all monolithic backends (SM100_FI and TRITON).
-        routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY
-        if routing_strategy == "uniform_random":
-            router_logits = torch.rand_like(router_logits)
-
-        if (
-            self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM
-            or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16
-        ):
-            from flashinfer import trtllm_fp4_block_scale_moe
-
-            if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16:
-                assert x.dtype == torch.bfloat16
-                x_quant = x
-                x_scale = None
-            elif self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM:
-                from flashinfer import mxfp8_quantize
-
-                # x_quant is padded in hidden dimension with alignment=256
-                x_quant, x_scale = mxfp8_quantize(
-                    x,
-                    is_sf_swizzled_layout=False,
-                    alignment=256,
-                )
-                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(*x.shape[:-1], -1)
-
-            # output with original unpadded hidden size
-            output = torch.empty_like(x)
-
-            trtllm_gen_output = trtllm_fp4_block_scale_moe(
-                routing_logits=router_logits.to(torch.bfloat16),
-                routing_bias=None,
-                hidden_states=x_quant,
-                hidden_states_scale=x_scale,
-                gemm1_weights=layer.w13_weight,  # uint8 (e2m1 x 2)
-                gemm1_weights_scale=layer.w13_weight_scale,  # uint8 (e4m3 x 2)
-                gemm1_bias=layer.w13_bias,  # fp32 per expert per channel
-                gemm1_alpha=layer.gemm1_alpha,  # fp32 per expert
-                gemm1_beta=layer.gemm1_beta,  # fp32 per expert
-                gemm1_clamp_limit=layer.gemm1_clamp_limit,  # fp32 per expert
-                gemm2_weights=layer.w2_weight,  # uint8 (e2m1 x 2)
-                gemm2_weights_scale=layer.w2_weight_scale,  # ue8m0
-                gemm2_bias=layer.w2_bias,  # fp32 per expert per channel
-                output1_scale_scalar=None,
-                output1_scale_gate_scalar=None,
-                output2_scale_scalar=None,
-                num_experts=layer.global_num_experts,
-                top_k=layer.top_k,
-                n_group=None,
-                topk_group=None,
-                intermediate_size=self.intermediate_size,  # padded to multiple of 256
-                local_expert_offset=layer.ep_rank * layer.local_num_experts,
-                local_num_experts=self.num_experts,
-                routed_scaling_factor=None,
-                routing_method_type=1 if layer.renormalize else 0,
-                do_finalize=True,
-                tune_max_num_tokens=max(self.max_capture_size, 1),
-                output=output,
-            )[0]
-            return trtllm_gen_output
-        elif self.mxfp4_backend == Mxfp4Backend.CK:
-            topk_weights, topk_ids = rocm_aiter_ops.fused_topk(
-                x, router_logits, layer.top_k, True
-            )
-            output = rocm_aiter_ops.fused_moe(
-                x,
-                layer.w13_weight,
-                layer.w2_weight,
-                topk_weights,
-                topk_ids,
-                activation_method=rocm_aiter_ops.get_aiter_activation_type("swiglu"),
-                quant_method=rocm_aiter_ops.get_aiter_quant_type("per_1x32"),
-                w1_scale=layer.w13_weight_scale,
-                w2_scale=layer.w2_weight_scale,
-                doweight_stage1=False,
-                hidden_pad=self.hidden_pad // 128 * 128,
-                intermediate_pad=self.intermediate_pad // 64 * 64 * 2,
-                bias1=layer.w13_bias,
-                bias2=layer.w2_bias,
-            )
-            return output
-        elif self.mxfp4_backend == Mxfp4Backend.TRITON:
-            from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import (  # noqa: E501
-                triton_kernel_moe_forward,
-            )
-
-            return triton_kernel_moe_forward(
-                hidden_states=x,
-                w1=layer.w13_weight,
-                w2=layer.w2_weight,
-                gating_output=router_logits,
-                topk=layer.top_k,
-                renormalize=layer.renormalize,
-                global_num_experts=layer.global_num_experts,
-                expert_map=layer.expert_map,
-                quant_config=self.moe_quant_config,
-                apply_router_weight_on_input=layer.apply_router_weight_on_input,
-            )
-        else:
-            raise ValueError(f"Unsupported backend: {self.mxfp4_backend}")
+        assert self.moe_kernel is not None
+        return self.moe_kernel.apply_monolithic(
+            hidden_states=x,
+            w1=layer.w13_weight,
+            w2=layer.w2_weight,
+            router_logits=router_logits,
+            activation=layer.activation,
+            global_num_experts=layer.global_num_experts,
+            expert_map=layer.expert_map,
+            apply_router_weight_on_input=layer.apply_router_weight_on_input,
+        )
 
 
 class XpuMxfp4MoEMethod(Mxfp4MoEMethod):
diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py
deleted file mode 100644
index 5d7b7b54adc8..000000000000
--- a/vllm/model_executor/layers/quantization/ptpc_fp8.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import Any
-
-import torch
-from torch.nn.parameter import Parameter
-
-from vllm import _custom_ops as ops
-from vllm.model_executor.kernels.linear import (
-    init_fp8_linear_kernel,
-)
-from vllm.model_executor.layers.attention import Attention
-from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod
-from vllm.model_executor.layers.quantization import QuantizationMethods
-from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase
-from vllm.model_executor.layers.quantization.fp8 import (
-    Fp8Config,
-    Fp8KVCacheMethod,
-    Fp8LinearMethod,
-)
-from vllm.model_executor.layers.quantization.utils.quant_utils import (
-    is_layer_skipped,
-    kFp8DynamicTokenSym,
-)
-from vllm.platforms import current_platform
-
-
-class PTPCFp8Config(Fp8Config):
-    """Config class for Per-Token-Per-Channel Dynamic Quantization Fp8."""
-
-    def __init__(
-        self,
-        activation_scheme: str = "dynamic",
-        ignored_layers: list[str] | None = None,
-    ) -> None:
-        if not current_platform.is_rocm():
-            raise ValueError("ptpc_fp8 quantization is supported only on ROCm.")
-
-        if not current_platform.has_device_capability(94):
-            raise ValueError(
-                "ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer."  # noqa: E501
-            )
-        if activation_scheme == "static":
-            raise ValueError("ptpc_fp8 as of now only support dynamic quantization.")
-
-        super().__init__(
-            is_checkpoint_fp8_serialized=False,
-            activation_scheme=activation_scheme,
-            ignored_layers=ignored_layers,
-        )
-
-    @classmethod
-    def get_name(cls) -> QuantizationMethods:
-        return "ptpc_fp8"
-
-    @classmethod
-    def from_config(cls, config: dict[str, Any]) -> "PTPCFp8Config":
-        activation_scheme = cls.get_from_keys(config, ["activation_scheme"])
-        ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None)
-        return cls(activation_scheme=activation_scheme, ignored_layers=ignored_layers)
-
-    def get_quant_method(
-        self, layer: torch.nn.Module, prefix: str
-    ) -> "QuantizeMethodBase | None":
-        if isinstance(layer, LinearBase):
-            if is_layer_skipped(prefix, self.ignored_layers):
-                return UnquantizedLinearMethod()
-            return PTPCFp8LinearMethod(self)
-        elif isinstance(layer, Attention):
-            return Fp8KVCacheMethod(self)
-        return None
-
-
-class PTPCFp8LinearMethod(Fp8LinearMethod):
-    """Linear method for Per-Token and Per-Channel FP8 Quantization.
-    Only supports loading quantized BF16 model checkpoints with dynamic
-    activation scaling. To load FP16 model checkpoints, user must specify
-    to convert the FP16 model weight loading into BF16.
-    The weight scaling factor will be initialized after
-    the model weights are loaded.
-
-    Limitations:
-    1. Only support float8_e4m3fnuz data type due to the limitation of
-       torch._scaled_mm (https://github.com/ROCm/pytorch/blob/8c0504d7f3fb0ee4c278c096a5c3caedb01129fa/aten/src/ATen/native/cuda/Blas.cpp#L1041)
-
-    Args:
-        quant_config: The quantization config.
-    """
-
-    def __init__(self, quant_config: PTPCFp8Config):
-        assert current_platform.is_rocm(), (
-            "PTPCFp8LinearMethod is only supported on ROCm."
-        )
-        super().__init__(quant_config=quant_config)
-        # Force weight quantization
-        self.fp8_linear = init_fp8_linear_kernel(
-            activation_quant_key=kFp8DynamicTokenSym,
-            weight_quant_key=kFp8DynamicTokenSym,
-            out_dtype=torch.get_default_dtype(),
-            module_name=self.__class__.__name__,
-        )
-
-    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
-        assert layer.weight.data.dtype not in (torch.float16, torch.float32), (
-            "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support "
-            f"output dtype of bfloat16. {layer.weight.data.dtype} is specified."
-        )
-
-        if layer.weight.data.dtype == torch.bfloat16:
-            # Quantize the weights.
-            qweight, weight_scale = ops.scaled_fp8_quant(
-                layer.weight, scale=None, use_per_token_if_dynamic=True
-            )
-
-            # Update the layer with the new values.
-            layer.weight = Parameter(
-                qweight.t(), requires_grad=False
-            )  # Pretranspose the weight
-            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
-        else:
-            assert layer.weight.data.dtype == current_platform.fp8_dtype()
-            assert getattr(layer, "weight_scale", None) is not None
-        layer.input_scale = None
-
-    def apply(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        bias: torch.Tensor | None = None,
-    ) -> torch.Tensor:
-        return self.fp8_linear.apply_weights(layer, x, bias)
diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py
index 0a5db4e71fdb..b2b77e6688c1 100644
--- a/vllm/model_executor/layers/quantization/quark/quark_moe.py
+++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py
@@ -25,9 +25,9 @@
     ocp_mx_moe_quant_config,
 )
 from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe
-from vllm.model_executor.layers.quantization.mxfp4 import (
-    Mxfp4Backend,
-    get_mxfp4_backend,
+from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import (
+    Mxfp4MoeBackend,
+    select_mxfp4_moe_backend,
 )
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     prepare_fp8_moe_layer_for_marlin,
@@ -92,7 +92,8 @@ def get_moe_method(
                 rocm_aiter_ops.is_fused_moe_enabled()
             )
             if (
-                input_config.get("dtype") == "fp8_e4m3"
+                input_config is not None
+                and input_config.get("dtype") == "fp8_e4m3"
                 and not input_config.get("is_dynamic")
                 and not emulate
             ):
@@ -698,9 +699,9 @@ def __init__(
                 f"Please check that the combination is supported in OCP_MX_Scheme."
             )
 
-        self.mxfp4_backend: Mxfp4Backend | None = None
+        self.mxfp4_backend: Mxfp4MoeBackend | None = None
         if self.ocp_mx_scheme == "w_mxfp4":
-            self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled)
+            self.mxfp4_backend, _ = select_mxfp4_moe_backend(moe)
 
         if self.input_quant is not None:
             self.static_input_scales = not self.input_quant.get("is_dynamic")
diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
index 6917bb6f2deb..1b30f5b82c6a 100644
--- a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
+++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py
@@ -176,7 +176,7 @@ class QuarkOCP_MX(QuarkScheme):
     def __init__(
         self,
         weight_quant_spec: dict[str, Any],
-        input_quant_spec: dict[str, Any],
+        input_quant_spec: dict[str, Any] | None,
         dynamic_mxfp4_quant: bool = False,
     ):
         self.out_dtype = torch.get_default_dtype()
@@ -185,7 +185,13 @@ def __init__(
         self.input_quant_spec = input_quant_spec
         self.dynamic_mxfp4_quant = dynamic_mxfp4_quant
         self.weight_dtype = weight_quant_spec["dtype"].replace("fp", "mxfp")
-        self.input_dtype = input_quant_spec["dtype"].replace("fp", "mxfp")
+        self.input_dtype: str | None = None
+        if input_quant_spec is not None:
+            input_quant = input_quant_spec["dtype"]
+            if input_quant == "fp8_e4m3":
+                self.input_dtype = "fp8"
+            else:
+                self.input_dtype = input_quant.replace("fp", "mxfp")
 
         self.ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype(
             self.input_dtype, self.weight_dtype
@@ -200,14 +206,21 @@ def __init__(
                 dequant_mxfp6, quant_dtype=self.weight_dtype.replace("mx", "")
             )
 
-        if self.input_dtype == "mxfp4":
+        if self.input_dtype is None:
+            self.quant_dequant_func: Callable[[torch.Tensor], torch.Tensor] = (
+                lambda x: x
+            )  # no input Q/DQ for weight-only
+        elif self.input_dtype == "mxfp4":
             self.quant_dequant_func = quant_dequant_mxfp4
         else:
             self.quant_dequant_func = partial(
                 quant_dequant_mxfp6, quant_dtype=self.input_dtype.replace("mx", "")
             )
 
-        self.static_input_scales = not input_quant_spec.get("is_dynamic")
+        if input_quant_spec is None:
+            self.static_input_scales = False
+        else:
+            self.static_input_scales = not input_quant_spec.get("is_dynamic")
 
         if self.static_input_scales:
             raise NotImplementedError(
diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
index d6b32c4bbef2..9bc58d2f302d 100644
--- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
+++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py
@@ -389,9 +389,9 @@ def prepare_moe_fp4_layer_for_marlin(
 
     group_size = 16 if is_nvfp4 else 32
 
-    e = layer.num_experts
-    k = layer.hidden_size
-    n = layer.intermediate_size_per_partition
+    e = layer.moe_config.num_experts
+    k = layer.moe_config.hidden_dim
+    n = layer.moe_config.intermediate_size_per_partition
 
     # WORKSPACE
     device = layer.w13_weight.device
@@ -500,6 +500,120 @@ def prepare_moe_fp4_layer_for_marlin(
         setattr(layer, name, bias)
 
 
+def prepare_moe_mxfp4_layer_for_marlin(
+    layer: torch.nn.Module,
+    w13: torch.Tensor,
+    w2: torch.Tensor,
+    w13_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    w13_bias: torch.Tensor | None,
+    w2_bias: torch.Tensor | None,
+) -> tuple[
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor,
+    torch.Tensor | None,
+    torch.Tensor | None,
+]:
+    """Pure-function version of prepare_moe_fp4_layer_for_marlin for MXFP4.
+
+    Takes weight tensors as inputs and returns transformed tensors.
+    Does NOT modify the layer in-place.
+    """
+    input_dtype = get_marlin_input_dtype()
+    if (
+        input_dtype is not None
+        and input_dtype.itemsize == 1
+        and input_dtype != torch.float8_e4m3fn
+    ):
+        raise RuntimeError("MXFP4 weight + INT8 activation is not supported.")
+
+    group_size = 32  # MXFP4 block size
+
+    # Derive dimensions from actual weight shapes to handle rounded/padded
+    # sizes correctly (e.g., Mxfp4MoEMethod rounds up hidden_dim).
+    # w13 shape: (E, 2*N, K//2)
+    e = w13.shape[0]
+    n = w13.shape[1] // 2  # intermediate_size_per_partition
+    k = w13.shape[2] * 2  # hidden_size
+
+    device = w13.device
+    param_dtype = layer.params_dtype
+    is_a_8bit = input_dtype is not None and input_dtype.itemsize == 1
+    perm = torch.empty(0, dtype=torch.int, device=device)
+
+    # WEIGHT: Repack weights to marlin format
+    def repack_weight(weight: torch.Tensor, name: str) -> torch.Tensor:
+        tensor_list = []
+        if "w13" in name:
+            size_n, size_k = n * 2, k
+        else:
+            size_n, size_k = k, n
+
+        assert weight.shape == (e, size_n, size_k // 2)
+
+        for i in range(e):
+            qweight = weight[i].view(torch.int32).T.contiguous()
+            marlin_qweight = ops.gptq_marlin_repack(
+                b_q_weight=qweight,
+                perm=perm,
+                size_k=size_k,
+                size_n=size_n,
+                num_bits=4,
+                is_a_8bit=is_a_8bit,
+            )
+            tensor_list.append(marlin_qweight)
+        return torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+
+    w13 = repack_weight(w13, "w13")
+    w2 = repack_weight(w2, "w2")
+
+    # WEIGHT SCALES: Permute scales
+    def permute_scales(scales: torch.Tensor, name: str) -> torch.Tensor:
+        scales = scales.view(torch.float8_e8m0fnu)
+        scales = scales.to(param_dtype)
+
+        tensor_list = []
+        if "w13" in name:
+            size_n, size_k = n * 2, k
+        else:
+            size_n, size_k = k, n
+
+        for i in range(e):
+            scale = scales[i].T
+            marlin_scales = marlin_permute_scales(
+                s=scale,
+                size_k=size_k,
+                size_n=size_n,
+                group_size=group_size,
+                is_a_8bit=is_a_8bit,
+            )
+            marlin_scales = mxfp4_marlin_process_scales(
+                marlin_scales, input_dtype=input_dtype
+            )
+            tensor_list.append(marlin_scales)
+        return torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+
+    w13_scale = permute_scales(w13_scale, "w13")
+    w2_scale = permute_scales(w2_scale, "w2")
+
+    # BIAS: Permute bias
+    def permute_bias(bias: torch.Tensor | None) -> torch.Tensor | None:
+        if bias is None:
+            return None
+        bias = bias.to(param_dtype)
+        tensor_list = []
+        for i in range(e):
+            tensor_list.append(marlin_permute_bias(bias[i]))
+        return torch.cat([x.unsqueeze(0) for x in tensor_list], 0)
+
+    w13_bias = permute_bias(w13_bias)
+    w2_bias = permute_bias(w2_bias)
+
+    return w13, w2, w13_scale, w2_scale, w13_bias, w2_bias
+
+
 def rand_marlin_weight_nvfp4_like(weight, group_size, input_dtype=None):
     is_a_8bit = input_dtype is not None and input_dtype.itemsize == 1
 
diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
index 23d7cf55474a..49ddc8accc29 100644
--- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
@@ -1,12 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Callable
 from typing import Any
 
 import torch
 
 from vllm.logger import init_logger
-from vllm.model_executor.layers.fused_moe.activation import MoEActivation
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
 from vllm.utils.import_utils import has_triton_kernels
@@ -22,7 +20,7 @@
 CK_MXFP4_MOE_DIM_ALIGNMENT = 256
 
 
-def _swizzle_mxfp4(quant_tensor, scale, num_warps):
+def _swizzle_mxfp4(quant_tensor, scale, num_warps=8):
     """weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel"""
     assert has_triton_kernels()
     import triton_kernels.matmul_ogs_details.opt_flags as opt_flags
@@ -87,35 +85,6 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps):
     return quant_tensor, InFlexData(), scale
 
 
-def _can_support_mxfp4(
-    use_grouped_topk: bool = False,
-    topk_group: int | None = None,
-    num_expert_group: int | None = None,
-    expert_map: torch.Tensor | None = None,
-    custom_routing_function: Callable | None = None,
-    e_score_correction_bias: torch.Tensor | None = None,
-    apply_router_weight_on_input: bool = False,
-    scoring_func: str = "softmax",
-    activation: MoEActivation = MoEActivation.SWIGLUOAI,
-    expert_load_view: torch.Tensor | None = None,
-    logical_to_physical_map: torch.Tensor | None = None,
-    logical_replica_count: torch.Tensor | None = None,
-):
-    return not (
-        use_grouped_topk
-        or topk_group
-        or num_expert_group
-        or custom_routing_function
-        or e_score_correction_bias
-        or apply_router_weight_on_input
-        or scoring_func != "softmax"
-        or activation != MoEActivation.SWIGLUOAI
-        or expert_load_view
-        or logical_to_physical_map
-        or logical_replica_count
-    )
-
-
 def get_padding_alignment():
     return (
         256
diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py
index 757d1ecc5284..4918c83bdc39 100644
--- a/vllm/model_executor/layers/utils.py
+++ b/vllm/model_executor/layers/utils.py
@@ -122,7 +122,7 @@ def use_aiter_triton_gemm(n, m, k, dtype):
 def rocm_unquantized_gemm_impl(
     x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None = None
 ) -> torch.Tensor:
-    from vllm.platforms.rocm import on_gfx9, on_gfx950
+    from vllm.platforms.rocm import on_gfx1x, on_gfx9, on_gfx950
 
     n = x.numel() // x.size(-1)
     m = weight.shape[0]
@@ -169,12 +169,12 @@ def rocm_unquantized_gemm_impl(
 
     use_skinny = (
         envs.VLLM_ROCM_USE_SKINNY_GEMM
-        and on_gfx9()
+        and (on_gfx9() or on_gfx1x())
         and x.dtype in [torch.float16, torch.bfloat16]
         and k % 8 == 0
     )
 
-    if use_skinny is not True:
+    if not use_skinny:
         return torch.nn.functional.linear(x, weight, bias)
 
     x_view = x.reshape(-1, x.size(-1))
diff --git a/vllm/model_executor/models/colbert.py b/vllm/model_executor/models/colbert.py
index 66def505f1f7..7b6889899762 100644
--- a/vllm/model_executor/models/colbert.py
+++ b/vllm/model_executor/models/colbert.py
@@ -27,8 +27,9 @@
 from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed
 
 from .bert import BertEmbeddingModel, BertModel
-from .interfaces import SupportsLateInteraction
+from .interfaces import HasInnerState, IsHybrid, SupportsLateInteraction
 from .interfaces_base import default_pooling_type
+from .lfm2 import Lfm2ForCausalLM, Lfm2Model
 
 
 class ColBERTMixin(nn.Module, SupportsLateInteraction):
@@ -414,3 +415,98 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
             loaded.update(colbert_loaded)
 
         return loaded
+
+
+# -----------------------------------------------------------------------
+# Concrete model: ColBERT + LFM2 backbone
+# -----------------------------------------------------------------------
+
+
+@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL")
+class ColBERTLfm2Model(ColBERTMixin, nn.Module, HasInnerState, IsHybrid):
+    """ColBERT late interaction model with LFM2 backbone.
+
+    For ``LiquidAI/LFM2-ColBERT-350M`` and similar models.
+
+    The projection is auto-loaded from sentence-transformers ``1_Dense/``
+    when not present in the main checkpoint.
+    """
+
+    is_pooling_model = True
+    # LFM2 is a hybrid model (attention + SSM layers); these flags ensure
+    # HybridAttentionMambaModelConfig.verify_and_update_config runs so that
+    # mamba_block_size and related cache settings are correctly initialised.
+    is_hybrid = True
+    has_inner_state = True
+
+    @classmethod
+    def get_mamba_state_shape_from_config(cls, vllm_config: VllmConfig):
+        return Lfm2ForCausalLM.get_mamba_state_shape_from_config(vllm_config)
+
+    @classmethod
+    def get_mamba_state_dtype_from_config(cls, vllm_config: VllmConfig):
+        return Lfm2ForCausalLM.get_mamba_state_dtype_from_config(vllm_config)
+
+    @classmethod
+    def get_mamba_state_copy_func(cls):
+        return Lfm2ForCausalLM.get_mamba_state_copy_func()
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config = vllm_config.model_config.hf_config
+
+        colbert_dim = self.get_colbert_dim_from_config(config)
+        self._init_colbert_components(
+            hidden_size=config.hidden_size,
+            colbert_dim=colbert_dim,
+            head_dtype=vllm_config.model_config.head_dtype,
+        )
+
+        self.model = Lfm2Model(
+            vllm_config=vllm_config,
+            prefix=prefix,
+        )
+
+        pooler_config = vllm_config.model_config.pooler_config
+        assert pooler_config is not None
+        self.pooler = self._build_colbert_pooler(pooler_config)
+
+    def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.embed_input_ids(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors=None,
+        inputs_embeds: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        return self.model(
+            input_ids=input_ids,
+            positions=positions,
+            inputs_embeds=inputs_embeds,
+            intermediate_tensors=intermediate_tensors,
+        )
+
+    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+        other_weights, colbert_loaded = self._load_colbert_weights(weights)
+
+        # Strip "model." prefix added by the embedding adapter
+        model_weights = [
+            (n[len("model.") :] if n.startswith("model.") else n, w)
+            for n, w in other_weights
+        ]
+        loaded_model = self.model.load_weights(model_weights)
+        loaded = {f"model.{name}" for name in loaded_model} | colbert_loaded
+
+        # When the sentence-transformers projector was auto-loaded during init
+        # (not from the main checkpoint), mark its params as loaded
+        # so the weight validator doesn't complain.
+        if hasattr(self.pooler, "head"):
+            head = self.pooler.head
+            projector = getattr(head, "projector", None)
+            if projector is not None and isinstance(projector, nn.Module):
+                for name, _ in projector.named_parameters():
+                    loaded.add(f"pooler.head.projector.{name}")
+
+        return loaded
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 488cfa35c14f..a5644a414aee 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -113,8 +113,24 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         Args:
             vllm_config: vLLM Config
         """
+        cache_config = vllm_config.cache_config
+
+        # Disable calculate_kv_scales for hybrid models: uninitialized
+        # recurrent state corrupts scales during the calibration pass.
+        # See issue: https://github.com/vllm-project/vllm/issues/37554
+        if cache_config.calculate_kv_scales:
+            logger.warning(
+                "Disabling calculate_kv_scales for hybrid model '%s'. "
+                "Hybrid models with recurrent layers (GDN, Mamba, SSM) "
+                "produce unreliable KV cache scales during the "
+                "calibration pass because recurrent state is "
+                "uninitialized. Using default scale of 1.0 instead.",
+                vllm_config.model_config.model,
+            )
+            cache_config.calculate_kv_scales = False
+
         # Save the user input before it gets modified by MambaModelConfig
-        mamba_block_size = vllm_config.cache_config.mamba_block_size
+        mamba_block_size = cache_config.mamba_block_size
         # Enable FULL_AND_PIECEWISE by default
         MambaModelConfig.verify_and_update_config(vllm_config)
 
diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py
index 55c42e5fa57e..0c182a891cd3 100644
--- a/vllm/model_executor/models/interfaces_base.py
+++ b/vllm/model_executor/models/interfaces_base.py
@@ -194,18 +194,18 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]):
     [vllm.config.model.ModelConfig.score_type][]
     to use by default.
     
-    Score API handles score/rerank for:
-    - "score" task (score_type: cross-encoder models)
-    - "embed" task (score_type: bi-encoder models)
-    - "token_embed" task (score_type: late interaction models)
+    Scoring API handles score/rerank for:\n
+    - "classify" task (score_type: cross-encoder models)\n
+    - "embed" task (score_type: bi-encoder models)\n
+    - "token_embed" task (score_type: late interaction models)\n
     
-    score_type defaults to bi-encoder, then the Score API uses the "embed" task.
+    score_type defaults to bi-encoder, then the Score API uses the "embed" task.\n
     If you set score_type to cross-encoder via 
     [vllm.model_executor.models.interfaces.SupportsCrossEncoding][], 
-    then the Score API uses the "score" task.
+    then the Score API uses the "classify" task.\n
     If you set score_type to late-interaction via 
     [vllm.model_executor.models.interfaces.SupportsLateInteraction][], 
-    then the Score API uses the "token_embed" task.    
+    then the Score API uses the "token_embed" task.\n
     """
 
     pooler: Pooler
diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py
index 8e03e29a77c3..e29646182137 100644
--- a/vllm/model_executor/models/isaac.py
+++ b/vllm/model_executor/models/isaac.py
@@ -334,15 +334,14 @@ def get_hf_config(self) -> IsaacConfig:
         return IsaacConfig()
 
     def get_image_processor(self, **kwargs) -> IsaacImageProcessor:
-        return IsaacImageProcessor(kwargs)
+        return IsaacImageProcessor(**kwargs)
 
     def get_hf_processor(self, **kwargs) -> IsaacProcessor:
         hf_config = self.get_hf_config()
 
-        return self.ctx.init_processor(
-            IsaacProcessor,
+        return IsaacProcessor(
             tokenizer=self.get_tokenizer(),
-            image_processor=self.get_image_processor(),
+            image_processor=self.get_image_processor(**kwargs),
             image_token=hf_config.vision_token,
         )
 
diff --git a/vllm/model_executor/models/kimi_k25.py b/vllm/model_executor/models/kimi_k25.py
index 4b2b6a4b60a2..10d21aab0cf8 100644
--- a/vllm/model_executor/models/kimi_k25.py
+++ b/vllm/model_executor/models/kimi_k25.py
@@ -104,19 +104,25 @@ class KimiK25ProcessingInfo(BaseProcessingInfo):
 
     def __init__(self, ctx: InputProcessingContext) -> None:
         super().__init__(ctx)
-        self.hf_config = self.get_hf_config()
-        self.media_token_id = self.hf_config.media_placeholder_token_id
-        media_processor = cached_get_image_processor(
+
+        self.hf_config = hf_config = self.get_hf_config()
+
+        tokenizer = self.get_tokenizer()
+        image_processor = cached_get_image_processor(
             self.ctx.model_config.model,
             trust_remote_code=self.ctx.model_config.trust_remote_code,
         )
-        self.media_processor = media_processor
+
+        self.media_token_id = media_token_id = hf_config.media_placeholder_token_id
+        self.media_token = tokenizer.decode(media_token_id)
+
+        self.image_processor = image_processor
         self.hf_processor = KimiK25Processor(
-            media_processor=self.media_processor,
-            tokenizer=self.get_tokenizer(),
-            media_token_id=self.media_token_id,
+            tokenizer=tokenizer,
+            image_processor=image_processor,
+            media_token_id=media_token_id,
         )
-        self.media_tokens_calculator = self.media_processor.media_tokens_calculator
+        self.media_tokens_calculator = image_processor.media_tokens_calculator
 
     def get_hf_processor(self):
         return self.hf_processor
@@ -132,20 +138,15 @@ def get_supported_mm_limits(self) -> Mapping[str, int | None]:
 class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]):
     """Builds dummy inputs for Kimi-K2.5 model profiling."""
 
-    def __init__(self, info: KimiK25ProcessingInfo) -> None:
-        super().__init__(info)
-        self.media_token_id = self.info.media_token_id
-        self.frame_per_chunk = self.info.media_processor.num_frames_per_chunk
-
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_media = mm_counts.get("vision_chunk", 0)
-        return "<|media_pad|>" * num_media
+        return self.info.media_token * num_media
 
     def get_dummy_mm_items(self):
         dummy_videos = self._get_dummy_images(
             height=MaxImageTokenMeta.height,
             width=MaxImageTokenMeta.width,
-            num_images=self.frame_per_chunk,
+            num_images=self.info.image_processor.num_frames_per_chunk,
         )
 
         video_chunk_dummy_item = VisionChunkVideo(
@@ -236,9 +237,6 @@ def get_replacement(item_idx: int):
             ),
         ]
 
-    def split_video_chunks(self, video):
-        return self.info.media_processor.split_video_chunks(video)
-
 
 @MULTIMODAL_REGISTRY.register_processor(
     KimiK25MultiModalProcessor,
diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py
index 5ff9c5f04b5e..1741e18fdda6 100644
--- a/vllm/model_executor/models/nano_nemotron_vl.py
+++ b/vllm/model_executor/models/nano_nemotron_vl.py
@@ -12,6 +12,7 @@
 import warnings
 from collections.abc import Iterable, Mapping, Sequence
 from functools import cached_property
+from io import BytesIO
 from typing import Annotated, Literal, TypeAlias
 
 import torch
@@ -53,7 +54,7 @@
     MultiModalKwargsItems,
     VideoItem,
 )
-from vllm.multimodal.media.audio import extract_audio_from_video_bytes
+from vllm.multimodal.media.audio import load_audio_pyav
 from vllm.multimodal.parse import (
     AudioProcessorItems,
     ImageEmbeddingItems,
@@ -553,7 +554,7 @@ def _extract_audio_from_videos(
                     "video must be loaded with keep_video_bytes=True (e.g. via "
                     "the chat API with a model that sets use_audio_in_video)."
                 )
-            audio_items.append(extract_audio_from_video_bytes(video_bytes))
+            audio_items.append(load_audio_pyav(BytesIO(video_bytes)))
 
         # Create a new VideoProcessorItems with metadata that does not contain
         # the large video bytes, to avoid modifying the input `mm_items`.
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index 8b1455359f57..eaf5843a3516 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -66,9 +66,11 @@
 
 from .interfaces import (
     MultiModalEmbeddings,
+    SupportsEagle3,
     SupportsLoRA,
     SupportsMultiModal,
     SupportsPP,
+    supports_eagle3,
 )
 from .module_mapping import MultiModelKeys
 from .utils import StageMissingLayer, init_vllm_registered_model, maybe_prefix
@@ -262,7 +264,7 @@ def _cached_apply_hf_processor(
     dummy_inputs=PixtralDummyInputsBuilder,
 )
 class PixtralForConditionalGeneration(
-    nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP
+    nn.Module, SupportsLoRA, SupportsEagle3, SupportsMultiModal, SupportsPP
 ):
     @classmethod
     def get_placeholder_str(cls, modality: str, i: int) -> str | None:
@@ -390,6 +392,21 @@ def compute_logits(
     ) -> torch.Tensor | None:
         return self.language_model.compute_logits(hidden_states)
 
+    def _require_language_model_eagle3(self) -> None:
+        if not supports_eagle3(self.language_model):
+            raise RuntimeError(
+                f"EAGLE-3 speculative decoding requires the language model to "
+                f"support EAGLE-3, but {type(self.language_model).__name__} does not."
+            )
+
+    def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None:
+        self._require_language_model_eagle3()
+        self.language_model.set_aux_hidden_state_layers(layers)
+
+    def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]:
+        self._require_language_model_eagle3()
+        return self.language_model.get_eagle3_aux_hidden_state_layers()
+
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         def is_vision_encoder_weights(weight: tuple[str, torch.Tensor]):
             return weight[0].startswith(("vision_encoder", "vision_tower"))
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index 9b1e52722923..c3e7edb7da4a 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -269,6 +269,7 @@
     "HF_ColBERT": ("colbert", "ColBERTModel"),
     "ColBERTModernBertModel": ("colbert", "ColBERTModernBertModel"),
     "ColBERTJinaRobertaModel": ("colbert", "ColBERTJinaRobertaModel"),
+    "ColBERTLfm2Model": ("colbert", "ColBERTLfm2Model"),
     # [Multimodal]
     "ColModernVBertForRetrieval": ("colmodernvbert", "ColModernVBertForRetrieval"),
     "ColPaliForRetrieval": ("colpali", "ColPaliModel"),
diff --git a/vllm/model_executor/models/whisper_causal.py b/vllm/model_executor/models/whisper_causal.py
index 6774ea11dd0c..8e4322ea335d 100644
--- a/vllm/model_executor/models/whisper_causal.py
+++ b/vllm/model_executor/models/whisper_causal.py
@@ -150,8 +150,10 @@ def build(
             new_common_attn_metadata.query_start_loc *= block_pool_size
             new_common_attn_metadata.query_start_loc_cpu *= block_pool_size
             new_common_attn_metadata.seq_lens *= block_pool_size
-            new_common_attn_metadata._seq_lens_cpu *= block_pool_size
-            new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size
+            if new_common_attn_metadata._seq_lens_cpu is not None:
+                new_common_attn_metadata._seq_lens_cpu *= block_pool_size
+            if new_common_attn_metadata._num_computed_tokens_cpu is not None:
+                new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size
             new_common_attn_metadata.num_actual_tokens *= block_pool_size
             new_common_attn_metadata.max_query_len *= block_pool_size
             new_common_attn_metadata.max_seq_len *= block_pool_size
diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py
index 28f066d112ed..0a748a6d15c6 100644
--- a/vllm/multimodal/audio.py
+++ b/vllm/multimodal/audio.py
@@ -12,17 +12,35 @@
 from vllm.utils.import_utils import PlaceholderModule
 
 try:
-    import librosa
+    import av as av
 except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+    av = PlaceholderModule("av")  # type: ignore[assignment]
 
+try:
+    import resampy
+except ImportError:
+    resampy = PlaceholderModule("resampy")  # type: ignore[assignment]
 
 try:
     import scipy.signal as scipy_signal
 except ImportError:
     scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal")  # type: ignore[assignment]
 
+
 # ============================================================
+# Aligned with `librosa.get_duration` function
+def get_audio_duration(*, y: npt.NDArray[np.floating], sr: float = 22050) -> float:
+    """Get the duration of an audio array in seconds.
+
+    Args:
+        y: Audio time series. Can be 1D (samples,) or 2D (channels, samples).
+        sr: Sample rate of the audio in Hz.
+
+    Returns:
+        Duration of the audio in seconds.
+    """
+    n_samples = y.shape[-1]
+    return float(n_samples) / sr
 
 
 class ChannelReduction(str, Enum):
@@ -153,13 +171,71 @@ def normalize_audio(
 # ============================================================
 
 
-def resample_audio_librosa(
+def resample_audio_pyav(
     audio: npt.NDArray[np.floating],
     *,
     orig_sr: float,
     target_sr: float,
 ) -> npt.NDArray[np.floating]:
-    return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
+    """Resample audio using PyAV (libswresample via FFmpeg).
+
+    Args:
+        audio: Input audio. Can be:
+            - 1D array ``(samples,)``: mono audio
+            - 2D array ``(channels, samples)``: stereo audio
+        orig_sr: Original sample rate in Hz.
+        target_sr: Target sample rate in Hz.
+
+    Returns:
+        Resampled audio with the same shape as the input (1D → 1D, 2D → 2D).
+    """
+    orig_sr_int = int(round(orig_sr))
+    target_sr_int = int(round(target_sr))
+
+    if orig_sr_int == target_sr_int:
+        return audio
+
+    if audio.ndim == 2:
+        # Resample each channel independently and re-stack.
+        return np.stack(
+            [
+                resample_audio_pyav(ch, orig_sr=orig_sr, target_sr=target_sr)
+                for ch in audio
+            ],
+            axis=0,
+        )
+
+    expected_len = int(math.ceil(audio.shape[-1] * target_sr_int / orig_sr_int))
+
+    # from_ndarray expects shape (channels, samples) for planar formats.
+    # libswresample requires a minimum number of input samples to produce
+    # output frames; pad short inputs with zeros so we always get output,
+    # then trim to the expected output length.
+    _MIN_SAMPLES = 1024
+    audio_f32 = np.asarray(audio, dtype=np.float32)
+    if len(audio_f32) < _MIN_SAMPLES:
+        audio_f32 = np.pad(audio_f32, (0, _MIN_SAMPLES - len(audio_f32)))
+    audio_f32 = audio_f32.reshape(1, -1)
+
+    resampler = av.AudioResampler(format="fltp", layout="mono", rate=target_sr_int)
+
+    frame = av.AudioFrame.from_ndarray(audio_f32, format="fltp", layout="mono")
+    frame.sample_rate = orig_sr_int
+
+    out_frames = resampler.resample(frame)
+    out_frames.extend(resampler.resample(None))  # flush buffered samples
+
+    result = np.concatenate([f.to_ndarray() for f in out_frames], axis=1).squeeze(0)
+    return result[:expected_len]
+
+
+def resample_audio_resampy(
+    audio: npt.NDArray[np.floating],
+    *,
+    orig_sr: float,
+    target_sr: float,
+) -> npt.NDArray[np.floating]:
+    return resampy.resample(audio, sr_orig=orig_sr, sr_new=target_sr)
 
 
 def resample_audio_scipy(
@@ -167,7 +243,7 @@ def resample_audio_scipy(
     *,
     orig_sr: float,
     target_sr: float,
-):
+) -> npt.NDArray[np.floating]:
     if orig_sr > target_sr:
         return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr)
     elif orig_sr < target_sr:
@@ -181,7 +257,7 @@ class AudioResampler:
     def __init__(
         self,
         target_sr: float | None = None,
-        method: Literal["librosa", "scipy"] = "librosa",
+        method: Literal["pyav", "resampy", "scipy"] = "resampy",
     ):
         self.target_sr = target_sr
         self.method = method
@@ -203,8 +279,10 @@ def resample(
             abs_tol=1e-6,
         ):
             return audio
-        if self.method == "librosa":
-            return resample_audio_librosa(
+        if self.method == "pyav":
+            return resample_audio_pyav(audio, orig_sr=orig_sr, target_sr=self.target_sr)
+        if self.method == "resampy":
+            return resample_audio_resampy(
                 audio, orig_sr=orig_sr, target_sr=self.target_sr
             )
         elif self.method == "scipy":
@@ -214,7 +292,7 @@ def resample(
         else:
             raise ValueError(
                 f"Invalid resampling method: {self.method}. "
-                "Supported methods are 'librosa' and 'scipy'."
+                "Supported methods are 'pyav', 'resampy' and 'scipy'."
             )
 
 
diff --git a/vllm/multimodal/media/audio.py b/vllm/multimodal/media/audio.py
index 88dcb0b0186a..ae0a9f55bdce 100644
--- a/vllm/multimodal/media/audio.py
+++ b/vllm/multimodal/media/audio.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
 from io import BytesIO
 from pathlib import Path
 
@@ -14,58 +15,80 @@
 from .base import MediaIO
 
 try:
-    import librosa
+    import av
 except ImportError:
-    librosa = PlaceholderModule("librosa")  # type: ignore[assignment]
+    av = PlaceholderModule("av")  # type: ignore[assignment]
 
 try:
     import soundfile
 except ImportError:
     soundfile = PlaceholderModule("soundfile")  # type: ignore[assignment]
 
+
 try:
-    import av
+    import resampy
 except ImportError:
-    av = PlaceholderModule("av")  # type: ignore[assignment]
+    resampy = PlaceholderModule("resampy")  # type: ignore[assignment]
 
 
-def extract_audio_from_video_bytes(
-    data: bytes,
-) -> tuple[npt.NDArray, float]:
-    """Extract the audio track from raw video bytes using PyAV.
+# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile
+# being librosa's main backend. Used to validate if an audio loading error is due to a
+# server error vs a client error (invalid audio file).
+# 1 = unrecognised format      (file is not a supported audio container)
+# 3 = malformed file           (corrupt or structurally invalid audio)
+# 4 = unsupported encoding     (codec not supported by this libsndfile build)
+_BAD_SF_CODES = {1, 3, 4}
 
-    PyAV wraps FFmpeg's C libraries in-process — no subprocess is
-    spawned, which is critical to avoid crashing CUDA-active vLLM
-    worker processes.
 
-    The returned waveform is at the native sample rate of the video's
-    audio stream.  Resampling to a model-specific rate is left to the
-    downstream :class:`AudioResampler` in the parsing pipeline.
+def load_audio_pyav(
+    path: BytesIO | Path | str,
+    *,
+    sr: float | None = 22050,
+    mono: bool = True,
+) -> tuple[npt.NDArray, float]:
+    """Load an audio file using PyAV (FFmpeg) as a float32 waveform.
+
+    The first audio stream is decoded and, when ``sr`` is given and differs
+    from the stream's native rate, resampled to ``sr`` (pass ``sr=None`` to
+    keep the native rate). With ``mono=True``, channels are averaged to mono.
 
     Args:
-        data: Raw video file bytes (e.g. from an mp4 file).
+        path: A :class:`~io.BytesIO` buffer, a filesystem
+            :class:`~pathlib.Path`, or a string path.
 
     Returns:
-        A tuple of ``(waveform, sample_rate)`` suitable for use as an
-        :class:`AudioItem`.
+        ``(waveform, sample_rate)`` where *waveform* is a float32 NumPy
+        array and *sample_rate* is ``sr``, or the native rate if ``sr`` is None.
     """
-    if data is None or len(data) == 0:
-        raise ValueError(
-            "Cannot extract audio: video bytes are missing or empty. "
-            "Ensure video was loaded with keep_video_bytes=True for "
-            "audio-in-video extraction."
-        )
+    native_sr = None
     try:
-        with av.open(BytesIO(data)) as container:
+        with av.open(path) as container:
             if not container.streams.audio:
-                raise ValueError("No audio stream found in the video.")
+                raise ValueError("No audio stream found.")
             stream = container.streams.audio[0]
+            stream.thread_type = "AUTO"
             native_sr = stream.rate
+            sr = sr or native_sr
 
             chunks: list[npt.NDArray] = []
-            for frame in container.decode(audio=0):
-                arr = frame.to_ndarray()
-                chunks.append(arr.mean(axis=0) if arr.ndim > 1 else arr)
+            needs_resampling = not math.isclose(
+                float(sr),
+                float(native_sr),
+                rel_tol=0.0,
+                abs_tol=1e-6,
+            )
+            resampler = (
+                av.AudioResampler(format="fltp", layout="mono", rate=sr)
+                if needs_resampling
+                else None
+            )
+            for frame in container.decode(stream):
+                if resampler is None:
+                    chunks.append(frame.to_ndarray())
+                else:
+                    chunks.extend(f.to_ndarray() for f in resampler.resample(frame))
+            if resampler is not None:  # flush samples the resampler still buffers
+                chunks.extend(f.to_ndarray() for f in resampler.resample(None))
     except ValueError:
         raise
     except Exception as e:
@@ -77,37 +100,54 @@ def extract_audio_from_video_bytes(
     if not chunks:
         raise ValueError("No audio found in the video.")
 
-    audio = np.concatenate(chunks).astype(np.float32)
-    return audio, float(native_sr)
+    audio = np.concatenate(chunks, axis=-1).astype(np.float32)
+    if mono and audio.ndim > 1:
+        audio = np.mean(audio, axis=0)
 
+    return audio, sr
 
-def is_video(data: bytes) -> bool:
-    """Check if the fetched bytes are video"""
-    if len(data) < 12:
-        return False
 
-    box_type = data[4:8]
-    major_brand = data[8:12]
+def load_audio_soundfile(
+    path: BytesIO | Path | str,
+    *,
+    sr: float | None = 22050,
+    mono: bool = True,
+) -> tuple[np.ndarray, int]:
+    """Load audio via soundfile"""
+    with soundfile.SoundFile(path) as f:
+        native_sr = f.samplerate
+        y = f.read(dtype="float32", always_2d=False).T
 
-    MP4_BRANDS = {
-        b"mp41",
-        b"mp42",  # MP4
-        b"isom",  # ISO Base Media
-        b"iso2",
-        b"iso4",
-        b"iso5",
-        b"iso6",
-        b"M4V ",
-        b"M4A ",  # Apple
-        b"avc1",  # H.264
-        b"dash",  # DASH
-        b"mmp4",
-        b"MSNV",
-    }
+    if mono and y.ndim > 1:
+        y = np.mean(y, axis=tuple(range(y.ndim - 1)))
 
-    is_avi = data[:4] == b"RIFF" and major_brand == b"AVI "
-    is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS
-    return is_mp4 or is_avi
+    if sr is not None and sr != native_sr:
+        y = resampy.resample(y, sr_orig=native_sr, sr_new=sr)
+        return y, int(sr)
+    return y, native_sr
+
+
+def load_audio(
+    path: BytesIO | Path | str,
+    *,
+    sr: float | None = 22050,
+    mono: bool = True,
+):
+    try:
+        return load_audio_soundfile(path, sr=sr, mono=mono)
+    except soundfile.LibsndfileError as exc:
+        # Only fall back to PyAV when libsndfile rejected the file itself
+        # (see _BAD_SF_CODES); re-raise any other libsndfile error.
+        if exc.code not in _BAD_SF_CODES:
+            raise
+        # soundfile may have advanced the BytesIO seek position before failing;
+        # reset it so PyAV can read from the beginning.
+        if isinstance(path, BytesIO):
+            path.seek(0)
+        try:
+            return load_audio_pyav(path, sr=sr, mono=mono)
+        except Exception as pyav_exc:
+            raise ValueError("Invalid or unsupported audio file.") from pyav_exc
 
 
 class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
@@ -128,9 +168,7 @@ def __init__(self, **kwargs) -> None:
         self.kwargs = kwargs
 
     def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
-        if is_video(data):
-            return extract_audio_from_video_bytes(data)
-        return librosa.load(BytesIO(data), sr=None)
+        return load_audio(BytesIO(data), sr=None)
 
     def load_base64(
         self,
@@ -140,7 +178,7 @@ def load_base64(
         return self.load_bytes(pybase64.b64decode(data))
 
     def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
-        return librosa.load(filepath, sr=None)
+        return load_audio(filepath, sr=None)
 
     def encode_base64(
         self,
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 6a588dad0207..9e1774e3921b 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -497,7 +497,7 @@ def __init__(
         *,
         target_sr: float | None = None,
         target_channels: int | None = None,
-        audio_resample_method: Literal["librosa", "scipy"] = "librosa",
+        audio_resample_method: Literal["pyav", "resampy", "scipy"] = "pyav",
         video_needs_metadata: bool = False,
         expected_hidden_size: int | None = None,
     ) -> None:
diff --git a/vllm/parser/parser_manager.py b/vllm/parser/parser_manager.py
index 4331eba9884f..5577dfb1d8bb 100644
--- a/vllm/parser/parser_manager.py
+++ b/vllm/parser/parser_manager.py
@@ -199,7 +199,7 @@ def get_tool_parser(
         parser: type[ToolParser] | None = None
         if not enable_auto_tools or tool_parser_name is None:
             return parser
-        logger.info('"auto" tool choice has been enabled.')
+        logger.info_once('"auto" tool choice has been enabled.')
 
         try:
             if (
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index 7070fd0b604d..50a79cbb0b8d 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -511,6 +511,11 @@ def support_hybrid_kv_cache(cls) -> bool:
     def support_static_graph_mode(cls) -> bool:
         return True
 
+    @classmethod
+    def support_deep_gemm(cls) -> bool:
+        """Currently, only Hopper and Blackwell GPUs are supported."""
+        return cls.is_device_capability(90) or cls.is_device_capability_family(100)
+
     @classmethod
     def num_compute_units(cls, device_id: int = 0) -> int:
         return torch.cuda.get_device_properties(device_id).multi_processor_count
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index 619b403ba4c1..39688bb8b235 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -712,6 +712,13 @@ def support_static_graph_mode(cls) -> bool:
         """
         return False
 
+    @classmethod
+    def support_deep_gemm(cls) -> bool:
+        """
+        Returns if DeepGEMM is supported by the current platform.
+        """
+        return False
+
     @classmethod
     def use_custom_op_collectives(cls) -> bool:
         """
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 3c5f8a0795c5..29d7d5ce8592 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -28,6 +28,7 @@
     from amdsmi import (
         AmdSmiException,
         amdsmi_get_gpu_asic_info,
+        amdsmi_get_gpu_device_uuid,
         amdsmi_get_processor_handles,
         amdsmi_init,
         amdsmi_shut_down,
@@ -377,7 +378,6 @@ class RocmPlatform(Platform):
         "fbgemm_fp8",
         "gguf",
         "quark",
-        "ptpc_fp8",
         "mxfp4",
         "petit_nvfp4",
         "torchao",
@@ -608,6 +608,31 @@ def get_device_name(cls, device_id: int = 0) -> str:
             return _ROCM_DEVICE_ID_NAME_MAP[device_name]
         return asic_info["market_name"]
 
+    @classmethod
+    @with_amdsmi_context
+    def get_device_uuid(cls, device_id: int = 0) -> str:
+        """Return the UUID that amdsmi reports for a GPU.
+
+        Args:
+            device_id: Index into the list returned by
+                `amdsmi_get_processor_handles()`.
+
+        Returns:
+            The device UUID, or an empty string if either amdsmi query
+            raises `AmdSmiException` (the failure is logged).
+        """
+        try:
+            device = amdsmi_get_processor_handles()[device_id]
+        except AmdSmiException as error:
+            logger.error("amdsmi device query failed ", exc_info=error)
+            return ""
+        try:
+            return amdsmi_get_gpu_device_uuid(device)
+        except AmdSmiException as error:
+            logger.error("amdsmi device uuid query failed ", exc_info=error)
+            # Fall back to an empty UUID, matching the handle-lookup failure path.
+            return ""
+
     @classmethod
     def get_device_total_memory(cls, device_id: int = 0) -> int:
         device_props = torch.cuda.get_device_properties(device_id)
diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py
index e5e993b75556..b347ec831abc 100644
--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
@@ -7,9 +7,12 @@
 import msgspec
 
 from vllm.config import ModelConfig, PoolerConfig
+from vllm.logger import init_logger
 from vllm.sampling_params import RequestOutputKind
 from vllm.tasks import PoolingTask
 
+logger = init_logger(__name__)
+
 
 class LateInteractionParams(
     msgspec.Struct,
@@ -54,10 +57,6 @@ class PoolingParams(
     dimensions: int | None = None
     # --8<-- [end:embed-pooling-params]
 
-    ## for classification, scoring and rerank
-    # --8<-- [start:classify-pooling-params]
-    # --8<-- [end:classify-pooling-params]
-
     ## for step pooling models
     step_tag_id: int | None = None
     returned_token_ids: list[int] | None = None
@@ -79,7 +78,6 @@ def valid_parameters(self):
         return {
             "embed": ["dimensions", "use_activation"],
             "classify": ["use_activation"],
-            "score": ["use_activation"],
             "token_embed": ["dimensions", "use_activation"],
             "token_classify": ["use_activation"],
         }
@@ -89,6 +87,13 @@ def clone(self) -> "PoolingParams":
         return deepcopy(self)
 
     def verify(self, model_config: ModelConfig) -> None:
+        if self.task == "score":
+            logger.warning_once(
+                "`score` task is deprecated and will be removed in v0.20. "
+                "Please use `classify` instead."
+            )
+            self.task = "classify"
+
         # plugin task uses io_processor.parse_request to verify inputs,
         # skipping PoolingParams verify
         if self.task == "plugin":
@@ -184,7 +189,7 @@ def _set_default_parameters(self, model_config: ModelConfig):
                 elif self.dimensions < 1:
                     raise ValueError("Dimensions must be greater than 0")
 
-        elif self.task in ["classify", "score", "token_classify"]:
+        elif self.task in ["classify", "token_classify"]:
             if self.use_activation is None:
                 self.use_activation = True
         else:
diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py
index b468712adb0c..63946e8fdd22 100644
--- a/vllm/renderers/base.py
+++ b/vllm/renderers/base.py
@@ -172,9 +172,6 @@ def warmup(self, chat_params: ChatParams) -> None:
 
         For chat requests:
         - Jinja2 template compilation
-
-        For multi-modal requests:
-        - Importing libraries such as librosa triggers JIT compilation.
         """
         from vllm.entrypoints.chat_utils import ChatTemplateResolutionError
 
diff --git a/vllm/tasks.py b/vllm/tasks.py
index 83dd7f85eee0..4e324c188519 100644
--- a/vllm/tasks.py
+++ b/vllm/tasks.py
@@ -8,7 +8,6 @@
 PoolingTask = Literal[
     "embed",
     "classify",
-    "score",
     "token_embed",
     "token_classify",
     "plugin",
@@ -16,10 +15,6 @@
 ]
 POOLING_TASKS: tuple[PoolingTask, ...] = get_args(PoolingTask)
 
-# Score API handles score/rerank for:
-# - "score" task (score_type: cross-encoder models)
-# - "embed" task (score_type: bi-encoder models)
-# - "token_embed" task (score_type: late interaction models)
 ScoreType = Literal["bi-encoder", "cross-encoder", "late-interaction"]
 
 FrontendTask = Literal["render"]
diff --git a/vllm/transformers_utils/configs/colpali.py b/vllm/transformers_utils/configs/colpali.py
index f64aa7564fd6..c40c58b25ce1 100644
--- a/vllm/transformers_utils/configs/colpali.py
+++ b/vllm/transformers_utils/configs/colpali.py
@@ -27,7 +27,6 @@ def __init__(
         embedding_dim: int | None = None,
         embed_dim: int | None = None,
         dim: int | None = None,
-        projection_dim: int | None = None,
         colbert_dim: int | None = None,
         pooling: str | None = None,
         vlm_config: dict | None = None,
@@ -37,7 +36,6 @@ def __init__(
         self.embedding_dim = embedding_dim
         self.embed_dim = embed_dim
         self.dim = dim
-        self.projection_dim = projection_dim
         self.colbert_dim = colbert_dim
         self.pooling = pooling
 
diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py
index 822e8cdd0bcf..80fedd1017ca 100644
--- a/vllm/transformers_utils/configs/deepseek_vl2.py
+++ b/vllm/transformers_utils/configs/deepseek_vl2.py
@@ -90,8 +90,6 @@ def __init__(
 class DeepseekVLV2Config(PretrainedConfig):
     model_type = "deepseek_vl_v2"
     architectures: list[str] | None = None
-    vision_config: VisionEncoderConfig
-    projector_config: MlpProjectorConfig
 
     tile_tag: str = "2D"
     global_view_pos: str = "head"
diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py
index 90728bbffb60..bdeadec1bf07 100644
--- a/vllm/transformers_utils/configs/mistral.py
+++ b/vllm/transformers_utils/configs/mistral.py
@@ -257,7 +257,6 @@ def _remap_mistral_audio_args(config: dict) -> dict:
             encoder_attention_heads=encoder_args["n_heads"],
             encoder_head_dim=encoder_args["head_dim"],
             vocab_size=encoder_args["vocab_size"],
-            max_source_positions=encoder_args["max_source_positions"],
             is_encoder_decoder=False,  # Override WhisperConfig default
             is_causal=encoder_args.get("causal", False),
             sliding_window=encoder_args.get("sliding_window", None),
@@ -270,6 +269,10 @@ def _remap_mistral_audio_args(config: dict) -> dict:
             max_position_embeddings=block_pool_size * config["max_position_embeddings"],
         ),
     }
+    # Sometimes max_source_positions is explicitly set to None in params.json but this
+    # is not a valid value for WhisperConfig (or downstream code that uses it).
+    if (max_source_positions := encoder_args.get("max_source_positions")) is not None:
+        config["audio_config"].max_source_positions = max_source_positions
     if quant_config:
         config["quantization_config"] = quant_config
     return config
diff --git a/vllm/transformers_utils/configs/parakeet.py b/vllm/transformers_utils/configs/parakeet.py
index efd4c466478b..7c7a5ddd800e 100644
--- a/vllm/transformers_utils/configs/parakeet.py
+++ b/vllm/transformers_utils/configs/parakeet.py
@@ -6,11 +6,21 @@
 
 
 class ParakeetConfig(ParakeetEncoderConfig):
-    llm_hidden_size: int
-    projection_hidden_size: int
-    projection_bias: bool
-    projection_eps: float = 1e-5
-    sampling_rate: int
+    def __init__(
+        self,
+        llm_hidden_size: int,
+        projection_hidden_size: int,
+        projection_bias: bool,
+        sampling_rate: int,
+        projection_eps: float = 1e-5,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.llm_hidden_size = llm_hidden_size
+        self.projection_hidden_size = projection_hidden_size
+        self.projection_bias = projection_bias
+        self.sampling_rate = sampling_rate
+        self.projection_eps = projection_eps
 
     @staticmethod
     def from_hf_config(
diff --git a/vllm/transformers_utils/configs/qwen3_asr.py b/vllm/transformers_utils/configs/qwen3_asr.py
index 28fa96e72f40..a08b2b7de34e 100644
--- a/vllm/transformers_utils/configs/qwen3_asr.py
+++ b/vllm/transformers_utils/configs/qwen3_asr.py
@@ -408,7 +408,6 @@ def __init__(
         support_languages=None,
         **kwargs,
     ):
-        super().__init__(**kwargs)
         if thinker_config is None:
             thinker_config = {}
             logger.info(
@@ -417,6 +416,7 @@ def __init__(
 
         self.thinker_config = Qwen3ASRThinkerConfig(**thinker_config)
         self.support_languages = support_languages
+        super().__init__(**kwargs)
 
     def get_text_config(self, decoder=False) -> "PretrainedConfig":
         """
diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py
index 2a39e2f16b06..697c9d52e81b 100644
--- a/vllm/transformers_utils/configs/speculators/base.py
+++ b/vllm/transformers_utils/configs/speculators/base.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
+from dataclasses import fields, is_dataclass
 from typing import Any
 
 from transformers import PretrainedConfig
@@ -15,11 +16,21 @@ class SpeculatorsConfig(PretrainedConfig):
     model_type = "speculators"
 
     def __init__(self, **kwargs):
-        """In Transformers v5, `PretrainedConfig` is decorated with `dataclass` and
-        `huggingface_hub.dataclasses.strict(accept_kwargs=True)`.
-        Inheriting classes do not inherit the `accept_kwargs=True` behaviour so we must
-        explicitly pass any kwargs to `PretrainedConfig.__init__`."""
-        super().__init__(**kwargs)
+        # Transformers v4 - super().__init__ sets all kwargs as attributes
+        if not is_dataclass(PretrainedConfig):
+            return super().__init__(**kwargs)
+        # Transformers v5 - super().__init__ performs some validation before
+        # setting all kwargs as attributes, so we set them first to be safe
+        pre_trained_config_fields = {f.name for f in fields(PretrainedConfig)}
+        super_kwargs = dict()
+        for key, value in kwargs.items():
+            if key == "model_type":
+                continue  # model_type is set as a class variable, so skip it here
+            elif key in pre_trained_config_fields:
+                super_kwargs[key] = value
+            else:
+                setattr(self, key, value)
+        super().__init__(**super_kwargs)
 
     @classmethod
     def from_pretrained(
diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py
index 26fc0404200f..f5fb290d1491 100644
--- a/vllm/transformers_utils/model_arch_config_convertor.py
+++ b/vllm/transformers_utils/model_arch_config_convertor.py
@@ -228,7 +228,7 @@ def is_deepseek_mla(self) -> bool:
             "pangu_ultra_moe_mtp",
             "bailing_hybrid",
         ):
-            return self.hf_text_config.kv_lora_rank is not None
+            return getattr(self.hf_text_config, "kv_lora_rank", None) is not None
         elif self.hf_text_config.model_type == "eagle":
             # if the model is an EAGLE module, check for the
             # underlying architecture
@@ -241,7 +241,7 @@ def is_deepseek_mla(self) -> bool:
                     "deepseek_v32",
                     "deepseek_mtp",
                 )
-                and self.hf_text_config.kv_lora_rank is not None
+                and getattr(self.hf_text_config, "kv_lora_rank", None) is not None
             )
         return False
 
diff --git a/vllm/transformers_utils/processors/fireredasr2.py b/vllm/transformers_utils/processors/fireredasr2.py
index 4bde53015003..bba7e7ee0495 100644
--- a/vllm/transformers_utils/processors/fireredasr2.py
+++ b/vllm/transformers_utils/processors/fireredasr2.py
@@ -188,7 +188,7 @@ def padding_position_is_0(padded_input, input_lengths):
         for speech in raw_speech:
             """
             We must multiply by 32768 here because FireRedASR2 loads audio data
-            using kaldiio.load_mat, while vLLM loads audio data using librosa.
+            using kaldiio.load_mat, while vLLM loads audio data using pyav.
             """
             speech = speech * 32768
             fbank = self.fbank(sampling_rate, speech)
diff --git a/vllm/transformers_utils/processors/isaac.py b/vllm/transformers_utils/processors/isaac.py
index 986b70840d25..1464afc6677f 100644
--- a/vllm/transformers_utils/processors/isaac.py
+++ b/vllm/transformers_utils/processors/isaac.py
@@ -6,12 +6,14 @@
 from typing import Any
 
 import numpy as np
-import PIL.Image
 import torch
 import torch.nn.functional as F
+from PIL import Image
 from transformers import BatchFeature, ProcessorMixin, TensorType
 from typing_extensions import TypedDict, Unpack
 
+from vllm.tokenizers.hf import HfTokenizer
+
 MAX_PIXELS = 60_000_000  # 60-megapixel ceiling ≈ 8200 × 7300 px
 
 # Vision preprocessing constants
@@ -39,7 +41,7 @@ def _make_writeable(arr: np.ndarray) -> np.ndarray:
         return arr.copy()
 
 
-def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None:
+def extract_image_pil(image: Image.Image) -> torch.Tensor:
     if image.width * image.height > MAX_PIXELS:
         raise ValueError(
             f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`"
@@ -314,31 +316,30 @@ class IsaacImageProcessorKwargs(TypedDict, total=False):
 
 
 class IsaacImageProcessor:
-    patch_size = 16
-    max_num_patches = 6144
-    min_num_patches = 256
-    pixel_shuffle_scale = 2
-
     valid_kwargs = IsaacImageProcessorKwargs
     model_input_names = ["pixel_values", "image_grid_thw"]
 
-    def __init__(self, kwargs):
-        self.patch_size = kwargs.pop("patch_size", self.patch_size)
-        self.vision_max_num_patches = kwargs.pop(
-            "vision_max_num_patches", self.max_num_patches
-        )
-        self.vision_min_num_patches = kwargs.pop(
-            "vision_min_num_patches", self.min_num_patches
-        )
-        self.pixel_shuffle_scale = kwargs.pop("pixel_shuffle_scale", 2)
-
-    def preprocess(
+    def __init__(
         self,
-        images: list[torch.Tensor],
-        return_tensors: str | TensorType | None,
+        patch_size: int = 16,
+        vision_max_num_patches: int = 6144,
+        vision_min_num_patches: int = 256,
+        pixel_shuffle_scale: int = 2,
+    ) -> None:
+        self.patch_size = patch_size
+        self.vision_max_num_patches = vision_max_num_patches
+        self.vision_min_num_patches = vision_min_num_patches
+        self.pixel_shuffle_scale = pixel_shuffle_scale
+
+    def __call__(
+        self,
+        images: Image.Image | list[Image.Image],
+        return_tensors: str | TensorType | None = None,
         **kwargs: Unpack[IsaacImageProcessorKwargs],
     ) -> BatchFeature:
         """Preprocess images into format compatible with vLLM input processing."""
+        if not isinstance(images, list):
+            images = [images]
 
         all_pixel_values: list[torch.Tensor] = []
         all_image_grids: list[torch.Tensor] = []
@@ -388,23 +389,40 @@ def preprocess(
 class IsaacProcessor(ProcessorMixin):
     attributes = ["image_processor", "tokenizer"]
 
-    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
-        self.image_token = kwargs.pop("image_token", "<image>")
+    def __init__(
+        self,
+        image_processor: IsaacImageProcessor,
+        tokenizer: HfTokenizer,
+        image_token: str = "<image>",
+    ):
         self.image_processor = image_processor
         self.tokenizer = tokenizer
 
-    def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
-        result = {}
+        self.image_token = image_token
 
+    def __call__(
+        self,
+        text: str | list[str] | None = None,
+        images: Image.Image | list[Image.Image] | None = None,
+        return_tensors: str | TensorType | None = None,
+        **kwargs,
+    ) -> BatchFeature:
         if images is not None:
-            image_inputs = self.image_processor.preprocess(images, **kwargs)
+            image_inputs = self.image_processor(
+                images,
+                return_tensors=return_tensors,
+                **kwargs,
+            )
             image_grid_thw = image_inputs["image_grid_thw"]
-            result.update(image_inputs)
+        else:
+            image_inputs = {}
+            image_grid_thw = []
 
-            if text is not None:
-                if not isinstance(text, list):
-                    text = [text]
+        if text is not None:
+            if not isinstance(text, list):
+                text = [text]
 
+            if image_inputs:
                 text = text.copy()  # below lines change text in-place
                 merge_length = self.image_processor.pixel_shuffle_scale**2
                 index = 0
@@ -417,10 +435,14 @@ def __call__(self, text=None, images=None, **kwargs) -> BatchFeature:
                         index += 1
                     text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>")
 
-        if text is not None:
-            result.update(self.tokenizer(text, **kwargs))
+            text_inputs = self.tokenizer(text, return_tensors=return_tensors)
+        else:
+            text_inputs = {}
 
-        return BatchFeature(result)
+        return BatchFeature(
+            data={**text_inputs, **image_inputs},
+            tensor_type=return_tensors,
+        )
 
     def apply_chat_template(
         self,
diff --git a/vllm/transformers_utils/processors/kimi_k25.py b/vllm/transformers_utils/processors/kimi_k25.py
index 06147f2113dd..edee9734ce42 100644
--- a/vllm/transformers_utils/processors/kimi_k25.py
+++ b/vllm/transformers_utils/processors/kimi_k25.py
@@ -1,38 +1,41 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import torch
-from transformers import BatchFeature
+from transformers import BaseImageProcessor, BatchFeature, TensorType
 from transformers.processing_utils import ProcessorMixin
 
 from vllm.multimodal.inputs import VisionChunk
+from vllm.tokenizers.hf import HfTokenizer
 
 
 class KimiK25Processor(ProcessorMixin):
-    attributes = ["tokenizer"]
-    tokenizer_class = "AutoTokenizer"
+    attributes = ["image_processor", "tokenizer"]
 
     def __init__(
-        self, media_processor=None, tokenizer=None, media_token_id: int | None = None
-    ):
-        super().__init__(tokenizer)
-        self.media_processor = media_processor
+        self,
+        image_processor: BaseImageProcessor,
+        tokenizer: HfTokenizer,
+        media_token_id: int,
+    ) -> None:
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+
         self.media_token_id = media_token_id
-        assert self.media_token_id is not None
 
     def __call__(
         self,
+        text: str | list[str] | None = None,
         vision_chunks: list[VisionChunk] | None = None,
-        *,
-        text: list[int] | str,
+        return_tensors: str | TensorType | None = None,
         **kwargs,
     ) -> BatchFeature:
         """
         Args:
-            vision_chunks: List of VisionChunk items to be processed.
-                For image: VisionChunkImage with type='image', image=PIL.Image
-                For video_chunk: VisionChunkVideo with type='video_chunk',
-                  video_chunk=list[PIL.Image]
-            text: The token ids to be fed to a model (required).
+            text: The text to be fed to the model.
+            vision_chunks: List of `VisionChunk` items to be processed.
+                For image: `VisionChunkImage` with
+                  `type='image', image=PIL.Image`
+                For video_chunk: `VisionChunkVideo` with
+                  `type='video_chunk', video_chunk=list[PIL.Image]`
         Returns:
             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
@@ -42,31 +45,44 @@ def __call__(
             - **grid_thws** -- list of image 3D grid in LLM.
               Returned when `vision_chunks` is not `None`.
         """
-        mm_inputs = {}
-        input_ids = self.tokenizer.encode(text) if isinstance(text, str) else text
         if vision_chunks is not None:
-            assert isinstance(vision_chunks, list)
-            mm_inputs = self.media_processor.preprocess(vision_chunks)
+            mm_inputs = self.image_processor.preprocess(
+                vision_chunks,
+                return_tensors=return_tensors,
+            )
+        else:
+            mm_inputs = {}
+
+        if text is not None:
+            if not isinstance(text, list):
+                text = [text]
+
+            text_inputs = self.tokenizer(text)
+
+            # Note: Modify in-place
+            input_ids: list[list[int]] = text_inputs["input_ids"]  # type: ignore
+
+            if vision_chunks is not None:
+                num_tokens_per_chunk = [
+                    self.image_processor.media_tokens_calculator(chunk)
+                    for chunk in vision_chunks
+                ]
 
-            num_tokens_per_chunk = [
-                self.media_processor.media_tokens_calculator(chunk)
-                for chunk in vision_chunks
-            ]
+                for i in range(len(input_ids)):
+                    new_input_ids = []
+                    for token in input_ids[i]:
+                        if token == self.media_token_id:
+                            new_input_ids.extend(
+                                [self.media_token_id] * num_tokens_per_chunk.pop(0)
+                            )
+                        else:
+                            new_input_ids.append(token)
 
-            new_input_ids = []
-            for token in input_ids:
-                if token == self.media_token_id:
-                    new_input_ids.extend(
-                        [self.media_token_id] * num_tokens_per_chunk.pop(0)
-                    )
-                else:
-                    new_input_ids.append(token)
-            input_ids = new_input_ids
+                    input_ids[i] = new_input_ids
+        else:
+            text_inputs = {}
 
-        # XXX: _apply_hf_processor_text_mm will call tolist() on input_ids
         return BatchFeature(
-            data={
-                "input_ids": torch.tensor([input_ids]),
-                **mm_inputs,
-            }
+            data={**text_inputs, **mm_inputs},
+            tensor_type=return_tensors,
         )
diff --git a/vllm/transformers_utils/processors/step3_vl.py b/vllm/transformers_utils/processors/step3_vl.py
index 66cf10e39588..71540f433fd1 100644
--- a/vllm/transformers_utils/processors/step3_vl.py
+++ b/vllm/transformers_utils/processors/step3_vl.py
@@ -286,11 +286,9 @@ def _convert_images_to_pixel_values(
 
     def __call__(
         self,
-        images: Image.Image | list[Image.Image] | None = None,
+        images: Image.Image | list[Image.Image],
         return_tensors: str | TensorType | None = None,
     ) -> BatchFeature:
-        if images is None:
-            images = []
         if not isinstance(images, list):
             images = [images]
 
diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py
index ee104a6cc75c..fb6208212ae9 100644
--- a/vllm/utils/deep_gemm.py
+++ b/vllm/utils/deep_gemm.py
@@ -70,10 +70,7 @@ def is_deep_gemm_supported() -> bool:
     """Return `True` if DeepGEMM is supported on the current platform.
     Currently, only Hopper and Blackwell GPUs are supported.
     """
-    is_supported_arch = current_platform.is_cuda() and (
-        current_platform.is_device_capability(90)
-        or current_platform.is_device_capability_family(100)
-    )
+    is_supported_arch = current_platform.support_deep_gemm()
     return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch
 
 
diff --git a/vllm/v1/attention/backend.py b/vllm/v1/attention/backend.py
index d7283b6c846f..cd49ea30e6f4 100644
--- a/vllm/v1/attention/backend.py
+++ b/vllm/v1/attention/backend.py
@@ -362,6 +362,11 @@ class CommonAttentionMetadata:
     dcp_local_seq_lens_cpu: torch.Tensor | None = None
     """Sequence lengths of the local rank in decode context parallelism world"""
 
+    is_prefilling: torch.Tensor | None = None
+    """(batch_size,) bool tensor: True if request is still in prefill phase
+    (num_computed_tokens < num_prompt_tokens). Used by some backends to
+    distinguish actual decodes from short extends."""
+
     # WARNING: Deprecated fields. Will be removed in a future release (v0.15.0)
     _seq_lens_cpu: torch.Tensor | None = None
     _num_computed_tokens_cpu: torch.Tensor | None = None
@@ -443,6 +448,7 @@ def unpadded(
             encoder_seq_lens_cpu=maybe_slice_reqs(self.encoder_seq_lens_cpu),
             dcp_local_seq_lens=maybe_slice_reqs(self.dcp_local_seq_lens),
             dcp_local_seq_lens_cpu=maybe_slice_reqs(self.dcp_local_seq_lens_cpu),
+            is_prefilling=maybe_slice_reqs(self.is_prefilling),
         )
 
 
diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py
index bdb820eac35e..59f2e7ca51a6 100644
--- a/vllm/v1/attention/backends/mamba_attn.py
+++ b/vllm/v1/attention/backends/mamba_attn.py
@@ -358,7 +358,9 @@ def _compute_common_metadata(
 
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
             split_decodes_and_prefills(
-                common_attn_metadata, decode_threshold=decode_threshold
+                common_attn_metadata,
+                decode_threshold=decode_threshold,
+                treat_short_extends_as_decodes=False,
             )
         )
 
diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py b/vllm/v1/attention/backends/mla/flashinfer_mla.py
index 3de0dcdd8c01..16d01bd338ca 100644
--- a/vllm/v1/attention/backends/mla/flashinfer_mla.py
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py
@@ -77,17 +77,17 @@ def supports_combination(
         use_sparse: bool,
         device_capability: DeviceCapability,
     ) -> str | None:
-        # FlashInfer MLA kernel requires qk_nope_head_dim in [64, 128]
+        # FlashInfer MLA kernel requires qk_nope_head_dim in [64, 128, 192]
         from vllm.config import get_current_vllm_config
 
         vllm_config = get_current_vllm_config()
         if vllm_config.model_config is not None:
             hf_text_config = vllm_config.model_config.hf_text_config
             qk_nope_head_dim = getattr(hf_text_config, "qk_nope_head_dim", 1)
-            if qk_nope_head_dim not in [64, 128]:
+            if qk_nope_head_dim not in [64, 128, 192]:
                 return (
-                    f"FlashInfer MLA kernel requires qk_nope_head_dim in [64, 128], "
-                    f"but got {qk_nope_head_dim}"
+                    "FlashInfer MLA kernel requires qk_nope_head_dim "
+                    f"in [64, 128, 192], but got {qk_nope_head_dim}"
                 )
         return None
 
diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py b/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py
index 9554457b494e..7b5ec0d4976a 100644
--- a/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py
+++ b/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py
@@ -113,17 +113,17 @@ def supports_combination(
         use_sparse: bool,
         device_capability: DeviceCapability,
     ) -> str | None:
-        # FlashInfer MLA sparse kernel requires qk_nope_head_dim == 128
+        # FlashInfer MLA sparse kernel requires qk_nope_head_dim in [128, 192]
         from vllm.config import get_current_vllm_config
 
         vllm_config = get_current_vllm_config()
         if vllm_config.model_config is not None:
             hf_text_config = vllm_config.model_config.hf_text_config
             qk_nope_head_dim = getattr(hf_text_config, "qk_nope_head_dim", 1)
-            if qk_nope_head_dim != 128:
+            if qk_nope_head_dim not in [128, 192]:
                 return (
-                    f"FlashInfer MLA Sparse kernel requires qk_nope_head_dim == 128, "
-                    f"but got {qk_nope_head_dim}"
+                    "FlashInfer MLA Sparse kernel requires qk_nope_head_dim "
+                    f"in [128, 192], but got {qk_nope_head_dim}"
                 )
             # Check for index_topk which indicates sparse model
             if not hasattr(hf_text_config, "index_topk"):
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 42459815ef9e..0f41993fc695 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -489,11 +489,15 @@ def split_decodes_and_prefills(
     common_attn_metadata: CommonAttentionMetadata,
     decode_threshold: int = 1,
     require_uniform: bool = False,
+    treat_short_extends_as_decodes: bool = True,
 ) -> tuple[int, int, int, int]:
     """
     Assuming a reordered batch, finds the boundary between prefill and decode
     requests.
 
+    The batch is expected to be ordered as:
+        decode → short_extend → long_extend → prefill
+
     Args:
         common_attn_metadata: CommonAttentionMetadata object containing the
             batch metadata.
@@ -501,6 +505,9 @@ def split_decodes_and_prefills(
         require_uniform: If True, requires that all decode requests have the
             same query length. When set, some queries may be considered prefills
             even if they are <= decode_threshold, in order to ensure uniformity.
+        treat_short_extends_as_decodes: If True (default), short extends
+            (query_len <= threshold but still prefilling) are counted as
+            decodes. If False, they are counted as prefills.
 
     Returns:
         num_decodes: The number of decode requests.
@@ -513,8 +520,10 @@ def split_decodes_and_prefills(
     num_tokens = common_attn_metadata.num_actual_tokens
     query_start_loc = common_attn_metadata.query_start_loc_cpu
 
-    if max_query_len <= decode_threshold and (
-        not require_uniform or decode_threshold <= 1
+    if (
+        max_query_len <= decode_threshold
+        and (not require_uniform or decode_threshold <= 1)
+        and treat_short_extends_as_decodes
     ):
         return num_reqs, 0, num_tokens, 0
 
@@ -533,11 +542,14 @@ def split_decodes_and_prefills(
     else:
         is_prefill = query_lens > decode_threshold
 
+    if not treat_short_extends_as_decodes:
+        assert common_attn_metadata.is_prefilling is not None
+        is_prefill |= common_attn_metadata.is_prefilling
+
     if not torch.any(is_prefill):
         return num_reqs, 0, num_tokens, 0
 
     first_prefill = is_prefill.int().argmax(dim=-1).item()
-    assert torch.all(query_lens[:first_prefill] <= decode_threshold)
     num_decodes = first_prefill
     num_prefills = num_reqs - num_decodes
     num_decode_tokens = query_start_loc[first_prefill].item()
@@ -581,39 +593,52 @@ def reorder_batch_to_split_decodes_and_prefills(
     Reorders the batch to split into prefill and decode requests; places all
     requests with <= decode_threshold tokens at the front of the batch.
 
+    The batch is reordered into 4 regions:
+        decode:        (num_scheduled <= threshold AND is not prefilling)
+        short_extend:  (num_scheduled <= threshold AND is chunked prefilling)
+        long_extend:   (num_scheduled > threshold AND is chunked prefilling)
+        prefill:       (num_computed == 0)   # First chunks
+
     Returns:
         True if the batch was modified, False otherwise.
     """
-    # We now want to reorder the batch into decode → extend → prefill order
-    # where:
-    #   decode: request with num_scheduled_tokens <= decode_threshold
-    #   extend: non-decode request with existing context
-    #   prefill: non-decode request with no existing context
-    # NOTE for now we loosely use "decode" to mean requests where attention is
-    #  likely memory-bound and "prefill" to mean requests where attention is
-    #  likely compute-bound,
     num_reqs = len(input_batch.req_ids)
     num_scheduled_tokens = [
         scheduler_output.num_scheduled_tokens[id] for id in input_batch.req_ids
     ]
     num_scheduled_tokens_np = np.array(num_scheduled_tokens)
     num_computed_tokens_np = input_batch.num_computed_tokens_cpu[:num_reqs]
-
-    is_prefill = num_computed_tokens_np == 0
-    is_decode = (num_scheduled_tokens_np <= decode_threshold) & (~is_prefill)
-    is_extend = (num_scheduled_tokens_np > decode_threshold) & (~is_prefill)
-
-    # Desired order: decode → extend → prefill
-    req_regions = np.zeros(is_decode.shape, dtype=np.int32)  # 0 = decode by default
-    req_regions[is_extend] = 1
-    req_regions[is_prefill] = 2
+    num_prompt_tokens_np = input_batch.num_prompt_tokens[:num_reqs]
+
+    has_context = num_computed_tokens_np > 0
+    is_below_threshold = num_scheduled_tokens_np <= decode_threshold
+    done_prefilling = num_computed_tokens_np >= num_prompt_tokens_np
+
+    # Mutually exclusive categories (exactly one True per request):
+    # 1. No context yet -> prefill
+    # 2. Has context, above threshold -> long_extend
+    # 3. Has context, below threshold, still prefilling -> short_extend
+    # 4. Has context, below threshold, done prefilling -> decode
+    is_pure_prefill = ~has_context
+    is_long_extend = has_context & ~is_below_threshold
+    is_short_extend = has_context & is_below_threshold & ~done_prefilling
+    is_decode = has_context & is_below_threshold & done_prefilling
+
+    # Desired order: decode → short_extend → long_extend → prefill
+    req_regions = np.zeros(num_reqs, dtype=np.int32)  # 0 = decode by default
+    req_regions[is_short_extend] = 1
+    req_regions[is_long_extend] = 2
+    req_regions[is_pure_prefill] = 3
 
     num_decodes = int(is_decode.sum())
-    num_extends = int(is_extend.sum())
+    num_short_extends = int(is_short_extend.sum())
+    num_long_extends = int(is_long_extend.sum())
+    num_prefills = int(is_pure_prefill.sum())
 
-    target_regions = np.zeros(num_reqs, dtype=np.int32)
-    target_regions[num_decodes : num_decodes + num_extends] = 1
-    target_regions[num_decodes + num_extends :] = 2
+    target_regions = np.repeat(
+        [0, 1, 2, 3],
+        [num_decodes, num_short_extends, num_long_extends, num_prefills],
+    ).astype(np.int32)
 
     needs_swap = req_regions != target_regions
 
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 421b25c0d0d4..0fa59579ee76 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -13,6 +13,7 @@
 from functools import partial
 from inspect import isclass, signature
 from logging import DEBUG
+from multiprocessing.queues import Queue
 from typing import Any, TypeVar, cast
 
 import msgspec
@@ -59,6 +60,7 @@
     UtilityOutput,
     UtilityResult,
 )
+from vllm.v1.engine.tensor_ipc import TensorIpcReceiver
 from vllm.v1.engine.utils import (
     EngineHandshakeMetadata,
     EngineZmqAddresses,
@@ -788,6 +790,7 @@ def __init__(
         executor_class: type[Executor],
         log_stats: bool,
         client_handshake_address: str | None = None,
+        tensor_queue: Queue | None = None,
         *,
         engine_index: int = 0,
     ):
@@ -802,6 +805,12 @@ def __init__(
         self.engines_running = False
         self.shutdown_state = EngineShutdownState.RUNNING
 
+        # Receiver for tensor IPC
+        self.tensor_ipc_receiver: TensorIpcReceiver | None = None
+        if tensor_queue is not None:
+            self.tensor_ipc_receiver = TensorIpcReceiver(tensor_queue)
+            logger.info("Using tensor IPC queue for multimodal tensor sharing")
+
         with self._perform_handshakes(
             handshake_address,
             identity,
@@ -1340,9 +1349,11 @@ def process_input_sockets(
     ):
         """Input socket IO thread."""
 
-        # Msgpack serialization decoding.
-        add_request_decoder = MsgpackDecoder(EngineCoreRequest)
-        generic_decoder = MsgpackDecoder()
+        # Msgpack serialization decoding with optional tensor IPC receiver.
+        add_request_decoder = MsgpackDecoder(
+            EngineCoreRequest, oob_tensor_provider=self.tensor_ipc_receiver
+        )
+        generic_decoder = MsgpackDecoder(oob_tensor_provider=self.tensor_ipc_receiver)
 
         with ExitStack() as stack, zmq.Context() as ctx:
             input_sockets = [
@@ -1418,10 +1429,7 @@ def process_input_sockets(
                     self.input_queue.put_nowait((request_type, request))
 
     def process_output_sockets(
-        self,
-        output_paths: list[str],
-        coord_output_path: str | None,
-        engine_index: int,
+        self, output_paths: list[str], coord_output_path: str | None, engine_index: int
     ):
         """Output socket IO thread."""
 
@@ -1580,6 +1588,7 @@ def __init__(
         executor_class: type[Executor],
         log_stats: bool,
         client_handshake_address: str | None = None,
+        tensor_queue: Queue | None = None,
     ):
         assert vllm_config.model_config.is_moe, (
             "DPEngineCoreProc should only be used for MoE models"
@@ -1605,6 +1614,7 @@ def __init__(
             log_stats,
             client_handshake_address,
             engine_index=dp_rank,
+            tensor_queue=tensor_queue,
         )
 
     def _init_data_parallel(self, vllm_config: VllmConfig):
@@ -1694,6 +1704,8 @@ def run_busy_loop(self):
             if self.eep_scaling_state is not None:
                 _ = self.eep_scaling_state.progress()
                 if self.eep_scaling_state.is_complete():
+                    if self.eep_scaling_state.worker_type == "removing":
+                        raise SystemExit
                     self.process_input_queue_block = True
                     self.eep_scaling_state = None
 
@@ -1857,20 +1869,7 @@ def _eep_scale_up_before_kv_init(self):
             scale_type="scale_up",
             reconfig_request=None,
         )
-        self.model_executor.collective_rpc("init_device")
-        self.model_executor.collective_rpc("load_model")
-        self._eep_send_engine_core_notification(
-            EEPNotificationType.NEW_CORE_ENGINES_WEIGHTS_INIT_READY
-        )
-        self.model_executor.collective_rpc(
-            "elastic_ep_execute", args=("receive_weights",)
-        )
-        self.available_gpu_memory_for_kv_cache = (
-            ParallelConfig.sync_kv_cache_memory_size(self.dp_group, -1)
-        )
-        self.model_executor.collective_rpc(
-            "elastic_ep_execute", args=("prepare_new_worker",)
-        )
+        self.eep_scaling_state.run_pre_kv_init_states()
         self.process_input_queue_block = False
 
 
diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py
index 91664058d8c4..b9a3c7545e16 100644
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@@ -12,6 +12,7 @@
 from collections.abc import Awaitable, Callable, Sequence
 from concurrent.futures import Future
 from dataclasses import dataclass
+from multiprocessing.queues import Queue
 from threading import Thread
 from typing import Any, TypeAlias, TypeVar
 
@@ -45,6 +46,7 @@
 from vllm.v1.engine.coordinator import DPCoordinator
 from vllm.v1.engine.core import EngineCore, EngineCoreProc
 from vllm.v1.engine.exceptions import EngineDeadError
+from vllm.v1.engine.tensor_ipc import TensorIpcSender
 from vllm.v1.engine.utils import (
     CoreEngineActorManager,
     CoreEngineProcManager,
@@ -477,9 +479,6 @@ def __init__(
         client_addresses: dict[str, str] | None = None,
     ):
         self.vllm_config = vllm_config
-        # Serialization setup.
-        self.encoder = MsgpackEncoder()
-        self.decoder = MsgpackDecoder(EngineCoreOutputs)
 
         # ZMQ setup.
         sync_ctx = zmq.Context(io_threads=2)
@@ -501,11 +500,14 @@ def __init__(
             enable_input_socket_handover = parallel_config.enable_elastic_ep
 
             self.stats_update_address: str | None = None
+            tensor_queue: Queue | None = None
             if client_addresses:
                 # Engines are managed externally to this client.
                 input_address = client_addresses["input_address"]
                 output_address = client_addresses["output_address"]
                 self.stats_update_address = client_addresses.get("stats_update_address")
+                # Tensor queues passed via client_addresses for multi-API-server case
+                tensor_queue = client_addresses.get("tensor_queue")  # type: ignore[assignment]
                 self.input_socket = self.resources.input_socket = make_zmq_socket(
                     self.ctx,
                     input_address,
@@ -532,7 +534,7 @@ def __init__(
 
                 with launch_core_engines(
                     vllm_config, executor_class, log_stats, addresses
-                ) as (engine_manager, coordinator, addresses):
+                ) as (engine_manager, coordinator, addresses, tensor_queue):
                     self.resources.coordinator = coordinator
                     self.resources.engine_manager = engine_manager
 
@@ -542,6 +544,17 @@ def __init__(
                         coordinator.get_stats_publish_address()
                     )
 
+            # Serialization setup with tensor queues for multimodal tensor IPC.
+            tensor_ipc_sender: TensorIpcSender | None = None
+            model_config = getattr(vllm_config, "model_config", None)
+            if model_config is not None and model_config.multimodal_config is not None:
+                mm_tensor_ipc = model_config.multimodal_config.mm_tensor_ipc
+                if mm_tensor_ipc == "torch_shm" and tensor_queue is not None:
+                    tensor_ipc_sender = TensorIpcSender(tensor_queue)
+
+            self.encoder = MsgpackEncoder(oob_tensor_consumer=tensor_ipc_sender)
+            self.decoder = MsgpackDecoder(EngineCoreOutputs)
+
             dp_size = parallel_config.data_parallel_size
             dp_rank = parallel_config.data_parallel_index
             dp_local_size = parallel_config.data_parallel_size_local
diff --git a/vllm/v1/engine/tensor_ipc.py b/vllm/v1/engine/tensor_ipc.py
new file mode 100644
index 000000000000..2a8391b17e6d
--- /dev/null
+++ b/vllm/v1/engine/tensor_ipc.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""Tensor IPC transport via torch.multiprocessing.Queue.
+
+This module contains the queue-based transport logic for sharing tensors
+between processes (e.g., API server -> engine core). The msgpack layer
+emits/consumes lightweight :class:`TensorIpcData` values, while transport
+state such as request association, handle generation, queue routing, buffering,
+and cleanup lives here.
+"""
+
+import dataclasses
+import uuid
+from collections import defaultdict
+from dataclasses import field
+from multiprocessing.queues import Queue as MPQueue
+from typing import Any
+
+import torch
+
+from vllm.logger import init_logger
+from vllm.v1.serial_utils import OOBTensorConsumer
+
+logger = init_logger(__name__)
+
+TensorIpcQueue = MPQueue
+
+
+@dataclasses.dataclass
+class TensorIpcData:
+    """
+    Data sent via torch.multiprocessing.Queue for zero-copy IPC.
+
+    Contains the (sender_id, message_id, tensor_id) handle and the actual
+    tensor. The tensor is shared in memory (GPU or CPU) for efficient IPC.
+    """
+
+    sender_id: str  # TensorIpcSender._sender_id of the producer
+    message_id: int  # sender's message counter when the tensor was sent
+    tensor_id: int  # sender's per-message tensor counter for this tensor
+    tensor: torch.Tensor
+
+
+class TensorIpcSender(OOBTensorConsumer):
+    """Send-side logic for tensor IPC via torch.multiprocessing.Queue.
+
+    Uses a single queue targeting rank 0 (the only rank that consumes
+    multimodal tensors during TP>1 / PP>1. Note: DP>1 not supported).
+    """
+
+    def __init__(self, queue: TensorIpcQueue):
+        self.queue = queue
+        self._tensor_id_counter = 0
+        self._message_counter = 0
+        self._sender_id = uuid.uuid4().hex[:8]  # random 8-hex-char sender tag
+
+    def set_target_engine(self, target_engine: int) -> None:
+        if target_engine != 0:  # a single queue means only engine 0 is valid
+            raise IndexError(
+                "TensorIpcSender only supports a single queue; "
+                f"got target engine {target_engine}"
+            )
+
+    def new_message(self) -> None:
+        self._message_counter += 1
+        self._tensor_id_counter = 0  # tensor ids restart within each message
+
+    def __call__(self, tensor: torch.Tensor) -> dict[str, Any] | None:
+        """Queue the tensor and return its handle dict, or None if sending failed."""
+        try:
+            # Move the tensor's storage to shared memory before queueing it;
+            # tensors that already report is_shared() are left untouched.
+            if not tensor.is_shared():
+                tensor = tensor.share_memory_()
+
+            metadata = {
+                "sender_id": self._sender_id,
+                "message_id": self._message_counter,
+                "tensor_id": self._tensor_id_counter,
+            }
+
+            self._tensor_id_counter += 1
+
+            ipc_data = TensorIpcData(**metadata, tensor=tensor)  # type: ignore[arg-type]
+
+            # Use a timeout to avoid blocking indefinitely
+            self.queue.put(ipc_data, timeout=10.0)
+
+            logger.debug(
+                "Sent tensor %s for (shape=%s, device=%s) "
+                "via IPC queue (shared memory)",
+                metadata,
+                tensor.shape,
+                tensor.device,
+            )
+
+            return metadata
+        except Exception as e:
+            logger.warning(
+                "Failed to send tensor via IPC queue: %s. "
+                "Falling back to standard serialization.",
+                e,
+            )
+            return None
+
+
+@dataclasses.dataclass
+class _Sender:
+    current_message_id: int = -1  # message_id of the last tensor handed out
+    tensors: dict[int, dict[int, torch.Tensor]] = field(default_factory=dict)
+
+
+class TensorIpcReceiver:
+    """Receive-side logic for tensor IPC via torch.multiprocessing.Queue.
+
+    Wraps the queue receive logic previously embedded in MsgpackDecoder.
+    """
+
+    def __init__(self, queue: TensorIpcQueue):
+        self.queue = queue
+        self._tensor_buffers = defaultdict[str, _Sender](_Sender)
+
+    def __call__(
+        self, dtype: str, shape: tuple[int, ...], meta: dict[str, Any]
+    ) -> torch.Tensor:
+        """Retrieve a tensor from torch.multiprocessing.Queue.
+
+        Uses a drain-and-buffer pattern: drains tensors from the queue,
+        buffering them, until the requested tensor is found. Works for CUDA
+        and CPU. A ``queue.get`` timeout (10s) propagates as ``queue.Empty``.
+        """
+
+        # Create lookup key from handle
+        sender_id: str = meta["sender_id"]
+        message_id: int = meta["message_id"]
+        tensor_id: int = meta["tensor_id"]
+
+        # Drain the queue, buffering every tensor we see: with multiple producers
+        # tensors may arrive out of order relative to the one we are waiting for.
+        while True:
+            sender = self._tensor_buffers.get(sender_id)
+            if sender is not None:
+                tensors = sender.tensors
+                tensor = tensors.get(message_id, {}).pop(tensor_id, None)
+                if tensor is not None:
+                    if sender.current_message_id != message_id:
+                        while tensors and (mid := next(iter(tensors))) < message_id:
+                            if stale := sender.tensors.pop(mid):
+                                logger.warning(
+                                    "Discarding %d stale tensors from sender %s",
+                                    len(stale),
+                                    sender_id,
+                                )
+                        sender.current_message_id = message_id
+                    logger.debug(
+                        "Received tensor %s from sender %s for (shape=%s, device=%s) "
+                        "via IPC queue (shared memory)",
+                        (message_id, tensor_id),
+                        sender_id,
+                        tensor.shape,
+                        tensor.device,
+                    )
+                    return tensor
+
+            ipc_data: TensorIpcData = self.queue.get(timeout=10.0)
+
+            # Store tensor
+            sender = self._tensor_buffers[ipc_data.sender_id]
+            if sender.current_message_id > ipc_data.message_id:
+                logger.warning(
+                    "Ignoring stale tensor from sender %s", ipc_data.sender_id
+                )
+                continue
+
+            sender.tensors.setdefault(ipc_data.message_id, {})[ipc_data.tensor_id] = (
+                ipc_data.tensor
+            )
diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py
index 52c7217346e8..90ec47edb033 100644
--- a/vllm/v1/engine/utils.py
+++ b/vllm/v1/engine/utils.py
@@ -10,6 +10,7 @@
 from enum import Enum, auto
 from multiprocessing import Process, connection
 from multiprocessing.process import BaseProcess
+from multiprocessing.queues import Queue
 from typing import TYPE_CHECKING
 from unittest.mock import patch
 
@@ -95,6 +96,7 @@ def __init__(
         executor_class: type[Executor],
         log_stats: bool,
         client_handshake_address: str | None = None,
+        tensor_queue: Queue | None = None,
     ):
         context = get_mp_context()
         common_kwargs = {
@@ -103,6 +105,7 @@ def __init__(
             "handshake_address": handshake_address,
             "executor_class": executor_class,
             "log_stats": log_stats,
+            "tensor_queue": tensor_queue,
         }
 
         if client_handshake_address:
@@ -864,6 +867,7 @@ def launch_core_engines(
         CoreEngineProcManager | CoreEngineActorManager | None,
         DPCoordinator | None,
         EngineZmqAddresses,
+        Queue | None,
     ]
 ]:
     """Launch engine and DP coordinator processes as needed."""
@@ -878,6 +882,14 @@ def launch_core_engines(
 
     offline_mode = local_start_index is not None
 
+    # Create a single tensor IPC queue for sharing multimodal tensors between
+    # API servers and engine core. Returns a single queue since we only support
+    # DP=1 for this data flow.
+    tensor_queue: Queue | None = None
+    multimodal_config = vllm_config.model_config.multimodal_config
+    if multimodal_config is not None and multimodal_config.mm_tensor_ipc == "torch_shm":
+        tensor_queue = get_mp_context().Queue()
+
     # Run the DP Coordinator process with rank 0 when in online DP mode.
     # The coordinator is needed for:
     # 1. Internal/hybrid LB: collecting and publishing queue stats for load balancing
@@ -913,7 +925,7 @@ def launch_core_engines(
             log_stats=log_stats,
         )
 
-        yield engine_actor_manager, coordinator, addresses
+        yield engine_actor_manager, coordinator, addresses, tensor_queue
         return
 
     if offline_mode:
@@ -975,11 +987,12 @@ def launch_core_engines(
                 local_engine_count=local_engine_count,
                 start_index=dp_rank,
                 local_start_index=local_start_index or 0,
+                tensor_queue=tensor_queue,
             )
         else:
             local_engine_manager = None
 
-        yield local_engine_manager, coordinator, addresses
+        yield local_engine_manager, coordinator, addresses, tensor_queue
 
         # Now wait for engines to start.
         wait_for_engine_startup(
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index e715a1d767fa..f9b77154067a 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -602,13 +602,14 @@ def __init__(
         )
 
         # Load model
-        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
-        if not is_eep_new_worker:
-            self.worker.init_device()
-            # Update process title now that parallel groups are initialized
-            self.setup_proc_title_and_log_prefix(
-                enable_ep=vllm_config.parallel_config.enable_expert_parallel
-            )
+        self.worker.init_device()
+        # Update process title now that parallel groups are initialized
+        self.setup_proc_title_and_log_prefix(
+            enable_ep=vllm_config.parallel_config.enable_expert_parallel
+        )
+        if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
+            self.worker.elastic_ep_execute("load_model")
+        else:
             self.worker.load_model()
 
         scheduler_config = vllm_config.scheduler_config
diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py
index 1cbc11990e08..c4e5e7bc67ed 100644
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -382,9 +382,10 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData):
             all_kwargs.append(kwargs)
         self.collective_rpc("init_worker", args=(all_kwargs,))
 
-        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
-        if not is_eep_new_worker:
-            self.collective_rpc("init_device")
+        self.collective_rpc("init_device")
+        if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
+            self.collective_rpc("elastic_ep_execute", args=("load_model",))
+        else:
             self.collective_rpc("load_model")
 
         def _update_block_size(worker):
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
index e90a1ab23915..b616c3b7b8ad 100644
--- a/vllm/v1/executor/uniproc_executor.py
+++ b/vllm/v1/executor/uniproc_executor.py
@@ -43,12 +43,14 @@ def _init_executor(self) -> None:
                 max_workers=1, thread_name_prefix="WorkerAsyncOutput"
             )
 
-        is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH
         self.driver_worker.init_worker(all_kwargs=[kwargs])
-        if not is_eep_new_worker:
-            self.driver_worker.init_device()
+        self.driver_worker.init_device()
+
+        if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH:
+            self.driver_worker.elastic_ep_execute("load_model")
+        else:
             self.driver_worker.load_model()
-            current_platform.update_block_size_for_backend(self.vllm_config)
+        current_platform.update_block_size_for_backend(self.vllm_config)
 
     def _distributed_args(self) -> tuple[str, int, int]:
         """Return (distributed_init_method, rank, local_rank)."""
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index f20d78542247..5d5877d1692e 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -5,7 +5,6 @@
 import time
 from abc import ABC, abstractmethod
 from collections.abc import Callable
-from typing import TypeAlias
 
 from prometheus_client import Counter, Gauge, Histogram
 
@@ -14,7 +13,7 @@
 from vllm.config import SupportsMetricsInfo, VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.metrics import (
     KVConnectorLogging,
-    KVConnectorPrometheus,
+    KVConnectorProm,
 )
 from vllm.logger import init_logger
 from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group
@@ -28,6 +27,7 @@
     PromptTokenStats,
     SchedulerStats,
 )
+from vllm.v1.metrics.utils import create_metric_per_engine
 from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm
 
 logger = init_logger(__name__)
@@ -391,7 +391,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase):
     _counter_cls = Counter
     _histogram_cls = Histogram
     _spec_decoding_cls = SpecDecodingProm
-    _kv_connector_cls = KVConnectorPrometheus
+    _kv_connector_cls = KVConnectorProm
     _perf_metrics_cls = PerfMetricsProm
 
     def __init__(
@@ -415,9 +415,10 @@ def __init__(
         model_name = vllm_config.model_config.served_model_name
         max_model_len = vllm_config.model_config.max_model_len
 
-        per_engine_labelvalues: dict[int, list[object]] = {
+        self.per_engine_labelvalues: dict[int, list[object]] = {
             idx: [model_name, str(idx)] for idx in engine_indexes
         }
+        per_engine_labelvalues = self.per_engine_labelvalues
 
         self.spec_decoding_prom = self._spec_decoding_cls(
             vllm_config.speculative_config, labelnames, per_engine_labelvalues
@@ -438,8 +439,8 @@ def __init__(
             multiprocess_mode="mostrecent",
             labelnames=labelnames,
         )
-        self.gauge_scheduler_running = make_per_engine(
-            gauge_scheduler_running, engine_indexes, model_name
+        self.gauge_scheduler_running = create_metric_per_engine(
+            gauge_scheduler_running, per_engine_labelvalues
         )
 
         gauge_scheduler_waiting = self._gauge_cls(
@@ -448,8 +449,8 @@ def __init__(
             multiprocess_mode="mostrecent",
             labelnames=labelnames,
         )
-        self.gauge_scheduler_waiting = make_per_engine(
-            gauge_scheduler_waiting, engine_indexes, model_name
+        self.gauge_scheduler_waiting = create_metric_per_engine(
+            gauge_scheduler_waiting, per_engine_labelvalues
         )
 
         gauge_engine_sleep_state = self._gauge_cls(
@@ -484,8 +485,8 @@ def __init__(
             multiprocess_mode="mostrecent",
             labelnames=labelnames,
         )
-        self.gauge_kv_cache_usage = make_per_engine(
-            gauge_kv_cache_usage, engine_indexes, model_name
+        self.gauge_kv_cache_usage = create_metric_per_engine(
+            gauge_kv_cache_usage, per_engine_labelvalues
         )
 
         if envs.VLLM_COMPUTE_NANS_IN_LOGITS:
@@ -497,8 +498,8 @@ def __init__(
                 ),
                 labelnames=labelnames,
             )
-            self.counter_corrupted_requests = make_per_engine(
-                counter_corrupted_requests, engine_indexes, model_name
+            self.counter_corrupted_requests = create_metric_per_engine(
+                counter_corrupted_requests, per_engine_labelvalues
             )
 
         counter_prefix_cache_queries = self._counter_cls(
@@ -508,8 +509,8 @@ def __init__(
             ),
             labelnames=labelnames,
         )
-        self.counter_prefix_cache_queries = make_per_engine(
-            counter_prefix_cache_queries, engine_indexes, model_name
+        self.counter_prefix_cache_queries = create_metric_per_engine(
+            counter_prefix_cache_queries, per_engine_labelvalues
         )
 
         counter_prefix_cache_hits = self._counter_cls(
@@ -517,8 +518,8 @@ def __init__(
             documentation=("Prefix cache hits, in terms of number of cached tokens."),
             labelnames=labelnames,
         )
-        self.counter_prefix_cache_hits = make_per_engine(
-            counter_prefix_cache_hits, engine_indexes, model_name
+        self.counter_prefix_cache_hits = create_metric_per_engine(
+            counter_prefix_cache_hits, per_engine_labelvalues
         )
 
         #
@@ -533,8 +534,8 @@ def __init__(
             ),
             labelnames=labelnames,
         )
-        self.counter_connector_prefix_cache_queries = make_per_engine(
-            counter_connector_prefix_cache_queries, engine_indexes, model_name
+        self.counter_connector_prefix_cache_queries = create_metric_per_engine(
+            counter_connector_prefix_cache_queries, per_engine_labelvalues
         )
 
         counter_connector_prefix_cache_hits = self._counter_cls(
@@ -545,8 +546,8 @@ def __init__(
             ),
             labelnames=labelnames,
         )
-        self.counter_connector_prefix_cache_hits = make_per_engine(
-            counter_connector_prefix_cache_hits, engine_indexes, model_name
+        self.counter_connector_prefix_cache_hits = create_metric_per_engine(
+            counter_connector_prefix_cache_hits, per_engine_labelvalues
         )
 
         #
@@ -560,8 +561,8 @@ def __init__(
             ),
             labelnames=labelnames,
         )
-        self.counter_mm_cache_queries = make_per_engine(
-            counter_mm_cache_queries, engine_indexes, model_name
+        self.counter_mm_cache_queries = create_metric_per_engine(
+            counter_mm_cache_queries, per_engine_labelvalues
         )
 
         counter_mm_cache_hits = self._counter_cls(
@@ -571,8 +572,8 @@ def __init__(
             ),
             labelnames=labelnames,
         )
-        self.counter_mm_cache_hits = make_per_engine(
-            counter_mm_cache_hits, engine_indexes, model_name
+        self.counter_mm_cache_hits = create_metric_per_engine(
+            counter_mm_cache_hits, per_engine_labelvalues
         )
 
         #
@@ -583,8 +584,8 @@ def __init__(
             documentation="Cumulative number of preemption from the engine.",
             labelnames=labelnames,
         )
-        self.counter_num_preempted_reqs = make_per_engine(
-            counter_num_preempted_reqs, engine_indexes, model_name
+        self.counter_num_preempted_reqs = create_metric_per_engine(
+            counter_num_preempted_reqs, per_engine_labelvalues
         )
 
         counter_prompt_tokens = self._counter_cls(
@@ -592,8 +593,8 @@ def __init__(
             documentation="Number of prefill tokens processed.",
             labelnames=labelnames,
         )
-        self.counter_prompt_tokens = make_per_engine(
-            counter_prompt_tokens, engine_indexes, model_name
+        self.counter_prompt_tokens = create_metric_per_engine(
+            counter_prompt_tokens, per_engine_labelvalues
         )
 
         # Labeled prompt token counters by source
@@ -617,8 +618,8 @@ def __init__(
             documentation="Number of cached prompt tokens (local + external).",
             labelnames=labelnames,
         )
-        self.counter_prompt_tokens_cached = make_per_engine(
-            counter_prompt_tokens_cached, engine_indexes, model_name
+        self.counter_prompt_tokens_cached = create_metric_per_engine(
+            counter_prompt_tokens_cached, per_engine_labelvalues
         )
 
         # Recomputed tokens (last token recomputed when entire prompt is cached)
@@ -627,8 +628,8 @@ def __init__(
             documentation="Number of cached tokens recomputed for forward pass.",
             labelnames=labelnames,
         )
-        self.counter_prompt_tokens_recomputed = make_per_engine(
-            counter_prompt_tokens_recomputed, engine_indexes, model_name
+        self.counter_prompt_tokens_recomputed = create_metric_per_engine(
+            counter_prompt_tokens_recomputed, per_engine_labelvalues
         )
 
         counter_generation_tokens = self._counter_cls(
@@ -636,8 +637,8 @@ def __init__(
             documentation="Number of generation tokens processed.",
             labelnames=labelnames,
         )
-        self.counter_generation_tokens = make_per_engine(
-            counter_generation_tokens, engine_indexes, model_name
+        self.counter_generation_tokens = create_metric_per_engine(
+            counter_generation_tokens, per_engine_labelvalues
         )
 
         self.counter_request_success: dict[FinishReason, dict[int, Counter]] = {}
@@ -663,8 +664,8 @@ def __init__(
             buckets=build_1_2_5_buckets(max_model_len),
             labelnames=labelnames,
         )
-        self.histogram_num_prompt_tokens_request = make_per_engine(
-            histogram_num_prompt_tokens_request, engine_indexes, model_name
+        self.histogram_num_prompt_tokens_request = create_metric_per_engine(
+            histogram_num_prompt_tokens_request, per_engine_labelvalues
         )
 
         histogram_num_generation_tokens_request = self._histogram_cls(
@@ -673,8 +674,8 @@ def __init__(
             buckets=build_1_2_5_buckets(max_model_len),
             labelnames=labelnames,
         )
-        self.histogram_num_generation_tokens_request = make_per_engine(
-            histogram_num_generation_tokens_request, engine_indexes, model_name
+        self.histogram_num_generation_tokens_request = create_metric_per_engine(
+            histogram_num_generation_tokens_request, per_engine_labelvalues
         )
 
         # TODO: This metric might be incorrect in case of using multiple
@@ -686,8 +687,8 @@ def __init__(
             buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
             labelnames=labelnames,
         )
-        self.histogram_iteration_tokens = make_per_engine(
-            histogram_iteration_tokens, engine_indexes, model_name
+        self.histogram_iteration_tokens = create_metric_per_engine(
+            histogram_iteration_tokens, per_engine_labelvalues
         )
 
         histogram_max_num_generation_tokens_request = self._histogram_cls(
@@ -696,8 +697,8 @@ def __init__(
             buckets=build_1_2_5_buckets(max_model_len),
             labelnames=labelnames,
         )
-        self.histogram_max_num_generation_tokens_request = make_per_engine(
-            histogram_max_num_generation_tokens_request, engine_indexes, model_name
+        self.histogram_max_num_generation_tokens_request = create_metric_per_engine(
+            histogram_max_num_generation_tokens_request, per_engine_labelvalues
         )
 
         histogram_n_request = self._histogram_cls(
@@ -706,8 +707,8 @@ def __init__(
             buckets=[1, 2, 5, 10, 20],
             labelnames=labelnames,
         )
-        self.histogram_n_request = make_per_engine(
-            histogram_n_request, engine_indexes, model_name
+        self.histogram_n_request = create_metric_per_engine(
+            histogram_n_request, per_engine_labelvalues
         )
 
         histogram_max_tokens_request = self._histogram_cls(
@@ -716,8 +717,8 @@ def __init__(
             buckets=build_1_2_5_buckets(max_model_len),
             labelnames=labelnames,
         )
-        self.histogram_max_tokens_request = make_per_engine(
-            histogram_max_tokens_request, engine_indexes, model_name
+        self.histogram_max_tokens_request = create_metric_per_engine(
+            histogram_max_tokens_request, per_engine_labelvalues
         )
 
         #
@@ -752,8 +753,8 @@ def __init__(
             ],
             labelnames=labelnames,
         )
-        self.histogram_time_to_first_token = make_per_engine(
-            histogram_time_to_first_token, engine_indexes, model_name
+        self.histogram_time_to_first_token = create_metric_per_engine(
+            histogram_time_to_first_token, per_engine_labelvalues
         )
 
         histogram_inter_token_latency = self._histogram_cls(
@@ -782,8 +783,8 @@ def __init__(
             ],
             labelnames=labelnames,
         )
-        self.histogram_inter_token_latency = make_per_engine(
-            histogram_inter_token_latency, engine_indexes, model_name
+        self.histogram_inter_token_latency = create_metric_per_engine(
+            histogram_inter_token_latency, per_engine_labelvalues
         )
 
         histogram_request_time_per_output_token = self._histogram_cls(
@@ -812,8 +813,8 @@ def __init__(
             ],
             labelnames=labelnames,
         )
-        self.histogram_request_time_per_output_token = make_per_engine(
-            histogram_request_time_per_output_token, engine_indexes, model_name
+        self.histogram_request_time_per_output_token = create_metric_per_engine(
+            histogram_request_time_per_output_token, per_engine_labelvalues
         )
 
         request_latency_buckets = [
@@ -845,8 +846,8 @@ def __init__(
             buckets=request_latency_buckets,
             labelnames=labelnames,
         )
-        self.histogram_e2e_time_request = make_per_engine(
-            histogram_e2e_time_request, engine_indexes, model_name
+        self.histogram_e2e_time_request = create_metric_per_engine(
+            histogram_e2e_time_request, per_engine_labelvalues
         )
 
         histogram_queue_time_request = self._histogram_cls(
@@ -855,8 +856,8 @@ def __init__(
             buckets=request_latency_buckets,
             labelnames=labelnames,
         )
-        self.histogram_queue_time_request = make_per_engine(
-            histogram_queue_time_request, engine_indexes, model_name
+        self.histogram_queue_time_request = create_metric_per_engine(
+            histogram_queue_time_request, per_engine_labelvalues
         )
 
         histogram_inference_time_request = self._histogram_cls(
@@ -865,8 +866,8 @@ def __init__(
             buckets=request_latency_buckets,
             labelnames=labelnames,
         )
-        self.histogram_inference_time_request = make_per_engine(
-            histogram_inference_time_request, engine_indexes, model_name
+        self.histogram_inference_time_request = create_metric_per_engine(
+            histogram_inference_time_request, per_engine_labelvalues
         )
 
         histogram_prefill_time_request = self._histogram_cls(
@@ -875,8 +876,8 @@ def __init__(
             buckets=request_latency_buckets,
             labelnames=labelnames,
         )
-        self.histogram_prefill_time_request = make_per_engine(
-            histogram_prefill_time_request, engine_indexes, model_name
+        self.histogram_prefill_time_request = create_metric_per_engine(
+            histogram_prefill_time_request, per_engine_labelvalues
         )
 
         histogram_decode_time_request = self._histogram_cls(
@@ -885,8 +886,8 @@ def __init__(
             buckets=request_latency_buckets,
             labelnames=labelnames,
         )
-        self.histogram_decode_time_request = make_per_engine(
-            histogram_decode_time_request, engine_indexes, model_name
+        self.histogram_decode_time_request = create_metric_per_engine(
+            histogram_decode_time_request, per_engine_labelvalues
         )
 
         histogram_prefill_kv_computed_request = self._histogram_cls(
@@ -898,8 +899,8 @@ def __init__(
             buckets=build_1_2_5_buckets(max_model_len),
             labelnames=labelnames,
         )
-        self.histogram_prefill_kv_computed_request = make_per_engine(
-            histogram_prefill_kv_computed_request, engine_indexes, model_name
+        self.histogram_prefill_kv_computed_request = create_metric_per_engine(
+            histogram_prefill_kv_computed_request, per_engine_labelvalues
         )
 
         #
@@ -939,8 +940,8 @@ def __init__(
                 buckets=kv_cache_residency_buckets,
                 labelnames=labelnames,
             )
-            self.histogram_kv_block_lifetime = make_per_engine(
-                histogram_kv_block_lifetime, engine_indexes, model_name
+            self.histogram_kv_block_lifetime = create_metric_per_engine(
+                histogram_kv_block_lifetime, per_engine_labelvalues
             )
 
             histogram_kv_block_idle_before_evict = self._histogram_cls(
@@ -952,8 +953,8 @@ def __init__(
                 buckets=kv_cache_residency_buckets,
                 labelnames=labelnames,
             )
-            self.histogram_kv_block_idle_before_evict = make_per_engine(
-                histogram_kv_block_idle_before_evict, engine_indexes, model_name
+            self.histogram_kv_block_idle_before_evict = create_metric_per_engine(
+                histogram_kv_block_idle_before_evict, per_engine_labelvalues
             )
 
             histogram_kv_block_reuse_gap = self._histogram_cls(
@@ -967,8 +968,8 @@ def __init__(
                 buckets=kv_cache_residency_buckets,
                 labelnames=labelnames,
             )
-            self.histogram_kv_block_reuse_gap = make_per_engine(
-                histogram_kv_block_reuse_gap, engine_indexes, model_name
+            self.histogram_kv_block_reuse_gap = create_metric_per_engine(
+                histogram_kv_block_reuse_gap, per_engine_labelvalues
             )
         else:
             self.histogram_kv_block_lifetime = {}
@@ -1203,15 +1204,6 @@ def log_engine_initialized(self):
         self.log_metrics_info("cache_config", self.vllm_config.cache_config)
 
 
-PromMetric: TypeAlias = Gauge | Counter | Histogram
-
-
-def make_per_engine(
-    metric: PromMetric, engine_idxs: list[int], model_name: object
-) -> dict[int, PromMetric]:
-    return {idx: metric.labels(model_name, str(idx)) for idx in engine_idxs}
-
-
 def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]:
     """
     Builds a list of buckets with increasing powers of 10 multiplied by
diff --git a/vllm/v1/metrics/perf.py b/vllm/v1/metrics/perf.py
index 81348efc13b3..91629cb57816 100644
--- a/vllm/v1/metrics/perf.py
+++ b/vllm/v1/metrics/perf.py
@@ -27,6 +27,7 @@
     get_kv_cache_torch_dtype,
 )
 from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.metrics.utils import create_metric_per_engine
 
 logger = init_logger(__name__)
 
@@ -1291,7 +1292,9 @@ def __init__(
             ),
             labelnames=labelnames,
         )
-        self.counter_flops = make_per_engine(counter_flops, per_engine_labelvalues)
+        self.counter_flops = create_metric_per_engine(
+            counter_flops, per_engine_labelvalues
+        )
 
         counter_read_bytes = self._counter_cls(
             name="vllm:estimated_read_bytes_per_gpu_total",
@@ -1301,7 +1304,7 @@ def __init__(
             ),
             labelnames=labelnames,
         )
-        self.counter_read_bytes = make_per_engine(
+        self.counter_read_bytes = create_metric_per_engine(
             counter_read_bytes, per_engine_labelvalues
         )
 
@@ -1313,7 +1316,7 @@ def __init__(
             ),
             labelnames=labelnames,
         )
-        self.counter_write_bytes = make_per_engine(
+        self.counter_write_bytes = create_metric_per_engine(
             counter_write_bytes, per_engine_labelvalues
         )
 
@@ -1329,16 +1332,6 @@ def observe(self, perf_stats: PerfStats, engine_idx: int = 0):
         self.counter_write_bytes[engine_idx].inc(perf_stats.num_write_bytes_per_gpu)
 
 
-def make_per_engine(
-    counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[object]]
-):
-    """Create a counter for each label value."""
-    return {
-        idx: counter.labels(*labelvalues)
-        for idx, labelvalues in per_engine_labelvalues.items()
-    }
-
-
 ## util functions
 
 
diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py
index abc53f3802ea..a11b92680779 100644
--- a/vllm/v1/metrics/ray_wrappers.py
+++ b/vllm/v1/metrics/ray_wrappers.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import time
 
-from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus
+from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorProm
 from vllm.v1.metrics.loggers import PrometheusStatLogger
 from vllm.v1.metrics.perf import PerfMetricsProm
 from vllm.v1.spec_decode.metrics import SpecDecodingProm
@@ -168,9 +168,9 @@ class RaySpecDecodingProm(SpecDecodingProm):
     _counter_cls = RayCounterWrapper
 
 
-class RayKVConnectorPrometheus(KVConnectorPrometheus):
+class RayKVConnectorProm(KVConnectorProm):
     """
-    RayKVConnectorPrometheus is used by RayMetrics to log Ray
+    RayKVConnectorProm is used by RayMetrics to log Ray
     metrics. Provides the same metrics as KV connectors but
     uses Ray's util.metrics library.
     """
@@ -197,7 +197,7 @@ class RayPrometheusStatLogger(PrometheusStatLogger):
     _counter_cls = RayCounterWrapper
     _histogram_cls = RayHistogramWrapper
     _spec_decoding_cls = RaySpecDecodingProm
-    _kv_connector_cls = RayKVConnectorPrometheus
+    _kv_connector_cls = RayKVConnectorProm
     _perf_metrics_cls = RayPerfMetricsProm
 
     @staticmethod
diff --git a/vllm/v1/metrics/utils.py b/vllm/v1/metrics/utils.py
new file mode 100644
index 000000000000..1ef56fc94869
--- /dev/null
+++ b/vllm/v1/metrics/utils.py
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from typing import TypeAlias
+
+from prometheus_client import Counter, Gauge, Histogram
+
+PromMetric: TypeAlias = Gauge | Counter | Histogram
+
+
+def create_metric_per_engine(
+    metric: PromMetric,
+    per_engine_labelvalues: dict[int, list[object]],
+) -> dict[int, PromMetric]:
+    """Map each engine index to ``metric`` bound to that engine's label values."""
+    children: dict[int, PromMetric] = {}
+    for engine_idx, values in per_engine_labelvalues.items():
+        children[engine_idx] = metric.labels(*values)
+    return children
diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py
index be880bec22ac..204c8bd0e411 100644
--- a/vllm/v1/serial_utils.py
+++ b/vllm/v1/serial_utils.py
@@ -4,6 +4,7 @@
 import dataclasses
 import importlib
 import pickle
+from abc import ABC, abstractmethod
 from collections.abc import Callable, Sequence
 from functools import partial
 from inspect import isclass
@@ -53,6 +54,27 @@
 bytestr: TypeAlias = bytes | bytearray | memoryview | zmq.Frame
 
 
+class OOBTensorConsumer(ABC):
+    """Encoder-side hook that may take tensors out of band."""
+
+    @abstractmethod
+    def new_message(self) -> None:
+        """Invoked at the start of each new encoded message."""
+
+    @abstractmethod
+    def __call__(self, tensor: torch.Tensor) -> dict | None:
+        """
+        Offered tensors belonging to the message currently being encoded.
+
+        Return a dict of arbitrary placeholder data to include in the serialized
+        message, or None to reject the tensor (regular serialization is used).
+        """
+
+
+# dtype, shape, metadata -> tensor
+OOBTensorProvider = Callable[[str, tuple[int, ...], dict], torch.Tensor]
+
+
 def _log_insecure_serialization_warning():
     logger.warning_once(
         "Allowing insecure serialization using pickle due to "
@@ -119,9 +141,16 @@ class MsgpackEncoder:
 
     By default, arrays below 256B are serialized inline Larger will get sent
     via dedicated messages. Note that this is a per-tensor limit.
+
+    When an ``oob_tensor_consumer`` is provided, tensors (CUDA and CPU) will be
+    offered to it for out-of-band handling.
     """
 
-    def __init__(self, size_threshold: int | None = None):
+    def __init__(
+        self,
+        size_threshold: int | None = None,
+        oob_tensor_consumer: OOBTensorConsumer | None = None,
+    ):
         if size_threshold is None:
             size_threshold = envs.VLLM_MSGPACK_ZERO_COPY_THRESHOLD
         self.encoder = msgpack.Encoder(enc_hook=self.enc_hook)
@@ -130,11 +159,14 @@ def __init__(self, size_threshold: int | None = None):
         # pass custom data to the hook otherwise.
         self.aux_buffers: list[bytestr] | None = None
         self.size_threshold = size_threshold
+        self.oob_tensor_consumer = oob_tensor_consumer
         if envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
             _log_insecure_serialization_warning()
 
     def encode(self, obj: Any) -> Sequence[bytestr]:
         try:
+            if self.oob_tensor_consumer is not None:
+                self.oob_tensor_consumer.new_message()
             self.aux_buffers = bufs = [b""]
             bufs[0] = self.encoder.encode(obj)
             # This `bufs` list allows us to collect direct pointers to backing
@@ -147,6 +179,8 @@ def encode(self, obj: Any) -> Sequence[bytestr]:
 
     def encode_into(self, obj: Any, buf: bytearray) -> Sequence[bytestr]:
         try:
+            if self.oob_tensor_consumer is not None:
+                self.oob_tensor_consumer.new_message()
             self.aux_buffers = [buf]
             bufs = self.aux_buffers
             self.encoder.encode_into(obj, buf)
@@ -222,17 +256,19 @@ def _encode_ndarray(
 
     def _encode_tensor(
         self, obj: torch.Tensor
-    ) -> tuple[str, tuple[int, ...], int | memoryview]:
-        assert self.aux_buffers is not None
+    ) -> tuple[str, tuple[int, ...], int | dict | memoryview]:
+        oob_consumer = self.oob_tensor_consumer
         # view the tensor as a contiguous 1D array of bytes
-        arr_data = tensor_data(obj)
-        if obj.nbytes < self.size_threshold:
+        if obj.nbytes < self.size_threshold and obj.is_cpu:
             # Smaller tensors are encoded inline, just like ndarrays.
-            data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr_data)
+            data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, tensor_data(obj))
+        elif oob_consumer is not None and (data := oob_consumer(obj)) is not None:
+            assert isinstance(data, dict)
         else:
             # Otherwise encode index of backing buffer to avoid copy.
+            assert self.aux_buffers is not None
             data = len(self.aux_buffers)
-            self.aux_buffers.append(arr_data)
+            self.aux_buffers.append(tensor_data(obj))
         dtype = str(obj.dtype).removeprefix("torch.")
         return dtype, obj.shape, data
 
@@ -279,9 +315,17 @@ class MsgpackDecoder:
 
     Note that unlike vanilla `msgspec` Decoders, this interface is generally
     not thread-safe when encoding tensors / numpy arrays.
+
+    An ``oob_tensor_provider`` must be supplied when the encoder side uses an
+    ``OOBTensorConsumer``.
     """
 
-    def __init__(self, t: Any | None = None, share_mem: bool = True):
+    def __init__(
+        self,
+        t: Any | None = None,
+        share_mem: bool = True,
+        oob_tensor_provider: OOBTensorProvider | None = None,
+    ):
         self.share_mem = share_mem
         self.pin_tensors = is_pin_memory_available()
         args = () if t is None else (t,)
@@ -289,6 +333,7 @@ def __init__(self, t: Any | None = None, share_mem: bool = True):
             *args, ext_hook=self.ext_hook, dec_hook=self.dec_hook
         )
         self.aux_buffers: Sequence[bytestr] = ()
+        self.oob_tensor_provider = oob_tensor_provider
         if envs.VLLM_ALLOW_INSECURE_SERIALIZATION:
             _log_insecure_serialization_warning()
 
@@ -353,6 +398,12 @@ def _decode_ndarray(self, arr: Any) -> np.ndarray:
 
     def _decode_tensor(self, arr: Any) -> torch.Tensor:
         dtype, shape, data = arr
+        if isinstance(data, dict):
+            assert self.oob_tensor_provider, (
+                "Received OOB tensor but tensor provider is not set"
+            )
+            return self.oob_tensor_provider(dtype, shape, data)
+
         is_aux = isinstance(data, int)
         buffer = self.aux_buffers[data] if is_aux else data
         buffer = buffer if isinstance(buffer, memoryview) else memoryview(buffer)
diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py
index 6c16bc686d16..9a41ff5c818c 100644
--- a/vllm/v1/spec_decode/metrics.py
+++ b/vllm/v1/spec_decode/metrics.py
@@ -9,6 +9,7 @@
 
 from vllm.config import SpeculativeConfig
 from vllm.logger import init_logger
+from vllm.v1.metrics.utils import create_metric_per_engine
 
 logger = init_logger(__name__)
 
@@ -155,7 +156,7 @@ def __init__(
             documentation="Number of spec decoding drafts.",
             labelnames=labelnames,
         )
-        self.counter_spec_decode_num_drafts = make_per_engine(
+        self.counter_spec_decode_num_drafts = create_metric_per_engine(
             counter_drafts, per_engine_labelvalues
         )
 
@@ -164,7 +165,7 @@ def __init__(
             documentation="Number of draft tokens.",
             labelnames=labelnames,
         )
-        self.counter_spec_decode_num_draft_tokens = make_per_engine(
+        self.counter_spec_decode_num_draft_tokens = create_metric_per_engine(
             counter_draft_tokens, per_engine_labelvalues
         )
 
@@ -173,7 +174,7 @@ def __init__(
             documentation="Number of accepted tokens.",
             labelnames=labelnames,
         )
-        self.counter_spec_decode_num_accepted_tokens = make_per_engine(
+        self.counter_spec_decode_num_accepted_tokens = create_metric_per_engine(
             counter_accepted_tokens, per_engine_labelvalues
         )
 
@@ -212,14 +213,3 @@ def observe(self, spec_decoding_stats: SpecDecodingStats, engine_idx: int = 0):
             self.counter_spec_decode_num_accepted_tokens_per_pos[engine_idx]
         ):
             counter.inc(spec_decoding_stats.num_accepted_tokens_per_pos[pos])
-
-
-def make_per_engine(
-    counter: prometheus_client.Counter,
-    per_engine_labelvalues: dict[int, list[object]],
-):
-    """Create a counter for each label value."""
-    return {
-        idx: counter.labels(*labelvalues)
-        for idx, labelvalues in per_engine_labelvalues.items()
-    }
diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py
index 970465089e10..3710593dbd44 100644
--- a/vllm/v1/utils.py
+++ b/vllm/v1/utils.py
@@ -10,6 +10,7 @@
 from dataclasses import dataclass
 from multiprocessing import connection
 from multiprocessing.process import BaseProcess
+from multiprocessing.queues import Queue
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -173,6 +174,7 @@ def __init__(
         input_addresses: list[str],
         output_addresses: list[str],
         stats_update_address: str | None = None,
+        tensor_queue: Queue | None = None,
     ):
         """Initialize and start API server worker processes.
 
@@ -185,6 +187,7 @@ def __init__(
             input_addresses: Input addresses for each API server
             output_addresses: Output addresses for each API server
             stats_update_address: Optional stats update address
+            tensor_queue: Optional tensor IPC queue for sharing MM tensors
         """
         self.listen_address = listen_address
         self.sock = sock
@@ -205,6 +208,8 @@ def __init__(
             }
             if stats_update_address is not None:
                 client_config["stats_update_address"] = stats_update_address
+            if tensor_queue is not None:
+                client_config["tensor_queue"] = tensor_queue
 
             proc = spawn_context.Process(
                 target=target_server_fn,
@@ -419,7 +424,7 @@ def tensor_data(tensor: torch.Tensor) -> memoryview:
     Returns:
         A memoryview of the tensor data as uint8.
     """
-    return tensor.flatten().contiguous().view(torch.uint8).numpy().data
+    return tensor.flatten().cpu().contiguous().view(torch.uint8).numpy().data
 
 
 @dataclass
diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py
index 59786ed7a153..8e5bb11e4dad 100644
--- a/vllm/v1/worker/gpu/attn_utils.py
+++ b/vllm/v1/worker/gpu/attn_utils.py
@@ -111,6 +111,7 @@ def _reshape_kv_cache(
     kv_cache_config: KVCacheConfig,
     kv_cache_raw_tensors: dict[str, torch.Tensor],
     attn_backends: dict[str, AttentionBackend],
+    cache_dtype: str,
 ) -> dict[str, torch.Tensor]:
     kv_caches: dict[str, torch.Tensor] = {}
     for kv_cache_group_spec in kv_cache_config.kv_cache_groups:
@@ -127,6 +128,7 @@ def _reshape_kv_cache(
                 kv_cache_spec.block_size,
                 kv_cache_spec.num_kv_heads,
                 kv_cache_spec.head_size,
+                cache_dtype,
             )
 
             # FIXME(woosuk): Add kv_cache_stride_order to all attention backends.
@@ -155,9 +157,12 @@ def init_kv_cache(
     kv_cache_config: KVCacheConfig,
     attn_backends: dict[str, AttentionBackend],
     device: torch.device,
+    cache_dtype: str,
 ) -> dict[str, torch.Tensor]:
     kv_cache_raw_tensors = _allocate_kv_cache(kv_cache_config, device)
-    kv_caches = _reshape_kv_cache(kv_cache_config, kv_cache_raw_tensors, attn_backends)
+    kv_caches = _reshape_kv_cache(
+        kv_cache_config, kv_cache_raw_tensors, attn_backends, cache_dtype
+    )
     bind_kv_cache(kv_caches, forward_context, runner_kv_caches)
     return kv_caches
 
diff --git a/vllm/v1/worker/gpu/block_table.py b/vllm/v1/worker/gpu/block_table.py
index 3a2c0562a92c..e79a7afbd81e 100644
--- a/vllm/v1/worker/gpu/block_table.py
+++ b/vllm/v1/worker/gpu/block_table.py
@@ -169,7 +169,7 @@ def get_dummy_slot_mappings(self, num_tokens: int) -> torch.Tensor:
         return self.slot_mappings[:, :num_tokens]
 
 
-@triton.jit
+@triton.jit(do_not_specialize=["num_reqs"])
 def _gather_block_tables_kernel(
     batch_idx_to_req_idx,  # [batch_size]
     src_block_table_ptrs,  # [num_kv_cache_groups]
diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py
index b4a0c26cec02..5788b31d2583 100644
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -195,7 +195,6 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
             num_speculative_steps=self.num_speculative_steps,
             vocab_size=self.vocab_size,
             device=self.device,
-            cache_draft_logits=not use_strict_rejection_sampling,
         )
         self.input_buffers = InputBuffers(
             max_num_reqs=self.max_num_reqs,
@@ -360,6 +359,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
             self.kv_cache_config,
             self.attn_backends,
             self.device,
+            self.cache_config.cache_dtype,
         )
         self.kv_connector = get_kv_connector(self.vllm_config, kv_caches_dict)
 
@@ -446,7 +446,6 @@ def _dummy_run(
                 next_prefill_tokens=self.req_states.next_prefill_tokens,
                 temperature=self.sampler.sampling_states.temperature.gpu,
                 seeds=self.sampler.sampling_states.seeds.gpu,
-                draft_logits_out=self.req_states.draft_logits,
                 num_tokens_across_dp=num_tokens_across_dp,
                 dummy_run=True,
                 skip_attn_for_dummy_run=skip_attn,
@@ -557,18 +556,23 @@ def capture_model(self) -> int:
         )
         return cuda_graph_size
 
+    def _remove_request(self, req_id: str) -> bool:
+        if self.req_states.remove_request(req_id):
+            if self.encoder_cache is not None:
+                self.encoder_cache.remove_request(req_id)
+            if self.prompt_logprobs_worker is not None:
+                self.prompt_logprobs_worker.remove_request(req_id)
+            self.lora_state.remove_request(req_id)
+            return True
+        return False  # unknown req_id: req_states held nothing to remove
+
     def finish_requests(self, scheduler_output: SchedulerOutput) -> None:
         finished_req_ids = scheduler_output.finished_req_ids
         preempted_req_ids = scheduler_output.preempted_req_ids
         if preempted_req_ids:
             finished_req_ids = finished_req_ids.union(preempted_req_ids)
         for req_id in finished_req_ids:
-            self.req_states.remove_request(req_id)
-            if self.encoder_cache is not None:
-                self.encoder_cache.remove_request(req_id)
-            if self.prompt_logprobs_worker is not None:
-                self.prompt_logprobs_worker.remove_request(req_id)
-            self.lora_state.remove_request(req_id)
+            self._remove_request(req_id)
 
     def free_states(self, scheduler_output: SchedulerOutput) -> None:
         if self.encoder_cache is not None:
@@ -580,6 +584,12 @@ def add_requests(self, scheduler_output: SchedulerOutput) -> None:
             assert new_req_data.prompt_token_ids is not None
             assert new_req_data.prefill_token_ids is not None
             req_id = new_req_data.req_id
+
+            # Streaming input update: if the request already exists from a
+            # prior chunk, remove its old state so it can be cleanly re-added
+            # below with the updated prompt_token_ids and mm_features.
+            self._remove_request(req_id)
+
             prompt_len = len(new_req_data.prompt_token_ids)
             self.req_states.add_request(
                 req_id=req_id,
@@ -815,11 +825,12 @@ def sample(
         else:
             # Rejection sampling for spec decoding.
             assert self.rejection_sampler is not None
+            assert self.speculator is not None
             sampler_output = self.rejection_sampler(
                 logits,
                 input_batch,
                 # Draft logits are needed for probabilistic rejection sampling.
-                self.req_states.draft_logits,
+                self.speculator.draft_logits,
             )
 
         # Get the number of sampled and rejected tokens.
@@ -1145,7 +1156,6 @@ def sample_tokens(
                 self.req_states.next_prefill_tokens,
                 self.sampler.sampling_states.temperature.gpu,
                 self.sampler.sampling_states.seeds.gpu,
-                self.req_states.draft_logits,
                 num_tokens_across_dp=num_tokens_across_dp,
             )
             self.req_states.draft_tokens[input_batch.idx_mapping] = draft_tokens
diff --git a/vllm/v1/worker/gpu/model_states/default.py b/vllm/v1/worker/gpu/model_states/default.py
index 104e4c1948b5..8e73867deb2e 100644
--- a/vllm/v1/worker/gpu/model_states/default.py
+++ b/vllm/v1/worker/gpu/model_states/default.py
@@ -7,6 +7,7 @@
 
 from vllm.config import VllmConfig
 from vllm.config.compilation import CUDAGraphMode
+from vllm.tasks import GenerationTask
 from vllm.v1.core.sched.output import NewRequestData
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.worker.gpu.attn_utils import build_attn_metadata
@@ -61,6 +62,28 @@ def __init__(
             device=self.device,
         )
 
+    def get_supported_generation_tasks(self) -> tuple[GenerationTask, ...]:
+        """Return the generation tasks supported by ``self.model``."""
+        from vllm.model_executor.models.interfaces import (
+            supports_realtime,
+            supports_transcription,
+        )
+        from vllm.model_executor.models.interfaces_base import is_text_generation_model
+
+        model = self.model
+        tasks: list[GenerationTask] = []
+        if is_text_generation_model(model):
+            tasks.append("generate")
+        if supports_transcription(model):
+            # A transcription-only model reports that single task and nothing else.
+            if model.supports_transcription_only:
+                return ("transcription",)
+            tasks.append("transcription")
+        if supports_realtime(model):
+            tasks.append("realtime")
+
+        return tuple(tasks)
+
     def add_request(self, req_index: int, new_req_data: NewRequestData) -> None:
         if self.rope_state is not None:
             assert new_req_data.prefill_token_ids is not None
diff --git a/vllm/v1/worker/gpu/model_states/interface.py b/vllm/v1/worker/gpu/model_states/interface.py
index 1c114496ddd8..d83ab2fc515f 100644
--- a/vllm/v1/worker/gpu/model_states/interface.py
+++ b/vllm/v1/worker/gpu/model_states/interface.py
@@ -28,8 +28,9 @@ def __init__(
     ) -> None:
         raise NotImplementedError
 
+    @abstractmethod
     def get_supported_generation_tasks(self) -> tuple[GenerationTask, ...]:
-        return ("generate",)
+        raise NotImplementedError
 
     def add_request(self, req_index: int, new_req_data: NewRequestData) -> None:
         return None
diff --git a/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py b/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
index 49b6b5331b5c..4df88bf95c4c 100644
--- a/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
+++ b/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py
@@ -76,6 +76,17 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device):
             device=device,
         )
 
+        cache_draft_logits = self.speculative_config.rejection_sample_method != "strict"
+        self.draft_logits: torch.Tensor | None = None
+        if cache_draft_logits:
+            self.draft_logits = torch.zeros(
+                self.max_num_reqs,
+                self.num_speculative_steps,
+                self.vocab_size,
+                dtype=torch.float32,
+                device=device,
+            )
+
         # currently we don't  support PIECEWISE for Eagle.
         cudagraph_mode = vllm_config.compilation_config.cudagraph_mode
         if cudagraph_mode.decode_mode() == CUDAGraphMode.FULL:
@@ -158,7 +169,6 @@ def generate_draft(
         slot_mappings: dict[str, torch.Tensor] | None,
         num_tokens_across_dp: torch.Tensor | None,
         cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE,
-        draft_logits_out: torch.Tensor | None = None,
     ) -> None:
         pos = self.input_buffers.positions[:num_reqs]
         query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1]
@@ -185,8 +195,8 @@ def generate_draft(
                 self.seeds,
                 pos + 1,
                 apply_temperature=True,
-                processed_logits_out=draft_logits_out[:, step]
-                if draft_logits_out is not None
+                processed_logits_out=self.draft_logits[:, step]
+                if self.draft_logits is not None
                 else None,
             )
             self.draft_tokens[:num_reqs, step] = draft_tokens
@@ -241,8 +251,6 @@ def propose(
         temperature: torch.Tensor,
         # [max_num_reqs]
         seeds: torch.Tensor,
-        # [max_num_reqs, num_speculative_steps, vocab_size]
-        draft_logits_out: torch.Tensor | None,
         num_tokens_across_dp: torch.Tensor | None = None,
         dummy_run: bool = False,
         skip_attn_for_dummy_run: bool = False,
@@ -308,8 +316,8 @@ def propose(
             self.seeds,
             pos + 1,
             apply_temperature=True,
-            processed_logits_out=draft_logits_out[:, 0]
-            if draft_logits_out is not None
+            processed_logits_out=self.draft_logits[:, 0]
+            if self.draft_logits is not None
             else None,
         )
 
@@ -394,7 +402,6 @@ def propose(
             slot_mappings_updated,
             num_tokens_across_dp=num_tokens_across_dp,
             cudagraph_runtime_mode=batch_desc.cg_mode,
-            draft_logits_out=draft_logits_out,
         )
         return self.draft_tokens[:num_reqs]
 
diff --git a/vllm/v1/worker/gpu/states.py b/vllm/v1/worker/gpu/states.py
index 3fb02c12d999..24d225886106 100644
--- a/vllm/v1/worker/gpu/states.py
+++ b/vllm/v1/worker/gpu/states.py
@@ -15,7 +15,6 @@ def __init__(
         num_speculative_steps: int,
         vocab_size: int,
         device: torch.device,
-        cache_draft_logits: bool,
     ):
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
@@ -71,18 +70,6 @@ def __init__(
             dtype=torch.int64,
             device=device,
         )
-        # Draft token logits.
-        # NOTE: This tensor maintains the "processed" logits after applying temperature,
-        # top-p, etc.
-        self.draft_logits: torch.Tensor | None = None
-        if cache_draft_logits:
-            self.draft_logits = torch.zeros(
-                self.max_num_reqs,
-                self.num_speculative_steps,
-                self.vocab_size,
-                dtype=torch.float32,
-                device=device,
-            )
 
         self.next_prefill_tokens = torch.zeros(
             self.max_num_reqs, dtype=torch.int32, device=device
@@ -122,13 +109,14 @@ def apply_staged_writes(self) -> None:
         self.all_token_ids.apply_write()
         self.num_computed_tokens.apply_write()
 
-    def remove_request(self, req_id: str) -> None:
+    def remove_request(self, req_id: str) -> bool:
         req_idx = self.req_id_to_index.pop(req_id, None)
         if req_idx is None:
             # Request not found.
-            return
+            return False
         self.index_to_req_id.pop(req_idx, None)
         self.free_indices.append(req_idx)
+        return True
 
     def any_prefills(self, idx_mapping_np: np.ndarray) -> bool:
         return np.any(
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 34bcc241f769..fb7795e04740 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -134,7 +134,13 @@ def __init__(
             pin_memory=pin_memory,
         )
         self.num_tokens_no_spec = self.num_tokens_no_spec_cpu_tensor.numpy()
-        self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32)
+        self.num_prompt_tokens_cpu_tensor = torch.zeros(
+            (max_num_reqs,),
+            device="cpu",
+            dtype=torch.int32,
+            pin_memory=pin_memory,
+        )
+        self.num_prompt_tokens = self.num_prompt_tokens_cpu_tensor.numpy()
         self.num_computed_tokens_cpu_tensor = torch.zeros(
             (max_num_reqs,),
             device="cpu",
@@ -886,7 +892,7 @@ def get_pooling_metadata(self) -> PoolingMetadata:
         pooling_states = self.get_pooling_states()
 
         return PoolingMetadata(
-            prompt_lens=torch.from_numpy(self.num_prompt_tokens[: self.num_reqs]),
+            prompt_lens=self.num_prompt_tokens_cpu_tensor[: self.num_reqs].clone(),
             prompt_token_ids=self.sampling_metadata.prompt_token_ids,
             pooling_params=pooling_params,
             pooling_states=pooling_states,
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 0365a9938d12..81326b6d11fa 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -740,19 +740,6 @@ def __init__(
 
         self.uniform_decode_query_len = 1 + self.num_spec_tokens
 
-        # When spec decode is active, the mamba backend classifies requests
-        # with query_len <= reorder_batch_threshold as "decodes". Prefill
-        # chunks that fall under this threshold get processed via the decode
-        # path, which stores intermediate states at sequential slots. We must
-        # set num_accepted_tokens to the chunk's query_len for those requests
-        # so the next iteration reads from the correct final-state slot.
-        # Prefills that went through the actual prefill path should keep the
-        # default value of 1 (the prefill path stores state at slot 0 only).
-        self.needs_prefill_as_decode_slots: bool = False
-        self.prefill_as_decode_num_tokens = self._make_buffer(
-            self.max_num_reqs, dtype=torch.int32
-        )
-
         # Cudagraph dispatcher for runtime cudagraph dispatching.
         self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config)
 
@@ -1369,16 +1356,6 @@ def _update_states_after_model_execute(
             .int()
             .argmax(-1)
         )
-        spec_decode_active = bool(scheduler_output.scheduled_spec_decode_tokens)
-        if self.needs_prefill_as_decode_slots and spec_decode_active:
-            mamba_utils.update_accepted_tokens_for_prefill_as_decode(
-                self.input_batch,
-                self.prefill_as_decode_num_tokens,
-                self.num_accepted_tokens.gpu,
-                scheduler_output,
-                self.reorder_batch_threshold,
-                num_reqs,
-            )
 
         if self.cache_config.mamba_cache_mode == "align":
             for i, num_tokens in enumerate(
@@ -1982,14 +1959,23 @@ def _get_block_table(kv_cache_gid: int):
             attn_gid = self.routed_experts_attn_gid
             slot_mapping_attn = slot_mappings[attn_gid]
             self.slot_mapping = slot_mapping_attn[:num_tokens].cpu().numpy()
+        # Compute is_prefilling: True if request is still in prefill phase
+        # (num_computed_tokens < num_prompt_tokens). Used by mamba backends to
+        # distinguish actual decodes from short extends.
+        num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[
+            :num_reqs_padded
+        ]
+        num_prompt_tokens_cpu = self.input_batch.num_prompt_tokens_cpu_tensor[
+            :num_reqs_padded
+        ]
+        is_prefilling = num_computed_tokens_cpu < num_prompt_tokens_cpu
+
         cm_base = CommonAttentionMetadata(
             query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1],
             query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1],
             seq_lens=self.seq_lens.gpu[:num_reqs_padded],
             _seq_lens_cpu=self.seq_lens.cpu[:num_reqs_padded],
-            _num_computed_tokens_cpu=self.input_batch.num_computed_tokens_cpu_tensor[
-                :num_reqs_padded
-            ],
+            _num_computed_tokens_cpu=num_computed_tokens_cpu,
             num_reqs=num_reqs_padded,
             num_actual_tokens=num_tokens_padded,
             max_query_len=max_query_len,
@@ -1997,6 +1983,7 @@ def _get_block_table(kv_cache_gid: int):
             block_table_tensor=block_table_gid_0,
             slot_mapping=slot_mapping_gid_0,
             causal=True,
+            is_prefilling=is_prefilling,
         )
 
         if self.dcp_world_size > 1:
@@ -2048,8 +2035,6 @@ def _build_attn_group_metadata(
                 else 0
             )
 
-            if isinstance(builder, Mamba2AttentionMetadataBuilder):
-                self.needs_prefill_as_decode_slots = True
             extra_attn_metadata_args = {}
             if use_spec_decode and isinstance(
                 builder, (Mamba2AttentionMetadataBuilder, GDNAttentionMetadataBuilder)
@@ -2834,15 +2819,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]:
         if not is_pooling_model(model):
             return []
 
-        supported_tasks = list(model.pooler.get_supported_tasks())
-
-        if "score" in supported_tasks:
-            num_labels = getattr(self.model_config.hf_config, "num_labels", 0)
-            if num_labels != 1:
-                supported_tasks.remove("score")
-                logger.debug_once("Score API is only enabled for num_labels == 1.")
-
-        return supported_tasks
+        return list(model.pooler.get_supported_tasks())
 
     def get_supported_tasks(self) -> tuple[SupportedTask, ...]:
         tasks = list[SupportedTask]()
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index d101edc18100..91dcdc2b9798 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -315,30 +315,12 @@ def init_device(self):
 
     # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
     # to hijack tensor allocation.
-    def load_model(self) -> None:
-        dummy_weights = os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1"
-        if dummy_weights:
-            (
-                expanded_physical_to_logical,
-                num_logical_experts,
-                old_num_physical_experts,
-            ) = self.elastic_ep_executor.receive_expert_mapping()
-            num_physical_experts = expanded_physical_to_logical.shape[1]
-            self.parallel_config.eplb_config.num_redundant_experts = (
-                num_physical_experts - num_logical_experts
-            )
-
+    def load_model(self, *, load_dummy_weights: bool = False) -> None:
         with (
             self._maybe_get_memory_pool_context(tag="weights"),
             set_current_vllm_config(self.vllm_config),
         ):
-            self.model_runner.load_model(load_dummy_weights=dummy_weights)
-
-        if dummy_weights:
-            self.model_runner.setup_eplb_from_mapping(
-                expanded_physical_to_logical, old_num_physical_experts
-            )
-            self.model_runner.eep_eplb_suppressed = True
+            self.model_runner.load_model(load_dummy_weights=load_dummy_weights)
 
     def update_config(self, overrides: dict[str, Any]) -> None:
         self.model_runner.update_config(overrides)
@@ -417,9 +399,7 @@ def determine_available_memory(self) -> int:
         )
 
         self.non_torch_memory = profile_result.non_torch_increase
-        self.peak_activation_memory = (
-            profile_result.torch_peak_increase + cudagraph_memory_estimate_applied
-        )
+        self.peak_activation_memory = profile_result.torch_peak_increase
         self.cudagraph_memory_estimate = cudagraph_memory_estimate
 
         free_gpu_memory = profile_result.after_profile.free_memory
@@ -638,6 +618,7 @@ def compile_or_warm_up_model(self) -> float:
             # slightly underestimate the memory consumption.
             # So leave a small buffer (=150MiB) to avoid OOM.
             redundancy_buffer_memory = 150 * (1 << 20)
+
             non_kv_cache_memory = (
                 self.model_runner.model_memory_usage
                 + self.peak_activation_memory
diff --git a/vllm/v1/worker/mamba_utils.py b/vllm/v1/worker/mamba_utils.py
index 68172133eb99..ed618e09973f 100644
--- a/vllm/v1/worker/mamba_utils.py
+++ b/vllm/v1/worker/mamba_utils.py
@@ -67,6 +67,8 @@ class MambaCopyBuffers:
     src_ptrs: CpuGpuBuffer
     dst_ptrs: CpuGpuBuffer
     sizes: CpuGpuBuffer
+    mamba_group_ids: list[int]
+    mamba_spec: MambaSpec
     offset: int = 0
 
     @classmethod
@@ -77,7 +79,7 @@ def create(
         copy_funcs: tuple[MambaStateCopyFunc, ...],
         make_buffer: Callable[..., CpuGpuBuffer],
     ) -> "MambaCopyBuffers":
-        mamba_group_ids, _ = get_mamba_groups(kv_cache_config)
+        mamba_group_ids, mamba_spec = get_mamba_groups(kv_cache_config)
         entries_per_req = sum(
             len(kv_cache_config.kv_cache_groups[gid].layer_names)
             for gid in mamba_group_ids
@@ -87,6 +89,8 @@ def create(
             src_ptrs=make_buffer(n, dtype=torch.int64),
             dst_ptrs=make_buffer(n, dtype=torch.int64),
             sizes=make_buffer(n, dtype=torch.int32),
+            mamba_group_ids=mamba_group_ids,
+            mamba_spec=mamba_spec,
         )
 
 
@@ -155,7 +159,8 @@ def preprocess_mamba(
     Copy the mamba state of previous step to the last
     (1 + num_speculative_blocks) block.
     """
-    mamba_group_ids, mamba_spec = get_mamba_groups(kv_cache_config)
+    mamba_group_ids = copy_bufs.mamba_group_ids
+    mamba_spec = copy_bufs.mamba_spec
     num_speculative_blocks = mamba_spec.num_speculative_blocks
     # TODO(Chen): we need to optimize this function a lot
     assert cache_config.enable_prefix_caching
@@ -231,8 +236,8 @@ def postprocess_mamba(
     num_scheduled_tokens_dict = scheduler_output.num_scheduled_tokens
     scheduled_spec_decode_tokens_dict = scheduler_output.scheduled_spec_decode_tokens
     num_accepted_tokens_cpu = input_batch.num_accepted_tokens_cpu
-    # NOTE: can be optimized as this function always returns the same result
-    mamba_group_ids, mamba_spec = get_mamba_groups(kv_cache_config)
+    mamba_group_ids = copy_bufs.mamba_group_ids
+    mamba_spec = copy_bufs.mamba_spec
     copy_bufs.offset = 0
     for i, req_id in enumerate(input_batch.req_ids):
         req_state = requests[req_id]
@@ -266,45 +271,3 @@ def postprocess_mamba(
             if src_block_idx == dest_block_idx:
                 num_accepted_tokens_cpu[i] = 1
     do_mamba_copy_block(copy_bufs)
-
-
-def update_accepted_tokens_for_prefill_as_decode(
-    input_batch: GPUInputBatch,
-    prefill_as_decode_num_tokens: CpuGpuBuffer,
-    num_accepted_tokens_gpu: torch.Tensor,
-    scheduler_output: SchedulerOutput,
-    decode_qlen_threshold: int | None,
-    num_reqs: int,
-):
-    """
-    Adjusts num_accepted_tokens for prefill chunks processed via the decode path.
-    This ensures subsequent iterations read from the correct sequential state slot
-    instead of the default prefill slot 0. Not used by GDN attention, which manually
-    separates short prefills and short decodes when building the attention metadata.
-    """
-    any_is_prefill = False
-    for i in range(num_reqs):
-        num_computed = input_batch.num_computed_tokens_cpu[i]
-        num_prompt = input_batch.num_prompt_tokens[i]
-        is_prefill = num_computed < num_prompt
-        req_id = input_batch.req_ids[i]
-        query_len = scheduler_output.num_scheduled_tokens[req_id]
-
-        if is_prefill:
-            classified_as_decode = (
-                decode_qlen_threshold is not None and query_len <= decode_qlen_threshold
-            )
-            num_tokens = query_len if classified_as_decode else 1
-            any_is_prefill = True
-        else:
-            num_tokens = -1
-        prefill_as_decode_num_tokens.np[i] = num_tokens
-
-    # We can skip the GPU transfer if there aren't any values to update
-    if any_is_prefill:
-        prefill_as_decode_num_tokens.copy_to_gpu(num_reqs)
-        num_accepted_tokens_gpu[:num_reqs] = torch.where(
-            prefill_as_decode_num_tokens.gpu[:num_reqs] != -1,
-            prefill_as_decode_num_tokens.gpu[:num_reqs],
-            num_accepted_tokens_gpu[:num_reqs],
-        )
diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py
index b6ba8adf8336..041fff637b87 100644
--- a/vllm/v1/worker/worker_base.py
+++ b/vllm/v1/worker/worker_base.py
@@ -122,7 +122,7 @@ def get_model_inspection(self) -> str:
 
         return format_model_inspection(self.get_model())
 
-    def load_model(self) -> None:
+    def load_model(self, *, load_dummy_weights: bool = False) -> None:
         """Load model onto target device."""
         raise NotImplementedError