diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 4cacc2710f10..64b285a0dc1b 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -326,8 +326,7 @@ apply_rocm_test_overrides() { if [[ $cmds == *" kernels/moe"* ]]; then cmds="${cmds} \ --ignore=kernels/moe/test_moe.py \ - --ignore=kernels/moe/test_cutlass_moe.py \ - --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py" + --ignore=kernels/moe/test_cutlass_moe.py" fi # --- Entrypoint ignores --- diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh index 6ec6ab94ff08..1def2c4682b1 100755 --- a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh +++ b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh @@ -127,7 +127,7 @@ run_and_track_test() { # --- Actual Test Execution --- run_and_track_test 1 "test_struct_output_generate.py" \ - "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" + "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\"" run_and_track_test 2 "test_moe_pallas.py" \ "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py" run_and_track_test 3 "test_lora.py" \ diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 82e97bfbb1b2..1fd3d0e2488d 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -39,8 +39,7 @@ ##################################################################################################################################### # # # IMPORTANT: # -# * Currently AMD CI has MI300 agents, MI325 agents, and MI355 agents. Of those, AMD is using mostly MI325 and MI355. AMD team # -# is actively working on enabling more MI300 machines. 
All upcoming feature improvements are tracked in: # +# * Currently AMD CI has MI250 agents, MI325 agents, and MI355 agents. All upcoming feature improvements are tracked in: # # https://github.com/vllm-project/vllm/issues/34994 # # # #-----------------------------------------------------------------------------------------------------------------------------------# @@ -49,13 +48,15 @@ # * [Pytorch Nightly Dependency Override Check]: if this test fails, it means the nightly torch version is not compatible with # # some of the dependencies. Please check the error message and add the package to # # whitelist in `/vllm/tools/pre_commit/generate_nightly_torch_test.py`. # -# * [Entrypoints Integration Test (LLM)]: # +# * [Entrypoints Integration (LLM)]: # # - {`pytest -v -s entrypoints/llm/test_generate.py`}: It needs a clean process # # - {`pytest -v -s entrypoints/offline_mode`}: Needs to avoid interference with other tests # -# * [V1 Test e2e + engine]: The test uses 4 GPUs, but we schedule it on 8-GPU machines for stability. See discussion here: # -# https://github.com/vllm-project/vllm/pull/31040 # -# * [V1 others]: # -# - Split the tests to avoid interference # +# * [Engine / Engine (1 GPU) / e2e Scheduling / e2e Core / V1 e2e / Spec Decode / V1 Sample + Logits / V1 Core + KV + Metrics]: # +# - Previously a single "V1 Test e2e + engine" step, now split across multiple groups. # +# - V1 e2e (2/4 GPUs) uses 4 GPUs but is scheduled on 8-GPU machines for stability. See: # +# https://github.com/vllm-project/vllm/pull/31040 # +# * [V1 Sample + Logits / V1 Core + KV + Metrics / V1 others (CPU)]: # +# - Previously a single "V1 others" step, now split to avoid interference. # # - Integration test for streaming correctness (requires special branch for __harness__ lib). 
# # * [V1 others (CPU)]: Split the tests to avoid interference # # * [PyTorch Compilation Unit Tests]: Run unit tests defined directly under `compile/`, not including subdirectories, which # @@ -83,9 +84,9 @@ # run plamo2 model in vLLM. # # * [Language Models Test (Extended Generation)]: Install fast path packages for testing against transformers (mamba, conv1d) # # and to run plamo2 model in vLLM. # -# * [Multi-Modal Models (Standard)]: # +# * [Multi-Modal Models (Standard) 1-4]: # # - Do NOT remove `VLLM_WORKER_MULTIPROC_METHOD=spawn` setting as ROCm requires this for certain models to function. # -# * [Transformers Nightly Models Test]: Whisper needs `VLLM_WORKER_MULTIPROC_METHOD=spawn` to avoid deadlock. # +# * [Transformers Nightly Models]: Whisper needs `VLLM_WORKER_MULTIPROC_METHOD=spawn` to avoid deadlock. # # * [Plugin Tests (2 GPUs)]: # # - {`pytest -v -s entrypoints/openai/test_oot_registration.py`}: It needs a clean process # # - {`pytest -v -s models/test_oot_registration.py`}: It needs a clean process # @@ -94,11 +95,11 @@ # - There is some Tensor Parallelism related processing logic in LoRA that requires multi-GPU testing for validation. # # - {`pytest -v -s -x lora/test_gptoss_tp.py`}: Disabled for now because MXFP4 backend on non-cuda platform doesn't support # # LoRA yet. # -# * [Distributed Tests (GPU_TAG)]: Don't test llama model here, it seems hf implementation is buggy. See: # -# https://github.com/vllm-project/vllm/pull/5689 # -# * [Distributed Tests (GPU_TAG)]: Some old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 in # -# favor of new tests in fusions_e2e. We avoid replicating the new jobs in # -# this file as it's deprecated. # +# * [Distributed Tests (NxGPUs)(HW-TAG)]: Don't test llama model here, it seems hf implementation is buggy. 
See: # +# https://github.com/vllm-project/vllm/pull/5689 # +# * [Distributed Tests (NxGPUs)(HW-TAG)]: Some old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293 # +# in favor of new tests in fusions_e2e. We avoid replicating the new jobs in # +# this file as it's deprecated. # # # ##################################################################################################################################### @@ -223,7 +224,7 @@ steps: - vllm/platforms/rocm.py commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration (LLM) # TBD @@ -254,11 +255,11 @@ steps: source_file_dependencies: - vllm/ - tests/entrypoints/rpc - - tests/entrypoints/instrumentator + - tests/entrypoints/serve/instrumentator - tests/tool_use commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/instrumentator + - pytest -v -s entrypoints/serve/instrumentator - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use @@ -483,19 +484,6 @@ steps: - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism" -- label: Entrypoints V1 # TBD - timeout_in_minutes: 180 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints - - - label: V1 
Sample + Logits # TBD timeout_in_minutes: 60 mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] @@ -1173,14 +1161,14 @@ steps: - vllm/v1/engine/ - vllm/v1/worker/ - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py - label: Distributed Compile + RPC Tests (2 GPUs) # TBD @@ -1402,7 +1390,7 @@ steps: - label: Distributed Tests (2 GPUs)(H100-MI250) # TBD timeout_in_minutes: 180 mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250] - agent_pool: mi250_2 + agent_pool: mi325_2 num_gpus: 2 working_dir: "/vllm-workspace/" source_file_dependencies: @@ -1412,7 +1400,6 @@ steps: - vllm/v1/attention/backends/ - vllm/v1/attention/selector.py - tests/distributed/test_context_parallel.py - - tests/v1/distributed/test_dbo.py - examples/offline_inference/data_parallel.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py @@ -1420,7 +1407,6 @@ steps: - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s tests/distributed/test_context_parallel.py - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization - - pytest -v -s tests/v1/distributed/test_dbo.py ##################################################################################################################################### @@ -1477,11 +1463,11 @@ steps: source_file_dependencies: - vllm/ - tests/entrypoints/rpc - - 
tests/entrypoints/instrumentator + - tests/entrypoints/serve/instrumentator - tests/tool_use commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/instrumentator + - pytest -v -s entrypoints/serve/instrumentator - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use @@ -1511,8 +1497,6 @@ steps: - vllm/distributed/ - tests/distributed/test_torchrun_example.py - tests/distributed/test_torchrun_example_moe.py - - examples/offline_inference/rlhf.py - - examples/offline_inference/rlhf_colocate.py - examples/rl/ - tests/examples/offline_inference/data_parallel.py - vllm/platforms/rocm.py @@ -1762,6 +1746,7 @@ steps: timeout_in_minutes: 106 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -1770,19 +1755,6 @@ steps: - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy" -- label: Entrypoints V1 # 25.7m - timeout_in_minutes: 45 - mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] - agent_pool: mi325_1 - optional: true - working_dir: "/vllm-workspace/tests" - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest -v -s v1/entrypoints - - - label: V1 Spec Decode # TBD timeout_in_minutes: 40 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] @@ -2395,14 +2367,14 @@ steps: - vllm/v1/engine/ - vllm/v1/worker/ - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/entrypoints/openai/test_multi_api_servers.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s 
v1/entrypoints/openai/test_multi_api_servers.py + - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py - label: Distributed Compile + RPC Tests (2 GPUs) # 56.1m @@ -2580,6 +2552,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 num_gpus: 4 + optional: true working_dir: "/vllm-workspace/tests" source_file_dependencies: - vllm/ @@ -2596,21 +2569,16 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_2 num_gpus: 2 - optional: true working_dir: "/vllm-workspace/" source_file_dependencies: - vllm/distributed/ - vllm/v1/distributed/ - vllm/model_executor/layers/fused_moe/ - - tests/distributed/test_context_parallel.py - tests/v1/distributed/test_dbo.py - - examples/offline_inference/data_parallel.py - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - export TORCH_NCCL_BLOCKING_WAIT=1 - - pytest -v -s tests/distributed/test_context_parallel.py - - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py @@ -2669,7 +2637,7 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8.txt + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt - label: LM Eval Large Models (H200-MI325) # TBD @@ -2700,6 +2668,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 num_gpus: 4 + optional: true working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - csrc/ @@ -2720,6 +2689,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 num_gpus: 4 + optional: true working_dir: 
"/vllm-workspace/.buildkite/lm-eval-harness" source_file_dependencies: - csrc/ @@ -2785,6 +2755,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 num_gpus: 4 + optional: true working_dir: "/vllm-workspace" source_file_dependencies: - vllm/model_executor/models/ @@ -2827,6 +2798,7 @@ steps: timeout_in_minutes: 11 mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 + optional: true working_dir: "/vllm-workspace" source_file_dependencies: - vllm/model_executor/models/ @@ -2848,6 +2820,7 @@ steps: mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325] agent_pool: mi325_4 num_gpus: 4 + optional: true working_dir: "/vllm-workspace" source_file_dependencies: - vllm/model_executor/models/ @@ -2990,11 +2963,11 @@ steps: source_file_dependencies: - vllm/ - tests/entrypoints/rpc - - tests/entrypoints/instrumentator + - tests/entrypoints/serve/instrumentator - tests/tool_use commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/instrumentator + - pytest -v -s entrypoints/serve/instrumentator - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use @@ -3599,7 +3572,7 @@ steps: - vllm/_aiter_ops.py - vllm/platforms/rocm.py commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8.txt + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt - label: LM Eval Large Models (4 GPUs)(FP8) # TBD diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 6cf8b43f57c4..0b76c0223f93 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -27,14 +27,14 @@ steps: - vllm/v1/engine/ - vllm/v1/worker/ - tests/v1/distributed - - tests/v1/entrypoints/openai/test_multi_api_servers.py + - 
tests/entrypoints/openai/test_multi_api_servers.py commands: # https://github.com/NVIDIA/nccl/issues/1838 - export NCCL_CUMEM_HOST_ENABLE=0 - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py - - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py - label: Distributed Compile + RPC Tests (2 GPUs) timeout_in_minutes: 20 @@ -88,7 +88,6 @@ steps: - vllm/distributed/ - tests/distributed/test_torchrun_example.py - tests/distributed/test_torchrun_example_moe.py - - examples/offline_inference/rlhf.py - examples/offline_inference/rlhf_colocate.py - examples/rl/ - tests/examples/offline_inference/data_parallel.py diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml index be83bab8fa29..ed0df3e4d879 100644 --- a/.buildkite/test_areas/engine.yaml +++ b/.buildkite/test_areas/engine.yaml @@ -70,3 +70,15 @@ steps: device: mi325_4 depends_on: - image-build-amd + +- label: V1 e2e (4xH100) + timeout_in_minutes: 60 + device: h100 + num_devices: 4 + optional: true + source_file_dependencies: + - vllm/v1/attention/backends/utils.py + - vllm/v1/worker/gpu_model_runner.py + - tests/v1/e2e/test_hybrid_chunked_prefill.py + commands: + - pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml index ac6be8e141f2..25c22c4ded9d 100644 --- a/.buildkite/test_areas/entrypoints.yaml +++ b/.buildkite/test_areas/entrypoints.yaml @@ -10,7 +10,7 @@ steps: - tests/entrypoints/ commands: - pytest -v -s entrypoints/openai/tool_parsers - - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode 
--ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling - label: Entrypoints Integration (LLM) timeout_in_minutes: 40 @@ -34,7 +34,7 @@ steps: - tests/entrypoints/test_chat_utils commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py - pytest -v -s entrypoints/test_chat_utils.py mirror: amd: @@ -48,11 +48,11 @@ steps: source_file_dependencies: - vllm/ - tests/entrypoints/rpc - - tests/entrypoints/instrumentator + - tests/entrypoints/serve/instrumentator - tests/tool_use commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/instrumentator + - pytest -v -s entrypoints/serve/instrumentator - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc - pytest -v -s tool_use @@ -75,19 +75,6 @@ steps: commands: - pytest -v -s entrypoints/openai/responses -- label: Entrypoints V1 - timeout_in_minutes: 50 - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - - pytest 
-v -s v1/entrypoints - mirror: - amd: - device: mi325_1 - depends_on: - - image-build-amd - - label: OpenAI API Correctness timeout_in_minutes: 30 source_file_dependencies: diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml index 3e2610e70a31..29f8cb3bc6c1 100644 --- a/.buildkite/test_areas/lm_eval.yaml +++ b/.buildkite/test_areas/lm_eval.yaml @@ -45,6 +45,22 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt +- label: LM Eval Qwen3.5 Models (B200) + timeout_in_minutes: 120 + device: b200 + optional: true + num_devices: 2 + source_file_dependencies: + - vllm/model_executor/models/qwen3_5.py + - vllm/model_executor/models/qwen3_5_mtp.py + - vllm/transformers_utils/configs/qwen3_5.py + - vllm/transformers_utils/configs/qwen3_5_moe.py + - vllm/model_executor/models/qwen3_next.py + - vllm/model_executor/models/qwen3_next_mtp.py + - vllm/model_executor/layers/fla/ops/ + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt + - label: LM Eval Large Models (H200) timeout_in_minutes: 60 device: h200 diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml index 85421399d1b8..238d5956a025 100644 --- a/.buildkite/test_areas/model_runner_v2.yaml +++ b/.buildkite/test_areas/model_runner_v2.yaml @@ -11,7 +11,7 @@ steps: - vllm/v1/attention/ - tests/v1/engine/test_llm_engine.py - tests/v1/e2e/ - - tests/v1/entrypoints/llm/test_struct_output_generate.py + - tests/entrypoints/llm/test_struct_output_generate.py commands: - set -x - export VLLM_USE_V2_MODEL_RUNNER=1 @@ -22,7 +22,7 @@ steps: - pytest -v -s v1/e2e/general/test_context_length.py - pytest -v -s v1/e2e/general/test_min_tokens.py # Temporary hack filter to exclude ngram spec decoding based tests. 
- - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0" + - pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0" - label: Model Runner V2 Examples timeout_in_minutes: 45 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index b0e49432775f..c0ceae044d25 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -75,7 +75,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson /tests/multimodal @DarkLight1337 @ywang96 @NickLucche /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety /tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm +/tests/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm /tests/v1/structured_output @mgoin @russellb @aarnphm /tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery /tests/weight_loading @mgoin @youkaichao @yewentao256 diff --git a/.github/mergify.yml b/.github/mergify.yml index 1c6837277831..eace1f479035 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -260,7 +260,7 @@ pull_request_rules: - files=examples/offline_inference/structured_outputs.py - files=examples/online_serving/structured_outputs/structured_outputs.py - files~=^tests/v1/structured_output/ - - files=tests/v1/entrypoints/llm/test_struct_output_generate.py + - files=tests/entrypoints/llm/test_struct_output_generate.py - files~=^vllm/v1/structured_output/ actions: label: diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index 442b20e41de5..60e10e53391a 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -26,6 +26,16 @@ #define __HIP__GFX9__ #endif +#if defined(__HIPCC__) && \ + 
(defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1150__) || \ + defined(__gfx1151__) || defined(__gfx1200__) || defined(__gfx1201__)) + #define __HIP__GFX1X__ +#endif + +#if defined(__HIPCC__) && (defined(__gfx1200__) || defined(__gfx1201__)) + #define __HIP__GFX12__ +#endif + #if defined(__HIPCC__) && (defined(__gfx942__) || defined(__gfx950__)) #define __HIP__MI3XX__ #endif @@ -37,15 +47,31 @@ #endif int get_lds_size() { - static bool is_cached = false; - static int result; - if (is_cached == false) { - auto dprops = at::cuda::getCurrentDeviceProperties(); - std::string device_arch = dprops->gcnArchName; - size_t substring = device_arch.find("gfx95"); - result = (substring == std::string::npos ? 64 * 1024 : 160 * 1024); - is_cached = true; - } + static const int result = [] { + const auto* dprops = at::cuda::getCurrentDeviceProperties(); + const std::string device_arch = dprops->gcnArchName; + return device_arch.find("gfx95") == std::string::npos ? 64 * 1024 + : 160 * 1024; + }(); + return result; +} + +bool on_gfx1x() { + static const bool result = [] { + const auto* dprops = at::cuda::getCurrentDeviceProperties(); + const std::string device_arch = dprops->gcnArchName; + return device_arch.find("gfx11") != std::string::npos || + device_arch.find("gfx12") != std::string::npos; + }(); + return result; +} + +bool on_gfx12() { + static const bool result = [] { + const auto* dprops = at::cuda::getCurrentDeviceProperties(); + const std::string device_arch = dprops->gcnArchName; + return device_arch.find("gfx12") != std::string::npos; + }(); return result; } @@ -286,21 +312,35 @@ torch::Tensor LLMM1(at::Tensor& in_a, at::Tensor& in_b, return out_c; } -#define DOT2C(V0, V2, V3) \ - if constexpr (std::is_same_v) { \ - asm("v_dot2c_f32_f16 %0, %2, %3" : "=v"(V0) : "0"(V0), "v"(V2), "v"(V3)); \ - } else if constexpr (std::is_same_v) { \ - float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \ - __bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \ - V0 
+= (s.x + s.y); \ - } +#if defined(__HIP__GFX9__) && !defined(__HIP__GFX1X__) + #define DOT2C(V0, V2, V3) \ + if constexpr (std::is_same_v) { \ + asm("v_dot2c_f32_f16 %0, %2, %3" \ + : "=v"(V0) \ + : "0"(V0), "v"(V2), "v"(V3)); \ + } else if constexpr (std::is_same_v) { \ + float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \ + __bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \ + V0 += (s.x + s.y); \ + } +#elif defined(__HIP__GFX1X__) + // gfx1x: v_dot2_f32_f16 (VOP3-P, dot10-insts, available on gfx11+gfx12) + #define DOT2C(V0, V2, V3) \ + if constexpr (std::is_same_v) { \ + asm("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(V0) : "v"(V2), "v"(V3)); \ + } else if constexpr (std::is_same_v) { \ + float2 s = __bfloat1622float2(*((__hip_bfloat162*)(&(V2)))) * \ + __bfloat1622float2(*((__hip_bfloat162*)(&(V3)))); \ + V0 += (s.x + s.y); \ + } +#endif // To avoid LLVM silently upcasting to double __device__ inline unsigned int min__(uint32_t a, uint32_t b) { return min(a, b); } -#if defined(__HIP__GFX9__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__) // This version targets cases where A[] fits LDS capacity template @@ -442,14 +482,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) 1); // row_shr2 sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf, 1); // row_shr1 + #if defined(__HIP__GFX9__) sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf, 1); // ROW_BCAST15 sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf, 1); // ROW_BCAST31 + #else + sum[n][y] += __shfl_xor(sum[n][y], 16); + #endif } } - if (threadIdx.x == 63) { + if (threadIdx.x == (THRDS - 1)) { scalar_t biases[N][YTILE] = {}; if (BIAS) for (int n = 0; n < N; n++) { @@ -469,9 +513,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } } else { - #pragma unroll + #ifdef __HIP__GFX9__ + #pragma unroll for (int n = 0; n < N; n++) { - #pragma unroll + #pragma unroll for (int y = 0; y < YTILE; y++) { /*float accm1 = 0; for 
(int i=0; i<64; i++) @@ -498,7 +543,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) sum4[n][y][0] = accm; } } - if (threadIdx.x == 63) { + if (threadIdx.x == (THRDS - 1)) { scalar_t biases[N][YTILE] = {}; if (BIAS) for (int n = 0; n < N; n++) { @@ -513,11 +558,12 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } } + #endif // __HIP__GFX9__ (MFMA path) } m += CuCount * _WvPrGrp * YTILE; } } -#else // !defined(__HIP__GFX9__) TODO: Add NAVI support +#else template __global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap, @@ -528,9 +574,9 @@ __global__ void wvSplitK_hf_sml_(const int K, const int Kbp, const int Kap, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__GFX9__) TODO: Add NAVI support +#endif -#if defined(__HIP__GFX9__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__) // This version targets cases where A[] marginally exceeds LDS capacity template @@ -657,14 +703,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) 1); // row_shr2 sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf, 1); // row_shr1 + #if defined(__HIP__GFX9__) sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf, 1); // ROW_BCAST15 sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf, 1); // ROW_BCAST31 + #else + sum[n][y] += __shfl_xor(sum[n][y], 16); + #endif } } - if (threadIdx.x == 63) { + if (threadIdx.x == (THRDS - 1)) { scalar_t biases[N][YTILE] = {}; if (BIAS) for (int n = 0; n < N; n++) { @@ -686,9 +736,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } } else { - #pragma unroll + #ifdef __HIP__GFX9__ + #pragma unroll for (int n = 0; n < N; n++) { - #pragma unroll + #pragma unroll for (int y = 0; y < YTILE; y++) { // float accm1 = 0; // for (int i=0; i<64; i++) @@ -713,7 +764,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) sum4[n][y][0] = accm; } } - if (threadIdx.x == 63) { + if (threadIdx.x == (THRDS - 1)) { scalar_t 
biases[N][YTILE] = {}; if (BIAS) for (int n = 0; n < N; n++) { @@ -730,6 +781,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } } + #endif // __HIP__GFX9__ (MFMA path) } m += CuCount * _WvPrGrp * YTILE; @@ -746,7 +798,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } -#else // !defined(__HIP__GFX9__) TODO: Add NAVI support +#else template __global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap, @@ -756,9 +808,9 @@ __global__ void wvSplitK_hf_(const int K, const int Kbp, const int Kap, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__GFX9__) TODO: Add NAVI support +#endif -#if defined(__HIP__GFX9__) // TODO: Add NAVI support +#if defined(__HIP__GFX9__) || defined(__HIP__GFX1X__) // This version targets big A[] cases, where it is much larger than LDS capacity template @@ -1004,14 +1056,18 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) 1); // row_shr2 sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x111, 0xf, 0xf, 1); // row_shr1 + #if defined(__HIP__GFX9__) sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x142, 0xf, 0xf, 1); // ROW_BCAST15 sum[n][y] += __builtin_amdgcn_mov_dpp(sum[n][y], 0x143, 0xf, 0xf, 1); // ROW_BCAST31 + #else + sum[n][y] += __shfl_xor(sum[n][y], 16); + #endif } } - if (threadIdx.x == 63) { + if (threadIdx.x == (THRDS - 1)) { scalar_t biases[N][YTILE] = {}; if (BIAS) for (int n = 0; n < N; n++) { @@ -1033,9 +1089,10 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } } else { - #pragma unroll + #ifdef __HIP__GFX9__ + #pragma unroll for (int n = 0; n < N; n++) { - #pragma unroll + #pragma unroll for (int y = 0; y < YTILE; y++) { float accm = sum4[n][y][0]; accm += __builtin_amdgcn_mov_dpp(sum4[n][y][1], 0x101, 0xf, 0xf, @@ -1057,7 +1114,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) sum4[n][y][0] = accm; } } - if (threadIdx.x == 63) { + if (threadIdx.x == (THRDS - 1)) { scalar_t biases[N][YTILE] = {}; if (BIAS) for (int n = 0; n < N; n++) { @@ -1074,6 
+1131,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } } + #endif // __HIP__GFX9__ (MFMA path) } m += CuCount * _WvPrGrp * YTILE; @@ -1090,7 +1148,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) } } } -#else // !defined(__HIP__GFX9__) TODO: Add NAVI support +#else template __global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap, @@ -1101,7 +1159,7 @@ __global__ void wvSplitK_hf_big_(const int K, const int Kbp, const int Kap, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__GFX9__) TODO: Add NAVI support +#endif // Find the min val of div2 that doesn't increase N/(div1*div2) int mindiv(int N, int div1, int div2) { @@ -1148,40 +1206,40 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b, const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const int max_lds_len = get_lds_size() / 2; -#define WVSPLITK(_YTILE, _UNRL, _N) \ +#define WVSPLITK_CFG(_THRDS, _WVPRGRP, _YTILE, _UNRL, _N) \ { \ - dim3 block(64, 16); \ - int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \ + dim3 block(_THRDS, _WVPRGRP); \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, _WVPRGRP); \ if ((Kbp_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \ - wvSplitK_hf_sml_ \ + wvSplitK_hf_sml_ \ <<>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \ By_in, af4, bf4, biasf4, c, __wvPrGrp, \ CuCount); \ else if (Kbp_in * N_in <= max_lds_len * 1.2) \ - wvSplitK_hf_ \ + wvSplitK_hf_ \ <<>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \ By_in, af4, bf4, biasf4, c, __wvPrGrp, \ CuCount); \ else \ - wvSplitK_hf_big_ \ + wvSplitK_hf_big_ \ <<>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \ By_in, af4, bf4, biasf4, c, __wvPrGrp, \ CuCount); \ } -#define WVSPLIT_TILE(_sYT, __N) \ +#define WVSPLIT_TILE_CFG(_THRDS, _WVPRGRP, _sYT, __N) \ { \ bool fit_lds = (Kbp_in * N_in <= max_lds_len); \ if (_sYT <= 1) \ - WVSPLITK(1, 4, __N) \ + WVSPLITK_CFG(_THRDS, _WVPRGRP, 1, 4, __N) \ else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \ - WVSPLITK(2, 
2, __N) \ + WVSPLITK_CFG(_THRDS, _WVPRGRP, 2, 2, __N) \ else if (_sYT <= 4 * 3) \ - WVSPLITK(3, 2, __N) \ + WVSPLITK_CFG(_THRDS, _WVPRGRP, 3, 2, __N) \ else if (__N == 4) \ - WVSPLITK(4, 1, __N) \ + WVSPLITK_CFG(_THRDS, _WVPRGRP, 4, 1, __N) \ else \ - WVSPLITK(4, 2, __N) \ + WVSPLITK_CFG(_THRDS, _WVPRGRP, 4, 2, __N) \ } AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] { @@ -1198,18 +1256,31 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b, // then cut the active waves to balance their distribution... int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4); + const bool use_wave32 = on_gfx1x(); switch (N_in) { case 1: - WVSPLIT_TILE(sYT, 1) + if (use_wave32) + WVSPLIT_TILE_CFG(32, 16, sYT, 1) + else + WVSPLIT_TILE_CFG(64, 16, sYT, 1) break; case 2: - WVSPLIT_TILE(sYT, 2) + if (use_wave32) + WVSPLIT_TILE_CFG(32, 16, sYT, 2) + else + WVSPLIT_TILE_CFG(64, 16, sYT, 2) break; case 3: - WVSPLIT_TILE(sYT, 3) + if (use_wave32) + WVSPLIT_TILE_CFG(32, 16, sYT, 3) + else + WVSPLIT_TILE_CFG(64, 16, sYT, 3) break; case 4: - WVSPLIT_TILE(sYT, 4) + if (use_wave32) + WVSPLIT_TILE_CFG(32, 16, sYT, 4) + else + WVSPLIT_TILE_CFG(64, 16, sYT, 4) break; default: throw std::runtime_error( @@ -1653,7 +1724,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) #endif } } -#else // !defined(__HIP__GFX9__) TODO: Add NAVI support +#else template __global__ void wvSplitKrc_(const int actlN, const int K, const int Kap, @@ -1688,6 +1759,8 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b, TORCH_CHECK(in_a.dtype() == torch::kFloat16 || in_a.dtype() == torch::kBFloat16); + const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); + auto out_c = torch::empty( {N_in, M_in}, torch::TensorOptions().dtype(in_a.dtype()).device(in_a.device())); @@ -1696,7 +1769,6 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b, dim3 grid(CuCount); - const at::cuda::OptionalCUDAGuard device_guard(device_of(in_a)); const 
cudaStream_t stream = at::cuda::getCurrentCUDAStream(); // const int max_lds_len = get_lds_size() / 2; @@ -1773,7 +1845,7 @@ torch::Tensor wvSplitKrc(const at::Tensor& in_a, const at::Tensor& in_b, return out_c; } -#if defined(__HIP__MI3XX__) // TODO: Add NAVI support +#if defined(__HIP__MI3XX__) || defined(__HIP__GFX12__) template __global__ void __launch_bounds__(WvPrGrp* THRDS) @@ -1817,12 +1889,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; - using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float; float sA = *s_A; float sB = *s_B; while (m < M) { + #ifdef __HIP__GFX12__ + // gfx12: per-lane scalar accumulation via v_dot4_f32_fp8_fp8 + float sum[N][YTILE] = {}; + #else + // gfx9: MFMA accumulation scalar8 sum[N][YTILE] = {}; + #endif for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { bigType bigA[N][UNRL] = {}; bigType bigB[YTILE][UNRL]; @@ -1854,6 +1931,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) #pragma unroll for (uint32_t k2 = 0; k2 < UNRL; k2++) { for (uint32_t n = 0; n < N; n++) { + #ifdef __HIP__GFX12__ + // gfx12: 4 x dot4 per A_CHUNK=16 bytes (4 FP8 per dot4) + for (int y = 0; y < YTILE; ++y) { + #pragma unroll + for (int i = 0; i < A_CHUNK / 4; i++) { + sum[n][y] = __builtin_amdgcn_dot4_f32_fp8_fp8( + bigA[n][k2].i[i], bigB[y][k2].i[i], sum[n][y]); + } + } + #else + // gfx9: MFMA path for (int i = 0; i < A_CHUNK; i += 8) { for (int y = 0; y < YTILE; ++y) { sum[n][y] = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8( @@ -1861,11 +1949,33 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) 0); } } + #endif } } } // Final reduction + #ifdef __HIP__GFX12__ + // gfx12 wave32: DPP row_shr within 16-lane rows + cross-row shuffle + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + 
asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:1 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + sum[n][y] += __shfl_xor(sum[n][y], 16); + } + } + #else + // gfx9 MFMA reduction for (int n = 0; n < N; n++) { for (int y = 0; y < YTILE; y++) { float accm0 = sum[n][y][0]; @@ -1880,8 +1990,15 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) sum[n][y][0] = accm0; } } + #endif - if (threadIdx.x == 0) { + const bool writeback_lane = + #ifdef __HIP__GFX12__ + threadIdx.x == (THRDS - 1); + #else + threadIdx.x == 0; + #endif + if (writeback_lane) { scalar_t biases[N][YTILE] = {}; if (BIAS) for (int n = 0; n < N; n++) { @@ -1892,13 +2009,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) for (int n = 0; n < N; n++) { for (int y = 0; y < YTILE; y++) { if (y + m >= M) break; // To avoid mem access fault. 
- sum[n][y][0] *= sA * sB; + #ifdef __HIP__GFX12__ + float result = sum[n][y] * sA * sB; + #else + float result = sum[n][y][0] * sA * sB; + #endif if constexpr (std::is_same_v) { - sum[n][y][0] += __half2float(biases[n][y]); + result += __half2float(biases[n][y]); } else if constexpr (std::is_same_v) { - sum[n][y][0] += __bfloat162float(biases[n][y]); + result += __bfloat162float(biases[n][y]); } - C[m + y + n * M] = __float2s(sum[n][y][0]); + C[m + y + n * M] = __float2s(result); } } } @@ -1906,7 +2027,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; } } -#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support +#else // !defined(__HIP__MI3XX__) && !defined(__HIP__GFX12__) template __global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp, @@ -1918,9 +2039,9 @@ __global__ void wvSplitKQ_hf_sml_(const int K, const int Kap, const int Kbp, const int _WvPrGrp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support +#endif // defined(__HIP__MI3XX__) || defined(__HIP__GFX12__) -#if defined(__HIP__MI3XX__) // TODO: Add NAVI support +#if defined(__HIP__MI3XX__) || defined(__HIP__GFX12__) template __global__ void __launch_bounds__(WvPrGrp* THRDS) @@ -1963,12 +2084,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) uint32_t m = (blockIdx.x * _WvPrGrp + (threadIdx.y % _WvPrGrp)) * YTILE; - using floatx16 = __attribute__((__vector_size__(16 * sizeof(float)))) float; float sA = *s_A; float sB = *s_B; while (m < M) { + #ifdef __HIP__GFX12__ + // gfx12: per-lane scalar accumulation via v_dot4_f32_fp8_fp8 + float sum[N][YTILE] = {}; + #else + // gfx9: MFMA accumulation scalar8 sum[N][YTILE] = {}; + #endif for (uint32_t k1 = 0; k1 < K; k1 += THRDS * A_CHUNK * UNRL) { bigType bigA[N][UNRL] = {}; bigType bigB[YTILE][UNRL]; @@ -2002,6 +2128,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) #pragma unroll for (uint32_t k2 = 0; k2 < UNRL; k2++) { for (uint32_t n = 0; n < N; 
n++) { + #ifdef __HIP__GFX12__ + // gfx12: 4 x dot4 per A_CHUNK=16 bytes (4 FP8 per dot4) + for (int y = 0; y < YTILE; ++y) { + #pragma unroll + for (int i = 0; i < A_CHUNK / 4; i++) { + sum[n][y] = __builtin_amdgcn_dot4_f32_fp8_fp8( + bigA[n][k2].i[i], bigB[y][k2].i[i], sum[n][y]); + } + } + #else + // gfx9: MFMA path for (int i = 0; i < A_CHUNK; i += 8) { for (int y = 0; y < YTILE; ++y) { sum[n][y] = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8( @@ -2009,11 +2146,33 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) 0); } } + #endif } } } // Final reduction + #ifdef __HIP__GFX12__ + // gfx12 wave32: DPP row_shr within 16-lane rows + cross-row shuffle + for (int n = 0; n < N; n++) { + for (int y = 0; y < YTILE; y++) { + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:8 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:4 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:2 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + asm("s_nop 0\n\tv_add_f32 %0, %2, %3 row_shr:1 bound_ctrl:0 " + : "=v"(sum[n][y]) + : "0"(sum[n][y]), "v"(sum[n][y]), "v"(sum[n][y])); + sum[n][y] += __shfl_xor(sum[n][y], 16); + } + } + #else + // gfx9 MFMA reduction for (int n = 0; n < N; n++) { for (int y = 0; y < YTILE; y++) { float accm0 = sum[n][y][0]; @@ -2028,8 +2187,15 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) sum[n][y][0] = accm0; } } + #endif - if (threadIdx.x == 0) { + const bool writeback_lane = + #ifdef __HIP__GFX12__ + threadIdx.x == (THRDS - 1); + #else + threadIdx.x == 0; + #endif + if (writeback_lane) { scalar_t biases[N][YTILE] = {}; if (BIAS) for (int n = 0; n < N; n++) { @@ -2040,13 +2206,17 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) for (int n = 0; n < N; n++) { for (int y = 0; y < YTILE; y++) { if (y + m >= M) break; // To avoid mem access fault. 
- sum[n][y][0] *= sA * sB; + #ifdef __HIP__GFX12__ + float result = sum[n][y] * sA * sB; + #else + float result = sum[n][y][0] * sA * sB; + #endif if constexpr (std::is_same_v) { - sum[n][y][0] += __half2float(biases[n][y]); + result += __half2float(biases[n][y]); } else if constexpr (std::is_same_v) { - sum[n][y][0] += __bfloat162float(biases[n][y]); + result += __bfloat162float(biases[n][y]); } - C[m + y + n * M] = __float2s(sum[n][y][0]); + C[m + y + n * M] = __float2s(result); } } } @@ -2054,7 +2224,7 @@ __global__ void __launch_bounds__(WvPrGrp* THRDS) m += CuCount * _WvPrGrp * YTILE; } } -#else // !defined(__HIP__MI3XX__) TODO: Add NAVI support +#else // !defined(__HIP__MI3XX__) && !defined(__HIP__GFX12__) template __global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp, @@ -2066,7 +2236,7 @@ __global__ void wvSplitKQ_hf_(const int K, const int Kap, const int Kbp, const int CuCount) { UNREACHABLE_CODE } -#endif // defined(__HIP__MI3XX__) TODO: Add NAVI support +#endif // defined(__HIP__MI3XX__) || defined(__HIP__GFX12__) void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a, const std::optional& in_bias, at::Tensor& out_c, @@ -2099,24 +2269,30 @@ void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a, const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const int max_lds_len = get_lds_size(); -#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \ - { \ - dim3 block(64, _WvPrGrp); \ - if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ - int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16)); \ - wvSplitKQ_hf_sml_ \ - <<>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \ - By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \ - s_a, s_b, __wvPrGrp, CuCount); \ - } else { \ - int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16)); \ - wvSplitKQ_hf_ \ - <<>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \ - By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \ - s_a, s_b, __wvPrGrp, CuCount); \ - } \ +#define 
WVSPLITKQ_IMPL(_THRDS, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \ + { \ + dim3 block(_THRDS, _WvPrGrp); \ + if ((Kap_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ + int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEs, 16)); \ + wvSplitKQ_hf_sml_<<>>( \ + K_in, Kap_in, Kbp_in, M_in, Bx_in, By_in, b_ptr, a_ptr, bias_ptr, \ + c_ptr, s_a, s_b, __wvPrGrp, CuCount); \ + } else { \ + int __wvPrGrp = min(_WvPrGrp, mindiv(M_in, CuCount * _YTILEm, 16)); \ + wvSplitKQ_hf_ \ + <<>>(K_in, Kap_in, Kbp_in, M_in, Bx_in, \ + By_in, b_ptr, a_ptr, bias_ptr, c_ptr, \ + s_a, s_b, __wvPrGrp, CuCount); \ + } \ } +#define WVSPLITKQ(_WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \ + if (on_gfx12()) \ + WVSPLITKQ_IMPL(32, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) \ + else \ + WVSPLITKQ_IMPL(64, _WvPrGrp, _YTILEs, _YTILEm, _UNRLs, _UNRLm, _N) + AT_DISPATCH_REDUCED_FLOATING_TYPES(out_c.scalar_type(), "wvSplitKQ", [&] { using fptype = typename scalar::type; auto c_ptr = reinterpret_cast(out_c.data_ptr()); @@ -2136,10 +2312,10 @@ void wvSplitKQ(const at::Tensor& in_b, const at::Tensor& in_a, WVSPLITKQ(16, 2, 2, 2, 2, 2) break; case 3: - WVSPLITKQ(16, 2, 2, 2, 2, 3) + WVSPLITKQ(16, 2, 2, 1, 1, 3) break; case 4: - WVSPLITKQ(16, 2, 2, 2, 2, 4) + WVSPLITKQ(16, 2, 2, 1, 1, 4) break; default: throw std::runtime_error( diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index c6e972e89d00..e5a216c77ba6 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -44,7 +44,7 @@ ENV DEBIAN_FRONTEND=noninteractive # Install Python and other dependencies RUN apt-get update -y \ - && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev \ + && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 libopenmpi-dev libpci-dev liblzma-dev pkg-config \ && for i in 1 2 3; do \ add-apt-repository -y ppa:deadsnakes/ppa && break || \ { echo "Attempt $i failed, 
retrying in 5s..."; sleep 5; }; \ diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index e4bb0b696727..1d12d63549a0 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -3,6 +3,10 @@ !!! warning Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference. +!!! tip "Choosing a profiler" + - Use **Nsight Systems** for low-overhead, performance-critical profiling. + - Use **PyTorch Profiler** for medium-overhead profiling with richer debugging information (e.g., stack traces, memory, shapes). Note that enabling these features adds overhead and is not recommended for benchmarking. + ## Profile with PyTorch Profiler We support tracing vLLM workers using different profilers. You can enable profiling by setting the `--profiler-config` flag when launching the server. diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 3d2e02e9d165..6045a4014209 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -88,8 +88,8 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k | flashinfer | standard | nvfp4,
fp8 | T | 5 | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | | gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],
[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] | | marlin | standard,
batched | 3 / N/A | 3 / N/A | silu,
swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],
[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | -| trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] | -| rocm aiter moe | standard | fp8 | G(128),A,T | silu, gelu | Y | N | [`rocm_aiter_fused_experts`][vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe.rocm_aiter_fused_experts] | +| trtllm | standard | mxfp4,
nvfp4 | G(16),G(32) | 5 | N | Y | [`TrtLlmMxfp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsMonolithic],
[`TrtLlmMxfp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe.TrtLlmMxfp4ExpertsModular],
[`TrtLlmNvFp4ExpertsMonolithic`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsMonolithic],
[`TrtLlmNvFp4ExpertsModular`][vllm.model_executor.layers.fused_moe.experts.trtllm_nvfp4_moe.TrtLlmNvFp4ExpertsModular] | +| rocm aiter moe | standard | mxfp4,
fp8 | G(32),G(128),A,T | silu, gelu,
swigluoai | Y | N | `rocm_aiter_fused_experts`,
`AiterExperts` | | cpu_fused_moe | standard | N/A | N/A | silu | N | N | [`CPUFusedMOE`][vllm.model_executor.layers.fused_moe.cpu_fused_moe.CPUFusedMOE] | | naive batched4 | batched | int8,
fp8 | G,A,T | silu, gelu | 6 | Y | [`NaiveBatchedExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.NaiveBatchedExperts] | diff --git a/docs/models/pooling_models/README.md b/docs/models/pooling_models/README.md index b34cc1efe6ae..02e2c82cf009 100644 --- a/docs/models/pooling_models/README.md +++ b/docs/models/pooling_models/README.md @@ -31,28 +31,29 @@ Of course, we also have "plugin" tasks that allow users to customize input and o ### Pooling Tasks -| Pooling Tasks | Granularity | Outputs | -|--------------------|---------------|-------------------------------------------------| -| `classify` | Sequence-wise | probability vector of classes for each sequence | -| `score` (see note) | Sequence-wise | reranker score for each sequence | -| `embed` | Sequence-wise | vector representations for each sequence | -| `token_classify` | Token-wise | probability vector of classes for each token | -| `token_embed` | Token-wise | vector representations for each token | +| Pooling Tasks | Granularity | Outputs | +|-----------------------|---------------|-------------------------------------------------| +| `classify` (see note) | Sequence-wise | probability vector of classes for each sequence | +| `embed` | Sequence-wise | vector representations for each sequence | +| `token_classify` | Token-wise | probability vector of classes for each token | +| `token_embed` | Token-wise | vector representations for each token | !!! note Within classification tasks, there is a specialized subcategory: Cross-encoder (aka reranker) models. These models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. 
### Score Types -| Pooling Tasks | Granularity | Outputs | Score Types | scoring function | -|--------------------|---------------|-------------------------------------------------|--------------------|--------------------------| -| `classify` | Sequence-wise | probability vector of classes for each sequence | nan | nan | -| `score` (see note) | Sequence-wise | reranker score for each sequence | `cross-encoder` | linear classifier | -| `embed` | Sequence-wise | vector representations for each sequence | `bi-encoder` | cosine similarity | -| `token_classify` | Token-wise | probability vector of classes for each token | nan | nan | -| `token_embed` | Token-wise | vector representations for each token | `late-interaction` | late interaction(MaxSim) | +The scoring models are designed to compute similarity scores between two input prompts. They support three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`. -The score models is designed to compute similarity scores between two input prompts. It supports three model types (aka `score_type`): `cross-encoder`, `late-interaction`, and `bi-encoder`. +| Pooling Tasks | Granularity | Outputs | Score Types | scoring function | +|-----------------------|---------------|----------------------------------------------|--------------------|--------------------------| +| `classify` (see note) | Sequence-wise | reranker score for each sequence | `cross-encoder` | linear classifier | +| `embed` | Sequence-wise | vector representations for each sequence | `bi-encoder` | cosine similarity | +| `token_classify` | Token-wise | probability vector of classes for each token | nan | nan | +| `token_embed` | Token-wise | vector representations for each token | `late-interaction` | late interaction(MaxSim) | + +!!! note + Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled. 
### Pooling Usages @@ -85,14 +86,16 @@ enabling the corresponding APIs. ### Offline APIs corresponding to pooling tasks -| Task | APIs | -|------------------|----------------------------------------------------------------------------| -| `embed` | `LLM.embed(...)`,`LLM.encode(..., pooling_task="embed")`, `LLM.score(...)` | -| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")` | -| `score` | `LLM.score(...)` | -| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` | -| `token_embed` | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)` | -| `plugin` | `LLM.encode(..., pooling_task="plugin")` | +| Task | APIs | +|------------------|---------------------------------------------------------------------------------------| +| `embed` | `LLM.embed(...)`, `LLM.encode(..., pooling_task="embed")`, `LLM.score(...)`(see note) | +| `classify` | `LLM.classify(...)`, `LLM.encode(..., pooling_task="classify")`, `LLM.score(...)` | +| `token_classify` | `LLM.reward(...)`, `LLM.encode(..., pooling_task="token_classify")` | +| `token_embed` | `LLM.encode(..., pooling_task="token_embed")`, `LLM.score(...)` | +| `plugin` | `LLM.encode(..., pooling_task="plugin")` | + +!!! note + Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled. ### `LLM.classify` @@ -206,11 +209,11 @@ If `--runner pooling` has been set (manually or automatically) but the model doe vLLM will attempt to automatically convert the model according to the architecture names shown in the table below. 
-| Architecture | `--convert` | Supported pooling tasks | -| ----------------------------------------------- | ----------- | ------------------------------------- | -| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | -| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` | -| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify`, `score` | +| Architecture | `--convert` | Supported pooling tasks | +|-------------------------------------------------|-------------|------------------------------| +| `*ForTextEncoding`, `*EmbeddingModel`, `*Model` | `embed` | `token_embed`, `embed` | +| `*ForRewardModeling`, `*RewardModel` | `embed` | `token_embed`, `embed` | +| `*For*Classification`, `*ClassificationModel` | `classify` | `token_classify`, `classify` | !!! tip You can explicitly set `--convert ` to specify how to convert the model. @@ -251,3 +254,7 @@ Pooling models now default support all pooling, you can use it without any setti - Extracting hidden states prefers using `token_embed` task. - Named Entity Recognition (NER) and reward models prefers using `token_classify` task. + +### Score task + +`score` task is deprecated and will be removed in v0.20. Please use `classify` instead. Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled. diff --git a/docs/models/pooling_models/classify.md b/docs/models/pooling_models/classify.md index 10d7892b5361..1247bb4a0bbc 100644 --- a/docs/models/pooling_models/classify.md +++ b/docs/models/pooling_models/classify.md @@ -17,6 +17,8 @@ The key distinction between (sequence) classification and token classification l Many classification models support both (sequence) classification and token classification. For further details on token classification, please refer to [this page](token_classify.md). 
+Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled, please refer to [this page](scoring.md). + ## Typical Use Cases ### Classification @@ -54,7 +56,7 @@ If your model is not in the above list, we will try to automatically convert the Cross-encoder (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. Most classification models can also be used as [cross-encoder models](scoring.md#cross-encoder-models). For more information on cross-encoder models, please refer to [this page](scoring.md). ---8<-- "docs/models/pooling_models/scoring.md:supported-score-models" +--8<-- "docs/models/pooling_models/scoring.md:supported-cross-encoder-models" ### Reward Models diff --git a/docs/models/pooling_models/scoring.md b/docs/models/pooling_models/scoring.md index 6227b689acb0..ac94a0cd76bc 100644 --- a/docs/models/pooling_models/scoring.md +++ b/docs/models/pooling_models/scoring.md @@ -10,11 +10,11 @@ The score models is designed to compute similarity scores between two input prom - Model Usage: Scoring - Pooling Task: -| Score Types | Pooling Tasks | scoring function | -|--------------------|---------------|--------------------------| -| `cross-encoder` | `score` | linear classifier | -| `late-interaction` | `token_embed` | late interaction(MaxSim) | -| `bi-encoder` | `embed` | cosine similarity | +| Score Types | Pooling Tasks | scoring function | +|--------------------|-----------------------|--------------------------| +| `cross-encoder` | `classify` (see note) | linear classifier | +| `late-interaction` | `token_embed` | late interaction(MaxSim) | +| `bi-encoder` | `embed` | cosine similarity | - Offline APIs: - `LLM.score` @@ -22,13 +22,16 @@ The score models is designed to compute similarity scores between two input prom - [Score API](scoring.md#score-api) (`/score`) - [Rerank API](scoring.md#rerank-api) (`/rerank`, 
`/v1/rerank`, `/v2/rerank`) +!!! note + Only when a classification model outputs num_labels equal to 1 can it be used as a scoring model and have its scoring API enabled. + ## Supported Models ### Cross-encoder models [Cross-encoder](https://www.sbert.net/examples/applications/cross-encoder/README.html) (aka reranker) models are a subset of classification models that accept two prompts as input and output num_labels equal to 1. ---8<-- [start:supported-score-models] +--8<-- [start:supported-cross-encoder-models] #### Text-only Models @@ -99,7 +102,7 @@ The score models is designed to compute similarity scores between two input prom vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' ``` ---8<-- [end:supported-score-models] +--8<-- [end:supported-cross-encoder-models] ### Late-interaction models diff --git a/docs/models/pooling_models/specific_models.md b/docs/models/pooling_models/specific_models.md index 4b0027a3dd4b..0d908c1aa1a3 100644 --- a/docs/models/pooling_models/specific_models.md +++ b/docs/models/pooling_models/specific_models.md @@ -11,6 +11,7 @@ vLLM supports ColBERT models with multiple encoder backbones: | `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | +| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | **BERT-based ColBERT** models work out of the box: @@ -29,6 +30,10 @@ vllm serve lightonai/GTE-ModernColBERT-v1 \ vllm serve jinaai/jina-colbert-v2 \ --hf-overrides '{"architectures": ["ColBERTJinaRobertaModel"]}' \ --trust-remote-code + +# LFM2 backbone +vllm serve LiquidAI/LFM2-ColBERT-350M \ + --hf-overrides '{"architectures": ["ColBERTLfm2Model"]}' ``` Then you can use the rerank API: diff --git 
a/docs/models/pooling_models/token_embed.md b/docs/models/pooling_models/token_embed.md index c950d2e99376..e847fb09bcbb 100644 --- a/docs/models/pooling_models/token_embed.md +++ b/docs/models/pooling_models/token_embed.md @@ -39,6 +39,7 @@ Models of any architecture can be converted into embedding models using `--conve | Architecture | Models | Example HF Models | [LoRA](../../features/lora.md) | [PP](../../serving/parallelism_scaling.md) | | ------------ | ------ | ----------------- | -------------------- | ------------------------- | +| `ColBERTLfm2Model` | LFM2 | `LiquidAI/LFM2-ColBERT-350M` | | | | `ColBERTModernBertModel` | ModernBERT | `lightonai/GTE-ModernColBERT-v1` | | | | `ColBERTJinaRobertaModel` | Jina XLM-RoBERTa | `jinaai/jina-colbert-v2` | | | | `HF_ColBERT` | BERT | `answerdotai/answerai-colbert-small-v1`, `colbert-ir/colbertv2.0` | | | diff --git a/requirements/test.in b/requirements/test.in index 8bd00514435b..be4c2e5795f4 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -21,6 +21,7 @@ vocos # required for minicpmo_26 test peft>=0.15.0 # required for phi-4-mm test pqdm ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests +resampy # required for audio tests sentence-transformers>=5.2.0 # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests diff --git a/requirements/test.txt b/requirements/test.txt index e2f9040beecc..7d3a988a729d 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -544,6 +544,7 @@ numba==0.61.2 # via # -r requirements/test.in # librosa + # resampy numpy==2.2.6 # via # -r requirements/test.in @@ -584,6 +585,7 @@ numpy==2.2.6 # pyogrio # pywavelets # rasterio + # resampy # rioxarray # rouge-score # runai-model-streamer @@ -995,6 +997,8 @@ requests==2.32.3 # tiktoken # transformers # wandb +resampy==0.4.3 + # via -r requirements/test.in responses==0.25.3 # via genai-perf rfc3339-validator==0.1.4 diff --git a/setup.py 
b/setup.py index 7b5c49e98b6b..2f251a6a296d 100644 --- a/setup.py +++ b/setup.py @@ -987,11 +987,11 @@ def _read_requirements(filename: str) -> list[str]: "instanttensor": ["instanttensor >= 0.1.5"], "runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"], "audio": [ - "librosa", + "av", + "resampy", "scipy", "soundfile", "mistral_common[audio]", - "av", ], # Required for audio processing "video": [], # Kept for backwards compatibility "flashinfer": [], # Kept for backwards compatibility diff --git a/tests/compile/fusions_e2e/conftest.py b/tests/compile/fusions_e2e/conftest.py index 5716c95bb241..7cd2acdf56c2 100644 --- a/tests/compile/fusions_e2e/conftest.py +++ b/tests/compile/fusions_e2e/conftest.py @@ -84,7 +84,10 @@ def run( # TODO: remove this after finishing migration from envs to model kwargs if model_name == "openai/gpt-oss-20b": - monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1") + from .common import is_blackwell + + if is_blackwell(): + monkeypatch.setenv("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8", "1") # Disable, compile cache to make sure custom passes run. # Otherwise, we can't verify fusion happened through the logs. 
diff --git a/tests/compile/test_aot_compile.py b/tests/compile/test_aot_compile.py index 9f6a1a13e8ea..8a5191ed226c 100644 --- a/tests/compile/test_aot_compile.py +++ b/tests/compile/test_aot_compile.py @@ -14,6 +14,7 @@ import pytest import torch +import vllm.envs as envs import vllm.model_executor.layers.activation from vllm.compilation.backends import VllmBackend from vllm.compilation.caching import ( @@ -162,6 +163,9 @@ def test_save_and_load(monkeypatch: pytest.MonkeyPatch): @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") def test_save_and_load_slice(monkeypatch: pytest.MonkeyPatch): + from torch._subclasses import FakeTensorMode + from torch.fx.experimental.symbolic_shapes import ShapeEnv + def foo(x: torch.Tensor): return x[slice(0, x.shape[0])] @@ -172,12 +176,13 @@ def foo(x: torch.Tensor): gm = torch.fx.symbolic_trace(foo) assert "getitem_1 = x[slice(0, getitem, None)]" in gm.code with use_vllm_config(vllm_config): - payload = VllmSerializableFunction.serialize_compile_artifacts( - VllmSerializableFunction(gm, (example_input,), "", foo) + payload = VllmSerializableFunction.serialize_graph_module(gm) + fake_mode = FakeTensorMode(shape_env=ShapeEnv()) + loaded_gm = VllmSerializableFunction.deserialize_graph_module( + payload, fake_mode ) - fn = VllmSerializableFunction.deserialize_compile_artifacts(payload) - assert gm.code == fn.graph_module.code + assert gm.code == loaded_gm.code @pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") @@ -725,6 +730,10 @@ def test_deduplication(self): ]: assert cache.get(submod, shape) == shared_data + @pytest.mark.skipif( + envs.VLLM_USE_MEGA_AOT_ARTIFACT, + reason="There's no AOT Autograd run with mega artifact", + ) def test_functorch_config(self): vllm_config = make_vllm_config() example_inputs = (torch.randn(10, 10),) diff --git a/tests/compile/test_startup.py b/tests/compile/test_startup.py index 545299565c16..32a586011590 100644 --- 
a/tests/compile/test_startup.py +++ b/tests/compile/test_startup.py @@ -9,11 +9,15 @@ import multiprocessing as mp +import pytest from torch._dynamo.utils import counters +import vllm.envs as envs from vllm.compilation.counter import compilation_counter from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode +from ..utils import fork_new_process_for_each_test + MODEL = "microsoft/Phi-tiny-MoE-instruct" @@ -45,8 +49,11 @@ def _cold_start(vllm_runner): assert counters["aot_autograd"]["autograd_cache_hit"] == 0 -def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache): +@fork_new_process_for_each_test +@pytest.mark.parametrize("mega_aot_artifact", ["0", "1"]) +def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache, mega_aot_artifact): monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0") + monkeypatch.setenv("VLLM_USE_MEGA_AOT_ARTIFACT", mega_aot_artifact) # Cold start in a forked child (must fork before CUDA init). # This model has 32 identical transformer layers which produce @@ -64,7 +71,12 @@ def test_moe_startup(monkeypatch, vllm_runner, fresh_vllm_cache): num_compiled_artifacts_saved=0, ): _run_vllm(vllm_runner) - assert counters["aot_autograd"]["total"] == 30 + if envs.VLLM_USE_MEGA_AOT_ARTIFACT: + # MEGA_AOT_ARTIFACT is enabled, so we expect no aot_autograd running on + # subgraphs. 
+ assert counters["aot_autograd"]["total"] == 0 + else: + assert counters["aot_autograd"]["total"] == 30 assert counters["aot_autograd"]["autograd_cache_miss"] == 0 assert ( counters["aot_autograd"]["autograd_cache_hit"] == 0 diff --git a/tests/conftest.py b/tests/conftest.py index 719bfa5ed1f0..f3b22d898903 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,9 +6,6 @@ from tblib import pickling_support -# Import fixture -from tests.v1.entrypoints.conftest import sample_json_schema # noqa - # ruff: noqa # Install support for pickling exceptions so that we can nicely propagate @@ -81,6 +78,55 @@ logger = init_logger(__name__) + +@pytest.fixture +def sample_json_schema(): + return { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + "skills": { + "type": "array", + "items": { + "type": "string", + }, + }, + "grade": { + "type": "string", + "pattern": "^[A-D]$", + }, + "email": { + "type": "string", + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", + }, + "work_history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": {"type": "string"}, + "duration": { + "type": "number", + "minimum": 0.0, + "maximum": 100.0, + }, + "position": {"type": "string"}, + }, + "required": ["company", "duration", "position"], + "additionalProperties": False, + }, + "minItems": 0, + "maxItems": 3, + }, + }, + "required": ["name", "age", "skills", "grade", "email", "work_history"], + "additionalProperties": False, + "minProperties": 1, + "maxProperties": 10, + } + + _TEST_DIR = os.path.dirname(__file__) _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/entrypoints/llm/test_struct_output_generate.py similarity index 91% rename from tests/v1/entrypoints/llm/test_struct_output_generate.py rename to 
tests/entrypoints/llm/test_struct_output_generate.py index 70c6d250bc1b..3ece27234368 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/entrypoints/llm/test_struct_output_generate.py @@ -24,6 +24,108 @@ StructuredOutputsParams, ) +SAMPLE_REGEX = ( + r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" + r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" +) + +# Note: Ensure this only uses attributes compatible with xgrammar +SAMPLE_JSON_SCHEMA = { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"}, + "skills": { + "type": "array", + "items": { + "type": "string", + }, + }, + "grade": { + "type": "string", + "pattern": "^[A-D]$", # Regex pattern + }, + "email": { + "type": "string", + "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", + }, + "work_history": { + "type": "array", + "items": { + "type": "object", + "properties": { + "company": {"type": "string"}, + "duration": { + "type": "number", + "minimum": 0.0, + "maximum": 100.0, # Numeric range + }, + "position": {"type": "string"}, + }, + "required": ["company", "duration", "position"], + "additionalProperties": False, + }, + "minItems": 0, + "maxItems": 3, + }, + }, + "required": ["name", "age", "skills", "grade", "email", "work_history"], + "additionalProperties": False, + "minProperties": 1, + "maxProperties": 10, +} + +# A schema unsupported by xgrammar +UNSUPPORTED_JSON_SCHEMA = { + "type": "object", + "properties": { + "score": { + "type": "integer", + "multipleOf": 5, # Numeric multiple + }, + "tags": { + "type": "array", + "items": {"type": "string", "minLength": 10, "maxLength": 20}, + }, + }, + "required": ["score", "tags"], + "additionalProperties": False, + "patternProperties": { + "^score$": {"type": "integer"}, + }, +} + +SAMPLE_STRUCTURED_OUTPUTS_CHOICES = [ + "Python", + "Java", + "JavaScript", + "C++", + "C#", + "PHP", + "TypeScript", + "Ruby", + "Swift", + "Kotlin", +] + +SAMPLE_SQL_EBNF = """ +root ::= select_statement 
+select_statement ::= "SELECT" column "from" table "where" condition +column ::= "col_1" | "col_2" +table ::= "table_1" | "table_2" +condition ::= column "=" number +number ::= "1" | "2" +""" + +SAMPLE_SQL_LARK = """ +start: select_statement +select_statement: "SELECT" column "from" table "where" condition +column: "col_1" | "col_2" +table: "table_1" | "table_2" +condition: column "=" number +number: "1" | "2" +""" + NGRAM_SPEC_CONFIG = { "model": "[ngram]", "num_speculative_tokens": 5, @@ -110,17 +212,17 @@ class CarDescription(BaseModel): PARAMS_MODELS_BACKENDS_TOKENIZER_MODE, ) def test_structured_output( - sample_json_schema: dict[str, Any], - unsupported_json_schema: dict[str, Any], - sample_sql_ebnf: str, - sample_sql_lark: str, - sample_regex: str, - sample_structured_outputs_choices: str, backend: str, tokenizer_mode: str, model_name: str, speculative_config: dict[str, Any], ): + sample_json_schema = SAMPLE_JSON_SCHEMA + unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA + sample_sql_ebnf = SAMPLE_SQL_EBNF + sample_sql_lark = SAMPLE_SQL_LARK + sample_regex = SAMPLE_REGEX + sample_structured_outputs_choices = SAMPLE_STRUCTURED_OUTPUTS_CHOICES if current_platform.is_tpu() and speculative_config: pytest.skip("TPU does not support speculative decoding") @@ -702,10 +804,10 @@ def test_structured_output_with_reasoning_matrices( @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) def test_structured_output_auto_mode( - unsupported_json_schema: dict[str, Any], model_name: str, tokenizer_mode: str, ): + unsupported_json_schema = UNSUPPORTED_JSON_SCHEMA llm = LLM( model=model_name, max_model_len=1024, @@ -808,9 +910,9 @@ def generate_with_backend(backend): @pytest.mark.parametrize("backend", ["guidance", "xgrammar", "outlines"]) def test_structured_output_batched_with_non_structured_outputs_requests( - sample_json_schema: dict[str, Any], backend: str, ): + sample_json_schema = SAMPLE_JSON_SCHEMA # Don't use eager execution on TPUs 
because we want to test for no # recompilation at runtime enforce_eager = bool(not current_platform.is_tpu()) diff --git a/tests/v1/entrypoints/openai/test_chat_completion.py b/tests/entrypoints/openai/chat_completion/test_chat_completion.py similarity index 100% rename from tests/v1/entrypoints/openai/test_chat_completion.py rename to tests/entrypoints/openai/chat_completion/test_chat_completion.py diff --git a/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py b/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py index 704598a5708b..965b21351302 100644 --- a/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py +++ b/tests/entrypoints/openai/chat_completion/test_completion_with_function_calling.py @@ -231,13 +231,14 @@ def k2_server(): "--gpu-memory-utilization", "0.4", ] + ROCM_EXTRA_ARGS - # hack to test kimi_k2 tool use tool_id format. - # avoid error in is_deepseek_mla check by setting kv_lora_rank=null + # Test kimi_k2 tool use tool_id format by overriding model_type. + # is_deepseek_mla safely returns False via getattr when kv_lora_rank + # is absent from the underlying config. 
with RemoteOpenAIServer( MODEL_NAME, args, env_dict=ROCM_ENV_OVERRIDES, - override_hf_configs={"model_type": "kimi_k2", "kv_lora_rank": None}, + override_hf_configs={"model_type": "kimi_k2"}, ) as remote_server: yield remote_server diff --git a/tests/v1/entrypoints/openai/test_completion_with_image_embeds.py b/tests/entrypoints/openai/chat_completion/test_completion_with_image_embeds.py similarity index 100% rename from tests/v1/entrypoints/openai/test_completion_with_image_embeds.py rename to tests/entrypoints/openai/chat_completion/test_completion_with_image_embeds.py diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/completion/test_completion.py similarity index 100% rename from tests/v1/entrypoints/openai/test_completion.py rename to tests/entrypoints/openai/completion/test_completion.py diff --git a/tests/entrypoints/openai/cpu/__init__.py b/tests/entrypoints/openai/cpu/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py b/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py index e9bde638d4a3..4ac48699a022 100644 --- a/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py +++ b/tests/entrypoints/openai/speech_to_text/test_transcription_validation.py @@ -152,5 +152,5 @@ async def test_basic_audio_foscolo(foscolo, rocm_aiter_fa_attention, model_name) model_name, foscolo, language="it", - expected_text="ove il mio corpo fanciulletto giacque", + expected_text="ove il mio corpo fanciulletto", ) diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/entrypoints/openai/test_multi_api_servers.py similarity index 100% rename from tests/v1/entrypoints/openai/test_multi_api_servers.py rename to tests/entrypoints/openai/test_multi_api_servers.py diff --git a/tests/entrypoints/openai/test_run_batch.py b/tests/entrypoints/openai/test_run_batch.py index 
cf7e2a7b0c07..bf670105bbc4 100644 --- a/tests/entrypoints/openai/test_run_batch.py +++ b/tests/entrypoints/openai/test_run_batch.py @@ -275,7 +275,7 @@ ] ) -MINIMAL_WAV_BASE64 = "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=" +MINIMAL_WAV_BASE64 = "UklGRigAAABXQVZFZm10IBAAAAABAAEAgD4AAAB9AAACABAAZGF0YQQAAAAAAP9/" INPUT_TRANSCRIPTION_BATCH = ( json.dumps( { diff --git a/tests/entrypoints/instrumentator/__init__.py b/tests/entrypoints/serve/instrumentator/__init__.py similarity index 100% rename from tests/entrypoints/instrumentator/__init__.py rename to tests/entrypoints/serve/instrumentator/__init__.py diff --git a/tests/entrypoints/instrumentator/test_basic.py b/tests/entrypoints/serve/instrumentator/test_basic.py similarity index 99% rename from tests/entrypoints/instrumentator/test_basic.py rename to tests/entrypoints/serve/instrumentator/test_basic.py index 5f48fb266efb..1ab963dc1801 100644 --- a/tests/entrypoints/instrumentator/test_basic.py +++ b/tests/entrypoints/serve/instrumentator/test_basic.py @@ -11,11 +11,10 @@ import requests from fastapi import Request +from tests.utils import RemoteOpenAIServer from vllm.v1.engine.exceptions import EngineDeadError from vllm.version import __version__ as VLLM_VERSION -from ...utils import RemoteOpenAIServer - MODEL_NAME = "Qwen/Qwen3-0.6B" diff --git a/tests/entrypoints/instrumentator/test_metrics.py b/tests/entrypoints/serve/instrumentator/test_metrics.py similarity index 100% rename from tests/entrypoints/instrumentator/test_metrics.py rename to tests/entrypoints/serve/instrumentator/test_metrics.py diff --git a/tests/entrypoints/instrumentator/test_optional_middleware.py b/tests/entrypoints/serve/instrumentator/test_optional_middleware.py similarity index 98% rename from tests/entrypoints/instrumentator/test_optional_middleware.py rename to tests/entrypoints/serve/instrumentator/test_optional_middleware.py index c2c7fbdb0114..fef10cdc0cdf 100644 --- 
a/tests/entrypoints/instrumentator/test_optional_middleware.py +++ b/tests/entrypoints/serve/instrumentator/test_optional_middleware.py @@ -10,7 +10,7 @@ import pytest import requests -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer # Use a small embeddings model for faster startup and smaller memory footprint. # Since we are not testing any chat functionality, diff --git a/tests/entrypoints/instrumentator/test_orca_metrics.py b/tests/entrypoints/serve/instrumentator/test_orca_metrics.py similarity index 98% rename from tests/entrypoints/instrumentator/test_orca_metrics.py rename to tests/entrypoints/serve/instrumentator/test_orca_metrics.py index 1ce043df0cd8..923951367767 100644 --- a/tests/entrypoints/instrumentator/test_orca_metrics.py +++ b/tests/entrypoints/serve/instrumentator/test_orca_metrics.py @@ -5,7 +5,7 @@ import pytest import pytest_asyncio -from ...utils import RemoteOpenAIServer +from tests.utils import RemoteOpenAIServer # any model with a chat template should work here MODEL_NAME = "Qwen/Qwen3-0.6B" diff --git a/tests/entrypoints/instrumentator/test_sleep.py b/tests/entrypoints/serve/instrumentator/test_sleep.py similarity index 100% rename from tests/entrypoints/instrumentator/test_sleep.py rename to tests/entrypoints/serve/instrumentator/test_sleep.py diff --git a/tests/entrypoints/openai/cpu/test_render.py b/tests/entrypoints/serve/render/test_render.py similarity index 100% rename from tests/entrypoints/openai/cpu/test_render.py rename to tests/entrypoints/serve/render/test_render.py diff --git a/tests/entrypoints/openai/cpu/test_render_multimodal.py b/tests/entrypoints/serve/render/test_render_multimodal.py similarity index 100% rename from tests/entrypoints/openai/cpu/test_render_multimodal.py rename to tests/entrypoints/serve/render/test_render_multimodal.py diff --git a/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml new file mode 100644 index 
000000000000..62be504e2c52 --- /dev/null +++ b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml @@ -0,0 +1,8 @@ +model_name: "Qwen/Qwen3.5-35B-A3B" +accuracy_threshold: 0.86 +num_questions: 1319 +num_fewshot: 5 +server_args: >- + --max-model-len 4096 + --data-parallel-size 2 + --enable-expert-parallel diff --git a/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml new file mode 100644 index 000000000000..9380e0b25803 --- /dev/null +++ b/tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml @@ -0,0 +1,9 @@ +model_name: "Qwen/Qwen3.5-35B-A3B-FP8" +accuracy_threshold: 0.86 +num_questions: 1319 +num_fewshot: 5 +server_args: >- + --max-model-len 4096 + --data-parallel-size 2 + --enable-expert-parallel + --kv-cache-dtype fp8 diff --git a/tests/evals/gsm8k/configs/models-mi3xx-quantized.txt b/tests/evals/gsm8k/configs/models-mi3xx-fp8-and-mixed.txt similarity index 100% rename from tests/evals/gsm8k/configs/models-mi3xx-quantized.txt rename to tests/evals/gsm8k/configs/models-mi3xx-fp8-and-mixed.txt diff --git a/tests/evals/gsm8k/configs/models-qwen35-blackwell.txt b/tests/evals/gsm8k/configs/models-qwen35-blackwell.txt new file mode 100644 index 000000000000..4e7af71c7f4a --- /dev/null +++ b/tests/evals/gsm8k/configs/models-qwen35-blackwell.txt @@ -0,0 +1 @@ +Qwen3.5-35B-A3B-DEP2.yaml diff --git a/tests/kernels/attention/test_attention_selector.py b/tests/kernels/attention/test_attention_selector.py index 347205755c68..3ebf9cc3713a 100644 --- a/tests/kernels/attention/test_attention_selector.py +++ b/tests/kernels/attention/test_attention_selector.py @@ -14,8 +14,19 @@ ) from vllm.platforms import current_platform from vllm.platforms.cpu import CpuPlatform -from vllm.platforms.cuda import CudaPlatform -from vllm.platforms.rocm import RocmPlatform + +# CudaPlatform and RocmPlatform import their respective compiled C extensions +# at module level, raising ModuleNotFoundError on incompatible builds. 
+try: + from vllm.platforms.cuda import CudaPlatform +except (ImportError, ModuleNotFoundError): + CudaPlatform = None + +try: + from vllm.platforms.rocm import RocmPlatform +except (ImportError, ModuleNotFoundError): + RocmPlatform = None + from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.selector import _cached_get_attn_backend, get_attn_backend @@ -101,6 +112,8 @@ def test_backend_selection( assert backend.get_name() == "CPU_ATTN" elif device == "hip": + if RocmPlatform is None: + pytest.skip("RocmPlatform not available") with patch("vllm.platforms.current_platform", RocmPlatform()): if use_mla: # ROCm MLA backend logic: @@ -126,6 +139,8 @@ def test_backend_selection( assert backend.get_name() == expected elif device == "cuda": + if CudaPlatform is None: + pytest.skip("CudaPlatform not available") with patch("vllm.platforms.current_platform", CudaPlatform()): capability = torch.cuda.get_device_capability() if use_mla: @@ -214,7 +229,7 @@ def test_backend_selection( assert backend.get_name() == expected -@pytest.mark.parametrize("device", ["cpu", "cuda"]) +@pytest.mark.parametrize("device", ["cpu", "cuda", "hip"]) def test_fp32_fallback(device: str): """Test attention backend selection with fp32.""" # Use default config (no backend specified) @@ -227,10 +242,25 @@ def test_fp32_fallback(device: str): assert backend.get_name() == "CPU_ATTN" elif device == "cuda": + if CudaPlatform is None: + pytest.skip("CudaPlatform not available") with patch("vllm.platforms.current_platform", CudaPlatform()): backend = get_attn_backend(16, torch.float32, None) assert backend.get_name() == "FLEX_ATTENTION" + elif device == "hip": + if RocmPlatform is None: + pytest.skip("RocmPlatform not available") + # ROCm backends do not support head_size=16 (minimum is 32). + # No known HuggingFace transformer model uses head_size=16. + # Revisit if a real model with this head size is identified + # and accuracy-tested. 
+ with ( + patch("vllm.platforms.current_platform", RocmPlatform()), + pytest.raises(ValueError, match="No valid attention backend"), + ): + get_attn_backend(16, torch.float32, None) + def test_flash_attn(monkeypatch: pytest.MonkeyPatch): """Test FlashAttn validation.""" @@ -367,6 +397,8 @@ def test_per_head_quant_scales_backend_selection( attention_config=attention_config, cache_config=cache_config ) + if CudaPlatform is None: + pytest.skip("CudaPlatform not available") with ( set_current_vllm_config(vllm_config), patch("vllm.platforms.current_platform", CudaPlatform()), diff --git a/tests/kernels/moe/test_cutedsl_moe.py b/tests/kernels/moe/test_cutedsl_moe.py index 66a97b48bdc3..bca3eba0f91c 100644 --- a/tests/kernels/moe/test_cutedsl_moe.py +++ b/tests/kernels/moe/test_cutedsl_moe.py @@ -17,7 +17,7 @@ from torch.nn import functional as F from vllm.model_executor.layers.activation import SiluAndMul -from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( +from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutedsl_moe import ( flashinfer_cutedsl_moe_masked, ) from vllm.utils.flashinfer import ( diff --git a/tests/kernels/moe/test_gpt_oss_triton_kernels.py b/tests/kernels/moe/test_gpt_oss_triton_kernels.py index 630ea2e3fe9d..1b2067148bd8 100644 --- a/tests/kernels/moe/test_gpt_oss_triton_kernels.py +++ b/tests/kernels/moe/test_gpt_oss_triton_kernels.py @@ -6,6 +6,7 @@ import torch import torch.nn.functional as F +from vllm.platforms import current_platform from vllm.utils.import_utils import has_triton_kernels if not has_triton_kernels(): @@ -14,6 +15,7 @@ allow_module_level=True, ) +import triton_kernels.matmul_ogs_details.opt_flags as opt_flags import triton_kernels.swiglu from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig from triton_kernels.numerics import InFlexData @@ -21,12 +23,16 @@ from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor from triton_kernels.tensor_details import layout from 
triton_kernels.testing import assert_close +from triton_kernels.topk import topk as topk_fn from vllm.model_executor.layers.fused_moe.config import mxfp4_w4a16_moe_quant_config from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + legacy_routing, + make_routing_data, triton_kernel_moe_forward, ) from vllm.utils.math_utils import round_up +from vllm.utils.torch_utils import set_random_seed from .utils import shuffle_weight @@ -299,6 +305,12 @@ def test_equiv(num_token, a_dtype, w_dtype, tp, workspace_init): pc2, ) = init_compute_data(M, K, N, E, a_dtype, w_dtype, num_warps=8) + if current_platform.is_device_capability_family(100): + constraints = { + "is_persistent": True, + } + opt_flags.update_opt_flags_constraints(constraints) + if a_dtype == "bf16" and w_dtype == "mx4": quant_config = mxfp4_w4a16_moe_quant_config( w1_scale=pc1, @@ -355,3 +367,43 @@ def test_unit_shuffle(): ) assert_close(ref=out_ref, tri=out) + + +@pytest.mark.parametrize("num_tokens", [2, 8, 64]) +@pytest.mark.parametrize("num_experts", [32, 128]) +@pytest.mark.parametrize("topk", [1, 4]) +@pytest.mark.parametrize("renormalize", [True, False]) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +def test_legacy_routing( + num_tokens: int, num_experts: int, topk: int, renormalize: bool, dtype: torch.dtype +): + set_random_seed(0) + gating_output = torch.randn(num_tokens, num_experts, device="cuda", dtype=dtype) + + sm_first = not renormalize + logits = gating_output + if sm_first: + logits = torch.softmax(logits, dim=-1) + sparse_logits = topk_fn(logits, topk, apply_softmax=not sm_first) + topk_ids = sparse_logits.indx.to(torch.long) + topk_weights = sparse_logits.vals + routing_data_ref, gather_indx_ref, scatter_indx_ref = make_routing_data( + topk_ids, topk_weights, num_experts + ) + + routing_data, gather_indx, scatter_indx = legacy_routing( + gating_output, topk, sm_first=sm_first + ) + + assert_close( + ref=gather_indx_ref.src_indx, tri=gather_indx.src_indx, 
maxtol=0, rmstol=0 + ) + assert_close( + ref=gather_indx_ref.dst_indx, tri=gather_indx.dst_indx, maxtol=0, rmstol=0 + ) + assert_close( + ref=scatter_indx_ref.src_indx, tri=scatter_indx.src_indx, maxtol=0, rmstol=0 + ) + assert_close( + ref=scatter_indx_ref.dst_indx, tri=scatter_indx.dst_indx, maxtol=0, rmstol=0 + ) diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index cf9021663809..e54e7a9cd18e 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -82,7 +82,7 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): model_case.model_id, tensor_parallel_size=model_case.tp, load_format="dummy", - cudagraph_capture_sizes=[16], + compilation_config={"cudagraph_capture_sizes": [16]}, ) as llm: # Disabled as check_model is broken: https://github.com/vllm-project/vllm/pull/18465#issuecomment-3329880562 # def check_model(model): diff --git a/tests/kernels/quantization/test_mxfp4_triton_ep.py b/tests/kernels/quantization/test_mxfp4_triton_ep.py index d4eb91058906..6c8aebe42c07 100644 --- a/tests/kernels/quantization/test_mxfp4_triton_ep.py +++ b/tests/kernels/quantization/test_mxfp4_triton_ep.py @@ -17,89 +17,6 @@ import pytest import torch -from vllm.model_executor.layers.quantization.mxfp4 import ( - Mxfp4Backend, - Mxfp4MoEMethod, -) - - -def _make_mock_moe_config(ep_size: int = 1) -> MagicMock: - """Create a mock FusedMoEConfig with the given EP size.""" - parallel_config = MagicMock() - parallel_config.ep_size = ep_size - - moe_config = MagicMock() - moe_config.ep_size = ep_size - moe_config.is_lora_enabled = False - moe_config.moe_parallel_config = parallel_config - return moe_config - - -class TestMxfp4TritonIsMonolithic: - """Verify that is_monolithic is always True for the TRITON backend, - regardless of EP size, since triton_kernel_moe_forward now handles - expert_map remapping internally.""" - - @pytest.mark.parametrize( - 
"backend,ep_size,expected_monolithic", - [ - # TRITON is always monolithic (handles EP via expert_map remapping) - (Mxfp4Backend.TRITON, 1, True), - (Mxfp4Backend.TRITON, 2, True), - (Mxfp4Backend.TRITON, 4, True), - # SM100 backends are always monolithic - (Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM, 1, True), - (Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM, 2, True), - (Mxfp4Backend.SM100_FI_MXFP4_BF16, 1, True), - (Mxfp4Backend.SM100_FI_MXFP4_BF16, 2, True), - # MARLIN is never monolithic - (Mxfp4Backend.MARLIN, 1, False), - (Mxfp4Backend.MARLIN, 2, False), - ], - ids=[ - "triton-no-ep", - "triton-ep2", - "triton-ep4", - "sm100-trtllm-no-ep", - "sm100-trtllm-ep2", - "sm100-bf16-no-ep", - "sm100-bf16-ep2", - "marlin-no-ep", - "marlin-ep2", - ], - ) - @patch( - "vllm.model_executor.layers.quantization.mxfp4.get_mxfp4_backend", - ) - @patch( - "vllm.model_executor.layers.quantization.mxfp4.get_current_vllm_config", - ) - def test_is_monolithic( - self, - mock_get_config, - mock_get_backend, - backend, - ep_size, - expected_monolithic, - ): - """is_monolithic should be True for TRITON regardless of EP size.""" - mock_get_backend.return_value = backend - - mock_compilation_config = MagicMock() - mock_compilation_config.max_cudagraph_capture_size = 1024 - mock_vllm_config = MagicMock() - mock_vllm_config.compilation_config = mock_compilation_config - mock_get_config.return_value = mock_vllm_config - - moe_config = _make_mock_moe_config(ep_size=ep_size) - method = Mxfp4MoEMethod(moe_config) - - assert method.is_monolithic == expected_monolithic, ( - f"Expected is_monolithic={expected_monolithic} for " - f"backend={backend.name}, ep_size={ep_size}, " - f"but got {method.is_monolithic}." 
- ) - class TestTritonMoeForwardExpertMap: """Test that triton_kernel_moe_forward applies expert_map remapping diff --git a/tests/kernels/quantization/test_rocm_skinny_gemms.py b/tests/kernels/quantization/test_rocm_skinny_gemms.py index 91b774c47464..d2123db2e8da 100644 --- a/tests/kernels/quantization/test_rocm_skinny_gemms.py +++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py @@ -160,6 +160,8 @@ def test_rocm_wvsplitkrc_kernel(xnorm, n, k, m, dtype, seed, padded_a, bias_mode BIAS = torch.rand(m, dtype=dtype, device="cuda") * 2 - 1 elif bias_mode == 2: BIAS = torch.rand(n, m, dtype=dtype, device="cuda") * 2 - 1 + elif bias_mode == 3: + BIAS = torch.rand(1, m, dtype=dtype, device="cuda") * 2 - 1 ref_out = torch.nn.functional.linear(A, B, BIAS) out = ops.wvSplitKrc(A, B, cu_count, BIAS) @@ -224,10 +226,9 @@ def test_rocm_wvsplitk_kernel( ref_out = torch.nn.functional.linear(A, B, BIAS) out = ops.wvSplitK(B, A.view(-1, A.size(-1)), cu_count, BIAS) - if xnorm: - assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-8) - else: - assert torch.allclose(out, ref_out, atol=1e-3, rtol=1e-2) + # Accumulation error in fp16 GEMM scales with sqrt(K) + atol = torch.finfo(dtype).eps * math.sqrt(k) + torch.testing.assert_close(out, ref_out, atol=atol, rtol=1e-2) @pytest.mark.parametrize("xnorm", [False, True]) diff --git a/tests/model_executor/layers/test_rocm_unquantized_gemm.py b/tests/model_executor/layers/test_rocm_unquantized_gemm.py new file mode 100644 index 000000000000..c435a6e72422 --- /dev/null +++ b/tests/model_executor/layers/test_rocm_unquantized_gemm.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from unittest.mock import MagicMock + +import pytest +import torch + +from vllm.platforms import current_platform + +if current_platform.is_cuda(): + pytest.skip( + "ROCm skinny GEMM tests are not supported on CUDA.", + allow_module_level=True, + ) + +from 
vllm.model_executor.layers import utils + + +def test_rocm_unquantized_gemm_gfx1x_wvsplitk_path(monkeypatch): + x = torch.randn(1, 64, dtype=torch.float16) + weight = torch.randn(128, 64, dtype=torch.float16) + + monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False) + monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True) + monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: True) + monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False) + monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: False) + monkeypatch.setattr(utils, "get_cu_count", lambda: 120) + + wvsplitk_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t()) + monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock) + llmm1_mock = MagicMock(side_effect=lambda w, x_view, _: x_view @ w.t()) + monkeypatch.setattr(utils.ops, "LLMM1", llmm1_mock) + + out = utils.rocm_unquantized_gemm_impl(x, weight, None) + ref = torch.nn.functional.linear(x, weight, None) + + wvsplitk_mock.assert_called_once() + llmm1_mock.assert_not_called() + assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3) + + +def test_rocm_unquantized_gemm_gfx1x_n_gt_4_falls_back(monkeypatch): + x = torch.randn(5, 64, dtype=torch.float16) + weight = torch.randn(128, 64, dtype=torch.float16) + + monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False) + monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True) + monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: True) + monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False) + monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: False) + monkeypatch.setattr(utils, "get_cu_count", lambda: 120) + + wvsplitk_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t()) + monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock) + llmm1_mock = MagicMock(side_effect=lambda w, x_view, _: x_view @ w.t()) + monkeypatch.setattr(utils.ops, "LLMM1", llmm1_mock) + + out = 
utils.rocm_unquantized_gemm_impl(x, weight, None) + ref = torch.nn.functional.linear(x, weight, None) + + wvsplitk_mock.assert_not_called() + llmm1_mock.assert_not_called() + assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3) + + +def test_rocm_unquantized_gemm_gfx950_wvsplitkrc_path(monkeypatch): + x = torch.randn(16, 1024, dtype=torch.float16) + weight = torch.randn(256, 1024, dtype=torch.float16) + + monkeypatch.setattr(utils, "use_aiter_triton_gemm", lambda *args: False) + monkeypatch.setattr(utils.envs, "VLLM_ROCM_USE_SKINNY_GEMM", True) + monkeypatch.setattr("vllm.platforms.rocm.on_gfx1x", lambda: False) + monkeypatch.setattr("vllm.platforms.rocm.on_gfx9", lambda: False) + monkeypatch.setattr("vllm.platforms.rocm.on_gfx950", lambda: True) + monkeypatch.setattr(utils, "get_cu_count", lambda: 120) + + wvsplitkrc_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t()) + monkeypatch.setattr(utils.ops, "wvSplitKrc", wvsplitkrc_mock) + wvsplitk_mock = MagicMock(side_effect=lambda w, x_view, _, __: x_view @ w.t()) + monkeypatch.setattr(utils.ops, "wvSplitK", wvsplitk_mock) + + out = utils.rocm_unquantized_gemm_impl(x, weight, None) + ref = torch.nn.functional.linear(x, weight, None) + + wvsplitkrc_mock.assert_called_once() + wvsplitk_mock.assert_not_called() + assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3) diff --git a/tests/models/language/pooling/test_colbert.py b/tests/models/language/pooling/test_colbert.py index 6edd9c28c519..a245f879ba2b 100644 --- a/tests/models/language/pooling/test_colbert.py +++ b/tests/models/language/pooling/test_colbert.py @@ -59,6 +59,22 @@ "model_cls": "AutoModel", }, }, + "lfm2": { + "model": "LiquidAI/LFM2-ColBERT-350M", + "colbert_dim": 128, + "max_model_len": 511, + "extra_kwargs": { + "hf_overrides": { + "architectures": ["ColBERTLfm2Model"], + }, + }, + "hf_comparison": { + "weights_file": "1_Dense/model.safetensors", + "weights_key": "linear.weight", + "trust_remote_code": False, + "model_cls": 
"AutoModel", + }, + }, } diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index c16efd065e1b..1404d9628faa 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -220,7 +220,10 @@ vllm_runner_kwargs={ "model_impl": "transformers", }, - marks=[pytest.mark.core_model], + marks=[ + pytest.mark.core_model, + *([large_gpu_mark(min_gb=80)] if current_platform.is_rocm() else []), + ], ), "idefics3-transformers": VLMTestInfo( models=["HuggingFaceTB/SmolVLM-256M-Instruct"], @@ -542,8 +545,12 @@ auto_cls=AutoModelForImageTextToText, ), "isaac": VLMTestInfo( + # NOTE: PerceptronAI/Isaac-0.1 removed because the upstream HF + # repo has a stale model.safetensors.index.json that references + # shard files which no longer exist (consolidated into a single + # model.safetensors on 2026-03-20). Re-add once upstream fixes + # the index file. models=[ - "PerceptronAI/Isaac-0.1", "PerceptronAI/Isaac-0.2-2B-Preview", ], test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), diff --git a/tests/models/multimodal/generation/test_granite_speech.py b/tests/models/multimodal/generation/test_granite_speech.py index 1519a50c1a0c..f0650d4c234d 100644 --- a/tests/models/multimodal/generation/test_granite_speech.py +++ b/tests/models/multimodal/generation/test_granite_speech.py @@ -39,7 +39,11 @@ def vllm_to_hf_output( def granite_speech_attention_config(): """Return attention config for Granite Speech tests on ROCm.""" if current_platform.is_rocm(): - return {"backend": "ROCM_AITER_FA"} + from vllm.platforms.rocm import on_mi3xx + + if on_mi3xx(): + return {"backend": "ROCM_AITER_FA"} + return {"backend": "TRITON_ATTN"} return None diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index 47852453c058..1b7e2347be2f 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ 
b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -323,10 +323,7 @@ def build_audio_inputs_from_test_info( test_info.audio_idx_to_prompt, test_info.prompt_formatter, ) - resampler = AudioResampler( - target_sr=16000, - method="librosa", - ) + resampler = AudioResampler(target_sr=16000) audios = [asset.audio_and_sample_rate for asset in audio_assets] resampled_audios = [ ( diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py index 9bdedb3c5c25..0a692387cffc 100644 --- a/tests/models/multimodal/generation/vlm_utils/model_utils.py +++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py @@ -24,6 +24,7 @@ GenerationConfig, GenerationMixin, ) +from transformers.masking_utils import create_causal_mask from transformers.video_utils import VideoMetadata from vllm.logprobs import SampleLogprobs @@ -680,10 +681,14 @@ def patched_forward( sin = sin.to(inputs_embeds.dtype) # Prepare attention mask - if attention_mask is not None: - attention_mask = self._update_causal_mask( - attention_mask, inputs_embeds, cache_position, past_key_values, False - ) + attention_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + past_key_values=past_key_values, + position_ids=position_ids, + cache_position=cache_position, + ) # Initialize and collect hidden states hidden_states = inputs_embeds diff --git a/tests/models/multimodal/pooling/test_llama_nemotron_vl.py b/tests/models/multimodal/pooling/test_llama_nemotron_vl.py index 4c92d41c31db..6bea808152f6 100644 --- a/tests/models/multimodal/pooling/test_llama_nemotron_vl.py +++ b/tests/models/multimodal/pooling/test_llama_nemotron_vl.py @@ -22,8 +22,10 @@ ChatCompletionContentPartTextParam, ) from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam +from vllm.platforms import current_platform from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner 
+from ....utils import ROCM_ENGINE_KWARGS from ...utils import check_embeddings_close # Prefixes used by the model API @@ -70,6 +72,7 @@ def _run_test( max_model_len=2048, enforce_eager=True, trust_remote_code=True, + **ROCM_ENGINE_KWARGS, ) as vllm_model: vllm_outputs = vllm_model.embed(input_texts, images=input_images) @@ -250,6 +253,7 @@ def _run_vllm_reranker( max_model_len=2048, enforce_eager=True, trust_remote_code=True, + **ROCM_ENGINE_KWARGS, ) as vllm_model: has_images = any(img is not None for _, img in docs) @@ -322,8 +326,11 @@ def _run_reranker_test( assert len(hf_scores) == len(vllm_scores), ( f"Output length mismatch: HF={len(hf_scores)}, vLLM={len(vllm_scores)}" ) + # NOTE: ROCm shows slightly higher numerical variance due to different attention + # backends between vLLM and HF; use a marginally looser tolerance + rel_tol = 0.022 if current_platform.is_rocm() else 0.02 for i, (hf_score, vllm_score) in enumerate(zip(hf_scores, vllm_scores)): - assert hf_score == pytest.approx(vllm_score, rel=0.02), ( + assert hf_score == pytest.approx(vllm_score, rel=rel_tol), ( f"Score mismatch at index {i}: HF={hf_score:.4f}, vLLM={vllm_score:.4f}" ) diff --git a/tests/models/registry.py b/tests/models/registry.py index aac707a9065b..ff997706ccf2 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -628,6 +628,11 @@ def check_available_online( trust_remote_code=True, hf_overrides={"architectures": ["ColBERTJinaRobertaModel"]}, ), + "ColBERTLfm2Model": _HfExamplesInfo( + "LiquidAI/LFM2-ColBERT-350M", + trust_remote_code=True, + hf_overrides={"architectures": ["ColBERTLfm2Model"]}, + ), # [Multimodal] "ColModernVBertForRetrieval": _HfExamplesInfo( "ModernVBERT/colmodernvbert-merged", diff --git a/tests/multimodal/media/test_audio.py b/tests/multimodal/media/test_audio.py index 18f142008c31..4361066ab885 100644 --- a/tests/multimodal/media/test_audio.py +++ b/tests/multimodal/media/test_audio.py @@ -10,6 +10,8 @@ from vllm.multimodal.media import 
AudioMediaIO +from ...conftest import AudioTestAssets + pytestmark = pytest.mark.cpu_test ASSETS_DIR = Path(__file__).parent.parent / "assets" @@ -22,40 +24,32 @@ def dummy_audio(): @pytest.fixture -def dummy_audio_bytes(): - return b"FAKEAUDIOBYTES" +def dummy_audio_bytes(audio_assets: AudioTestAssets): + with open(audio_assets[0].get_local_path(), "rb") as f: + return f.read() def test_audio_media_io_load_bytes(dummy_audio_bytes): audio_io = AudioMediaIO() - with patch("librosa.load") as mock_load: - mock_load.return_value = (np.array([0.1, 0.2]), 16000) - out = audio_io.load_bytes(dummy_audio_bytes) - mock_load.assert_called_once() - assert isinstance(out[0], np.ndarray) - assert out[1] == 16000 + out = audio_io.load_bytes(dummy_audio_bytes) + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 def test_audio_media_io_load_base64(dummy_audio_bytes): audio_io = AudioMediaIO() encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8") - with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes: - mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000) - out = audio_io.load_base64("audio/wav", encoded) - mock_load_bytes.assert_called_once() - assert isinstance(out[0], np.ndarray) - assert out[1] == 16000 + out = audio_io.load_base64("audio/wav", encoded) + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 -def test_audio_media_io_load_file(): +def test_audio_media_io_load_file(audio_assets: AudioTestAssets): audio_io = AudioMediaIO() - path = Path("/fake/path.wav") - with patch("librosa.load") as mock_load: - mock_load.return_value = (np.array([0.1, 0.2]), 16000) - out = audio_io.load_file(path) - mock_load.assert_called_once_with(path, sr=None) - assert isinstance(out[0], np.ndarray) - assert out[1] == 16000 + path = audio_assets[0].get_local_path() + out = audio_io.load_file(path) + assert isinstance(out[0], np.ndarray) + assert out[1] == 16000 def test_audio_media_io_encode_base64(dummy_audio): diff --git 
a/tests/multimodal/test_audio.py b/tests/multimodal/test_audio.py index 3cc6bcadbec4..0bc8988452f0 100644 --- a/tests/multimodal/test_audio.py +++ b/tests/multimodal/test_audio.py @@ -14,7 +14,7 @@ AudioSpec, ChannelReduction, normalize_audio, - resample_audio_librosa, + resample_audio_pyav, resample_audio_scipy, split_audio, ) @@ -25,14 +25,14 @@ def dummy_audio(): return np.array([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float) -def test_resample_audio_librosa(dummy_audio): - with patch("vllm.multimodal.audio.librosa.resample") as mock_resample: - mock_resample.return_value = dummy_audio * 2 - out = resample_audio_librosa(dummy_audio, orig_sr=44100, target_sr=22050) - mock_resample.assert_called_once_with( - dummy_audio, orig_sr=44100, target_sr=22050 - ) - assert np.all(out == dummy_audio * 2) +def test_resample_audio_pyav(dummy_audio): + out_down = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=2) + out_up = resample_audio_pyav(dummy_audio, orig_sr=2, target_sr=4) + out_same = resample_audio_pyav(dummy_audio, orig_sr=4, target_sr=4) + + assert len(out_down) == 3 + assert len(out_up) == 10 + assert np.all(out_same == dummy_audio) def test_resample_audio_scipy(dummy_audio): @@ -56,9 +56,9 @@ def test_resample_audio_scipy_non_integer_ratio(dummy_audio): assert np.isfinite(out).all() -def test_audio_resampler_librosa_calls_resample(dummy_audio): - resampler = AudioResampler(target_sr=22050, method="librosa") - with patch("vllm.multimodal.audio.resample_audio_librosa") as mock_resample: +def test_audio_resampler_pyav_calls_resample(dummy_audio): + resampler = AudioResampler(target_sr=22050, method="pyav") + with patch("vllm.multimodal.audio.resample_audio_pyav") as mock_resample: mock_resample.return_value = dummy_audio out = resampler.resample(dummy_audio, orig_sr=44100) mock_resample.assert_called_once_with( @@ -423,13 +423,13 @@ def test_soundfile_format_normalized_to_mono_e2e(self): # Verify channel averaging: mean of [0.5, -0.5] = 0.0 
np.testing.assert_array_almost_equal(audio_output, np.zeros(16000), decimal=5) - def test_librosa_mono_passthrough_e2e(self): - """Full pipeline: librosa mono format → preserved as mono.""" + def test_pyav_mono_passthrough_e2e(self): + """Full pipeline: pyav mono format → preserved as mono.""" from vllm.multimodal.parse import MultiModalDataParser - # Simulate librosa output: already mono (time,) format - mono_librosa = np.random.randn(16000).astype(np.float32) - assert mono_librosa.shape == (16000,) + # Simulate pyav output: already mono (time,) format + mono_pyav = np.random.randn(16000).astype(np.float32) + assert mono_pyav.shape == (16000,) # Create parser with mono normalization parser = MultiModalDataParser( @@ -438,7 +438,7 @@ def test_librosa_mono_passthrough_e2e(self): ) # Process audio through the parser - result = parser._parse_audio_data((mono_librosa, 16000)) + result = parser._parse_audio_data((mono_pyav, 16000)) audio_output = result.get(0) # Verify output is still mono 1D @@ -446,7 +446,7 @@ def test_librosa_mono_passthrough_e2e(self): assert audio_output.shape == (16000,) # Verify audio content is preserved - np.testing.assert_array_almost_equal(audio_output, mono_librosa) + np.testing.assert_array_almost_equal(audio_output, mono_pyav) def test_multichannel_5_1_surround_to_mono_e2e(self): """Full pipeline: 5.1 surround (6 channels) → mono output.""" diff --git a/tests/quantization/test_ptpc_fp8.py b/tests/quantization/test_ptpc_fp8.py deleted file mode 100644 index 6858062b9183..000000000000 --- a/tests/quantization/test_ptpc_fp8.py +++ /dev/null @@ -1,57 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -"""Tests whether PTPC w8a8 FP8 computation is enabled correctly. - -Run `pytest tests/quantization/test_ptpc_fp8.py --forked`. 
-""" - -import pytest - -from tests.quantization.utils import is_quant_method_supported -from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod -from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod -from vllm.platforms import current_platform - - -@pytest.fixture(scope="function", autouse=True) -def enable_pickle(monkeypatch): - """`LLM.apply_model` requires pickling a function.""" - monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1") - - -@pytest.mark.skipif( - not is_quant_method_supported("ptpc_fp8"), - reason="PTPC FP8 is not supported on this GPU type.", -) -@pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.") -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None: - llm = vllm_runner( - "facebook/opt-125m", - dtype=dtype, - quantization="ptpc_fp8", - enforce_eager=True, - kv_cache_dtype=kv_cache_dtype, - allow_deprecated_quantization=True, - ) - - with llm: - - def check_model(model): - fc1 = model.model.decoder.layers[0].fc1 - assert isinstance(fc1.quant_method, PTPCFp8LinearMethod) - if kv_cache_dtype == "ptpc_fp8": - attn = model.model.decoder.layers[0].self_attn.attn - assert isinstance(attn.quant_method, Fp8KVCacheMethod) - assert attn._k_scale == 1.0 - assert attn._v_scale == 1.0 - - # For GPUs with hardware support, we keep weights in fp8 - if current_platform.has_device_capability(94): - assert fc1.weight.dtype == current_platform.fp8_dtype() - - llm.apply_model(check_model) - - output = llm.generate_greedy("Hello my name is", max_tokens=4) - assert output diff --git a/tests/test_pooling_params.py b/tests/test_pooling_params.py index 54a577d2bf84..6cf2a82d2ff1 100644 --- a/tests/test_pooling_params.py +++ b/tests/test_pooling_params.py @@ -74,7 +74,7 @@ def test_embed_dimensions(model_info: EmbedModelInfo): 
pooling_params.verify(model_config) -@pytest.mark.parametrize("task", ["score", "classify"]) +@pytest.mark.parametrize("task", ["classify"]) def test_classify(task): model_config = MockModelConfig(pooler_config=PoolerConfig(seq_pooling_type="CLS")) diff --git a/tests/test_regression.py b/tests/test_regression.py index 978e0783919d..a38b4428dea5 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -12,6 +12,7 @@ import pytest import torch +from tests.utils import large_gpu_mark from vllm import LLM, SamplingParams from vllm.platforms import current_platform @@ -32,10 +33,21 @@ def test_duplicated_ignored_sequence_group(): assert len(prompts) == len(outputs) -def test_max_tokens_none(): +@pytest.mark.parametrize( + "model", + [ + pytest.param( + "distilbert/distilgpt2", + marks=[ + *([large_gpu_mark(min_gb=80)] if current_platform.is_rocm() else []), + ], + ), + ], +) +def test_max_tokens_none(model): sampling_params = SamplingParams(temperature=0.01, top_p=0.1, max_tokens=None) llm = LLM( - model="distilbert/distilgpt2", + model=model, max_num_batched_tokens=4096, tensor_parallel_size=1, ) diff --git a/tests/v1/attention/test_batch_reordering.py b/tests/v1/attention/test_batch_reordering.py index 6265e12f9a7d..f59740238da7 100644 --- a/tests/v1/attention/test_batch_reordering.py +++ b/tests/v1/attention/test_batch_reordering.py @@ -10,9 +10,10 @@ class MockInputBatch: - def __init__(self, req_ids, num_computed_tokens_cpu): + def __init__(self, req_ids, num_computed_tokens_cpu, num_prompt_tokens): self.req_ids = req_ids self.num_computed_tokens_cpu = num_computed_tokens_cpu + self.num_prompt_tokens = num_prompt_tokens def swap_states(self, i, j): self.req_ids[i], self.req_ids[j] = self.req_ids[j], self.req_ids[i] @@ -20,6 +21,10 @@ def swap_states(self, i, j): self.num_computed_tokens_cpu[j], self.num_computed_tokens_cpu[i], ) + self.num_prompt_tokens[i], self.num_prompt_tokens[j] = ( + self.num_prompt_tokens[j], + self.num_prompt_tokens[i], + ) 
class MockSchedulerOutput: @@ -29,96 +34,139 @@ def __init__(self, num_scheduled_tokens): @dataclass class ReorderTestCase: - requests: list[tuple[int, int]] # (num_scheduled_tokens, num_computed_tokens) + # (num_scheduled_tokens, num_computed_tokens, num_prompt_tokens) + requests: list[tuple[int, int, int]] expected_order: list[int] expected_modified: bool decode_threshold: int = 1 # Test cases for batch reordering +# Format: (num_scheduled, num_computed, num_prompt) REORDER_TEST_CASES = { "all_decodes": ReorderTestCase( - requests=[(1, 10), (1, 20), (1, 30)], + requests=[(1, 10, 10), (1, 20, 20), (1, 30, 30)], expected_order=[0, 1, 2], expected_modified=False, ), - "all_prefills": ReorderTestCase( - requests=[(100, 100), (200, 200), (300, 300)], + "all_long_extends": ReorderTestCase( + requests=[(100, 100, 100), (200, 200, 200), (300, 300, 300)], expected_order=[0, 1, 2], expected_modified=False, ), - "mixed_interleaved": ReorderTestCase( - requests=[(100, 100), (1, 10), (200, 200), (1, 20)], - expected_order=[3, 1, 2, 0], # Only swap 0↔3, keep 1 and 2 in place + "mixed_decodes_long_extends": ReorderTestCase( + requests=[(100, 100, 100), (1, 10, 10), (200, 200, 200), (1, 20, 20)], + expected_order=[3, 1, 2, 0], expected_modified=True, ), "already_ordered": ReorderTestCase( - requests=[(1, 10), (1, 20), (100, 100), (200, 0)], + requests=[(1, 10, 10), (1, 20, 20), (100, 100, 100), (200, 0, 200)], expected_order=[0, 1, 2, 3], expected_modified=False, ), "single_request": ReorderTestCase( - requests=[(1, 10)], + requests=[(1, 10, 10)], expected_order=[0], expected_modified=False, ), "higher_threshold": ReorderTestCase( - requests=[(2, 10), (3, 20), (5, 30), (6, 40)], + requests=[(2, 10, 10), (3, 20, 20), (5, 30, 30), (6, 40, 40)], expected_order=[0, 1, 2, 3], expected_modified=False, decode_threshold=4, ), "decodes_at_end": ReorderTestCase( - requests=[(100, 100), (200, 200), (1, 10), (1, 20)], + requests=[(100, 100, 100), (200, 200, 200), (1, 10, 10), (1, 20, 20)], 
expected_order=[2, 3, 0, 1], expected_modified=True, ), - "decode_extend_prefill": ReorderTestCase( - requests=[(100, 0), (10, 50), (1, 10)], + "decode_long_extend_prefill": ReorderTestCase( + requests=[(100, 0, 100), (10, 50, 50), (1, 10, 10)], expected_order=[2, 1, 0], expected_modified=True, ), - "extend_prefill_only": ReorderTestCase( - requests=[(100, 0), (10, 50), (200, 0), (20, 75)], - expected_order=[3, 1, 2, 0], # Only swap 0↔3, keep 1 and 2 in place + "long_extend_prefill_only": ReorderTestCase( + requests=[(100, 0, 100), (10, 50, 50), (200, 0, 200), (20, 75, 75)], + expected_order=[3, 1, 2, 0], expected_modified=True, ), - "complicated_mixed_interleaved": ReorderTestCase( + "complicated_mixed": ReorderTestCase( requests=[ - (1, 20), - (1, 50), - (374, 0), - (300, 20), - (1, 20), - (256, 0), - (1, 5), - (27, 0), - (1, 4), + (1, 20, 20), # decode + (1, 50, 50), # decode + (374, 0, 374), # prefill + (300, 20, 20), # long_extend + (1, 20, 20), # decode + (256, 0, 256), # prefill + (1, 5, 5), # decode + (27, 0, 27), # prefill + (1, 4, 4), # decode ], expected_order=[0, 1, 6, 8, 4, 3, 2, 7, 5], expected_modified=True, ), "new_request_single_token_prefill": ReorderTestCase( requests=[ - (100, 0), - (1, 0), # New request with only 1 token (STILL prefill) - (50, 100), - (1, 10), + (100, 0, 100), # prefill + (1, 0, 1), # prefill (single token, still prefill) + (50, 100, 100), # long_extend + (1, 10, 10), # decode ], - # Only index 3 is a true decode (has num_computed_tokens > 0) expected_order=[3, 2, 0, 1], expected_modified=True, ), "multiple_new_requests_single_token_prefill": ReorderTestCase( requests=[ - (1, 0), # New prefill (1 token, no computed) - (1, 0), # New prefill (1 token, no computed) - (1, 50), - (200, 0), + (1, 0, 1), # prefill + (1, 0, 1), # prefill + (1, 50, 50), # decode + (200, 0, 200), # prefill ], expected_order=[2, 1, 0, 3], expected_modified=True, ), + "four_way_already_ordered": ReorderTestCase( + requests=[ + (1, 100, 100), # decode + (1, 
50, 100), # short_extend + (10, 50, 100), # long_extend + (100, 0, 100), # prefill + ], + expected_order=[0, 1, 2, 3], + expected_modified=False, + ), + "four_way_needs_reorder": ReorderTestCase( + requests=[ + (100, 0, 100), # prefill + (1, 50, 100), # short_extend + (1, 100, 100), # decode + (10, 50, 100), # long_extend + ], + expected_order=[2, 1, 3, 0], + expected_modified=True, + ), + "four_way_multiple_short_extends": ReorderTestCase( + requests=[ + (2, 100, 100), # decode + (2, 50, 200), # short_extend + (2, 75, 150), # short_extend + (2, 200, 200), # decode + ], + expected_order=[0, 3, 2, 1], + expected_modified=True, + decode_threshold=2, + ), + "four_way_spec_decode_threshold": ReorderTestCase( + requests=[ + (5, 100, 100), # decode + (5, 50, 100), # short_extend + (5, 0, 100), # prefill + (10, 50, 100), # long_extend + ], + expected_order=[0, 1, 3, 2], + expected_modified=True, + decode_threshold=5, + ), } @@ -129,8 +177,9 @@ def test_reorder_batch_to_split_decodes_and_prefills(test_case: ReorderTestCase) req_ids = [f"r{i}" for i in range(len(test_case.requests))] num_computed_tokens = np.array([r[1] for r in test_case.requests], dtype=np.int32) num_scheduled_tokens = {f"r{i}": r[0] for i, r in enumerate(test_case.requests)} + num_prompt_tokens = np.array([r[2] for r in test_case.requests], dtype=np.int32) - input_batch = MockInputBatch(req_ids, num_computed_tokens) + input_batch = MockInputBatch(req_ids, num_computed_tokens, num_prompt_tokens) scheduler_output = MockSchedulerOutput(num_scheduled_tokens) modified = reorder_batch_to_split_decodes_and_prefills( diff --git a/tests/v1/e2e/test_hybrid_chunked_prefill.py b/tests/v1/e2e/test_hybrid_chunked_prefill.py index 030081a38af3..1790343ca836 100644 --- a/tests/v1/e2e/test_hybrid_chunked_prefill.py +++ b/tests/v1/e2e/test_hybrid_chunked_prefill.py @@ -43,7 +43,7 @@ pytest.param("Qwen/Qwen3.5-4B", marks=[large_gpu_mark(min_gb=40)]), pytest.param( "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-FP8", - 
marks=[large_gpu_mark(min_gb=80)] + multi_gpu_marks(num_gpus=2), + marks=[large_gpu_mark(min_gb=80)] + multi_gpu_marks(num_gpus=4), ), ], ) @@ -68,7 +68,7 @@ def test_mtp_speculative_mixed_batch_short_prefill( max_num_batched_tokens=chunk_size, max_model_len=512, enforce_eager=True, - tensor_parallel_size=2, + tensor_parallel_size=4, trust_remote_code=True, enable_chunked_prefill=True, enable_prefix_caching=enable_prefix_caching, diff --git a/tests/v1/entrypoints/__init__.py b/tests/v1/entrypoints/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/v1/entrypoints/conftest.py b/tests/v1/entrypoints/conftest.py deleted file mode 100644 index bc9674ee86cf..000000000000 --- a/tests/v1/entrypoints/conftest.py +++ /dev/null @@ -1,173 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import pytest - - -@pytest.fixture -def sample_prompts(): - return [ - "Hello, my name is", - "The president of the United States is", - "The capital of France is", - "The future of AI is", - ] - - -@pytest.fixture -def sample_token_ids(): - return [ - [0], - [0, 1], - [0, 2, 1], - [0, 3, 1, 2], - ] - - -@pytest.fixture -def sample_regex(): - return ( - r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}" - r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)" - ) - - -# Note: Ensure this only uses attributes compatible with xgrammar -@pytest.fixture -def sample_json_schema(): - return { - "type": "object", - "properties": { - "name": {"type": "string"}, - "age": {"type": "integer"}, - "skills": { - "type": "array", - "items": { - "type": "string", - }, - }, - "grade": { - "type": "string", - "pattern": "^[A-D]$", # Regex pattern - }, - "email": { - "type": "string", - "pattern": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$", - }, - "work_history": { - "type": "array", - "items": { - "type": "object", - "properties": { - "company": {"type": "string"}, - "duration": { - "type": "number", - "minimum": 0.0, - 
"maximum": 100.0, # Numeric range - }, - "position": {"type": "string"}, - }, - "required": ["company", "duration", "position"], - "additionalProperties": False, - }, - "minItems": 0, - "maxItems": 3, - }, - }, - "required": ["name", "age", "skills", "grade", "email", "work_history"], - "additionalProperties": False, - "minProperties": 1, - "maxProperties": 10, - } - - -# A schema unsupported by xgrammar -@pytest.fixture -def unsupported_json_schema(): - return { - "type": "object", - "properties": { - "score": { - "type": "integer", - "multipleOf": 5, # Numeric multiple - }, - "tags": { - "type": "array", - "items": {"type": "string", "minLength": 10, "maxLength": 20}, - }, - }, - "required": ["score", "tags"], - "additionalProperties": False, - "patternProperties": { - "^score$": {"type": "integer"}, - }, - } - - -@pytest.fixture -def sample_definition_json_schema(): - return { - "$defs": { - "Step": { - "properties": { - "explanation": {"title": "Explanation", "type": "string"}, - "output": {"title": "Output", "type": "string"}, - }, - "required": ["explanation", "output"], - "title": "Step", - "type": "object", - } - }, - "properties": { - "steps": { - "items": {"$ref": "#/$defs/Step"}, - "title": "Steps", - "type": "array", - }, - "final_answer": {"title": "Final Answer", "type": "string"}, - }, - "required": ["steps", "final_answer"], - "title": "MathReasoning", - "type": "object", - "additionalProperties": False, - } - - -@pytest.fixture -def sample_structured_outputs_choices(): - return [ - "Python", - "Java", - "JavaScript", - "C++", - "C#", - "PHP", - "TypeScript", - "Ruby", - "Swift", - "Kotlin", - ] - - -@pytest.fixture -def sample_sql_ebnf(): - return """ -root ::= select_statement -select_statement ::= "SELECT" column "from" table "where" condition -column ::= "col_1" | "col_2" -table ::= "table_1" | "table_2" -condition ::= column "=" number -number ::= "1" | "2" -""" - - -@pytest.fixture -def sample_sql_lark(): - return """ -start: select_statement 
-select_statement: "SELECT" column "from" table "where" condition -column: "col_1" | "col_2" -table: "table_1" | "table_2" -condition: column "=" number -number: "1" | "2" -""" diff --git a/tests/v1/entrypoints/llm/__init__.py b/tests/v1/entrypoints/llm/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/v1/streaming_input/test_gpu_model_runner_v2_streaming.py b/tests/v1/streaming_input/test_gpu_model_runner_v2_streaming.py new file mode 100644 index 000000000000..8fde0f117ca2 --- /dev/null +++ b/tests/v1/streaming_input/test_gpu_model_runner_v2_streaming.py @@ -0,0 +1,207 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Unit tests for MRv2 GPUModelRunner.add_requests streaming input support.""" + +from unittest.mock import Mock + +import pytest +import torch + +from vllm.v1.core.sched.output import ( + CachedRequestData, + NewRequestData, + SchedulerOutput, +) +from vllm.v1.worker.gpu.model_runner import GPUModelRunner +from vllm.v1.worker.gpu.states import RequestState + +pytestmark = pytest.mark.cpu_test + + +@pytest.fixture +def mock_model_runner_with_req_states(): + """Create a mock MRv2 GPUModelRunner with a real RequestState.""" + + runner = Mock(spec=GPUModelRunner) + runner.req_states = RequestState( + max_num_reqs=10, + max_model_len=1024, + max_num_batched_tokens=1024, + num_speculative_steps=0, + vocab_size=32000, + device=torch.device("cpu"), + model_dtype=torch.float32, + cache_draft_logits=False, + ) + runner.encoder_cache = None + runner.model_state = Mock() + runner.block_tables = Mock() + runner.lora_state = Mock() + runner.sampler = None + runner.prompt_logprobs_worker = None + runner.is_last_pp_rank = False + + # Mock staged writes — they use Triton kernels that require GPU + runner.req_states.apply_staged_writes = Mock() + + # Bind the real methods to our mock + runner._remove_request = GPUModelRunner._remove_request.__get__(runner) + 
runner.add_requests = GPUModelRunner.add_requests.__get__(runner) + return runner + + +def _make_scheduler_output(new_reqs): + return SchedulerOutput( + scheduled_new_reqs=new_reqs, + scheduled_cached_reqs=CachedRequestData.make_empty(), + num_scheduled_tokens={}, + total_num_scheduled_tokens=0, + scheduled_spec_decode_tokens={}, + scheduled_encoder_inputs={}, + num_common_prefix_blocks=[], + finished_req_ids=set(), + free_encoder_mm_hashes=[], + ) + + +def test_e2e_streaming_request_update_basic_flow( + mock_model_runner_with_req_states, +): + """Test that streaming sessions are updated correctly. + + This test validates that when a streaming session is updated with new + prompt tokens: + 1. The old request state is removed (no free_indices leak) + 2. The new state is written with updated prefill_token_ids + 3. model_state and block_tables are re-registered for the new state + """ + runner = mock_model_runner_with_req_states + req_states = runner.req_states + req_id = "streaming_req_0" + initial_free = len(req_states.free_indices) + + # Step 1: Add initial request with 3 prompt tokens, all computed + initial_req_data = NewRequestData( + req_id=req_id, + prompt_token_ids=[1, 2, 3], + prefill_token_ids=[1, 2, 3], + mm_features=[], + sampling_params=None, + pooling_params=None, + block_ids=([0],), + num_computed_tokens=3, + lora_request=None, + ) + runner.add_requests(_make_scheduler_output([initial_req_data])) + assert req_id in req_states.req_id_to_index + assert len(req_states.free_indices) == initial_free - 1 + + # Step 2: Create streaming update with extended prompt + # The scheduler has already set prefill_token_ids to the full sequence + # (original prompt + intermediate output + new prompt tokens) + updated_req_data = NewRequestData( + req_id=req_id, + prompt_token_ids=[1, 2, 3], + prefill_token_ids=[1, 2, 3, 10, 4, 5], + mm_features=[], + sampling_params=None, + pooling_params=None, + block_ids=([0, 1],), + num_computed_tokens=4, # 3 original prompt + 1 
intermediate output + lora_request=None, + ) + runner.add_requests(_make_scheduler_output([updated_req_data])) + + # Step 3: Verify no free_indices leak (old slot recycled) + assert len(req_states.free_indices) == initial_free - 1 + + # Verify the request is still tracked with exactly one index + assert req_id in req_states.req_id_to_index + assert sum(1 for v in req_states.index_to_req_id.values() if v == req_id) == 1 + + # Verify state was updated with new values + new_idx = req_states.req_id_to_index[req_id] + assert req_states.prompt_len.np[new_idx] == 3 + assert req_states.prefill_len.np[new_idx] == 6 + assert req_states.num_computed_prefill_tokens[new_idx] == 4 + + # Verify model_state and block_tables were re-registered + runner.model_state.add_request.assert_called_with(new_idx, updated_req_data) + runner.block_tables.append_block_ids.assert_called_with( + new_idx, ([0, 1],), overwrite=True + ) + + +def test_e2e_streaming_with_multimodal_features( + mock_model_runner_with_req_states, +): + """Test that streaming sessions with multimodal features are updated. + + This test validates that when a streaming session with mm features + is updated: + 1. The old request state is removed (no free_indices leak) + 2. encoder_cache is cleaned up and re-registered with new mm_features + 3. model_state is re-registered (recomputes M-RoPE positions etc.) 
+ """ + runner = mock_model_runner_with_req_states + req_states = runner.req_states + req_id = "streaming_mm_req_0" + initial_free = len(req_states.free_indices) + + # Enable encoder_cache for multimodal + runner.encoder_cache = Mock() + + # Step 1: Add initial request with one audio feature + mm_feature_1 = Mock() + initial_req_data = NewRequestData( + req_id=req_id, + prompt_token_ids=[1, 2] + [0] * 10 + [3, 4], + prefill_token_ids=[1, 2] + [0] * 10 + [3, 4], + mm_features=[mm_feature_1], + sampling_params=None, + pooling_params=None, + block_ids=([0],), + num_computed_tokens=14, + lora_request=None, + ) + runner.add_requests(_make_scheduler_output([initial_req_data])) + assert req_id in req_states.req_id_to_index + + # Reset mocks to track only the streaming update calls + runner.encoder_cache.reset_mock() + runner.model_state.reset_mock() + + # Step 2: Create streaming update with additional multimodal feature + # The scheduler has folded the intermediate output (100) into + # prefill_token_ids and added a new audio chunk + mm_feature_2 = Mock() + updated_req_data = NewRequestData( + req_id=req_id, + prompt_token_ids=[1, 2] + [0] * 10 + [3, 4], + prefill_token_ids=[1, 2] + [0] * 10 + [3, 4, 100] + [0] * 5 + [5], + mm_features=[mm_feature_1, mm_feature_2], + sampling_params=None, + pooling_params=None, + block_ids=([0, 1],), + num_computed_tokens=14, + lora_request=None, + ) + runner.add_requests(_make_scheduler_output([updated_req_data])) + + # Step 3: Verify no free_indices leak + assert len(req_states.free_indices) == initial_free - 1 + assert sum(1 for v in req_states.index_to_req_id.values() if v == req_id) == 1 + + # Verify encoder_cache was cleaned up and re-registered + runner.encoder_cache.remove_request.assert_called_once_with(req_id) + runner.encoder_cache.add_request.assert_called_once_with( + req_id, [mm_feature_1, mm_feature_2] + ) + + # Verify model_state was re-registered with new data + new_idx = req_states.req_id_to_index[req_id] + 
runner.model_state.add_request.assert_called_once_with(new_idx, updated_req_data) + + # Verify updated prefill length + assert req_states.prefill_len.np[new_idx] == 21 diff --git a/tests/v1/test_serial_utils.py b/tests/v1/test_serial_utils.py index a5dc1773d477..4ed8724e60fb 100644 --- a/tests/v1/test_serial_utils.py +++ b/tests/v1/test_serial_utils.py @@ -278,3 +278,148 @@ def test_custom_class_serialization_disallowed_without_pickle(): with pytest.raises(TypeError): # Attempt to encode the custom class encoder.encode(obj) + + +@dataclass +class RequestWithTensor: + """Mock request with non-multimodal tensor field like EngineCoreRequest.""" + + prompt_embeds: torch.Tensor | None + data: str + + +def test_non_multimodal_tensor_with_ipc(): + """Test that non-multimodal tensor fields work correctly with IPC enabled. + + This reproduces the bug where fields like prompt_embeds: torch.Tensor | None + would fail to decode when IPC is enabled because _decode_tensor expected a + raw tensor tuple but received a msgpack-decoded TensorIpcHandle list. 
+ """ + import torch.multiprocessing as torch_mp + + from vllm.v1.engine.tensor_ipc import TensorIpcReceiver, TensorIpcSender + + # Create tensor queues for IPC + tensor_queues = [torch_mp.Queue()] + + # Create encoder with IPC sender + sender = TensorIpcSender(tensor_queues[0]) + encoder = MsgpackEncoder(oob_tensor_consumer=sender) + + # Create decoder with IPC receiver + receiver = TensorIpcReceiver(tensor_queues[0]) + decoder = MsgpackDecoder(RequestWithTensor, oob_tensor_provider=receiver) + + # Create a request with a non-multimodal tensor + original_tensor = torch.randn(5, 10, dtype=torch.float32) + request = RequestWithTensor(prompt_embeds=original_tensor, data="test_data") + + # Encode the request - this should send the tensor via IPC + encoded = encoder.encode(request) + + # Verify encoding succeeded + assert len(encoded) > 0 + + # Decode the request - this should retrieve the tensor from IPC queue + # Previously this would fail because the decoder tried to unpack the + # handle list as raw tensor bytes metadata. + decoded = decoder.decode(encoded) + + # Verify the decoded request matches the original + assert isinstance(decoded, RequestWithTensor) + assert decoded.data == "test_data" + assert decoded.prompt_embeds is not None + assert torch.allclose(decoded.prompt_embeds, original_tensor), ( + "Decoded tensor does not match the original tensor." 
+ ) + + +def test_non_multimodal_tensor_with_ipc_none_value(): + """Test that None values for tensor fields work correctly with IPC enabled.""" + import torch.multiprocessing as torch_mp + + from vllm.v1.engine.tensor_ipc import TensorIpcReceiver, TensorIpcSender + + # Create tensor queues for IPC + tensor_queues = [torch_mp.Queue()] + + # Create encoder with IPC sender + sender = TensorIpcSender(tensor_queues[0]) + encoder = MsgpackEncoder(oob_tensor_consumer=sender) + + # Create decoder with IPC receiver + receiver = TensorIpcReceiver(tensor_queues[0]) + decoder = MsgpackDecoder(RequestWithTensor, oob_tensor_provider=receiver) + + # Create a request with None for the tensor field + request = RequestWithTensor(prompt_embeds=None, data="test_data_with_none") + + # Encode and decode the request + encoded = encoder.encode(request) + decoded = decoder.decode(encoded) + + # Verify the decoded request matches the original + assert isinstance(decoded, RequestWithTensor) + assert decoded.data == "test_data_with_none" + assert decoded.prompt_embeds is None + + +def test_multiple_senders_single_receiver_ipc(): + """Test N senders sharing a queue with a single receiver via msgpack. + + Simulates the real vLLM topology where multiple API server frontends + each have their own MsgpackEncoder + TensorIpcSender, all putting + tensors onto the same torch.mp queue, and a single engine core + decodes them with one MsgpackDecoder + TensorIpcReceiver. 
+ """ + import torch.multiprocessing as torch_mp + + from vllm.v1.engine.tensor_ipc import TensorIpcReceiver, TensorIpcSender + + num_senders = 3 + num_messages_per_sender = 2 + tensor_queue = torch_mp.Queue() + + # Create N independent senders (each gets its own uuid-based sender_id) + senders = [] + encoders = [] + for _ in range(num_senders): + s = TensorIpcSender(tensor_queue) + senders.append(s) + encoders.append(MsgpackEncoder(oob_tensor_consumer=s)) + + # Single receiver + receiver = TensorIpcReceiver(tensor_queue) + decoder = MsgpackDecoder(RequestWithTensor, oob_tensor_provider=receiver) + + # Encode messages from all senders, interleaving the order + # so that tensors from different senders land on the queue interleaved. + encoded_payloads: list[tuple[int, int, torch.Tensor, list]] = [] + for msg_idx in range(num_messages_per_sender): + for sender_idx in range(num_senders): + tensor = torch.full( + (sender_idx + 1, msg_idx + 2), + float(sender_idx * 100 + msg_idx), + dtype=torch.float32, + ) + req = RequestWithTensor( + prompt_embeds=tensor, + data=f"s{sender_idx}_m{msg_idx}", + ) + encoded = encoders[sender_idx].encode(req) + encoded_payloads.append((sender_idx, msg_idx, tensor, encoded)) + + # Decode all messages — the receiver must correctly match each + # tensor handle to the right TensorIpcData from the shared queue. 
+ for sender_idx, msg_idx, original_tensor, encoded in encoded_payloads: + decoded = decoder.decode(encoded) + assert isinstance(decoded, RequestWithTensor) + assert decoded.data == f"s{sender_idx}_m{msg_idx}" + assert decoded.prompt_embeds is not None + assert decoded.prompt_embeds.shape == original_tensor.shape, ( + f"Shape mismatch for sender {sender_idx} msg {msg_idx}: " + f"{decoded.prompt_embeds.shape} != {original_tensor.shape}" + ) + assert torch.allclose(decoded.prompt_embeds, original_tensor), ( + f"Value mismatch for sender {sender_idx} msg {msg_idx}" + ) diff --git a/tests/v1/test_tensor_ipc_queue.py b/tests/v1/test_tensor_ipc_queue.py new file mode 100644 index 000000000000..a3fcb97ca171 --- /dev/null +++ b/tests/v1/test_tensor_ipc_queue.py @@ -0,0 +1,943 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Tests for tensor IPC queue functionality.""" + +import contextlib +import multiprocessing as mp +from dataclasses import dataclass +from multiprocessing.synchronize import Barrier as BarrierType +from multiprocessing.synchronize import Event as EventType +from typing import Any + +import pytest +import torch +import torch.multiprocessing as torch_mp + +from vllm.v1.engine.tensor_ipc import ( + TensorIpcData, + TensorIpcReceiver, + TensorIpcSender, +) +from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder + + +@pytest.fixture(scope="module", autouse=True) +def setup_multiprocessing(): + """Set multiprocessing start method to 'spawn' for compatibility.""" + with contextlib.suppress(RuntimeError): + # Already set, which is fine + torch_mp.set_start_method("spawn", force=True) + yield + + +@dataclass +# Use a typed container so the test covers the real vLLM path where tensor IPC +# handles are encoded and decoded as fields nested inside larger msgpack payloads. 
+class TensorEnvelope: + tensor: torch.Tensor + label: str + + +def encoder_process( + tensor_queue: torch_mp.Queue, + payload_queue: mp.Queue, + result_queue: mp.Queue, + tensor_data: dict[str, Any], + ready_event: EventType, + retrieval_done: EventType, +): + """Process that msgpack-encodes and sends tensors via IPC.""" + try: + sender = TensorIpcSender(tensor_queue) + encoder = MsgpackEncoder(oob_tensor_consumer=sender) + + if torch.cuda.is_available(): + device = "cuda:0" + tensor = torch.randn( + *tensor_data["shape"], dtype=tensor_data["dtype"], device=device + ) + else: + # Fall back to CPU for testing + device = "cpu" + tensor = torch.randn(*tensor_data["shape"], dtype=tensor_data["dtype"]) + + message = TensorEnvelope(tensor=tensor, label="cuda-msgpack") + encoded = encoder.encode(message) + payload_queue.put(encoded, timeout=10.0) + + ready_event.set() + + result_queue.put( + { + "success": True, + "encoded_length": len(encoded), + "device": str(device), + "tensor_shape": tuple(tensor.shape), + } + ) + retrieval_done.wait(timeout=30.0) + except Exception as e: + import traceback + + ready_event.set() + retrieval_done.set() + result_queue.put( + {"success": False, "error": str(e), "traceback": traceback.format_exc()} + ) + + +def decoder_process( + tensor_queue: torch_mp.Queue, + payload_queue: mp.Queue, + result_queue: mp.Queue, + expected_shape: tuple, + encoder_ready: EventType, + retrieval_done: EventType, +): + """Process that msgpack-decodes tensors received via IPC.""" + try: + if not encoder_ready.wait(timeout=10.0): + raise TimeoutError("Encoder did not signal ready") + + encoded = payload_queue.get(timeout=5.0) + receiver = TensorIpcReceiver(tensor_queue) + decoder = MsgpackDecoder(TensorEnvelope, oob_tensor_provider=receiver) + decoded = decoder.decode(encoded) + + result_queue.put( + { + "success": True, + "tensor_shape": tuple(decoded.tensor.shape), + "device": str(decoded.tensor.device), + "label": decoded.label, + "matches_expected": 
tuple(decoded.tensor.shape) == expected_shape, + } + ) + except Exception as e: + import traceback + + retrieval_done.set() + result_queue.put( + {"success": False, "error": str(e), "traceback": traceback.format_exc()} + ) + else: + retrieval_done.set() + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_cuda_tensor_queue_basic(): + """Test CUDA tensor IPC through the msgpack encoder/decoder path.""" + tensor_queue = torch_mp.Queue() + payload_queue: mp.Queue = mp.Queue() + result_queue: mp.Queue = mp.Queue() + encoder_ready = mp.Event() + retrieval_done = mp.Event() + + tensor_shape = (4, 8, 16) + tensor_dtype = torch.float32 + + encoder_proc = mp.Process( + target=encoder_process, + args=( + tensor_queue, + payload_queue, + result_queue, + {"shape": tensor_shape, "dtype": tensor_dtype}, + encoder_ready, + retrieval_done, + ), + ) + encoder_proc.start() + + decoder_proc = mp.Process( + target=decoder_process, + args=( + tensor_queue, + payload_queue, + result_queue, + tensor_shape, + encoder_ready, + retrieval_done, + ), + ) + decoder_proc.start() + + encoder_result = result_queue.get(timeout=10.0) + decoder_result = result_queue.get(timeout=10.0) + + encoder_proc.join(timeout=5.0) + decoder_proc.join(timeout=5.0) + + # Verify results + assert encoder_result["success"], ( + f"Encoder failed: {encoder_result.get('error')}\n" + f"{encoder_result.get('traceback', '')}" + ) + assert decoder_result["success"], ( + f"Decoder failed: {decoder_result.get('error')}\n" + f"{decoder_result.get('traceback', '')}" + ) + assert decoder_result["matches_expected"], "Tensor shape mismatch" + assert "cuda" in decoder_result["device"], "Tensor not on CUDA device" + assert decoder_result["label"] == "cuda-msgpack" + + +def test_cpu_tensor_fallback(): + """Test that CPU tensors use standard serialization path.""" + encoder = MsgpackEncoder() + + # Create a CPU tensor + tensor = torch.randn(3, 4, dtype=torch.float32) + + # Encode the tensor 
(should use standard path, not queue) + encoded = encoder.encode({"test_tensor": tensor}) + + # Verify encoding succeeded + assert len(encoded) > 0 + assert isinstance(encoded, (list, tuple)) + + # Basic check: no queue should be used, so tensor goes through standard path + # This is mainly to ensure no exceptions are raised + + +def test_msgpack_encoder_decoder_with_ipc(): + """Test the full msgpack + tensor IPC path in one process.""" + tensor_queue = torch_mp.Queue() + sender = TensorIpcSender(tensor_queue) + encoder = MsgpackEncoder(oob_tensor_consumer=sender) + receiver = TensorIpcReceiver(tensor_queue) + decoder = MsgpackDecoder(TensorEnvelope, oob_tensor_provider=receiver) + + # Use CPU here to exercise the msgpack + sender/receiver integration + # without relying on same-process CUDA IPC behavior. + tensor = torch.randn(2, 3) + + message = TensorEnvelope(tensor=tensor, label="test") + encoded = encoder.encode(message) + assert len(encoded) > 0 + + decoded = decoder.decode(encoded) + assert isinstance(decoded, TensorEnvelope) + assert decoded.label == "test" + assert torch.allclose(decoded.tensor, tensor) + + +def test_decoder_buffer_management(): + """Test receiver's tensor buffer management when draining queue.""" + tensor_queue = torch_mp.Queue() + + sender_id = "test_sender" + message_id = 1 + + # Put multiple tensors in queue using TensorIpcData + tensors_data = [ + (0, torch.randn(2, 3)), + (1, torch.randn(4, 5)), + (2, torch.randn(6, 7)), + ] + + for tensor_id, tensor in tensors_data: + ipc_data = TensorIpcData( + sender_id=sender_id, + message_id=message_id, + tensor_id=tensor_id, + tensor=tensor, + ) + tensor_queue.put(ipc_data) + + # Create receiver directly + receiver = TensorIpcReceiver(tensor_queue) + + # Request tensor_id=2 (should buffer tensor_id=0 and tensor_id=1) + handle = {"sender_id": sender_id, "message_id": message_id, "tensor_id": 2} + + result = receiver("float32", (6, 7), handle) + assert result.shape == (6, 7) + + # Verify buffer 
has tensor_id 0 and 1 + sender = receiver._tensor_buffers[sender_id] + tensors = sender.tensors.get(message_id, {}) + assert 0 in tensors + assert 1 in tensors + + # Request buffered tensor + handle2 = {"sender_id": sender_id, "message_id": message_id, "tensor_id": 0} + + result2 = receiver("float32", (2, 3), handle2) + assert result2.shape == (2, 3) + # tensor_id 0 should be removed from buffer + sender = receiver._tensor_buffers[sender_id] + tensors = sender.tensors.get(message_id, {}) + assert 0 not in tensors + + +def api_server_worker( + server_id: int, + tensor_queue: torch_mp.Queue, + result_queue: mp.Queue, + barrier: BarrierType, + retrieval_done: EventType, +): + """Worker simulating an API server sending tensors.""" + try: + # Each server sends a unique tensor + tensor = torch.ones(server_id + 1, server_id + 2) * server_id + sender_id = f"server_{server_id}" + + # Wait for all servers to be ready + barrier.wait() + + # Send tensor using TensorIpcData + ipc_data = TensorIpcData( + sender_id=sender_id, + message_id=0, + tensor_id=0, + tensor=tensor, + ) + tensor_queue.put(ipc_data) + + result_queue.put({"server_id": server_id, "success": True}) + + # Keep process alive until main process has retrieved all tensors + # This prevents shared memory handles from being invalidated + retrieval_done.wait(timeout=30.0) + except Exception as e: + import traceback + + result_queue.put( + { + "server_id": server_id, + "success": False, + "error": str(e), + "traceback": traceback.format_exc(), + } + ) + + +def test_multiple_api_servers_to_engine(): + """Test multiple API servers sending to one engine core via multiprocessing.""" + num_api_servers = 3 + tensor_queue = torch_mp.Queue() + result_queue: mp.Queue = mp.Queue() + barrier = mp.Barrier(num_api_servers) + retrieval_done = mp.Event() + + # Start multiple API server processes + processes = [] + for server_id in range(num_api_servers): + proc = mp.Process( + target=api_server_worker, + args=(server_id, 
tensor_queue, result_queue, barrier, retrieval_done), + ) + proc.start() + processes.append(proc) + + # Collect results from all servers + results = [] + for _ in range(num_api_servers): + result = result_queue.get(timeout=10.0) + results.append(result) + + # Verify all servers succeeded + for result in results: + assert result["success"], ( + f"Server {result['server_id']} failed: {result.get('error')}" + ) + + # Verify all tensors are in queue + received_tensors = [] + for _ in range(num_api_servers): + ipc_data = tensor_queue.get(timeout=1.0) + received_tensors.append((ipc_data.sender_id, ipc_data.tensor)) + + assert len(received_tensors) == num_api_servers + + # Verify tensor content (order may vary with multiprocessing) + tensor_by_sender = {sid: t for sid, t in received_tensors} + for server_id in range(num_api_servers): + expected_id = f"server_{server_id}" + assert expected_id in tensor_by_sender, ( + f"Missing tensor from server {server_id}" + ) + expected_tensor = torch.ones(server_id + 1, server_id + 2) * server_id + assert torch.allclose(tensor_by_sender[expected_id], expected_tensor) + + # Signal workers that retrieval is complete + retrieval_done.set() + + # Wait for all processes to complete + for proc in processes: + proc.join(timeout=5.0) + + +def mixed_tensor_encoder_process( + tensor_queue: torch_mp.Queue, + result_queue: mp.Queue, + ready_event: EventType, + retrieval_done: EventType, +): + """Process that sends a CUDA tensor over the tensor IPC queue.""" + try: + sender = TensorIpcSender(tensor_queue) + _encoder = MsgpackEncoder(oob_tensor_consumer=sender) + + # Only a CUDA tensor is sent; the encoder above is created but unused, + # and the tensor is enqueued directly as TensorIpcData. + cuda_tensor = torch.randn(4, 5, device="cuda:0") + + # Manually send via IPC to test the mechanism + cuda_tensor_shared = cuda_tensor.share_memory_() + + ipc_data = TensorIpcData( + sender_id="mixed_encoder", + message_id=0, + tensor_id=0, + tensor=cuda_tensor_shared, + ) + 
tensor_queue.put(ipc_data, timeout=10.0) + + ready_event.set() + + result_queue.put({"success": True, "sent_cuda": True}) + + # Keep process alive until decoder has retrieved the tensor + retrieval_done.wait(timeout=30.0) + except Exception as e: + import traceback + + ready_event.set() + result_queue.put( + {"success": False, "error": str(e), "traceback": traceback.format_exc()} + ) + + +def mixed_tensor_decoder_process( + tensor_queue: torch_mp.Queue, + result_queue: mp.Queue, + encoder_ready: EventType, + retrieval_done: EventType, +): + """Process that retrieves one queued item and reports its tensor.""" + try: + # Wait for encoder to finish + if not encoder_ready.wait(timeout=10.0): + raise TimeoutError("Encoder did not signal ready") + + # Try to get CUDA tensor from queue + ipc_data = tensor_queue.get(timeout=5.0) + + result_queue.put( + { + "success": True, + "is_cuda": ipc_data.tensor.is_cuda, + "shape": tuple(ipc_data.tensor.shape), + } + ) + + # Signal that retrieval is complete + retrieval_done.set() + except Exception as e: + import traceback + + retrieval_done.set() # Signal even on failure + result_queue.put( + {"success": False, "error": str(e), "traceback": traceback.format_exc()} + ) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +def test_mixed_cpu_cuda_tensors(): + """Test passing a CUDA tensor between two processes via the tensor queue.""" + tensor_queue = torch_mp.Queue() + result_queue: mp.Queue = mp.Queue() + encoder_ready = mp.Event() + retrieval_done = mp.Event() + + # Start encoder process + encoder_proc = mp.Process( + target=mixed_tensor_encoder_process, + args=(tensor_queue, result_queue, encoder_ready, retrieval_done), + ) + encoder_proc.start() + + # Start decoder process + decoder_proc = mp.Process( + target=mixed_tensor_decoder_process, + args=(tensor_queue, result_queue, encoder_ready, retrieval_done), + ) + decoder_proc.start() + + # Get results + encoder_result = result_queue.get(timeout=10.0) + 
decoder_result = result_queue.get(timeout=10.0) + + encoder_proc.join(timeout=5.0) + decoder_proc.join(timeout=5.0) + + # Verify encoder succeeded + assert encoder_result["success"], ( + f"Encoder failed: {encoder_result.get('error')}\n" + f"{encoder_result.get('traceback', '')}" + ) + + # Verify decoder succeeded and got CUDA tensor + assert decoder_result["success"], ( + f"Decoder failed: {decoder_result.get('error')}\n" + f"{decoder_result.get('traceback', '')}" + ) + assert decoder_result["is_cuda"], "Retrieved tensor is not on CUDA" + assert decoder_result["shape"] == (4, 5), ( + f"Unexpected shape: {decoder_result['shape']}" + ) + + +def cpu_tensor_ipc_encoder_process( + tensor_queue: torch_mp.Queue, + result_queue: mp.Queue, + tensor_shape: tuple, + ready_event: EventType, + retrieval_done: EventType, +): + """Process that encodes and sends CPU tensors via IPC queue.""" + try: + # Create encoder with IPC enabled for all tensors + sender = TensorIpcSender(tensor_queue) + encoder = MsgpackEncoder(oob_tensor_consumer=sender) + + # Create a CPU tensor + tensor = torch.randn(*tensor_shape, dtype=torch.float32) + + # Encode the tensor (should use IPC queue, not standard serialization) + encoded = encoder.encode({"test_tensor": tensor}) + + # Signal that encoding is complete + ready_event.set() + + result_queue.put( + { + "success": True, + "encoded_length": len(encoded), + "device": str(tensor.device), + "tensor_shape": tuple(tensor.shape), + } + ) + + # Keep process alive until decoder has retrieved the tensor + # This is necessary for CPU tensor shared memory to remain valid + retrieval_done.wait(timeout=30.0) + except Exception as e: + import traceback + + ready_event.set() + result_queue.put( + {"success": False, "error": str(e), "traceback": traceback.format_exc()} + ) + + +def cpu_tensor_ipc_decoder_process( + tensor_queue: torch_mp.Queue, + result_queue: mp.Queue, + expected_shape: tuple, + encoder_ready: EventType, + retrieval_done: EventType, +): + 
"""Process that decodes and receives CPU tensors from IPC queue.""" + try: + # Wait for encoder to finish sending + if not encoder_ready.wait(timeout=10.0): + raise TimeoutError("Encoder did not signal ready") + + # Get tensor from queue + ipc_data = tensor_queue.get(timeout=5.0) + + result_queue.put( + { + "success": True, + "tensor_id": ipc_data.tensor_id, + "tensor_shape": tuple(ipc_data.tensor.shape), + "device": str(ipc_data.tensor.device), + "matches_expected": tuple(ipc_data.tensor.shape) == expected_shape, + "is_cpu": ipc_data.tensor.device.type == "cpu", + } + ) + + # Signal that retrieval is complete + retrieval_done.set() + except Exception as e: + import traceback + + retrieval_done.set() # Signal even on failure + result_queue.put( + {"success": False, "error": str(e), "traceback": traceback.format_exc()} + ) + + +def test_cpu_tensor_ipc(): + """Test CPU tensor sharing via IPC queue when mm_tensor_ipc is enabled.""" + # Set up single queue and synchronization + tensor_queue = torch_mp.Queue() + result_queue: mp.Queue = mp.Queue() + encoder_ready = mp.Event() + retrieval_done = mp.Event() + + tensor_shape = (3, 5, 7) + + # Start encoder process + encoder_proc = mp.Process( + target=cpu_tensor_ipc_encoder_process, + args=( + tensor_queue, + result_queue, + tensor_shape, + encoder_ready, + retrieval_done, + ), + ) + encoder_proc.start() + + # Start decoder process + decoder_proc = mp.Process( + target=cpu_tensor_ipc_decoder_process, + args=( + tensor_queue, + result_queue, + tensor_shape, + encoder_ready, + retrieval_done, + ), + ) + decoder_proc.start() + + # Wait for processes and collect results + encoder_result = result_queue.get(timeout=10.0) + decoder_result = result_queue.get(timeout=10.0) + + encoder_proc.join(timeout=5.0) + decoder_proc.join(timeout=5.0) + + # Verify results + assert encoder_result["success"], ( + f"Encoder failed: {encoder_result.get('error')}\n" + f"{encoder_result.get('traceback', '')}" + ) + assert decoder_result["success"], 
( + f"Decoder failed: {decoder_result.get('error')}\n" + f"{decoder_result.get('traceback', '')}" + ) + assert decoder_result["matches_expected"], "Tensor shape mismatch" + assert decoder_result["is_cpu"], "Tensor not on CPU device" + + +def test_ipc_disabled_mode(): + """Test that IPC is disabled when no sender is provided.""" + tensor_queues = [torch_mp.Queue()] + + # Create encoder without IPC sender (IPC disabled) + encoder = MsgpackEncoder() + + # Create a CPU tensor + cpu_tensor = torch.randn(2, 3, dtype=torch.float32) + + # Encode the tensor (should use standard serialization, not IPC) + encoded = encoder.encode({"test_tensor": cpu_tensor}) + + # Verify encoding succeeded + assert len(encoded) > 0 + assert isinstance(encoded, (list, tuple)) + + # Verify queue is empty (no IPC was used) + assert tensor_queues[0].empty(), "Tensor queue should be empty when IPC is disabled" + + # If CUDA is available, test with CUDA tensor too + if torch.cuda.is_available(): + cuda_tensor = torch.randn(4, 5, device="cuda:0") + encoded_cuda = encoder.encode({"cuda_tensor": cuda_tensor}) + assert len(encoded_cuda) > 0 + assert tensor_queues[0].empty(), ( + "Tensor queue should be empty for CUDA tensor when IPC is disabled" + ) + + +@dataclass +class MultiTensorMessage: + """Message with multiple tensors to test multi-tensor IPC.""" + + t1: torch.Tensor + t2: torch.Tensor + sender_label: str + + +def concurrent_sender_process( + tensor_queue: torch_mp.Queue, + payload_queue: mp.Queue, + result_queue: mp.Queue, + sender_index: int, + num_messages: int, + barrier: BarrierType, + retrieval_done: EventType, +): + """Process that acts as one of N concurrent senders.""" + try: + sender = TensorIpcSender(tensor_queue) + encoder = MsgpackEncoder(oob_tensor_consumer=sender) + + # Wait for all senders to be ready before sending + barrier.wait(timeout=10.0) + + encoded_payloads = [] + for msg_idx in range(num_messages): + # Each sender creates uniquely-shaped tensors so we can + # verify 
correct routing on the receiver side. + t1 = torch.full((sender_index + 1, 3), float(msg_idx), dtype=torch.float32) + t2 = torch.full( + (2, sender_index + 2), float(msg_idx + 100), dtype=torch.float64 + ) + msg = MultiTensorMessage( + t1=t1, + t2=t2, + sender_label=f"sender_{sender_index}_msg_{msg_idx}", + ) + encoded = encoder.encode(msg) + encoded_payloads.append(encoded) + + # Send all encoded payloads via the regular (non-tensor) queue + for encoded in encoded_payloads: + payload_queue.put(encoded, timeout=10.0) + + result_queue.put( + { + "success": True, + "sender_index": sender_index, + "num_sent": num_messages, + } + ) + + # Keep alive so shared-memory handles remain valid + retrieval_done.wait(timeout=30.0) + except Exception as e: + import traceback + + result_queue.put( + { + "success": False, + "sender_index": sender_index, + "error": str(e), + "traceback": traceback.format_exc(), + } + ) + + +def test_concurrent_senders_single_receiver(): + """Test N concurrent senders sharing one queue with a single receiver. + + Each sender encodes multiple messages (each containing two tensors) via + its own MsgpackEncoder + TensorIpcSender. A single TensorIpcReceiver + on the receiving side must correctly drain-and-buffer interleaved + TensorIpcData items from the shared queue and match them back to the + right message handles during decode. 
+ """ + num_senders = 4 + num_messages_per_sender = 3 + tensor_queue = torch_mp.Queue() + payload_queue: mp.Queue = mp.Queue() + result_queue: mp.Queue = mp.Queue() + barrier = mp.Barrier(num_senders) + retrieval_done = mp.Event() + + # Launch sender processes + processes = [] + for i in range(num_senders): + proc = mp.Process( + target=concurrent_sender_process, + args=( + tensor_queue, + payload_queue, + result_queue, + i, + num_messages_per_sender, + barrier, + retrieval_done, + ), + ) + proc.start() + processes.append(proc) + + # Collect send confirmations + send_results = [] + for _ in range(num_senders): + send_results.append(result_queue.get(timeout=15.0)) + for r in send_results: + assert r["success"], ( + f"Sender {r['sender_index']} failed: {r.get('error')}\n" + f"{r.get('traceback', '')}" + ) + + # Now decode all messages from the main process using a single receiver + receiver = TensorIpcReceiver(tensor_queue) + decoder = MsgpackDecoder(MultiTensorMessage, oob_tensor_provider=receiver) + + decoded_messages: list[MultiTensorMessage] = [] + total = num_senders * num_messages_per_sender + for _ in range(total): + encoded = payload_queue.get(timeout=10.0) + decoded = decoder.decode(encoded) + assert isinstance(decoded, MultiTensorMessage) + decoded_messages.append(decoded) + + # Signal senders they can exit + retrieval_done.set() + + # Group by sender_label prefix to verify all messages arrived + by_sender: dict[int, list[MultiTensorMessage]] = {} + for msg in decoded_messages: + # label format: "sender_{i}_msg_{j}" + parts = msg.sender_label.split("_") + sender_idx = int(parts[1]) + by_sender.setdefault(sender_idx, []).append(msg) + + assert len(by_sender) == num_senders, ( + f"Expected {num_senders} senders, got {len(by_sender)}" + ) + + for sender_idx in range(num_senders): + msgs = sorted(by_sender[sender_idx], key=lambda m: m.sender_label) + assert len(msgs) == num_messages_per_sender, ( + f"Sender {sender_idx}: expected {num_messages_per_sender} " + 
f"messages, got {len(msgs)}" + ) + for msg_idx, msg in enumerate(msgs): + assert msg.sender_label == f"sender_{sender_idx}_msg_{msg_idx}" + # Verify tensor shapes match what the sender created + assert msg.t1.shape == (sender_idx + 1, 3) + assert msg.t2.shape == (2, sender_idx + 2) + # Verify tensor values + assert torch.allclose(msg.t1, torch.full_like(msg.t1, float(msg_idx))) + assert torch.allclose(msg.t2, torch.full_like(msg.t2, float(msg_idx + 100))) + + for proc in processes: + proc.join(timeout=5.0) + + +def test_concurrent_senders_interleaved_buffer(): + """Test receiver buffering when tensors from multiple senders interleave. + + Manually enqueue TensorIpcData from two senders in an interleaved order + and verify the receiver correctly buffers and retrieves each tensor by + its (sender_id, message_id, tensor_id) handle. + """ + tensor_queue = torch_mp.Queue() + + # Sender A: 2 tensors for message 1 + a_t0 = torch.randn(2, 3) + a_t1 = torch.randn(4, 5) + # Sender B: 2 tensors for message 1 + b_t0 = torch.randn(6, 7) + b_t1 = torch.randn(8, 9) + + # Interleave: B_t0, A_t0, B_t1, A_t1 + for sid, mid, tid, t in [ + ("B", 1, 0, b_t0), + ("A", 1, 0, a_t0), + ("B", 1, 1, b_t1), + ("A", 1, 1, a_t1), + ]: + tensor_queue.put( + TensorIpcData(sender_id=sid, message_id=mid, tensor_id=tid, tensor=t) + ) + + receiver = TensorIpcReceiver(tensor_queue) + + # Request A_t1 first — receiver must drain and buffer B_t0, A_t0, B_t1 + result = receiver( + "float32", a_t1.shape, {"sender_id": "A", "message_id": 1, "tensor_id": 1} + ) + assert torch.equal(result, a_t1) + + # Now request B_t0 from buffer + result = receiver( + "float32", b_t0.shape, {"sender_id": "B", "message_id": 1, "tensor_id": 0} + ) + assert torch.equal(result, b_t0) + + # Request A_t0 from buffer + result = receiver( + "float32", a_t0.shape, {"sender_id": "A", "message_id": 1, "tensor_id": 0} + ) + assert torch.equal(result, a_t0) + + # Request B_t1 from buffer + result = receiver( + "float32", b_t1.shape, 
{"sender_id": "B", "message_id": 1, "tensor_id": 1} + ) + assert torch.equal(result, b_t1) + + # All buffers should be drained + for sid in ("A", "B"): + tensors = receiver._tensor_buffers[sid].tensors.get(1, {}) + assert len(tensors) == 0, f"Sender {sid} buffer not empty: {tensors}" + + +def test_mixed_cpu_cuda_with_ipc_enabled(): + """Test that encoder is configured correctly for IPC with all tensor types.""" + if not torch.cuda.is_available(): + pytest.skip("CUDA not available") + + tensor_queue = torch_mp.Queue() + + # Create sender and encoder with IPC enabled + sender = TensorIpcSender(tensor_queue) + encoder = MsgpackEncoder(oob_tensor_consumer=sender) + + # Verify sender configuration + assert encoder.oob_tensor_consumer is not None, "Consumer should be set" + + # Note: Actual IPC transfer only works across processes + # (tested in test_cpu_tensor_ipc) + # This test just verifies the configuration is correct + + +def test_tensor_cleanup_after_decode(): + """Test that tensors are removed from tracking after successful decode.""" + # Create a tensor queue + tensor_queue = torch_mp.Queue() + + # Create and encode a tensor + tensor = torch.randn(5, 5) + # Move to shared memory for IPC + if not tensor.is_shared(): + tensor.share_memory_() + + # Manually create a TensorIpcData and put it in the queue + sender_id = "test_sender" + message_id = 0 + tensor_id = 0 + ipc_data = TensorIpcData( + sender_id=sender_id, + message_id=message_id, + tensor_id=tensor_id, + tensor=tensor, + ) + tensor_queue.put(ipc_data) + + # Create receiver directly + receiver = TensorIpcReceiver(tensor_queue) + + handle = { + "sender_id": sender_id, + "message_id": message_id, + "tensor_id": tensor_id, + } + + # Receive the tensor - this should retrieve it from the queue + decoded_tensor = receiver( + str(tensor.dtype).removeprefix("torch."), tensor.shape, handle + ) + + # Verify the tensor was decoded + assert decoded_tensor.shape == tensor.shape, "Decoded tensor should match shape" + + # 
Verify the tensor was removed from buffer after decode + sender = receiver._tensor_buffers[sender_id] + tensors = sender.tensors.get(message_id, {}) + assert tensor_id not in tensors, "Tensor should be removed from buffer" diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 6ea65c6944b0..c4a55c8370e0 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -378,3 +378,65 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, swap_list: lis ref_input_batch.refresh_metadata() _compare_objs(input_batch, ref_input_batch) + + +def _construct_pooling_request(req_id_suffix: int): + from vllm.pooling_params import PoolingParams + + prompt_token_ids = [ + np.random.randint(0, VOCAB_SIZE) + for _ in range(np.random.randint(10, MAX_PROMPT_SIZE)) + ] + return CachedRequestState( + req_id=f"pool_req_{req_id_suffix}", + prompt_token_ids=prompt_token_ids, + sampling_params=None, + pooling_params=PoolingParams(task="classify"), + mm_features=[], + block_ids=([],), + generator=None, + num_computed_tokens=0, + output_token_ids=[], + ) + + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_pooling_prompt_lens_not_aliased(device: str): + """Verify that prompt_lens in PoolingMetadata does not share memory + with the internal num_prompt_tokens pinned buffer. Guards against possible + non-determinism in pooling metadata due to mutations to the internal buffer. 
+ """ + batch_size = 4 + input_batch = InputBatch( + max_num_reqs=batch_size * 2, + max_model_len=MAX_PROMPT_SIZE + NUM_OUTPUT_TOKENS, + max_num_batched_tokens=batch_size * (MAX_PROMPT_SIZE + NUM_OUTPUT_TOKENS), + device=torch.device(device), + pin_memory=is_pin_memory_available(), + vocab_size=VOCAB_SIZE, + block_sizes=[16], + kernel_block_sizes=[16], + is_pooling_model=True, + ) + + reqs = [] + # Add requests + for i in range(batch_size): + req = _construct_pooling_request(i) + input_batch.add_request(req) + reqs.append(req) + input_batch.refresh_metadata() + + # prompt_lens must be a snapshot + metadata = input_batch.get_pooling_metadata() + prompt_lens_snapshot = metadata.prompt_lens.clone() + + # Mutate the internal buffer (simulates next batch adding new requests) + input_batch.num_prompt_tokens_cpu_tensor.fill_(999) + + # prompt_lens must be unaffected by the mutation + assert torch.equal(metadata.prompt_lens, prompt_lens_snapshot), ( + "prompt_lens shares memory with internal pinned buffer; " + "mutations to num_prompt_tokens_cpu_tensor corrupted prompt_lens. 
" + f"Expected {prompt_lens_snapshot}, got {metadata.prompt_lens}" + ) diff --git a/tests/v1/worker/test_mamba_utils.py b/tests/v1/worker/test_mamba_utils.py index df3b7de9b4c9..c5d0661476e3 100644 --- a/tests/v1/worker/test_mamba_utils.py +++ b/tests/v1/worker/test_mamba_utils.py @@ -36,6 +36,7 @@ def test_resumed_req_ids_cleared_from_mamba_state_idx(): spec = MagicMock(block_size=64, num_speculative_blocks=0) cache_config = MagicMock(enable_prefix_caching=True) input_batch = MagicMock(req_ids=[]) + copy_bufs = MagicMock(mamba_group_ids=[0], mamba_spec=spec) mamba_state_idx = { "finished": 1, @@ -62,7 +63,7 @@ def test_resumed_req_ids_cleared_from_mamba_state_idx(): {}, {}, (), - MagicMock(), + copy_bufs, ) assert mamba_state_idx == {"keep": 99} diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index b527ffcf9b18..24a5b9bee3f5 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -8,15 +8,10 @@ import numpy.typing as npt -from vllm.utils.import_utils import PlaceholderModule +from vllm.multimodal.media.audio import load_audio from .base import VLLM_S3_BUCKET_URL, get_vllm_public_assets -try: - import librosa -except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] - ASSET_DIR = "multimodal_asset" AudioAssetName = Literal["winning_call", "mary_had_lamb"] @@ -33,7 +28,7 @@ def filename(self) -> str: @property def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: audio_path = get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR) - return librosa.load(audio_path, sr=None) + return load_audio(audio_path, sr=None) def get_local_path(self) -> Path: return get_vllm_public_assets(filename=self.filename, s3_prefix=ASSET_DIR) diff --git a/vllm/assets/video.py b/vllm/assets/video.py index d025368cbd43..f5e443db978f 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -10,15 +10,10 @@ from huggingface_hub import hf_hub_download from PIL import Image -from vllm.utils.import_utils import 
PlaceholderModule +from vllm.multimodal.media.audio import load_audio_pyav from .base import get_cache_dir -try: - import librosa -except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] - @lru_cache def download_video_asset(filename: str) -> str: @@ -146,4 +141,4 @@ def get_audio(self, sampling_rate: float | None = None) -> npt.NDArray: See also: examples/offline_inference/qwen2_5_omni/only_thinker.py """ - return librosa.load(self.video_path, sr=sampling_rate)[0] + return load_audio_pyav(self.video_path, sr=sampling_rate)[0] diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 1e0a63dd6eb3..8304e8703b55 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -38,6 +38,7 @@ from vllm.lora.request import LoRARequest from vllm.lora.utils import get_adapter_absolute_path from vllm.multimodal import MultiModalDataDict +from vllm.multimodal.audio import get_audio_duration from vllm.multimodal.image import convert_image_mode from vllm.tokenizers import TokenizerLike from vllm.utils.argparse_utils import FlexibleArgumentParser @@ -54,10 +55,6 @@ except ImportError: pd = PlaceholderModule("pandas") -try: - import librosa -except ImportError: - librosa = PlaceholderModule("librosa") logger = logging.getLogger(__name__) @@ -3253,7 +3250,7 @@ def sample( break audio = item["audio"] y, sr = audio["array"], audio["sampling_rate"] - duration_s = librosa.get_duration(y=y, sr=sr) + duration_s = get_audio_duration(y=y, sr=sr) if duration_s < asr_min_audio_len_sec or duration_s > asr_max_audio_len_sec: skipped += 1 continue diff --git a/vllm/compilation/caching.py b/vllm/compilation/caching.py index 2b667344ff37..c089f02a37ff 100644 --- a/vllm/compilation/caching.py +++ b/vllm/compilation/caching.py @@ -11,10 +11,13 @@ from unittest.mock import patch import torch +from torch._subclasses import FakeTensorMode +from torch.fx._graph_pickler import GraphPickler, Options from torch.utils import _pytree as 
pytree import vllm.envs as envs from vllm.compilation.compiler_interface import get_inductor_factors +from vllm.compilation.counter import compilation_counter from vllm.config import VllmConfig, get_current_vllm_config from vllm.config.utils import hash_factors from vllm.logger import init_logger @@ -59,6 +62,7 @@ def insert(self, submod_name: str, shape: str, entry: bytes) -> None: self.submodule_bytes[f"{submod_name}_{shape}"] = hex_digest if hex_digest not in self.submodule_bytes_store: self.submodule_bytes_store[hex_digest] = entry + compilation_counter.num_compiled_artifacts_saved += 1 logger.debug( "inserting new artifact for submod %s with shape %s " "(%s bytes) at hash %s", @@ -122,6 +126,7 @@ def load_all(self) -> None: def _load_entry(entry_bytes: bytes) -> AOTCompiledArtifact: entry = pickle.loads(entry_bytes) + compilation_counter.num_compiled_artifacts_loaded += 1 return AOTCompiledArtifact.deserialize(entry) with concurrent.futures.ThreadPoolExecutor() as executor: @@ -206,26 +211,8 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any: return self.optimized_call(*args, **kwargs) @classmethod - def serialize_compile_artifacts( - cls, compiled_fn: "VllmSerializableFunction" - ) -> bytes: + def serialize_graph_module(cls, graph_module: torch.fx.GraphModule) -> bytes: import sympy - from torch._subclasses import FakeTensorMode - from torch.fx._graph_pickler import GraphPickler, Options - - state = compiled_fn.__dict__.copy() - state.pop("optimized_call") - state.pop("shape_env") - state.pop("vllm_backend", None) - state.pop("_fake_mode", None) - for node in state["graph_module"].graph.nodes: - node.meta.pop("source_fn_stack", None) - node.meta.pop("nn_module_stack", None) - for name, submod in state["graph_module"].named_children(): - if hasattr(submod, "graph"): - for node in submod.graph.nodes: - node.meta.pop("source_fn_stack", None) - node.meta.pop("nn_module_stack", None) graph_reducer_override = GraphPickler.reducer_override @@ -242,6 +229,37 @@ 
def _graph_reducer_override( return type(None), () return graph_reducer_override(self, obj) + with ( + patch.object(GraphPickler, "reducer_override", _graph_reducer_override), + patch_pytree_map_over_slice(), + ): + return GraphPickler.dumps(graph_module, Options(ops_filter=None)) + + @classmethod + def deserialize_graph_module( + cls, data: bytes, fake_mode: FakeTensorMode + ) -> torch.fx.GraphModule: + with patch_pytree_map_over_slice(): + return GraphPickler.loads(data, fake_mode) + + @classmethod + def serialize_compile_artifacts( + cls, compiled_fn: "VllmSerializableFunction" + ) -> bytes: + state = compiled_fn.__dict__.copy() + state.pop("optimized_call") + state.pop("shape_env") + state.pop("vllm_backend", None) + state.pop("_fake_mode", None) + for node in state["graph_module"].graph.nodes: + node.meta.pop("source_fn_stack", None) + node.meta.pop("nn_module_stack", None) + for name, submod in state["graph_module"].named_children(): + if hasattr(submod, "graph"): + for node in submod.graph.nodes: + node.meta.pop("source_fn_stack", None) + node.meta.pop("nn_module_stack", None) + if state.get("sym_tensor_indices"): # put tensor inputs on meta device since their data # isn't needed, yet we need the meta for make_copy_and_call @@ -257,14 +275,9 @@ def _graph_reducer_override( lambda inp: torch.empty_like(inp, device="meta"), state["example_inputs"], ) - with ( - patch.object(GraphPickler, "reducer_override", _graph_reducer_override), - patch_pytree_map_over_slice(), - ): - state["graph_module"] = GraphPickler.dumps( - state["graph_module"], Options(ops_filter=None) - ) - state["example_inputs"] = GraphPickler.dumps(state["example_inputs"]) + + state["graph_module"] = cls.serialize_graph_module(state["graph_module"]) + state["example_inputs"] = GraphPickler.dumps(state["example_inputs"]) if compiled_fn.vllm_backend: ( @@ -280,14 +293,14 @@ def _graph_reducer_override( @classmethod def deserialize_compile_artifacts(cls, data: bytes) -> "VllmSerializableFunction": 
from torch._guards import TracingContext, tracing - from torch._subclasses import FakeTensorMode - from torch.fx._graph_pickler import GraphPickler from torch.fx.experimental.symbolic_shapes import ShapeEnv state = pickle.loads(data) fake_mode = FakeTensorMode(shape_env=ShapeEnv()) - with patch_pytree_map_over_slice(): - state["graph_module"] = GraphPickler.loads(state["graph_module"], fake_mode) + + state["graph_module"] = cls.deserialize_graph_module( + state["graph_module"], fake_mode + ) state["graph_module"].recompile() state["example_inputs"] = GraphPickler.loads(state["example_inputs"], fake_mode) diff --git a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py index f141a7c171f7..d55b305992e9 100644 --- a/vllm/compilation/passes/fusion/allreduce_rms_fusion.py +++ b/vllm/compilation/passes/fusion/allreduce_rms_fusion.py @@ -62,6 +62,11 @@ 4: 32, # 32MB 8: 1, # 1MB }, + 103: { + 2: 64, # 64MB + 4: 64, # 64MB + 8: 2, # 2MB + }, } # Max size of the input tensor per world size per device capability @@ -78,6 +83,11 @@ 4: 4, # 4MB 8: 1, # 1MB }, + 103: { + 2: 32, # 32MB + 4: 4, # 4MB + 8: 2, # 2MB + }, } @@ -86,8 +96,6 @@ destroy_fi_ar_workspace, get_fi_ar_quant_workspace, get_fi_ar_workspace, - initialize_fi_ar_quant_workspace, - initialize_fi_ar_workspace, ) ar_fusion_patterns = flashinfer_comm.AllReduceFusionPattern @@ -133,15 +141,23 @@ def call_trtllm_fused_allreduce_norm( # Select workspace based on pattern: quant patterns use the # trtllm quant workspace, non-quant patterns use the primary workspace. 
- if pattern_code in ( + is_quant_pattern = pattern_code in ( ar_fusion_patterns.kARResidualRMSNormFP8Quant, ar_fusion_patterns.kARResidualRMSNormFP4Quant, - ): - workspace = get_fi_ar_quant_workspace() - else: - workspace = get_fi_ar_workspace() + ) + get_workspace_fn = ( + get_fi_ar_quant_workspace if is_quant_pattern else get_fi_ar_workspace + ) + workspace = get_workspace_fn( + world_size=world_size, + rank=get_tensor_model_parallel_rank(), + max_token_num=max_token_num, + hidden_dim=hidden_size, + dtype=allreduce_in.dtype, + group=get_tp_group().device_group, + ) assert workspace is not None, ( - "Flashinfer workspace must be initialized when using flashinfer" + "Flashinfer allreduce workspace must be initialized when using flashinfer" ) assert flashinfer_comm is not None if norm_out is None: @@ -753,35 +769,29 @@ def __init__(self, config: VllmConfig) -> None: scope="global", ) - for workspace_init_fn in [ - initialize_fi_ar_workspace, - initialize_fi_ar_quant_workspace, - ]: - try: - workspace_init_fn( - world_size=self.tp_size, - rank=rank, - max_token_num=self.max_token_num, - hidden_dim=self.hidden_dim, - dtype=self.model_dtype, - group=self.group, - ) - except Exception as e: - if "multicast" in str(e).lower(): - logger.warning( - "AllReduce fusion pass is disabled: flashinfer workspace " - "creation failed: %s. This is expected on GPUs without " - "NVSwitch (e.g., NVLink bridge-only or PCIe topologies). " - "Falling back to non-fused allreduce.", - str(e), - ) - else: - logger.warning( - "Failed to initialize FlashInfer All Reduce workspace: %s. " - "AllReduce fusion pass will be disabled.", - e, - ) - return + workspace_kwargs = dict( + world_size=self.tp_size, + rank=rank, + max_token_num=self.max_token_num, + hidden_dim=self.hidden_dim, + dtype=self.model_dtype, + group=self.group, + ) + if get_fi_ar_workspace(**workspace_kwargs) is None: + logger.warning_once( + "Failed to initialize Flashinfer allreduce workspace. 
" + "Flashinfer allreduce-norm fusion will be disabled." + ) + return + + self.supports_quant_fusion = ( + get_fi_ar_quant_workspace(**workspace_kwargs) is not None + ) + if not self.supports_quant_fusion: + logger.warning_once( + "Failed to initialize Flashinfer allreduce workspace. " + "Flashinfer allreduce-norm-quant fusion will be disabled." + ) self.allreduce_params = FlashInferFusedAllReduceParams( world_size=self.tp_size, @@ -793,9 +803,8 @@ def __init__(self, config: VllmConfig) -> None: @enable_fake_mode def register_patterns(self) -> None: - supports_quantization = get_fi_ar_quant_workspace() is not None for epsilon in [1e-5, 1e-6]: - if supports_quantization: + if self.supports_quant_fusion: AllReduceFusedRMSNormStaticQuantFP8Pattern( epsilon, self.model_dtype, diff --git a/vllm/config/model.py b/vllm/config/model.py index b12202f9c712..19787f80b50f 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -14,7 +14,12 @@ from vllm.config.model_arch import ( ModelArchitectureConfig, ) -from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig +from vllm.config.multimodal import ( + MMCacheType, + MMEncoderTPMode, + MMTensorIPC, + MultiModalConfig, +) from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import RunnerType from vllm.config.utils import config, getattr_iter @@ -310,6 +315,7 @@ class ModelConfig: interleave_mm_strings: InitVar[bool | None] = None skip_mm_profiling: InitVar[bool | None] = None video_pruning_rate: InitVar[float | None] = None + mm_tensor_ipc: InitVar[MMTensorIPC] = None def compute_hash(self) -> str: """ @@ -430,6 +436,7 @@ def __post_init__( interleave_mm_strings: bool | None, skip_mm_profiling: bool | None, video_pruning_rate: float | None, + mm_tensor_ipc: MMTensorIPC, ) -> None: # Keep set served_model_name before maybe_model_redirect(self.model) self.served_model_name = get_served_model_name( @@ -612,6 +619,7 @@ def __post_init__( interleave_mm_strings=interleave_mm_strings, 
skip_mm_profiling=skip_mm_profiling, video_pruning_rate=video_pruning_rate, + mm_tensor_ipc=mm_tensor_ipc, ) mm_config_kwargs = { @@ -1112,6 +1120,22 @@ def verify_with_parallel_config( f"({parallel_config.decode_context_parallel_size})." ) + # torch_shm uses a single IPC queue to rank 0; DP>1 is + # incompatible because API servers can't know which + # CoreEngine the scheduler will assign work to. TP>1 is + # also not supported because this requires broadcasting + # MM tensors between all TP ranks. + if ( + self.multimodal_config is not None + and self.multimodal_config.mm_tensor_ipc == "torch_shm" + and parallel_config.world_size_across_dp > 1 + ): + raise ValueError( + "mm_tensor_ipc='torch_shm' is not supported with " + "data_parallel_size > 1 or tensor_parallel_size > 1 " + "or pipeline_parallel_size > 1." + ) + def get_sliding_window(self) -> int | None: """Get the sliding window size from the HF text config if present.""" return getattr(self.hf_text_config, "sliding_window", None) @@ -1435,10 +1459,10 @@ def requires_raw_input_tokens(self) -> bool: @property def score_type(self) -> ScoreType: """ - Score API handles score/rerank for: - - "score" task (score_type: cross-encoder models) - - "embed" task (score_type: bi-encoder models) - - "token_embed" task (score_type: late interaction models) + Scoring API handles score/rerank for:\n + - "classify" task (score_type: cross-encoder models)\n + - "embed" task (score_type: bi-encoder models)\n + - "token_embed" task (score_type: late interaction models)\n """ # fixme: self._model_info.score_type is the score type before # as_seq_cls_model, which is "bi-encoder", rather than the diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index f95a2e140c67..1c9bc43b01ca 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -59,6 +59,7 @@ class MultiModalDummyOptionsBuiltins(TypedDict, total=False): MMEncoderTPMode = Literal["weights", "data"] MMCacheType = Literal["shm", "lru"] 
+MMTensorIPC = Literal["direct_rpc", "torch_shm"] MMDummyOptions: TypeAlias = dict[str, BaseDummyOptions] """ A dictionary containing an entry for each modality type of dummy data. @@ -172,6 +173,11 @@ class MultiModalConfig: Value sits in range [0;1) and determines fraction of media tokens from each video to be pruned. """ + mm_tensor_ipc: MMTensorIPC = "direct_rpc" + """IPC (inter-process communication) method for multimodal tensors. + - "direct_rpc": Use msgspec serialization via RPC + - "torch_shm": Use torch.multiprocessing shared memory for zero-copy IPC + Defaults to "direct_rpc". """ @field_validator("limit_per_prompt", mode="before") @classmethod diff --git a/vllm/config/profiler.py b/vllm/config/profiler.py index 6a40b9daddc0..e79e213106db 100644 --- a/vllm/config/profiler.py +++ b/vllm/config/profiler.py @@ -45,10 +45,10 @@ class ProfilerConfig: worker's traces (CPU & GPU) will be saved under this directory. Note that it must be an absolute path.""" - torch_profiler_with_stack: bool = False - """If `True`, enables stack tracing in the torch profiler. Disabled by default - to reduce overhead. Can be enabled via VLLM_TORCH_PROFILER_WITH_STACK=1 env var - or --profiler-config.torch_profiler_with_stack=true CLI flag.""" + torch_profiler_with_stack: bool = True + """If `True`, enables stack tracing in the torch profiler. Enabled by default + as it is useful for debugging. Can be disabled via + --profiler-config.torch_profiler_with_stack=false CLI flag.""" torch_profiler_with_flops: bool = False """If `True`, enables FLOPS counting in the torch profiler. 
Disabled by default.""" diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index f525ac871c3e..a178a8f54bc2 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -120,7 +120,7 @@ def enable_allreduce_rms_fusion(cfg: "VllmConfig") -> bool: and current_platform.is_cuda() and has_flashinfer() and ( - current_platform.is_device_capability(100) + current_platform.is_device_capability_family(100) or current_platform.is_device_capability(90) ) # tp-dp combination broken: @@ -766,6 +766,17 @@ def __post_init__(self): else: self.parallel_config.disable_nccl_for_dp_synchronization = False + if ( + self.model_config is not None + and self.model_config.multimodal_config is not None + and self.model_config.multimodal_config.mm_tensor_ipc == "torch_shm" + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn" + ): + raise ValueError( + "torch_shm is known to fail without " + "VLLM_WORKER_MULTIPROC_METHOD set to spawn" + ) + from vllm.platforms import current_platform if ( diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py index 0cdff90320da..075f4e0859e4 100644 --- a/vllm/distributed/device_communicators/all2all.py +++ b/vllm/distributed/device_communicators/all2all.py @@ -10,6 +10,7 @@ from vllm.distributed import get_dp_group, get_ep_group from vllm.forward_context import get_forward_context from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.utils.flashinfer import ( has_flashinfer_nvlink_one_sided, has_flashinfer_nvlink_two_sided, @@ -325,14 +326,20 @@ def _make_all2all_kwargs(self) -> dict[Any, Any]: assert num_rdma_bytes is not None assert num_qps_per_rank is not None - return dict( + # TODO: remove platform-specific logic + # once ROCm DeepEP is updated with the latest APIs. 
+ kwargs = dict( group=self.cpu_group, num_nvl_bytes=num_nvl_bytes, num_rdma_bytes=num_rdma_bytes, low_latency_mode=False, num_qps_per_rank=num_qps_per_rank, - explicitly_destroy=True, ) + if not current_platform.is_rocm(): + kwargs.update( + explicitly_destroy=True, + ) + return kwargs def get_handle(self, kwargs): assert len(kwargs) == 0, ( @@ -397,16 +404,22 @@ def _make_all2all_kwargs( ) assert num_rdma_bytes is not None - return dict( + # TODO: remove platform-specific logic + # once ROCm DeepEP is updated with the latest APIs. + kwargs = dict( group=self.cpu_group, num_nvl_bytes=num_nvl_bytes, num_rdma_bytes=num_rdma_bytes, low_latency_mode=True, num_qps_per_rank=num_qps_per_rank, - allow_nvlink_for_low_latency_mode=True, - allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL, - explicitly_destroy=True, ) + if not current_platform.is_rocm(): + kwargs.update( + allow_nvlink_for_low_latency_mode=True, + allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL, + explicitly_destroy=True, + ) + return kwargs def get_handle(self, kwargs): """ diff --git a/vllm/distributed/device_communicators/all_reduce_utils.py b/vllm/distributed/device_communicators/all_reduce_utils.py index 3c347ef756d4..9777be5aa7f8 100644 --- a/vllm/distributed/device_communicators/all_reduce_utils.py +++ b/vllm/distributed/device_communicators/all_reduce_utils.py @@ -44,6 +44,12 @@ 6: 1 * MiB, # 1 MB 8: 1 * MiB, # 1 MB }, + "10.3": { + 2: 4 * MiB, # 4 MB + 4: 4 * MiB, # 4 MB + 6: 8 * MiB, # 8 MB + 8: 4 * MiB, # 4 MB + }, } SYMM_MEM_ALL_REDUCE_MAX_SIZES = { @@ -59,6 +65,12 @@ 6: 128 * MiB, # 128 MB 8: 128 * MiB, # 128 MB }, + "10.3": { + 2: 4 * MiB, # 4 MB + 4: 32 * MiB, # 32 MB + 6: 32 * MiB, # 32 MB + 8: 64 * MiB, # 64 MB + }, } # NCCL symmetric memory allreduce configuration based on H100 and GB200 benchmarks. 
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index bd5741e8dc72..4550bdb25629 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -338,6 +338,7 @@ def broadcast(self, tensor: torch.Tensor, src: int = 0) -> torch.Tensor: def destroy(self): if self.pynccl_comm is not None: + self.pynccl_comm.destroy() self.pynccl_comm = None if self.ca_comm is not None: self.ca_comm = None diff --git a/vllm/distributed/device_communicators/flashinfer_all_reduce.py b/vllm/distributed/device_communicators/flashinfer_all_reduce.py index 66e089182869..b2edfc15d731 100644 --- a/vllm/distributed/device_communicators/flashinfer_all_reduce.py +++ b/vllm/distributed/device_communicators/flashinfer_all_reduce.py @@ -29,50 +29,27 @@ except ImportError: pass -# Global workspace for standalone allreduce and non-quant ar+rms fusion +# Workspace for standalone allreduce and non-quant ar+rms fusion _fi_ar_workspace = None # Extra workspace for quant fusion patterns (only supported by trtllm backend) -# Only created if primary workspace is not already trtllm _fi_ar_quant_workspace = None -def get_fi_ar_workspace(): - return _fi_ar_workspace - - -def get_fi_ar_quant_workspace(): - return _fi_ar_quant_workspace - - -def initialize_fi_ar_workspace( +def _create_workspace( + backend: str, world_size: int, rank: int, max_token_num: int, hidden_dim: int, dtype: torch.dtype, group: ProcessGroup, -) -> None: - """ - Initialize the workspace if not already initialized. - - Currently, this function is called by either the AllReduceFusionPass - or the FlashInferAllReduce backend for standalone allreduce. - If the fusion pass is enabled via - --compilation-config.pass_config.fuse_allreduce_rms=true, - it will create the workspace first, and the standalone backend - will reuse the workspace. 
Otherwise, the standalone backend will - create the workspace. - """ - global _fi_ar_workspace - if _fi_ar_workspace is not None: - return - - backend = envs.VLLM_FLASHINFER_ALLREDUCE_BACKEND +): + """Create a flashinfer allreduce workspace, returning None on failure.""" comm_backend = TorchDistBackend(group=group) rng_state = random.getstate() try: random.seed(int.from_bytes(os.urandom(16), byteorder="big")) - _fi_ar_workspace = flashinfer_comm.create_allreduce_fusion_workspace( + workspace = flashinfer_comm.create_allreduce_fusion_workspace( backend=backend, world_size=world_size, rank=rank, @@ -81,9 +58,22 @@ def initialize_fi_ar_workspace( dtype=dtype, comm_backend=comm_backend, ) + except Exception as e: + if "multicast" in str(e).lower(): + logger.warning_once( + "Failed to initialize FlashInfer All Reduce workspace: %s. " + "This is expected on GPUs without NVSwitch (e.g., NVLink " + "bridge-only or PCIe topologies).", + e, + ) + else: + logger.warning_once( + "Failed to initialize FlashInfer All Reduce workspace: %s.", + e, + ) + return None finally: random.setstate(rng_state) - assert _fi_ar_workspace is not None logger.debug( "Initialized FlashInfer All Reduce workspace: backend=%s, " "world_size=%d, rank=%d, max_token_num=%d, hidden_dim=%d, dtype=%s", @@ -94,70 +84,84 @@ def initialize_fi_ar_workspace( hidden_dim, dtype, ) + return workspace + + +def get_fi_ar_workspace( + world_size: int, + rank: int, + max_token_num: int, + hidden_dim: int, + dtype: torch.dtype, + group: ProcessGroup, +): + """ + Return the allreduce workspace for non-quant patterns, initializing if needed. + + Used by AllReduceFusionPass (non-quant patterns) and FlashInferAllReduce + for standalone allreduce. Backend is controlled by + VLLM_FLASHINFER_ALLREDUCE_BACKEND env var. 
+ """ + global _fi_ar_workspace + if _fi_ar_workspace is not None: + return _fi_ar_workspace + + backend = envs.VLLM_FLASHINFER_ALLREDUCE_BACKEND + + # Reuse the quant workspace if it was already created with the same backend + if _fi_ar_quant_workspace is not None and _fi_ar_quant_workspace.backend == backend: + _fi_ar_workspace = _fi_ar_quant_workspace + return _fi_ar_workspace + + _fi_ar_workspace = _create_workspace( + backend, world_size, rank, max_token_num, hidden_dim, dtype, group + ) + return _fi_ar_workspace -def initialize_fi_ar_quant_workspace( +def get_fi_ar_quant_workspace( world_size: int, rank: int, max_token_num: int, hidden_dim: int, dtype: torch.dtype, group: ProcessGroup, -) -> None: +): """ - Initialize the workspace used by quantization fusion patterns. + Return the allreduce workspace for quant patterns, initializing if needed. - Currently this always creates a workspace for trtllm backend as only it - supports quantization fusion (FP8/FP4). If the primary workspace - is already trtllm, the quant workspace aliases to it. + Always uses trtllm backend as it is the only one supporting quantization + fusion (FP8/FP4). 
""" global _fi_ar_quant_workspace if _fi_ar_quant_workspace is not None: - return + return _fi_ar_quant_workspace - # If primary workspace is already trtllm, reuse it + # Reuse the non-quant workspace if it was already created with trtllm if _fi_ar_workspace is not None and _fi_ar_workspace.backend == "trtllm": _fi_ar_quant_workspace = _fi_ar_workspace - return + return _fi_ar_quant_workspace - comm_backend = TorchDistBackend(group=group) - _fi_ar_quant_workspace = flashinfer_comm.create_allreduce_fusion_workspace( - backend="trtllm", - world_size=world_size, - rank=rank, - max_token_num=max_token_num, - hidden_dim=hidden_dim, - dtype=dtype, - comm_backend=comm_backend, - ) - assert _fi_ar_quant_workspace is not None - logger.debug( - "Initialized FlashInfer All Reduce workspace: backend=trtllm, " - "world_size=%d, rank=%d, max_token_num=%d, hidden_dim=%d, dtype=%s", - world_size, - rank, - max_token_num, - hidden_dim, - dtype, + _fi_ar_quant_workspace = _create_workspace( + "trtllm", world_size, rank, max_token_num, hidden_dim, dtype, group ) + return _fi_ar_quant_workspace _fi_ar_workspace_lock = threading.Lock() def destroy_fi_ar_workspace(): - global _fi_ar_workspace - global _fi_ar_quant_workspace + global _fi_ar_workspace, _fi_ar_quant_workspace with _fi_ar_workspace_lock: - if ( - _fi_ar_quant_workspace is not None - and _fi_ar_quant_workspace is not _fi_ar_workspace - ): - _fi_ar_quant_workspace.destroy() - _fi_ar_quant_workspace = None + is_alias = _fi_ar_workspace is _fi_ar_quant_workspace + if _fi_ar_workspace is not None: _fi_ar_workspace.destroy() - _fi_ar_workspace = None + if _fi_ar_quant_workspace is not None and not is_alias: + _fi_ar_quant_workspace.destroy() + + _fi_ar_workspace = _fi_ar_quant_workspace = None atexit.register(destroy_fi_ar_workspace) @@ -209,29 +213,21 @@ def __init__( def _ensure_workspace(self, hidden_dim: int, dtype: torch.dtype) -> bool: """Ensure the all reduce workspace is initialized.""" - if get_fi_ar_workspace() is not 
None: - return True if self.max_num_tokens == 0: element_size = torch.tensor([], dtype=dtype, device="cpu").element_size() self.max_num_tokens = self.max_workspace_size // (hidden_dim * element_size) - try: - initialize_fi_ar_workspace( - world_size=self.world_size, - rank=self.rank, - max_token_num=self.max_num_tokens, - hidden_dim=hidden_dim, - dtype=dtype, - group=self.group, - ) - return True - except Exception as e: - logger.warning( - "Failed to initialize FlashInfer All Reduce workspace: %s. " - "FlashInfer All Reduce will be disabled.", - e, - ) + workspace = get_fi_ar_workspace( + world_size=self.world_size, + rank=self.rank, + max_token_num=self.max_num_tokens, + hidden_dim=hidden_dim, + dtype=dtype, + group=self.group, + ) + if workspace is None: self.disabled = True return False + return True def should_use_fi_ar(self, input_tensor: torch.Tensor) -> bool: if self.disabled: @@ -257,7 +253,15 @@ def should_use_fi_ar(self, input_tensor: torch.Tensor) -> bool: return self._ensure_workspace(hidden_dim, input_tensor.dtype) def all_reduce(self, input_tensor: torch.Tensor) -> torch.Tensor: - workspace = get_fi_ar_workspace() + _, hidden_dim = input_tensor.shape + workspace = get_fi_ar_workspace( + world_size=self.world_size, + rank=self.rank, + max_token_num=self.max_num_tokens, + hidden_dim=hidden_dim, + dtype=input_tensor.dtype, + group=self.group, + ) return flashinfer_comm.allreduce_fusion( input=input_tensor, workspace=workspace, diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 84a032541015..6ac3b9ea3c7c 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -145,6 +145,13 @@ def __init__( stream.synchronize() del data + def destroy(self): + if self.available and not self.disabled: + with torch.accelerator.device_index(self.device.index): + self.nccl.ncclCommDestroy(self.comm) + self.available = False + self.disabled = True + 
def all_reduce( self, in_tensor: torch.Tensor, diff --git a/vllm/distributed/device_communicators/symm_mem.py b/vllm/distributed/device_communicators/symm_mem.py index 98c7ac20a171..c25ff8cf1a8d 100644 --- a/vllm/distributed/device_communicators/symm_mem.py +++ b/vllm/distributed/device_communicators/symm_mem.py @@ -28,6 +28,7 @@ class SymmMemCommunicator: _WORLD_SIZES_MULTIMEM = { "9.0": [4, 6, 8], "10.0": [6, 8], + "10.3": [6, 8], } def __init__( diff --git a/vllm/distributed/elastic_ep/elastic_execute.py b/vllm/distributed/elastic_ep/elastic_execute.py index 00ac6d84b425..8b05c58eaec5 100644 --- a/vllm/distributed/elastic_ep/elastic_execute.py +++ b/vllm/distributed/elastic_ep/elastic_execute.py @@ -145,11 +145,37 @@ def execute(self, execute_method: str, *args, **kwargs): raise ValueError(f"Unknown execute method: {execute_method}") return method(*args, **kwargs) + def _set_eplb_suppressed(self, suppressed: bool) -> None: + self.worker.model_runner.eep_eplb_suppressed = suppressed + ep_group = get_standby_ep_group() or get_ep_group() + if ep_group.rank == 0: + logger.info( + "[Elastic EP] EPLB %s elastic scaling transition", + "disabled during" if suppressed else "re-enabled after", + ) + + def load_model(self) -> None: + ( + expanded_physical_to_logical, + num_logical_experts, + old_num_physical_experts, + ) = self.receive_expert_mapping() + num_physical_experts = expanded_physical_to_logical.shape[1] + self.worker.parallel_config.eplb_config.num_redundant_experts = ( + num_physical_experts - num_logical_experts + ) + self.worker.load_model(load_dummy_weights=True) + self.worker.model_runner.setup_eplb_from_mapping( + expanded_physical_to_logical, old_num_physical_experts + ) + self._set_eplb_suppressed(True) + def create_standby_groups( self, reconfig_request: ReconfigureDistributedRequest ) -> None: self.reconfig_request = reconfig_request new_dp_size = reconfig_request.new_data_parallel_size + old_dp_size = get_dp_group().world_size world_size = 
self.worker.vllm_config.parallel_config.world_size new_world_size_across_dp = world_size * new_dp_size updated_config = copy.copy(self.worker.vllm_config) @@ -165,11 +191,8 @@ def create_standby_groups( coord_store_port=reconfig_request.coord_store_port, enable_eplb=updated_config.parallel_config.enable_eplb, ) - self.worker.model_runner.eep_eplb_suppressed = True - standby_ep_group = get_standby_ep_group() - assert standby_ep_group is not None - if standby_ep_group.rank == 0: - logger.info("[Elastic EP] EPLB disabled during elastic scaling transition") + if new_dp_size > old_dp_size: + self._set_eplb_suppressed(True) def transfer_weights(self, old_dp_size: int, new_dp_size: int) -> None: standby_dp_group = get_standby_dp_group() @@ -237,13 +260,31 @@ def broadcast_expert_mapping(self) -> None: device=self.worker.device, ) + def _release_cuda_graphs(self) -> None: + if isinstance(self.worker.model_runner.model, CUDAGraphWrapper): + wrapper = self.worker.model_runner.model + wrapper.concrete_cudagraph_entries = {} + + elif isinstance(self.worker.model_runner.model, UBatchWrapper): + raise RuntimeError("DBO is not yet supported in elastic EP") + + torch.compiler.reset() + with set_current_vllm_config(self.worker.vllm_config): + reset_compile_wrapper(self.worker.model_runner.get_model()) + + gc.collect() + torch.accelerator.synchronize() + torch.accelerator.empty_cache() + def switch_and_remove(self) -> None: + self._release_cuda_graphs() _replace_active_groups(world=None, dp=None, ep=None, eplb=None, node_count=None) def switch_and_prepare(self) -> None: old_dp_size = get_dp_group().world_size old_ep_size = get_ep_group().world_size + self._release_cuda_graphs() _replace_active_groups(**pop_standby_groups()) parallel_config = self.worker.vllm_config.parallel_config @@ -384,13 +425,6 @@ def switch_and_prepare(self) -> None: compilation_counter.stock_torch_compile_count += 1 self.worker.model_runner.model.compile(fullgraph=True, backend=backend) - # release all 
previously captured CUDA graphs - if isinstance(self.worker.model_runner.model, CUDAGraphWrapper): - wrapper = self.worker.model_runner.model - wrapper.concrete_cudagraph_entries = {} - elif isinstance(self.worker.model_runner.model, UBatchWrapper): - raise RuntimeError("DBO is not yet supported in elastic EP") - multi_block_table = self.worker.model_runner.input_batch.block_table saved_block_tables: list[tuple[torch.Tensor, torch.Tensor]] = [] for bt in multi_block_table.block_tables: @@ -399,14 +433,6 @@ def switch_and_prepare(self) -> None: ) multi_block_table.clear() - # reset the compile wrapper - torch.compiler.reset() - with set_current_vllm_config(self.worker.vllm_config): - reset_compile_wrapper(self.worker.model_runner.get_model()) - - gc.collect() - torch.accelerator.synchronize() - torch.accelerator.empty_cache() unlock_workspace() self.worker.compile_or_warm_up_model() lock_workspace() @@ -416,8 +442,12 @@ def switch_and_prepare(self) -> None: ): bt.block_table.gpu.copy_(saved_gpu) bt.block_table.cpu.copy_(saved_cpu) + if new_dp_size < old_dp_size: + self._set_eplb_suppressed(False) - def perform_eplb_reshuffle(self, new_dp_size: int | None = None) -> None: + def _perform_eplb_reshuffle( + self, rank_mapping: dict[int, int] | None = None + ) -> None: if get_ep_group().rank == 0: logger.info("[Elastic EP] Starting expert resharding...") @@ -428,20 +458,9 @@ def perform_eplb_reshuffle(self, new_dp_size: int | None = None) -> None: eplb_model_state = eplb_state.model_states[model_config.compute_hash()] is_async_enabled = eplb_state.is_async eplb_state.is_async = False - if new_dp_size is None: + if rank_mapping is None: eplb_state.rearrange() else: - # scale down - parallel_config = self.worker.vllm_config.parallel_config - tp_size = parallel_config.tensor_parallel_size - old_ep_size = parallel_config.data_parallel_size * tp_size - new_ep_size = new_dp_size * tp_size - - rank_mapping = { - old_ep_rank: old_ep_rank if old_ep_rank < new_ep_size else -1 - 
for old_ep_rank in range(old_ep_size) - } - eplb_state.rearrange(rank_mapping=rank_mapping) # NOTE(yongji): check whether we need to synchronize here torch.accelerator.synchronize() @@ -451,10 +470,25 @@ def perform_eplb_reshuffle(self, new_dp_size: int | None = None) -> None: eplb_model_state.physical_to_logical_map.shape[1] ) eplb_state.is_async = is_async_enabled - self.worker.model_runner.eep_eplb_suppressed = False if get_ep_group().rank == 0: logger.info("[Elastic EP] Expert resharding completed") + def perform_eplb_reshuffle(self) -> None: + self._perform_eplb_reshuffle() + self._set_eplb_suppressed(False) + + def perform_scale_down_eplb_reshuffle(self, new_dp_size: int) -> None: + self._set_eplb_suppressed(True) + parallel_config = self.worker.vllm_config.parallel_config + tp_size = parallel_config.tensor_parallel_size + old_ep_size = parallel_config.data_parallel_size * tp_size + new_ep_size = new_dp_size * tp_size + rank_mapping = { + old_ep_rank: old_ep_rank if old_ep_rank < new_ep_size else -1 + for old_ep_rank in range(old_ep_size) + } + self._perform_eplb_reshuffle(rank_mapping=rank_mapping) + def receive_weights(self) -> None: dp_group = get_dp_group() assert isinstance(dp_group, StatelessGroupCoordinator) diff --git a/vllm/distributed/elastic_ep/elastic_state.py b/vllm/distributed/elastic_ep/elastic_state.py index cd989a49a2b8..bace771a2ab6 100644 --- a/vllm/distributed/elastic_ep/elastic_state.py +++ b/vllm/distributed/elastic_ep/elastic_state.py @@ -43,9 +43,10 @@ class ScaleUpExistingEngineState(enum.IntEnum): class ScaleUpNewEngineState(enum.IntEnum): - PREPARE = 0 - EPLB_RESHUFFLE = 1 - COMPLETE = 2 + PRE_KV_INIT = 0 + PREPARE = 1 + EPLB_RESHUFFLE = 2 + COMPLETE = 3 class ScaleDownRemainingEngineState(enum.IntEnum): @@ -104,7 +105,7 @@ def __init__( self.state: EngineState if scale_type == "scale_up": self.state = ( - ScaleUpNewEngineState.PREPARE + ScaleUpNewEngineState.PRE_KV_INIT if worker_type == "new" else 
ScaleUpExistingEngineState.WAIT_NEW_CORE_ENGINES_INIT ) @@ -142,6 +143,12 @@ def progress(self) -> bool: else self._progress_remaining_engine() ) + def run_pre_kv_init_states(self) -> None: + assert self.scale_type == "scale_up" and self.worker_type == "new" + assert self.state == ScaleUpNewEngineState.PRE_KV_INIT + assert self.progress() + assert self.state == ScaleUpNewEngineState.PREPARE + def _execute_tcp_store_barrier( self, dp_store, group_rank, group_size, barrier_id, timeout=None ): @@ -303,7 +310,23 @@ def _progress_new_engine(self) -> bool: state = self.state assert self.new_dp_group is not None and self.new_dp_store is not None - if state == ScaleUpNewEngineState.PREPARE: + if state == ScaleUpNewEngineState.PRE_KV_INIT: + self.engine_core._eep_send_engine_core_notification( + EEPNotificationType.NEW_CORE_ENGINES_WEIGHTS_INIT_READY + ) + self.model_executor.collective_rpc( + "elastic_ep_execute", args=("receive_weights",) + ) + self.engine_core.available_gpu_memory_for_kv_cache = ( + ParallelConfig.sync_kv_cache_memory_size(self.new_dp_group, -1) + ) + self.model_executor.collective_rpc( + "elastic_ep_execute", args=("prepare_new_worker",) + ) + self.state = ScaleUpNewEngineState.PREPARE + return True + + elif state == ScaleUpNewEngineState.PREPARE: tensor = torch.tensor([0, 0, 0], dtype=torch.int32, device="cpu") torch.distributed.all_reduce( tensor, @@ -403,7 +426,6 @@ def _progress_removing_engine(self) -> bool: self.engine_core._eep_send_engine_core_notification( EEPNotificationType.SHUTDOWN_COMPLETE ) - self.engine_core.shutdown() return True else: @@ -525,7 +547,7 @@ def _eplb_reshuffle_before_scale_down(self): self.model_executor.collective_rpc( "elastic_ep_execute", args=( - "perform_eplb_reshuffle", + "perform_scale_down_eplb_reshuffle", self.reconfig_request.new_data_parallel_size, ), ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py index db77d41c487f..faaffd72eca3 
100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/metrics.py @@ -126,28 +126,17 @@ def __init__( self._labelnames = labelnames self.per_engine_labelvalues = per_engine_labelvalues - def make_per_engine(self, metric: PromMetric) -> dict[int, PromMetric]: - """ - Create a per-engine child of a prometheus_client.Metric with - the appropriate labels set. The parent metric must be created - using the labelnames list. - """ - return { - idx: metric.labels(*labelvalues) - for idx, labelvalues in self.per_engine_labelvalues.items() - } - def observe(self, transfer_stats_data: dict[str, Any], engine_idx: int = 0): """ Record the supplied transfer statistics to Prometheus metrics. These statistics are engine-specific, and should be recorded to a metric with the appropriate 'engine' label. These metric instances can be - created using the make_per_engine() helper method. + created using the create_metric_per_engine() helper method. """ raise NotImplementedError -class KVConnectorPrometheus: +class KVConnectorProm: """ Support for registering per-connector Prometheus metrics, and recording transfer statistics to those metrics. 
Uses diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py index 1861c9e8e3d0..dcde7665f344 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/moriio/moriio_connector.py @@ -1396,9 +1396,6 @@ def _write_blocks_for_req(self, req_id: ReqId, meta: ReqMeta, layer_name, kv_lay remote_ip=meta.remote_host, ) - def _is_last_layer(self, layer_name): - return layer_name == list(self.kv_caches.keys())[-1] - def merge_contiguous_blocks( self, offsets_local: list[int], diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index ed53c35c9ed9..a86a52a6a6fb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -65,6 +65,7 @@ SlidingWindowSpec, UniformTypeKVCacheSpecs, ) +from vllm.v1.metrics.utils import create_metric_per_engine from vllm.v1.worker.block_table import BlockTable from vllm.v1.worker.utils import select_common_block_size @@ -3057,7 +3058,9 @@ def __init__( buckets=buckets[1:], labelnames=labelnames, ) - self.nixl_histogram_xfer_time = self.make_per_engine(nixl_histogram_xfer_time) + self.nixl_histogram_xfer_time = create_metric_per_engine( + nixl_histogram_xfer_time, self.per_engine_labelvalues + ) nixl_histogram_post_time = self._histogram_cls( name="vllm:nixl_post_time_seconds", documentation="Histogram of transfer post time for NIXL KV" @@ -3065,7 +3068,9 @@ def __init__( buckets=buckets, labelnames=labelnames, ) - self.nixl_histogram_post_time = self.make_per_engine(nixl_histogram_post_time) + self.nixl_histogram_post_time = create_metric_per_engine( + nixl_histogram_post_time, self.per_engine_labelvalues + ) # uniform 2kb to 16gb range buckets = [2 ** (10 + i) for i in range(1, 25, 2)] 
nixl_histogram_bytes_transferred = self._histogram_cls( @@ -3074,8 +3079,8 @@ def __init__( buckets=buckets, labelnames=labelnames, ) - self.nixl_histogram_bytes_transferred = self.make_per_engine( - nixl_histogram_bytes_transferred + self.nixl_histogram_bytes_transferred = create_metric_per_engine( + nixl_histogram_bytes_transferred, self.per_engine_labelvalues ) buckets = [ 10, @@ -3100,24 +3105,24 @@ def __init__( buckets=buckets, labelnames=labelnames, ) - self.nixl_histogram_num_descriptors = self.make_per_engine( - nixl_histogram_num_descriptors + self.nixl_histogram_num_descriptors = create_metric_per_engine( + nixl_histogram_num_descriptors, self.per_engine_labelvalues ) counter_nixl_num_failed_transfers = self._counter_cls( name="vllm:nixl_num_failed_transfers", documentation="Number of failed NIXL KV Cache transfers.", labelnames=labelnames, ) - self.counter_nixl_num_failed_transfers = self.make_per_engine( - counter_nixl_num_failed_transfers + self.counter_nixl_num_failed_transfers = create_metric_per_engine( + counter_nixl_num_failed_transfers, self.per_engine_labelvalues ) counter_nixl_num_failed_notifications = self._counter_cls( name="vllm:nixl_num_failed_notifications", documentation="Number of failed NIXL KV Cache notifications.", labelnames=labelnames, ) - self.counter_nixl_num_failed_notifications = self.make_per_engine( - counter_nixl_num_failed_notifications + self.counter_nixl_num_failed_notifications = create_metric_per_engine( + counter_nixl_num_failed_notifications, self.per_engine_labelvalues ) counter_nixl_num_kv_expired_reqs = self._counter_cls( @@ -3126,8 +3131,8 @@ def __init__( "NOTE: This metric is tracked on the P instance.", labelnames=labelnames, ) - self.counter_nixl_num_kv_expired_reqs = self.make_per_engine( - counter_nixl_num_kv_expired_reqs + self.counter_nixl_num_kv_expired_reqs = create_metric_per_engine( + counter_nixl_num_kv_expired_reqs, self.per_engine_labelvalues ) def observe(self, transfer_stats_data: dict[str, Any], 
engine_idx: int = 0): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 730641a184fc..e344bae26402 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -79,7 +79,7 @@ RunnerOption, TokenizerMode, ) -from vllm.config.multimodal import MMCacheType, MMEncoderTPMode +from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MMTensorIPC from vllm.config.observability import DetailedTraceModules from vllm.config.parallel import ( All2AllBackend, @@ -509,6 +509,7 @@ class EngineArgs: io_processor_plugin: str | None = None skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling video_pruning_rate: float | None = MultiModalConfig.video_pruning_rate + mm_tensor_ipc: MMTensorIPC = MultiModalConfig.mm_tensor_ipc # LoRA fields enable_lora: bool = False max_loras: int = LoRAConfig.max_loras @@ -1097,6 +1098,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: multimodal_group.add_argument( "--video-pruning-rate", **multimodal_kwargs["video_pruning_rate"] ) + multimodal_group.add_argument( + "--mm-tensor-ipc", **multimodal_kwargs["mm_tensor_ipc"] + ) # LoRA related configs lora_kwargs = get_kwargs(LoRAConfig) @@ -1423,6 +1427,7 @@ def create_model_config(self) -> ModelConfig: override_attention_dtype=self.override_attention_dtype, logits_processors=self.logits_processors, video_pruning_rate=self.video_pruning_rate, + mm_tensor_ipc=self.mm_tensor_ipc, io_processor_plugin=self.io_processor_plugin, ) diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py index 195b945bcbce..65e31b829833 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -290,7 +290,7 @@ def signal_handler(signum, frame): with launch_core_engines( vllm_config, executor_class, log_stats, addresses, num_api_servers - ) as (local_engine_manager, coordinator, addresses): + ) as (local_engine_manager, coordinator, addresses, tensor_queue): # Construct common args for the 
APIServerProcessManager up-front. api_server_manager_kwargs = dict( target_server_fn=run_api_server_worker_proc, @@ -303,6 +303,7 @@ def signal_handler(signum, frame): stats_update_address=coordinator.get_stats_publish_address() if coordinator else None, + tensor_queue=tensor_queue, ) # For dp ranks > 0 in external/hybrid DP LB modes, we must delay the diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 5909b3043007..4b617333c02f 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1477,9 +1477,9 @@ def _cross_encoding_score( data_1 = data_1 * len(data_2) if pooling_params is None: - pooling_params = PoolingParams(task="score") + pooling_params = PoolingParams(task="classify") elif pooling_params.task is None: - pooling_params.task = "score" + pooling_params.task = "classify" pooling_params_list = list[PoolingParams]() diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 4d5c5eae8de0..95e831b51ec0 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -22,7 +22,7 @@ from starlette.datastructures import State import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import ModelConfig, VllmConfig from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import load_chat_template @@ -155,7 +155,9 @@ async def build_async_engine_client_from_engine_args( def build_app( - args: Namespace, supported_tasks: tuple["SupportedTask", ...] | None = None + args: Namespace, + supported_tasks: tuple["SupportedTask", ...] 
| None = None, + model_config: ModelConfig | None = None, ) -> FastAPI: if supported_tasks is None: warnings.warn( @@ -191,7 +193,7 @@ def build_app( attach_router as register_sagemaker_api_router, ) - register_sagemaker_api_router(app, supported_tasks) + register_sagemaker_api_router(app, supported_tasks, model_config) if "generate" in supported_tasks: from vllm.entrypoints.openai.generate.api_router import ( @@ -242,7 +244,7 @@ def build_app( if any(task in POOLING_TASKS for task in supported_tasks): from vllm.entrypoints.pooling import register_pooling_api_routers - register_pooling_api_routers(app, supported_tasks) + register_pooling_api_routers(app, supported_tasks, model_config) app.root_path = args.root_path app.add_middleware( @@ -583,8 +585,10 @@ async def build_and_serve( uvicorn_kwargs["log_config"] = log_config supported_tasks = await engine_client.get_supported_tasks() + model_config = engine_client.model_config + logger.info("Supported tasks: %s", supported_tasks) - app = build_app(args, supported_tasks) + app = build_app(args, supported_tasks, model_config) await init_app_state(engine_client, app.state, args, supported_tasks) logger.info("Starting vLLM server on %s", listen_address) diff --git a/vllm/entrypoints/openai/responses/context.py b/vllm/entrypoints/openai/responses/context.py index bab59e0aa1ec..a4c55c23c588 100644 --- a/vllm/entrypoints/openai/responses/context.py +++ b/vllm/entrypoints/openai/responses/context.py @@ -9,7 +9,7 @@ from collections.abc import Callable from contextlib import AsyncExitStack from dataclasses import replace -from typing import TYPE_CHECKING, Final, Union +from typing import TYPE_CHECKING, Any, Final, Union from openai.types.responses.response_function_tool_call_output_item import ( ResponseFunctionToolCallOutputItem, @@ -182,6 +182,7 @@ def __init__(self): self.all_turn_metrics = [] self.input_messages: list[ResponseRawMessageAndToken] = [] + self.kv_transfer_params: dict[str, Any] | None = None def 
append_output(self, output) -> None: self.last_output = output @@ -190,6 +191,8 @@ def append_output(self, output) -> None: self.num_prompt_tokens = len(output.prompt_token_ids or []) self.num_cached_tokens = output.num_cached_tokens or 0 self.num_output_tokens += len(output.outputs[0].token_ids or []) + if output.kv_transfer_params is not None: + self.kv_transfer_params = output.kv_transfer_params # Accumulate text, token_ids, and logprobs for streaming mode delta_output = output.outputs[0] @@ -308,11 +311,14 @@ def __init__( self.input_messages: list[ResponseRawMessageAndToken] = [] self.output_messages: list[ResponseRawMessageAndToken] = [] self._accumulated_token_ids: list[int] = [] + self.kv_transfer_params: dict[str, Any] | None = None def append_output(self, output: RequestOutput) -> None: self.num_prompt_tokens = len(output.prompt_token_ids or []) self.num_cached_tokens = output.num_cached_tokens or 0 self.num_output_tokens += len(output.outputs[0].token_ids or []) + if output.kv_transfer_params is not None: + self.kv_transfer_params = output.kv_transfer_params self.parser.process(output.outputs[0]) output_token_ids = output.outputs[0].token_ids or [] self._accumulated_token_ids.extend(output_token_ids) @@ -538,6 +544,7 @@ def __init__( self.all_turn_metrics: list[TurnMetrics] = [] self.is_first_turn = True self.first_tok_of_message = True # For streaming support + self.kv_transfer_params: dict[str, Any] | None = None def _update_num_reasoning_tokens(self): channel = self.parser.current_channel @@ -557,6 +564,8 @@ def append_output(self, output: RequestOutput) -> None: self._update_num_reasoning_tokens() self._update_prefill_token_usage(output) self._update_decode_token_usage(output) + if output.kv_transfer_params is not None: + self.kv_transfer_params = output.kv_transfer_params # Append current turn to all turn list for next turn's calculations self.all_turn_metrics.append(self.current_turn_metrics.copy()) self.current_turn_metrics.reset() @@ -868,6 
+877,8 @@ def append_output(self, output: RequestOutput) -> None: if last_delta_text: self.last_content_delta = last_delta_text self._update_decode_token_usage(output) + if output.kv_transfer_params is not None: + self.kv_transfer_params = output.kv_transfer_params # For streaming, update previous turn when message is complete if output.finished: diff --git a/vllm/entrypoints/openai/responses/protocol.py b/vllm/entrypoints/openai/responses/protocol.py index a5f62bdd8c39..43fbba1dd43f 100644 --- a/vllm/entrypoints/openai/responses/protocol.py +++ b/vllm/entrypoints/openai/responses/protocol.py @@ -252,6 +252,10 @@ class ResponsesRequest(OpenAIBaseModel): "numeric values, used by custom extensions." ), ) + kv_transfer_params: dict[str, Any] | None = Field( + default=None, + description="KVTransfer parameters used for disaggregated serving.", + ) # --8<-- [end:responses-extra-params] def build_chat_params( @@ -351,6 +355,10 @@ def to_sampling_params( if isinstance(stop, str): stop = [stop] + extra_args: dict[str, Any] = self.vllm_xargs if self.vllm_xargs else {} + if self.kv_transfer_params: + extra_args["kv_transfer_params"] = self.kv_transfer_params + return SamplingParams.from_optional( temperature=temperature, top_p=top_p, @@ -367,7 +375,7 @@ def to_sampling_params( ), structured_outputs=structured_outputs, logit_bias=self.logit_bias, - extra_args=self.vllm_xargs or {}, + extra_args=extra_args, skip_clone=True, # Created fresh per request, safe to skip clone skip_special_tokens=self.skip_special_tokens, include_stop_str_in_output=self.include_stop_str_in_output, @@ -488,6 +496,11 @@ class ResponsesResponse(OpenAIBaseModel): usage: ResponseUsage | None = None user: str | None = None + # vLLM-specific fields that are not in OpenAI spec + kv_transfer_params: dict[str, Any] | None = Field( + default=None, description="KVTransfer parameters." 
+ ) + # --8<-- [start:responses-response-extra-params] # These are populated when enable_response_messages is set to True # NOTE: custom serialization is needed @@ -531,6 +544,7 @@ def from_request( usage: ResponseUsage | None = None, input_messages: ResponseInputOutputMessage | None = None, output_messages: ResponseInputOutputMessage | None = None, + kv_transfer_params: dict[str, Any] | None = None, ) -> "ResponsesResponse": incomplete_details: IncompleteDetails | None = None if status == "incomplete": @@ -566,6 +580,7 @@ def from_request( truncation=request.truncation, user=request.user, usage=usage, + kv_transfer_params=kv_transfer_params, ) diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 574282c4cdc6..53c28693ade7 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -873,6 +873,7 @@ async def responses_full_generator( output=output, status=status, usage=usage, + kv_transfer_params=context.kv_transfer_params, ) if request.store: diff --git a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py index 4a6030d71b63..bf58273f7504 100644 --- a/vllm/entrypoints/openai/speech_to_text/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text/speech_to_text.py @@ -42,32 +42,13 @@ from vllm.logger import init_logger from vllm.logprobs import FlatLogprobs, Logprob from vllm.model_executor.models import SupportsTranscription -from vllm.multimodal.audio import split_audio -from vllm.multimodal.media.audio import extract_audio_from_video_bytes +from vllm.multimodal.audio import get_audio_duration, split_audio +from vllm.multimodal.media.audio import load_audio from vllm.outputs import RequestOutput from vllm.renderers.inputs import DictPrompt, EncoderDecoderDictPrompt from vllm.renderers.inputs.preprocess import parse_enc_dec_prompt, parse_model_prompt from vllm.sampling_params import 
BeamSearchParams, SamplingParams from vllm.tokenizers import get_tokenizer -from vllm.utils.import_utils import PlaceholderModule - -try: - import librosa -except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] - -try: - import soundfile as sf -except ImportError: - sf = PlaceholderModule("soundfile") # type: ignore[assignment] - -# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile -# being librosa's main backend. Used to validate if an audio loading error is due to a -# server error vs a client error (invalid audio file). -# 1 = unrecognised format (file is not a supported audio container) -# 3 = malformed file (corrupt or structurally invalid audio) -# 4 = unsupported encoding (codec not supported by this libsndfile build) -_BAD_SF_CODES = {1, 3, 4} SpeechToTextResponse: TypeAlias = TranscriptionResponse | TranslationResponse SpeechToTextResponseVerbose: TypeAlias = ( @@ -214,32 +195,13 @@ async def _preprocess_speech_to_text( # pre-requisite for chunking, as it assumes Whisper SR. try: with io.BytesIO(audio_data) as buf: - y, sr = librosa.load(buf, sr=self.asr_config.sample_rate) # type: ignore[return-value] - except sf.LibsndfileError as exc: - # Only fall back for known format-detection failures. - # Re-raise anything else (e.g. corrupt but recognised format). 
- if exc.code not in _BAD_SF_CODES: - raise - logger.debug( - "librosa/soundfile could not decode audio from BytesIO " - "(code=%s: %s); falling back to pyav in-process decode", - exc.code, - exc, - ) - try: - native_y, native_sr = extract_audio_from_video_bytes(audio_data) - sr = self.asr_config.sample_rate - y = librosa.resample(native_y, orig_sr=native_sr, target_sr=sr) - except Exception as pyav_exc: - logger.debug( - "pyAV fallback also failed: %s", - pyav_exc, - ) - raise ValueError("Invalid or unsupported audio file.") from pyav_exc + y, sr = load_audio(buf, sr=self.asr_config.sample_rate) + except Exception as exc: + raise ValueError("Invalid or unsupported audio file.") from exc - duration = librosa.get_duration(y=y, sr=sr) - do_split_audio = ( - self.asr_config.allow_audio_chunking + duration = get_audio_duration(y=y, sr=sr) + do_split_audio = self.asr_config.allow_audio_chunking and ( + self.asr_config.max_audio_clip_s is not None and duration > self.asr_config.max_audio_clip_s ) diff --git a/vllm/entrypoints/pooling/__init__.py b/vllm/entrypoints/pooling/__init__.py index d2baea8959d2..e115b710ceeb 100644 --- a/vllm/entrypoints/pooling/__init__.py +++ b/vllm/entrypoints/pooling/__init__.py @@ -5,6 +5,9 @@ from fastapi import FastAPI +from vllm.config import ModelConfig +from vllm.logger import init_logger + if TYPE_CHECKING: from argparse import Namespace @@ -17,9 +20,30 @@ RequestLogger = object SupportedTask = object +logger = init_logger(__name__) + + +def enable_scoring_api( + supported_tasks: tuple["SupportedTask", ...], + model_config: ModelConfig | None = None, +) -> bool: + if any(t in supported_tasks for t in ("embed", "token_embed")): + return True + + if model_config is not None and "classify" in supported_tasks: + num_labels = getattr(model_config.hf_config, "num_labels", 0) + if num_labels != 1: + logger.debug_once("Score API is only enabled for num_labels == 1.") + return False + return True + + return False + def 
register_pooling_api_routers( - app: FastAPI, supported_tasks: tuple["SupportedTask", ...] + app: FastAPI, + supported_tasks: tuple["SupportedTask", ...], + model_config: ModelConfig | None = None, ): from vllm.entrypoints.pooling.pooling.api_router import router as pooling_router @@ -37,11 +61,7 @@ def register_pooling_api_routers( app.include_router(embed_router) - # Score API handles score/rerank for: - # - "score" task (score_type: cross-encoder models) - # - "embed" task (score_type: bi-encoder models) - # - "token_embed" task (score_type: late interaction models) - if any(t in supported_tasks for t in ("score", "embed", "token_embed")): + if enable_scoring_api(supported_tasks, model_config): from vllm.entrypoints.pooling.score.api_router import router as score_router app.include_router(score_router) @@ -61,6 +81,8 @@ def init_pooling_state( from vllm.entrypoints.pooling.score.serving import ServingScores from vllm.tasks import POOLING_TASKS + model_config = engine_client.model_config + resolved_chat_template = load_chat_template(args.chat_template) state.serving_pooling = ( @@ -102,10 +124,6 @@ def init_pooling_state( if "classify" in supported_tasks else None ) - # Score API handles score/rerank for: - # - "score" task (score_type: cross-encoder models) - # - "embed" task (score_type: bi-encoder models) - # - "token_embed" task (score_type: late interaction models) state.serving_scores = ( ServingScores( engine_client, @@ -114,6 +132,6 @@ def init_pooling_state( score_template=resolved_chat_template, log_error_stack=args.log_error_stack, ) - if any(t in supported_tasks for t in ("embed", "score", "token_embed")) + if enable_scoring_api(supported_tasks, model_config) else None ) diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py index 2aea1bd7b27a..bb633fc28b3c 100644 --- a/vllm/entrypoints/pooling/score/protocol.py +++ b/vllm/entrypoints/pooling/score/protocol.py @@ -35,7 +35,7 @@ def build_tok_params(self, 
model_config: ModelConfig) -> TokenizeParams: max_total_tokens_param="max_model_len", ) - def to_pooling_params(self, task: PoolingTask = "score"): + def to_pooling_params(self, task: PoolingTask = "classify"): return PoolingParams( task=task, use_activation=self.use_activation, @@ -111,7 +111,7 @@ def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: max_total_tokens_param="max_model_len", ) - def to_pooling_params(self, task: PoolingTask = "score"): + def to_pooling_params(self, task: PoolingTask = "classify"): return PoolingParams( task=task, use_activation=self.use_activation, diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index c58fe6d36c07..d8cbff99d068 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -413,7 +413,7 @@ async def _cross_encoding_score( # Schedule the request and get the result generator. generators: list[AsyncGenerator[PoolingRequestOutput, None]] = [] - default_pooling_params = request.to_pooling_params("score") + default_pooling_params = request.to_pooling_params("classify") for i, engine_prompt in enumerate(engine_prompts): request_id_item = f"{request_id}-{i}" diff --git a/vllm/entrypoints/sagemaker/api_router.py b/vllm/entrypoints/sagemaker/api_router.py index 32faaa02e681..e8c48d1c6d53 100644 --- a/vllm/entrypoints/sagemaker/api_router.py +++ b/vllm/entrypoints/sagemaker/api_router.py @@ -10,9 +10,11 @@ from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request from fastapi.responses import JSONResponse, Response +from vllm.config import ModelConfig from vllm.entrypoints.openai.engine.protocol import ErrorResponse from vllm.entrypoints.openai.engine.serving import OpenAIServing from vllm.entrypoints.openai.utils import validate_json_request +from vllm.entrypoints.pooling import enable_scoring_api from vllm.entrypoints.pooling.base.serving import PoolingServing from 
vllm.entrypoints.serve.instrumentator.basic import base from vllm.entrypoints.serve.instrumentator.health import health @@ -25,7 +27,10 @@ EndpointFn = Callable[[RequestType, Request], Awaitable[Any]] -def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]): +def get_invocation_types( + supported_tasks: tuple["SupportedTask", ...], + model_config: ModelConfig | None = None, +): # NOTE: Items defined earlier take higher priority INVOCATION_TYPES: list[tuple[RequestType, tuple[GetHandlerFn, EndpointFn]]] = [] @@ -70,7 +75,7 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]): (ClassificationRequest, (classify, create_classify)), ] - if "score" in supported_tasks: + if enable_scoring_api(supported_tasks, model_config): from vllm.entrypoints.pooling.score.api_router import do_rerank, rerank from vllm.entrypoints.pooling.score.protocol import RerankRequest @@ -78,7 +83,6 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]): (RerankRequest, (rerank, do_rerank)), ] - if "score" in supported_tasks or "embed" in supported_tasks: from vllm.entrypoints.pooling.score.api_router import create_score, score from vllm.entrypoints.pooling.score.protocol import ScoreRequest @@ -97,11 +101,15 @@ def get_invocation_types(supported_tasks: tuple["SupportedTask", ...]): return INVOCATION_TYPES -def attach_router(app: FastAPI, supported_tasks: tuple["SupportedTask", ...]): +def attach_router( + app: FastAPI, + supported_tasks: tuple["SupportedTask", ...], + model_config: ModelConfig | None = None, +): router = APIRouter() # NOTE: Construct the TypeAdapters only once - INVOCATION_TYPES = get_invocation_types(supported_tasks) + INVOCATION_TYPES = get_invocation_types(supported_tasks, model_config) INVOCATION_VALIDATORS = [ (pydantic.TypeAdapter(request_type), (get_handler, endpoint)) for request_type, (get_handler, endpoint) in INVOCATION_TYPES diff --git a/vllm/model_executor/kernels/linear/mixed_precision/conch.py 
b/vllm/model_executor/kernels/linear/mixed_precision/conch.py index e98676e01754..82dd32da19a0 100644 --- a/vllm/model_executor/kernels/linear/mixed_precision/conch.py +++ b/vllm/model_executor/kernels/linear/mixed_precision/conch.py @@ -124,6 +124,14 @@ def apply_weights( w_q, w_s, w_zp, _ = self._get_weight_params(layer) + # Map channelwise group_size=-1 to the actual input dimension K. + # The conch kernel computes stride_mul = block_k / group_size; + # passing -1 produces a negative stride that reads out-of-bounds + # scale values for all K-blocks after the first. + group_size = self.config.group_size + if group_size == -1: + group_size = x.shape[-1] + output = mixed_precision_gemm( x=x, w_q_packed=w_q.data, @@ -131,7 +139,7 @@ def apply_weights( w_zp=w_zp.data if w_zp is not None else None, weight_size_bits=self.config.weight_type.size_bits, weight_bias=self.config.weight_type.bias, - group_size=self.config.group_size, + group_size=group_size, ) if bias is not None: diff --git a/vllm/model_executor/kernels/linear/mixed_precision/exllama.py b/vllm/model_executor/kernels/linear/mixed_precision/exllama.py index 537a8e278a39..3ad43a225fa8 100644 --- a/vllm/model_executor/kernels/linear/mixed_precision/exllama.py +++ b/vllm/model_executor/kernels/linear/mixed_precision/exllama.py @@ -59,6 +59,13 @@ def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]: f"{cls.SUPPORTED_QUANT_TYPES}", ) + if c.group_size <= 0: + return ( + False, + f"Group size ({c.group_size}) must be positive, " + "Exllama does not support channelwise quantization", + ) + if c.full_weight_shape[0] % c.group_size != 0: return ( False, diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 2eb0f49217d8..f4e3ed8e055c 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -346,7 +346,7 @@ def g2_alphas(self) -> torch.Tensor | None: @property def 
use_fp8_w8a8(self) -> bool: - return self.quant_dtype == torch.float8_e4m3fn + return self.quant_dtype == current_platform.fp8_dtype() @property def use_int8_w8a8(self) -> bool: @@ -566,7 +566,7 @@ def fp8_w8a8_moe_quant_config( Construct a quant config for fp8 activations and fp8 weights. """ return FusedMoEQuantConfig.make( - torch.float8_e4m3fn, + current_platform.fp8_dtype(), w1_scale=w1_scale, g1_alphas=g1_alphas, w2_scale=w2_scale, diff --git a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py index a22b89415364..a3266f5e847b 100644 --- a/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py @@ -16,6 +16,7 @@ moe_kernel_quantize_input, normalize_batched_scales_shape, ) +from vllm.platforms import current_platform from vllm.v1.worker.ubatching import ( dbo_current_ubatch_id, dbo_enabled, @@ -158,11 +159,6 @@ def _map_global_to_physical_ids(self, topk_ids: torch.Tensor) -> torch.Tensor: return topk_ids return self.global_to_physical[topk_ids] - def _map_local_to_global_ids(self, expert_topk_ids: torch.Tensor) -> torch.Tensor: - if self.local_expert_global_ids is None: - return expert_topk_ids - return self.local_expert_global_ids[expert_topk_ids] - def _do_quant( self, x: torch.Tensor | tuple[torch.Tensor, torch.Tensor], @@ -295,23 +291,46 @@ def prepare_async( # Dispatch dispatch_topk_ids = self._map_global_to_physical_ids(topk_ids) - expert_x, expert_num_tokens, handle, _, hook = self.buffer.low_latency_dispatch( - a1, - dispatch_topk_ids, - self.max_tokens_per_rank, - num_experts, - use_fp8=self.use_fp8_dispatch, - round_scale=self.use_ue8m0_dispatch, - use_ue8m0=self.use_ue8m0_dispatch, - **(dict(use_nvfp4=True) if use_nvfp4 else dict()), - **( - dict(x_global_scale=qc_a1_gscale_or_scale) - if qc_a1_gscale_or_scale is not None - else dict() - ), - async_finish=False, - 
return_recv_hook=True, - ) + if current_platform.is_rocm(): + ( + expert_x, + expert_num_tokens, + handle, + _, + hook, + ) = self.buffer.low_latency_dispatch( + a1, + dispatch_topk_ids, + self.max_tokens_per_rank, + num_experts, + use_fp8=self.use_fp8_dispatch, + async_finish=False, + return_recv_hook=True, + ) + else: + ( + expert_x, + expert_num_tokens, + handle, + _, + hook, + ) = self.buffer.low_latency_dispatch( + a1, + dispatch_topk_ids, + self.max_tokens_per_rank, + num_experts, + use_fp8=self.use_fp8_dispatch, + round_scale=self.use_ue8m0_dispatch, + use_ue8m0=self.use_ue8m0_dispatch, + **(dict(use_nvfp4=True) if use_nvfp4 else dict()), + **( + dict(x_global_scale=qc_a1_gscale_or_scale) + if qc_a1_gscale_or_scale is not None + else dict() + ), + async_finish=False, + return_recv_hook=True, + ) self.handles[a2a_idx] = handle return ( diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py b/vllm/model_executor/layers/fused_moe/experts/flashinfer_cutedsl_moe.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/flashinfer_cutedsl_moe.py rename to vllm/model_executor/layers/fused_moe/experts/flashinfer_cutedsl_moe.py diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py index 501c10ab0cf4..f57a05dc6ecc 100644 --- a/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py +++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_fp8_moe.py @@ -269,9 +269,16 @@ def _supports_routing_method( weight_key: QuantKey | None, activation_key: QuantKey | None, ) -> bool: - """Monolithic kernels need to express router support.""" + """Monolithic kernels need to express router support. + Renormalize/RenormalizeNaive are excluded: the monolithic kernel's + internal routing for these methods produces output uncorrelated + with the modular kernel's output and with Triton kernel's output + for Qwen3.5-35B-A3B-FP8. 
+ See: https://github.com/vllm-project/vllm/issues/37591 + """ # NOTE(dbari): TopK routing could also be enabled, but need to validate models # NOTE(dbari): Default is not implemented and should not be enabled until it is + if (weight_key, activation_key) in [ (kFp8Static128BlockSym, kFp8Dynamic128Sym), (kMxfp8Static, kMxfp8Dynamic), @@ -279,16 +286,12 @@ def _supports_routing_method( # NOTE(rob): potentially allow others here. This is a conservative list. return routing_method in [ RoutingMethodType.DeepSeekV3, - RoutingMethodType.Renormalize, - RoutingMethodType.RenormalizeNaive, ] elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym): # NOTE(dbari): as above, potentially allow others here. return routing_method in [ RoutingMethodType.DeepSeekV3, RoutingMethodType.Llama4, - RoutingMethodType.Renormalize, - RoutingMethodType.RenormalizeNaive, ] else: raise ValueError("Unsupported quantization scheme.") diff --git a/vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py b/vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py new file mode 100644 index 000000000000..d084283360c4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/experts/trtllm_mxfp4_moe.py @@ -0,0 +1,352 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.activation import MoEActivation +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + FusedMoEParallelConfig, + FusedMoEQuantConfig, + RoutingMethodType, +) +from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( + TopKWeightAndReduceNoOP, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + QuantKey, + kMxfp4Static, + kMxfp8Dynamic, +) +from vllm.platforms import current_platform +from vllm.utils.flashinfer import has_flashinfer + + +class 
TrtLlmMxfp4ExpertsBase: + """ + MXFP4 TRTLLM-Gen MoE kernels. Shared base for modular and monolithic. + """ + + def __init__( + self, + moe_config: FusedMoEConfig, + quant_config: FusedMoEQuantConfig, + ): + # NOTE: FusedMoEExperts.__init__ is called by the concrete subclass + # (Monolithic/Modular) via MRO, not here, to avoid mypy issues with + # multiple inheritance. This matches the NvFP4 expert pattern. + self.moe_config = moe_config + self.quant_config = quant_config + + self.routing_method_type = moe_config.routing_method + self.topk = moe_config.experts_per_token + self.intermediate_size_per_partition = ( + moe_config.intermediate_size_per_partition + ) + self.hidden_dim = moe_config.hidden_dim + self.local_num_experts = moe_config.num_local_experts + self.ep_rank = moe_config.moe_parallel_config.ep_rank + + # MXFP4-specific TRTLLM parameters + device = torch.accelerator.current_device_index() + self.gemm1_alpha = torch.tensor( + [1.702] * self.local_num_experts, + dtype=torch.float32, + device=device, + ) + self.gemm1_beta = torch.tensor( + [1.0] * self.local_num_experts, + dtype=torch.float32, + device=device, + ) + self.gemm1_clamp_limit = torch.tensor( + [7.0] * self.local_num_experts, + dtype=torch.float32, + device=device, + ) + + from vllm.config import get_current_vllm_config + + self.max_capture_size = ( + get_current_vllm_config().compilation_config.max_cudagraph_capture_size + ) + + # P1-5 fix: use public quant_dtype property instead of private _a1 + self.use_mxfp8_input = quant_config.quant_dtype == "mxfp8" + + @staticmethod + def _supports_current_device() -> bool: + p = current_platform + return p.is_cuda() and p.is_device_capability_family(100) and has_flashinfer() + + @staticmethod + def _supports_no_act_and_mul() -> bool: + return False + + @staticmethod + def _supports_quant_scheme( + weight_key: QuantKey | None, + activation_key: QuantKey | None, + ) -> bool: + SUPPORTED_W_A = [ + (kMxfp4Static, None), + (kMxfp4Static, kMxfp8Dynamic), + ] 
+ return (weight_key, activation_key) in SUPPORTED_W_A + + @staticmethod + def _supports_activation(activation: MoEActivation) -> bool: + return activation == MoEActivation.SWIGLUOAI + + @staticmethod + def activation_format() -> mk.FusedMoEActivationFormat: + return mk.FusedMoEActivationFormat.Standard + + def supports_chunking(self) -> bool: + return False + + def supports_expert_map(self) -> bool: + return False + + @property + def expects_unquantized_inputs(self) -> bool: + # Expert handles MXFP8 quantization internally if needed + return True + + +class TrtLlmMxfp4ExpertsMonolithic( + TrtLlmMxfp4ExpertsBase, mk.FusedMoEExpertsMonolithic +): + """ + Monolithic version of the MXFP4 TRTLLM kernel (router + experts). + Wraps flashinfer.trtllm_fp4_block_scale_moe(). + """ + + @staticmethod + def _supports_parallel_config( + moe_parallel_config: FusedMoEParallelConfig, + ) -> bool: + return ( + not moe_parallel_config.use_all2all_kernels + and not moe_parallel_config.enable_eplb + and moe_parallel_config.dp_size <= 1 + ) + + @staticmethod + def _supports_routing_method( + routing_method: RoutingMethodType, + weight_key: QuantKey | None, + activation_key: QuantKey | None, + ) -> bool: + return routing_method in [ + RoutingMethodType.Renormalize, + RoutingMethodType.RenormalizeNaive, + ] + + @staticmethod + def _supports_router_logits_dtype( + router_logits_dtype: torch.dtype | None, + routing_method: RoutingMethodType, + ) -> bool: + # Kernel converts to bfloat16 internally + return True + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + router_logits: torch.Tensor, + activation: MoEActivation, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + apply_router_weight_on_input: bool, + # grouped topk + fused topk bias parameters + num_expert_group: int | None = None, + e_score_correction_bias: torch.Tensor | None = None, + routed_scaling_factor: float | None = None, + topk_group: 
int | None = None, + ) -> torch.Tensor: + from flashinfer import trtllm_fp4_block_scale_moe + + # Handle input quantization + if self.use_mxfp8_input: + from flashinfer import mxfp8_quantize + + x_quant, x_scale = mxfp8_quantize( + hidden_states, + is_sf_swizzled_layout=False, + alignment=256, + ) + x_scale = x_scale.view(torch.float8_e4m3fn).reshape( + *hidden_states.shape[:-1], -1 + ) + else: + assert hidden_states.dtype == torch.bfloat16 + x_quant = hidden_states + x_scale = None + + output = torch.empty_like(hidden_states) + + return trtllm_fp4_block_scale_moe( + routing_logits=router_logits.to(torch.bfloat16), + routing_bias=None, + hidden_states=x_quant, + hidden_states_scale=x_scale, + gemm1_weights=w1, + gemm1_weights_scale=self.w1_scale, + gemm1_bias=self.w1_bias, + gemm1_alpha=self.gemm1_alpha, + gemm1_beta=self.gemm1_beta, + gemm1_clamp_limit=self.gemm1_clamp_limit, + gemm2_weights=w2, + gemm2_weights_scale=self.w2_scale, + gemm2_bias=self.w2_bias, + output1_scale_scalar=None, + output1_scale_gate_scalar=None, + output2_scale_scalar=None, + num_experts=global_num_experts, + top_k=self.topk, + n_group=None, + topk_group=None, + intermediate_size=self.intermediate_size_per_partition, + local_expert_offset=self.ep_rank * self.local_num_experts, + local_num_experts=self.local_num_experts, + routed_scaling_factor=None, + routing_method_type=self.routing_method_type, + do_finalize=True, + tune_max_num_tokens=max(self.max_capture_size, 1), + output=output, + )[0] + + +class TrtLlmMxfp4ExpertsModular(TrtLlmMxfp4ExpertsBase, mk.FusedMoEExpertsModular): + """ + Modular version of the MXFP4 TRTLLM kernel (just the experts). + Wraps flashinfer.trtllm_fp4_block_scale_routed_moe(). + Moved from trtllm_moe.py. 
+ """ + + @property + def expects_unquantized_inputs(self) -> bool: + return True + + @staticmethod + def _supports_parallel_config( + moe_parallel_config: FusedMoEParallelConfig, + ) -> bool: + return True + + def supports_expert_map(self) -> bool: + return True + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + return TopKWeightAndReduceNoOP() + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + activation: MoEActivation, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + # The workspaces for this implementation are managed by flashinfer. + workspace1 = (0,) + workspace2 = (0,) + output = (M, K) + return (workspace1, workspace2, output) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: MoEActivation, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + topk = topk_ids.size(-1) + local_num_experts = w1.size(0) + intermediate_size = w2.size(1) + local_expert_offset = self.moe_config.ep_rank * local_num_experts + + # Handle input quantization + if self.use_mxfp8_input: + from flashinfer import mxfp8_quantize + + x_quant, x_scale = mxfp8_quantize( + hidden_states, + is_sf_swizzled_layout=False, + alignment=256, + ) + x_scale = x_scale.view(torch.float8_e4m3fn).reshape( + *hidden_states.shape[:-1], -1 + ) + else: + assert hidden_states.dtype == torch.bfloat16 + x_quant = hidden_states + x_scale = None + + packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to( + torch.bfloat16 + ).view(torch.int16) + + assert self.w1_scale 
is not None + assert self.w2_scale is not None + kwargs = { + "topk_ids": packed_tensor, + "routing_bias": None, + "hidden_states": x_quant, + "hidden_states_scale": x_scale, + "gemm1_weights": w1, + "gemm1_weights_scale": self.w1_scale, + "gemm1_bias": self.w1_bias, + "gemm1_alpha": self.gemm1_alpha, + "gemm1_beta": self.gemm1_beta, + "gemm1_clamp_limit": self.gemm1_clamp_limit, + "gemm2_weights": w2, + "gemm2_weights_scale": self.w2_scale, + "gemm2_bias": self.w2_bias, + "output1_scale_scalar": None, + "output1_scale_gate_scalar": None, + "output2_scale_scalar": None, + "num_experts": global_num_experts, + "top_k": topk, + "n_group": None, + "topk_group": None, + "intermediate_size": intermediate_size, + "local_expert_offset": local_expert_offset, + "local_num_experts": local_num_experts, + "routed_scaling_factor": None, + "routing_method_type": self.routing_method_type, + "do_finalize": True, + "output": output, + "tune_max_num_tokens": max(self.max_capture_size, 1), + } + + from flashinfer import trtllm_fp4_block_scale_routed_moe + + from vllm.utils.flashinfer import autotune + + with autotune(False): + # Enable autotune when, + # https://github.com/flashinfer-ai/flashinfer/issues/2023 is + # resolved. 
+ trtllm_fp4_block_scale_routed_moe(**kwargs) + + return output diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index 9df94b72d246..e2b5a8f6764e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -1017,6 +1017,7 @@ def apply( torch.float16, torch.bfloat16, torch.float8_e4m3fn, + torch.float8_e4m3fnuz, ] assert expert_tokens_meta is not None @@ -1046,7 +1047,7 @@ def apply( compute_type = tl.float16 elif hidden_states.dtype == torch.float32: compute_type = tl.float32 - elif hidden_states.dtype == torch.float8_e4m3fn: + elif hidden_states.dtype == current_platform.fp8_dtype(): compute_type = tl.bfloat16 else: raise ValueError(f"Unsupported compute_type: {hidden_states.dtype}") diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 45575ab09c40..136a8188d6a0 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -40,6 +40,7 @@ kFp8Static128BlockSym, kFp8StaticChannelSym, kFp8StaticTensorSym, + kMxfp4Static, kNvfp4Static, ) from vllm.platforms import current_platform @@ -574,12 +575,13 @@ def _supports_quant_scheme( weight_key: QuantKey | None, activation_key: QuantKey | None, ) -> bool: - # TODO(rob): add int4, mxfp4, int8 as integrations + # TODO(rob): add int4, int8 as integrations # are migrated to use the oracle one-by-one. 
SUPPORTED_W = [ kFp8Static128BlockSym, kFp8StaticChannelSym, kFp8StaticTensorSym, + kMxfp4Static, kNvfp4Static, ] return weight_key in SUPPORTED_W diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 03ca8ba119c0..d5b8feb3c9b9 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -1616,7 +1616,7 @@ def _get_config_quant_dtype( fused_experts_impl. """ if use_fp8_w8a8: - return torch.float8_e4m3fn + return current_platform.fp8_dtype() elif use_int8_w8a8: return torch.int8 elif ocp_mx_scheme == "w_mxfp4_a_mxfp4": diff --git a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py index 82b0a21cba93..5862abe20518 100644 --- a/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py +++ b/vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py @@ -11,8 +11,10 @@ from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.model_executor.layers.fused_moe.config import ( FUSED_MOE_UNQUANTIZED_CONFIG, + FusedMoEConfig, FusedMoEParallelConfig, FusedMoEQuantConfig, + RoutingMethodType, ) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP, @@ -20,6 +22,7 @@ from vllm.model_executor.layers.fused_moe.utils import _resize_cache from vllm.model_executor.layers.quantization.utils.quant_utils import ( QuantKey, + kMxfp4Static, ) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton @@ -142,6 +145,33 @@ def legacy_routing_from_bitmatrix( return routing_data, gather_idx, scatter_idx +def legacy_routing_from_sparsematrix( + sparse_logits: "SparseMatrix", + n_expts_tot: int, + n_expts_act: int, +) -> tuple["RoutingData", "GatherIndx", "ScatterIndx"]: + """ + Creates routing data from a SparseMatrix representation. 
+ """ + dispatch_indx = sparse_logits.mask_metadata.row_sorted_indx + combine_indx = sparse_logits.mask_metadata.col_sorted_indx + ragged_batch_metadata = make_ragged_tensor_metadata( + sparse_logits.mask_metadata.col_sum, + dispatch_indx.shape[0], + ) + gate_scal = sparse_logits.vals.flatten()[combine_indx] + routing_data = RoutingData( + gate_scal, + ragged_batch_metadata.block_sizes, + n_expts_tot, + n_expts_act, + ragged_batch_metadata, + ) + gather_idx = GatherIndx(combine_indx, dispatch_indx) + scatter_idx = ScatterIndx(dispatch_indx, combine_indx) + return routing_data, gather_idx, scatter_idx + + def legacy_routing( logits: torch.Tensor, n_expts_act: int, @@ -158,10 +188,8 @@ def legacy_routing( if sm_first: logits = torch.softmax(logits, dim=-1) sparse_logits = topk(logits, n_expts_act, apply_softmax=not sm_first) - return legacy_routing_from_bitmatrix( - sparse_logits.mask, - sparse_logits.vals, - sparse_logits.indx, + return legacy_routing_from_sparsematrix( + sparse_logits, logits.shape[-1], n_expts_act, ) @@ -512,43 +540,43 @@ def make_routing_data( class BaseOAITritonExperts(mk.FusedMoEExpertsModular): + @property + def expects_unquantized_inputs(self) -> bool: + return True + @staticmethod def _supports_current_device() -> bool: - raise NotImplementedError( - "OAITritonExperts is not yet used by an Oracle. " - "This method should not be called." - ) + p = current_platform + if not p.is_cuda_alike(): + return False + cap = p.get_device_capability() + if cap is None: + return False + # (9,0) <= cap < (11,0) covers CUDA SM90 (Hopper), SM100+ (Blackwell) + # and ROCm gfx942/gfx950 (which map to 9.4/9.5). + return (9, 0) <= (cap.major, cap.minor) < (11, 0) @staticmethod def _supports_no_act_and_mul() -> bool: - raise NotImplementedError( - "OAITritonExperts is not yet used by an Oracle. " - "This method should not be called." 
- ) + return False @staticmethod def _supports_quant_scheme( weight_key: QuantKey | None, activation_key: QuantKey | None, ) -> bool: - raise NotImplementedError( - "OAITritonExperts is not yet used by an Oracle. " - "This method should not be called." - ) + SUPPORTED_W_A = [ + (kMxfp4Static, None), + ] + return (weight_key, activation_key) in SUPPORTED_W_A @staticmethod def _supports_activation(activation: MoEActivation) -> bool: - raise NotImplementedError( - "OAITritonExperts is not yet used by an Oracle. " - "This method should not be called." - ) + raise NotImplementedError @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: - raise NotImplementedError( - "OAITritonExperts is not yet used by an Oracle. " - "This method should not be called." - ) + return True def supports_expert_map(self) -> bool: return True @@ -605,6 +633,10 @@ def _make_routing_data( class OAITritonExperts(BaseOAITritonExperts): """OAI Triton-based fused MoE expert implementation.""" + @staticmethod + def _supports_activation(activation: MoEActivation) -> bool: + return activation == MoEActivation.SWIGLUOAI + @staticmethod def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard @@ -689,6 +721,15 @@ class UnfusedOAITritonExperts(BaseOAITritonExperts): One use case for it is to inject LoRA modules on the activation and moe_sum. """ + @staticmethod + def _supports_activation(activation: MoEActivation) -> bool: + return activation in [ + MoEActivation.SILU, + MoEActivation.GELU, + MoEActivation.SWIGLUOAI, + MoEActivation.SWIGLUSTEP, + ] + @staticmethod def activation_format() -> mk.FusedMoEActivationFormat: return mk.FusedMoEActivationFormat.Standard @@ -814,3 +855,118 @@ def apply( ) self.moe_sum(intermediate_cache3.view(-1, topk, K), output) + + +class OAITritonMxfp4ExpertsMonolithic(mk.FusedMoEExpertsMonolithic): + """Monolithic Triton MXFP4 expert. 
Wraps triton_kernel_moe_forward().""" + + def __init__( + self, + moe_config: FusedMoEConfig, + quant_config: FusedMoEQuantConfig, + ): + super().__init__(moe_config, quant_config) + self.topk = moe_config.experts_per_token + self.renormalize = moe_config.routing_method in ( + RoutingMethodType.Renormalize, + RoutingMethodType.RenormalizeNaive, + ) + + @staticmethod + def activation_format() -> mk.FusedMoEActivationFormat: + return mk.FusedMoEActivationFormat.Standard + + @staticmethod + def _supports_current_device() -> bool: + p = current_platform + if not p.is_cuda_alike(): + return False + cap = p.get_device_capability() + if cap is None: + return False + # (9,0) <= cap < (11,0) covers CUDA SM90 (Hopper), SM100+ (Blackwell) + # and ROCm gfx942/gfx950 (which map to 9.4/9.5). + return (9, 0) <= (cap.major, cap.minor) < (11, 0) + + @staticmethod + def _supports_no_act_and_mul() -> bool: + return False + + @staticmethod + def _supports_quant_scheme( + weight_key: QuantKey | None, + activation_key: QuantKey | None, + ) -> bool: + SUPPORTED_W_A = [ + (kMxfp4Static, None), + ] + return (weight_key, activation_key) in SUPPORTED_W_A + + @staticmethod + def _supports_activation(activation: MoEActivation) -> bool: + return activation == MoEActivation.SWIGLUOAI + + @staticmethod + def _supports_parallel_config( + moe_parallel_config: FusedMoEParallelConfig, + ) -> bool: + return ( + not moe_parallel_config.use_all2all_kernels + and not moe_parallel_config.enable_eplb + and moe_parallel_config.dp_size <= 1 + ) + + @staticmethod + def _supports_routing_method( + routing_method: RoutingMethodType, + weight_key: QuantKey | None, + activation_key: QuantKey | None, + ) -> bool: + return routing_method in [ + RoutingMethodType.Renormalize, + RoutingMethodType.RenormalizeNaive, + ] + + @staticmethod + def _supports_router_logits_dtype( + router_logits_dtype: torch.dtype | None, + routing_method: RoutingMethodType, + ) -> bool: + return True + + def supports_expert_map(self) -> 
bool: + return True + + @property + def expects_unquantized_inputs(self) -> bool: + return True + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + router_logits: torch.Tensor, + activation: MoEActivation, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + apply_router_weight_on_input: bool, + # grouped topk + fused topk bias parameters + num_expert_group: int | None = None, + e_score_correction_bias: torch.Tensor | None = None, + routed_scaling_factor: float | None = None, + topk_group: int | None = None, + ) -> torch.Tensor: + return triton_kernel_moe_forward( + hidden_states=hidden_states, + w1=w1, + w2=w2, + gating_output=router_logits, + topk=self.topk, + renormalize=self.renormalize, + global_num_experts=global_num_experts, + expert_map=expert_map, + quant_config=self.quant_config, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 2f704569209c..85fd1813a363 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -52,7 +52,6 @@ QuantizationConfig, ) from vllm.platforms import current_platform -from vllm.utils.math_utils import round_up logger = init_logger(__name__) @@ -218,7 +217,6 @@ def maybe_roundup_hidden_size( moe_parallel_config: FusedMoEParallelConfig, is_lora_enabled: bool, model_type: str | None, - is_mxfp4_quant: bool, ) -> int: """ Given layer hidden size and MoE configurations, round up hidden_size @@ -232,7 +230,6 @@ def maybe_roundup_hidden_size( is used in the case of mxfp4 quantization in selecting the MxFP4Backend. model_type: for checking if gpt-oss - is_mxfp4_quant: whether the layer is quantized with mxfp4 Return: Rounded up hidden_size if rounding up is required based on the configs. 
@@ -246,28 +243,6 @@ def maybe_roundup_hidden_size( hidden_size, act_dtype, moe_parallel_config ) - # we are padding globally so EP buffer allocation works - if model_type == "gpt_oss" and is_mxfp4_quant: - from vllm.model_executor.layers.quantization.mxfp4 import ( - Mxfp4Backend, - get_mxfp4_backend, - ) - - current_mxfp4_backend = get_mxfp4_backend(is_lora_enabled) - - if ( - current_mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 - or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS - ): - hidden_size = round_up(hidden_size, 128) - elif ( - current_platform.is_rocm() - or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM - or current_mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 - or current_mxfp4_backend == Mxfp4Backend.MARLIN - ): - hidden_size = round_up(hidden_size, 256) - return hidden_size @@ -540,9 +515,6 @@ def __init__( moe_parallel_config=self.moe_parallel_config, is_lora_enabled=vllm_config.lora_config is not None, model_type=self.model_type, - is_mxfp4_quant=( - quant_config is not None and quant_config.is_mxfp4_quant(prefix, self) - ), ) self.hidden_size = hidden_size diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py new file mode 100644 index 000000000000..ddc6588dc517 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py @@ -0,0 +1,847 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from enum import Enum +from typing import Union + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm import envs +from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe import ( + FusedMoEConfig, +) +from vllm.model_executor.layers.fused_moe.all2all_utils import ( + maybe_make_prepare_finalize, +) +from vllm.model_executor.layers.fused_moe.config import ( + FusedMoEQuantConfig, + mxfp4_mxfp8_moe_quant_config, + 
mxfp4_w4a16_moe_quant_config, + ocp_mx_moe_quant_config, +) +from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( + _swizzle_mxfp4, + get_padding_alignment, +) +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + QuantKey, + kMxfp4Static, + kMxfp8Dynamic, +) +from vllm.platforms import current_platform +from vllm.utils.import_utils import has_triton_kernels +from vllm.utils.math_utils import round_up + +logger = init_logger(__name__) + +if has_triton_kernels(): + try: + from triton_kernels.matmul_ogs import PrecisionConfig + except (ImportError, AttributeError) as e: + logger.error( + "Failed to import Triton kernels. Please make sure your triton " + "version is compatible. Error: %s", + e, + ) + + +class Mxfp4MoeBackend(Enum): + NONE = "None" + # FlashInfer TRTLLM backends + FLASHINFER_TRTLLM_MXFP4_MXFP8 = "FLASHINFER_TRTLLM_MXFP4_MXFP8" + FLASHINFER_TRTLLM_MXFP4_BF16 = "FLASHINFER_TRTLLM_MXFP4_BF16" + # FlashInfer CUTLASS backends + FLASHINFER_CUTLASS_MXFP4_MXFP8 = "FLASHINFER_CUTLASS_MXFP4_MXFP8" + FLASHINFER_CUTLASS_MXFP4_BF16 = "FLASHINFER_CUTLASS_MXFP4_BF16" + # Marlin + BATCHED_MARLIN = "BATCHED_MARLIN" + MARLIN = "MARLIN" + # ROCm AITER (CK) + CK = "CK" + # Triton + TRITON = "TRITON" + TRITON_UNFUSED = "TRITON_UNFUSED" + # XPU + XPU = "XPU" + + +# Backends that share the same TRTLLM weight format +TRTLLM_BACKENDS = ( + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16, + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8, +) + +TRITON_BACKENDS = ( + Mxfp4MoeBackend.TRITON, + Mxfp4MoeBackend.TRITON_UNFUSED, +) + + +def backend_to_kernel_cls( + backend: Mxfp4MoeBackend, +) -> list[type[mk.FusedMoEExperts]]: + if backend in ( + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16, + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8, + ): + from vllm.model_executor.layers.fused_moe.experts.trtllm_mxfp4_moe import ( + TrtLlmMxfp4ExpertsModular, + TrtLlmMxfp4ExpertsMonolithic, + ) + + # NOTE: prefer Monolithic > Modular, so return Monolithic 
first. + return [TrtLlmMxfp4ExpertsMonolithic, TrtLlmMxfp4ExpertsModular] + + elif backend in ( + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16, + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8, + ): + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, + ) + + return [FlashInferExperts] + + elif backend == Mxfp4MoeBackend.TRITON: + from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + OAITritonExperts, + OAITritonMxfp4ExpertsMonolithic, + ) + + # NOTE: prefer Monolithic > Modular, so return Monolithic first. + return [OAITritonMxfp4ExpertsMonolithic, OAITritonExperts] + + elif backend == Mxfp4MoeBackend.TRITON_UNFUSED: + from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( + UnfusedOAITritonExperts, + ) + + return [UnfusedOAITritonExperts] + + elif backend == Mxfp4MoeBackend.MARLIN: + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + MarlinExperts, + ) + + return [MarlinExperts] + + elif backend == Mxfp4MoeBackend.BATCHED_MARLIN: + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + BatchedMarlinExperts, + ) + + return [BatchedMarlinExperts] + + elif backend == Mxfp4MoeBackend.CK: + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + AiterExperts, + ) + + return [AiterExperts] + + elif backend == Mxfp4MoeBackend.XPU: + raise NotImplementedError("XPU backend uses XpuMxfp4MoEMethod directly.") + else: + raise ValueError(f"Unknown MXFP4 MoE backend: {backend.value}") + + +def map_mxfp4_backend(runner_backend: str) -> Mxfp4MoeBackend: + """Map user's moe_backend string to Mxfp4MoeBackend.""" + mapping: dict[str, Mxfp4MoeBackend] = { + "flashinfer_trtllm": Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16, + "flashinfer_trtllm_afp8": Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8, + "flashinfer_cutlass": Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16, + "flashinfer_cutlass_afp8": 
Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8, + "triton": Mxfp4MoeBackend.TRITON, + "marlin": Mxfp4MoeBackend.MARLIN, + "ck": Mxfp4MoeBackend.CK, + } + if backend := mapping.get(runner_backend): + return backend + raise ValueError( + f"moe_backend='{runner_backend}' is not supported for MXFP4 MoE. " + f"Expected one of {list(mapping.keys())}." + ) + + +def _get_priority_backends() -> list[Mxfp4MoeBackend]: + """ + Get available backends in priority order based on platform and config. + Only includes BF16 backends. MXFP8 backends are selected via env vars. + """ + _AVAILABLE_BACKENDS = [ + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16, + Mxfp4MoeBackend.CK, + Mxfp4MoeBackend.TRITON, + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16, + Mxfp4MoeBackend.TRITON_UNFUSED, + Mxfp4MoeBackend.MARLIN, + Mxfp4MoeBackend.BATCHED_MARLIN, + ] + return _AVAILABLE_BACKENDS + + +def _backend_activation_key(backend: Mxfp4MoeBackend) -> QuantKey | None: + """Map backend to its activation key (MXFP8 or None for BF16).""" + if backend in ( + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8, + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8, + ): + return kMxfp8Dynamic + return None + + +def select_mxfp4_moe_backend( + config: FusedMoEConfig, +) -> tuple[Mxfp4MoeBackend, type[mk.FusedMoEExperts] | None]: + """ + Select the primary MXFP4 MoE backend. + Note: Shape-specific fallbacks may still occur at runtime. 
+ """ + triton_kernels_supported = has_triton_kernels() and ( + 9, + 0, + ) <= current_platform.get_device_capability() < (11, 0) + + # LoRA: separate experts backend path + if config.is_lora_enabled: + if not current_platform.is_cuda(): + raise NotImplementedError("Mxfp4 LoRA only supported on CUDA Platform.") + if envs.VLLM_MXFP4_USE_MARLIN is False and triton_kernels_supported: + logger.info_once("Using Triton backend for mxfp4 lora") + return Mxfp4MoeBackend.TRITON_UNFUSED, backend_to_kernel_cls( + Mxfp4MoeBackend.TRITON_UNFUSED + )[0] + logger.info_once("Using Marlin backend for mxfp4 lora") + return Mxfp4MoeBackend.MARLIN, backend_to_kernel_cls(Mxfp4MoeBackend.MARLIN)[0] + + activation_format = ( + mk.FusedMoEActivationFormat.BatchedExperts + if config.moe_parallel_config.use_batched_activation_format + else mk.FusedMoEActivationFormat.Standard + ) + + def _make_log_backend(backend: Mxfp4MoeBackend): + return f"Using '{backend.value}' Mxfp4 MoE backend." + + def _make_log_unsupported(backend: Mxfp4MoeBackend, reason: str | None) -> str: + if reason: + return ( + f"Mxfp4 MoE backend '{backend.value}' does not support the " + f"deployment configuration since {reason}." + ) + return ( + f"Mxfp4 MoE backend '{backend.value}' does not support the " + "deployment configuration." 
+ ) + + def _return_or_raise( + backend: Mxfp4MoeBackend, + config: FusedMoEConfig, + weight_key: QuantKey | None, + activation_key: QuantKey | None, + activation_format: mk.FusedMoEActivationFormat, + ) -> tuple[Mxfp4MoeBackend, type[mk.FusedMoEExperts]]: + reason: str | None = None + for k_cls in backend_to_kernel_cls(backend): + supported, reason = k_cls.is_supported_config( + k_cls, config, weight_key, activation_key, activation_format + ) + if supported: + logger.info_once(_make_log_backend(backend), scope="local") + return backend, k_cls + raise ValueError(_make_log_unsupported(backend, reason)) + + runner_backend = config.moe_backend + if runner_backend != "auto": + requested_backend = map_mxfp4_backend(runner_backend) + if ( + activation_format == mk.FusedMoEActivationFormat.BatchedExperts + and requested_backend == Mxfp4MoeBackend.MARLIN + ): + requested_backend = Mxfp4MoeBackend.BATCHED_MARLIN + return _return_or_raise( + requested_backend, + config, + kMxfp4Static, + _backend_activation_key(requested_backend), + activation_format, + ) + + # Select kernels in order of backend. + AVAILABLE_BACKENDS = _get_priority_backends() + + # Handle explicit FlashInfer MXFP4 BF16 configuration. + if envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_BF16"): + if not envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: + AVAILABLE_BACKENDS.remove(Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16) + AVAILABLE_BACKENDS.remove(Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16) + else: + if current_platform.is_device_capability(90): + return _return_or_raise( + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16, + config, + kMxfp4Static, + None, + activation_format, + ) + if current_platform.is_device_capability_family(100): + return _return_or_raise( + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16, + config, + kMxfp4Static, + None, + activation_format, + ) + raise ValueError( + "VLLM_USE_FLASHINFER_MOE_MXFP4_BF16=1 is set but the " + "current device capability is not supported. 
" + "Only SM90 (CUTLASS) and SM100+ (TRTLLM) are supported." + ) + + # Handle explicit FlashInfer MXFP4 MXFP8 TRTLLM configuration. + if ( + envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8") + and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 + ): + return _return_or_raise( + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8, + config, + kMxfp4Static, + kMxfp8Dynamic, + activation_format, + ) + + # Handle explicit FlashInfer MXFP4 MXFP8 CUTLASS configuration. + if ( + envs.is_set("VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS") + and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS + ): + return _return_or_raise( + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8, + config, + kMxfp4Static, + kMxfp8Dynamic, + activation_format, + ) + + # Handle explicit Marlin MXFP4 configuration. + if envs.is_set("VLLM_MXFP4_USE_MARLIN") and envs.VLLM_MXFP4_USE_MARLIN: + return _return_or_raise( + Mxfp4MoeBackend.MARLIN, + config, + kMxfp4Static, + None, + activation_format, + ) + + for backend in AVAILABLE_BACKENDS: + activation_key = _backend_activation_key(backend) + for k_cls in backend_to_kernel_cls(backend): + supported, reason = k_cls.is_supported_config( + k_cls, config, kMxfp4Static, activation_key, activation_format + ) + if supported: + logger.info_once(_make_log_backend(backend), scope="local") + return backend, k_cls + else: + logger.debug_once(_make_log_unsupported(backend, reason), scope="local") + + if current_platform.is_xpu(): + backend = Mxfp4MoeBackend.XPU + logger.info_once(_make_log_backend(backend)) + return backend, None + + if current_platform.is_cuda() or current_platform.is_rocm(): + raise NotImplementedError( + "No MXFP4 MoE backend supports the deployment configuration." 
+ ) + + return Mxfp4MoeBackend.NONE, None + + +def mxfp4_round_up_hidden_size_and_intermediate_size( + backend: Mxfp4MoeBackend, hidden_size: int, intermediate_size: int +) -> tuple[int, int]: + """Round up hidden_size and intermediate_size based on backend requirements.""" + if backend in (Mxfp4MoeBackend.MARLIN, Mxfp4MoeBackend.BATCHED_MARLIN): + intermediate_size = round_up(intermediate_size, 128) + if current_platform.is_xpu(): + hidden_size = round_up(hidden_size, 128) + else: + hidden_size = round_up(hidden_size, 256) + elif backend in TRTLLM_BACKENDS: + intermediate_size = round_up(intermediate_size, 256) + hidden_size = round_up(hidden_size, 256) + elif backend in ( + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16, + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8, + ): + intermediate_size = round_up(intermediate_size, 128) + hidden_size = round_up(hidden_size, 128) + elif current_platform.is_rocm(): + pad_align = get_padding_alignment() + intermediate_size = round_up(intermediate_size, pad_align) + hidden_size = round_up(hidden_size, pad_align) + else: + intermediate_size = round_up(intermediate_size, 64) + return hidden_size, intermediate_size + + +def convert_to_mxfp4_moe_kernel_format( + mxfp4_backend: Mxfp4MoeBackend, + layer: torch.nn.Module, + w13_weight: torch.Tensor, + w2_weight: torch.Tensor, + w13_weight_scale: torch.Tensor, + w2_weight_scale: torch.Tensor, + w13_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, + _cache_permute_indices: dict[torch.Size, torch.Tensor] | None = None, +) -> tuple[ + torch.Tensor, + torch.Tensor, + Union[torch.Tensor, "PrecisionConfig"], + Union[torch.Tensor, "PrecisionConfig"], + torch.Tensor | None, + torch.Tensor | None, +]: + """Convert loaded weights into backend-specific kernel format.""" + + num_experts = w13_weight.shape[0] + intermediate_size = w13_weight.shape[1] // 2 + hidden_size = w13_weight.shape[2] * 2 + + sf_block_size = 32 # mxfp4 block size + + if mxfp4_backend in 
(Mxfp4MoeBackend.MARLIN, Mxfp4MoeBackend.BATCHED_MARLIN): + from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( + prepare_moe_mxfp4_layer_for_marlin, + ) + + return prepare_moe_mxfp4_layer_for_marlin( + layer, + w13_weight, + w2_weight, + w13_weight_scale, + w2_weight_scale, + w13_bias, + w2_bias, + ) + + elif mxfp4_backend in TRTLLM_BACKENDS: + assert _cache_permute_indices is not None + from flashinfer.fp4_quantization import nvfp4_block_scale_interleave + from flashinfer.fused_moe.core import get_w2_permute_indices_with_cache + + # gemm1_alpha/beta/clamp_limit are created by the expert class + # (TrtLlmMxfp4ExpertsBase), not on the layer. + + w13_weight = w13_weight.data + w2_weight = w2_weight.data + w13_weight_scale = w13_weight_scale.data + w2_weight_scale = w2_weight_scale.data + assert w13_bias is not None and w2_bias is not None + w13_bias = w13_bias.data.to(torch.float32) + w2_bias = w2_bias.data.to(torch.float32) + + # Swap w1 and w3 as the definition of swiglu is different in trtllm-gen + def swap_every_two_rows(x, axis=-1): + shape = x.shape + if axis < 0: + axis = len(shape) + axis + new_shape = list(shape) + new_shape[axis] = shape[axis] // 2 + new_shape.insert(axis + 1, 2) + x = x.reshape(*new_shape) + x = x.flip(axis + 1) + new_shape = list(shape) + return x.reshape(*new_shape) + + w13_weight_scale = swap_every_two_rows(w13_weight_scale, -2) + w13_weight = swap_every_two_rows(w13_weight, -2) + w13_bias = swap_every_two_rows(w13_bias, -1) + + # Shuffle weights and scaling factors for transposed mma output + gemm1_weights_shuffled = [] + gemm1_scales_shuffled = [] + gemm2_weights_shuffled = [] + gemm2_scales_shuffled = [] + gemm1_bias_shuffled = [] + gemm2_bias_shuffled = [] + epilogue_tile_m = 128 + for i in range(num_experts): + # w13 weight + permute_indices = get_w2_permute_indices_with_cache( + _cache_permute_indices, + w13_weight[i].view(torch.uint8), + epilogue_tile_m, + ) + gemm1_weights_shuffled.append( + 
w13_weight[i] + .view(torch.uint8)[permute_indices.to(w13_weight.device)] + .contiguous() + ) + # w13 scale + permute_sf_indices = get_w2_permute_indices_with_cache( + _cache_permute_indices, + w13_weight_scale[i].view(torch.uint8), + epilogue_tile_m, + num_elts_per_sf=16, + ) + gemm1_scales_shuffled.append( + nvfp4_block_scale_interleave( + w13_weight_scale[i] + .view(torch.uint8)[permute_sf_indices.to(w13_weight_scale.device)] + .contiguous() + ) + ) + # w13 bias + permute_bias_indices = get_w2_permute_indices_with_cache( + _cache_permute_indices, + w13_bias[i].clone().reshape(-1, 1), + epilogue_tile_m, + ) + gemm1_bias_shuffled.append( + w13_bias[i] + .clone() + .reshape(-1, 1)[permute_bias_indices.to(w13_bias.device)] + .contiguous() + ) + # w2 weight + permute_indices = get_w2_permute_indices_with_cache( + _cache_permute_indices, + w2_weight[i].view(torch.uint8), + epilogue_tile_m, + ) + gemm2_weights_shuffled.append( + w2_weight[i] + .view(torch.uint8)[permute_indices.to(w2_weight.device)] + .contiguous() + ) + # w2 scale + permute_sf_indices = get_w2_permute_indices_with_cache( + _cache_permute_indices, + w2_weight_scale[i].view(torch.uint8), + epilogue_tile_m, + num_elts_per_sf=16, + ) + gemm2_scales_shuffled.append( + nvfp4_block_scale_interleave( + w2_weight_scale[i] + .view(torch.uint8)[permute_sf_indices.to(w2_weight_scale.device)] + .contiguous() + ) + ) + # w2 bias + permute_indices = get_w2_permute_indices_with_cache( + _cache_permute_indices, + w2_bias[i].clone().reshape(-1, 1), + epilogue_tile_m, + ) + gemm2_bias_shuffled.append( + w2_bias[i] + .clone() + .reshape(-1, 1)[permute_indices.to(w2_bias.device)] + .contiguous() + ) + + w13_weight = torch.stack(gemm1_weights_shuffled) + w13_weight_scale = ( + torch.stack(gemm1_scales_shuffled) + .reshape(num_experts, 2 * intermediate_size, hidden_size // sf_block_size) + .view(torch.float8_e4m3fn) + ) + w2_weight = torch.stack(gemm2_weights_shuffled) + w2_weight_scale = ( + 
torch.stack(gemm2_scales_shuffled) + .reshape(num_experts, hidden_size, intermediate_size // sf_block_size) + .view(torch.float8_e4m3fn) + ) + w13_bias = torch.stack(gemm1_bias_shuffled).reshape(num_experts, -1) + w2_bias = torch.stack(gemm2_bias_shuffled).reshape(num_experts, -1) + + return ( + w13_weight, + w2_weight, + w13_weight_scale, + w2_weight_scale, + w13_bias, + w2_bias, + ) + + elif mxfp4_backend in ( + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16, + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8, + ): + # De-interleave and swap for w13 weight, bias, and scales + w13_w = w13_weight.data + gate_w, up_w = w13_w[:, ::2, :], w13_w[:, 1::2, :] + deinterleaved_w13_w = torch.cat([gate_w, up_w], dim=1) + w1_w, w3_w = torch.chunk(deinterleaved_w13_w, 2, dim=1) + w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1) + + assert w13_bias is not None and w2_bias is not None + w13_b = w13_bias.data.to(torch.float32) + gate_b, up_b = w13_b[:, ::2], w13_b[:, 1::2] + deinterleaved_w13_b = torch.cat([gate_b, up_b], dim=1) + b1, b3 = torch.chunk(deinterleaved_w13_b, 2, dim=-1) + w13_bias_swapped = torch.cat([b3, b1], dim=-1).to(torch.bfloat16) + + w13_s = w13_weight_scale.data + gate_s, up_s = w13_s[:, ::2, :], w13_s[:, 1::2, :] + deinterleaved_w13_s = torch.cat([gate_s, up_s], dim=1) + s1, s3 = torch.chunk(deinterleaved_w13_s, 2, dim=1) + w13_scale_swapped = torch.cat([s3, s1], dim=1) + + if mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8: + from flashinfer import block_scale_interleave + + orig_shape = w13_scale_swapped.shape + w13_scale_interleaved = block_scale_interleave( + w13_scale_swapped.view(torch.uint8) + ).reshape(orig_shape) + + w2_s = w2_weight_scale.data + orig_shape = w2_s.shape + w2_scale_interleaved = block_scale_interleave( + w2_s.view(torch.uint8) + ).reshape(orig_shape) + + return ( + w13_weight_swapped, + w2_weight, + w13_scale_interleaved, + w2_scale_interleaved, + w13_bias_swapped, + w2_bias, + ) + + else: + assert mxfp4_backend == 
Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16 + + def _interleave_mxfp4_cutlass_sm90(w): + w_shape = w.shape + w_interleaved = w.reshape(w_shape[0], w_shape[1], (w_shape[2] // 4), 4) + w_interleaved = w_interleaved.permute(0, 2, 1, 3) + w_interleaved = w_interleaved.reshape( + w_shape[0], w_shape[2] // 4, w_shape[1] * 4 + ) + return w_interleaved + + w31_scales = w13_scale_swapped.to(torch.uint8) + w31_scales_interleaved = _interleave_mxfp4_cutlass_sm90(w31_scales) + + w2_scale = w2_weight_scale.data.to(torch.uint8) + w2_scale_interleaved = _interleave_mxfp4_cutlass_sm90(w2_scale) + + return ( + w13_weight_swapped, + w2_weight, + w31_scales_interleaved, + w2_scale_interleaved, + w13_bias_swapped, + w2_bias, + ) + + elif mxfp4_backend == Mxfp4MoeBackend.CK: + from vllm._aiter_ops import rocm_aiter_ops + + if w13_bias is not None: + w13_bias = w13_bias.data.to(torch.float32) + if w2_bias is not None: + w2_bias = w2_bias.data.to(torch.float32) + + e, n, k = w13_weight.shape + + # De-interleave w13 rows: gate/up pairs -> contiguous gate, up blocks + w13_weight.view(torch.uint8).copy_( + w13_weight.data.view(torch.uint8) + .view(e, n // 2, 2, k) + .permute(0, 2, 1, 3) + .contiguous() + .view(e, n, k) + ) + w13_weight_scale.data = ( + w13_weight_scale.data.view(e, n // 2, 2, -1) + .permute(0, 2, 1, 3) + .contiguous() + .view(e, n, -1) + ) + + # View as native FP4 dtype for AITER shuffle + w13_weight.data = w13_weight.data.view(torch.float4_e2m1fn_x2) + w2_weight.data = w2_weight.data.view(torch.float4_e2m1fn_x2) + + # Shuffle weights and scales for AITER CK kernel layout + w13_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(w13_weight, 16, True) + shuffled_w13_scale = rocm_aiter_ops.shuffle_scale_a16w4( + w13_weight_scale.view(-1, w13_weight_scale.shape[-1]), + num_experts, + True, + ) + + w2_weight.data = rocm_aiter_ops.shuffle_weight_a16w4(w2_weight, 16, False) + shuffled_w2_scale = rocm_aiter_ops.shuffle_scale_a16w4( + w2_weight_scale.view(-1, 
w2_weight_scale.shape[-1]), + num_experts, + False, + ) + + # Permute bias to match de-interleaved weight layout + if w13_bias is not None: + w13_bias = ( + w13_bias.data.view(-1, n // 2, 2) + .permute(0, 2, 1) + .contiguous() + .view(-1, n) + ) + + return ( + w13_weight, + w2_weight, + shuffled_w13_scale, + shuffled_w2_scale, + w13_bias, + w2_bias, + ) + + elif mxfp4_backend in TRITON_BACKENDS: + from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig + + assert w13_bias is not None and w2_bias is not None + w13_bias = w13_bias.to(torch.float32) + w2_bias = w2_bias.to(torch.float32) + + w13_weight, w13_flex, w13_scale = _swizzle_mxfp4( + w13_weight, + w13_weight_scale, + ) + w2_weight, w2_flex, w2_scale = _swizzle_mxfp4( + w2_weight, + w2_weight_scale, + ) + + w13_precision_config = PrecisionConfig( + weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex) + ) + w2_precision_config = PrecisionConfig( + weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex) + ) + + del layer.w13_weight + del layer.w2_weight + + return ( + w13_weight, + w2_weight, + w13_precision_config, + w2_precision_config, + w13_bias, + w2_bias, + ) + else: + raise ValueError( + f"Unsupported mxfp4_backend: {mxfp4_backend}: " + f"should be one of: {list(Mxfp4MoeBackend)}." 
+ ) + + +def make_mxfp4_moe_quant_config( + mxfp4_backend: Mxfp4MoeBackend, + w1_scale: Union[torch.Tensor, "PrecisionConfig"], + w2_scale: Union[torch.Tensor, "PrecisionConfig"], + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, +) -> FusedMoEQuantConfig | None: + """Create a FusedMoEQuantConfig for the given MXFP4 backend.""" + if mxfp4_backend in ( + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8, + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_MXFP8, + ): + return mxfp4_mxfp8_moe_quant_config( + w1_bias=w1_bias, + w2_bias=w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) + elif mxfp4_backend in ( + Mxfp4MoeBackend.MARLIN, + Mxfp4MoeBackend.BATCHED_MARLIN, + Mxfp4MoeBackend.TRITON, + Mxfp4MoeBackend.TRITON_UNFUSED, + Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_BF16, + Mxfp4MoeBackend.FLASHINFER_CUTLASS_MXFP4_BF16, + Mxfp4MoeBackend.CK, + ): + return mxfp4_w4a16_moe_quant_config( + w1_bias=w1_bias, + w2_bias=w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) + else: + return ocp_mx_moe_quant_config( + quant_dtype="mxfp4", + w1_bias=w1_bias, + w2_bias=w2_bias, + w1_scale=w1_scale, + w2_scale=w2_scale, + ) + + +def make_mxfp4_moe_kernel( + moe_quant_config: FusedMoEQuantConfig, + moe_config: FusedMoEConfig, + experts_cls: type[mk.FusedMoEExperts], + mxfp4_backend: Mxfp4MoeBackend, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + shared_experts: torch.nn.Module | None = None, +) -> mk.FusedMoEKernel: + """Create a FusedMoEKernel for the given MXFP4 backend.""" + is_monolithic = issubclass(experts_cls, mk.FusedMoEExpertsMonolithic) + + # Create Prepare/Finalize. + prepare_finalize = maybe_make_prepare_finalize( + moe=moe_config, + quant_config=moe_quant_config, + routing_tables=routing_tables, + allow_new_interface=True, + use_monolithic=is_monolithic, + ) + assert prepare_finalize is not None + + logger.info_once("Using %s", prepare_finalize.__class__.__name__, scope="local") + + # Create Experts. 
+ if prepare_finalize.activation_format == mk.FusedMoEActivationFormat.BatchedExperts: + max_num_tokens = prepare_finalize.max_num_tokens_per_rank() + assert max_num_tokens is not None + experts = experts_cls( + moe_config=moe_config, + quant_config=moe_quant_config, + max_num_tokens=max_num_tokens, + num_dispatchers=prepare_finalize.num_dispatchers(), + ) + else: + experts = experts_cls( + moe_config=moe_config, + quant_config=moe_quant_config, + ) + + kernel = mk.FusedMoEKernel( + prepare_finalize, + experts, + shared_experts=( + shared_experts + if moe_config.moe_parallel_config.use_deepep_ll_kernels + else None + ), + moe_parallel_config=moe_config.moe_parallel_config, + inplace=( + not moe_config.disable_inplace and mxfp4_backend not in TRTLLM_BACKENDS + ), + ) + + return kernel diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py index 8a224cb39e7c..35451e87dd7d 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py @@ -14,7 +14,6 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, - mxfp4_w4a16_moe_quant_config, nvfp4_moe_quant_config, nvfp4_w4a16_moe_quant_config, ) @@ -87,7 +86,7 @@ def backend_to_kernel_cls( return [FlashInferExperts] elif backend == NvFp4MoeBackend.FLASHINFER_CUTEDSL: - from vllm.model_executor.layers.fused_moe.flashinfer_cutedsl_moe import ( + from vllm.model_executor.layers.fused_moe.experts.flashinfer_cutedsl_moe import ( # noqa: E501 FlashInferCuteDSLExperts, ) @@ -347,16 +346,6 @@ def convert_to_nvfp4_moe_kernel_format( ) -def make_mxfp4_moe_quant_config( - w13_scale: torch.Tensor, - w2_scale: torch.Tensor, -) -> FusedMoEQuantConfig: - return mxfp4_w4a16_moe_quant_config( - w1_scale=w13_scale, - w2_scale=w2_scale, - ) - - def make_nvfp4_moe_quant_config( backend: NvFp4MoeBackend, w13_scale: torch.Tensor, diff --git 
a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index b9f161ae88ec..98af53fcec58 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -24,6 +24,7 @@ kFp8Static128BlockSym, kFp8StaticChannelSym, kFp8StaticTensorSym, + kMxfp4Static, ) @@ -201,6 +202,8 @@ def rocm_aiter_fused_experts( activation_method = ActivationMethod.SILU elif activation == MoEActivation.GELU: activation_method = ActivationMethod.GELU + elif activation == MoEActivation.SWIGLUOAI: + activation_method = rocm_aiter_ops.get_aiter_activation_type("swiglu") else: raise ValueError(f"Unsupported activation: {activation}") @@ -247,8 +250,8 @@ def rocm_aiter_fused_experts( else: quant_method = QuantMethod.NO.value - # quark moe for mxfp4 w_dtype mxfp4 a_dtype - if quant_config.use_mxfp4_w4a4: + # mxfp4: both w4a4 (quark) and w4a16 (oracle CK) use BLOCK_1X32 + if quant_config.use_mxfp4_w4a4 or quant_config.use_mxfp4_w4a16: quant_method = QuantMethod.BLOCK_1X32.value # w8a8 block-scaled if quant_config.block_shape is not None and quant_config.use_fp8_w8a8: @@ -289,6 +292,8 @@ def rocm_aiter_fused_experts( doweight_stage1=apply_router_weight_on_input, num_local_tokens=num_local_tokens, output_dtype=output_dtype, + bias1=quant_config.w1_bias if quant_config.use_mxfp4_w4a16 else None, + bias2=quant_config.w2_bias if quant_config.use_mxfp4_w4a16 else None, ) @@ -319,21 +324,23 @@ def _supports_quant_scheme( weight_key: QuantKey | None, activation_key: QuantKey | None, ) -> bool: - # TODO(rob): AITER also supports MXFP4, which is not - # yet supported via an Oracle. Once it is, we will add - # MXFP4 to this list. 
SUPPORTED_W_A = [ (None, None), (kFp8Static128BlockSym, kFp8Dynamic128Sym), (kFp8StaticTensorSym, kFp8StaticTensorSym), (kFp8StaticTensorSym, kFp8DynamicTensorSym), (kFp8StaticChannelSym, kFp8DynamicTokenSym), + (kMxfp4Static, None), ] return (weight_key, activation_key) in SUPPORTED_W_A @staticmethod def _supports_activation(activation: MoEActivation) -> bool: - return activation in [MoEActivation.SILU, MoEActivation.GELU] + return activation in [ + MoEActivation.SILU, + MoEActivation.GELU, + MoEActivation.SWIGLUOAI, + ] @staticmethod def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: diff --git a/vllm/model_executor/layers/fused_moe/trtllm_moe.py b/vllm/model_executor/layers/fused_moe/trtllm_moe.py deleted file mode 100644 index 30ed77a8b64b..000000000000 --- a/vllm/model_executor/layers/fused_moe/trtllm_moe.py +++ /dev/null @@ -1,184 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -import torch - -import vllm.model_executor.layers.fused_moe.modular_kernel as mk -from vllm.model_executor.layers.fused_moe.activation import MoEActivation -from vllm.model_executor.layers.fused_moe.config import ( - FusedMoEConfig, - FusedMoEParallelConfig, - FusedMoEQuantConfig, -) -from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( - TopKWeightAndReduceNoOP, -) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - QuantKey, -) - - -class TrtLlmGenExperts(mk.FusedMoEExpertsModular): - """TensorRT-LLM-based fused MoE expert implementation.""" - - def __init__( - self, - moe_config: FusedMoEConfig, - quant_config: FusedMoEQuantConfig, - max_capture_size, - ): - super().__init__(moe_config, quant_config) - self.device = torch.accelerator.current_device_index() - self.num_experts = moe_config.num_local_experts - self.gemm1_alpha = torch.tensor( - [1.702] * self.num_experts, dtype=torch.float32, device=self.device - ) - self.gemm1_beta = 
torch.tensor( - [1.0] * self.num_experts, dtype=torch.float32, device=self.device - ) - self.gemm1_clamp_limit = torch.tensor( - [7.0] * self.num_experts, dtype=torch.float32, device=self.device - ) - self.max_capture_size = max_capture_size - - @staticmethod - def activation_format() -> mk.FusedMoEActivationFormat: - return mk.FusedMoEActivationFormat.Standard - - @staticmethod - def _supports_current_device() -> bool: - raise NotImplementedError( - "TrtLlmGenExperts is not yet used by an Oracle. " - "This method should not be called." - ) - - @staticmethod - def _supports_no_act_and_mul() -> bool: - raise NotImplementedError( - "TrtLlmGenExperts is not yet used by an Oracle. " - "This method should not be called." - ) - - @staticmethod - def _supports_quant_scheme( - weight_key: QuantKey | None, - activation_key: QuantKey | None, - ) -> bool: - raise NotImplementedError( - "TrtLlmGenExperts is not yet used by an Oracle. " - "This method should not be called." - ) - - @staticmethod - def _supports_activation(activation: MoEActivation) -> bool: - raise NotImplementedError( - "TrtLlmGenExperts is not yet used by an Oracle. " - "This method should not be called." - ) - - @staticmethod - def _supports_parallel_config(moe_parallel_config: FusedMoEParallelConfig) -> bool: - raise NotImplementedError( - "TrtLlmGenExperts is not yet used by an Oracle. " - "This method should not be called." - ) - - def supports_expert_map(self) -> bool: - return True - - def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: - return TopKWeightAndReduceNoOP() - - def workspace_shapes( - self, - M: int, - N: int, - K: int, - topk: int, - global_num_experts: int, - local_num_experts: int, - expert_tokens_meta: mk.ExpertTokensMetadata | None, - activation: MoEActivation, - ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: - # The workspaces for this implementation are managed by flashinfer. 
- workspace1 = (0,) - workspace2 = (0,) - output = (M, K) - return (workspace1, workspace2, output) - - def apply( - self, - output: torch.Tensor, - hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - activation: MoEActivation, - global_num_experts: int, - expert_map: torch.Tensor | None, - a1q_scale: torch.Tensor | None, - a2_scale: torch.Tensor | None, - workspace13: torch.Tensor, - workspace2: torch.Tensor, - expert_tokens_meta: mk.ExpertTokensMetadata | None, - apply_router_weight_on_input: bool, - ): - topk = topk_ids.size(-1) - local_num_experts = w1.size(0) - intermediate_size = w2.size(1) - local_expert_offset = self.moe_config.ep_rank * local_num_experts - - x_quant = hidden_states - x_scale = a1q_scale - if x_scale is not None: - x_scale = x_scale.view(torch.float8_e4m3fn).reshape(*x_quant.shape[:-1], -1) - - packed_tensor = (topk_ids.to(torch.int32) << 16) | topk_weights.to( - torch.bfloat16 - ).view(torch.int16) - - assert self.w1_scale is not None - assert self.w2_scale is not None - kwargs = { - "topk_ids": packed_tensor, - "routing_bias": None, - "hidden_states": x_quant, - "hidden_states_scale": x_scale, - "gemm1_weights": w1, - "gemm1_weights_scale": self.w1_scale, - "gemm1_bias": self.w1_bias, - "gemm1_alpha": self.gemm1_alpha, - "gemm1_beta": self.gemm1_beta, - "gemm1_clamp_limit": self.gemm1_clamp_limit, - "gemm2_weights": w2, - "gemm2_weights_scale": self.w2_scale, - "gemm2_bias": self.w2_bias, - "output1_scale_scalar": None, - "output1_scale_gate_scalar": None, - "output2_scale_scalar": None, - "num_experts": global_num_experts, - "top_k": topk, - "n_group": None, - "topk_group": None, - "intermediate_size": intermediate_size, - "local_expert_offset": local_expert_offset, - "local_num_experts": local_num_experts, - "routed_scaling_factor": None, - "routing_method_type": 1, - "do_finalize": True, - "output": output, - "tune_max_num_tokens": max(self.max_capture_size, 1), 
- } - - from flashinfer import trtllm_fp4_block_scale_routed_moe - - from vllm.utils.flashinfer import autotune - - with autotune(False): - # Enable autotune when, - # https://github.com/flashinfer-ai/flashinfer/issues/2023 is - # resolved. - trtllm_fp4_block_scale_routed_moe(**kwargs) - - return output diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 4adb7f1cfa0e..ba4494f6cdc3 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -25,6 +25,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( per_tensor_dequantize, ) +from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.math_utils import cdiv from vllm.utils.torch_utils import is_torch_equal_or_newer @@ -265,7 +266,7 @@ def moe_kernel_quantize_input( # weights are already dequantized, and we proceed with normal # activation quantization below. - if quant_dtype == torch.float8_e4m3fn: + if quant_dtype == current_platform.fp8_dtype(): return _fp8_quantize(A, A_scale, per_act_token_quant, block_shape) elif quant_dtype == torch.int8: return _int8_quantize(A, A_scale, per_act_token_quant, block_shape) @@ -316,27 +317,6 @@ def normalize_batched_scales_shape( return scales -def _validate_scale_shape( - a: torch.Tensor, - a_scale: torch.Tensor | None, - per_act_token_quant: bool, - block_shape: list[int] | None, -) -> None: - if a_scale is None: - return - - if not per_act_token_quant and block_shape is None: - assert a_scale.numel() == 1, f"{a_scale.shape}" - elif per_act_token_quant: - assert a_scale.shape[0] == a.shape[0] and a_scale.shape[1] == 1, ( - f"{a_scale.shape[0]} == {a.shape[0]} and {a_scale.shape[1]} == 1" - ) - else: - assert block_shape is not None - expected = (a.shape[0], cdiv(a.shape[1], block_shape[1])) - assert a_scale.shape == expected, f"{a_scale.shape} == {expected}" - - # Torch custom ops can't deal with 
outputs aliasing inputs so we need to # disable inplace for torch >= 2.9. # See https://github.com/vllm-project/vllm/issues/26378 diff --git a/vllm/model_executor/layers/pooler/activations.py b/vllm/model_executor/layers/pooler/activations.py index b57e6ba68b94..4213ee7b85cb 100644 --- a/vllm/model_executor/layers/pooler/activations.py +++ b/vllm/model_executor/layers/pooler/activations.py @@ -16,25 +16,22 @@ logger = init_logger(__name__) -def get_classification_act_fn( +def get_act_fn( config: PretrainedConfig, + static_num_labels: bool = True, ) -> "PoolerActivation": + # get classification act_fn # Implement alignment with transformers ForSequenceClassificationLoss # https://github.com/huggingface/transformers/blob/57bb6db6ee4cfaccc45b8d474dfad5a17811ca60/src/transformers/loss/loss_utils.py#L92 problem_type = getattr(config, "problem_type", "") if problem_type == "regression": return PoolerIdentity() if problem_type == "single_label_classification": - return PoolerClassify() + return PoolerClassify(static_num_labels=static_num_labels) if problem_type == "multi_label_classification": return PoolerMultiLabelClassify() - return PoolerClassify() - - -def get_cross_encoder_act_fn( - config: PretrainedConfig, -) -> "PoolerActivation": + # get cross_encoder act_fn function_name: str | None = None if ( hasattr(config, "sentence_transformers") @@ -55,24 +52,16 @@ def get_cross_encoder_act_fn( fn = resolve_obj_by_qualname(function_name)() return PoolerActivation.wraps(fn) - return PoolerClassify() + return PoolerClassify(static_num_labels=static_num_labels) def resolve_classifier_act_fn( model_config: ModelConfig, static_num_labels: bool = True, - act_fn: "PoolerActivation | str | None" = None, + act_fn: "PoolerActivation | None" = None, ): - if isinstance(act_fn, str): - if act_fn == "classify": - return get_classification_act_fn(model_config.hf_config) - if act_fn == "score": - return get_cross_encoder_act_fn(model_config.hf_config) - - raise ValueError(f"act_fn 
[{act_fn=}] not supported.") - if act_fn is None: - return PoolerClassify(static_num_labels=static_num_labels) + return get_act_fn(model_config.hf_config, static_num_labels) assert callable(act_fn) return act_fn @@ -97,9 +86,8 @@ def forward_chunk(self, pooled_data: torch.Tensor) -> torch.Tensor: def forward(self, pooled_data: _T) -> _T: # shape: - # classify (& score) -> (batch_size, num_classes) - # embed -> (batch_size, embedding_dim) or list(embedding_dim) - # (batch_size, dimensions) or list(dimensions) if using MRL + # classify -> (batch_size, num_classes) + # embed -> (batch_size, embedding_size) or list(embedding_size) if isinstance(pooled_data, list): return [self.forward_chunk(data) for data in pooled_data] diff --git a/vllm/model_executor/layers/pooler/seqwise/heads.py b/vllm/model_executor/layers/pooler/seqwise/heads.py index 42059284e5cd..31a961223927 100644 --- a/vllm/model_executor/layers/pooler/seqwise/heads.py +++ b/vllm/model_executor/layers/pooler/seqwise/heads.py @@ -56,29 +56,31 @@ def forward( if isinstance(pooled_data, list): pooled_data = torch.stack(pooled_data) - # pooled_data shape: [batchsize, hidden_dimension] + # pooled_data shape: [batchsize, hidden_size] if self.head_dtype is not None: pooled_data = pooled_data.to(self.head_dtype) # Apply ST projector if self.projector is not None: - pooled_data = self.projector(pooled_data) - # pooled_data shape: [batchsize, embedding_dimension] + embeddings = self.projector(pooled_data) + else: + embeddings = pooled_data + # embeddings shape: [batchsize, embedding_size] # for matryoshka representation dimensions_list = [pooling_param.dimensions for pooling_param in pooling_params] if any(d is not None for d in dimensions_list): # change the output dimension - assert len(pooled_data) == len(dimensions_list) - if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list): + assert len(embeddings) == len(dimensions_list) + if len(set(dimensions_list)) == 1 and not isinstance(embeddings, 
list): # if all dimensions are the same d = dimensions_list[0] - pooled_data = pooled_data[..., :d] + embeddings = embeddings[..., :d] else: - pooled_data = [ + embeddings = [ vecs if d is None else vecs[..., :d] - for vecs, d in zip(pooled_data, dimensions_list) + for vecs, d in zip(embeddings, dimensions_list) ] # for normalize @@ -86,15 +88,15 @@ def forward( flags = [p.use_activation for p in pooling_params] if len(set(flags)) == 1: if flags[0]: - pooled_data = self.activation(pooled_data) + embeddings = self.activation(embeddings) else: - pooled_data = [ + embeddings = [ self.activation(vecs) if f else vecs - for vecs, f in zip(pooled_data, flags) + for vecs, f in zip(embeddings, flags) ] - # pooled_data shape: [batchsize, embedding_dimension] - return pooled_data + # embeddings shape: [batchsize, embedding_size] + return embeddings class ClassifierPoolerHead(SequencePoolerHead): @@ -113,7 +115,7 @@ def __init__( self.activation = activation def get_supported_tasks(self) -> Set[PoolingTask]: - return {"classify", "score"} + return {"classify"} def forward( self, @@ -131,21 +133,23 @@ def forward( pooled_data = pooled_data.to(self.head_dtype) if self.classifier is not None: - pooled_data = self.classifier(pooled_data) - # pooled_data shape: [batchsize, num_labels] + logits = self.classifier(pooled_data) + else: + logits = pooled_data + # logits shape: [batchsize, num_labels] if self.logit_bias is not None: - pooled_data -= self.logit_bias + logits -= self.logit_bias if self.activation is not None: flags = [p.use_activation for p in pooling_params] if len(set(flags)) == 1: - pooled_data = self.activation(pooled_data) if flags[0] else pooled_data + logits = self.activation(logits) if flags[0] else logits else: - pooled_data = [ + logits = [ self.activation(vecs) if f else vecs - for vecs, f in zip(pooled_data, flags) + for vecs, f in zip(logits, flags) ] - # pooled_data shape: [batchsize, num_labels] - return pooled_data + # logits shape: [batchsize, num_labels] 
+ return logits diff --git a/vllm/model_executor/layers/pooler/seqwise/methods.py b/vllm/model_executor/layers/pooler/seqwise/methods.py index 5d8551095096..f3c7f29d6092 100644 --- a/vllm/model_executor/layers/pooler/seqwise/methods.py +++ b/vllm/model_executor/layers/pooler/seqwise/methods.py @@ -17,7 +17,7 @@ class SequencePoolingMethod(nn.Module, ABC): def get_supported_tasks(self) -> Set[PoolingTask]: - return {"token_embed", "token_classify", "embed", "classify", "score"} + return {"token_embed", "token_classify", "embed", "classify"} def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate: return PoolingParamsUpdate() diff --git a/vllm/model_executor/layers/pooler/seqwise/poolers.py b/vllm/model_executor/layers/pooler/seqwise/poolers.py index 8bf3e25e66b6..f46834a7c3f2 100644 --- a/vllm/model_executor/layers/pooler/seqwise/poolers.py +++ b/vllm/model_executor/layers/pooler/seqwise/poolers.py @@ -108,7 +108,7 @@ def pooler_for_classify( *, pooling: SequencePoolingMethod | SequencePoolingFn | None = None, classifier: ClassifierFn | None = None, - act_fn: PoolerActivation | str | None = None, + act_fn: PoolerActivation | None = None, ): if pooling is None: pooling = get_seq_pooling_method(pooler_config.get_seq_pooling_type()) diff --git a/vllm/model_executor/layers/pooler/special.py b/vllm/model_executor/layers/pooler/special.py index 5e0f9ec75597..686072632685 100644 --- a/vllm/model_executor/layers/pooler/special.py +++ b/vllm/model_executor/layers/pooler/special.py @@ -52,13 +52,6 @@ def for_seq_cls( pooler_config, pooling=pooling, classifier=classifier, - act_fn="classify", - ), - "score": pooler_for_classify( - pooler_config, - pooling=pooling, - classifier=classifier, - act_fn="score", ), } ) @@ -115,7 +108,7 @@ def extra_repr(self) -> str: class IdentityPooler(Pooler): def get_supported_tasks(self) -> Set[PoolingTask]: - return {"plugin", "score"} + return {"plugin"} def forward( self, diff --git 
a/vllm/model_executor/layers/pooler/tokwise/heads.py b/vllm/model_executor/layers/pooler/tokwise/heads.py index 4183f5b1ba25..80c5c831fa08 100644 --- a/vllm/model_executor/layers/pooler/tokwise/heads.py +++ b/vllm/model_executor/layers/pooler/tokwise/heads.py @@ -68,22 +68,24 @@ def forward_chunk( if self.head_dtype is not None: pooled_data = pooled_data.to(self.head_dtype) - # pooled_data shape: [n_tokens, hidden_dimension] + # pooled_data shape: [n_tokens, hidden_size] # Apply ST projector if self.projector is not None: - pooled_data = self.projector(pooled_data) - # pooled_data shape: [n_tokens, embedding_dimension] + embeddings = self.projector(pooled_data) + else: + embeddings = pooled_data + # embeddings shape: [n_tokens, embedding_size] # for matryoshka representation - pooled_data = pooled_data[..., : pooling_param.dimensions] + embeddings = embeddings[..., : pooling_param.dimensions] # for normalize if self.activation is not None and pooling_param.use_activation: - pooled_data = self.activation(pooled_data) + embeddings = self.activation(embeddings) - # pooled_data shape: [n_tokens, embedding_dimension] - return pooled_data + # embeddings shape: [n_tokens, embedding_size] + return embeddings class TokenClassifierPoolerHead(TokenPoolerHead): @@ -118,16 +120,16 @@ def forward_chunk( # hidden_states shape: [n_token, hidden_size] if self.classifier is not None: - scores = self.classifier(pooled_data) + logits = self.classifier(pooled_data) else: - scores = pooled_data - # scores shape: [n_token, num_labels] + logits = pooled_data + # logits shape: [n_token, num_labels] if self.logit_bias is not None: - scores -= self.logit_bias + logits -= self.logit_bias if self.activation is not None and pooling_param.use_activation: - scores = self.activation(scores) + logits = self.activation(logits) - # scores shape: [n_token, num_labels] - return scores + # logits shape: [n_token, num_labels] + return logits diff --git 
a/vllm/model_executor/layers/pooler/tokwise/poolers.py b/vllm/model_executor/layers/pooler/tokwise/poolers.py index 996f20d98cc9..c56970fcabaa 100644 --- a/vllm/model_executor/layers/pooler/tokwise/poolers.py +++ b/vllm/model_executor/layers/pooler/tokwise/poolers.py @@ -116,7 +116,7 @@ def pooler_for_token_classify( *, pooling: TokenPoolingMethod | TokenPoolingFn | None = None, classifier: ClassifierFn | None = None, - act_fn: PoolerActivation | str | None = None, + act_fn: PoolerActivation | None = None, ): if pooling is None: pooling = get_tok_pooling_method(pooler_config.get_tok_pooling_type()) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index e08a6456aba7..9aceb3be054d 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -12,7 +12,6 @@ QuantizationMethods = Literal[ "awq", "fp8", - "ptpc_fp8", "fbgemm_fp8", "fp_quant", "modelopt", @@ -39,7 +38,6 @@ DEPRECATED_QUANTIZATION_METHODS = [ "tpu_int8", - "ptpc_fp8", "fbgemm_fp8", "fp_quant", "experts_int8", @@ -132,7 +130,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: from .mxfp4 import Mxfp4Config from .mxfp8 import Mxfp8Config from .petit import PetitNvFp4Config - from .ptpc_fp8 import PTPCFp8Config from .torchao import TorchAOConfig method_to_config: dict[str, type[QuantizationConfig]] = { @@ -150,7 +147,6 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]: "gptq": GPTQConfig, "compressed-tensors": CompressedTensorsConfig, "bitsandbytes": BitsAndBytesConfig, - "ptpc_fp8": PTPCFp8Config, "experts_int8": ExpertsInt8Config, "quark": QuarkConfig, "moe_wna16": MoeWNA16Config, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 29115fbbc255..5e14d1712aec 100644 --- 
a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -45,11 +45,14 @@ make_fp8_moe_quant_config, select_fp8_moe_backend, ) +from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import ( + Mxfp4MoeBackend, + make_mxfp4_moe_kernel, + make_mxfp4_moe_quant_config, +) from vllm.model_executor.layers.fused_moe.oracle.nvfp4 import ( - NvFp4MoeBackend, convert_to_nvfp4_moe_kernel_format, is_global_sf_supported_for_nvfp4_backend, - make_mxfp4_moe_quant_config, make_nvfp4_moe_kernel, make_nvfp4_moe_quant_config, select_nvfp4_moe_backend, @@ -235,7 +238,7 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod): def __init__(self, moe): super().__init__(moe) self.group_size = 32 - self.mxfp4_backend = NvFp4MoeBackend.MARLIN + self.mxfp4_backend = Mxfp4MoeBackend.MARLIN self.experts_cls = MarlinExperts def create_weights( @@ -310,7 +313,9 @@ def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: return make_mxfp4_moe_quant_config( - w13_scale=layer.w13_weight_scale, w2_scale=layer.w2_weight_scale + mxfp4_backend=self.mxfp4_backend, + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, ) def process_weights_after_loading(self, layer: FusedMoE) -> None: @@ -334,10 +339,11 @@ def process_weights_after_loading(self, layer: FusedMoE) -> None: self.moe_quant_config = self.get_fused_moe_quant_config(layer) if self.moe_quant_config is not None: - self.moe_kernel = make_nvfp4_moe_kernel( + self.moe_kernel = make_mxfp4_moe_kernel( moe_quant_config=self.moe_quant_config, moe_config=self.moe, experts_cls=self.experts_cls, + mxfp4_backend=self.mxfp4_backend, shared_experts=layer.shared_experts, routing_tables=layer._maybe_init_expert_routing_tables(), ) diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 
f992d0f86c4e..22077be8a44b 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -1,12 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from enum import Enum import torch -from torch.nn.parameter import Parameter -from vllm import envs -from vllm._aiter_ops import rocm_aiter_ops from vllm.config import get_current_vllm_config from vllm.logger import init_logger from vllm.model_executor.layers.attention import Attention @@ -17,173 +13,31 @@ MoEActivation, ) from vllm.model_executor.layers.fused_moe import modular_kernel as mk -from vllm.model_executor.layers.fused_moe.all2all_utils import ( - maybe_make_prepare_finalize, -) from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, - mxfp4_mxfp8_moe_quant_config, - mxfp4_w4a16_moe_quant_config, - ocp_mx_moe_quant_config, -) -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - BatchedMarlinExperts, - MarlinExperts, ) -from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( - OAITritonExperts, - UnfusedOAITritonExperts, +from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import ( + TRITON_BACKENDS, + Mxfp4MoeBackend, + convert_to_mxfp4_moe_kernel_format, + make_mxfp4_moe_kernel, + make_mxfp4_moe_quant_config, + mxfp4_round_up_hidden_size_and_intermediate_size, + select_mxfp4_moe_backend, ) -from vllm.model_executor.layers.fused_moe.trtllm_moe import TrtLlmGenExperts from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase, ) -from vllm.model_executor.layers.quantization.utils.marlin_utils import ( - get_marlin_input_dtype, -) -from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( - 
prepare_moe_fp4_layer_for_marlin, -) -from vllm.model_executor.layers.quantization.utils.mxfp4_utils import ( - CK_MXFP4_MOE_DIM_ALIGNMENT, - _can_support_mxfp4, - _swizzle_mxfp4, - get_padding_alignment, -) from vllm.model_executor.layers.quantization.utils.quant_utils import is_layer_skipped -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.utils import replace_parameter, set_weight_attrs from vllm.platforms import current_platform -from vllm.utils.flashinfer import has_flashinfer -from vllm.utils.import_utils import has_triton_kernels -from vllm.utils.math_utils import round_up logger = init_logger(__name__) -# enum for mxfp4 backend -class Mxfp4Backend(Enum): - NONE = 0 - - # FlashInfer Backend - SM100_FI_MXFP4_MXFP8_TRTLLM = 1 - SM100_FI_MXFP4_MXFP8_CUTLASS = 2 - SM100_FI_MXFP4_BF16 = 3 - SM90_FI_MXFP4_BF16 = 4 - - # Marlin Backend - MARLIN = 5 - - # Triton Backend - TRITON = 6 - - CK = 7 - - -def get_mxfp4_backend_with_lora() -> Mxfp4Backend: - """ - Not all MXFP4 backends support LoRA. Select backends that are known to - have LoRA support. 
- """ - if not current_platform.is_cuda(): - return Mxfp4Backend.NONE - - # If FlashInfer is not available, try either Marlin or Triton - triton_kernels_supported = ( - has_triton_kernels() - # NOTE: triton_kernels are only confirmed to work on SM90 and SM100 - # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317 - # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498 - and (9, 0) <= current_platform.get_device_capability() < (11, 0) - ) - if envs.VLLM_MXFP4_USE_MARLIN is False and triton_kernels_supported: - logger.info_once("[get_mxfp4_backend_with_lora] Using Triton backend") - return Mxfp4Backend.TRITON - - logger.info_once("[get_mxfp4_backend_with_lora] Using Marlin backend") - return Mxfp4Backend.MARLIN - - -def get_mxfp4_backend(with_lora_support: bool) -> Mxfp4Backend: - # Backend Selection - - if with_lora_support: - return get_mxfp4_backend_with_lora() - - if current_platform.is_cuda(): - if ( - current_platform.is_device_capability(90) - and has_flashinfer() - and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_BF16 - ): - logger.info_once("Using FlashInfer MXFP4 BF16 backend for SM90") - return Mxfp4Backend.SM90_FI_MXFP4_BF16 - elif ( - current_platform.is_device_capability_family(100) - and has_flashinfer() - and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS - ): - logger.info_once("Using FlashInfer MXFP4 MXFP8 CUTLASS backend for SM100") - return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS - elif ( - current_platform.is_device_capability_family(100) - and has_flashinfer() - and envs.VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8 - ): - logger.info_once( - "Using FlashInfer MXFP4 MXFP8 TRTLLM backend for SM100", scope="local" - ) - return Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM - elif current_platform.is_device_capability_family(100) and has_flashinfer(): - logger.info_once( - "Using FlashInfer MXFP4 BF16 backend for SM100, " - "For faster performance on SM100, consider setting " - "VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8=1, though 
this may impact " - "accuracy." - ) - return Mxfp4Backend.SM100_FI_MXFP4_BF16 - elif ( - current_platform.is_device_capability_family(100) - or current_platform.is_device_capability(90) - ) and not has_flashinfer(): - logger.warning_once( - "MXFP4 MoE is enabled on Hopper/Blackwell but FlashInfer " - "is not available. This may result in degraded performance. " - "Please `pip install vllm[flashinfer]` for best results." - ) - - # If FlashInfer is not available, try either Marlin or Triton - triton_kernels_supported = ( - has_triton_kernels() - # NOTE: triton_kernels are only confirmed to work on SM90 and SM100 - # SM110 fails with this error: https://github.com/vllm-project/vllm/issues/29317 - # SM120 needs this fix: https://github.com/triton-lang/triton/pull/8498 - and (9, 0) <= current_platform.get_device_capability() < (11, 0) - ) - if envs.VLLM_MXFP4_USE_MARLIN or not triton_kernels_supported: - logger.info_once("Using Marlin backend") - return Mxfp4Backend.MARLIN - else: - logger.info_once("Using Triton backend") - return Mxfp4Backend.TRITON - elif current_platform.is_xpu(): - logger.info_once("Using xpu backend on XPU") - return Mxfp4Backend.MARLIN - elif current_platform.is_rocm(): - from vllm.platforms.rocm import on_gfx950 - - if rocm_aiter_ops.is_enabled() and on_gfx950(): - logger.info_once("Using CK MXFP4 MoE backend (Aiter ROCm)") - return Mxfp4Backend.CK - elif has_triton_kernels(): - logger.info_once("Using Triton backend") - return Mxfp4Backend.TRITON - - return Mxfp4Backend.NONE - - class Mxfp4Config(QuantizationConfig): def __init__(self, ignored_layers: list[str] | None = None): super().__init__() @@ -219,9 +73,6 @@ def get_quant_method( fused_mapping=self.packed_modules_mapping, ): return UnquantizedLinearMethod() - # TODO: Add support for MXFP4 Linear Method. - # MXFP4 LinearMethod is available in AMD-Quark, refer to that implementation - # if you are interested in enabling MXFP4 here. 
logger.debug_once( "MXFP4 linear layer is not implemented - falling back to " "UnquantizedLinearMethod.", @@ -232,10 +83,8 @@ def get_quant_method( if current_platform.is_xpu(): return XpuMxfp4MoEMethod(layer.moe_config) else: - quant_method = Mxfp4MoEMethod(layer.moe_config) - return quant_method + return Mxfp4MoEMethod(layer.moe_config) elif isinstance(layer, Attention): - # TODO: Add support for MXFP4 Attention. logger.debug_once( "MXFP4 attention layer is not implemented. " "Skipping quantization for this layer.", @@ -254,51 +103,36 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): def __init__(self, moe: FusedMoEConfig): super().__init__(moe) self.weight_dtype = "mxfp4" - self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled) + self.mxfp4_backend, self.experts_cls = select_mxfp4_moe_backend(moe) self.max_capture_size = ( get_current_vllm_config().compilation_config.max_cudagraph_capture_size ) - # CK's pre-compiled MXFP4 MoE GEMM kernel instances have dimension - # alignment requirements. Fall back to Triton when not met. - if ( - self.mxfp4_backend == Mxfp4Backend.CK - and moe.intermediate_size_per_partition % CK_MXFP4_MOE_DIM_ALIGNMENT != 0 - ): - if has_triton_kernels(): - logger.warning_once( - "CK MXFP4 MoE GEMM does not support " - "intermediate_size_per_partition=%d (not a multiple of " - "%d). Falling back to Triton backend.", - moe.intermediate_size_per_partition, - CK_MXFP4_MOE_DIM_ALIGNMENT, - ) - self.mxfp4_backend = Mxfp4Backend.TRITON - else: - raise ValueError( - f"CK MXFP4 MoE GEMM does not support " - f"intermediate_size_per_partition=" - f"{moe.intermediate_size_per_partition} (not a multiple " - f"of {CK_MXFP4_MOE_DIM_ALIGNMENT}) and no Triton " - f"fallback is available. Use a compatible " - f"tensor_parallel_size." - ) - - assert self.mxfp4_backend != Mxfp4Backend.NONE, ( - f"get_mxfp4_backend(with_lora_support={moe.is_lora_enabled}) found" - "no compatible MXFP4 MoE backend (FlashInfer/Marlin/Triton)." 
- "Please check your environment and try again." - ) self._cache_permute_indices: dict[torch.Size, torch.Tensor] = {} - # Initialized in process_weights_after_loading for CUTLASS/SM90 backends self.moe_kernel: mk.FusedMoEKernel | None = None + # Round up dims once based on backend. This mutates the shared + # FusedMoEConfig in-place so that create_weights() and all + # downstream code see the padded dimensions. This must happen + # before create_weights() is called. + self.moe.hidden_dim, self.moe.intermediate_size_per_partition = ( + mxfp4_round_up_hidden_size_and_intermediate_size( + self.mxfp4_backend, + self.moe.hidden_dim, + self.moe.intermediate_size_per_partition, + ) + ) + + # Used for triton kernel precision configs + self.w13_precision_config = None + self.w2_precision_config = None + @property def skip_forward_padding(self) -> bool: # SM100_FI_MXFP4_MXFP8_TRTLLM supports padding with mxfp8 quant # so can skip the padding in the forward before applying the moe method - return self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM + return self.mxfp4_backend == Mxfp4MoeBackend.FLASHINFER_TRTLLM_MXFP4_MXFP8 def create_weights( self, @@ -312,77 +146,14 @@ def create_weights( self.num_experts = num_experts weight_dtype = torch.uint8 scale_dtype = torch.uint8 - - # FIXME (zyongye): ship after torch and safetensors support mxfp4 - # is_torch_mxfp4_available = ( - # hasattr(torch, "float4_e2m1fn_x2") and - # hasattr(torch, "float8_e8m0fnu")) - # if is_torch_mxfp4_available: - # weight_dtype = torch.float4_e2m1fn_x2 - # scale_dtype = torch.float8_e8m0fnu - mxfp4_block = 32 - intermediate_size_per_partition_after_pad = intermediate_size_per_partition - if self.mxfp4_backend == Mxfp4Backend.MARLIN: - # The moe marlin kernel requires that for each linear - # n % 256 == 0 and k % 128 == 0. 
- # In gate_up_proj: - # n = 2 * intermediate_size_per_partition_after_pad - # k = hidden_size - # In down_proj - # n = hidden_size - # k = intermediate_size_per_partition_after_pad - intermediate_size_per_partition_after_pad = round_up( - intermediate_size_per_partition, 128 - ) - if current_platform.is_xpu(): - hidden_size = round_up(hidden_size, 128) - else: - hidden_size = round_up(hidden_size, 256) - - layer.params_dtype = params_dtype - layer.num_experts = num_experts - layer.hidden_size = hidden_size - layer.intermediate_size_per_partition = ( - intermediate_size_per_partition_after_pad - ) - elif ( - self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM - or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 - ): - # pad the intermediate size to be a multiple of 2 * mxfp4_block - # for to hold non-uniform sharded tensor as well as swizzling - # other padding to increase performance - intermediate_size_per_partition_after_pad = round_up( - intermediate_size_per_partition, 256 - ) - hidden_size = round_up(hidden_size, 256) - elif ( - self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS - or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 - ): - intermediate_size_per_partition_after_pad = round_up( - intermediate_size_per_partition, 128 - ) - hidden_size = round_up(hidden_size, 128) - elif current_platform.is_rocm(): - pad_align = get_padding_alignment() - intermediate_size_per_partition_after_pad = round_up( - intermediate_size_per_partition, pad_align - ) - hidden_size = round_up(hidden_size, pad_align) - else: - intermediate_size_per_partition_after_pad = round_up( - intermediate_size_per_partition, 64 - ) - - self.intermediate_size = intermediate_size_per_partition_after_pad - self.hidden_size = hidden_size - self.hidden_pad = extra_weight_attrs.get("hidden_pad", 0) - self.intermediate_pad = ( - intermediate_size_per_partition_after_pad - intermediate_size_per_partition + # Use pre-rounded sizes from config + 
self.intermediate_size = intermediate_size_per_partition_after_pad = ( + self.moe.intermediate_size_per_partition ) + self.hidden_size = hidden_size = self.moe.hidden_dim + # Fused gate_up_proj (column parallel) w13_weight = torch.nn.Parameter( torch.zeros( @@ -408,17 +179,6 @@ def create_weights( layer.register_parameter("w13_weight_scale", w13_weight_scale) set_weight_attrs(w13_weight_scale, extra_weight_attrs) - w13_bias = torch.nn.Parameter( - torch.zeros( - num_experts, - 2 * intermediate_size_per_partition_after_pad, - dtype=torch.bfloat16, - ), - requires_grad=False, - ) - layer.register_parameter("w13_bias", w13_bias) - set_weight_attrs(w13_bias, extra_weight_attrs) - # down_proj (row parallel) w2_weight = torch.nn.Parameter( torch.zeros( @@ -444,604 +204,170 @@ def create_weights( layer.register_parameter("w2_weight_scale", w2_weight_scale) set_weight_attrs(w2_weight_scale, extra_weight_attrs) - w2_bias = torch.nn.Parameter( - torch.zeros( - num_experts, - hidden_size, - dtype=torch.bfloat16, - ), - requires_grad=False, - ) - layer.register_parameter("w2_bias", w2_bias) - set_weight_attrs(w2_bias, extra_weight_attrs) - - def process_weights_after_loading(self, layer): - if self.mxfp4_backend == Mxfp4Backend.MARLIN: - prepare_moe_fp4_layer_for_marlin( - layer, input_dtype=get_marlin_input_dtype() - ) - - self.moe_quant_config = self.get_fused_moe_quant_config(layer) - assert self.moe_quant_config is not None - - prepare_finalize = maybe_make_prepare_finalize( - moe=self.moe, - quant_config=self.moe_quant_config, - routing_tables=layer._maybe_init_expert_routing_tables(), - allow_new_interface=True, - ) - assert prepare_finalize is not None - - self.moe_kernel = mk.FusedMoEKernel( - prepare_finalize, - MarlinExperts( - self.moe, - self.moe_quant_config, + if self.moe.has_bias: + w13_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + 2 * intermediate_size_per_partition_after_pad, + dtype=torch.bfloat16, ), - inplace=not self.moe.disable_inplace, - 
shared_experts=None, - ) - elif ( - self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM - or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 - ): - from flashinfer.fp4_quantization import nvfp4_block_scale_interleave - from flashinfer.fused_moe.core import get_w2_permute_indices_with_cache - - layer.gemm1_alpha = Parameter( - torch.tensor([1.702] * self.num_experts, dtype=torch.float32).cuda(), requires_grad=False, ) - layer.gemm1_beta = Parameter( - torch.tensor([1.0] * self.num_experts, dtype=torch.float32).cuda(), - requires_grad=False, - ) - layer.gemm1_clamp_limit = Parameter( - torch.tensor([7.0] * self.num_experts, dtype=torch.float32).cuda(), - requires_grad=False, - ) - sf_block_size = 32 # mxfp4 block size - - assert ( - layer.w13_weight.dim() == 3 - and layer.w13_weight.shape[0] == self.num_experts - and layer.w13_weight.shape[1] == self.intermediate_size * 2 - and layer.w13_weight.shape[2] == self.hidden_size // 2 - ) - assert ( - layer.w13_weight_scale.dim() == 3 - and layer.w13_weight_scale.shape[0] == self.num_experts - and layer.w13_weight_scale.shape[1] == self.intermediate_size * 2 - and layer.w13_weight_scale.shape[2] == self.hidden_size // sf_block_size - ) - assert ( - layer.w2_weight.dim() == 3 - and layer.w2_weight.shape[0] == self.num_experts - and layer.w2_weight.shape[1] == self.hidden_size - and layer.w2_weight.shape[2] == self.intermediate_size // 2 - ) - assert ( - layer.w2_weight_scale.dim() == 3 - and layer.w2_weight_scale.shape[1] == self.hidden_size - and layer.w2_weight_scale.shape[2] - == self.intermediate_size // sf_block_size - ) - assert ( - layer.w13_bias.dim() == 2 - and layer.w13_bias.shape[0] == self.num_experts - and layer.w13_bias.shape[1] == self.intermediate_size * 2 - ) - assert ( - layer.w2_bias.dim() == 2 - and layer.w2_bias.shape[0] == self.num_experts - and layer.w2_bias.shape[1] == self.hidden_size - ) - - w13_weight_scale = layer.w13_weight_scale.data - w2_weight_scale = 
layer.w2_weight_scale.data - w13_weight = layer.w13_weight.data - w2_weight = layer.w2_weight.data - w13_bias = layer.w13_bias.data.to(torch.float32) - w2_bias = layer.w2_bias.data.to(torch.float32) - - # Swap w1 and w3 as the definition of - # swiglu is different in the trtllm-gen - def swap_every_two_rows(x, axis=-1): - shape = x.shape - if axis < 0: - axis = len(shape) + axis - - # Create a new shape with pairs swapped along specified axis - new_shape = list(shape) - new_shape[axis] = shape[axis] // 2 - new_shape.insert(axis + 1, 2) + layer.register_parameter("w13_bias", w13_bias) + set_weight_attrs(w13_bias, extra_weight_attrs) - # Reshape to expose pairs, swap them, and reshape back - x = x.reshape(*new_shape) - x = x.flip(axis + 1) - new_shape = list(shape) - return x.reshape(*new_shape) - - w13_weight_scale = swap_every_two_rows(w13_weight_scale, -2) - w13_weight = swap_every_two_rows(w13_weight, -2) - w13_bias = swap_every_two_rows(w13_bias, -1) - - # Do not interleave as the checkpoint is already interleaved - - # Shuffle weights and scaling factors for transposed mma output - gemm1_weights_mxfp4_shuffled = [] - gemm1_scales_mxfp4_shuffled = [] - gemm2_weights_mxfp4_shuffled = [] - gemm2_scales_mxfp4_shuffled = [] - gemm1_bias_shuffled = [] - gemm2_bias_shuffled = [] - epilogue_tile_m = 128 # FIXME: this depends on the kernel internals - for i in range(self.num_experts): - # w13 weight shuffling - permute_indices = get_w2_permute_indices_with_cache( - self._cache_permute_indices, - w13_weight[i].view(torch.uint8), - epilogue_tile_m, - ) - gemm1_weights_mxfp4_shuffled.append( - w13_weight[i] - .view(torch.uint8)[permute_indices.to(w13_weight.device)] - .contiguous() - ) - # w13 scale shuffling - permute_sf_indices = get_w2_permute_indices_with_cache( - self._cache_permute_indices, - w13_weight_scale[i].view(torch.uint8), - epilogue_tile_m, - num_elts_per_sf=16, - ) - gemm1_scales_mxfp4_shuffled.append( - nvfp4_block_scale_interleave( - w13_weight_scale[i] - 
.view(torch.uint8)[ - permute_sf_indices.to(w13_weight_scale.device) - ] - .contiguous() - ) - ) - # w13 bias shuffling - permute_bias_indices = get_w2_permute_indices_with_cache( - self._cache_permute_indices, - w13_bias[i].clone().reshape(-1, 1), - epilogue_tile_m, - ) - gemm1_bias_shuffled.append( - w13_bias[i] - .clone() - .reshape(-1, 1)[permute_bias_indices.to(w13_bias.device)] - .contiguous() - ) - # w2 weight shuffling - permute_indices = get_w2_permute_indices_with_cache( - self._cache_permute_indices, - w2_weight[i].view(torch.uint8), - epilogue_tile_m, - ) - gemm2_weights_mxfp4_shuffled.append( - w2_weight[i] - .view(torch.uint8)[permute_indices.to(w2_weight.device)] - .contiguous() - ) - # w2 scale shuffling - permute_sf_indices = get_w2_permute_indices_with_cache( - self._cache_permute_indices, - w2_weight_scale[i].view(torch.uint8), - epilogue_tile_m, - num_elts_per_sf=16, - ) - gemm2_scales_mxfp4_shuffled.append( - nvfp4_block_scale_interleave( - w2_weight_scale[i] - .view(torch.uint8)[ - permute_sf_indices.to(w2_weight_scale.device) - ] - .contiguous() - ) - ) - # w2 bias shuffling - permute_indices = get_w2_permute_indices_with_cache( - self._cache_permute_indices, - w2_bias[i].clone().reshape(-1, 1), - epilogue_tile_m, - ) - gemm2_bias_shuffled.append( - w2_bias[i] - .clone() - .reshape(-1, 1)[permute_indices.to(w2_bias.device)] - .contiguous() - ) - - w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled) - w13_weight_scale = ( - torch.stack(gemm1_scales_mxfp4_shuffled) - .reshape( - self.num_experts, - 2 * self.intermediate_size, - self.hidden_size // sf_block_size, - ) - .view(torch.float8_e4m3fn) - ) - - w2_weight = torch.stack(gemm2_weights_mxfp4_shuffled) - w2_weight_scale = ( - torch.stack(gemm2_scales_mxfp4_shuffled) - .reshape( - self.num_experts, - self.hidden_size, - self.intermediate_size // sf_block_size, - ) - .view(torch.float8_e4m3fn) - ) - - layer.w13_weight = Parameter(w13_weight, requires_grad=False) - layer.w13_weight_scale = 
Parameter(w13_weight_scale, requires_grad=False) - layer.w2_weight = Parameter(w2_weight, requires_grad=False) - layer.w2_weight_scale = Parameter(w2_weight_scale, requires_grad=False) - layer.w13_bias = Parameter( - torch.stack(gemm1_bias_shuffled).reshape(self.num_experts, -1), - requires_grad=False, - ) - layer.w2_bias = Parameter( - torch.stack(gemm2_bias_shuffled).reshape(self.num_experts, -1), + w2_bias = torch.nn.Parameter( + torch.zeros( + num_experts, + hidden_size, + dtype=torch.bfloat16, + ), requires_grad=False, ) - elif ( - self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS - or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 - ): - sf_block_size = 32 # mxfp4 block size + layer.register_parameter("w2_bias", w2_bias) + set_weight_attrs(w2_bias, extra_weight_attrs) - # Common shape assertions - assert ( - layer.w13_weight.dim() == 3 - and layer.w13_weight.shape[0] == self.num_experts - and layer.w13_weight.shape[1] == self.intermediate_size * 2 - and layer.w13_weight.shape[2] == self.hidden_size // 2 - ) - assert ( - layer.w13_weight_scale.dim() == 3 - and layer.w13_weight_scale.shape[0] == self.num_experts - and layer.w13_weight_scale.shape[1] == self.intermediate_size * 2 - and layer.w13_weight_scale.shape[2] == self.hidden_size // sf_block_size - ) - assert ( - layer.w2_weight.dim() == 3 - and layer.w2_weight.shape[0] == self.num_experts - and layer.w2_weight.shape[1] == self.hidden_size - and layer.w2_weight.shape[2] == self.intermediate_size // 2 - ) - assert ( - layer.w2_weight_scale.dim() == 3 - and layer.w2_weight_scale.shape[1] == self.hidden_size - and layer.w2_weight_scale.shape[2] - == self.intermediate_size // sf_block_size - ) + def _setup_kernel( + self, + layer: FusedMoE, + w13: torch.Tensor, + w2: torch.Tensor, + w13_scale: torch.Tensor, + w2_scale: torch.Tensor, + w13_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, + ) -> None: + num_experts = self.num_experts + intermediate_size = 
self.intermediate_size + hidden_size = self.hidden_size + sf_block_size = 32 + + # Shape assertions + assert ( + w13.dim() == 3 + and w13.shape[0] == num_experts + and w13.shape[1] == intermediate_size * 2 + and w13.shape[2] == hidden_size // 2 + ) + assert ( + w13_scale.dim() == 3 + and w13_scale.shape[0] == num_experts + and w13_scale.shape[1] == intermediate_size * 2 + and w13_scale.shape[2] == hidden_size // sf_block_size + ) + assert ( + w2.dim() == 3 + and w2.shape[0] == num_experts + and w2.shape[1] == hidden_size + and w2.shape[2] == intermediate_size // 2 + ) + assert ( + w2_scale.dim() == 3 + and w2_scale.shape[1] == hidden_size + and w2_scale.shape[2] == intermediate_size // sf_block_size + ) + if w13_bias is not None: assert ( - layer.w13_bias.dim() == 2 - and layer.w13_bias.shape[0] == self.num_experts - and layer.w13_bias.shape[1] == self.intermediate_size * 2 + w13_bias.dim() == 2 + and w13_bias.shape[0] == num_experts + and w13_bias.shape[1] == intermediate_size * 2 ) + if w2_bias is not None: assert ( - layer.w2_bias.dim() == 2 - and layer.w2_bias.shape[0] == self.num_experts - and layer.w2_bias.shape[1] == self.hidden_size - ) - - # De-interleave and swap for w13 weight, bias, and scales - w13_w = layer.w13_weight.data - gate_w, up_w = w13_w[:, ::2, :], w13_w[:, 1::2, :] - deinterleaved_w13_w = torch.cat([gate_w, up_w], dim=1) - w1_w, w3_w = torch.chunk(deinterleaved_w13_w, 2, dim=1) - w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1) - - w13_b = layer.w13_bias.data.to(torch.float32) - gate_b, up_b = w13_b[:, ::2], w13_b[:, 1::2] - deinterleaved_w13_b = torch.cat([gate_b, up_b], dim=1) - b1, b3 = torch.chunk(deinterleaved_w13_b, 2, dim=-1) - w13_bias_swapped = torch.cat([b3, b1], dim=-1).to(torch.bfloat16) - - w13_s = layer.w13_weight_scale.data - gate_s, up_s = w13_s[:, ::2, :], w13_s[:, 1::2, :] - deinterleaved_w13_s = torch.cat([gate_s, up_s], dim=1) - s1, s3 = torch.chunk(deinterleaved_w13_s, 2, dim=1) - w13_scale_swapped = torch.cat([s3, 
s1], dim=1) - - if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS: - from flashinfer import block_scale_interleave - - orig_shape = w13_scale_swapped.shape - w13_scale_interleaved = block_scale_interleave( - w13_scale_swapped.view(torch.uint8) - ).reshape(orig_shape) - - w2_s = layer.w2_weight_scale.data - orig_shape = w2_s.shape - w2_scale_interleaved = block_scale_interleave( - w2_s.view(torch.uint8) - ).reshape(orig_shape) - - layer.w13_weight = Parameter(w13_weight_swapped, requires_grad=False) - layer.w13_weight_scale = Parameter( - w13_scale_interleaved, requires_grad=False - ) - layer.w13_bias = Parameter(w13_bias_swapped, requires_grad=False) - layer.w2_weight_scale = Parameter( - w2_scale_interleaved, requires_grad=False - ) - elif self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16: - - def _interleave_mxfp4_cutlass_sm90(w): - w_shape = w.shape - w_interleaved = w.reshape( - w_shape[0], w_shape[1], (w_shape[2] // 4), 4 - ) - w_interleaved = w_interleaved.permute(0, 2, 1, 3) - w_interleaved = w_interleaved.reshape( - w_shape[0], w_shape[2] // 4, w_shape[1] * 4 - ) - return w_interleaved - - w31_scales = w13_scale_swapped.to(torch.uint8).view(torch.uint8) - w31_scales_interleaved = _interleave_mxfp4_cutlass_sm90(w31_scales) - - w2_weight_scale = layer.w2_weight_scale.data - w2_scales = w2_weight_scale.to(torch.uint8).view(torch.uint8) - w2_scales_interleaved = _interleave_mxfp4_cutlass_sm90(w2_scales) - - layer.w13_weight = torch.nn.Parameter( - torch.cat([w3_w, w1_w], dim=1), requires_grad=False - ) - layer.w13_bias = torch.nn.Parameter( - w13_bias_swapped, requires_grad=False - ) - layer.w13_weight_scale = torch.nn.Parameter( - w31_scales_interleaved, requires_grad=False - ) - layer.w2_weight_scale = torch.nn.Parameter( - w2_scales_interleaved, requires_grad=False - ) - - # theses two kernels go through the `flashinfer_cutlass_fused_moe` path - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( - 
FlashInferExperts, + w2_bias.dim() == 2 + and w2_bias.shape[0] == num_experts + and w2_bias.shape[1] == hidden_size + ) + + # Convert weights to kernel format + w13, w2, w13_scale, w2_scale, w13_bias, w2_bias = ( + convert_to_mxfp4_moe_kernel_format( + mxfp4_backend=self.mxfp4_backend, + layer=layer, + w13_weight=w13, + w2_weight=w2, + w13_weight_scale=w13_scale, + w2_weight_scale=w2_scale, + w13_bias=w13_bias, + w2_bias=w2_bias, + _cache_permute_indices=self._cache_permute_indices, ) + ) - self.moe_quant_config = self.get_fused_moe_quant_config(layer) - assert self.moe_quant_config is not None - prepare_finalize = maybe_make_prepare_finalize( - moe=self.moe, - quant_config=self.moe_quant_config, + # For TRITON backends, weights are wrapped tensors from triton_kernels + # that don't support .detach(). Manually assign parameters. + if self.mxfp4_backend not in TRITON_BACKENDS: + replace_parameter(layer, "w13_weight", w13) + replace_parameter(layer, "w2_weight", w2) + replace_parameter(layer, "w13_weight_scale", w13_scale) + replace_parameter(layer, "w2_weight_scale", w2_scale) + else: + layer.w13_weight = w13 + layer.w2_weight = w2 + self.w13_precision_config = w13_scale + self.w2_precision_config = w2_scale + + if w13_bias is not None and w2_bias is not None: + replace_parameter(layer, "w13_bias", w13_bias) + replace_parameter(layer, "w2_bias", w2_bias) + + # Build quant config + self.moe_quant_config = self.get_fused_moe_quant_config(layer) + + # Build kernel (modular or monolithic) + if self.moe_quant_config is not None and self.experts_cls is not None: + self.moe_kernel = make_mxfp4_moe_kernel( + moe_quant_config=self.moe_quant_config, + moe_config=self.moe, + mxfp4_backend=self.mxfp4_backend, + experts_cls=self.experts_cls, routing_tables=layer._maybe_init_expert_routing_tables(), - allow_new_interface=True, - ) - assert prepare_finalize is not None - - self.moe_kernel = mk.FusedMoEKernel( - prepare_finalize, - FlashInferExperts( - moe_config=self.moe, - 
quant_config=self.moe_quant_config, - ), - shared_experts=None, - ) - elif self.mxfp4_backend == Mxfp4Backend.CK: - if layer.w13_bias is not None: - layer.w13_bias.data = layer.w13_bias.data.to(torch.float32) - if layer.w2_bias.data is not None: - layer.w2_bias.data = layer.w2_bias.data.to(torch.float32) - - e, n, k = layer.w13_weight.shape - layer.w13_weight.view(torch.uint8).copy_( - layer.w13_weight.data.view(torch.uint8) - .view(e, n // 2, 2, k) - .permute(0, 2, 1, 3) - .contiguous() - .view(e, n, k) - ) - layer.w13_weight_scale.data = ( - layer.w13_weight_scale.data.view(e, n // 2, 2, -1) - .permute(0, 2, 1, 3) - .contiguous() - .view(e, n, -1) - ) - layer.w13_weight.data = layer.w13_weight.data.view(torch.float4_e2m1fn_x2) - layer.w2_weight.data = layer.w2_weight.data.view(torch.float4_e2m1fn_x2) - - layer.w13_weight.data = rocm_aiter_ops.shuffle_weight_a16w4( - layer.w13_weight, 16, True - ) - shuffled_w13_scale = rocm_aiter_ops.shuffle_scale_a16w4( - layer.w13_weight_scale.view(-1, layer.w13_weight_scale.shape[-1]), - self.num_experts, - True, - ) - - layer.w2_weight.data = rocm_aiter_ops.shuffle_weight_a16w4( - layer.w2_weight, 16, False - ) - shuffled_w2_scale = rocm_aiter_ops.shuffle_scale_a16w4( - layer.w2_weight_scale.view(-1, layer.w2_weight_scale.shape[-1]), - self.num_experts, - False, + shared_experts=layer.shared_experts, ) - layer.w13_bias.data = ( - layer.w13_bias.data.view(-1, n // 2, 2) - .permute(0, 2, 1) - .contiguous() - .view(-1, n) - ) - - layer.w13_weight_scale = torch.nn.Parameter( - shuffled_w13_scale, requires_grad=False - ) - layer.w2_weight_scale = torch.nn.Parameter( - shuffled_w2_scale, requires_grad=False - ) - # replace_parameter(layer, "w13_bias", w13_bias) - # replace_parameter(layer, "w13_weight_scale", w13_weight_scale) - # replace_parameter(layer, "w2_weight_scale", w2_weight_scale) - # replace_parameter(layer, "w13_weight", w13_weight) - # replace_parameter(layer, "w2_weight", w2_weight) - - elif self.mxfp4_backend == 
Mxfp4Backend.TRITON: - from triton_kernels.matmul_ogs import FlexCtx, PrecisionConfig - - w13_bias = layer.w13_bias.to(torch.float32) - w2_bias = layer.w2_bias.to(torch.float32) - - layer.w13_bias = Parameter(w13_bias, requires_grad=False) - layer.w2_bias = Parameter(w2_bias, requires_grad=False) - # Ideally we'd use FusedMoEModularKernel.prepare_finalize object - # (stored in self.fused_experts) to determine if the MoE has a - # batched activation format. As self.fused_experts is not - # initialized at this point, we resort to checking the MoE config - # directly. - is_batched_moe = ( - self.moe.use_deepep_ll_kernels or self.moe.use_nixl_ep_kernels - ) - if is_batched_moe: - num_warps = 4 if envs.VLLM_MOE_DP_CHUNK_SIZE <= 512 else 8 - else: - num_warps = 8 - w13_weight, w13_flex, w13_scale = _swizzle_mxfp4( - layer.w13_weight, layer.w13_weight_scale, num_warps - ) - w2_weight, w2_flex, w2_scale = _swizzle_mxfp4( - layer.w2_weight, layer.w2_weight_scale, num_warps - ) + def process_weights_after_loading(self, layer): + w13 = layer.w13_weight + w2 = layer.w2_weight + w13_scale = layer.w13_weight_scale + w2_scale = layer.w2_weight_scale + w13_bias = getattr(layer, "w13_bias", None) + w2_bias = getattr(layer, "w2_bias", None) - self.w13_precision_config = PrecisionConfig( - weight_scale=w13_scale, flex_ctx=FlexCtx(rhs_data=w13_flex) - ) - self.w2_precision_config = PrecisionConfig( - weight_scale=w2_scale, flex_ctx=FlexCtx(rhs_data=w2_flex) - ) - self.w13_weight = w13_weight - self.w2_weight = w2_weight - del layer.w13_weight - del layer.w2_weight - layer.w13_weight = w13_weight - layer.w2_weight = w2_weight + if self.mxfp4_backend == Mxfp4MoeBackend.NONE: + return - else: - raise ValueError( - f"Unsupported mxfp4_backend: {self.mxfp4_backend}: " - f"should be one of: {list(Mxfp4Backend)}." 
- ) + self._setup_kernel(layer, w13, w2, w13_scale, w2_scale, w13_bias, w2_bias) def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: - if self.mxfp4_backend == Mxfp4Backend.MARLIN: - return mxfp4_w4a16_moe_quant_config( - w1_bias=layer.w13_bias, - w2_bias=layer.w2_bias, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - ) - elif self.mxfp4_backend == Mxfp4Backend.TRITON: + w1_scale = layer.w13_weight_scale + w2_scale = layer.w2_weight_scale + w1_bias = getattr(layer, "w13_bias", None) + w2_bias = getattr(layer, "w2_bias", None) + + if self.mxfp4_backend in TRITON_BACKENDS: + assert self.w13_precision_config is not None + assert self.w2_precision_config is not None w1_scale = self.w13_precision_config w2_scale = self.w2_precision_config - return mxfp4_w4a16_moe_quant_config( - w1_bias=layer.w13_bias, - w2_bias=layer.w2_bias, - w1_scale=w1_scale, - w2_scale=w2_scale, - ) - elif self.mxfp4_backend in [ - Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM, - Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS, - ]: - return mxfp4_mxfp8_moe_quant_config( - w1_bias=layer.w13_bias, - w2_bias=layer.w2_bias, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - ) - elif self.mxfp4_backend in [ - Mxfp4Backend.SM100_FI_MXFP4_BF16, - Mxfp4Backend.SM90_FI_MXFP4_BF16, - Mxfp4Backend.CK, - ]: - return mxfp4_w4a16_moe_quant_config( - w1_bias=layer.w13_bias, - w2_bias=layer.w2_bias, - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - ) - else: - w1_scale = layer.w13_weight_scale - w2_scale = layer.w2_weight_scale - return ocp_mx_moe_quant_config( - quant_dtype="mxfp4", - w1_bias=layer.w13_bias, - w2_bias=layer.w2_bias, - w1_scale=w1_scale, - w2_scale=w2_scale, - ) + + return make_mxfp4_moe_quant_config( + mxfp4_backend=self.mxfp4_backend, + w1_scale=w1_scale, + w2_scale=w2_scale, + w1_bias=w1_bias, + w2_bias=w2_bias, + ) def select_gemm_impl( self, - prepare_finalize: mk.FusedMoEPrepareAndFinalizeModular, 
+ prepare_finalize: mk.FusedMoEPrepareAndFinalize, layer: torch.nn.Module, ) -> mk.FusedMoEExpertsModular: - if ( - prepare_finalize.activation_format - == mk.FusedMoEActivationFormat.BatchedExperts - ): - if self.mxfp4_backend == Mxfp4Backend.MARLIN: - max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() - assert max_num_tokens_per_rank is not None - assert self.moe_quant_config is not None - return BatchedMarlinExperts( - max_num_tokens=max_num_tokens_per_rank, - num_dispatchers=prepare_finalize.num_dispatchers(), - quant_config=self.moe_quant_config, - moe_config=self.moe, - ) - else: - raise NotImplementedError( - f"Incompatible Mxfp4 backend ({self.mxfp4_backend}) for " - "EP batched experts format" - ) - else: - assert self.moe_quant_config is not None - if ( - self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM - or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 - ): - # B200 code-path - kwargs = { - # TODO(bnell): part of quant_config - "max_capture_size": self.max_capture_size, - } - return TrtLlmGenExperts(self.moe, self.moe_quant_config, **kwargs) - elif self.mxfp4_backend == Mxfp4Backend.MARLIN: - return MarlinExperts(self.moe, self.moe_quant_config) - elif self.mxfp4_backend == Mxfp4Backend.TRITON: - if self.moe.is_lora_enabled: - return UnfusedOAITritonExperts(self.moe, self.moe_quant_config) - return OAITritonExperts(self.moe, self.moe_quant_config) - else: - raise NotImplementedError( - f"Incompatible Mxfp4 backend ({self.mxfp4_backend}) for EP" - ) - - @property - def is_monolithic(self) -> bool: - if self.moe.is_lora_enabled: - return False - return ( - self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM - or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 - or self.mxfp4_backend == Mxfp4Backend.TRITON - or self.mxfp4_backend == Mxfp4Backend.CK + raise ValueError( + f"{self.__class__.__name__} uses the new modular kernel " + "initialization logic. This function should not be called." 
) def apply( @@ -1053,30 +379,6 @@ def apply( shared_experts_input: torch.Tensor | None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert not self.is_monolithic - if layer.enable_eplb: - raise NotImplementedError("EPLB is not supported for mxfp4") - - assert _can_support_mxfp4( - layer.use_grouped_topk, - layer.topk_group, - layer.num_expert_group, - layer.expert_map, - layer.custom_routing_function, - layer.e_score_correction_bias, - layer.apply_router_weight_on_input, - layer.scoring_func, - layer.activation, - layer.eplb_state.expert_load_view, - layer.eplb_state.logical_to_physical_map, - layer.eplb_state.logical_replica_count, - ), "MXFP4 are not supported with this configuration." - - assert ( - self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_CUTLASS - or self.mxfp4_backend == Mxfp4Backend.SM90_FI_MXFP4_BF16 - or self.mxfp4_backend == Mxfp4Backend.MARLIN - ) - assert self.moe_kernel is not None return self.moe_kernel.apply( hidden_states=x, @@ -1098,126 +400,17 @@ def apply_monolithic( router_logits: torch.Tensor, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: assert self.is_monolithic - - if layer.enable_eplb: - raise NotImplementedError("EPLB is not supported for mxfp4") - - assert _can_support_mxfp4( - layer.use_grouped_topk, - layer.topk_group, - layer.num_expert_group, - layer.expert_map, - layer.custom_routing_function, - layer.e_score_correction_bias, - layer.apply_router_weight_on_input, - layer.scoring_func, - layer.activation, - layer.eplb_state.expert_load_view, - layer.eplb_state.logical_to_physical_map, - layer.eplb_state.logical_replica_count, - ), "MXFP4 are not supported with this configuration." - - # Apply routing simulation strategy if specified. - # This applies to all monolithic backends (SM100_FI and TRITON). 
- routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY - if routing_strategy == "uniform_random": - router_logits = torch.rand_like(router_logits) - - if ( - self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM - or self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16 - ): - from flashinfer import trtllm_fp4_block_scale_moe - - if self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_BF16: - assert x.dtype == torch.bfloat16 - x_quant = x - x_scale = None - elif self.mxfp4_backend == Mxfp4Backend.SM100_FI_MXFP4_MXFP8_TRTLLM: - from flashinfer import mxfp8_quantize - - # x_quant is padded in hidden dimension with alignment=256 - x_quant, x_scale = mxfp8_quantize( - x, - is_sf_swizzled_layout=False, - alignment=256, - ) - x_scale = x_scale.view(torch.float8_e4m3fn).reshape(*x.shape[:-1], -1) - - # output with original unpadded hidden size - output = torch.empty_like(x) - - trtllm_gen_output = trtllm_fp4_block_scale_moe( - routing_logits=router_logits.to(torch.bfloat16), - routing_bias=None, - hidden_states=x_quant, - hidden_states_scale=x_scale, - gemm1_weights=layer.w13_weight, # uint8 (e2m1 x 2) - gemm1_weights_scale=layer.w13_weight_scale, # uint8 (e4m3 x 2) - gemm1_bias=layer.w13_bias, # fp32 per expert per channel - gemm1_alpha=layer.gemm1_alpha, # fp32 per expert - gemm1_beta=layer.gemm1_beta, # fp32 per expert - gemm1_clamp_limit=layer.gemm1_clamp_limit, # fp32 per expert - gemm2_weights=layer.w2_weight, # uint8 (e2m1 x 2) - gemm2_weights_scale=layer.w2_weight_scale, # ue8m0 - gemm2_bias=layer.w2_bias, # fp32 per expert per channel - output1_scale_scalar=None, - output1_scale_gate_scalar=None, - output2_scale_scalar=None, - num_experts=layer.global_num_experts, - top_k=layer.top_k, - n_group=None, - topk_group=None, - intermediate_size=self.intermediate_size, # padded to multiple of 256 - local_expert_offset=layer.ep_rank * layer.local_num_experts, - local_num_experts=self.num_experts, - routed_scaling_factor=None, - routing_method_type=1 if 
layer.renormalize else 0, - do_finalize=True, - tune_max_num_tokens=max(self.max_capture_size, 1), - output=output, - )[0] - return trtllm_gen_output - elif self.mxfp4_backend == Mxfp4Backend.CK: - topk_weights, topk_ids = rocm_aiter_ops.fused_topk( - x, router_logits, layer.top_k, True - ) - output = rocm_aiter_ops.fused_moe( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights, - topk_ids, - activation_method=rocm_aiter_ops.get_aiter_activation_type("swiglu"), - quant_method=rocm_aiter_ops.get_aiter_quant_type("per_1x32"), - w1_scale=layer.w13_weight_scale, - w2_scale=layer.w2_weight_scale, - doweight_stage1=False, - hidden_pad=self.hidden_pad // 128 * 128, - intermediate_pad=self.intermediate_pad // 64 * 64 * 2, - bias1=layer.w13_bias, - bias2=layer.w2_bias, - ) - return output - elif self.mxfp4_backend == Mxfp4Backend.TRITON: - from vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe import ( # noqa: E501 - triton_kernel_moe_forward, - ) - - return triton_kernel_moe_forward( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - gating_output=router_logits, - topk=layer.top_k, - renormalize=layer.renormalize, - global_num_experts=layer.global_num_experts, - expert_map=layer.expert_map, - quant_config=self.moe_quant_config, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - ) - else: - raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") + assert self.moe_kernel is not None + return self.moe_kernel.apply_monolithic( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + router_logits=router_logits, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + ) class XpuMxfp4MoEMethod(Mxfp4MoEMethod): diff --git a/vllm/model_executor/layers/quantization/ptpc_fp8.py b/vllm/model_executor/layers/quantization/ptpc_fp8.py deleted file mode 100644 index 5d7b7b54adc8..000000000000 --- 
a/vllm/model_executor/layers/quantization/ptpc_fp8.py +++ /dev/null @@ -1,132 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project - -from typing import Any - -import torch -from torch.nn.parameter import Parameter - -from vllm import _custom_ops as ops -from vllm.model_executor.kernels.linear import ( - init_fp8_linear_kernel, -) -from vllm.model_executor.layers.attention import Attention -from vllm.model_executor.layers.linear import LinearBase, UnquantizedLinearMethod -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.model_executor.layers.quantization.base_config import QuantizeMethodBase -from vllm.model_executor.layers.quantization.fp8 import ( - Fp8Config, - Fp8KVCacheMethod, - Fp8LinearMethod, -) -from vllm.model_executor.layers.quantization.utils.quant_utils import ( - is_layer_skipped, - kFp8DynamicTokenSym, -) -from vllm.platforms import current_platform - - -class PTPCFp8Config(Fp8Config): - """Config class for Per-Token-Per-Channel Dynamic Quantization Fp8.""" - - def __init__( - self, - activation_scheme: str = "dynamic", - ignored_layers: list[str] | None = None, - ) -> None: - if not current_platform.is_rocm(): - raise ValueError("ptpc_fp8 quantization is supported only on ROCm.") - - if not current_platform.has_device_capability(94): - raise ValueError( - "ptpc_fp8 quantization is supported only on AMD Instinct MI300 GPUs and newer." 
# noqa: E501 - ) - if activation_scheme == "static": - raise ValueError("ptpc_fp8 as of now only support dynamic quantization.") - - super().__init__( - is_checkpoint_fp8_serialized=False, - activation_scheme=activation_scheme, - ignored_layers=ignored_layers, - ) - - @classmethod - def get_name(cls) -> QuantizationMethods: - return "ptpc_fp8" - - @classmethod - def from_config(cls, config: dict[str, Any]) -> "PTPCFp8Config": - activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) - ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) - return cls(activation_scheme=activation_scheme, ignored_layers=ignored_layers) - - def get_quant_method( - self, layer: torch.nn.Module, prefix: str - ) -> "QuantizeMethodBase | None": - if isinstance(layer, LinearBase): - if is_layer_skipped(prefix, self.ignored_layers): - return UnquantizedLinearMethod() - return PTPCFp8LinearMethod(self) - elif isinstance(layer, Attention): - return Fp8KVCacheMethod(self) - return None - - -class PTPCFp8LinearMethod(Fp8LinearMethod): - """Linear method for Per-Token and Per-Channel FP8 Quantization. - Only supports loading quantized BF16 model checkpoints with dynamic - activation scaling. To load FP16 model checkpoints, user must specify - to convert the FP16 model weight loading into BF16. - The weight scaling factor will be initialized after - the model weights are loaded. - - Limitations: - 1. Only support float8_e4m3fnuz data type due to the limitation of - torch._scaled_mm (https://github.com/ROCm/pytorch/blob/8c0504d7f3fb0ee4c278c096a5c3caedb01129fa/aten/src/ATen/native/cuda/Blas.cpp#L1041) - - Args: - quant_config: The quantization config. - """ - - def __init__(self, quant_config: PTPCFp8Config): - assert current_platform.is_rocm(), ( - "PTPCFp8LinearMethod is only supported on ROCm." 
- ) - super().__init__(quant_config=quant_config) - # Force weight quantization - self.fp8_linear = init_fp8_linear_kernel( - activation_quant_key=kFp8DynamicTokenSym, - weight_quant_key=kFp8DynamicTokenSym, - out_dtype=torch.get_default_dtype(), - module_name=self.__class__.__name__, - ) - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - assert layer.weight.data.dtype not in (torch.float16, torch.float32), ( - "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only support " - f"output dtype of bfloat16. {layer.weight.data.dtype} is specified." - ) - - if layer.weight.data.dtype == torch.bfloat16: - # Quantize the weights. - qweight, weight_scale = ops.scaled_fp8_quant( - layer.weight, scale=None, use_per_token_if_dynamic=True - ) - - # Update the layer with the new values. - layer.weight = Parameter( - qweight.t(), requires_grad=False - ) # Pretranspose the weight - layer.weight_scale = Parameter(weight_scale, requires_grad=False) - else: - assert layer.weight.data.dtype == current_platform.fp8_dtype() - assert getattr(layer, "weight_scale", None) is not None - layer.input_scale = None - - def apply( - self, - layer: torch.nn.Module, - x: torch.Tensor, - bias: torch.Tensor | None = None, - ) -> torch.Tensor: - return self.fp8_linear.apply_weights(layer, x, bias) diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 0a5db4e71fdb..b2b77e6688c1 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -25,9 +25,9 @@ ocp_mx_moe_quant_config, ) from vllm.model_executor.layers.fused_moe.fused_marlin_moe import fused_marlin_moe -from vllm.model_executor.layers.quantization.mxfp4 import ( - Mxfp4Backend, - get_mxfp4_backend, +from vllm.model_executor.layers.fused_moe.oracle.mxfp4 import ( + Mxfp4MoeBackend, + select_mxfp4_moe_backend, ) from 
vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( prepare_fp8_moe_layer_for_marlin, @@ -92,7 +92,8 @@ def get_moe_method( rocm_aiter_ops.is_fused_moe_enabled() ) if ( - input_config.get("dtype") == "fp8_e4m3" + input_config is not None + and input_config.get("dtype") == "fp8_e4m3" and not input_config.get("is_dynamic") and not emulate ): @@ -698,9 +699,9 @@ def __init__( f"Please check that the combination is supported in OCP_MX_Scheme." ) - self.mxfp4_backend: Mxfp4Backend | None = None + self.mxfp4_backend: Mxfp4MoeBackend | None = None if self.ocp_mx_scheme == "w_mxfp4": - self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled) + self.mxfp4_backend, _ = select_mxfp4_moe_backend(moe) if self.input_quant is not None: self.static_input_scales = not self.input_quant.get("is_dynamic") diff --git a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py index 6917bb6f2deb..1b30f5b82c6a 100644 --- a/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py +++ b/vllm/model_executor/layers/quantization/quark/schemes/quark_ocp_mx.py @@ -176,7 +176,7 @@ class QuarkOCP_MX(QuarkScheme): def __init__( self, weight_quant_spec: dict[str, Any], - input_quant_spec: dict[str, Any], + input_quant_spec: dict[str, Any] | None, dynamic_mxfp4_quant: bool = False, ): self.out_dtype = torch.get_default_dtype() @@ -185,7 +185,13 @@ def __init__( self.input_quant_spec = input_quant_spec self.dynamic_mxfp4_quant = dynamic_mxfp4_quant self.weight_dtype = weight_quant_spec["dtype"].replace("fp", "mxfp") - self.input_dtype = input_quant_spec["dtype"].replace("fp", "mxfp") + self.input_dtype: str | None = None + if input_quant_spec is not None: + input_quant = input_quant_spec["dtype"] + if input_quant == "fp8_e4m3": + self.input_dtype = "fp8" + else: + self.input_dtype = input_quant.replace("fp", "mxfp") self.ocp_mx_scheme = OCP_MX_Scheme.from_quant_dtype( 
self.input_dtype, self.weight_dtype @@ -200,14 +206,21 @@ def __init__( dequant_mxfp6, quant_dtype=self.weight_dtype.replace("mx", "") ) - if self.input_dtype == "mxfp4": + if self.input_dtype is None: + self.quant_dequant_func: Callable[[torch.Tensor], torch.Tensor] = ( + lambda x: x + ) # no input Q/DQ for weight-only + elif self.input_dtype == "mxfp4": self.quant_dequant_func = quant_dequant_mxfp4 else: self.quant_dequant_func = partial( quant_dequant_mxfp6, quant_dtype=self.input_dtype.replace("mx", "") ) - self.static_input_scales = not input_quant_spec.get("is_dynamic") + if input_quant_spec is None: + self.static_input_scales = False + else: + self.static_input_scales = not input_quant_spec.get("is_dynamic") if self.static_input_scales: raise NotImplementedError( diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py index d6b32c4bbef2..9bc58d2f302d 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp4.py @@ -389,9 +389,9 @@ def prepare_moe_fp4_layer_for_marlin( group_size = 16 if is_nvfp4 else 32 - e = layer.num_experts - k = layer.hidden_size - n = layer.intermediate_size_per_partition + e = layer.moe_config.num_experts + k = layer.moe_config.hidden_dim + n = layer.moe_config.intermediate_size_per_partition # WORKSPACE device = layer.w13_weight.device @@ -500,6 +500,120 @@ def prepare_moe_fp4_layer_for_marlin( setattr(layer, name, bias) +def prepare_moe_mxfp4_layer_for_marlin( + layer: torch.nn.Module, + w13: torch.Tensor, + w2: torch.Tensor, + w13_scale: torch.Tensor, + w2_scale: torch.Tensor, + w13_bias: torch.Tensor | None, + w2_bias: torch.Tensor | None, +) -> tuple[ + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor, + torch.Tensor | None, + torch.Tensor | None, +]: + """Pure-function version of prepare_moe_fp4_layer_for_marlin for MXFP4. 
+ + Takes weight tensors as inputs and returns transformed tensors. + Does NOT modify the layer in-place. + """ + input_dtype = get_marlin_input_dtype() + if ( + input_dtype is not None + and input_dtype.itemsize == 1 + and input_dtype != torch.float8_e4m3fn + ): + raise RuntimeError("MXFP4 weight + INT8 activation is not supported.") + + group_size = 32 # MXFP4 block size + + # Derive dimensions from actual weight shapes to handle rounded/padded + # sizes correctly (e.g., Mxfp4MoEMethod rounds up hidden_dim). + # w13 shape: (E, 2*N, K//2) + e = w13.shape[0] + n = w13.shape[1] // 2 # intermediate_size_per_partition + k = w13.shape[2] * 2 # hidden_size + + device = w13.device + param_dtype = layer.params_dtype + is_a_8bit = input_dtype is not None and input_dtype.itemsize == 1 + perm = torch.empty(0, dtype=torch.int, device=device) + + # WEIGHT: Repack weights to marlin format + def repack_weight(weight: torch.Tensor, name: str) -> torch.Tensor: + tensor_list = [] + if "w13" in name: + size_n, size_k = n * 2, k + else: + size_n, size_k = k, n + + assert weight.shape == (e, size_n, size_k // 2) + + for i in range(e): + qweight = weight[i].view(torch.int32).T.contiguous() + marlin_qweight = ops.gptq_marlin_repack( + b_q_weight=qweight, + perm=perm, + size_k=size_k, + size_n=size_n, + num_bits=4, + is_a_8bit=is_a_8bit, + ) + tensor_list.append(marlin_qweight) + return torch.cat([x.unsqueeze(0) for x in tensor_list], 0) + + w13 = repack_weight(w13, "w13") + w2 = repack_weight(w2, "w2") + + # WEIGHT SCALES: Permute scales + def permute_scales(scales: torch.Tensor, name: str) -> torch.Tensor: + scales = scales.view(torch.float8_e8m0fnu) + scales = scales.to(param_dtype) + + tensor_list = [] + if "w13" in name: + size_n, size_k = n * 2, k + else: + size_n, size_k = k, n + + for i in range(e): + scale = scales[i].T + marlin_scales = marlin_permute_scales( + s=scale, + size_k=size_k, + size_n=size_n, + group_size=group_size, + is_a_8bit=is_a_8bit, + ) + marlin_scales = 
mxfp4_marlin_process_scales( + marlin_scales, input_dtype=input_dtype + ) + tensor_list.append(marlin_scales) + return torch.cat([x.unsqueeze(0) for x in tensor_list], 0) + + w13_scale = permute_scales(w13_scale, "w13") + w2_scale = permute_scales(w2_scale, "w2") + + # BIAS: Permute bias + def permute_bias(bias: torch.Tensor | None) -> torch.Tensor | None: + if bias is None: + return None + bias = bias.to(param_dtype) + tensor_list = [] + for i in range(e): + tensor_list.append(marlin_permute_bias(bias[i])) + return torch.cat([x.unsqueeze(0) for x in tensor_list], 0) + + w13_bias = permute_bias(w13_bias) + w2_bias = permute_bias(w2_bias) + + return w13, w2, w13_scale, w2_scale, w13_bias, w2_bias + + def rand_marlin_weight_nvfp4_like(weight, group_size, input_dtype=None): is_a_8bit = input_dtype is not None and input_dtype.itemsize == 1 diff --git a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py index 23d7cf55474a..49ddc8accc29 100644 --- a/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py +++ b/vllm/model_executor/layers/quantization/utils/mxfp4_utils.py @@ -1,12 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any import torch from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.activation import MoEActivation from vllm.platforms import current_platform from vllm.triton_utils import triton from vllm.utils.import_utils import has_triton_kernels @@ -22,7 +20,7 @@ CK_MXFP4_MOE_DIM_ALIGNMENT = 256 -def _swizzle_mxfp4(quant_tensor, scale, num_warps): +def _swizzle_mxfp4(quant_tensor, scale, num_warps=8): """weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel""" assert has_triton_kernels() import triton_kernels.matmul_ogs_details.opt_flags as opt_flags @@ -87,35 +85,6 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps): return 
quant_tensor, InFlexData(), scale -def _can_support_mxfp4( - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - scoring_func: str = "softmax", - activation: MoEActivation = MoEActivation.SWIGLUOAI, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, -): - return not ( - use_grouped_topk - or topk_group - or num_expert_group - or custom_routing_function - or e_score_correction_bias - or apply_router_weight_on_input - or scoring_func != "softmax" - or activation != MoEActivation.SWIGLUOAI - or expert_load_view - or logical_to_physical_map - or logical_replica_count - ) - - def get_padding_alignment(): return ( 256 diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 757d1ecc5284..4918c83bdc39 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -122,7 +122,7 @@ def use_aiter_triton_gemm(n, m, k, dtype): def rocm_unquantized_gemm_impl( x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor | None = None ) -> torch.Tensor: - from vllm.platforms.rocm import on_gfx9, on_gfx950 + from vllm.platforms.rocm import on_gfx1x, on_gfx9, on_gfx950 n = x.numel() // x.size(-1) m = weight.shape[0] @@ -169,12 +169,12 @@ def rocm_unquantized_gemm_impl( use_skinny = ( envs.VLLM_ROCM_USE_SKINNY_GEMM - and on_gfx9() + and (on_gfx9() or on_gfx1x()) and x.dtype in [torch.float16, torch.bfloat16] and k % 8 == 0 ) - if use_skinny is not True: + if not use_skinny: return torch.nn.functional.linear(x, weight, bias) x_view = x.reshape(-1, x.size(-1)) diff --git a/vllm/model_executor/models/colbert.py b/vllm/model_executor/models/colbert.py index 
66def505f1f7..7b6889899762 100644 --- a/vllm/model_executor/models/colbert.py +++ b/vllm/model_executor/models/colbert.py @@ -27,8 +27,9 @@ from vllm.model_executor.layers.pooler.tokwise import pooler_for_token_embed from .bert import BertEmbeddingModel, BertModel -from .interfaces import SupportsLateInteraction +from .interfaces import HasInnerState, IsHybrid, SupportsLateInteraction from .interfaces_base import default_pooling_type +from .lfm2 import Lfm2ForCausalLM, Lfm2Model class ColBERTMixin(nn.Module, SupportsLateInteraction): @@ -414,3 +415,98 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loaded.update(colbert_loaded) return loaded + + +# ----------------------------------------------------------------------- +# Concrete model: ColBERT + LFM2 backbone +# ----------------------------------------------------------------------- + + +@default_pooling_type(seq_pooling_type="CLS", tok_pooling_type="ALL") +class ColBERTLfm2Model(ColBERTMixin, nn.Module, HasInnerState, IsHybrid): + """ColBERT late interaction model with LFM2 backbone. + + For ``LiquidAI/LFM2-ColBERT-350M`` and similar models. + + The projection is auto-loaded from sentence-transformers ``1_Dense/`` + when not present in the main checkpoint. + """ + + is_pooling_model = True + # LFM2 is a hybrid model (attention + SSM layers); these flags ensure + # HybridAttentionMambaModelConfig.verify_and_update_config runs so that + # mamba_block_size and related cache settings are correctly initialised. 
+ is_hybrid = True + has_inner_state = True + + @classmethod + def get_mamba_state_shape_from_config(cls, vllm_config: VllmConfig): + return Lfm2ForCausalLM.get_mamba_state_shape_from_config(vllm_config) + + @classmethod + def get_mamba_state_dtype_from_config(cls, vllm_config: VllmConfig): + return Lfm2ForCausalLM.get_mamba_state_dtype_from_config(vllm_config) + + @classmethod + def get_mamba_state_copy_func(cls): + return Lfm2ForCausalLM.get_mamba_state_copy_func() + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + + colbert_dim = self.get_colbert_dim_from_config(config) + self._init_colbert_components( + hidden_size=config.hidden_size, + colbert_dim=colbert_dim, + head_dtype=vllm_config.model_config.head_dtype, + ) + + self.model = Lfm2Model( + vllm_config=vllm_config, + prefix=prefix, + ) + + pooler_config = vllm_config.model_config.pooler_config + assert pooler_config is not None + self.pooler = self._build_colbert_pooler(pooler_config) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors=None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor: + return self.model( + input_ids=input_ids, + positions=positions, + inputs_embeds=inputs_embeds, + intermediate_tensors=intermediate_tensors, + ) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + other_weights, colbert_loaded = self._load_colbert_weights(weights) + + # Strip "model." 
prefix added by the embedding adapter + model_weights = [ + (n[len("model.") :] if n.startswith("model.") else n, w) + for n, w in other_weights + ] + loaded_model = self.model.load_weights(model_weights) + loaded = {f"model.{name}" for name in loaded_model} | colbert_loaded + + # When the ST projector was auto-loaded during init + # (not from the main checkpoint), mark its params as loaded + # so the weight validator doesn't complain. + if hasattr(self.pooler, "head"): + head = self.pooler.head + projector = getattr(head, "projector", None) + if projector is not None and isinstance(projector, nn.Module): + for name, _ in projector.named_parameters(): + loaded.add(f"pooler.head.projector.{name}") + + return loaded diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 488cfa35c14f..a5644a414aee 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -113,8 +113,24 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: Args: vllm_config: vLLM Config """ + cache_config = vllm_config.cache_config + + # Disable calculate_kv_scales for hybrid models: uninitialized + # recurrent state corrupts scales during the calibration pass. + # See issue: https://github.com/vllm-project/vllm/issues/37554 + if cache_config.calculate_kv_scales: + logger.warning( + "Disabling calculate_kv_scales for hybrid model '%s'. " + "Hybrid models with recurrent layers (GDN, Mamba, SSM) " + "produce unreliable KV cache scales during the " + "calibration pass because recurrent state is " + "uninitialized. 
Using default scale of 1.0 instead.", + vllm_config.model_config.model, + ) + cache_config.calculate_kv_scales = False + # Save the user input before it gets modified by MambaModelConfig - mamba_block_size = vllm_config.cache_config.mamba_block_size + mamba_block_size = cache_config.mamba_block_size # Enable FULL_AND_PIECEWISE by default MambaModelConfig.verify_and_update_config(vllm_config) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index 55c42e5fa57e..0c182a891cd3 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -194,18 +194,18 @@ class VllmModelForPooling(VllmModel[T_co], Protocol[T_co]): [vllm.config.model.ModelConfig.score_type][] to use by default. - Score API handles score/rerank for: - - "score" task (score_type: cross-encoder models) - - "embed" task (score_type: bi-encoder models) - - "token_embed" task (score_type: late interaction models) + Scoring API handles score/rerank for:\n + - "classify" task (score_type: cross-encoder models)\n + - "embed" task (score_type: bi-encoder models)\n + - "token_embed" task (score_type: late interaction models)\n - score_type defaults to bi-encoder, then the Score API uses the "embed" task. + score_type defaults to bi-encoder, then the Score API uses the "embed" task.\n If you set score_type to cross-encoder via [vllm.model_executor.models.interfaces.SupportsCrossEncoding][], - then the Score API uses the "score" task. + then the Score API uses the "score" task.\n If you set score_type to late-interaction via [vllm.model_executor.models.interfaces.SupportsLateInteraction][], - then the Score API uses the "token_embed" task. 
+ then the Score API uses the "token_embed" task.\n """ pooler: Pooler diff --git a/vllm/model_executor/models/isaac.py b/vllm/model_executor/models/isaac.py index 8e03e29a77c3..e29646182137 100644 --- a/vllm/model_executor/models/isaac.py +++ b/vllm/model_executor/models/isaac.py @@ -334,15 +334,14 @@ def get_hf_config(self) -> IsaacConfig: return IsaacConfig() def get_image_processor(self, **kwargs) -> IsaacImageProcessor: - return IsaacImageProcessor(kwargs) + return IsaacImageProcessor(**kwargs) def get_hf_processor(self, **kwargs) -> IsaacProcessor: hf_config = self.get_hf_config() - return self.ctx.init_processor( - IsaacProcessor, + return IsaacProcessor( tokenizer=self.get_tokenizer(), - image_processor=self.get_image_processor(), + image_processor=self.get_image_processor(**kwargs), image_token=hf_config.vision_token, ) diff --git a/vllm/model_executor/models/kimi_k25.py b/vllm/model_executor/models/kimi_k25.py index 4b2b6a4b60a2..10d21aab0cf8 100644 --- a/vllm/model_executor/models/kimi_k25.py +++ b/vllm/model_executor/models/kimi_k25.py @@ -104,19 +104,25 @@ class KimiK25ProcessingInfo(BaseProcessingInfo): def __init__(self, ctx: InputProcessingContext) -> None: super().__init__(ctx) - self.hf_config = self.get_hf_config() - self.media_token_id = self.hf_config.media_placeholder_token_id - media_processor = cached_get_image_processor( + + self.hf_config = hf_config = self.get_hf_config() + + tokenizer = self.get_tokenizer() + image_processor = cached_get_image_processor( self.ctx.model_config.model, trust_remote_code=self.ctx.model_config.trust_remote_code, ) - self.media_processor = media_processor + + self.media_token_id = media_token_id = hf_config.media_placeholder_token_id + self.media_token = tokenizer.decode(media_token_id) + + self.image_processor = image_processor self.hf_processor = KimiK25Processor( - media_processor=self.media_processor, - tokenizer=self.get_tokenizer(), - media_token_id=self.media_token_id, + tokenizer=tokenizer, + 
image_processor=image_processor, + media_token_id=media_token_id, ) - self.media_tokens_calculator = self.media_processor.media_tokens_calculator + self.media_tokens_calculator = image_processor.media_tokens_calculator def get_hf_processor(self): return self.hf_processor @@ -132,20 +138,15 @@ def get_supported_mm_limits(self) -> Mapping[str, int | None]: class KimiK25DummyInputsBuilder(BaseDummyInputsBuilder[KimiK25ProcessingInfo]): """Builds dummy inputs for Kimi-K2.5 model profiling.""" - def __init__(self, info: KimiK25ProcessingInfo) -> None: - super().__init__(info) - self.media_token_id = self.info.media_token_id - self.frame_per_chunk = self.info.media_processor.num_frames_per_chunk - def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: num_media = mm_counts.get("vision_chunk", 0) - return "<|media_pad|>" * num_media + return self.info.media_token * num_media def get_dummy_mm_items(self): dummy_videos = self._get_dummy_images( height=MaxImageTokenMeta.height, width=MaxImageTokenMeta.width, - num_images=self.frame_per_chunk, + num_images=self.info.image_processor.num_frames_per_chunk, ) video_chunk_dummy_item = VisionChunkVideo( @@ -236,9 +237,6 @@ def get_replacement(item_idx: int): ), ] - def split_video_chunks(self, video): - return self.info.media_processor.split_video_chunks(video) - @MULTIMODAL_REGISTRY.register_processor( KimiK25MultiModalProcessor, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 5ff9c5f04b5e..1741e18fdda6 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -12,6 +12,7 @@ import warnings from collections.abc import Iterable, Mapping, Sequence from functools import cached_property +from io import BytesIO from typing import Annotated, Literal, TypeAlias import torch @@ -53,7 +54,7 @@ MultiModalKwargsItems, VideoItem, ) -from vllm.multimodal.media.audio import extract_audio_from_video_bytes +from 
vllm.multimodal.media.audio import load_audio_pyav from vllm.multimodal.parse import ( AudioProcessorItems, ImageEmbeddingItems, @@ -553,7 +554,7 @@ def _extract_audio_from_videos( "video must be loaded with keep_video_bytes=True (e.g. via " "the chat API with a model that sets use_audio_in_video)." ) - audio_items.append(extract_audio_from_video_bytes(video_bytes)) + audio_items.append(load_audio_pyav(BytesIO(video_bytes))) # Create a new VideoProcessorItems with metadata that does not contain # the large video bytes, to avoid modifying the input `mm_items`. diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 8b1455359f57..eaf5843a3516 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -66,9 +66,11 @@ from .interfaces import ( MultiModalEmbeddings, + SupportsEagle3, SupportsLoRA, SupportsMultiModal, SupportsPP, + supports_eagle3, ) from .module_mapping import MultiModelKeys from .utils import StageMissingLayer, init_vllm_registered_model, maybe_prefix @@ -262,7 +264,7 @@ def _cached_apply_hf_processor( dummy_inputs=PixtralDummyInputsBuilder, ) class PixtralForConditionalGeneration( - nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP + nn.Module, SupportsLoRA, SupportsEagle3, SupportsMultiModal, SupportsPP ): @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: @@ -390,6 +392,21 @@ def compute_logits( ) -> torch.Tensor | None: return self.language_model.compute_logits(hidden_states) + def _require_language_model_eagle3(self) -> None: + if not supports_eagle3(self.language_model): + raise RuntimeError( + f"EAGLE-3 speculative decoding requires the language model to " + f"support EAGLE-3, but {type(self.language_model).__name__} does not." 
+ ) + + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self._require_language_model_eagle3() + self.language_model.set_aux_hidden_state_layers(layers) + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + self._require_language_model_eagle3() + return self.language_model.get_eagle3_aux_hidden_state_layers() + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): def is_vision_encoder_weights(weight: tuple[str, torch.Tensor]): return weight[0].startswith(("vision_encoder", "vision_tower")) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 9b1e52722923..c3e7edb7da4a 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -269,6 +269,7 @@ "HF_ColBERT": ("colbert", "ColBERTModel"), "ColBERTModernBertModel": ("colbert", "ColBERTModernBertModel"), "ColBERTJinaRobertaModel": ("colbert", "ColBERTJinaRobertaModel"), + "ColBERTLfm2Model": ("colbert", "ColBERTLfm2Model"), # [Multimodal] "ColModernVBertForRetrieval": ("colmodernvbert", "ColModernVBertForRetrieval"), "ColPaliForRetrieval": ("colpali", "ColPaliModel"), diff --git a/vllm/model_executor/models/whisper_causal.py b/vllm/model_executor/models/whisper_causal.py index 6774ea11dd0c..8e4322ea335d 100644 --- a/vllm/model_executor/models/whisper_causal.py +++ b/vllm/model_executor/models/whisper_causal.py @@ -150,8 +150,10 @@ def build( new_common_attn_metadata.query_start_loc *= block_pool_size new_common_attn_metadata.query_start_loc_cpu *= block_pool_size new_common_attn_metadata.seq_lens *= block_pool_size - new_common_attn_metadata._seq_lens_cpu *= block_pool_size - new_common_attn_metadata._num_computed_tokens_cpu *= block_pool_size + if new_common_attn_metadata._seq_lens_cpu is not None: + new_common_attn_metadata._seq_lens_cpu *= block_pool_size + if new_common_attn_metadata._num_computed_tokens_cpu is not None: + new_common_attn_metadata._num_computed_tokens_cpu *= 
block_pool_size new_common_attn_metadata.num_actual_tokens *= block_pool_size new_common_attn_metadata.max_query_len *= block_pool_size new_common_attn_metadata.max_seq_len *= block_pool_size diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 28f066d112ed..0a748a6d15c6 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -12,17 +12,35 @@ from vllm.utils.import_utils import PlaceholderModule try: - import librosa + import av as av except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] + av = PlaceholderModule("av") # type: ignore[assignment] +try: + import resampy +except ImportError: + resampy = PlaceholderModule("resampy") # type: ignore[assignment] try: import scipy.signal as scipy_signal except ImportError: scipy_signal = PlaceholderModule("scipy").placeholder_attr("signal") # type: ignore[assignment] + # ============================================================ +# Aligned with `librosa.get_duration` function +def get_audio_duration(*, y: npt.NDArray[np.floating], sr: float = 22050) -> float: + """Get the duration of an audio array in seconds. + + Args: + y: Audio time series. Can be 1D (samples,) or 2D (channels, samples). + sr: Sample rate of the audio in Hz. + + Returns: + Duration of the audio in seconds. + """ + n_samples = y.shape[-1] + return float(n_samples) / sr class ChannelReduction(str, Enum): @@ -153,13 +171,71 @@ def normalize_audio( # ============================================================ -def resample_audio_librosa( +def resample_audio_pyav( audio: npt.NDArray[np.floating], *, orig_sr: float, target_sr: float, ) -> npt.NDArray[np.floating]: - return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) + """Resample audio using PyAV (libswresample via FFmpeg). + + Args: + audio: Input audio. Can be: + - 1D array ``(samples,)``: mono audio + - 2D array ``(channels, samples)``: stereo audio + orig_sr: Original sample rate in Hz. 
+ target_sr: Target sample rate in Hz. + + Returns: + Resampled audio with the same shape as the input (1D → 1D, 2D → 2D). + """ + orig_sr_int = int(round(orig_sr)) + target_sr_int = int(round(target_sr)) + + if orig_sr_int == target_sr_int: + return audio + + if audio.ndim == 2: + # Resample each channel independently and re-stack. + return np.stack( + [ + resample_audio_pyav(ch, orig_sr=orig_sr, target_sr=target_sr) + for ch in audio + ], + axis=0, + ) + + expected_len = int(math.ceil(audio.shape[-1] * target_sr_int / orig_sr_int)) + + # from_ndarray expects shape (channels, samples) for planar formats. + # libswresample requires a minimum number of input samples to produce + # output frames; pad short inputs with zeros so we always get output, + # then trim to the expected output length. + _MIN_SAMPLES = 1024 + audio_f32 = np.asarray(audio, dtype=np.float32) + if len(audio_f32) < _MIN_SAMPLES: + audio_f32 = np.pad(audio_f32, (0, _MIN_SAMPLES - len(audio_f32))) + audio_f32 = audio_f32.reshape(1, -1) + + resampler = av.AudioResampler(format="fltp", layout="mono", rate=target_sr_int) + + frame = av.AudioFrame.from_ndarray(audio_f32, format="fltp", layout="mono") + frame.sample_rate = orig_sr_int + + out_frames = resampler.resample(frame) + out_frames.extend(resampler.resample(None)) # flush buffered samples + + result = np.concatenate([f.to_ndarray() for f in out_frames], axis=1).squeeze(0) + return result[:expected_len] + + +def resample_audio_resampy( + audio: npt.NDArray[np.floating], + *, + orig_sr: float, + target_sr: float, +) -> npt.NDArray[np.floating]: + return resampy.resample(audio, sr_orig=orig_sr, sr_new=target_sr) def resample_audio_scipy( @@ -167,7 +243,7 @@ def resample_audio_scipy( *, orig_sr: float, target_sr: float, -): +) -> npt.NDArray[np.floating]: if orig_sr > target_sr: return scipy_signal.resample_poly(audio, 1, orig_sr // target_sr) elif orig_sr < target_sr: @@ -181,7 +257,7 @@ class AudioResampler: def __init__( self, target_sr: float | 
None = None, - method: Literal["librosa", "scipy"] = "librosa", + method: Literal["pyav", "resampy", "scipy"] = "resampy", ): self.target_sr = target_sr self.method = method @@ -203,8 +279,10 @@ def resample( abs_tol=1e-6, ): return audio - if self.method == "librosa": - return resample_audio_librosa( + if self.method == "pyav": + return resample_audio_pyav(audio, orig_sr=orig_sr, target_sr=self.target_sr) + if self.method == "resampy": + return resample_audio_resampy( audio, orig_sr=orig_sr, target_sr=self.target_sr ) elif self.method == "scipy": @@ -214,7 +292,7 @@ def resample( else: raise ValueError( f"Invalid resampling method: {self.method}. " - "Supported methods are 'librosa' and 'scipy'." + "Supported methods are 'pyav' and 'scipy'." ) diff --git a/vllm/multimodal/media/audio.py b/vllm/multimodal/media/audio.py index 88dcb0b0186a..ae0a9f55bdce 100644 --- a/vllm/multimodal/media/audio.py +++ b/vllm/multimodal/media/audio.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import math from io import BytesIO from pathlib import Path @@ -14,58 +15,80 @@ from .base import MediaIO try: - import librosa + import av except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] + av = PlaceholderModule("av") # type: ignore[assignment] try: import soundfile except ImportError: soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] + try: - import av + import resampy except ImportError: - av = PlaceholderModule("av") # type: ignore[assignment] + resampy = PlaceholderModule("resampy") # type: ignore[assignment] -def extract_audio_from_video_bytes( - data: bytes, -) -> tuple[npt.NDArray, float]: - """Extract the audio track from raw video bytes using PyAV. +# Public libsndfile error codes exposed via `soundfile.LibsndfileError.code`, soundfile +# being librosa's main backend. 
Used to validate if an audio loading error is due to a +# server error vs a client error (invalid audio file). +# 1 = unrecognised format (file is not a supported audio container) +# 3 = malformed file (corrupt or structurally invalid audio) +# 4 = unsupported encoding (codec not supported by this libsndfile build) +_BAD_SF_CODES = {1, 3, 4} - PyAV wraps FFmpeg's C libraries in-process — no subprocess is - spawned, which is critical to avoid crashing CUDA-active vLLM - worker processes. - The returned waveform is at the native sample rate of the video's - audio stream. Resampling to a model-specific rate is left to the - downstream :class:`AudioResampler` in the parsing pipeline. +def load_audio_pyav( + path: BytesIO | Path | str, + *, + sr: float | None = 22050, + mono: bool = True, +) -> tuple[npt.NDArray, float]: + """Load an audio file using PyAV (FFmpeg), returning float32 mono waveform. + + Decodes the audio stream at its native sample rate. Channel reduction to + mono is performed by averaging across channels. Resampling to a + model-specific rate is left to the downstream :class:`AudioResampler`. Args: - data: Raw video file bytes (e.g. from an mp4 file). + path: A :class:`~io.BytesIO` buffer, a filesystem + :class:`~pathlib.Path`, or a string path. Returns: - A tuple of ``(waveform, sample_rate)`` suitable for use as an - :class:`AudioItem`. + ``(waveform, sample_rate)`` where *waveform* is a 1-D float32 + NumPy array and *sample_rate* is the native sample rate in Hz. """ - if data is None or len(data) == 0: - raise ValueError( - "Cannot extract audio: video bytes are missing or empty. " - "Ensure video was loaded with keep_video_bytes=True for " - "audio-in-video extraction." 
- ) + native_sr = None try: - with av.open(BytesIO(data)) as container: + with av.open(path) as container: if not container.streams.audio: - raise ValueError("No audio stream found in the video.") + raise ValueError("No audio stream found.") stream = container.streams.audio[0] + stream.thread_type = "AUTO" native_sr = stream.rate + sr = sr or native_sr chunks: list[npt.NDArray] = [] - for frame in container.decode(audio=0): - arr = frame.to_ndarray() - chunks.append(arr.mean(axis=0) if arr.ndim > 1 else arr) + needs_resampling = not math.isclose( + float(sr), + float(native_sr), + rel_tol=0.0, + abs_tol=1e-6, + ) + resampler = ( + av.AudioResampler(format="fltp", layout="mono", rate=sr) + if needs_resampling + else None + ) + for frame in container.decode(stream): + if needs_resampling: + assert resampler is not None + for out_frame in resampler.resample(frame): + chunks.append(out_frame.to_ndarray()) + else: + chunks.append(frame.to_ndarray()) except ValueError: raise except Exception as e: @@ -77,37 +100,54 @@ def extract_audio_from_video_bytes( if not chunks: raise ValueError("No audio found in the video.") - audio = np.concatenate(chunks).astype(np.float32) - return audio, float(native_sr) + audio = np.concatenate(chunks, axis=-1).astype(np.float32) + if mono and audio.ndim > 1: + audio = np.mean(audio, axis=0) + return audio, sr -def is_video(data: bytes) -> bool: - """Check if the fetched bytes are video""" - if len(data) < 12: - return False - box_type = data[4:8] - major_brand = data[8:12] +def load_audio_soundfile( + path: BytesIO | Path | str, + *, + sr: float | None = 22050, + mono: bool = True, +) -> tuple[np.ndarray, int]: + """Load audio via soundfile""" + with soundfile.SoundFile(path) as f: + native_sr = f.samplerate + y = f.read(dtype="float32", always_2d=False).T - MP4_BRANDS = { - b"mp41", - b"mp42", # MP4 - b"isom", # ISO Base Media - b"iso2", - b"iso4", - b"iso5", - b"iso6", - b"M4V ", - b"M4A ", # Apple - b"avc1", # H.264 - b"dash", # DASH - 
b"mmp4", - b"MSNV", - } + if mono and y.ndim > 1: + y = np.mean(y, axis=tuple(range(y.ndim - 1))) - is_avi = data[:4] == b"RIFF" and major_brand == b"AVI " - is_mp4 = box_type == b"ftyp" and major_brand in MP4_BRANDS - return is_mp4 or is_avi + if sr is not None and sr != native_sr: + y = resampy.resample(y, sr_orig=native_sr, sr_new=sr) + return y, int(sr) + return y, native_sr + + +def load_audio( + path: BytesIO | Path | str, + *, + sr: float | None = 22050, + mono: bool = True, +): + try: + return load_audio_soundfile(path, sr=sr, mono=mono) + except soundfile.LibsndfileError as exc: + # Only fall back for known format-detection failures. + # Re-raise anything else (e.g. corrupt but recognised format). + if exc.code not in _BAD_SF_CODES: + raise + # soundfile may have advanced the BytesIO seek position before failing; + # reset it so PyAV can read from the beginning. + if isinstance(path, BytesIO): + path.seek(0) + try: + return load_audio_pyav(path, sr=sr, mono=mono) + except Exception as pyav_exc: + raise ValueError("Invalid or unsupported audio file.") from pyav_exc class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): @@ -128,9 +168,7 @@ def __init__(self, **kwargs) -> None: self.kwargs = kwargs def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]: - if is_video(data): - return extract_audio_from_video_bytes(data) - return librosa.load(BytesIO(data), sr=None) + return load_audio(BytesIO(data), sr=None) def load_base64( self, @@ -140,7 +178,7 @@ def load_base64( return self.load_bytes(pybase64.b64decode(data)) def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]: - return librosa.load(filepath, sr=None) + return load_audio(filepath, sr=None) def encode_base64( self, diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 6a588dad0207..9e1774e3921b 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -497,7 +497,7 @@ def __init__( *, target_sr: float | None = None, target_channels: int | None = None, - 
audio_resample_method: Literal["librosa", "scipy"] = "librosa", + audio_resample_method: Literal["pyav", "scipy"] = "pyav", video_needs_metadata: bool = False, expected_hidden_size: int | None = None, ) -> None: diff --git a/vllm/parser/parser_manager.py b/vllm/parser/parser_manager.py index 4331eba9884f..5577dfb1d8bb 100644 --- a/vllm/parser/parser_manager.py +++ b/vllm/parser/parser_manager.py @@ -199,7 +199,7 @@ def get_tool_parser( parser: type[ToolParser] | None = None if not enable_auto_tools or tool_parser_name is None: return parser - logger.info('"auto" tool choice has been enabled.') + logger.info_once('"auto" tool choice has been enabled.') try: if ( diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 7070fd0b604d..50a79cbb0b8d 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -511,6 +511,11 @@ def support_hybrid_kv_cache(cls) -> bool: def support_static_graph_mode(cls) -> bool: return True + @classmethod + def support_deep_gemm(cls) -> bool: + """Currently, only Hopper and Blackwell GPUs are supported.""" + return cls.is_device_capability(90) or cls.is_device_capability_family(100) + @classmethod def num_compute_units(cls, device_id: int = 0) -> int: return torch.cuda.get_device_properties(device_id).multi_processor_count diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 619b403ba4c1..39688bb8b235 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -712,6 +712,13 @@ def support_static_graph_mode(cls) -> bool: """ return False + @classmethod + def support_deep_gemm(cls) -> bool: + """ + Returns if DeepGEMM is supported by the current platform. 
+ """ + return False + @classmethod def use_custom_op_collectives(cls) -> bool: """ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 3c5f8a0795c5..29d7d5ce8592 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -28,6 +28,7 @@ from amdsmi import ( AmdSmiException, amdsmi_get_gpu_asic_info, + amdsmi_get_gpu_device_uuid, amdsmi_get_processor_handles, amdsmi_init, amdsmi_shut_down, @@ -377,7 +378,6 @@ class RocmPlatform(Platform): "fbgemm_fp8", "gguf", "quark", - "ptpc_fp8", "mxfp4", "petit_nvfp4", "torchao", @@ -608,6 +608,20 @@ def get_device_name(cls, device_id: int = 0) -> str: return _ROCM_DEVICE_ID_NAME_MAP[device_name] return asic_info["market_name"] + @classmethod + @with_amdsmi_context + def get_device_uuid(cls, device_id: int = 0) -> str: + try: + device = amdsmi_get_processor_handles()[device_id] + except AmdSmiException as error: + logger.error("amdsmi device query failed ", exc_info=error) + return "" + try: + device_uuid = amdsmi_get_gpu_device_uuid(device) + except AmdSmiException as error: + logger.error("amdsmi device uuid query failed ", exc_info=error) + return device_uuid + @classmethod def get_device_total_memory(cls, device_id: int = 0) -> int: device_props = torch.cuda.get_device_properties(device_id) diff --git a/vllm/pooling_params.py b/vllm/pooling_params.py index e5e993b75556..b347ec831abc 100644 --- a/vllm/pooling_params.py +++ b/vllm/pooling_params.py @@ -7,9 +7,12 @@ import msgspec from vllm.config import ModelConfig, PoolerConfig +from vllm.logger import init_logger from vllm.sampling_params import RequestOutputKind from vllm.tasks import PoolingTask +logger = init_logger(__name__) + class LateInteractionParams( msgspec.Struct, @@ -54,10 +57,6 @@ class PoolingParams( dimensions: int | None = None # --8<-- [end:embed-pooling-params] - ## for classification, scoring and rerank - # --8<-- [start:classify-pooling-params] - # --8<-- [end:classify-pooling-params] - ## for step pooling models step_tag_id: int | 
None = None returned_token_ids: list[int] | None = None @@ -79,7 +78,6 @@ def valid_parameters(self): return { "embed": ["dimensions", "use_activation"], "classify": ["use_activation"], - "score": ["use_activation"], "token_embed": ["dimensions", "use_activation"], "token_classify": ["use_activation"], } @@ -89,6 +87,13 @@ def clone(self) -> "PoolingParams": return deepcopy(self) def verify(self, model_config: ModelConfig) -> None: + if self.task == "score": + logger.warning_once( + "`score` task is deprecated and will be removed in v0.20. " + "Please use `classify` instead." + ) + self.task = "classify" + # plugin task uses io_processor.parse_request to verify inputs, # skipping PoolingParams verify if self.task == "plugin": @@ -184,7 +189,7 @@ def _set_default_parameters(self, model_config: ModelConfig): elif self.dimensions < 1: raise ValueError("Dimensions must be greater than 0") - elif self.task in ["classify", "score", "token_classify"]: + elif self.task in ["classify", "token_classify"]: if self.use_activation is None: self.use_activation = True else: diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index b468712adb0c..63946e8fdd22 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -172,9 +172,6 @@ def warmup(self, chat_params: ChatParams) -> None: For chat requests: - Jinja2 template compilation - - For multi-modal requests: - - Importing libraries such as librosa triggers JIT compilation. """ from vllm.entrypoints.chat_utils import ChatTemplateResolutionError diff --git a/vllm/tasks.py b/vllm/tasks.py index 83dd7f85eee0..4e324c188519 100644 --- a/vllm/tasks.py +++ b/vllm/tasks.py @@ -8,7 +8,6 @@ PoolingTask = Literal[ "embed", "classify", - "score", "token_embed", "token_classify", "plugin", @@ -16,10 +15,6 @@ ] POOLING_TASKS: tuple[PoolingTask, ...] 
= get_args(PoolingTask) -# Score API handles score/rerank for: -# - "score" task (score_type: cross-encoder models) -# - "embed" task (score_type: bi-encoder models) -# - "token_embed" task (score_type: late interaction models) ScoreType = Literal["bi-encoder", "cross-encoder", "late-interaction"] FrontendTask = Literal["render"] diff --git a/vllm/transformers_utils/configs/colpali.py b/vllm/transformers_utils/configs/colpali.py index f64aa7564fd6..c40c58b25ce1 100644 --- a/vllm/transformers_utils/configs/colpali.py +++ b/vllm/transformers_utils/configs/colpali.py @@ -27,7 +27,6 @@ def __init__( embedding_dim: int | None = None, embed_dim: int | None = None, dim: int | None = None, - projection_dim: int | None = None, colbert_dim: int | None = None, pooling: str | None = None, vlm_config: dict | None = None, @@ -37,7 +36,6 @@ def __init__( self.embedding_dim = embedding_dim self.embed_dim = embed_dim self.dim = dim - self.projection_dim = projection_dim self.colbert_dim = colbert_dim self.pooling = pooling diff --git a/vllm/transformers_utils/configs/deepseek_vl2.py b/vllm/transformers_utils/configs/deepseek_vl2.py index 822e8cdd0bcf..80fedd1017ca 100644 --- a/vllm/transformers_utils/configs/deepseek_vl2.py +++ b/vllm/transformers_utils/configs/deepseek_vl2.py @@ -90,8 +90,6 @@ def __init__( class DeepseekVLV2Config(PretrainedConfig): model_type = "deepseek_vl_v2" architectures: list[str] | None = None - vision_config: VisionEncoderConfig - projector_config: MlpProjectorConfig tile_tag: str = "2D" global_view_pos: str = "head" diff --git a/vllm/transformers_utils/configs/mistral.py b/vllm/transformers_utils/configs/mistral.py index 90728bbffb60..bdeadec1bf07 100644 --- a/vllm/transformers_utils/configs/mistral.py +++ b/vllm/transformers_utils/configs/mistral.py @@ -257,7 +257,6 @@ def _remap_mistral_audio_args(config: dict) -> dict: encoder_attention_heads=encoder_args["n_heads"], encoder_head_dim=encoder_args["head_dim"], vocab_size=encoder_args["vocab_size"], - 
max_source_positions=encoder_args["max_source_positions"], is_encoder_decoder=False, # Override WhisperConfig default is_causal=encoder_args.get("causal", False), sliding_window=encoder_args.get("sliding_window", None), @@ -270,6 +269,10 @@ def _remap_mistral_audio_args(config: dict) -> dict: max_position_embeddings=block_pool_size * config["max_position_embeddings"], ), } + # Sometimes max_source_positions is explicitly set to None in params.json but this + # is not a valid value for WhisperConfig (or downstream code that uses it). + if (max_source_positions := encoder_args.get("max_source_positions")) is not None: + config["audio_config"].max_source_positions = max_source_positions if quant_config: config["quantization_config"] = quant_config return config diff --git a/vllm/transformers_utils/configs/parakeet.py b/vllm/transformers_utils/configs/parakeet.py index efd4c466478b..7c7a5ddd800e 100644 --- a/vllm/transformers_utils/configs/parakeet.py +++ b/vllm/transformers_utils/configs/parakeet.py @@ -6,11 +6,21 @@ class ParakeetConfig(ParakeetEncoderConfig): - llm_hidden_size: int - projection_hidden_size: int - projection_bias: bool - projection_eps: float = 1e-5 - sampling_rate: int + def __init__( + self, + llm_hidden_size: int, + projection_hidden_size: int, + projection_bias: bool, + sampling_rate: int, + projection_eps: float = 1e-5, + **kwargs, + ): + super().__init__(**kwargs) + self.llm_hidden_size = llm_hidden_size + self.projection_hidden_size = projection_hidden_size + self.projection_bias = projection_bias + self.sampling_rate = sampling_rate + self.projection_eps = projection_eps @staticmethod def from_hf_config( diff --git a/vllm/transformers_utils/configs/qwen3_asr.py b/vllm/transformers_utils/configs/qwen3_asr.py index 28fa96e72f40..a08b2b7de34e 100644 --- a/vllm/transformers_utils/configs/qwen3_asr.py +++ b/vllm/transformers_utils/configs/qwen3_asr.py @@ -408,7 +408,6 @@ def __init__( support_languages=None, **kwargs, ): - 
super().__init__(**kwargs) if thinker_config is None: thinker_config = {} logger.info( @@ -417,6 +416,7 @@ def __init__( self.thinker_config = Qwen3ASRThinkerConfig(**thinker_config) self.support_languages = support_languages + super().__init__(**kwargs) def get_text_config(self, decoder=False) -> "PretrainedConfig": """ diff --git a/vllm/transformers_utils/configs/speculators/base.py b/vllm/transformers_utils/configs/speculators/base.py index 2a39e2f16b06..697c9d52e81b 100644 --- a/vllm/transformers_utils/configs/speculators/base.py +++ b/vllm/transformers_utils/configs/speculators/base.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import os +from dataclasses import fields, is_dataclass from typing import Any from transformers import PretrainedConfig @@ -15,11 +16,21 @@ class SpeculatorsConfig(PretrainedConfig): model_type = "speculators" def __init__(self, **kwargs): - """In Transformers v5, `PretrainedConfig` is decorated with `dataclass` and - `huggingface_hub.dataclasses.strict(accept_kwargs=True)`. 
- Inheriting classes do not inherit the `accept_kwargs=True` behaviour so we must - explicitly pass any kwargs to `PretrainedConfig.__init__`.""" - super().__init__(**kwargs) + # Transformers v4 - super().__init__ which sets all kwargs as attributes + if not is_dataclass(PretrainedConfig): + return super().__init__(**kwargs) + # Transformers v5 - super().__init__ performs some validation before + # setting all kwargs as attributes, so we set them first to be safe + pre_trained_config_fields = {f.name for f in fields(PretrainedConfig)} + super_kwargs = dict() + for key, value in kwargs.items(): + if key == "model_type": + continue # model_type is set as a class variable, so skip it here + elif key in pre_trained_config_fields: + super_kwargs[key] = value + else: + setattr(self, key, value) + super().__init__(**super_kwargs) @classmethod def from_pretrained( diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py index 26fc0404200f..f5fb290d1491 100644 --- a/vllm/transformers_utils/model_arch_config_convertor.py +++ b/vllm/transformers_utils/model_arch_config_convertor.py @@ -228,7 +228,7 @@ def is_deepseek_mla(self) -> bool: "pangu_ultra_moe_mtp", "bailing_hybrid", ): - return self.hf_text_config.kv_lora_rank is not None + return getattr(self.hf_text_config, "kv_lora_rank", None) is not None elif self.hf_text_config.model_type == "eagle": # if the model is an EAGLE module, check for the # underlying architecture @@ -241,7 +241,7 @@ def is_deepseek_mla(self) -> bool: "deepseek_v32", "deepseek_mtp", ) - and self.hf_text_config.kv_lora_rank is not None + and getattr(self.hf_text_config, "kv_lora_rank", None) is not None ) return False diff --git a/vllm/transformers_utils/processors/fireredasr2.py b/vllm/transformers_utils/processors/fireredasr2.py index 4bde53015003..bba7e7ee0495 100644 --- a/vllm/transformers_utils/processors/fireredasr2.py +++ b/vllm/transformers_utils/processors/fireredasr2.py @@ 
-188,7 +188,7 @@ def padding_position_is_0(padded_input, input_lengths): for speech in raw_speech: """ We must multiply by 32768 here because FireRedASR2 loads audio data - using kaldiio.load_mat, while vLLM loads audio data using librosa. + using kaldiio.load_mat, while vLLM loads audio data using pyav. """ speech = speech * 32768 fbank = self.fbank(sampling_rate, speech) diff --git a/vllm/transformers_utils/processors/isaac.py b/vllm/transformers_utils/processors/isaac.py index 986b70840d25..1464afc6677f 100644 --- a/vllm/transformers_utils/processors/isaac.py +++ b/vllm/transformers_utils/processors/isaac.py @@ -6,12 +6,14 @@ from typing import Any import numpy as np -import PIL.Image import torch import torch.nn.functional as F +from PIL import Image from transformers import BatchFeature, ProcessorMixin, TensorType from typing_extensions import TypedDict, Unpack +from vllm.tokenizers.hf import HfTokenizer + MAX_PIXELS = 60_000_000 # 60-megapixel ceiling ≈ 8200 × 7300 px # Vision preprocessing constants @@ -39,7 +41,7 @@ def _make_writeable(arr: np.ndarray) -> np.ndarray: return arr.copy() -def extract_image_pil(image: PIL.Image.Image) -> torch.Tensor | None: +def extract_image_pil(image: Image.Image) -> torch.Tensor: if image.width * image.height > MAX_PIXELS: raise ValueError( f"Image (w={image.width}, h={image.height}) > MAX=`{MAX_PIXELS}`" @@ -314,31 +316,30 @@ class IsaacImageProcessorKwargs(TypedDict, total=False): class IsaacImageProcessor: - patch_size = 16 - max_num_patches = 6144 - min_num_patches = 256 - pixel_shuffle_scale = 2 - valid_kwargs = IsaacImageProcessorKwargs model_input_names = ["pixel_values", "image_grid_thw"] - def __init__(self, kwargs): - self.patch_size = kwargs.pop("patch_size", self.patch_size) - self.vision_max_num_patches = kwargs.pop( - "vision_max_num_patches", self.max_num_patches - ) - self.vision_min_num_patches = kwargs.pop( - "vision_min_num_patches", self.min_num_patches - ) - self.pixel_shuffle_scale = 
kwargs.pop("pixel_shuffle_scale", 2) - - def preprocess( + def __init__( self, - images: list[torch.Tensor], - return_tensors: str | TensorType | None, + patch_size: int = 16, + vision_max_num_patches: int = 6144, + vision_min_num_patches: int = 256, + pixel_shuffle_scale: int = 2, + ) -> None: + self.patch_size = patch_size + self.vision_max_num_patches = vision_max_num_patches + self.vision_min_num_patches = vision_min_num_patches + self.pixel_shuffle_scale = pixel_shuffle_scale + + def __call__( + self, + images: Image.Image | list[Image.Image], + return_tensors: str | TensorType | None = None, **kwargs: Unpack[IsaacImageProcessorKwargs], ) -> BatchFeature: """Preprocess images into format compatible with vLLM input processing.""" + if not isinstance(images, list): + images = [images] all_pixel_values: list[torch.Tensor] = [] all_image_grids: list[torch.Tensor] = [] @@ -388,23 +389,40 @@ def preprocess( class IsaacProcessor(ProcessorMixin): attributes = ["image_processor", "tokenizer"] - def __init__(self, image_processor=None, tokenizer=None, **kwargs): - self.image_token = kwargs.pop("image_token", "") + def __init__( + self, + image_processor: IsaacImageProcessor, + tokenizer: HfTokenizer, + image_token: str = "", + ): self.image_processor = image_processor self.tokenizer = tokenizer - def __call__(self, text=None, images=None, **kwargs) -> BatchFeature: - result = {} + self.image_token = image_token + def __call__( + self, + text: str | list[str] | None = None, + images: Image.Image | list[Image.Image] | None = None, + return_tensors: str | TensorType | None = None, + **kwargs, + ) -> BatchFeature: if images is not None: - image_inputs = self.image_processor.preprocess(images, **kwargs) + image_inputs = self.image_processor( + images, + return_tensors=return_tensors, + **kwargs, + ) image_grid_thw = image_inputs["image_grid_thw"] - result.update(image_inputs) + else: + image_inputs = {} + image_grid_thw = [] - if text is not None: - if not isinstance(text, 
list): - text = [text] + if text is not None: + if not isinstance(text, list): + text = [text] + if image_inputs: text = text.copy() # below lines change text in-place merge_length = self.image_processor.pixel_shuffle_scale**2 index = 0 @@ -417,10 +435,14 @@ def __call__(self, text=None, images=None, **kwargs) -> BatchFeature: index += 1 text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>") - if text is not None: - result.update(self.tokenizer(text, **kwargs)) + text_inputs = self.tokenizer(text, return_tensors=return_tensors) + else: + text_inputs = {} - return BatchFeature(result) + return BatchFeature( + data={**text_inputs, **image_inputs}, + tensor_type=return_tensors, + ) def apply_chat_template( self, diff --git a/vllm/transformers_utils/processors/kimi_k25.py b/vllm/transformers_utils/processors/kimi_k25.py index 06147f2113dd..edee9734ce42 100644 --- a/vllm/transformers_utils/processors/kimi_k25.py +++ b/vllm/transformers_utils/processors/kimi_k25.py @@ -1,38 +1,41 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import torch -from transformers import BatchFeature +from transformers import BaseImageProcessor, BatchFeature, TensorType from transformers.processing_utils import ProcessorMixin from vllm.multimodal.inputs import VisionChunk +from vllm.tokenizers.hf import HfTokenizer class KimiK25Processor(ProcessorMixin): - attributes = ["tokenizer"] - tokenizer_class = "AutoTokenizer" + attributes = ["image_processor", "tokenizer"] def __init__( - self, media_processor=None, tokenizer=None, media_token_id: int | None = None - ): - super().__init__(tokenizer) - self.media_processor = media_processor + self, + image_processor: BaseImageProcessor, + tokenizer: HfTokenizer, + media_token_id: int, + ) -> None: + self.image_processor = image_processor + self.tokenizer = tokenizer + self.media_token_id = media_token_id - assert self.media_token_id is not None def __call__( self, + text: str | 
list[str] | None = None, vision_chunks: list[VisionChunk] | None = None, - *, - text: list[int] | str, + return_tensors: str | TensorType | None = None, **kwargs, ) -> BatchFeature: """ Args: - vision_chunks: List of VisionChunk items to be processed. - For image: VisionChunkImage with type='image', image=PIL.Image - For video_chunk: VisionChunkVideo with type='video_chunk', - video_chunk=list[PIL.Image] - text: The token ids to be fed to a model (required). + text: The text to be fed to the model. + vision_chunks: List of `VisionChunk` items to be processed. + For image: `VisionChunkImage` with + `type='image', image=PIL.Image` + For video_chunk: `VisionChunkVideo` with + `type='video_chunk', video_chunk=list[PIL.Image]` Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: @@ -42,31 +45,44 @@ def __call__( - **grid_thws** -- list of image 3D grid in LLM. Returned when `vision_chunks` is not `None`. """ - mm_inputs = {} - input_ids = self.tokenizer.encode(text) if isinstance(text, str) else text if vision_chunks is not None: - assert isinstance(vision_chunks, list) - mm_inputs = self.media_processor.preprocess(vision_chunks) + mm_inputs = self.image_processor.preprocess( + vision_chunks, + return_tensors=return_tensors, + ) + else: + mm_inputs = {} + + if text is not None: + if not isinstance(text, list): + text = [text] + + text_inputs = self.tokenizer(text) + + # Note: Modify in-place + input_ids: list[list[int]] = text_inputs["input_ids"] # type: ignore + + if vision_chunks is not None: + num_tokens_per_chunk = [ + self.image_processor.media_tokens_calculator(chunk) + for chunk in vision_chunks + ] - num_tokens_per_chunk = [ - self.media_processor.media_tokens_calculator(chunk) - for chunk in vision_chunks - ] + for i in range(len(input_ids)): + new_input_ids = [] + for token in input_ids[i]: + if token == self.media_token_id: + new_input_ids.extend( + [self.media_token_id] * num_tokens_per_chunk.pop(0) + ) + else: + 
new_input_ids.append(token) - new_input_ids = [] - for token in input_ids: - if token == self.media_token_id: - new_input_ids.extend( - [self.media_token_id] * num_tokens_per_chunk.pop(0) - ) - else: - new_input_ids.append(token) - input_ids = new_input_ids + input_ids[i] = new_input_ids + else: + text_inputs = {} - # XXX: _apply_hf_processor_text_mm will call tolist() on input_ids return BatchFeature( - data={ - "input_ids": torch.tensor([input_ids]), - **mm_inputs, - } + data={**text_inputs, **mm_inputs}, + tensor_type=return_tensors, ) diff --git a/vllm/transformers_utils/processors/step3_vl.py b/vllm/transformers_utils/processors/step3_vl.py index 66cf10e39588..71540f433fd1 100644 --- a/vllm/transformers_utils/processors/step3_vl.py +++ b/vllm/transformers_utils/processors/step3_vl.py @@ -286,11 +286,9 @@ def _convert_images_to_pixel_values( def __call__( self, - images: Image.Image | list[Image.Image] | None = None, + images: Image.Image | list[Image.Image], return_tensors: str | TensorType | None = None, ) -> BatchFeature: - if images is None: - images = [] if not isinstance(images, list): images = [images] diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index ee104a6cc75c..fb6208212ae9 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -70,10 +70,7 @@ def is_deep_gemm_supported() -> bool: """Return `True` if DeepGEMM is supported on the current platform. Currently, only Hopper and Blackwell GPUs are supported. 
""" - is_supported_arch = current_platform.is_cuda() and ( - current_platform.is_device_capability(90) - or current_platform.is_device_capability_family(100) - ) + is_supported_arch = current_platform.support_deep_gemm() return envs.VLLM_USE_DEEP_GEMM and has_deep_gemm() and is_supported_arch diff --git a/vllm/v1/attention/backend.py b/vllm/v1/attention/backend.py index d7283b6c846f..cd49ea30e6f4 100644 --- a/vllm/v1/attention/backend.py +++ b/vllm/v1/attention/backend.py @@ -362,6 +362,11 @@ class CommonAttentionMetadata: dcp_local_seq_lens_cpu: torch.Tensor | None = None """Sequence lengths of the local rank in decode context parallelism world""" + is_prefilling: torch.Tensor | None = None + """(batch_size,) bool tensor: True if request is still in prefill phase + (num_computed_tokens < num_prompt_tokens). Used by some backends to + distinguish actual decodes from short extends.""" + # WARNING: Deprecated fields. Will be removed in a future release (v0.15.0) _seq_lens_cpu: torch.Tensor | None = None _num_computed_tokens_cpu: torch.Tensor | None = None @@ -443,6 +448,7 @@ def unpadded( encoder_seq_lens_cpu=maybe_slice_reqs(self.encoder_seq_lens_cpu), dcp_local_seq_lens=maybe_slice_reqs(self.dcp_local_seq_lens), dcp_local_seq_lens_cpu=maybe_slice_reqs(self.dcp_local_seq_lens_cpu), + is_prefilling=maybe_slice_reqs(self.is_prefilling), ) diff --git a/vllm/v1/attention/backends/mamba_attn.py b/vllm/v1/attention/backends/mamba_attn.py index bdb820eac35e..59f2e7ca51a6 100644 --- a/vllm/v1/attention/backends/mamba_attn.py +++ b/vllm/v1/attention/backends/mamba_attn.py @@ -358,7 +358,9 @@ def _compute_common_metadata( num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( split_decodes_and_prefills( - common_attn_metadata, decode_threshold=decode_threshold + common_attn_metadata, + decode_threshold=decode_threshold, + treat_short_extends_as_decodes=False, ) ) diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla.py 
b/vllm/v1/attention/backends/mla/flashinfer_mla.py index 3de0dcdd8c01..16d01bd338ca 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla.py @@ -77,17 +77,17 @@ def supports_combination( use_sparse: bool, device_capability: DeviceCapability, ) -> str | None: - # FlashInfer MLA kernel requires qk_nope_head_dim in [64, 128] + # FlashInfer MLA kernel requires qk_nope_head_dim in [64, 128, 192] from vllm.config import get_current_vllm_config vllm_config = get_current_vllm_config() if vllm_config.model_config is not None: hf_text_config = vllm_config.model_config.hf_text_config qk_nope_head_dim = getattr(hf_text_config, "qk_nope_head_dim", 1) - if qk_nope_head_dim not in [64, 128]: + if qk_nope_head_dim not in [64, 128, 192]: return ( - f"FlashInfer MLA kernel requires qk_nope_head_dim in [64, 128], " - f"but got {qk_nope_head_dim}" + "FlashInfer MLA kernel requires qk_nope_head_dim " + f"in [64, 128, 192], but got {qk_nope_head_dim}" ) return None diff --git a/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py b/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py index 9554457b494e..7b5ec0d4976a 100644 --- a/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashinfer_mla_sparse.py @@ -113,17 +113,17 @@ def supports_combination( use_sparse: bool, device_capability: DeviceCapability, ) -> str | None: - # FlashInfer MLA sparse kernel requires qk_nope_head_dim == 128 + # FlashInfer MLA sparse kernel requires qk_nope_head_dim in [128, 192] from vllm.config import get_current_vllm_config vllm_config = get_current_vllm_config() if vllm_config.model_config is not None: hf_text_config = vllm_config.model_config.hf_text_config qk_nope_head_dim = getattr(hf_text_config, "qk_nope_head_dim", 1) - if qk_nope_head_dim != 128: + if qk_nope_head_dim not in [128, 192]: return ( - f"FlashInfer MLA Sparse kernel requires qk_nope_head_dim == 128, " - f"but got 
{qk_nope_head_dim}" + "FlashInfer MLA Sparse kernel requires qk_nope_head_dim " + f"in [128, 192], but got {qk_nope_head_dim}" ) # Check for index_topk which indicates sparse model if not hasattr(hf_text_config, "index_topk"): diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 42459815ef9e..0f41993fc695 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -489,11 +489,15 @@ def split_decodes_and_prefills( common_attn_metadata: CommonAttentionMetadata, decode_threshold: int = 1, require_uniform: bool = False, + treat_short_extends_as_decodes: bool = True, ) -> tuple[int, int, int, int]: """ Assuming a reordered batch, finds the boundary between prefill and decode requests. + The batch is expected to be ordered as: + decode → short_extend → long_extend → prefill + Args: common_attn_metadata: CommonAttentionMetadata object containing the batch metadata. @@ -501,6 +505,9 @@ def split_decodes_and_prefills( require_uniform: If True, requires that all decode requests have the same query length. When set, some queries may be considered prefills even if they are <= decode_threshold, in order to ensure uniformity. + treat_short_extends_as_decodes: If True (default), short extends + (query_len <= threshold but still prefilling) are counted as + decodes. If False, they are counted as prefills. Returns: num_decodes: The number of decode requests. 
@@ -513,8 +520,10 @@ def split_decodes_and_prefills( num_tokens = common_attn_metadata.num_actual_tokens query_start_loc = common_attn_metadata.query_start_loc_cpu - if max_query_len <= decode_threshold and ( - not require_uniform or decode_threshold <= 1 + if ( + max_query_len <= decode_threshold + and (not require_uniform or decode_threshold <= 1) + and treat_short_extends_as_decodes ): return num_reqs, 0, num_tokens, 0 @@ -533,11 +542,14 @@ def split_decodes_and_prefills( else: is_prefill = query_lens > decode_threshold + if not treat_short_extends_as_decodes: + assert common_attn_metadata.is_prefilling is not None + is_prefill |= common_attn_metadata.is_prefilling + if not torch.any(is_prefill): return num_reqs, 0, num_tokens, 0 first_prefill = is_prefill.int().argmax(dim=-1).item() - assert torch.all(query_lens[:first_prefill] <= decode_threshold) num_decodes = first_prefill num_prefills = num_reqs - num_decodes num_decode_tokens = query_start_loc[first_prefill].item() @@ -581,39 +593,52 @@ def reorder_batch_to_split_decodes_and_prefills( Reorders the batch to split into prefill and decode requests; places all requests with <= decode_threshold tokens at the front of the batch. + The batch is reordered into 4 regions: + decode: (num_scheduled <= threshold AND is not prefilling) + short_extend: (num_scheduled <= threshold AND is chunked prefilling) + long_extend: (num_scheduled > threshold AND is chunked prefilling) + prefill: (num_computed == 0) # First chunks + Returns: True if the batch was modified, False otherwise. 
""" - # We now want to reorder the batch into decode → extend → prefill order - # where: - # decode: request with num_scheduled_tokens <= decode_threshold - # extend: non-decode request with existing context - # prefill: non-decode request with no existing context - # NOTE for now we loosely use "decode" to mean requests where attention is - # likely memory-bound and "prefill" to mean requests where attention is - # likely compute-bound, num_reqs = len(input_batch.req_ids) num_scheduled_tokens = [ scheduler_output.num_scheduled_tokens[id] for id in input_batch.req_ids ] num_scheduled_tokens_np = np.array(num_scheduled_tokens) num_computed_tokens_np = input_batch.num_computed_tokens_cpu[:num_reqs] - - is_prefill = num_computed_tokens_np == 0 - is_decode = (num_scheduled_tokens_np <= decode_threshold) & (~is_prefill) - is_extend = (num_scheduled_tokens_np > decode_threshold) & (~is_prefill) - - # Desired order: decode → extend → prefill - req_regions = np.zeros(is_decode.shape, dtype=np.int32) # 0 = decode by default - req_regions[is_extend] = 1 - req_regions[is_prefill] = 2 + num_prompt_tokens_np = input_batch.num_prompt_tokens[:num_reqs] + + has_context = num_computed_tokens_np > 0 + is_below_threshold = num_scheduled_tokens_np <= decode_threshold + done_prefilling = num_computed_tokens_np >= num_prompt_tokens_np + + # Mutually exclusive categories (exactly one True per request): + # 1. No context yet -> prefill + # 2. Has context, above threshold -> long_extend + # 3. Has context, below threshold, still prefilling -> short_extend + # 4. 
Has context, below threshold, done prefilling -> decode + is_pure_prefill = ~has_context + is_long_extend = has_context & ~is_below_threshold + is_short_extend = has_context & is_below_threshold & ~done_prefilling + is_decode = has_context & is_below_threshold & done_prefilling + + # Desired order: decode → short_extend → long_extend → prefill + req_regions = np.zeros(num_reqs, dtype=np.int32) # 0 = decode by default + req_regions[is_short_extend] = 1 + req_regions[is_long_extend] = 2 + req_regions[is_pure_prefill] = 3 num_decodes = int(is_decode.sum()) - num_extends = int(is_extend.sum()) + num_short_extends = int(is_short_extend.sum()) + num_long_extends = int(is_long_extend.sum()) + num_prefills = int(is_pure_prefill.sum()) - target_regions = np.zeros(num_reqs, dtype=np.int32) - target_regions[num_decodes : num_decodes + num_extends] = 1 - target_regions[num_decodes + num_extends :] = 2 + target_regions = np.repeat( + [0, 1, 2, 3], + [num_decodes, num_short_extends, num_long_extends, num_prefills], + ).astype(np.int32) needs_swap = req_regions != target_regions diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 421b25c0d0d4..0fa59579ee76 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -13,6 +13,7 @@ from functools import partial from inspect import isclass, signature from logging import DEBUG +from multiprocessing.queues import Queue from typing import Any, TypeVar, cast import msgspec @@ -59,6 +60,7 @@ UtilityOutput, UtilityResult, ) +from vllm.v1.engine.tensor_ipc import TensorIpcReceiver from vllm.v1.engine.utils import ( EngineHandshakeMetadata, EngineZmqAddresses, @@ -788,6 +790,7 @@ def __init__( executor_class: type[Executor], log_stats: bool, client_handshake_address: str | None = None, + tensor_queue: Queue | None = None, *, engine_index: int = 0, ): @@ -802,6 +805,12 @@ def __init__( self.engines_running = False self.shutdown_state = EngineShutdownState.RUNNING + # Receiver for tensor IPC + self.tensor_ipc_receiver: 
TensorIpcReceiver | None = None + if tensor_queue is not None: + self.tensor_ipc_receiver = TensorIpcReceiver(tensor_queue) + logger.info("Using tensor IPC queue for multimodal tensor sharing") + with self._perform_handshakes( handshake_address, identity, @@ -1340,9 +1349,11 @@ def process_input_sockets( ): """Input socket IO thread.""" - # Msgpack serialization decoding. - add_request_decoder = MsgpackDecoder(EngineCoreRequest) - generic_decoder = MsgpackDecoder() + # Msgpack serialization decoding with optional tensor IPC receiver. + add_request_decoder = MsgpackDecoder( + EngineCoreRequest, oob_tensor_provider=self.tensor_ipc_receiver + ) + generic_decoder = MsgpackDecoder(oob_tensor_provider=self.tensor_ipc_receiver) with ExitStack() as stack, zmq.Context() as ctx: input_sockets = [ @@ -1418,10 +1429,7 @@ def process_input_sockets( self.input_queue.put_nowait((request_type, request)) def process_output_sockets( - self, - output_paths: list[str], - coord_output_path: str | None, - engine_index: int, + self, output_paths: list[str], coord_output_path: str | None, engine_index: int ): """Output socket IO thread.""" @@ -1580,6 +1588,7 @@ def __init__( executor_class: type[Executor], log_stats: bool, client_handshake_address: str | None = None, + tensor_queue: Queue | None = None, ): assert vllm_config.model_config.is_moe, ( "DPEngineCoreProc should only be used for MoE models" @@ -1605,6 +1614,7 @@ def __init__( log_stats, client_handshake_address, engine_index=dp_rank, + tensor_queue=tensor_queue, ) def _init_data_parallel(self, vllm_config: VllmConfig): @@ -1694,6 +1704,8 @@ def run_busy_loop(self): if self.eep_scaling_state is not None: _ = self.eep_scaling_state.progress() if self.eep_scaling_state.is_complete(): + if self.eep_scaling_state.worker_type == "removing": + raise SystemExit self.process_input_queue_block = True self.eep_scaling_state = None @@ -1857,20 +1869,7 @@ def _eep_scale_up_before_kv_init(self): scale_type="scale_up", reconfig_request=None, ) 
- self.model_executor.collective_rpc("init_device") - self.model_executor.collective_rpc("load_model") - self._eep_send_engine_core_notification( - EEPNotificationType.NEW_CORE_ENGINES_WEIGHTS_INIT_READY - ) - self.model_executor.collective_rpc( - "elastic_ep_execute", args=("receive_weights",) - ) - self.available_gpu_memory_for_kv_cache = ( - ParallelConfig.sync_kv_cache_memory_size(self.dp_group, -1) - ) - self.model_executor.collective_rpc( - "elastic_ep_execute", args=("prepare_new_worker",) - ) + self.eep_scaling_state.run_pre_kv_init_states() self.process_input_queue_block = False diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 91664058d8c4..b9a3c7545e16 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -12,6 +12,7 @@ from collections.abc import Awaitable, Callable, Sequence from concurrent.futures import Future from dataclasses import dataclass +from multiprocessing.queues import Queue from threading import Thread from typing import Any, TypeAlias, TypeVar @@ -45,6 +46,7 @@ from vllm.v1.engine.coordinator import DPCoordinator from vllm.v1.engine.core import EngineCore, EngineCoreProc from vllm.v1.engine.exceptions import EngineDeadError +from vllm.v1.engine.tensor_ipc import TensorIpcSender from vllm.v1.engine.utils import ( CoreEngineActorManager, CoreEngineProcManager, @@ -477,9 +479,6 @@ def __init__( client_addresses: dict[str, str] | None = None, ): self.vllm_config = vllm_config - # Serialization setup. - self.encoder = MsgpackEncoder() - self.decoder = MsgpackDecoder(EngineCoreOutputs) # ZMQ setup. sync_ctx = zmq.Context(io_threads=2) @@ -501,11 +500,14 @@ def __init__( enable_input_socket_handover = parallel_config.enable_elastic_ep self.stats_update_address: str | None = None + tensor_queue: Queue | None = None if client_addresses: # Engines are managed externally to this client. 
input_address = client_addresses["input_address"] output_address = client_addresses["output_address"] self.stats_update_address = client_addresses.get("stats_update_address") + # Tensor queues passed via client_addresses for multi-API-server case + tensor_queue = client_addresses.get("tensor_queue") # type: ignore[assignment] self.input_socket = self.resources.input_socket = make_zmq_socket( self.ctx, input_address, @@ -532,7 +534,7 @@ def __init__( with launch_core_engines( vllm_config, executor_class, log_stats, addresses - ) as (engine_manager, coordinator, addresses): + ) as (engine_manager, coordinator, addresses, tensor_queue): self.resources.coordinator = coordinator self.resources.engine_manager = engine_manager @@ -542,6 +544,17 @@ def __init__( coordinator.get_stats_publish_address() ) + # Serialization setup with tensor queues for multimodal tensor IPC. + tensor_ipc_sender: TensorIpcSender | None = None + model_config = getattr(vllm_config, "model_config", None) + if model_config is not None and model_config.multimodal_config is not None: + mm_tensor_ipc = model_config.multimodal_config.mm_tensor_ipc + if mm_tensor_ipc == "torch_shm" and tensor_queue is not None: + tensor_ipc_sender = TensorIpcSender(tensor_queue) + + self.encoder = MsgpackEncoder(oob_tensor_consumer=tensor_ipc_sender) + self.decoder = MsgpackDecoder(EngineCoreOutputs) + dp_size = parallel_config.data_parallel_size dp_rank = parallel_config.data_parallel_index dp_local_size = parallel_config.data_parallel_size_local diff --git a/vllm/v1/engine/tensor_ipc.py b/vllm/v1/engine/tensor_ipc.py new file mode 100644 index 000000000000..2a8391b17e6d --- /dev/null +++ b/vllm/v1/engine/tensor_ipc.py @@ -0,0 +1,178 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +"""Tensor IPC transport via torch.multiprocessing.Queue. 
+ +This module contains the queue-based transport logic for sharing tensors +between processes (e.g., API server -> engine core). The msgpack layer +emits/consumes lightweight :class:`TensorIpcData` values, while transport +state such as request association, handle generation, queue routing, buffering, +and cleanup lives here. +""" + +import dataclasses +import uuid +from collections import defaultdict +from dataclasses import field +from multiprocessing.queues import Queue as MPQueue +from typing import Any + +import torch + +from vllm.logger import init_logger +from vllm.v1.serial_utils import OOBTensorConsumer + +logger = init_logger(__name__) + +TensorIpcQueue = MPQueue + + +@dataclasses.dataclass +class TensorIpcData: + """ + Data sent via torch.multiprocessing.Queue for zero-copy IPC. + + Contains the tensor_id and the actual tensor. The tensor is + shared in memory (GPU or CPU) for efficient inter-process communication. + """ + + sender_id: str + message_id: int + tensor_id: int + tensor: torch.Tensor + + +class TensorIpcSender(OOBTensorConsumer): + """Send-side logic for tensor IPC via torch.multiprocessing.Queue. + + Uses a single queue targeting rank 0 (the only rank that consumes + multimodal tensors during TP>1 / PP>1. Note: DP>1 not supported). + """ + + def __init__(self, queue: TensorIpcQueue): + self.queue = queue + self._tensor_id_counter = 0 + self._message_counter = 0 + self._sender_id = uuid.uuid4().hex[:8] + + def set_target_engine(self, target_engine: int) -> None: + if target_engine != 0: + raise IndexError( + "TensorIpcSender only supports a single queue; " + f"got target engine {target_engine}" + ) + + def new_message(self) -> None: + self._message_counter += 1 + self._tensor_id_counter = 0 + + def __call__(self, tensor: torch.Tensor) -> dict[str, Any] | None: + """Send tensor via queue, return its handle. 
Returns None if failed.""" + try: + # Move tensor to shared memory for IPC + # This is required for proper inter-process communication + if not tensor.is_shared(): + tensor = tensor.share_memory_() + + metadata = { + "sender_id": self._sender_id, + "message_id": self._message_counter, + "tensor_id": self._tensor_id_counter, + } + + self._tensor_id_counter += 1 + + ipc_data = TensorIpcData(**metadata, tensor=tensor) # type: ignore[arg-type] + + # Use a timeout to avoid blocking indefinitely + self.queue.put(ipc_data, timeout=10.0) + + logger.debug( + "Sent tensor %s for (shape=%s, device=%s) " + "via IPC queue (shared memory)", + metadata, + tensor.shape, + tensor.device, + ) + + return metadata + except Exception as e: + logger.warning( + "Failed to send tensor via IPC queue: %s. " + "Falling back to standard serialization.", + e, + ) + return None + + +@dataclasses.dataclass +class _Sender: + current_message_id: int = -1 + tensors: dict[int, dict[int, torch.Tensor]] = field(default_factory=dict) + + +class TensorIpcReceiver: + """Receive-side logic for tensor IPC via torch.multiprocessing.Queue. + + Wraps the queue receive logic previously embedded in MsgpackDecoder. + """ + + def __init__(self, queue: TensorIpcQueue): + self.queue = queue + self._tensor_buffers = defaultdict[str, _Sender](_Sender) + + def __call__( + self, dtype: str, shape: tuple[int, ...], meta: dict[str, Any] + ) -> torch.Tensor: + """Retrieve a tensor from torch.multiprocessing.Queue. + + Uses a drain-and-buffer pattern: drains all available tensors from + the queue, buffering them, until the requested tensor is found. + Works for CUDA and CPU. + """ + + # Create lookup key from handle + sender_id: str = meta["sender_id"] + message_id: int = meta["message_id"] + tensor_id: int = meta["tensor_id"] + + # Drain all available tensors. We save them regardless if this is + # the one we're waiting for as they may arrive out of order from + # multiple producers. 
+ while True: + sender = self._tensor_buffers.get(sender_id) + if sender is not None: + tensors = sender.tensors + tensor = tensors.get(message_id, {}).pop(tensor_id, None) + if tensor is not None: + if sender.current_message_id != message_id: + while tensors and (mid := next(iter(tensors))) < message_id: + if stale := sender.tensors.pop(mid): + logger.warning( + "Discarding %d stale tensors from sender %s", + len(stale), sender_id, + ) + sender.current_message_id = message_id + logger.debug( + "Received tensor %s from sender %s for (shape=%s, device=%s) " + "via IPC queue (shared memory)", + (message_id, tensor_id), + sender_id, + tensor.shape, + tensor.device, + ) + return tensor + + ipc_data: TensorIpcData = self.queue.get(timeout=10.0) + + # Store tensor + sender = self._tensor_buffers[ipc_data.sender_id] + if sender.current_message_id > ipc_data.message_id: + logger.warning( + "Ignoring stale tensor from sender %s", ipc_data.sender_id + ) + continue + + sender.tensors.setdefault(ipc_data.message_id, {})[ipc_data.tensor_id] = ( + ipc_data.tensor + ) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 52c7217346e8..90ec47edb033 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -10,6 +10,7 @@ from enum import Enum, auto from multiprocessing import Process, connection from multiprocessing.process import BaseProcess +from multiprocessing.queues import Queue from typing import TYPE_CHECKING from unittest.mock import patch @@ -95,6 +96,7 @@ def __init__( executor_class: type[Executor], log_stats: bool, client_handshake_address: str | None = None, + tensor_queue: Queue | None = None, ): context = get_mp_context() common_kwargs = { @@ -103,6 +105,7 @@ def __init__( "handshake_address": handshake_address, "executor_class": executor_class, "log_stats": log_stats, + "tensor_queue": tensor_queue, } if client_handshake_address: @@ -864,6 +867,7 @@ def launch_core_engines( CoreEngineProcManager | CoreEngineActorManager | None, DPCoordinator | None, 
EngineZmqAddresses, + Queue | None, ] ]: """Launch engine and DP coordinator processes as needed.""" @@ -878,6 +882,14 @@ def launch_core_engines( offline_mode = local_start_index is not None + # Create a single tensor IPC queue for sharing multimodal tensors between + # API servers and engine core. Returns a single queue since we only support + # DP=1 for this data flow. + tensor_queue: Queue | None = None + multimodal_config = vllm_config.model_config.multimodal_config + if multimodal_config is not None and multimodal_config.mm_tensor_ipc == "torch_shm": + tensor_queue = get_mp_context().Queue() + # Run the DP Coordinator process with rank 0 when in online DP mode. # The coordinator is needed for: # 1. Internal/hybrid LB: collecting and publishing queue stats for load balancing @@ -913,7 +925,7 @@ def launch_core_engines( log_stats=log_stats, ) - yield engine_actor_manager, coordinator, addresses + yield engine_actor_manager, coordinator, addresses, tensor_queue return if offline_mode: @@ -975,11 +987,12 @@ def launch_core_engines( local_engine_count=local_engine_count, start_index=dp_rank, local_start_index=local_start_index or 0, + tensor_queue=tensor_queue, ) else: local_engine_manager = None - yield local_engine_manager, coordinator, addresses + yield local_engine_manager, coordinator, addresses, tensor_queue # Now wait for engines to start. 
wait_for_engine_startup( diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index e715a1d767fa..f9b77154067a 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -602,13 +602,14 @@ def __init__( ) # Load model - is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH - if not is_eep_new_worker: - self.worker.init_device() - # Update process title now that parallel groups are initialized - self.setup_proc_title_and_log_prefix( - enable_ep=vllm_config.parallel_config.enable_expert_parallel - ) + self.worker.init_device() + # Update process title now that parallel groups are initialized + self.setup_proc_title_and_log_prefix( + enable_ep=vllm_config.parallel_config.enable_expert_parallel + ) + if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: + self.worker.elastic_ep_execute("load_model") + else: self.worker.load_model() scheduler_config = vllm_config.scheduler_config diff --git a/vllm/v1/executor/ray_executor.py b/vllm/v1/executor/ray_executor.py index 1cbc11990e08..c4e5e7bc67ed 100644 --- a/vllm/v1/executor/ray_executor.py +++ b/vllm/v1/executor/ray_executor.py @@ -382,9 +382,10 @@ def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): all_kwargs.append(kwargs) self.collective_rpc("init_worker", args=(all_kwargs,)) - is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH - if not is_eep_new_worker: - self.collective_rpc("init_device") + self.collective_rpc("init_device") + if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: + self.collective_rpc("elastic_ep_execute", args=("load_model",)) + else: self.collective_rpc("load_model") def _update_block_size(worker): diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py index e90a1ab23915..b616c3b7b8ad 100644 --- a/vllm/v1/executor/uniproc_executor.py +++ b/vllm/v1/executor/uniproc_executor.py @@ -43,12 +43,14 @@ def _init_executor(self) -> None: max_workers=1, thread_name_prefix="WorkerAsyncOutput" ) - 
is_eep_new_worker = envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH self.driver_worker.init_worker(all_kwargs=[kwargs]) - if not is_eep_new_worker: - self.driver_worker.init_device() + self.driver_worker.init_device() + + if envs.VLLM_ELASTIC_EP_SCALE_UP_LAUNCH: + self.driver_worker.elastic_ep_execute("load_model") + else: self.driver_worker.load_model() - current_platform.update_block_size_for_backend(self.vllm_config) + current_platform.update_block_size_for_backend(self.vllm_config) def _distributed_args(self) -> tuple[str, int, int]: """Return (distributed_init_method, rank, local_rank).""" diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index f20d78542247..5d5877d1692e 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -5,7 +5,6 @@ import time from abc import ABC, abstractmethod from collections.abc import Callable -from typing import TypeAlias from prometheus_client import Counter, Gauge, Histogram @@ -14,7 +13,7 @@ from vllm.config import SupportsMetricsInfo, VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( KVConnectorLogging, - KVConnectorPrometheus, + KVConnectorProm, ) from vllm.logger import init_logger from vllm.plugins import STAT_LOGGER_PLUGINS_GROUP, load_plugins_by_group @@ -28,6 +27,7 @@ PromptTokenStats, SchedulerStats, ) +from vllm.v1.metrics.utils import create_metric_per_engine from vllm.v1.spec_decode.metrics import SpecDecodingLogging, SpecDecodingProm logger = init_logger(__name__) @@ -391,7 +391,7 @@ class PrometheusStatLogger(AggregateStatLoggerBase): _counter_cls = Counter _histogram_cls = Histogram _spec_decoding_cls = SpecDecodingProm - _kv_connector_cls = KVConnectorPrometheus + _kv_connector_cls = KVConnectorProm _perf_metrics_cls = PerfMetricsProm def __init__( @@ -415,9 +415,10 @@ def __init__( model_name = vllm_config.model_config.served_model_name max_model_len = vllm_config.model_config.max_model_len - per_engine_labelvalues: dict[int, list[object]] = { + 
self.per_engine_labelvalues: dict[int, list[object]] = { idx: [model_name, str(idx)] for idx in engine_indexes } + per_engine_labelvalues = self.per_engine_labelvalues self.spec_decoding_prom = self._spec_decoding_cls( vllm_config.speculative_config, labelnames, per_engine_labelvalues @@ -438,8 +439,8 @@ def __init__( multiprocess_mode="mostrecent", labelnames=labelnames, ) - self.gauge_scheduler_running = make_per_engine( - gauge_scheduler_running, engine_indexes, model_name + self.gauge_scheduler_running = create_metric_per_engine( + gauge_scheduler_running, per_engine_labelvalues ) gauge_scheduler_waiting = self._gauge_cls( @@ -448,8 +449,8 @@ def __init__( multiprocess_mode="mostrecent", labelnames=labelnames, ) - self.gauge_scheduler_waiting = make_per_engine( - gauge_scheduler_waiting, engine_indexes, model_name + self.gauge_scheduler_waiting = create_metric_per_engine( + gauge_scheduler_waiting, per_engine_labelvalues ) gauge_engine_sleep_state = self._gauge_cls( @@ -484,8 +485,8 @@ def __init__( multiprocess_mode="mostrecent", labelnames=labelnames, ) - self.gauge_kv_cache_usage = make_per_engine( - gauge_kv_cache_usage, engine_indexes, model_name + self.gauge_kv_cache_usage = create_metric_per_engine( + gauge_kv_cache_usage, per_engine_labelvalues ) if envs.VLLM_COMPUTE_NANS_IN_LOGITS: @@ -497,8 +498,8 @@ def __init__( ), labelnames=labelnames, ) - self.counter_corrupted_requests = make_per_engine( - counter_corrupted_requests, engine_indexes, model_name + self.counter_corrupted_requests = create_metric_per_engine( + counter_corrupted_requests, per_engine_labelvalues ) counter_prefix_cache_queries = self._counter_cls( @@ -508,8 +509,8 @@ def __init__( ), labelnames=labelnames, ) - self.counter_prefix_cache_queries = make_per_engine( - counter_prefix_cache_queries, engine_indexes, model_name + self.counter_prefix_cache_queries = create_metric_per_engine( + counter_prefix_cache_queries, per_engine_labelvalues ) counter_prefix_cache_hits = self._counter_cls( 
@@ -517,8 +518,8 @@ def __init__( documentation=("Prefix cache hits, in terms of number of cached tokens."), labelnames=labelnames, ) - self.counter_prefix_cache_hits = make_per_engine( - counter_prefix_cache_hits, engine_indexes, model_name + self.counter_prefix_cache_hits = create_metric_per_engine( + counter_prefix_cache_hits, per_engine_labelvalues ) # @@ -533,8 +534,8 @@ def __init__( ), labelnames=labelnames, ) - self.counter_connector_prefix_cache_queries = make_per_engine( - counter_connector_prefix_cache_queries, engine_indexes, model_name + self.counter_connector_prefix_cache_queries = create_metric_per_engine( + counter_connector_prefix_cache_queries, per_engine_labelvalues ) counter_connector_prefix_cache_hits = self._counter_cls( @@ -545,8 +546,8 @@ def __init__( ), labelnames=labelnames, ) - self.counter_connector_prefix_cache_hits = make_per_engine( - counter_connector_prefix_cache_hits, engine_indexes, model_name + self.counter_connector_prefix_cache_hits = create_metric_per_engine( + counter_connector_prefix_cache_hits, per_engine_labelvalues ) # @@ -560,8 +561,8 @@ def __init__( ), labelnames=labelnames, ) - self.counter_mm_cache_queries = make_per_engine( - counter_mm_cache_queries, engine_indexes, model_name + self.counter_mm_cache_queries = create_metric_per_engine( + counter_mm_cache_queries, per_engine_labelvalues ) counter_mm_cache_hits = self._counter_cls( @@ -571,8 +572,8 @@ def __init__( ), labelnames=labelnames, ) - self.counter_mm_cache_hits = make_per_engine( - counter_mm_cache_hits, engine_indexes, model_name + self.counter_mm_cache_hits = create_metric_per_engine( + counter_mm_cache_hits, per_engine_labelvalues ) # @@ -583,8 +584,8 @@ def __init__( documentation="Cumulative number of preemption from the engine.", labelnames=labelnames, ) - self.counter_num_preempted_reqs = make_per_engine( - counter_num_preempted_reqs, engine_indexes, model_name + self.counter_num_preempted_reqs = create_metric_per_engine( + 
counter_num_preempted_reqs, per_engine_labelvalues ) counter_prompt_tokens = self._counter_cls( @@ -592,8 +593,8 @@ def __init__( documentation="Number of prefill tokens processed.", labelnames=labelnames, ) - self.counter_prompt_tokens = make_per_engine( - counter_prompt_tokens, engine_indexes, model_name + self.counter_prompt_tokens = create_metric_per_engine( + counter_prompt_tokens, per_engine_labelvalues ) # Labeled prompt token counters by source @@ -617,8 +618,8 @@ def __init__( documentation="Number of cached prompt tokens (local + external).", labelnames=labelnames, ) - self.counter_prompt_tokens_cached = make_per_engine( - counter_prompt_tokens_cached, engine_indexes, model_name + self.counter_prompt_tokens_cached = create_metric_per_engine( + counter_prompt_tokens_cached, per_engine_labelvalues ) # Recomputed tokens (last token recomputed when entire prompt is cached) @@ -627,8 +628,8 @@ def __init__( documentation="Number of cached tokens recomputed for forward pass.", labelnames=labelnames, ) - self.counter_prompt_tokens_recomputed = make_per_engine( - counter_prompt_tokens_recomputed, engine_indexes, model_name + self.counter_prompt_tokens_recomputed = create_metric_per_engine( + counter_prompt_tokens_recomputed, per_engine_labelvalues ) counter_generation_tokens = self._counter_cls( @@ -636,8 +637,8 @@ def __init__( documentation="Number of generation tokens processed.", labelnames=labelnames, ) - self.counter_generation_tokens = make_per_engine( - counter_generation_tokens, engine_indexes, model_name + self.counter_generation_tokens = create_metric_per_engine( + counter_generation_tokens, per_engine_labelvalues ) self.counter_request_success: dict[FinishReason, dict[int, Counter]] = {} @@ -663,8 +664,8 @@ def __init__( buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames, ) - self.histogram_num_prompt_tokens_request = make_per_engine( - histogram_num_prompt_tokens_request, engine_indexes, model_name + 
self.histogram_num_prompt_tokens_request = create_metric_per_engine( + histogram_num_prompt_tokens_request, per_engine_labelvalues ) histogram_num_generation_tokens_request = self._histogram_cls( @@ -673,8 +674,8 @@ def __init__( buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames, ) - self.histogram_num_generation_tokens_request = make_per_engine( - histogram_num_generation_tokens_request, engine_indexes, model_name + self.histogram_num_generation_tokens_request = create_metric_per_engine( + histogram_num_generation_tokens_request, per_engine_labelvalues ) # TODO: This metric might be incorrect in case of using multiple @@ -686,8 +687,8 @@ def __init__( buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384], labelnames=labelnames, ) - self.histogram_iteration_tokens = make_per_engine( - histogram_iteration_tokens, engine_indexes, model_name + self.histogram_iteration_tokens = create_metric_per_engine( + histogram_iteration_tokens, per_engine_labelvalues ) histogram_max_num_generation_tokens_request = self._histogram_cls( @@ -696,8 +697,8 @@ def __init__( buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames, ) - self.histogram_max_num_generation_tokens_request = make_per_engine( - histogram_max_num_generation_tokens_request, engine_indexes, model_name + self.histogram_max_num_generation_tokens_request = create_metric_per_engine( + histogram_max_num_generation_tokens_request, per_engine_labelvalues ) histogram_n_request = self._histogram_cls( @@ -706,8 +707,8 @@ def __init__( buckets=[1, 2, 5, 10, 20], labelnames=labelnames, ) - self.histogram_n_request = make_per_engine( - histogram_n_request, engine_indexes, model_name + self.histogram_n_request = create_metric_per_engine( + histogram_n_request, per_engine_labelvalues ) histogram_max_tokens_request = self._histogram_cls( @@ -716,8 +717,8 @@ def __init__( buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames, ) - self.histogram_max_tokens_request = 
make_per_engine( - histogram_max_tokens_request, engine_indexes, model_name + self.histogram_max_tokens_request = create_metric_per_engine( + histogram_max_tokens_request, per_engine_labelvalues ) # @@ -752,8 +753,8 @@ def __init__( ], labelnames=labelnames, ) - self.histogram_time_to_first_token = make_per_engine( - histogram_time_to_first_token, engine_indexes, model_name + self.histogram_time_to_first_token = create_metric_per_engine( + histogram_time_to_first_token, per_engine_labelvalues ) histogram_inter_token_latency = self._histogram_cls( @@ -782,8 +783,8 @@ def __init__( ], labelnames=labelnames, ) - self.histogram_inter_token_latency = make_per_engine( - histogram_inter_token_latency, engine_indexes, model_name + self.histogram_inter_token_latency = create_metric_per_engine( + histogram_inter_token_latency, per_engine_labelvalues ) histogram_request_time_per_output_token = self._histogram_cls( @@ -812,8 +813,8 @@ def __init__( ], labelnames=labelnames, ) - self.histogram_request_time_per_output_token = make_per_engine( - histogram_request_time_per_output_token, engine_indexes, model_name + self.histogram_request_time_per_output_token = create_metric_per_engine( + histogram_request_time_per_output_token, per_engine_labelvalues ) request_latency_buckets = [ @@ -845,8 +846,8 @@ def __init__( buckets=request_latency_buckets, labelnames=labelnames, ) - self.histogram_e2e_time_request = make_per_engine( - histogram_e2e_time_request, engine_indexes, model_name + self.histogram_e2e_time_request = create_metric_per_engine( + histogram_e2e_time_request, per_engine_labelvalues ) histogram_queue_time_request = self._histogram_cls( @@ -855,8 +856,8 @@ def __init__( buckets=request_latency_buckets, labelnames=labelnames, ) - self.histogram_queue_time_request = make_per_engine( - histogram_queue_time_request, engine_indexes, model_name + self.histogram_queue_time_request = create_metric_per_engine( + histogram_queue_time_request, per_engine_labelvalues ) 
histogram_inference_time_request = self._histogram_cls( @@ -865,8 +866,8 @@ def __init__( buckets=request_latency_buckets, labelnames=labelnames, ) - self.histogram_inference_time_request = make_per_engine( - histogram_inference_time_request, engine_indexes, model_name + self.histogram_inference_time_request = create_metric_per_engine( + histogram_inference_time_request, per_engine_labelvalues ) histogram_prefill_time_request = self._histogram_cls( @@ -875,8 +876,8 @@ def __init__( buckets=request_latency_buckets, labelnames=labelnames, ) - self.histogram_prefill_time_request = make_per_engine( - histogram_prefill_time_request, engine_indexes, model_name + self.histogram_prefill_time_request = create_metric_per_engine( + histogram_prefill_time_request, per_engine_labelvalues ) histogram_decode_time_request = self._histogram_cls( @@ -885,8 +886,8 @@ def __init__( buckets=request_latency_buckets, labelnames=labelnames, ) - self.histogram_decode_time_request = make_per_engine( - histogram_decode_time_request, engine_indexes, model_name + self.histogram_decode_time_request = create_metric_per_engine( + histogram_decode_time_request, per_engine_labelvalues ) histogram_prefill_kv_computed_request = self._histogram_cls( @@ -898,8 +899,8 @@ def __init__( buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames, ) - self.histogram_prefill_kv_computed_request = make_per_engine( - histogram_prefill_kv_computed_request, engine_indexes, model_name + self.histogram_prefill_kv_computed_request = create_metric_per_engine( + histogram_prefill_kv_computed_request, per_engine_labelvalues ) # @@ -939,8 +940,8 @@ def __init__( buckets=kv_cache_residency_buckets, labelnames=labelnames, ) - self.histogram_kv_block_lifetime = make_per_engine( - histogram_kv_block_lifetime, engine_indexes, model_name + self.histogram_kv_block_lifetime = create_metric_per_engine( + histogram_kv_block_lifetime, per_engine_labelvalues ) histogram_kv_block_idle_before_evict = self._histogram_cls( @@ 
-952,8 +953,8 @@ def __init__( buckets=kv_cache_residency_buckets, labelnames=labelnames, ) - self.histogram_kv_block_idle_before_evict = make_per_engine( - histogram_kv_block_idle_before_evict, engine_indexes, model_name + self.histogram_kv_block_idle_before_evict = create_metric_per_engine( + histogram_kv_block_idle_before_evict, per_engine_labelvalues ) histogram_kv_block_reuse_gap = self._histogram_cls( @@ -967,8 +968,8 @@ def __init__( buckets=kv_cache_residency_buckets, labelnames=labelnames, ) - self.histogram_kv_block_reuse_gap = make_per_engine( - histogram_kv_block_reuse_gap, engine_indexes, model_name + self.histogram_kv_block_reuse_gap = create_metric_per_engine( + histogram_kv_block_reuse_gap, per_engine_labelvalues ) else: self.histogram_kv_block_lifetime = {} @@ -1203,15 +1204,6 @@ def log_engine_initialized(self): self.log_metrics_info("cache_config", self.vllm_config.cache_config) -PromMetric: TypeAlias = Gauge | Counter | Histogram - - -def make_per_engine( - metric: PromMetric, engine_idxs: list[int], model_name: object -) -> dict[int, PromMetric]: - return {idx: metric.labels(model_name, str(idx)) for idx in engine_idxs} - - def build_buckets(mantissa_lst: list[int], max_value: int) -> list[int]: """ Builds a list of buckets with increasing powers of 10 multiplied by diff --git a/vllm/v1/metrics/perf.py b/vllm/v1/metrics/perf.py index 81348efc13b3..91629cb57816 100644 --- a/vllm/v1/metrics/perf.py +++ b/vllm/v1/metrics/perf.py @@ -27,6 +27,7 @@ get_kv_cache_torch_dtype, ) from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.metrics.utils import create_metric_per_engine logger = init_logger(__name__) @@ -1291,7 +1292,9 @@ def __init__( ), labelnames=labelnames, ) - self.counter_flops = make_per_engine(counter_flops, per_engine_labelvalues) + self.counter_flops = create_metric_per_engine( + counter_flops, per_engine_labelvalues + ) counter_read_bytes = self._counter_cls( name="vllm:estimated_read_bytes_per_gpu_total", @@ -1301,7 
+1304,7 @@ def __init__( ), labelnames=labelnames, ) - self.counter_read_bytes = make_per_engine( + self.counter_read_bytes = create_metric_per_engine( counter_read_bytes, per_engine_labelvalues ) @@ -1313,7 +1316,7 @@ def __init__( ), labelnames=labelnames, ) - self.counter_write_bytes = make_per_engine( + self.counter_write_bytes = create_metric_per_engine( counter_write_bytes, per_engine_labelvalues ) @@ -1329,16 +1332,6 @@ def observe(self, perf_stats: PerfStats, engine_idx: int = 0): self.counter_write_bytes[engine_idx].inc(perf_stats.num_write_bytes_per_gpu) -def make_per_engine( - counter: prometheus_client.Counter, per_engine_labelvalues: dict[int, list[object]] -): - """Create a counter for each label value.""" - return { - idx: counter.labels(*labelvalues) - for idx, labelvalues in per_engine_labelvalues.items() - } - - ## util functions diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py index abc53f3802ea..a11b92680779 100644 --- a/vllm/v1/metrics/ray_wrappers.py +++ b/vllm/v1/metrics/ray_wrappers.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time -from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorPrometheus +from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorProm from vllm.v1.metrics.loggers import PrometheusStatLogger from vllm.v1.metrics.perf import PerfMetricsProm from vllm.v1.spec_decode.metrics import SpecDecodingProm @@ -168,9 +168,9 @@ class RaySpecDecodingProm(SpecDecodingProm): _counter_cls = RayCounterWrapper -class RayKVConnectorPrometheus(KVConnectorPrometheus): +class RayKVConnectorProm(KVConnectorProm): """ - RayKVConnectorPrometheus is used by RayMetrics to log Ray + RayKVConnectorProm is used by RayMetrics to log Ray metrics. Provides the same metrics as KV connectors but uses Ray's util.metrics library. 
""" @@ -197,7 +197,7 @@ class RayPrometheusStatLogger(PrometheusStatLogger): _counter_cls = RayCounterWrapper _histogram_cls = RayHistogramWrapper _spec_decoding_cls = RaySpecDecodingProm - _kv_connector_cls = RayKVConnectorPrometheus + _kv_connector_cls = RayKVConnectorProm _perf_metrics_cls = RayPerfMetricsProm @staticmethod diff --git a/vllm/v1/metrics/utils.py b/vllm/v1/metrics/utils.py new file mode 100644 index 000000000000..1ef56fc94869 --- /dev/null +++ b/vllm/v1/metrics/utils.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import TypeAlias + +from prometheus_client import Counter, Gauge, Histogram + +PromMetric: TypeAlias = Gauge | Counter | Histogram + + +def create_metric_per_engine( + metric: PromMetric, + per_engine_labelvalues: dict[int, list[object]], +) -> dict[int, PromMetric]: + """Create a labeled metric child for each engine index.""" + return { + idx: metric.labels(*labelvalues) + for idx, labelvalues in per_engine_labelvalues.items() + } diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py index be880bec22ac..204c8bd0e411 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -4,6 +4,7 @@ import dataclasses import importlib import pickle +from abc import ABC, abstractmethod from collections.abc import Callable, Sequence from functools import partial from inspect import isclass @@ -53,6 +54,27 @@ bytestr: TypeAlias = bytes | bytearray | memoryview | zmq.Frame +class OOBTensorConsumer(ABC): + @abstractmethod + def __call__(self, tensor: torch.Tensor) -> dict | None: + """ + Called with tensors for the current message. + Returns None to reject the tensor (falls back to regular serialization), + otherwise a dict with arbitrary placeholder data to be included + in the serialized message. 
+ """ + return None + + @abstractmethod + def new_message(self) -> None: + """Called at the start of each new encoded message.""" + pass + + +# dtype, shape, metadata -> tensor +OOBTensorProvider = Callable[[str, tuple[int, ...], dict], torch.Tensor] + + def _log_insecure_serialization_warning(): logger.warning_once( "Allowing insecure serialization using pickle due to " @@ -119,9 +141,16 @@ class MsgpackEncoder: By default, arrays below 256B are serialized inline Larger will get sent via dedicated messages. Note that this is a per-tensor limit. + + When a ``oob_tensor_consumer`` is provided, tensors (CUDA and CPU) will be + offered to it for out-of-band handling. """ - def __init__(self, size_threshold: int | None = None): + def __init__( + self, + size_threshold: int | None = None, + oob_tensor_consumer: OOBTensorConsumer | None = None, + ): if size_threshold is None: size_threshold = envs.VLLM_MSGPACK_ZERO_COPY_THRESHOLD self.encoder = msgpack.Encoder(enc_hook=self.enc_hook) @@ -130,11 +159,14 @@ def __init__(self, size_threshold: int | None = None): # pass custom data to the hook otherwise. 
self.aux_buffers: list[bytestr] | None = None self.size_threshold = size_threshold + self.oob_tensor_consumer = oob_tensor_consumer if envs.VLLM_ALLOW_INSECURE_SERIALIZATION: _log_insecure_serialization_warning() def encode(self, obj: Any) -> Sequence[bytestr]: try: + if self.oob_tensor_consumer is not None: + self.oob_tensor_consumer.new_message() self.aux_buffers = bufs = [b""] bufs[0] = self.encoder.encode(obj) # This `bufs` list allows us to collect direct pointers to backing @@ -147,6 +179,8 @@ def encode(self, obj: Any) -> Sequence[bytestr]: def encode_into(self, obj: Any, buf: bytearray) -> Sequence[bytestr]: try: + if self.oob_tensor_consumer is not None: + self.oob_tensor_consumer.new_message() self.aux_buffers = [buf] bufs = self.aux_buffers self.encoder.encode_into(obj, buf) @@ -222,17 +256,19 @@ def _encode_ndarray( def _encode_tensor( self, obj: torch.Tensor - ) -> tuple[str, tuple[int, ...], int | memoryview]: - assert self.aux_buffers is not None + ) -> tuple[str, tuple[int, ...], int | dict | memoryview]: + oob_consumer = self.oob_tensor_consumer # view the tensor as a contiguous 1D array of bytes - arr_data = tensor_data(obj) - if obj.nbytes < self.size_threshold: + if obj.nbytes < self.size_threshold and obj.is_cpu: # Smaller tensors are encoded inline, just like ndarrays. - data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, arr_data) + data = msgpack.Ext(CUSTOM_TYPE_RAW_VIEW, tensor_data(obj)) + elif oob_consumer is not None and (data := oob_consumer(obj)) is not None: + assert isinstance(data, dict) else: # Otherwise encode index of backing buffer to avoid copy. + assert self.aux_buffers is not None data = len(self.aux_buffers) - self.aux_buffers.append(arr_data) + self.aux_buffers.append(tensor_data(obj)) dtype = str(obj.dtype).removeprefix("torch.") return dtype, obj.shape, data @@ -279,9 +315,17 @@ class MsgpackDecoder: Note that unlike vanilla `msgspec` Decoders, this interface is generally not thread-safe when encoding tensors / numpy arrays. 
+ + ``oob_tensor_provider`` must be used when an OOBTensorConsumer is used on the + encoder side. """ - def __init__(self, t: Any | None = None, share_mem: bool = True): + def __init__( + self, + t: Any | None = None, + share_mem: bool = True, + oob_tensor_provider: OOBTensorProvider | None = None, + ): self.share_mem = share_mem self.pin_tensors = is_pin_memory_available() args = () if t is None else (t,) @@ -289,6 +333,7 @@ def __init__(self, t: Any | None = None, share_mem: bool = True): *args, ext_hook=self.ext_hook, dec_hook=self.dec_hook ) self.aux_buffers: Sequence[bytestr] = () + self.oob_tensor_provider = oob_tensor_provider if envs.VLLM_ALLOW_INSECURE_SERIALIZATION: _log_insecure_serialization_warning() @@ -353,6 +398,12 @@ def _decode_ndarray(self, arr: Any) -> np.ndarray: def _decode_tensor(self, arr: Any) -> torch.Tensor: dtype, shape, data = arr + if isinstance(data, dict): + assert self.oob_tensor_provider, ( + "Received OOB tensor but tensor provider is not set" + ) + return self.oob_tensor_provider(dtype, shape, data) + is_aux = isinstance(data, int) buffer = self.aux_buffers[data] if is_aux else data buffer = buffer if isinstance(buffer, memoryview) else memoryview(buffer) diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py index 6c16bc686d16..9a41ff5c818c 100644 --- a/vllm/v1/spec_decode/metrics.py +++ b/vllm/v1/spec_decode/metrics.py @@ -9,6 +9,7 @@ from vllm.config import SpeculativeConfig from vllm.logger import init_logger +from vllm.v1.metrics.utils import create_metric_per_engine logger = init_logger(__name__) @@ -155,7 +156,7 @@ def __init__( documentation="Number of spec decoding drafts.", labelnames=labelnames, ) - self.counter_spec_decode_num_drafts = make_per_engine( + self.counter_spec_decode_num_drafts = create_metric_per_engine( counter_drafts, per_engine_labelvalues ) @@ -164,7 +165,7 @@ def __init__( documentation="Number of draft tokens.", labelnames=labelnames, ) - 
self.counter_spec_decode_num_draft_tokens = make_per_engine( + self.counter_spec_decode_num_draft_tokens = create_metric_per_engine( counter_draft_tokens, per_engine_labelvalues ) @@ -173,7 +174,7 @@ def __init__( documentation="Number of accepted tokens.", labelnames=labelnames, ) - self.counter_spec_decode_num_accepted_tokens = make_per_engine( + self.counter_spec_decode_num_accepted_tokens = create_metric_per_engine( counter_accepted_tokens, per_engine_labelvalues ) @@ -212,14 +213,3 @@ def observe(self, spec_decoding_stats: SpecDecodingStats, engine_idx: int = 0): self.counter_spec_decode_num_accepted_tokens_per_pos[engine_idx] ): counter.inc(spec_decoding_stats.num_accepted_tokens_per_pos[pos]) - - -def make_per_engine( - counter: prometheus_client.Counter, - per_engine_labelvalues: dict[int, list[object]], -): - """Create a counter for each label value.""" - return { - idx: counter.labels(*labelvalues) - for idx, labelvalues in per_engine_labelvalues.items() - } diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 970465089e10..3710593dbd44 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -10,6 +10,7 @@ from dataclasses import dataclass from multiprocessing import connection from multiprocessing.process import BaseProcess +from multiprocessing.queues import Queue from typing import ( TYPE_CHECKING, Any, @@ -173,6 +174,7 @@ def __init__( input_addresses: list[str], output_addresses: list[str], stats_update_address: str | None = None, + tensor_queue: Queue | None = None, ): """Initialize and start API server worker processes. 
@@ -185,6 +187,7 @@ def __init__( input_addresses: Input addresses for each API server output_addresses: Output addresses for each API server stats_update_address: Optional stats update address + tensor_queue: Optional tensor IPC queue for sharing MM tensors """ self.listen_address = listen_address self.sock = sock @@ -205,6 +208,8 @@ def __init__( } if stats_update_address is not None: client_config["stats_update_address"] = stats_update_address + if tensor_queue is not None: + client_config["tensor_queue"] = tensor_queue proc = spawn_context.Process( target=target_server_fn, @@ -419,7 +424,7 @@ def tensor_data(tensor: torch.Tensor) -> memoryview: Returns: A memoryview of the tensor data as uint8. """ - return tensor.flatten().contiguous().view(torch.uint8).numpy().data + return tensor.flatten().cpu().contiguous().view(torch.uint8).numpy().data @dataclass diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py index 59786ed7a153..8e5bb11e4dad 100644 --- a/vllm/v1/worker/gpu/attn_utils.py +++ b/vllm/v1/worker/gpu/attn_utils.py @@ -111,6 +111,7 @@ def _reshape_kv_cache( kv_cache_config: KVCacheConfig, kv_cache_raw_tensors: dict[str, torch.Tensor], attn_backends: dict[str, AttentionBackend], + cache_dtype: str, ) -> dict[str, torch.Tensor]: kv_caches: dict[str, torch.Tensor] = {} for kv_cache_group_spec in kv_cache_config.kv_cache_groups: @@ -127,6 +128,7 @@ def _reshape_kv_cache( kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size, + cache_dtype, ) # FIXME(woosuk): Add kv_cache_stride_order to all attention backends. 
@@ -155,9 +157,12 @@ def init_kv_cache( kv_cache_config: KVCacheConfig, attn_backends: dict[str, AttentionBackend], device: torch.device, + cache_dtype: str, ) -> dict[str, torch.Tensor]: kv_cache_raw_tensors = _allocate_kv_cache(kv_cache_config, device) - kv_caches = _reshape_kv_cache(kv_cache_config, kv_cache_raw_tensors, attn_backends) + kv_caches = _reshape_kv_cache( + kv_cache_config, kv_cache_raw_tensors, attn_backends, cache_dtype + ) bind_kv_cache(kv_caches, forward_context, runner_kv_caches) return kv_caches diff --git a/vllm/v1/worker/gpu/block_table.py b/vllm/v1/worker/gpu/block_table.py index 3a2c0562a92c..e79a7afbd81e 100644 --- a/vllm/v1/worker/gpu/block_table.py +++ b/vllm/v1/worker/gpu/block_table.py @@ -169,7 +169,7 @@ def get_dummy_slot_mappings(self, num_tokens: int) -> torch.Tensor: return self.slot_mappings[:, :num_tokens] -@triton.jit +@triton.jit(do_not_specialize=["num_reqs"]) def _gather_block_tables_kernel( batch_idx_to_req_idx, # [batch_size] src_block_table_ptrs, # [num_kv_cache_groups] diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index b4a0c26cec02..5788b31d2583 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -195,7 +195,6 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): num_speculative_steps=self.num_speculative_steps, vocab_size=self.vocab_size, device=self.device, - cache_draft_logits=not use_strict_rejection_sampling, ) self.input_buffers = InputBuffers( max_num_reqs=self.max_num_reqs, @@ -360,6 +359,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: self.kv_cache_config, self.attn_backends, self.device, + self.cache_config.cache_dtype, ) self.kv_connector = get_kv_connector(self.vllm_config, kv_caches_dict) @@ -446,7 +446,6 @@ def _dummy_run( next_prefill_tokens=self.req_states.next_prefill_tokens, temperature=self.sampler.sampling_states.temperature.gpu, seeds=self.sampler.sampling_states.seeds.gpu, - 
draft_logits_out=self.req_states.draft_logits, num_tokens_across_dp=num_tokens_across_dp, dummy_run=True, skip_attn_for_dummy_run=skip_attn, @@ -557,18 +556,23 @@ def capture_model(self) -> int: ) return cuda_graph_size + def _remove_request(self, req_id: str) -> bool: + if not self.req_states.remove_request(req_id): + return False + if self.encoder_cache is not None: + self.encoder_cache.remove_request(req_id) + if self.prompt_logprobs_worker is not None: + self.prompt_logprobs_worker.remove_request(req_id) + self.lora_state.remove_request(req_id) + return True + def finish_requests(self, scheduler_output: SchedulerOutput) -> None: finished_req_ids = scheduler_output.finished_req_ids preempted_req_ids = scheduler_output.preempted_req_ids if preempted_req_ids: finished_req_ids = finished_req_ids.union(preempted_req_ids) for req_id in finished_req_ids: - self.req_states.remove_request(req_id) - if self.encoder_cache is not None: - self.encoder_cache.remove_request(req_id) - if self.prompt_logprobs_worker is not None: - self.prompt_logprobs_worker.remove_request(req_id) - self.lora_state.remove_request(req_id) + self._remove_request(req_id) def free_states(self, scheduler_output: SchedulerOutput) -> None: if self.encoder_cache is not None: @@ -580,6 +584,12 @@ def add_requests(self, scheduler_output: SchedulerOutput) -> None: assert new_req_data.prompt_token_ids is not None assert new_req_data.prefill_token_ids is not None req_id = new_req_data.req_id + + # Streaming input update: request already exists from a prior + # chunk. Remove old state so it can be cleanly re-added below + # with the updated prompt_token_ids and mm_features. + self._remove_request(req_id) + prompt_len = len(new_req_data.prompt_token_ids) self.req_states.add_request( req_id=req_id, @@ -815,11 +825,12 @@ def sample( else: # Rejection sampling for spec decoding. 
assert self.rejection_sampler is not None + assert self.speculator is not None sampler_output = self.rejection_sampler( logits, input_batch, # Draft logits are needed for probabilistic rejection sampling. - self.req_states.draft_logits, + self.speculator.draft_logits, ) # Get the number of sampled and rejected tokens. @@ -1145,7 +1156,6 @@ def sample_tokens( self.req_states.next_prefill_tokens, self.sampler.sampling_states.temperature.gpu, self.sampler.sampling_states.seeds.gpu, - self.req_states.draft_logits, num_tokens_across_dp=num_tokens_across_dp, ) self.req_states.draft_tokens[input_batch.idx_mapping] = draft_tokens diff --git a/vllm/v1/worker/gpu/model_states/default.py b/vllm/v1/worker/gpu/model_states/default.py index 104e4c1948b5..8e73867deb2e 100644 --- a/vllm/v1/worker/gpu/model_states/default.py +++ b/vllm/v1/worker/gpu/model_states/default.py @@ -7,6 +7,7 @@ from vllm.config import VllmConfig from vllm.config.compilation import CUDAGraphMode +from vllm.tasks import GenerationTask from vllm.v1.core.sched.output import NewRequestData from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.worker.gpu.attn_utils import build_attn_metadata @@ -61,6 +62,28 @@ def __init__( device=self.device, ) + def get_supported_generation_tasks(self) -> tuple[GenerationTask, ...]: + from vllm.model_executor.models.interfaces import ( + supports_realtime, + supports_transcription, + ) + from vllm.model_executor.models.interfaces_base import is_text_generation_model + + supported_tasks = list[GenerationTask]() + + if is_text_generation_model(self.model): + supported_tasks.append("generate") + + if supports_transcription(self.model): + if self.model.supports_transcription_only: + return ("transcription",) + supported_tasks.append("transcription") + + if supports_realtime(self.model): + supported_tasks.append("realtime") + + return tuple(supported_tasks) + def add_request(self, req_index: int, new_req_data: NewRequestData) -> None: if self.rope_state is not None: 
assert new_req_data.prefill_token_ids is not None diff --git a/vllm/v1/worker/gpu/model_states/interface.py b/vllm/v1/worker/gpu/model_states/interface.py index 1c114496ddd8..d83ab2fc515f 100644 --- a/vllm/v1/worker/gpu/model_states/interface.py +++ b/vllm/v1/worker/gpu/model_states/interface.py @@ -28,8 +28,9 @@ def __init__( ) -> None: raise NotImplementedError + @abstractmethod def get_supported_generation_tasks(self) -> tuple[GenerationTask, ...]: - return ("generate",) + raise NotImplementedError def add_request(self, req_index: int, new_req_data: NewRequestData) -> None: return None diff --git a/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py b/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py index 49b6b5331b5c..4df88bf95c4c 100644 --- a/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py +++ b/vllm/v1/worker/gpu/spec_decode/eagle/speculator.py @@ -76,6 +76,17 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): device=device, ) + cache_draft_logits = self.speculative_config.rejection_sample_method != "strict" + self.draft_logits: torch.Tensor | None = None + if cache_draft_logits: + self.draft_logits = torch.zeros( + self.max_num_reqs, + self.num_speculative_steps, + self.vocab_size, + dtype=torch.float32, + device=device, + ) + # currently we don't support PIECEWISE for Eagle. 
cudagraph_mode = vllm_config.compilation_config.cudagraph_mode if cudagraph_mode.decode_mode() == CUDAGraphMode.FULL: @@ -158,7 +169,6 @@ def generate_draft( slot_mappings: dict[str, torch.Tensor] | None, num_tokens_across_dp: torch.Tensor | None, cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, - draft_logits_out: torch.Tensor | None = None, ) -> None: pos = self.input_buffers.positions[:num_reqs] query_start_loc = self.input_buffers.query_start_loc[: num_reqs + 1] @@ -185,8 +195,8 @@ def generate_draft( self.seeds, pos + 1, apply_temperature=True, - processed_logits_out=draft_logits_out[:, step] - if draft_logits_out is not None + processed_logits_out=self.draft_logits[:, step] + if self.draft_logits is not None else None, ) self.draft_tokens[:num_reqs, step] = draft_tokens @@ -241,8 +251,6 @@ def propose( temperature: torch.Tensor, # [max_num_reqs] seeds: torch.Tensor, - # [max_num_reqs, num_speculative_steps, vocab_size] - draft_logits_out: torch.Tensor | None, num_tokens_across_dp: torch.Tensor | None = None, dummy_run: bool = False, skip_attn_for_dummy_run: bool = False, @@ -308,8 +316,8 @@ def propose( self.seeds, pos + 1, apply_temperature=True, - processed_logits_out=draft_logits_out[:, 0] - if draft_logits_out is not None + processed_logits_out=self.draft_logits[:, 0] + if self.draft_logits is not None else None, ) @@ -394,7 +402,6 @@ def propose( slot_mappings_updated, num_tokens_across_dp=num_tokens_across_dp, cudagraph_runtime_mode=batch_desc.cg_mode, - draft_logits_out=draft_logits_out, ) return self.draft_tokens[:num_reqs] diff --git a/vllm/v1/worker/gpu/states.py b/vllm/v1/worker/gpu/states.py index 3fb02c12d999..24d225886106 100644 --- a/vllm/v1/worker/gpu/states.py +++ b/vllm/v1/worker/gpu/states.py @@ -15,7 +15,6 @@ def __init__( num_speculative_steps: int, vocab_size: int, device: torch.device, - cache_draft_logits: bool, ): self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len @@ -71,18 +70,6 @@ def __init__( 
dtype=torch.int64, device=device, ) - # Draft token logits. - # NOTE: This tensor maintains the "processed" logits after applying temperature, - # top-p, etc. - self.draft_logits: torch.Tensor | None = None - if cache_draft_logits: - self.draft_logits = torch.zeros( - self.max_num_reqs, - self.num_speculative_steps, - self.vocab_size, - dtype=torch.float32, - device=device, - ) self.next_prefill_tokens = torch.zeros( self.max_num_reqs, dtype=torch.int32, device=device @@ -122,13 +109,14 @@ def apply_staged_writes(self) -> None: self.all_token_ids.apply_write() self.num_computed_tokens.apply_write() - def remove_request(self, req_id: str) -> None: + def remove_request(self, req_id: str) -> bool: req_idx = self.req_id_to_index.pop(req_id, None) if req_idx is None: # Request not found. - return + return False self.index_to_req_id.pop(req_idx, None) self.free_indices.append(req_idx) + return True def any_prefills(self, idx_mapping_np: np.ndarray) -> bool: return np.any( diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 34bcc241f769..fb7795e04740 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -134,7 +134,13 @@ def __init__( pin_memory=pin_memory, ) self.num_tokens_no_spec = self.num_tokens_no_spec_cpu_tensor.numpy() - self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) + self.num_prompt_tokens_cpu_tensor = torch.zeros( + (max_num_reqs,), + device="cpu", + dtype=torch.int32, + pin_memory=pin_memory, + ) + self.num_prompt_tokens = self.num_prompt_tokens_cpu_tensor.numpy() self.num_computed_tokens_cpu_tensor = torch.zeros( (max_num_reqs,), device="cpu", @@ -886,7 +892,7 @@ def get_pooling_metadata(self) -> PoolingMetadata: pooling_states = self.get_pooling_states() return PoolingMetadata( - prompt_lens=torch.from_numpy(self.num_prompt_tokens[: self.num_reqs]), + prompt_lens=self.num_prompt_tokens_cpu_tensor[: self.num_reqs].clone(), 
prompt_token_ids=self.sampling_metadata.prompt_token_ids, pooling_params=pooling_params, pooling_states=pooling_states, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0365a9938d12..81326b6d11fa 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -740,19 +740,6 @@ def __init__( self.uniform_decode_query_len = 1 + self.num_spec_tokens - # When spec decode is active, the mamba backend classifies requests - # with query_len <= reorder_batch_threshold as "decodes". Prefill - # chunks that fall under this threshold get processed via the decode - # path, which stores intermediate states at sequential slots. We must - # set num_accepted_tokens to the chunk's query_len for those requests - # so the next iteration reads from the correct final-state slot. - # Prefills that went through the actual prefill path should keep the - # default value of 1 (the prefill path stores state at slot 0 only). - self.needs_prefill_as_decode_slots: bool = False - self.prefill_as_decode_num_tokens = self._make_buffer( - self.max_num_reqs, dtype=torch.int32 - ) - # Cudagraph dispatcher for runtime cudagraph dispatching. 
self.cudagraph_dispatcher = CudagraphDispatcher(self.vllm_config) @@ -1369,16 +1356,6 @@ def _update_states_after_model_execute( .int() .argmax(-1) ) - spec_decode_active = bool(scheduler_output.scheduled_spec_decode_tokens) - if self.needs_prefill_as_decode_slots and spec_decode_active: - mamba_utils.update_accepted_tokens_for_prefill_as_decode( - self.input_batch, - self.prefill_as_decode_num_tokens, - self.num_accepted_tokens.gpu, - scheduler_output, - self.reorder_batch_threshold, - num_reqs, - ) if self.cache_config.mamba_cache_mode == "align": for i, num_tokens in enumerate( @@ -1982,14 +1959,23 @@ def _get_block_table(kv_cache_gid: int): attn_gid = self.routed_experts_attn_gid slot_mapping_attn = slot_mappings[attn_gid] self.slot_mapping = slot_mapping_attn[:num_tokens].cpu().numpy() + # Compute is_prefilling: True if request is still in prefill phase + # (num_computed_tokens < num_prompt_tokens). Used by mamba backends to + # distinguish actual decodes from short extends. + num_computed_tokens_cpu = self.input_batch.num_computed_tokens_cpu_tensor[ + :num_reqs_padded + ] + num_prompt_tokens_cpu = self.input_batch.num_prompt_tokens_cpu_tensor[ + :num_reqs_padded + ] + is_prefilling = num_computed_tokens_cpu < num_prompt_tokens_cpu + cm_base = CommonAttentionMetadata( query_start_loc=self.query_start_loc.gpu[: num_reqs_padded + 1], query_start_loc_cpu=self.query_start_loc.cpu[: num_reqs_padded + 1], seq_lens=self.seq_lens.gpu[:num_reqs_padded], _seq_lens_cpu=self.seq_lens.cpu[:num_reqs_padded], - _num_computed_tokens_cpu=self.input_batch.num_computed_tokens_cpu_tensor[ - :num_reqs_padded - ], + _num_computed_tokens_cpu=num_computed_tokens_cpu, num_reqs=num_reqs_padded, num_actual_tokens=num_tokens_padded, max_query_len=max_query_len, @@ -1997,6 +1983,7 @@ def _get_block_table(kv_cache_gid: int): block_table_tensor=block_table_gid_0, slot_mapping=slot_mapping_gid_0, causal=True, + is_prefilling=is_prefilling, ) if self.dcp_world_size > 1: @@ -2048,8 +2035,6 @@ 
def _build_attn_group_metadata( else 0 ) - if isinstance(builder, Mamba2AttentionMetadataBuilder): - self.needs_prefill_as_decode_slots = True extra_attn_metadata_args = {} if use_spec_decode and isinstance( builder, (Mamba2AttentionMetadataBuilder, GDNAttentionMetadataBuilder) @@ -2834,15 +2819,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: if not is_pooling_model(model): return [] - supported_tasks = list(model.pooler.get_supported_tasks()) - - if "score" in supported_tasks: - num_labels = getattr(self.model_config.hf_config, "num_labels", 0) - if num_labels != 1: - supported_tasks.remove("score") - logger.debug_once("Score API is only enabled for num_labels == 1.") - - return supported_tasks + return list(model.pooler.get_supported_tasks()) def get_supported_tasks(self) -> tuple[SupportedTask, ...]: tasks = list[SupportedTask]() diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index d101edc18100..91dcdc2b9798 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -315,30 +315,12 @@ def init_device(self): # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool # to hijack tensor allocation. 
- def load_model(self) -> None: - dummy_weights = os.environ.get("VLLM_ELASTIC_EP_SCALE_UP_LAUNCH") == "1" - if dummy_weights: - ( - expanded_physical_to_logical, - num_logical_experts, - old_num_physical_experts, - ) = self.elastic_ep_executor.receive_expert_mapping() - num_physical_experts = expanded_physical_to_logical.shape[1] - self.parallel_config.eplb_config.num_redundant_experts = ( - num_physical_experts - num_logical_experts - ) - + def load_model(self, *, load_dummy_weights: bool = False) -> None: with ( self._maybe_get_memory_pool_context(tag="weights"), set_current_vllm_config(self.vllm_config), ): - self.model_runner.load_model(load_dummy_weights=dummy_weights) - - if dummy_weights: - self.model_runner.setup_eplb_from_mapping( - expanded_physical_to_logical, old_num_physical_experts - ) - self.model_runner.eep_eplb_suppressed = True + self.model_runner.load_model(load_dummy_weights=load_dummy_weights) def update_config(self, overrides: dict[str, Any]) -> None: self.model_runner.update_config(overrides) @@ -417,9 +399,7 @@ def determine_available_memory(self) -> int: ) self.non_torch_memory = profile_result.non_torch_increase - self.peak_activation_memory = ( - profile_result.torch_peak_increase + cudagraph_memory_estimate_applied - ) + self.peak_activation_memory = profile_result.torch_peak_increase self.cudagraph_memory_estimate = cudagraph_memory_estimate free_gpu_memory = profile_result.after_profile.free_memory @@ -638,6 +618,7 @@ def compile_or_warm_up_model(self) -> float: # slightly underestimate the memory consumption. # So leave a small buffer (=150MiB) to avoid OOM. 
redundancy_buffer_memory = 150 * (1 << 20) + non_kv_cache_memory = ( self.model_runner.model_memory_usage + self.peak_activation_memory diff --git a/vllm/v1/worker/mamba_utils.py b/vllm/v1/worker/mamba_utils.py index 68172133eb99..ed618e09973f 100644 --- a/vllm/v1/worker/mamba_utils.py +++ b/vllm/v1/worker/mamba_utils.py @@ -67,6 +67,8 @@ class MambaCopyBuffers: src_ptrs: CpuGpuBuffer dst_ptrs: CpuGpuBuffer sizes: CpuGpuBuffer + mamba_group_ids: list[int] + mamba_spec: MambaSpec offset: int = 0 @classmethod @@ -77,7 +79,7 @@ def create( copy_funcs: tuple[MambaStateCopyFunc, ...], make_buffer: Callable[..., CpuGpuBuffer], ) -> "MambaCopyBuffers": - mamba_group_ids, _ = get_mamba_groups(kv_cache_config) + mamba_group_ids, mamba_spec = get_mamba_groups(kv_cache_config) entries_per_req = sum( len(kv_cache_config.kv_cache_groups[gid].layer_names) for gid in mamba_group_ids @@ -87,6 +89,8 @@ def create( src_ptrs=make_buffer(n, dtype=torch.int64), dst_ptrs=make_buffer(n, dtype=torch.int64), sizes=make_buffer(n, dtype=torch.int32), + mamba_group_ids=mamba_group_ids, + mamba_spec=mamba_spec, ) @@ -155,7 +159,8 @@ def preprocess_mamba( Copy the mamba state of previous step to the last (1 + num_speculative_blocks) block. 
""" - mamba_group_ids, mamba_spec = get_mamba_groups(kv_cache_config) + mamba_group_ids = copy_bufs.mamba_group_ids + mamba_spec = copy_bufs.mamba_spec num_speculative_blocks = mamba_spec.num_speculative_blocks # TODO(Chen): we need to optimize this function a lot assert cache_config.enable_prefix_caching @@ -231,8 +236,8 @@ def postprocess_mamba( num_scheduled_tokens_dict = scheduler_output.num_scheduled_tokens scheduled_spec_decode_tokens_dict = scheduler_output.scheduled_spec_decode_tokens num_accepted_tokens_cpu = input_batch.num_accepted_tokens_cpu - # NOTE: can be optimized as this function always returns the same result - mamba_group_ids, mamba_spec = get_mamba_groups(kv_cache_config) + mamba_group_ids = copy_bufs.mamba_group_ids + mamba_spec = copy_bufs.mamba_spec copy_bufs.offset = 0 for i, req_id in enumerate(input_batch.req_ids): req_state = requests[req_id] @@ -266,45 +271,3 @@ def postprocess_mamba( if src_block_idx == dest_block_idx: num_accepted_tokens_cpu[i] = 1 do_mamba_copy_block(copy_bufs) - - -def update_accepted_tokens_for_prefill_as_decode( - input_batch: GPUInputBatch, - prefill_as_decode_num_tokens: CpuGpuBuffer, - num_accepted_tokens_gpu: torch.Tensor, - scheduler_output: SchedulerOutput, - decode_qlen_threshold: int | None, - num_reqs: int, -): - """ - Adjusts num_accepted_tokens for prefill chunks processed via the decode path. - This ensures subsequent iterations read from the correct sequential state slot - instead of the default prefill slot 0. Not used by GDN attention, which manually - separates short prefills and short decodes when building the attention metadata. 
- """ - any_is_prefill = False - for i in range(num_reqs): - num_computed = input_batch.num_computed_tokens_cpu[i] - num_prompt = input_batch.num_prompt_tokens[i] - is_prefill = num_computed < num_prompt - req_id = input_batch.req_ids[i] - query_len = scheduler_output.num_scheduled_tokens[req_id] - - if is_prefill: - classified_as_decode = ( - decode_qlen_threshold is not None and query_len <= decode_qlen_threshold - ) - num_tokens = query_len if classified_as_decode else 1 - any_is_prefill = True - else: - num_tokens = -1 - prefill_as_decode_num_tokens.np[i] = num_tokens - - # We can skip the GPU transfer if there aren't any values to update - if any_is_prefill: - prefill_as_decode_num_tokens.copy_to_gpu(num_reqs) - num_accepted_tokens_gpu[:num_reqs] = torch.where( - prefill_as_decode_num_tokens.gpu[:num_reqs] != -1, - prefill_as_decode_num_tokens.gpu[:num_reqs], - num_accepted_tokens_gpu[:num_reqs], - ) diff --git a/vllm/v1/worker/worker_base.py b/vllm/v1/worker/worker_base.py index b6ba8adf8336..041fff637b87 100644 --- a/vllm/v1/worker/worker_base.py +++ b/vllm/v1/worker/worker_base.py @@ -122,7 +122,7 @@ def get_model_inspection(self) -> str: return format_model_inspection(self.get_model()) - def load_model(self) -> None: + def load_model(self, *, load_dummy_weights: bool = False) -> None: """Load model onto target device.""" raise NotImplementedError