bxyu-nvidia · bxyu-nvidia · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026
diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh
@@ -326,8 +326,7 @@ apply_rocm_test_overrides() {
   if [[ $cmds == *" kernels/moe"* ]]; then
     cmds="${cmds} \
     --ignore=kernels/moe/test_moe.py \
-    --ignore=kernels/moe/test_cutlass_moe.py \
-    --ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
+    --ignore=kernels/moe/test_cutlass_moe.py"
   fi
 
   # --- Entrypoint ignores ---

diff --git a/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh b/.buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh
@@ -127,7 +127,7 @@ run_and_track_test() {
 
 # --- Actual Test Execution ---
 run_and_track_test 1 "test_struct_output_generate.py" \
-    "python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
+    "python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
 run_and_track_test 2 "test_moe_pallas.py" \
     "python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
 run_and_track_test 3 "test_lora.py" \

diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml
@@ -27,14 +27,14 @@ steps:
   - vllm/v1/engine/
   - vllm/v1/worker/
   - tests/v1/distributed
-  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/entrypoints/openai/test_multi_api_servers.py
   commands:
   # https://github.com/NVIDIA/nccl/issues/1838
   - export NCCL_CUMEM_HOST_ENABLE=0
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
-  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
+  - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py
 
 - label: Distributed Compile + RPC Tests (2 GPUs)
   timeout_in_minutes: 20
@@ -88,7 +88,6 @@ steps:
   - vllm/distributed/
   - tests/distributed/test_torchrun_example.py
   - tests/distributed/test_torchrun_example_moe.py
-  - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - examples/rl/
   - tests/examples/offline_inference/data_parallel.py

diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml
@@ -70,3 +70,15 @@ steps:
       device: mi325_4
       depends_on:
       - image-build-amd
+
+- label: V1 e2e (4xH100)
+  timeout_in_minutes: 60
+  device: h100
+  num_devices: 4
+  optional: true
+  source_file_dependencies:
+    - vllm/v1/attention/backends/utils.py
+    - vllm/v1/worker/gpu_model_runner.py
+    - tests/v1/e2e/test_hybrid_chunked_prefill.py
+  commands:
+    - pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py
diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml
@@ -10,7 +10,7 @@ steps:
   - tests/entrypoints/
   commands:
   - pytest -v -s entrypoints/openai/tool_parsers
-  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
+  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling
 
 - label: Entrypoints Integration (LLM)
   timeout_in_minutes: 40
@@ -34,7 +34,7 @@ steps:
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/  --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses --ignore=entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/test_chat_utils.py
   mirror:
     amd:
@@ -48,11 +48,11 @@ steps:
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/rpc
-  - tests/entrypoints/instrumentator
+  - tests/entrypoints/serve/instrumentator
   - tests/tool_use
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/instrumentator
+  - pytest -v -s entrypoints/serve/instrumentator
   - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
   - pytest -v -s tool_use
 
@@ -75,19 +75,6 @@ steps:
   commands:
   - pytest -v -s entrypoints/openai/responses
 
-- label: Entrypoints V1
-  timeout_in_minutes: 50
-  source_file_dependencies:
-    - vllm/
-    - tests/v1
-  commands:
-    - pytest -v -s v1/entrypoints
-  mirror:
-    amd:
-      device: mi325_1
-      depends_on:
-      - image-build-amd
-
 - label: OpenAI API Correctness
   timeout_in_minutes: 30
   source_file_dependencies:

diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml
@@ -45,6 +45,22 @@ steps:
   commands:
   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt
 
+- label: LM Eval Qwen3.5 Models (B200)
+  timeout_in_minutes: 120
+  device: b200
+  optional: true
+  num_devices: 2
+  source_file_dependencies:
+  - vllm/model_executor/models/qwen3_5.py
+  - vllm/model_executor/models/qwen3_5_mtp.py
+  - vllm/transformers_utils/configs/qwen3_5.py
+  - vllm/transformers_utils/configs/qwen3_5_moe.py
+  - vllm/model_executor/models/qwen3_next.py
+  - vllm/model_executor/models/qwen3_next_mtp.py
+  - vllm/model_executor/layers/fla/ops/
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-blackwell.txt
+
 - label: LM Eval Large Models (H200)
   timeout_in_minutes: 60
   device: h200

diff --git a/.buildkite/test_areas/model_runner_v2.yaml b/.buildkite/test_areas/model_runner_v2.yaml
@@ -11,7 +11,7 @@ steps:
   - vllm/v1/attention/
   - tests/v1/engine/test_llm_engine.py
   - tests/v1/e2e/
-  - tests/v1/entrypoints/llm/test_struct_output_generate.py
+  - tests/entrypoints/llm/test_struct_output_generate.py
   commands:
   - set -x
   - export VLLM_USE_V2_MODEL_RUNNER=1
@@ -22,7 +22,7 @@ steps:
   - pytest -v -s v1/e2e/general/test_context_length.py
   - pytest -v -s v1/e2e/general/test_min_tokens.py
   # Temporary hack filter to exclude ngram spec decoding based tests.
-  - pytest -v -s v1/entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
+  - pytest -v -s entrypoints/llm/test_struct_output_generate.py -k "xgrammar and not speculative_config6 and not speculative_config7 and not speculative_config8 and not speculative_config0"
 
 - label: Model Runner V2 Examples
   timeout_in_minutes: 45

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -75,7 +75,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
 /tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
+/tests/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
 /tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @alexm-redhat @heheda12345 @ApostaC @orozery
 /tests/weight_loading @mgoin @youkaichao @yewentao256

diff --git a/.github/mergify.yml b/.github/mergify.yml
@@ -260,7 +260,7 @@ pull_request_rules:
       - files=examples/offline_inference/structured_outputs.py
       - files=examples/online_serving/structured_outputs/structured_outputs.py
       - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
+      - files=tests/entrypoints/llm/test_struct_output_generate.py
       - files~=^vllm/v1/structured_output/
   actions:
     label: