From a4560f6c873ae1dcf7879e6dc4b323e0c01c9d7a Mon Sep 17 00:00:00 2001
From: Meihan-chen <zr010426ztt@outlook.com>
Date: Tue, 23 Jun 2026 16:58:16 +0800
Subject: [PATCH 1/2] fix(data): reuse stored multimodal_inputs in length
 filter

filter_long_prompt re-extracted vision info from sample.prompt via
process_vision_info in the multimodal branch. When apply_chat_template
is set, sample.prompt is the rendered *string* (not a conversation
list), so process_vision_info -> qwen_vl_utils crashed with
"TypeError: string indices must be integers, not 'str'".

This made prompt-length filtering unusable for any VLM dataset: setting
--rollout-max-context-len (which derives rollout_max_prompt_len) or
--rollout-max-prompt-len / --eval-max-prompt-len activates the filter
and hits the crash.

Reuse the multimodal inputs already computed during dataset
construction via build_processor_kwargs (matching the sglang_rollout
path) instead of recomputing them from the string prompt.

Add CPU unit tests covering the multimodal branch and a mixed
text-only + multimodal dataset.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Meihan-chen <zr010426ztt@outlook.com>
---
 slime/utils/data.py                         |   8 +-
 tests/test_filter_long_prompt_multimodal.py | 100 ++++++++++++++++++++
 2 files changed, 105 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_filter_long_prompt_multimodal.py

diff --git a/slime/utils/data.py b/slime/utils/data.py
index 0d26b6dda5..6001726835 100644
--- a/slime/utils/data.py
+++ b/slime/utils/data.py
@@ -105,11 +105,13 @@ def filter_long_prompt(origin_samples: list[Sample], tokenizer, processor, max_l
                 if len(input_ids) <= max_length:
                     filtered_samples.append(sample)
         if multimodal:
-            from slime.utils.processing_utils import process_vision_info
+            from slime.utils.processing_utils import build_processor_kwargs
 
             for sample in multimodal:
-                multimodal_inputs = process_vision_info(sample.prompt, processor)
-                processor_output = processor(text=sample.prompt, **multimodal_inputs)
+                # Reuse the stored multimodal inputs; sample.prompt is the rendered
+                # string here, so re-extracting from it would crash.
+                processor_kwargs = build_processor_kwargs(sample.multimodal_inputs)
+                processor_output = processor(text=sample.prompt, **processor_kwargs)
                 input_ids = processor_output["input_ids"][0]
                 if len(input_ids) <= max_length:
                     filtered_samples.append(sample)
diff --git a/tests/test_filter_long_prompt_multimodal.py b/tests/test_filter_long_prompt_multimodal.py
new file mode 100644
index 0000000000..6a527ffe52
--- /dev/null
+++ b/tests/test_filter_long_prompt_multimodal.py
@@ -0,0 +1,100 @@
+"""CPU unit test for ``slime.utils.data.filter_long_prompt`` on multimodal samples.
+
+Regression test for the bug where length-filtering a multimodal dataset crashed.
+
+When ``apply_chat_template=True``, ``Sample.prompt`` holds the *rendered string*
+(images collapsed into ``<|image_pad|>`` placeholders), while the structured
+images live in ``Sample.multimodal_inputs`` (already computed during dataset
+construction). The multimodal branch of ``filter_long_prompt`` re-derived vision
+info from the *string* prompt via ``process_vision_info`` -- which expects a
+*conversation list* -- and crashed with ``TypeError: string indices must be
+integers, not 'str'``.
+
+The fix reuses the already-computed ``Sample.multimodal_inputs`` instead of
+re-extracting from the string prompt.
+"""
+
+from __future__ import annotations
+
+import types
+
+import pytest
+
+from slime.utils.data import filter_long_prompt
+from slime.utils.types import Sample
+
+
+class _FakeProcessor:
+    """Stand-in for a HF VL processor.
+
+    Returns ``input_ids`` whose length equals the number of images it is given,
+    so the test can assert that filtering used the images from
+    ``Sample.multimodal_inputs`` (1 image -> length 1, 3 images -> length 3).
+    """
+
+    def __init__(self):
+        self.image_processor = types.SimpleNamespace(patch_size=14)
+
+    def __call__(self, text=None, images=None, videos=None, **kwargs):
+        n_images = len(images) if images else 0
+        return {"input_ids": [list(range(n_images))]}
+
+
+def _mm_sample(n_images: int, prompt: str) -> Sample:
+    # prompt is the rendered string produced by apply_chat_template; the
+    # structured images are kept in multimodal_inputs.
+    return Sample(prompt=prompt, multimodal_inputs={"images": ["img"] * n_images, "videos": None})
+
+
+@pytest.mark.unit
+def test_filter_long_prompt_multimodal_reuses_stored_inputs(monkeypatch):
+    # Faithfully reproduce the crash: feeding a *string* prompt to
+    # process_vision_info (qwen_vl_utils) raises. The fixed code must not call it.
+    def _boom(prompt, processor):
+        raise TypeError("string indices must be integers, not 'str'")
+
+    monkeypatch.setattr("slime.utils.processing_utils.process_vision_info", _boom)
+
+    processor = _FakeProcessor()
+    short = _mm_sample(1, prompt="<rendered short prompt>")
+    long = _mm_sample(3, prompt="<rendered long prompt>")
+
+    # max_length=2: short (1 image -> 1 token) kept, long (3 images -> 3 tokens) dropped.
+    result = filter_long_prompt([short, long], tokenizer=None, processor=processor, max_length=2)
+
+    assert result == [short]
+
+
+class _FakeTokenizer:
+    """Batched tokenizer for text-only samples: token count = word count."""
+
+    def __call__(self, prompts, add_special_tokens=False):
+        return {"input_ids": [list(range(len(p.split()))) for p in prompts]}
+
+
+@pytest.mark.unit
+def test_filter_long_prompt_mixed_text_and_multimodal(monkeypatch):
+    # The multimodal branch must not re-derive vision info from the string prompt.
+    def _boom(prompt, processor):
+        raise TypeError("string indices must be integers, not 'str'")
+
+    monkeypatch.setattr("slime.utils.processing_utils.process_vision_info", _boom)
+
+    processor = _FakeProcessor()
+    tokenizer = _FakeTokenizer()
+
+    text_short = Sample(prompt="a b", multimodal_inputs=None)  # 2 tokens -> kept
+    text_long = Sample(prompt="a b c d", multimodal_inputs=None)  # 4 tokens -> dropped
+    mm_short = _mm_sample(1, prompt="<rendered>")  # 1 image -> kept
+    mm_long = _mm_sample(5, prompt="<rendered>")  # 5 images -> dropped
+
+    result = filter_long_prompt(
+        [text_short, text_long, mm_short, mm_long],
+        tokenizer=tokenizer,
+        processor=processor,
+        max_length=2,
+    )
+
+    # Both branches filter independently; order within each branch is preserved.
+    assert text_short in result and mm_short in result
+    assert text_long not in result and mm_long not in result

From 0e0ec783e83bdaf073c8abdd945762467cfd3164 Mon Sep 17 00:00:00 2001
From: Meihan-chen <zr010426ztt@outlook.com>
Date: Tue, 23 Jun 2026 18:04:43 +0800
Subject: [PATCH 2/2] del test

Signed-off-by: Meihan-chen <zr010426ztt@outlook.com>
---
 slime/utils/data.py                         |   2 -
 tests/test_filter_long_prompt_multimodal.py | 100 --------------------
 2 files changed, 102 deletions(-)
 delete mode 100644 tests/test_filter_long_prompt_multimodal.py

diff --git a/slime/utils/data.py b/slime/utils/data.py
index 6001726835..319e006ede 100644
--- a/slime/utils/data.py
+++ b/slime/utils/data.py
@@ -108,8 +108,6 @@ def filter_long_prompt(origin_samples: list[Sample], tokenizer, processor, max_l
             from slime.utils.processing_utils import build_processor_kwargs
 
             for sample in multimodal:
-                # Reuse the stored multimodal inputs; sample.prompt is the rendered
-                # string here, so re-extracting from it would crash.
                 processor_kwargs = build_processor_kwargs(sample.multimodal_inputs)
                 processor_output = processor(text=sample.prompt, **processor_kwargs)
                 input_ids = processor_output["input_ids"][0]
diff --git a/tests/test_filter_long_prompt_multimodal.py b/tests/test_filter_long_prompt_multimodal.py
deleted file mode 100644
index 6a527ffe52..0000000000
--- a/tests/test_filter_long_prompt_multimodal.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""CPU unit test for ``slime.utils.data.filter_long_prompt`` on multimodal samples.
-
-Regression test for the bug where length-filtering a multimodal dataset crashed.
-
-When ``apply_chat_template=True``, ``Sample.prompt`` holds the *rendered string*
-(images collapsed into ``<|image_pad|>`` placeholders), while the structured
-images live in ``Sample.multimodal_inputs`` (already computed during dataset
-construction). The multimodal branch of ``filter_long_prompt`` re-derived vision
-info from the *string* prompt via ``process_vision_info`` -- which expects a
-*conversation list* -- and crashed with ``TypeError: string indices must be
-integers, not 'str'``.
-
-The fix reuses the already-computed ``Sample.multimodal_inputs`` instead of
-re-extracting from the string prompt.
-"""
-
-from __future__ import annotations
-
-import types
-
-import pytest
-
-from slime.utils.data import filter_long_prompt
-from slime.utils.types import Sample
-
-
-class _FakeProcessor:
-    """Stand-in for a HF VL processor.
-
-    Returns ``input_ids`` whose length equals the number of images it is given,
-    so the test can assert that filtering used the images from
-    ``Sample.multimodal_inputs`` (1 image -> length 1, 3 images -> length 3).
-    """
-
-    def __init__(self):
-        self.image_processor = types.SimpleNamespace(patch_size=14)
-
-    def __call__(self, text=None, images=None, videos=None, **kwargs):
-        n_images = len(images) if images else 0
-        return {"input_ids": [list(range(n_images))]}
-
-
-def _mm_sample(n_images: int, prompt: str) -> Sample:
-    # prompt is the rendered string produced by apply_chat_template; the
-    # structured images are kept in multimodal_inputs.
-    return Sample(prompt=prompt, multimodal_inputs={"images": ["img"] * n_images, "videos": None})
-
-
-@pytest.mark.unit
-def test_filter_long_prompt_multimodal_reuses_stored_inputs(monkeypatch):
-    # Faithfully reproduce the crash: feeding a *string* prompt to
-    # process_vision_info (qwen_vl_utils) raises. The fixed code must not call it.
-    def _boom(prompt, processor):
-        raise TypeError("string indices must be integers, not 'str'")
-
-    monkeypatch.setattr("slime.utils.processing_utils.process_vision_info", _boom)
-
-    processor = _FakeProcessor()
-    short = _mm_sample(1, prompt="<rendered short prompt>")
-    long = _mm_sample(3, prompt="<rendered long prompt>")
-
-    # max_length=2: short (1 image -> 1 token) kept, long (3 images -> 3 tokens) dropped.
-    result = filter_long_prompt([short, long], tokenizer=None, processor=processor, max_length=2)
-
-    assert result == [short]
-
-
-class _FakeTokenizer:
-    """Batched tokenizer for text-only samples: token count = word count."""
-
-    def __call__(self, prompts, add_special_tokens=False):
-        return {"input_ids": [list(range(len(p.split()))) for p in prompts]}
-
-
-@pytest.mark.unit
-def test_filter_long_prompt_mixed_text_and_multimodal(monkeypatch):
-    # The multimodal branch must not re-derive vision info from the string prompt.
-    def _boom(prompt, processor):
-        raise TypeError("string indices must be integers, not 'str'")
-
-    monkeypatch.setattr("slime.utils.processing_utils.process_vision_info", _boom)
-
-    processor = _FakeProcessor()
-    tokenizer = _FakeTokenizer()
-
-    text_short = Sample(prompt="a b", multimodal_inputs=None)  # 2 tokens -> kept
-    text_long = Sample(prompt="a b c d", multimodal_inputs=None)  # 4 tokens -> dropped
-    mm_short = _mm_sample(1, prompt="<rendered>")  # 1 image -> kept
-    mm_long = _mm_sample(5, prompt="<rendered>")  # 5 images -> dropped
-
-    result = filter_long_prompt(
-        [text_short, text_long, mm_short, mm_long],
-        tokenizer=tokenizer,
-        processor=processor,
-        max_length=2,
-    )
-
-    # Both branches filter independently; order within each branch is preserved.
-    assert text_short in result and mm_short in result
-    assert text_long not in result and mm_long not in result