From a4560f6c873ae1dcf7879e6dc4b323e0c01c9d7a Mon Sep 17 00:00:00 2001 From: Meihan-chen Date: Tue, 23 Jun 2026 16:58:16 +0800 Subject: [PATCH 1/2] fix(data): reuse stored multimodal_inputs in length filter filter_long_prompt re-extracted vision info from sample.prompt via process_vision_info in the multimodal branch. When apply_chat_template is set, sample.prompt is the rendered *string* (not a conversation list), so process_vision_info -> qwen_vl_utils crashed with "TypeError: string indices must be integers, not 'str'". This made prompt-length filtering unusable for any VLM dataset: setting --rollout-max-context-len (which derives rollout_max_prompt_len) or --rollout-max-prompt-len / --eval-max-prompt-len activates the filter and hits the crash. Reuse the multimodal inputs already computed during dataset construction via build_processor_kwargs (matching the sglang_rollout path) instead of recomputing them from the string prompt. Add CPU unit tests covering the multimodal branch and a mixed text-only + multimodal dataset. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Meihan-chen --- slime/utils/data.py | 8 +- tests/test_filter_long_prompt_multimodal.py | 100 ++++++++++++++++++++ 2 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 tests/test_filter_long_prompt_multimodal.py diff --git a/slime/utils/data.py b/slime/utils/data.py index 0d26b6dda5..6001726835 100644 --- a/slime/utils/data.py +++ b/slime/utils/data.py @@ -105,11 +105,13 @@ def filter_long_prompt(origin_samples: list[Sample], tokenizer, processor, max_l if len(input_ids) <= max_length: filtered_samples.append(sample) if multimodal: - from slime.utils.processing_utils import process_vision_info + from slime.utils.processing_utils import build_processor_kwargs for sample in multimodal: - multimodal_inputs = process_vision_info(sample.prompt, processor) - processor_output = processor(text=sample.prompt, **multimodal_inputs) + # Reuse the stored multimodal inputs; sample.prompt is the rendered + # string here, so re-extracting from it would crash. + processor_kwargs = build_processor_kwargs(sample.multimodal_inputs) + processor_output = processor(text=sample.prompt, **processor_kwargs) input_ids = processor_output["input_ids"][0] if len(input_ids) <= max_length: filtered_samples.append(sample) diff --git a/tests/test_filter_long_prompt_multimodal.py b/tests/test_filter_long_prompt_multimodal.py new file mode 100644 index 0000000000..6a527ffe52 --- /dev/null +++ b/tests/test_filter_long_prompt_multimodal.py @@ -0,0 +1,100 @@ +"""CPU unit test for ``slime.utils.data.filter_long_prompt`` on multimodal samples. + +Regression test for the bug where length-filtering a multimodal dataset crashed. + +When ``apply_chat_template=True``, ``Sample.prompt`` holds the *rendered string* +(images collapsed into ``<|image_pad|>`` placeholders), while the structured +images live in ``Sample.multimodal_inputs`` (already computed during dataset +construction). The multimodal branch of ``filter_long_prompt`` re-derived vision +info from the *string* prompt via ``process_vision_info`` -- which expects a +*conversation list* -- and crashed with ``TypeError: string indices must be +integers, not 'str'``. + +The fix reuses the already-computed ``Sample.multimodal_inputs`` instead of +re-extracting from the string prompt. +""" + +from __future__ import annotations + +import types + +import pytest + +from slime.utils.data import filter_long_prompt +from slime.utils.types import Sample + + +class _FakeProcessor: + """Stand-in for a HF VL processor. + + Returns ``input_ids`` whose length equals the number of images it is given, + so the test can assert that filtering used the images from + ``Sample.multimodal_inputs`` (1 image -> length 1, 3 images -> length 3). + """ + + def __init__(self): + self.image_processor = types.SimpleNamespace(patch_size=14) + + def __call__(self, text=None, images=None, videos=None, **kwargs): + n_images = len(images) if images else 0 + return {"input_ids": [list(range(n_images))]} + + +def _mm_sample(n_images: int, prompt: str) -> Sample: + # prompt is the rendered string produced by apply_chat_template; the + # structured images are kept in multimodal_inputs. + return Sample(prompt=prompt, multimodal_inputs={"images": ["img"] * n_images, "videos": None}) + + +@pytest.mark.unit +def test_filter_long_prompt_multimodal_reuses_stored_inputs(monkeypatch): + # Faithfully reproduce the crash: feeding a *string* prompt to + # process_vision_info (qwen_vl_utils) raises. The fixed code must not call it. + def _boom(prompt, processor): + raise TypeError("string indices must be integers, not 'str'") + + monkeypatch.setattr("slime.utils.processing_utils.process_vision_info", _boom) + + processor = _FakeProcessor() + short = _mm_sample(1, prompt="") + long = _mm_sample(3, prompt="") + + # max_length=2: short (1 image -> 1 token) kept, long (3 images -> 3 tokens) dropped. + result = filter_long_prompt([short, long], tokenizer=None, processor=processor, max_length=2) + + assert result == [short] + + +class _FakeTokenizer: + """Batched tokenizer for text-only samples: token count = word count.""" + + def __call__(self, prompts, add_special_tokens=False): + return {"input_ids": [list(range(len(p.split()))) for p in prompts]} + + +@pytest.mark.unit +def test_filter_long_prompt_mixed_text_and_multimodal(monkeypatch): + # The multimodal branch must not re-derive vision info from the string prompt. + def _boom(prompt, processor): + raise TypeError("string indices must be integers, not 'str'") + + monkeypatch.setattr("slime.utils.processing_utils.process_vision_info", _boom) + + processor = _FakeProcessor() + tokenizer = _FakeTokenizer() + + text_short = Sample(prompt="a b", multimodal_inputs=None) # 2 tokens -> kept + text_long = Sample(prompt="a b c d", multimodal_inputs=None) # 4 tokens -> dropped + mm_short = _mm_sample(1, prompt="") # 1 image -> kept + mm_long = _mm_sample(5, prompt="") # 5 images -> dropped + + result = filter_long_prompt( + [text_short, text_long, mm_short, mm_long], + tokenizer=tokenizer, + processor=processor, + max_length=2, + ) + + # Both branches filter independently; order within each branch is preserved. + assert text_short in result and mm_short in result + assert text_long not in result and mm_long not in result From 0e0ec783e83bdaf073c8abdd945762467cfd3164 Mon Sep 17 00:00:00 2001 From: Meihan-chen Date: Tue, 23 Jun 2026 18:04:43 +0800 Subject: [PATCH 2/2] del test Signed-off-by: Meihan-chen --- slime/utils/data.py | 2 - tests/test_filter_long_prompt_multimodal.py | 100 -------------------- 2 files changed, 102 deletions(-) delete mode 100644 tests/test_filter_long_prompt_multimodal.py diff --git a/slime/utils/data.py b/slime/utils/data.py index 6001726835..319e006ede 100644 --- a/slime/utils/data.py +++ b/slime/utils/data.py @@ -108,8 +108,6 @@ def filter_long_prompt(origin_samples: list[Sample], tokenizer, processor, max_l from slime.utils.processing_utils import build_processor_kwargs for sample in multimodal: - # Reuse the stored multimodal inputs; sample.prompt is the rendered - # string here, so re-extracting from it would crash. processor_kwargs = build_processor_kwargs(sample.multimodal_inputs) processor_output = processor(text=sample.prompt, **processor_kwargs) input_ids = processor_output["input_ids"][0] diff --git a/tests/test_filter_long_prompt_multimodal.py b/tests/test_filter_long_prompt_multimodal.py deleted file mode 100644 index 6a527ffe52..0000000000 --- a/tests/test_filter_long_prompt_multimodal.py +++ /dev/null @@ -1,100 +0,0 @@ -"""CPU unit test for ``slime.utils.data.filter_long_prompt`` on multimodal samples. - -Regression test for the bug where length-filtering a multimodal dataset crashed. - -When ``apply_chat_template=True``, ``Sample.prompt`` holds the *rendered string* -(images collapsed into ``<|image_pad|>`` placeholders), while the structured -images live in ``Sample.multimodal_inputs`` (already computed during dataset -construction). The multimodal branch of ``filter_long_prompt`` re-derived vision -info from the *string* prompt via ``process_vision_info`` -- which expects a -*conversation list* -- and crashed with ``TypeError: string indices must be -integers, not 'str'``. - -The fix reuses the already-computed ``Sample.multimodal_inputs`` instead of -re-extracting from the string prompt. -""" - -from __future__ import annotations - -import types - -import pytest - -from slime.utils.data import filter_long_prompt -from slime.utils.types import Sample - - -class _FakeProcessor: - """Stand-in for a HF VL processor. - - Returns ``input_ids`` whose length equals the number of images it is given, - so the test can assert that filtering used the images from - ``Sample.multimodal_inputs`` (1 image -> length 1, 3 images -> length 3). - """ - - def __init__(self): - self.image_processor = types.SimpleNamespace(patch_size=14) - - def __call__(self, text=None, images=None, videos=None, **kwargs): - n_images = len(images) if images else 0 - return {"input_ids": [list(range(n_images))]} - - -def _mm_sample(n_images: int, prompt: str) -> Sample: - # prompt is the rendered string produced by apply_chat_template; the - # structured images are kept in multimodal_inputs. - return Sample(prompt=prompt, multimodal_inputs={"images": ["img"] * n_images, "videos": None}) - - -@pytest.mark.unit -def test_filter_long_prompt_multimodal_reuses_stored_inputs(monkeypatch): - # Faithfully reproduce the crash: feeding a *string* prompt to - # process_vision_info (qwen_vl_utils) raises. The fixed code must not call it. - def _boom(prompt, processor): - raise TypeError("string indices must be integers, not 'str'") - - monkeypatch.setattr("slime.utils.processing_utils.process_vision_info", _boom) - - processor = _FakeProcessor() - short = _mm_sample(1, prompt="") - long = _mm_sample(3, prompt="") - - # max_length=2: short (1 image -> 1 token) kept, long (3 images -> 3 tokens) dropped. - result = filter_long_prompt([short, long], tokenizer=None, processor=processor, max_length=2) - - assert result == [short] - - -class _FakeTokenizer: - """Batched tokenizer for text-only samples: token count = word count.""" - - def __call__(self, prompts, add_special_tokens=False): - return {"input_ids": [list(range(len(p.split()))) for p in prompts]} - - -@pytest.mark.unit -def test_filter_long_prompt_mixed_text_and_multimodal(monkeypatch): - # The multimodal branch must not re-derive vision info from the string prompt. - def _boom(prompt, processor): - raise TypeError("string indices must be integers, not 'str'") - - monkeypatch.setattr("slime.utils.processing_utils.process_vision_info", _boom) - - processor = _FakeProcessor() - tokenizer = _FakeTokenizer() - - text_short = Sample(prompt="a b", multimodal_inputs=None) # 2 tokens -> kept - text_long = Sample(prompt="a b c d", multimodal_inputs=None) # 4 tokens -> dropped - mm_short = _mm_sample(1, prompt="") # 1 image -> kept - mm_long = _mm_sample(5, prompt="") # 5 images -> dropped - - result = filter_long_prompt( - [text_short, text_long, mm_short, mm_long], - tokenizer=tokenizer, - processor=processor, - max_length=2, - ) - - # Both branches filter independently; order within each branch is preserved. - assert text_short in result and mm_short in result - assert text_long not in result and mm_long not in result