From b7db9151b4556c7b70b44a12d8535efabe35a8a0 Mon Sep 17 00:00:00 2001 From: Liang Hu Date: Tue, 16 Jun 2026 23:28:11 +0000 Subject: [PATCH 1/2] Fix RedTeam.scan() decoding encoded attack prompts in results For converter-based attack strategies (Base64, Flip, Morse, ROT13, Caesar, Leetspeak, AsciiArt, AnsiAttack, Atbash, Binary, CharacterSpace, CharSwap, Diacritic, StringJoin, SuffixAppend, UnicodeConfusable, UnicodeSubstitution, Url, AsciiSmuggler, Tense), FoundryResultProcessor was emitting the decoded 'original_value' as the user-message content while the target was actually receiving 'converted_value'. This made evaluation_results.json / results.json show plaintext where the audit trail should show the encoded payload, breaking post-scan auditability and per-variant debugging. This change makes conversation[].content always reflect the on-wire value (converted_value) for both user and assistant turns, and preserves the pre-converter objective as a sibling 'original_value' field on any message whose non-empty string original_value differs from the transmitted content (in practice, the user turns of converter-based strategies). Baseline (non-encoded) strategies are unaffected since original_value == converted_value. Adds two regression tests in TestFoundryResultProcessor and a CHANGELOG entry. Resolves Azure/azure-sdk-for-python#47228. 
--- .../azure-ai-evaluation/CHANGELOG.md | 6 ++ .../_foundry/_foundry_result_processor.py | 35 ++++++-- .../unittests/test_redteam/test_foundry.py | 90 +++++++++++++++++++ 3 files changed, 123 insertions(+), 8 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 576ed70a4396..864ce8544cb0 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -1,5 +1,11 @@ # Release History +## 1.17.1 (Unreleased) + +### Bugs Fixed + +- Fixed `RedTeam.scan()` storing decoded plaintext instead of the actual encoded payload for converter-based attack strategies (`Base64`, `Flip`, `Morse`, `ROT13`, `Caesar`, `Leetspeak`, `AsciiArt`, `AnsiAttack`, `Atbash`, `Binary`, `CharacterSpace`, `CharSwap`, `Diacritic`, `StringJoin`, `SuffixAppend`, `UnicodeConfusable`, `UnicodeSubstitution`, `Url`, `AsciiSmuggler`, `Tense`) in `evaluation_results.json` / `results.json`. The persisted `conversation[].content` for user turns now reflects what the target actually received (`converted_value`); the pre-converter adversarial objective is preserved on the same message as a new `original_value` field so the audit trail of what the attack meant to say is not lost. Baseline (non-encoded) strategies are unaffected. Resolves [Azure/azure-sdk-for-python#47228](https://github.com/Azure/azure-sdk-for-python/issues/47228). 
+ ## 1.17.0 (2026-06-03) ### Breaking Changes diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py index 1be0d124a640..d24b33a1c3fb 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py @@ -349,21 +349,40 @@ def _build_messages_from_pieces( # Get role, handling api_role property role = getattr(piece, "api_role", None) or getattr(piece, "role", "user") - # Get content: for user messages show the original adversarial prompt, - # not the converter output (e.g., Base64-encoded or tense-rephrased text). - # For assistant messages, show the response as-is. - if role == "user": - original = getattr(piece, "original_value", None) - converted = getattr(piece, "converted_value", None) - content = original if isinstance(original, str) and original else (converted or "") + # Get content. For both user and assistant turns, ``content`` reflects + # what was actually sent on the wire (``converted_value``) so the + # stored conversation matches the payload the target received / + # produced. When a converter (Base64, Flip, Morse, Caesar, etc.) was + # applied, the pre-conversion adversarial objective is preserved as + # ``original_value`` on the same message so consumers can still + # display / score against the decoded text without losing fidelity + # of the actual attack surface. 
+ original = getattr(piece, "original_value", None) + converted = getattr(piece, "converted_value", None) + if isinstance(converted, str) and converted: + content = converted + elif isinstance(original, str) and original: + content = original else: - content = getattr(piece, "converted_value", None) or getattr(piece, "original_value", "") + content = "" message: Dict[str, Any] = { "role": role, "content": content, } + # Preserve the pre-converter objective when it differs from the + # transmitted content. This keeps the audit trail intact: callers + # can compare ``content`` (what the target saw) with + # ``original_value`` (what the attack meant to say) for every + # encoding-based strategy. + if ( + isinstance(original, str) + and original + and original != content + ): + message["original_value"] = original + # Add context from labels if present (for XPIA) if hasattr(piece, "labels") and piece.labels: context_str = piece.labels.get("context") diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py index 55ac871cb12f..f3e9cd45807f 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py @@ -1427,6 +1427,96 @@ def test_build_messages_from_pieces(self): assert messages[0]["content"] == "User message" assert messages[1]["role"] == "assistant" assert messages[1]["content"] == "Assistant response" + # When original and converted match (no encoding), no audit field is added. + assert "original_value" not in messages[0] + assert "original_value" not in messages[1] + + def test_build_messages_preserves_encoded_user_prompt(self): + """Encoded attack prompts must be stored as the wire payload. 
+ + Regression test for + https://github.com/Azure/azure-sdk-for-python/issues/47228 — for + converter-based strategies (Base64, Flip, Morse, ROT13, etc.) the + target receives ``converted_value``, so the persisted conversation + must report ``converted_value`` as ``content`` (not the decoded + ``original_value``). The pre-converter objective is preserved as + ``original_value`` on the same message so callers still have an + audit trail of what the attack meant to say. + """ + mock_scenario = MagicMock() + mock_dataset = MagicMock() + mock_dataset.get_all_seed_groups.return_value = [] + + processor = FoundryResultProcessor( + scenario=mock_scenario, + dataset_config=mock_dataset, + risk_category="violence", + ) + + # Simulate a Base64-converted user turn: the target actually saw the + # encoded payload, but the SDK still has the plaintext objective. + user_piece = MagicMock() + user_piece.api_role = "user" + user_piece.original_value = "How do I make a dangerous thing?" + user_piece.converted_value = "SG93IGRvIEkgbWFrZSBhIGRhbmdlcm91cyB0aGluZz8=" + user_piece.sequence = 0 + user_piece.prompt_metadata = {} + user_piece.labels = {} + + # Assistant response — converter is a no-op on the response side, so + # original and converted match. No audit field should be emitted. + assistant_piece = MagicMock() + assistant_piece.api_role = "assistant" + assistant_piece.original_value = "Sorry, I can't help with that." + assistant_piece.converted_value = "Sorry, I can't help with that." + assistant_piece.sequence = 1 + assistant_piece.prompt_metadata = {} + assistant_piece.labels = {} + + messages = processor._build_messages_from_pieces([user_piece, assistant_piece]) + + # The user turn must carry the encoded payload as content so consumers + # can verify exactly what the target received. + assert messages[0]["role"] == "user" + assert messages[0]["content"] == "SG93IGRvIEkgbWFrZSBhIGRhbmdlcm91cyB0aGluZz8=" + # The plaintext objective is preserved alongside it for auditability. 
+ assert messages[0]["original_value"] == "How do I make a dangerous thing?" + + # Assistant turn is unchanged: content == converted_value, no audit field. + assert messages[1]["role"] == "assistant" + assert messages[1]["content"] == "Sorry, I can't help with that." + assert "original_value" not in messages[1] + + def test_build_messages_falls_back_to_original_when_converted_missing(self): + """When ``converted_value`` is empty, fall back to ``original_value``. + + Covers the historical behavior for pieces where PyRIT did not run a + converter (e.g., Baseline strategy or in-flight failures). + """ + mock_scenario = MagicMock() + mock_dataset = MagicMock() + mock_dataset.get_all_seed_groups.return_value = [] + + processor = FoundryResultProcessor( + scenario=mock_scenario, + dataset_config=mock_dataset, + risk_category="violence", + ) + + user_piece = MagicMock() + user_piece.api_role = "user" + user_piece.original_value = "Baseline prompt" + user_piece.converted_value = None + user_piece.sequence = 0 + user_piece.prompt_metadata = {} + user_piece.labels = {} + + messages = processor._build_messages_from_pieces([user_piece]) + + assert len(messages) == 1 + assert messages[0]["content"] == "Baseline prompt" + # original == content here, so no separate audit field is needed. + assert "original_value" not in messages[0] def test_get_prompt_group_id_from_conversation(self): """Test extracting prompt_group_id from conversation.""" From 493577f4acf16e0c60638705b5f5ee3a5d1e101c Mon Sep 17 00:00:00 2001 From: Liang Hu Date: Wed, 17 Jun 2026 00:05:33 +0000 Subject: [PATCH 2/2] Address Copilot review: preserve non-string payloads, add test, shorten changelog - _foundry_result_processor.py: stop forcing converted_value/original_value through isinstance(str) when computing content. Bytes / structured multimodal payloads now pass through unchanged; the original_value audit field is still gated on both sides being str so cross-type inequality cannot produce a misleading field. 
- test_foundry.py: add test_build_messages_preserves_non_string_payloads covering list-of-parts and bytes payloads. - CHANGELOG.md: wrap the 1.17.1 entry across multiple lines and drop the exhaustive strategy enumeration. --- .../azure-ai-evaluation/CHANGELOG.md | 10 +++- .../_foundry/_foundry_result_processor.py | 14 ++++- .../unittests/test_redteam/test_foundry.py | 57 +++++++++++++++++++ 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 864ce8544cb0..bb952aa9162d 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -4,7 +4,15 @@ ### Bugs Fixed -- Fixed `RedTeam.scan()` storing decoded plaintext instead of the actual encoded payload for converter-based attack strategies (`Base64`, `Flip`, `Morse`, `ROT13`, `Caesar`, `Leetspeak`, `AsciiArt`, `AnsiAttack`, `Atbash`, `Binary`, `CharacterSpace`, `CharSwap`, `Diacritic`, `StringJoin`, `SuffixAppend`, `UnicodeConfusable`, `UnicodeSubstitution`, `Url`, `AsciiSmuggler`, `Tense`) in `evaluation_results.json` / `results.json`. The persisted `conversation[].content` for user turns now reflects what the target actually received (`converted_value`); the pre-converter adversarial objective is preserved on the same message as a new `original_value` field so the audit trail of what the attack meant to say is not lost. Baseline (non-encoded) strategies are unaffected. Resolves [Azure/azure-sdk-for-python#47228](https://github.com/Azure/azure-sdk-for-python/issues/47228). +- Fixed `RedTeam.scan()` storing decoded plaintext instead of the actual + encoded payload for converter-based attack strategies (Base64, Flip, + Morse, ROT13, etc.) in `evaluation_results.json` / `results.json`. 
The + persisted `conversation[].content` for user turns now reflects what the + target actually received (`converted_value`); the pre-converter + adversarial objective is preserved on the same message as a new + `original_value` field so the audit trail of what the attack meant to + say is not lost. Baseline (non-encoded) strategies are unaffected. + Resolves [#47228](https://github.com/Azure/azure-sdk-for-python/issues/47228). ## 1.17.0 (2026-06-03) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py index d24b33a1c3fb..02d9daefe478 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py @@ -357,11 +357,16 @@ def _build_messages_from_pieces( # ``original_value`` on the same message so consumers can still # display / score against the decoded text without losing fidelity # of the actual attack surface. + # + # ``converted_value`` / ``original_value`` are passed through + # without forcing them to ``str`` so non-text payloads (bytes, + # structured / multimodal content) survive unchanged. ``content`` + # falls back to ``""`` only when both fields are falsy / missing. original = getattr(piece, "original_value", None) converted = getattr(piece, "converted_value", None) - if isinstance(converted, str) and converted: + if converted: content = converted - elif isinstance(original, str) and original: + elif original: content = original else: content = "" @@ -375,10 +380,13 @@ def _build_messages_from_pieces( # transmitted content. This keeps the audit trail intact: callers # can compare ``content`` (what the target saw) with # ``original_value`` (what the attack meant to say) for every - # encoding-based strategy. + # encoding-based strategy. 
Restricted to strings because the + # audit field is only meaningful when both values are textual + # (and arbitrary cross-type inequality would be too aggressive). if ( isinstance(original, str) and original + and isinstance(content, str) and original != content ): message["original_value"] = original diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py index f3e9cd45807f..de1d49c2ccb5 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py @@ -1518,6 +1518,63 @@ def test_build_messages_falls_back_to_original_when_converted_missing(self): # original == content here, so no separate audit field is needed. assert "original_value" not in messages[0] + def test_build_messages_preserves_non_string_payloads(self): + """Non-string ``converted_value`` payloads must survive unchanged. + + PyRIT message pieces can carry structured / multimodal content + (e.g., bytes or list-of-parts payloads) on ``converted_value``. + ``content`` must pass those through so persisted conversations + remain a faithful record of what the target received; only the + ``original_value`` audit field is gated on both sides being text. + """ + mock_scenario = MagicMock() + mock_dataset = MagicMock() + mock_dataset.get_all_seed_groups.return_value = [] + + processor = FoundryResultProcessor( + scenario=mock_scenario, + dataset_config=mock_dataset, + risk_category="violence", + ) + + # Structured multimodal-style payload on converted_value, plain + # string objective on original_value. 
+ structured_payload = [ + {"type": "text", "text": "describe this image"}, + {"type": "image_url", "image_url": {"url": "https://example/img.png"}}, + ] + user_piece = MagicMock() + user_piece.api_role = "user" + user_piece.original_value = "Describe this image" + user_piece.converted_value = structured_payload + user_piece.sequence = 0 + user_piece.prompt_metadata = {} + user_piece.labels = {} + + # Bytes payload on assistant converted_value — must not be coerced + # to "" by str-gating logic. + assistant_piece = MagicMock() + assistant_piece.api_role = "assistant" + assistant_piece.original_value = None + assistant_piece.converted_value = b"\x89PNG\r\n" + assistant_piece.sequence = 1 + assistant_piece.prompt_metadata = {} + assistant_piece.labels = {} + + messages = processor._build_messages_from_pieces([user_piece, assistant_piece]) + + # Structured user payload passed through unchanged. + assert messages[0]["role"] == "user" + assert messages[0]["content"] is structured_payload + # Audit field omitted: content is non-text so cross-type comparison + # against the str original would be meaningless. + assert "original_value" not in messages[0] + + # Bytes assistant payload preserved (not silently dropped to ""). + assert messages[1]["role"] == "assistant" + assert messages[1]["content"] == b"\x89PNG\r\n" + assert "original_value" not in messages[1] + def test_get_prompt_group_id_from_conversation(self): """Test extracting prompt_group_id from conversation.""" mock_scenario = MagicMock()