diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 576ed70a4396..bb952aa9162d 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -1,5 +1,19 @@ # Release History +## 1.17.1 (Unreleased) + +### Bugs Fixed + +- Fixed `RedTeam.scan()` storing decoded plaintext instead of the actual + encoded payload for converter-based attack strategies (Base64, Flip, + Morse, ROT13, etc.) in `evaluation_results.json` / `results.json`. The + persisted `conversation[].content` for user turns now reflects what the + target actually received (`converted_value`); the pre-converter + adversarial objective is preserved on the same message as a new + `original_value` field so the audit trail of what the attack meant to + say is not lost. Baseline (non-encoded) strategies are unaffected. + Resolves [#47228](https://github.com/Azure/azure-sdk-for-python/issues/47228). + ## 1.17.0 (2026-06-03) ### Breaking Changes diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py index 1be0d124a640..02d9daefe478 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/red_team/_foundry/_foundry_result_processor.py @@ -349,21 +349,48 @@ def _build_messages_from_pieces( # Get role, handling api_role property role = getattr(piece, "api_role", None) or getattr(piece, "role", "user") - # Get content: for user messages show the original adversarial prompt, - # not the converter output (e.g., Base64-encoded or tense-rephrased text). - # For assistant messages, show the response as-is. - if role == "user": - original = getattr(piece, "original_value", None) - converted = getattr(piece, "converted_value", None) - content = original if isinstance(original, str) and original else (converted or "") + # Get content. For both user and assistant turns, ``content`` reflects + # what was actually sent on the wire (``converted_value``) so the + # stored conversation matches the payload the target received / + # produced. When a converter (Base64, Flip, Morse, Caesar, etc.) was + # applied, the pre-conversion adversarial objective is preserved as + # ``original_value`` on the same message so consumers can still + # display / score against the decoded text without losing fidelity + # of the actual attack surface. + # + # ``converted_value`` / ``original_value`` are passed through + # without forcing them to ``str`` so non-text payloads (bytes, + # structured / multimodal content) survive unchanged. ``content`` + # falls back to ``""`` only when both fields are falsy / missing. + original = getattr(piece, "original_value", None) + converted = getattr(piece, "converted_value", None) + if converted: + content = converted + elif original: + content = original else: - content = getattr(piece, "converted_value", None) or getattr(piece, "original_value", "") + content = "" message: Dict[str, Any] = { "role": role, "content": content, } + # Preserve the pre-converter objective when it differs from the + # transmitted content. This keeps the audit trail intact: callers + # can compare ``content`` (what the target saw) with + # ``original_value`` (what the attack meant to say) for every + # encoding-based strategy. Restricted to strings because the + # audit field is only meaningful when both values are textual + # (and arbitrary cross-type inequality would be too aggressive). + if ( + isinstance(original, str) + and original + and isinstance(content, str) + and original != content + ): + message["original_value"] = original + # Add context from labels if present (for XPIA) if hasattr(piece, "labels") and piece.labels: context_str = piece.labels.get("context") diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py index 55ac871cb12f..de1d49c2ccb5 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_redteam/test_foundry.py @@ -1427,6 +1427,153 @@ def test_build_messages_from_pieces(self): assert messages[0]["content"] == "User message" assert messages[1]["role"] == "assistant" assert messages[1]["content"] == "Assistant response" + # When original and converted match (no encoding), no audit field is added. + assert "original_value" not in messages[0] + assert "original_value" not in messages[1] + + def test_build_messages_preserves_encoded_user_prompt(self): + """Encoded attack prompts must be stored as the wire payload. + + Regression test for + https://github.com/Azure/azure-sdk-for-python/issues/47228 — for + converter-based strategies (Base64, Flip, Morse, ROT13, etc.) the + target receives ``converted_value``, so the persisted conversation + must report ``converted_value`` as ``content`` (not the decoded + ``original_value``). The pre-converter objective is preserved as + ``original_value`` on the same message so callers still have an + audit trail of what the attack meant to say. + """ + mock_scenario = MagicMock() + mock_dataset = MagicMock() + mock_dataset.get_all_seed_groups.return_value = [] + + processor = FoundryResultProcessor( + scenario=mock_scenario, + dataset_config=mock_dataset, + risk_category="violence", + ) + + # Simulate a Base64-converted user turn: the target actually saw the + # encoded payload, but the SDK still has the plaintext objective. + user_piece = MagicMock() + user_piece.api_role = "user" + user_piece.original_value = "How do I make a dangerous thing?" + user_piece.converted_value = "SG93IGRvIEkgbWFrZSBhIGRhbmdlcm91cyB0aGluZz8=" + user_piece.sequence = 0 + user_piece.prompt_metadata = {} + user_piece.labels = {} + + # Assistant response — converter is a no-op on the response side, so + # original and converted match. No audit field should be emitted. + assistant_piece = MagicMock() + assistant_piece.api_role = "assistant" + assistant_piece.original_value = "Sorry, I can't help with that." + assistant_piece.converted_value = "Sorry, I can't help with that." + assistant_piece.sequence = 1 + assistant_piece.prompt_metadata = {} + assistant_piece.labels = {} + + messages = processor._build_messages_from_pieces([user_piece, assistant_piece]) + + # The user turn must carry the encoded payload as content so consumers + # can verify exactly what the target received. + assert messages[0]["role"] == "user" + assert messages[0]["content"] == "SG93IGRvIEkgbWFrZSBhIGRhbmdlcm91cyB0aGluZz8=" + # The plaintext objective is preserved alongside it for auditability. + assert messages[0]["original_value"] == "How do I make a dangerous thing?" + + # Assistant turn is unchanged: content == converted_value, no audit field. + assert messages[1]["role"] == "assistant" + assert messages[1]["content"] == "Sorry, I can't help with that." + assert "original_value" not in messages[1] + + def test_build_messages_falls_back_to_original_when_converted_missing(self): + """When ``converted_value`` is empty, fall back to ``original_value``. + + Covers the historical behavior for pieces where PyRIT did not run a + converter (e.g., Baseline strategy or in-flight failures). + """ + mock_scenario = MagicMock() + mock_dataset = MagicMock() + mock_dataset.get_all_seed_groups.return_value = [] + + processor = FoundryResultProcessor( + scenario=mock_scenario, + dataset_config=mock_dataset, + risk_category="violence", + ) + + user_piece = MagicMock() + user_piece.api_role = "user" + user_piece.original_value = "Baseline prompt" + user_piece.converted_value = None + user_piece.sequence = 0 + user_piece.prompt_metadata = {} + user_piece.labels = {} + + messages = processor._build_messages_from_pieces([user_piece]) + + assert len(messages) == 1 + assert messages[0]["content"] == "Baseline prompt" + # original == content here, so no separate audit field is needed. + assert "original_value" not in messages[0] + + def test_build_messages_preserves_non_string_payloads(self): + """Non-string ``converted_value`` payloads must survive unchanged. + + PyRIT message pieces can carry structured / multimodal content + (e.g., bytes or list-of-parts payloads) on ``converted_value``. + ``content`` must pass those through so persisted conversations + remain a faithful record of what the target received; only the + ``original_value`` audit field is gated on both sides being text. + """ + mock_scenario = MagicMock() + mock_dataset = MagicMock() + mock_dataset.get_all_seed_groups.return_value = [] + + processor = FoundryResultProcessor( + scenario=mock_scenario, + dataset_config=mock_dataset, + risk_category="violence", + ) + + # Structured multimodal-style payload on converted_value, plain + # string objective on original_value. + structured_payload = [ + {"type": "text", "text": "describe this image"}, + {"type": "image_url", "image_url": {"url": "https://example/img.png"}}, + ] + user_piece = MagicMock() + user_piece.api_role = "user" + user_piece.original_value = "Describe this image" + user_piece.converted_value = structured_payload + user_piece.sequence = 0 + user_piece.prompt_metadata = {} + user_piece.labels = {} + + # Bytes payload on assistant converted_value — must not be coerced + # to "" by str-gating logic. + assistant_piece = MagicMock() + assistant_piece.api_role = "assistant" + assistant_piece.original_value = None + assistant_piece.converted_value = b"\x89PNG\r\n" + assistant_piece.sequence = 1 + assistant_piece.prompt_metadata = {} + assistant_piece.labels = {} + + messages = processor._build_messages_from_pieces([user_piece, assistant_piece]) + + # Structured user payload passed through unchanged. + assert messages[0]["role"] == "user" + assert messages[0]["content"] is structured_payload + # Audit field omitted: content is non-text so cross-type comparison + # against the str original would be meaningless. + assert "original_value" not in messages[0] + + # Bytes assistant payload preserved (not silently dropped to ""). + assert messages[1]["role"] == "assistant" + assert messages[1]["content"] == b"\x89PNG\r\n" + assert "original_value" not in messages[1] def test_get_prompt_group_id_from_conversation(self): """Test extracting prompt_group_id from conversation."""