From 6bde0d66a3215b1515bf05dcf814cf16fff02cd4 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Wed, 27 May 2026 09:30:14 -0700 Subject: [PATCH 1/2] workload-replay: add tests for the workload anonymizer The anonymizer had no automated coverage despite being a privacy tool with a heuristic core. Add a pytest module, colocated as `mz_workload_anonymize_test.py` so the existing `pytest --doctest-modules misc/python` CI step collects it. Coverage: - end-to-end anonymization of a structurally complete workload: identifiers scrubbed, anonymized names present; - regression tests for the connection/sink/source DDL literal leak (hosts, users, topics) and column defaults; - query string-literal redaction; - cluster SIZE preserved (non-sensitive config); - the no-output-target error and --in-place overwrite; - --no-literals keeps literals while still anonymizing identifiers; - verify_anonymized catches surviving identifiers and literals, accepts both the '<REDACTED>' and 'literal_N' placeholders, and exempts cluster literals; - redact_literals_via_parser returns None (fallback signal) without the binary, and the regex fallback warns. Most tests force the regex fallback so they are deterministic regardless of whether the mz-sql-anonymize helper is built. One test exercises the parser-backed path (numeric literal redaction) and is skipped when the binary is absent. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../cli/mz_workload_anonymize_test.py | 270 ++++++++++++++++++ 1 file changed, 270 insertions(+) create mode 100644 misc/python/materialize/cli/mz_workload_anonymize_test.py diff --git a/misc/python/materialize/cli/mz_workload_anonymize_test.py b/misc/python/materialize/cli/mz_workload_anonymize_test.py new file mode 100644 index 0000000000000..1e58132f779e0 --- /dev/null +++ b/misc/python/materialize/cli/mz_workload_anonymize_test.py @@ -0,0 +1,270 @@ +# Copyright Materialize, Inc. and contributors. All rights reserved. 
+# +# Use of this software is governed by the Business Source License +# included in the LICENSE file at the root of this repository. +# +# As of the Change Date specified in that file, in accordance with +# the Business Source License, use of this software will be governed +# by the Apache License, Version 2.0. + +"""Tests for the workload anonymizer (`mz-workload-anonymize`). + +These exercise the pure-Python anonymization logic and the regex-based literal +fallback, which run without a built `mz-sql-anonymize` binary. One test +covers the parser-backed path and is skipped when that binary is absent. +""" + +from __future__ import annotations + +import sys +from typing import Any +from unittest import mock + +import pytest +import yaml + +from materialize.cli import mz_workload_anonymize + + +def base_workload() -> dict[str, Any]: + """A small but structurally complete workload capture. + + Carries sensitive identifiers (database/table/column/connection/sink names) + and literals (a connection host and user, a sink topic, a column default, + and string + numeric predicates in a query) so tests can assert they are + scrubbed. 
+ """ + return { + "clusters": { + "prod_cluster": { + "create_sql": "CREATE CLUSTER prod_cluster (SIZE = '100cc')", + }, + }, + "databases": { + "customers_db": { + "public": { + "tables": { + "orders": { + "create_sql": "CREATE TABLE orders (id int, note text DEFAULT 'secret note')", + "columns": [ + {"name": "id", "type": "int4"}, + { + "name": "note", + "type": "text", + "default": "'secret note'", + }, + ], + "rows": 5, + }, + }, + "views": {}, + "materialized_views": {}, + "indexes": {}, + "types": {}, + "connections": { + "kafka_conn": { + "create_sql": "CREATE CONNECTION kafka_conn TO KAFKA (BROKER 'prod.internal.acme.com:9092', SASL USERNAME 'admin')", + }, + }, + "sources": {}, + "sinks": { + "out_sink": { + "create_sql": "CREATE SINK out_sink FROM orders INTO KAFKA CONNECTION kafka_conn (TOPIC 'customer-orders-prod')", + }, + }, + }, + }, + }, + "queries": [ + { + "sql": "SELECT id, note FROM customers_db.public.orders WHERE note = 'hunter2' AND id = 987654321", + "cluster": "prod_cluster", + "database": "customers_db", + "search_path": ["public"], + "statement_type": "select", + "finished_status": "success", + }, + ], + } + + +def run_tool( + tmp_path: Any, + workload: dict[str, Any], + *extra_args: str, + in_place: bool = False, +) -> tuple[int, dict[str, Any] | None, str]: + """Run main() against a workload, returning (exit_code, output, dumped_text).""" + inp = tmp_path / "workload.yml" + inp.write_text(yaml.safe_dump(workload)) + argv = ["mz-workload-anonymize", str(inp)] + if in_place: + argv.append("--in-place") + out = inp + else: + out = tmp_path / "out.yml" + argv += ["-o", str(out)] + argv += list(extra_args) + + with mock.patch.object(sys, "argv", argv): + rc = mz_workload_anonymize.main() + + if rc == 0 and out.exists(): + text = out.read_text() + return rc, yaml.safe_load(text), text + return rc, None, "" + + +@pytest.fixture +def force_regex(monkeypatch: pytest.MonkeyPatch) -> None: + """Force the regex literal fallback by hiding the 
parser binary. + + Keeps the bulk of the tests deterministic regardless of whether the + `mz-sql-anonymize` helper happens to be built in the dev environment. + """ + monkeypatch.setattr(mz_workload_anonymize, "_locate_redactor", lambda: None) + + +def test_anonymizes_identifiers(tmp_path: Any, force_regex: None) -> None: + rc, out, text = run_tool(tmp_path, base_workload()) + assert rc == 0 + assert out is not None + # Original object names must not survive anywhere in the output. + for original in ("customers_db", "orders", "kafka_conn", "out_sink"): + assert original not in text, f"{original!r} leaked" + # And anonymized names should be present. + assert "db_0" in text + assert "table_1" in text + + +def test_connection_and_sink_literals_scrubbed( + tmp_path: Any, force_regex: None +) -> None: + # Regression test for the connection/sink literal leak: hostnames, + # usernames, and topic names live in DDL option strings. + rc, _out, text = run_tool(tmp_path, base_workload()) + assert rc == 0 + assert "prod.internal.acme.com" not in text + assert "admin" not in text + assert "customer-orders-prod" not in text + + +def test_table_default_literal_scrubbed(tmp_path: Any, force_regex: None) -> None: + rc, _out, text = run_tool(tmp_path, base_workload()) + assert rc == 0 + assert "secret note" not in text + + +def test_query_string_literal_scrubbed(tmp_path: Any, force_regex: None) -> None: + rc, _out, text = run_tool(tmp_path, base_workload()) + assert rc == 0 + assert "hunter2" not in text + + +def test_cluster_size_preserved(tmp_path: Any, force_regex: None) -> None: + # Cluster SIZE is non-sensitive config that replay must keep verbatim. 
+ rc, _out, text = run_tool(tmp_path, base_workload()) + assert rc == 0 + assert "100cc" in text + + +def test_no_output_target_errors( + tmp_path: Any, force_regex: None, capsys: pytest.CaptureFixture[str] +) -> None: + inp = tmp_path / "workload.yml" + inp.write_text(yaml.safe_dump(base_workload())) + with mock.patch.object(sys, "argv", ["mz-workload-anonymize", str(inp)]): + rc = mz_workload_anonymize.main() + assert rc == 1 + assert "in-place" in capsys.readouterr().err + + +def test_in_place_overwrites_input(tmp_path: Any, force_regex: None) -> None: + rc, _out, text = run_tool(tmp_path, base_workload(), in_place=True) + assert rc == 0 + assert "customers_db" not in text + assert "hunter2" not in text + + +def test_no_literals_keeps_literals_but_anonymizes_identifiers( + tmp_path: Any, force_regex: None +) -> None: + rc, _out, text = run_tool(tmp_path, base_workload(), "--no-literals") + assert rc == 0 + # Literals retained... + assert "hunter2" in text + # ...but identifiers still anonymized. 
+ assert "customers_db" not in text + + +def test_verify_catches_surviving_identifier() -> None: + new = { + "databases": {}, + "clusters": {}, + "queries": [{"sql": "SELECT * FROM orders"}], + } + mapping = {"orders": "table_1"} + args = mock.Mock(identifiers=True, literals=True) + problems = mz_workload_anonymize.verify_anonymized(new, mapping, args) + assert any("orders" in p for p in problems) + + +def test_verify_catches_unanonymized_literal() -> None: + new = { + "databases": {}, + "clusters": {}, + "queries": [{"sql": "SELECT * FROM t WHERE x = 'leak'"}], + } + args = mock.Mock(identifiers=True, literals=True) + problems = mz_workload_anonymize.verify_anonymized(new, {}, args) + assert any("leak" in p for p in problems) + + +def test_verify_accepts_both_placeholder_styles() -> None: + new = { + "databases": {}, + "clusters": {}, + "queries": [ + {"sql": "SELECT * FROM t WHERE a = 'literal_1' AND b = '<REDACTED>'"}, + ], + } + args = mock.Mock(identifiers=True, literals=True) + assert mz_workload_anonymize.verify_anonymized(new, {}, args) == [] + + +def test_verify_exempts_cluster_literals() -> None: + # Cluster create_sql keeps its SIZE literal; verify must not flag it. 
+ new = { + "clusters": {"cluster_0": {"create_sql": "CREATE CLUSTER cluster_0 (SIZE = '100cc')"}}, + "databases": {}, + "queries": [], + } + args = mock.Mock(identifiers=True, literals=True) + assert mz_workload_anonymize.verify_anonymized(new, {}, args) == [] + + +def test_redact_via_parser_returns_none_without_binary( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setattr(mz_workload_anonymize, "_locate_redactor", lambda: None) + assert mz_workload_anonymize.redact_literals_via_parser(["SELECT 1"]) is None + + +def test_regex_fallback_warns( + tmp_path: Any, force_regex: None, capsys: pytest.CaptureFixture[str] +) -> None: + rc, _out, _text = run_tool(tmp_path, base_workload()) + assert rc == 0 + assert "mz-sql-anonymize helper not found" in capsys.readouterr().err + + +@pytest.mark.skipif( + mz_workload_anonymize._locate_redactor() is None, + reason="mz-sql-anonymize binary not built; run `cargo build --release -p mz-sql-anonymize`", +) +def test_parser_path_redacts_numeric_literal(tmp_path: Any) -> None: + # The parser path (unlike the regex) redacts numbers in query predicates. 
+ rc, _out, text = run_tool(tmp_path, base_workload()) + assert rc == 0 + assert "987654321" not in text + assert "<REDACTED>" in text From 80446b8551150d0b1963c6c515c18bc187187d42 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Wed, 27 May 2026 23:00:49 -0700 Subject: [PATCH 2/2] workload-replay tests: apply black formatting Co-Authored-By: Claude Opus 4.7 (1M context) --- misc/python/materialize/cli/mz_workload_anonymize_test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/misc/python/materialize/cli/mz_workload_anonymize_test.py b/misc/python/materialize/cli/mz_workload_anonymize_test.py index 1e58132f779e0..9168bd8cfbcab 100644 --- a/misc/python/materialize/cli/mz_workload_anonymize_test.py +++ b/misc/python/materialize/cli/mz_workload_anonymize_test.py @@ -235,7 +235,9 @@ def test_verify_accepts_both_placeholder_styles() -> None: def test_verify_exempts_cluster_literals() -> None: # Cluster create_sql keeps its SIZE literal; verify must not flag it. new = { - "clusters": {"cluster_0": {"create_sql": "CREATE CLUSTER cluster_0 (SIZE = '100cc')"}}, + "clusters": { + "cluster_0": {"create_sql": "CREATE CLUSTER cluster_0 (SIZE = '100cc')"} + }, "databases": {}, "queries": [], }