Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 53 additions & 16 deletions misc/python/materialize/cli/mz_workload_anonymize.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,16 @@ def main() -> int:
help="After anonymizing, scan the output for surviving original identifiers and "
"non-anonymized string literals, and refuse to write if any are found.",
)
parser.add_argument(
"--require-parser",
action=argparse.BooleanOptionalAction,
default=True,
help="Require the mz-sql-anonymize parser for query literal redaction "
"(the default). With --no-require-parser, fall back to a weaker regex "
"that only redacts single-quoted strings (missing numbers, dollar-quoted "
"strings, and comments) when the parser binary is unavailable or a "
"statement does not parse.",
)

parser.add_argument(
"file",
Expand All @@ -272,6 +282,20 @@ def main() -> int:

args = parser.parse_args()

# Resolve the output target up front so an invalid invocation fails before
# any work (and before the parser-availability check below).
if args.output:
output = args.output
elif args.in_place:
output = args.file
else:
print(
"error: specify an output with -o/--output (use '-' for stdout) "
"or pass --in-place to overwrite the input file",
file=sys.stderr,
)
return 1

with open(args.file) as f:
workload = yaml.load(f, Loader=yaml.CSafeLoader)

Expand Down Expand Up @@ -555,22 +579,47 @@ def replace_literals(d: dict[str, Any], entry: str) -> None:
# Redact literals in query SQL with Materialize's own parser, in one batch.
# The parser handles every literal form the dialect supports (numbers, hex
# strings, intervals, dollar-quoted and escape strings) where the regex only
# caught single-quoted strings. Fall back to the regex per-statement when
# the helper binary is unavailable or cannot parse a given statement.
# caught single-quoted strings.
#
# --require-parser gates only the wholesale case: if the parser binary is
# unavailable, the tool errors rather than silently redacting every query
# with the weaker regex. Individual statements that do not parse fall back
# to the regex with a warning either way (this is a property of the captured
# SQL, not of whether the parser is present), and the verify pass still
# scans the result.
if query_literal_targets:
sqls = [q["sql"] for q in query_literal_targets]
redacted = redact_literals_via_parser(sqls)
if redacted is None:
if args.require_parser:
print(
"error: mz-sql-anonymize helper not found, so query literals "
"cannot be redacted with the parser. Build it with:\n"
" cargo build --release -p mz-sql-anonymize\n"
"or pass --no-require-parser to fall back to a weaker regex "
"that only redacts single-quoted strings (missing numbers, "
"dollar-quoted strings, and comments).",
file=sys.stderr,
)
return 1
print(
"warning: mz-sql-anonymize helper not found; using regex literal "
"redaction for queries, which misses numbers, dollar-quoted "
"strings, and comments. Build it for exact redaction:\n"
" cargo build --release -p mz-sql-anonymize",
"strings, and comments (--no-require-parser).",
file=sys.stderr,
)
for q in query_literal_targets:
q["sql"] = anonymize_literals_in_sql(q["sql"])
else:
unparsed = [i for i, red in enumerate(redacted) if red is None]
if unparsed:
print(
f"warning: mz-sql-anonymize could not parse {len(unparsed)} of "
f"{len(redacted)} captured queries; falling back to the regex "
"for those (it only redacts single-quoted strings). The verify "
"pass still scans them.",
file=sys.stderr,
)
for q, red in zip(query_literal_targets, redacted):
q["sql"] = (
red if red is not None else anonymize_literals_in_sql(q["sql"])
Expand All @@ -588,18 +637,6 @@ def replace_literals(d: dict[str, Any], entry: str) -> None:
print(f" {problem}", file=sys.stderr)
return 1

if args.output:
output = args.output
elif args.in_place:
output = args.file
else:
print(
"error: specify an output with -o/--output (use '-' for stdout) "
"or pass --in-place to overwrite the input file",
file=sys.stderr,
)
return 1

if output == "-":
yaml.dump(new, sys.stdout, Dumper=yaml.CSafeDumper)
else:
Expand Down
23 changes: 22 additions & 1 deletion misc/python/materialize/cli/mz_workload_anonymize_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,14 @@ def run_tool(
workload: dict[str, Any],
*extra_args: str,
in_place: bool = False,
require_parser: bool = False,
) -> tuple[int, dict[str, Any] | None, str]:
"""Run main() against a workload, returning (exit_code, output, dumped_text)."""
"""Run main() against a workload, returning (exit_code, output, dumped_text).

Defaults to --no-require-parser so tests are deterministic whether or not
the mz-sql-anonymize binary is built; tests that exercise the parser
requirement pass require_parser=True.
"""
inp = tmp_path / "workload.yml"
inp.write_text(yaml.safe_dump(workload))
argv = ["mz-workload-anonymize", str(inp)]
Expand All @@ -104,6 +110,8 @@ def run_tool(
else:
out = tmp_path / "out.yml"
argv += ["-o", str(out)]
if not require_parser:
argv.append("--no-require-parser")
argv += list(extra_args)

with mock.patch.object(sys, "argv", argv):
Expand Down Expand Up @@ -260,6 +268,19 @@ def test_regex_fallback_warns(
assert "mz-sql-anonymize helper not found" in capsys.readouterr().err


def test_require_parser_errors_without_binary(
    tmp_path: Any, force_regex: None, capsys: pytest.CaptureFixture[str]
) -> None:
    """Check that the default parser requirement aborts when the binary is absent.

    The parser is required by default, so with no binary the tool must refuse
    to run rather than silently fall back to the weaker regex.
    NOTE(review): the `force_regex` fixture is defined outside this view;
    presumably it is what makes the helper binary look unavailable -- confirm.
    """
    exit_code, written, _dumped = run_tool(
        tmp_path, base_workload(), require_parser=True
    )

    # The run must fail and produce no output.
    assert exit_code == 1
    assert written is None

    # The error must name the missing helper and the opt-out flag.
    stderr_text = capsys.readouterr().err
    for expected in ("mz-sql-anonymize", "--no-require-parser"):
        assert expected in stderr_text


@pytest.mark.skipif(
mz_workload_anonymize._locate_redactor() is None,
reason="mz-sql-anonymize binary not built; run `cargo build --release -p mz-sql-anonymize`",
Expand Down
11 changes: 8 additions & 3 deletions test/workload-replay/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,14 +80,18 @@ Anonymizes identifiers and literals in workload captures for sharing without exp
*Literals (`--literals`, enabled by default):*
- Query SQL is redacted with Materialize's own parser (`mz-sql-anonymize`),
replacing all literals — strings, numbers, hex strings, intervals — with
`'<REDACTED>'`. If the helper binary is not built, the tool falls back to a
regex that only catches single-quoted strings (and prints a warning).
`'<REDACTED>'`. **The parser binary is required by default**: if it is not
built, the tool errors instead of silently redacting every query with the
weaker regex. Pass `--no-require-parser` to allow that regex fallback (it
only catches single-quoted strings and misses numbers, dollar-quoted strings,
and comments). Individual statements that do not parse fall back to the regex
with a warning regardless, and the verify pass still scans them.
- `create_sql` strings (including connection hosts/users, sink topics, source
options, and column defaults) → `'literal_1'`, `'literal_2'`, ... via regex.
The parser is not used here because `to_ast_string_redacted()` intentionally
does not redact DDL option strings.

For exact, parser-based query redaction, build the helper once:
Build the helper once (required for the default `--require-parser` mode):
```bash
cargo build --release -p mz-sql-anonymize
```
Expand All @@ -105,6 +109,7 @@ bin/mz-workload-anonymize <file> [OPTIONS]
| `--identifiers` / `--no-identifiers` | Anonymize object names | enabled |
| `--literals` / `--no-literals` | Anonymize literals | enabled |
| `--verify` / `--no-verify` | Re-scan output for leaks and refuse to write if any are found | enabled |
| `--require-parser` / `--no-require-parser` | Require the parser for query literals; error rather than fall back to the weaker regex | enabled |

**Examples:**
```bash
Expand Down
Loading