From 1e7105e36d4a7d9ac1531e9a385db5c296eba17f Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Fri, 29 May 2026 13:16:41 -0700 Subject: [PATCH 1/2] workload-replay: require the parser by default for query literals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, if the mz-sql-anonymize binary was missing or a statement failed to parse, the anonymizer silently fell back to a regex that only redacts single-quoted strings — leaving numbers, dollar-quoted strings, and comments in query SQL exposed. For a privacy tool, fail-open is the wrong default. Add --require-parser (default on). When set, the tool errors rather than emit weaker output if the parser binary is unavailable or any captured query does not parse. --no-require-parser opts back into the regex fallback, which now warns that it is in use. Also resolve the output target up front so an invalid invocation fails before any work, independent of the new parser check. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../materialize/cli/mz_workload_anonymize.py | 69 ++++++++++++++----- .../cli/mz_workload_anonymize_test.py | 23 ++++++- test/workload-replay/README.md | 10 ++- 3 files changed, 82 insertions(+), 20 deletions(-) diff --git a/misc/python/materialize/cli/mz_workload_anonymize.py b/misc/python/materialize/cli/mz_workload_anonymize.py index 65d64e713cb92..5e0c0572a8852 100644 --- a/misc/python/materialize/cli/mz_workload_anonymize.py +++ b/misc/python/materialize/cli/mz_workload_anonymize.py @@ -263,6 +263,16 @@ def main() -> int: help="After anonymizing, scan the output for surviving original identifiers and " "non-anonymized string literals, and refuse to write if any are found.", ) + parser.add_argument( + "--require-parser", + action=argparse.BooleanOptionalAction, + default=True, + help="Require the mz-sql-anonymize parser for query literal redaction " + "(the default). With --no-require-parser, fall back to a weaker regex " + "that only redacts single-quoted strings (missing numbers, dollar-quoted " + "strings, and comments) when the parser binary is unavailable or a " + "statement does not parse.", + ) parser.add_argument( "file", @@ -272,6 +282,20 @@ def main() -> int: args = parser.parse_args() + # Resolve the output target up front so an invalid invocation fails before + # any work (and before the parser-availability check below). + if args.output: + output = args.output + elif args.in_place: + output = args.file + else: + print( + "error: specify an output with -o/--output (use '-' for stdout) " + "or pass --in-place to overwrite the input file", + file=sys.stderr, + ) + return 1 + with open(args.file) as f: workload = yaml.load(f, Loader=yaml.CSafeLoader) @@ -555,22 +579,47 @@ def replace_literals(d: dict[str, Any], entry: str) -> None: # Redact literals in query SQL with Materialize's own parser, in one batch. # The parser handles every literal form the dialect supports (numbers, hex # strings, intervals, dollar-quoted and escape strings) where the regex only - # caught single-quoted strings. Fall back to the regex per-statement when - # the helper binary is unavailable or cannot parse a given statement. + # caught single-quoted strings. + # + # By default (--require-parser) the regex is NOT an acceptable substitute: + # if the parser binary is unavailable, or any individual statement does not + # parse, the tool errors rather than silently emitting weaker redaction. + # --no-require-parser opts into the regex fallback for those cases. if query_literal_targets: sqls = [q["sql"] for q in query_literal_targets] redacted = redact_literals_via_parser(sqls) if redacted is None: + if args.require_parser: + print( + "error: mz-sql-anonymize helper not found, so query literals " + "cannot be redacted with the parser. Build it with:\n" + " cargo build --release -p mz-sql-anonymize\n" + "or pass --no-require-parser to fall back to a weaker regex " + "that only redacts single-quoted strings (missing numbers, " + "dollar-quoted strings, and comments).", + file=sys.stderr, + ) + return 1 print( "warning: mz-sql-anonymize helper not found; using regex literal " "redaction for queries, which misses numbers, dollar-quoted " - "strings, and comments. Build it for exact redaction:\n" - " cargo build --release -p mz-sql-anonymize", + "strings, and comments (--no-require-parser).", file=sys.stderr, ) for q in query_literal_targets: q["sql"] = anonymize_literals_in_sql(q["sql"]) else: + unparsed = [i for i, red in enumerate(redacted) if red is None] + if unparsed and args.require_parser: + print( + f"error: mz-sql-anonymize could not parse {len(unparsed)} of " + f"{len(redacted)} captured queries, so their literals cannot " + "be redacted with the parser. Pass --no-require-parser to " + "redact these with the weaker regex instead (it only handles " + "single-quoted strings).", + file=sys.stderr, + ) + return 1 for q, red in zip(query_literal_targets, redacted): q["sql"] = ( red if red is not None else anonymize_literals_in_sql(q["sql"]) @@ -588,18 +637,6 @@ def replace_literals(d: dict[str, Any], entry: str) -> None: print(f" {problem}", file=sys.stderr) return 1 - if args.output: - output = args.output - elif args.in_place: - output = args.file - else: - print( - "error: specify an output with -o/--output (use '-' for stdout) " - "or pass --in-place to overwrite the input file", - file=sys.stderr, - ) - return 1 - if output == "-": yaml.dump(new, sys.stdout, Dumper=yaml.CSafeDumper) else: diff --git a/misc/python/materialize/cli/mz_workload_anonymize_test.py b/misc/python/materialize/cli/mz_workload_anonymize_test.py index b982cacdd95ba..17c462272e0d1 100644 --- a/misc/python/materialize/cli/mz_workload_anonymize_test.py +++ b/misc/python/materialize/cli/mz_workload_anonymize_test.py @@ -93,8 +93,14 @@ def run_tool( workload: dict[str, Any], *extra_args: str, in_place: bool = False, + require_parser: bool = False, ) -> tuple[int, dict[str, Any] | None, str]: - """Run main() against a workload, returning (exit_code, output, dumped_text).""" + """Run main() against a workload, returning (exit_code, output, dumped_text). + + Defaults to --no-require-parser so tests are deterministic whether or not + the mz-sql-anonymize binary is built; tests that exercise the parser + requirement pass require_parser=True. + """ inp = tmp_path / "workload.yml" inp.write_text(yaml.safe_dump(workload)) argv = ["mz-workload-anonymize", str(inp)] @@ -104,6 +110,8 @@ def run_tool( else: out = tmp_path / "out.yml" argv += ["-o", str(out)] + if not require_parser: + argv.append("--no-require-parser") argv += list(extra_args) with mock.patch.object(sys, "argv", argv): @@ -260,6 +268,19 @@ def test_regex_fallback_warns( assert "mz-sql-anonymize helper not found" in capsys.readouterr().err +def test_require_parser_errors_without_binary( + tmp_path: Any, force_regex: None, capsys: pytest.CaptureFixture[str] +) -> None: + # By default the parser is required: with no binary the tool must refuse to + # run rather than silently fall back to the weaker regex. + rc, out, _text = run_tool(tmp_path, base_workload(), require_parser=True) + assert rc == 1 + assert out is None + err = capsys.readouterr().err + assert "mz-sql-anonymize" in err + assert "--no-require-parser" in err + + @pytest.mark.skipif( mz_workload_anonymize._locate_redactor() is None, reason="mz-sql-anonymize binary not built; run `cargo build --release -p mz-sql-anonymize`", diff --git a/test/workload-replay/README.md b/test/workload-replay/README.md index 02dbc98a54af3..20cf202948340 100644 --- a/test/workload-replay/README.md +++ b/test/workload-replay/README.md @@ -80,14 +80,17 @@ Anonymizes identifiers and literals in workload captures for sharing without exp *Literals (`--literals`, enabled by default):* - Query SQL is redacted with Materialize's own parser (`mz-sql-anonymize`), replacing all literals — strings, numbers, hex strings, intervals — with - `''`. If the helper binary is not built, the tool falls back to a - regex that only catches single-quoted strings (and prints a warning). + `''`. **This is required by default**: if the helper binary is not + built, or a captured statement does not parse, the tool errors instead of + silently producing weaker output. Pass `--no-require-parser` to fall back to + a regex that only catches single-quoted strings (missing numbers, + dollar-quoted strings, and comments). - `create_sql` strings (including connection hosts/users, sink topics, source options, and column defaults) → `'literal_1'`, `'literal_2'`, ... via regex. The parser is not used here because `to_ast_string_redacted()` intentionally does not redact DDL option strings. -For exact, parser-based query redaction, build the helper once: +Build the helper once (required for the default `--require-parser` mode): ```bash cargo build --release -p mz-sql-anonymize ``` @@ -105,6 +108,7 @@ bin/mz-workload-anonymize [OPTIONS] | `--identifiers` / `--no-identifiers` | Anonymize object names | enabled | | `--literals` / `--no-literals` | Anonymize literals | enabled | | `--verify` / `--no-verify` | Re-scan output for leaks and refuse to write if any are found | enabled | +| `--require-parser` / `--no-require-parser` | Require the parser for query literals; error rather than fall back to the weaker regex | enabled | **Examples:** ```bash From f2a7a8bba4ee76ffe7e45a88ee64f84ae7a05581 Mon Sep 17 00:00:00 2001 From: Jason Hernandez <7144515+jasonhernandez@users.noreply.github.com> Date: Fri, 29 May 2026 13:38:14 -0700 Subject: [PATCH 2/2] workload-replay: relax --require-parser to gate only the missing binary Per-statement parse failures are a property of the captured SQL, not of whether the parser is present, so they fall back to the regex with a warning in both modes (the verify pass still scans them). --require-parser now errors only when the parser binary itself is unavailable. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../materialize/cli/mz_workload_anonymize.py | 22 +++++++++---------- test/workload-replay/README.md | 11 +++++----- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/misc/python/materialize/cli/mz_workload_anonymize.py b/misc/python/materialize/cli/mz_workload_anonymize.py index 5e0c0572a8852..2dd754a6b17fd 100644 --- a/misc/python/materialize/cli/mz_workload_anonymize.py +++ b/misc/python/materialize/cli/mz_workload_anonymize.py @@ -581,10 +581,12 @@ def replace_literals(d: dict[str, Any], entry: str) -> None: # strings, intervals, dollar-quoted and escape strings) where the regex only # caught single-quoted strings. # - # By default (--require-parser) the regex is NOT an acceptable substitute: - # if the parser binary is unavailable, or any individual statement does not - # parse, the tool errors rather than silently emitting weaker redaction. - # --no-require-parser opts into the regex fallback for those cases. + # --require-parser gates only the wholesale case: if the parser binary is + # unavailable, the tool errors rather than silently redacting every query + # with the weaker regex. Individual statements that do not parse fall back + # to the regex with a warning either way (this is a property of the captured + # SQL, not of whether the parser is present), and the verify pass still + # scans the result. if query_literal_targets: sqls = [q["sql"] for q in query_literal_targets] redacted = redact_literals_via_parser(sqls) @@ -610,16 +612,14 @@ def replace_literals(d: dict[str, Any], entry: str) -> None: q["sql"] = anonymize_literals_in_sql(q["sql"]) else: unparsed = [i for i, red in enumerate(redacted) if red is None] - if unparsed and args.require_parser: + if unparsed: print( - f"error: mz-sql-anonymize could not parse {len(unparsed)} of " - f"{len(redacted)} captured queries, so their literals cannot " - "be redacted with the parser. Pass --no-require-parser to " - "redact these with the weaker regex instead (it only handles " - "single-quoted strings).", + f"warning: mz-sql-anonymize could not parse {len(unparsed)} of " + f"{len(redacted)} captured queries; falling back to the regex " + "for those (it only redacts single-quoted strings). The verify " + "pass still scans them.", file=sys.stderr, ) - return 1 for q, red in zip(query_literal_targets, redacted): q["sql"] = ( red if red is not None else anonymize_literals_in_sql(q["sql"]) diff --git a/test/workload-replay/README.md b/test/workload-replay/README.md index 20cf202948340..9a2800ad78ce3 100644 --- a/test/workload-replay/README.md +++ b/test/workload-replay/README.md @@ -80,11 +80,12 @@ Anonymizes identifiers and literals in workload captures for sharing without exp *Literals (`--literals`, enabled by default):* - Query SQL is redacted with Materialize's own parser (`mz-sql-anonymize`), replacing all literals — strings, numbers, hex strings, intervals — with - `''`. **This is required by default**: if the helper binary is not - built, or a captured statement does not parse, the tool errors instead of - silently producing weaker output. Pass `--no-require-parser` to fall back to - a regex that only catches single-quoted strings (missing numbers, - dollar-quoted strings, and comments). + `''`. **The parser binary is required by default**: if it is not + built, the tool errors instead of silently redacting every query with the + weaker regex. Pass `--no-require-parser` to allow that regex fallback (it + only catches single-quoted strings, missing numbers, dollar-quoted strings, + and comments). Individual statements that do not parse fall back to the regex + with a warning regardless, and the verify pass still scans them. - `create_sql` strings (including connection hosts/users, sink topics, source options, and column defaults) → `'literal_1'`, `'literal_2'`, ... via regex. The parser is not used here because `to_ast_string_redacted()` intentionally