diff --git a/misc/python/materialize/cli/mz_workload_anonymize.py b/misc/python/materialize/cli/mz_workload_anonymize.py index 65d64e713cb92..2dd754a6b17fd 100644 --- a/misc/python/materialize/cli/mz_workload_anonymize.py +++ b/misc/python/materialize/cli/mz_workload_anonymize.py @@ -263,6 +263,16 @@ def main() -> int: help="After anonymizing, scan the output for surviving original identifiers and " "non-anonymized string literals, and refuse to write if any are found.", ) + parser.add_argument( + "--require-parser", + action=argparse.BooleanOptionalAction, + default=True, + help="Require the mz-sql-anonymize parser for query literal redaction " + "(the default). With --no-require-parser, fall back to a weaker regex " + "that only redacts single-quoted strings (missing numbers, dollar-quoted " + "strings, and comments) when the parser binary is unavailable or a " + "statement does not parse.", + ) parser.add_argument( "file", @@ -272,6 +282,20 @@ def main() -> int: args = parser.parse_args() + # Resolve the output target up front so an invalid invocation fails before + # any work (and before the parser-availability check below). + if args.output: + output = args.output + elif args.in_place: + output = args.file + else: + print( + "error: specify an output with -o/--output (use '-' for stdout) " + "or pass --in-place to overwrite the input file", + file=sys.stderr, + ) + return 1 + with open(args.file) as f: workload = yaml.load(f, Loader=yaml.CSafeLoader) @@ -555,22 +579,47 @@ def replace_literals(d: dict[str, Any], entry: str) -> None: # Redact literals in query SQL with Materialize's own parser, in one batch. # The parser handles every literal form the dialect supports (numbers, hex # strings, intervals, dollar-quoted and escape strings) where the regex only - # caught single-quoted strings. Fall back to the regex per-statement when - # the helper binary is unavailable or cannot parse a given statement. + # caught single-quoted strings. + # + # --require-parser gates only the wholesale case: if the parser binary is + # unavailable, the tool errors rather than silently redacting every query + # with the weaker regex. Individual statements that do not parse fall back + # to the regex with a warning either way (this is a property of the captured + # SQL, not of whether the parser is present), and the verify pass still + # scans the result. if query_literal_targets: sqls = [q["sql"] for q in query_literal_targets] redacted = redact_literals_via_parser(sqls) if redacted is None: + if args.require_parser: + print( + "error: mz-sql-anonymize helper not found, so query literals " + "cannot be redacted with the parser. Build it with:\n" + " cargo build --release -p mz-sql-anonymize\n" + "or pass --no-require-parser to fall back to a weaker regex " + "that only redacts single-quoted strings (missing numbers, " + "dollar-quoted strings, and comments).", + file=sys.stderr, + ) + return 1 print( "warning: mz-sql-anonymize helper not found; using regex literal " "redaction for queries, which misses numbers, dollar-quoted " - "strings, and comments. Build it for exact redaction:\n" - " cargo build --release -p mz-sql-anonymize", + "strings, and comments (--no-require-parser).", file=sys.stderr, ) for q in query_literal_targets: q["sql"] = anonymize_literals_in_sql(q["sql"]) else: + unparsed = [i for i, red in enumerate(redacted) if red is None] + if unparsed: + print( + f"warning: mz-sql-anonymize could not parse {len(unparsed)} of " + f"{len(redacted)} captured queries; falling back to the regex " + "for those (it only redacts single-quoted strings). The verify " + "pass still scans them.", + file=sys.stderr, + ) for q, red in zip(query_literal_targets, redacted): q["sql"] = ( red if red is not None else anonymize_literals_in_sql(q["sql"]) @@ -588,18 +637,6 @@ def replace_literals(d: dict[str, Any], entry: str) -> None: print(f" {problem}", file=sys.stderr) return 1 - if args.output: - output = args.output - elif args.in_place: - output = args.file - else: - print( - "error: specify an output with -o/--output (use '-' for stdout) " - "or pass --in-place to overwrite the input file", - file=sys.stderr, - ) - return 1 - if output == "-": yaml.dump(new, sys.stdout, Dumper=yaml.CSafeDumper) else: diff --git a/misc/python/materialize/cli/mz_workload_anonymize_test.py b/misc/python/materialize/cli/mz_workload_anonymize_test.py index b982cacdd95ba..17c462272e0d1 100644 --- a/misc/python/materialize/cli/mz_workload_anonymize_test.py +++ b/misc/python/materialize/cli/mz_workload_anonymize_test.py @@ -93,8 +93,14 @@ def run_tool( workload: dict[str, Any], *extra_args: str, in_place: bool = False, + require_parser: bool = False, ) -> tuple[int, dict[str, Any] | None, str]: - """Run main() against a workload, returning (exit_code, output, dumped_text).""" + """Run main() against a workload, returning (exit_code, output, dumped_text). + + Defaults to --no-require-parser so tests are deterministic whether or not + the mz-sql-anonymize binary is built; tests that exercise the parser + requirement pass require_parser=True. + """ inp = tmp_path / "workload.yml" inp.write_text(yaml.safe_dump(workload)) argv = ["mz-workload-anonymize", str(inp)] @@ -104,6 +110,8 @@ def run_tool( else: out = tmp_path / "out.yml" argv += ["-o", str(out)] + if not require_parser: + argv.append("--no-require-parser") argv += list(extra_args) with mock.patch.object(sys, "argv", argv): @@ -260,6 +268,19 @@ def test_regex_fallback_warns( assert "mz-sql-anonymize helper not found" in capsys.readouterr().err +def test_require_parser_errors_without_binary( + tmp_path: Any, force_regex: None, capsys: pytest.CaptureFixture[str] +) -> None: + # By default the parser is required: with no binary the tool must refuse to + # run rather than silently fall back to the weaker regex. + rc, out, _text = run_tool(tmp_path, base_workload(), require_parser=True) + assert rc == 1 + assert out is None + err = capsys.readouterr().err + assert "mz-sql-anonymize" in err + assert "--no-require-parser" in err + + @pytest.mark.skipif( mz_workload_anonymize._locate_redactor() is None, reason="mz-sql-anonymize binary not built; run `cargo build --release -p mz-sql-anonymize`", diff --git a/test/workload-replay/README.md b/test/workload-replay/README.md index 02dbc98a54af3..9a2800ad78ce3 100644 --- a/test/workload-replay/README.md +++ b/test/workload-replay/README.md @@ -80,14 +80,18 @@ Anonymizes identifiers and literals in workload captures for sharing without exp *Literals (`--literals`, enabled by default):* - Query SQL is redacted with Materialize's own parser (`mz-sql-anonymize`), replacing all literals — strings, numbers, hex strings, intervals — with - `''`. If the helper binary is not built, the tool falls back to a - regex that only catches single-quoted strings (and prints a warning). + `''`. **The parser binary is required by default**: if it is not + built, the tool errors instead of silently redacting every query with the + weaker regex. Pass `--no-require-parser` to allow that regex fallback (it + only catches single-quoted strings, missing numbers, dollar-quoted strings, + and comments). Individual statements that do not parse fall back to the regex + with a warning regardless, and the verify pass still scans them. - `create_sql` strings (including connection hosts/users, sink topics, source options, and column defaults) → `'literal_1'`, `'literal_2'`, ... via regex. The parser is not used here because `to_ast_string_redacted()` intentionally does not redact DDL option strings. -For exact, parser-based query redaction, build the helper once: +Build the helper once (required for the default `--require-parser` mode): ```bash cargo build --release -p mz-sql-anonymize ``` @@ -105,6 +109,7 @@ bin/mz-workload-anonymize [OPTIONS] | `--identifiers` / `--no-identifiers` | Anonymize object names | enabled | | `--literals` / `--no-literals` | Anonymize literals | enabled | | `--verify` / `--no-verify` | Re-scan output for leaks and refuse to write if any are found | enabled | +| `--require-parser` / `--no-require-parser` | Require the parser for query literals; error rather than fall back to the weaker regex | enabled | **Examples:** ```bash