diff --git a/engineV2.py b/engineV2.py index ad1c5cec..73e891a8 100644 --- a/engineV2.py +++ b/engineV2.py @@ -267,54 +267,95 @@ def get_memory_info(gpu_id): raise RuntimeError("No supported accelerator (GPU / XPU / Iluvatar) detected.") -def validate_gpu_options(options) -> tuple: - """Validate and normalize GPU-related options.""" - device_count = get_device_count() - if device_count == 0: - raise ValueError("No devices found") - if options.gpu_ids: +ARGUMENT_ERROR_PREFIX = "[argument error]" +ARGUMENT_WARNING_PREFIX = "[argument warning]" +TEST_MODE_ERROR = ( + "specify exactly one test mode: --accuracy, --paddle_only, --paddle_cinn, " + "--paddle_gpu_performance, --torch_gpu_performance, " + "--paddle_torch_gpu_performance, --accuracy_stable, --paddle_custom_device, " + "--custom_device_vs_gpu" +) + + +def _print_argument(prefix, message): + print(f"{prefix} {message}", flush=True) + + +def _parse_gpu_ids(gpu_ids_arg, device_count): + gpu_ids = [] + for raw_part in gpu_ids_arg.split(","): + part = raw_part.strip() + if not part: + continue + if part == "-1": + gpu_ids.append(-1) + continue + if "-" in part: + try: + start, end = map(int, part.split("-", 1)) + except ValueError: + raise ValueError( + f"invalid --gpu_ids='{gpu_ids_arg}': expected integers or ranges like '0,2,4-7'" + ) from None + if start > end: + raise ValueError(f"invalid --gpu_ids='{gpu_ids_arg}': range start must be <= end") + gpu_ids.extend(range(start, end + 1)) + continue try: - gpu_ids = [] - for part in options.gpu_ids.split(","): - part = part.strip() - if not part: - continue - if part.startswith("-") and part[1:].isdigit(): - gpu_ids.append(int(part)) - elif "-" in part and not part.startswith("-"): - start, end = map(int, part.split("-")) - if start > end: - raise ValueError(f"Invalid range: {part} (start > end)") - gpu_ids.extend(range(start, end + 1)) - else: - gpu_ids.append(int(part)) + gpu_ids.append(int(part)) except ValueError: raise ValueError( - f"Invalid gpu_ids: {options.gpu_ids} (int or range expected)" + f"invalid --gpu_ids='{gpu_ids_arg}': expected integers or ranges like '0,2,4-7'" ) from None - if len(gpu_ids) != len(set(gpu_ids)): - raise ValueError(f"Invalid gpu_ids: {options.gpu_ids} (duplicates)") - gpu_ids = sorted(set(gpu_ids)) - if len(gpu_ids) > 1 and -1 in gpu_ids: - raise ValueError(f"Invalid gpu_ids: {options.gpu_ids} (-1 allowed only)") - if gpu_ids != [-1] and not all(0 <= id < device_count for id in gpu_ids): - raise ValueError( - f"Invalid gpu_ids: {options.gpu_ids} (valid range [0, {device_count}))" - ) - else: - gpu_ids = [-1] + + if not gpu_ids: + raise ValueError(f"invalid --gpu_ids='{gpu_ids_arg}': expected at least one GPU id") + seen_gpu_ids = set() + for gpu_id in gpu_ids: + if gpu_id in seen_gpu_ids: + raise ValueError(f"invalid --gpu_ids='{gpu_ids_arg}': duplicate GPU id {gpu_id}") + seen_gpu_ids.add(gpu_id) + if len(gpu_ids) > 1 and -1 in gpu_ids: + raise ValueError( + f"invalid --gpu_ids='{gpu_ids_arg}': -1 cannot be combined with explicit GPU IDs" + ) + if gpu_ids != [-1] and not all(0 <= gpu_id < device_count for gpu_id in gpu_ids): + raise ValueError( + f"invalid --gpu_ids='{gpu_ids_arg}': valid GPU id range is [0, {device_count})" + ) + return tuple(sorted(gpu_ids)) + + +def validate_gpu_options(options) -> tuple: + """Validate and normalize GPU-related options.""" + device_count = get_device_count() + if device_count == 0: + raise ValueError("no accelerator devices were found") + + gpu_ids = _parse_gpu_ids(options.gpu_ids, device_count) if options.gpu_ids else (-1,) if options.num_gpus < -1 or options.num_gpus == 0 or options.num_gpus > device_count: - raise ValueError(f"Invalid num_gpus: {options.num_gpus}") + raise ValueError( + f"invalid --num_gpus={options.num_gpus}: expected -1 or a value in [1, {device_count}]" + ) if options.num_gpus == -1: - options.num_gpus = device_count if gpu_ids == [-1] else len(gpu_ids) - if gpu_ids == [-1]: - gpu_ids = list(range(options.num_gpus)) + options.num_gpus = device_count if gpu_ids == (-1,) else len(gpu_ids) + if gpu_ids == (-1,): + gpu_ids = tuple(range(options.num_gpus)) elif len(gpu_ids) != options.num_gpus: - raise ValueError(f"num_gpus {options.num_gpus} mismatches gpu_ids {gpu_ids}") + raise ValueError( + f"invalid --num_gpus={options.num_gpus}: expected {len(gpu_ids)} " + f"to match --gpu_ids={gpu_ids}" + ) if options.num_workers_per_gpu < -1 or options.num_workers_per_gpu == 0: - raise ValueError(f"Invalid num_workers_per_gpu: {options.num_workers_per_gpu}") + raise ValueError( + f"invalid --num_workers_per_gpu={options.num_workers_per_gpu}: " + "expected -1 or a positive integer" + ) if options.required_memory <= 0: - raise ValueError(f"Invalid required_memory: {options.required_memory}") + raise ValueError( + f"invalid --required_memory={options.required_memory:g}: " + "expected a positive number of GiB" + ) return tuple(gpu_ids) @@ -329,6 +370,19 @@ def parse_bool(value): raise ValueError(f"Invalid boolean value: {value} parsed from command line") +def _prepare_single_config_gpu(options): + if options.test_cpu: + return None + + gpu_ids = validate_gpu_options(options) + if len(gpu_ids) != 1: + raise ValueError( + f"single --api_config run supports exactly one GPU; got {len(gpu_ids)} GPUs: {gpu_ids}" + ) + os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_ids[0]) + return gpu_ids[0] + + def check_gpu_memory(gpu_ids, num_workers_per_gpu, required_memory): # required_memory in GB assert isinstance(gpu_ids, tuple) and len(gpu_ids) > 0 available_gpus = [] @@ -542,117 +596,132 @@ def main(): except Exception: paddle_version = "unknown" - parser = argparse.ArgumentParser(description="API Test") - parser.add_argument("--api_config_file", default="") + parser = argparse.ArgumentParser(description="Run Paddle API test cases") + parser.add_argument( + "--api_config_file", + default="", + help=( + "Path to a config file. Mutually exclusive with " + "--api_config_file_pattern and --api_config." + ), + ) parser.add_argument( "--api_config_file_pattern", default="", - help="Pattern to match multiple config files (e.g., 'tester/api_config/api_config_support2torch_*.txt')", + help="Glob pattern(s) for config files; comma-separated patterns are supported.", + ) + parser.add_argument( + "--api_config", + default="", + help="Run one API config string directly. Single-case mode supports at most one GPU.", ) - parser.add_argument("--api_config", default="") parser.add_argument( "--paddle_only", type=parse_bool, default=False, - help="test paddle api only to figure out whether the api is supported", + help="Run Paddle-only API support checks.", ) parser.add_argument( "--paddle_cinn", type=parse_bool, default=False, - help="test paddle api in dynamic graph mode and cinn mode", + help="Run Paddle dynamic graph vs CINN checks.", ) parser.add_argument( "--accuracy", type=parse_bool, default=False, - help="test paddle api to corespoding torch api", + help="Run Paddle vs corresponding Torch accuracy checks.", ) parser.add_argument( "--paddle_gpu_performance", type=parse_bool, default=False, - help="test paddle api performance", + help="Run Paddle GPU performance checks.", ) parser.add_argument( "--torch_gpu_performance", type=parse_bool, default=False, - help="test torch api performance", + help="Run Torch GPU performance checks.", ) parser.add_argument( "--paddle_torch_gpu_performance", type=parse_bool, default=False, - help="test paddle and torch api performance", + help="Run Paddle and Torch GPU performance checks.", ) parser.add_argument( "--accuracy_stable", type=parse_bool, default=False, - help="test paddle api to corespoding torch api steadily", + help="Run stable Paddle vs corresponding Torch accuracy checks.", ) parser.add_argument( "--paddle_custom_device", type=parse_bool, default=False, - help="test paddle api on custom device vs CPU", + help="Run Paddle custom device vs CPU checks.", ) parser.add_argument( "--test_amp", type=parse_bool, default=False, - help="Whether to test in auto mixed precision (AMP) mode", + help="Enable auto mixed precision (AMP) checks.", ) parser.add_argument( "--num_gpus", type=int, default=-1, - help="Number of GPUs to use, -1 to use all available", + help="Number of GPUs to use. Use -1 for all selected GPUs.", ) parser.add_argument( "--num_workers_per_gpu", type=int, default=1, - help="Number of workers per GPU, -1 to maximize based on memory", + help="Workers per GPU. Use -1 to maximize workers based on free memory.", ) parser.add_argument( "--gpu_ids", type=str, default="", - help="GPU IDs to use ('-1' for all available). " - "Accepts comma-separated values and/or ranges (e.g., '0-3,6,7')", + help="GPU IDs to use, e.g. '0', '0,2', '0-3'. Use '-1' for all GPUs.", ) parser.add_argument( "--required_memory", type=float, default=10.0, - help="Required memory per worker in GB", + help="Minimum free memory required per worker, in GiB.", ) parser.add_argument( "--test_cpu", type=parse_bool, default=False, - help="Whether to test CPU mode", + help="Run Paddle in CPU mode.", + ) + parser.add_argument( + "--use_cached_numpy", + type=parse_bool, + default=False, + help="Reuse cached NumPy inputs when available.", ) - parser.add_argument("--use_cached_numpy", type=bool, default=False) parser.add_argument( "--log_dir", type=str, default="", - help="Log directory", + help="Directory for test logs.", ) parser.add_argument( "--atol", type=float, default=1e-2, - help="Absolute tolerance for accuracy tests", + help="Absolute tolerance for accuracy checks.", ) parser.add_argument( "--rtol", type=float, default=1e-2, - help="Relative tolerance for accuracy tests", + help="Relative tolerance for accuracy checks.", ) parser.add_argument( "--manual_threshold_config_file", @@ -664,62 +733,62 @@ def main(): "--test_tol", type=parse_bool, default=False, - help="Whether to test tolerance range in accuracy mode", + help="Enable tolerance range checks in accuracy mode.", ) parser.add_argument( "--test_backward", type=parse_bool, default=False, - help="Whether to test backward in paddle_cinn mode", + help="Enable backward checks in paddle_cinn mode.", ) parser.add_argument( "--timeout", type=int, default=1800, - help="Timeout setting for a single test case, in seconds", + help="Timeout per test case, in seconds.", ) parser.add_argument( "--show_runtime_status", type=parse_bool, default=True, - help="Whether to show the current test progress in real-time. If set to False, only failed cases will be output", + help="Show real-time progress; when False, only failed cases are printed.", ) parser.add_argument( "--random_seed", type=int, default=0, - help="The numpy random seed ", + help="NumPy random seed.", ) parser.add_argument( "--custom_device_vs_gpu", type=parse_bool, default=False, - help="test paddle api on custom device vs GPU", + help="Run Paddle custom device vs GPU checks.", ) parser.add_argument( "--custom_device_vs_gpu_mode", type=str, choices=["upload", "download"], default="upload", - help="operation mode for custom_device_vs_gpu: 'upload' or 'download'", + help="Operation mode for custom_device_vs_gpu.", ) parser.add_argument( "--bitwise_alignment", type=bool, default=False, - help="Whether to using bitwise alignment when run accuracy test", + help="Use bitwise alignment for accuracy checks.", ) parser.add_argument( "--generate_failed_tests", type=parse_bool, default=False, - help="Whether to generate reproducible test files for failed cases", + help="Generate reproducible test files for failed cases.", ) parser.add_argument( "--exit_on_error", type=parse_bool, default=False, - help="Whether to exit the process when a paddle_error occurs.", + help="Exit the process when a paddle_error occurs.", ) options = parser.parse_args() @@ -741,19 +810,7 @@ def main(): options.custom_device_vs_gpu, ] if len([m for m in mode if m is True]) != 1: - print( - "Specify only one test mode:" - "--accuracy," - "--paddle_only," - "--paddle_cinn," - "--paddle_gpu_performance," - "--torch_gpu_performance," - "--paddle_torch_gpu_performance" - "--accuracy_stable" - "--paddle_custom_device" - "--custom_device_vs_gpu", - flush=True, - ) + _print_argument(ARGUMENT_ERROR_PREFIX, TEST_MODE_ERROR) return # 处理 custom_device_vs_gpu 模式的配置 @@ -791,9 +848,13 @@ def main(): return if options.test_tol and not options.accuracy: - print("--test_tol takes effect when --accuracy is True.", flush=True) + _print_argument( + ARGUMENT_WARNING_PREFIX, "--test_tol takes effect only when --accuracy=True" + ) if options.test_backward and not options.paddle_cinn: - print("--test_backward takes effect when --paddle_cinn is True.", flush=True) + _print_argument( + ARGUMENT_WARNING_PREFIX, "--test_backward takes effect only when --paddle_cinn=True" + ) os.environ["USE_CACHED_NUMPY"] = str(options.use_cached_numpy) if options.bitwise_alignment: options.atol = 0.0 @@ -802,6 +863,12 @@ def main(): set_test_log_path(options.log_dir) if options.api_config: + try: + _prepare_single_config_gpu(options) + except ValueError as err: + _print_argument(ARGUMENT_ERROR_PREFIX, str(err)) + return + # Single config execution # Load custom ops from paddlefleet to register _run_custom_op operators try: diff --git a/engineV4.py b/engineV4.py index fd8a4d17..933db0dd 100644 --- a/engineV4.py +++ b/engineV4.py @@ -90,6 +90,7 @@ "generate_failed_tests", "exit_on_error", } +SANITIZER_FORWARD_ARGS_SORTED = tuple(sorted(SANITIZER_FORWARD_ARGS)) DEVICE_TYPE = None DEVICE_TYPE_DETECTED = False @@ -253,16 +254,18 @@ def _format_cli_value(value): return str(value) -def _build_sanitizer_case_command(api_config_str, options, log_dir): +def _build_sanitizer_case_command(api_config_str, options, log_dir, sanitizer_cmd=None): + if sanitizer_cmd is None: + sanitizer_cmd = shlex.split(options.sanitizer_command) cmd = [ - *shlex.split(options.sanitizer_command), + *sanitizer_cmd, sys.executable, str(Path(__file__).resolve()), f"--api_config={api_config_str}", f"--log_dir={log_dir}", "--_sanitizer_child=True", ] - for key in sorted(SANITIZER_FORWARD_ARGS): + for key in SANITIZER_FORWARD_ARGS_SORTED: if key == "log_dir": continue value = getattr(options, key, None) @@ -283,6 +286,9 @@ def _sanitizer_worker_loop(slot_index, gpu_id, input_queue, result_queue, option redirect_stdio() child_process = None + sanitizer_cmd = shlex.split(options.sanitizer_command) + child_env = os.environ.copy() + child_env["CUDA_VISIBLE_DEVICES"] = str(gpu_id) def terminate_child(*args): if child_process is not None and child_process.poll() is None: @@ -317,13 +323,13 @@ def terminate_child(*args): shutil.rmtree(case_log_dir) case_log_dir.mkdir(parents=True, exist_ok=True) try: - cmd = _build_sanitizer_case_command(api_config_str, options, str(case_log_dir)) + cmd = _build_sanitizer_case_command( + api_config_str, options, str(case_log_dir), sanitizer_cmd + ) except ValueError as err: shutil.rmtree(case_log_dir, ignore_errors=True) result_queue.put(("error", slot_index, api_config_str, str(err))) continue - env = os.environ.copy() - env["CUDA_VISIBLE_DEVICES"] = str(gpu_id) print( f"{datetime.now()} Sanitizer slot {slot_index} launch: {' '.join(shlex.quote(part) for part in cmd)}", @@ -332,7 +338,7 @@ def terminate_child(*args): try: child_process = subprocess.Popen( cmd, - env=env, + env=child_env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, @@ -844,54 +850,95 @@ def get_memory_info(gpu_id): raise RuntimeError("No supported accelerator (GPU / XPU / Iluvatar) detected.") -def validate_gpu_options(options) -> tuple: - """Validate and normalize GPU-related options.""" - device_count = get_device_count() - if device_count == 0: - raise ValueError("No devices found") - if options.gpu_ids: +ARGUMENT_ERROR_PREFIX = "[argument error]" +ARGUMENT_WARNING_PREFIX = "[argument warning]" +TEST_MODE_ERROR = ( + "specify exactly one test mode: --accuracy, --paddle_only, --paddle_cinn, " + "--paddle_gpu_performance, --torch_gpu_performance, " + "--paddle_torch_gpu_performance, --accuracy_stable, --paddle_custom_device, " + "--custom_device_vs_gpu" +) + + +def _print_argument(prefix, message): + print(f"{prefix} {message}", flush=True) + + +def _parse_gpu_ids(gpu_ids_arg, device_count): + gpu_ids = [] + for raw_part in gpu_ids_arg.split(","): + part = raw_part.strip() + if not part: + continue + if part == "-1": + gpu_ids.append(-1) + continue + if "-" in part: + try: + start, end = map(int, part.split("-", 1)) + except ValueError: + raise ValueError( + f"invalid --gpu_ids='{gpu_ids_arg}': expected integers or ranges like '0,2,4-7'" + ) from None + if start > end: + raise ValueError(f"invalid --gpu_ids='{gpu_ids_arg}': range start must be <= end") + gpu_ids.extend(range(start, end + 1)) + continue try: - gpu_ids = [] - for part in options.gpu_ids.split(","): - part = part.strip() - if not part: - continue - if part.startswith("-") and part[1:].isdigit(): - gpu_ids.append(int(part)) - elif "-" in part and not part.startswith("-"): - start, end = map(int, part.split("-")) - if start > end: - raise ValueError(f"Invalid range: {part} (start > end)") - gpu_ids.extend(range(start, end + 1)) - else: - gpu_ids.append(int(part)) + gpu_ids.append(int(part)) except ValueError: raise ValueError( - f"Invalid gpu_ids: {options.gpu_ids} (int or range expected)" + f"invalid --gpu_ids='{gpu_ids_arg}': expected integers or ranges like '0,2,4-7'" ) from None - if len(gpu_ids) != len(set(gpu_ids)): - raise ValueError(f"Invalid gpu_ids: {options.gpu_ids} (duplicates)") - gpu_ids = sorted(set(gpu_ids)) - if len(gpu_ids) > 1 and -1 in gpu_ids: - raise ValueError(f"Invalid gpu_ids: {options.gpu_ids} (-1 allowed only)") - if gpu_ids != [-1] and not all(0 <= id < device_count for id in gpu_ids): - raise ValueError( - f"Invalid gpu_ids: {options.gpu_ids} (valid range [0, {device_count}))" - ) - else: - gpu_ids = [-1] + + if not gpu_ids: + raise ValueError(f"invalid --gpu_ids='{gpu_ids_arg}': expected at least one GPU id") + seen_gpu_ids = set() + for gpu_id in gpu_ids: + if gpu_id in seen_gpu_ids: + raise ValueError(f"invalid --gpu_ids='{gpu_ids_arg}': duplicate GPU id {gpu_id}") + seen_gpu_ids.add(gpu_id) + if len(gpu_ids) > 1 and -1 in gpu_ids: + raise ValueError( + f"invalid --gpu_ids='{gpu_ids_arg}': -1 cannot be combined with explicit GPU IDs" + ) + if gpu_ids != [-1] and not all(0 <= gpu_id < device_count for gpu_id in gpu_ids): + raise ValueError( + f"invalid --gpu_ids='{gpu_ids_arg}': valid GPU id range is [0, {device_count})" + ) + return tuple(sorted(gpu_ids)) + + +def validate_gpu_options(options) -> tuple: + """Validate and normalize GPU-related options.""" + device_count = get_device_count() + if device_count == 0: + raise ValueError("no accelerator devices were found") + + gpu_ids = _parse_gpu_ids(options.gpu_ids, device_count) if options.gpu_ids else (-1,) if options.num_gpus < -1 or options.num_gpus == 0 or options.num_gpus > device_count: - raise ValueError(f"Invalid num_gpus: {options.num_gpus}") + raise ValueError( + f"invalid --num_gpus={options.num_gpus}: expected -1 or a value in [1, {device_count}]" + ) if options.num_gpus == -1: - options.num_gpus = device_count if gpu_ids == [-1] else len(gpu_ids) - if gpu_ids == [-1]: - gpu_ids = list(range(options.num_gpus)) + options.num_gpus = device_count if gpu_ids == (-1,) else len(gpu_ids) + if gpu_ids == (-1,): + gpu_ids = tuple(range(options.num_gpus)) elif len(gpu_ids) != options.num_gpus: - raise ValueError(f"num_gpus {options.num_gpus} mismatches gpu_ids {gpu_ids}") + raise ValueError( + f"invalid --num_gpus={options.num_gpus}: expected {len(gpu_ids)} " + f"to match --gpu_ids={gpu_ids}" + ) if options.num_workers_per_gpu < -1 or options.num_workers_per_gpu == 0: - raise ValueError(f"Invalid num_workers_per_gpu: {options.num_workers_per_gpu}") + raise ValueError( + f"invalid --num_workers_per_gpu={options.num_workers_per_gpu}: " + "expected -1 or a positive integer" + ) if options.required_memory <= 0: - raise ValueError(f"Invalid required_memory: {options.required_memory}") + raise ValueError( + f"invalid --required_memory={options.required_memory:g}: " + "expected a positive number of GiB" + ) return tuple(gpu_ids) @@ -906,6 +953,87 @@ def parse_bool(value): raise ValueError(f"Invalid boolean value: {value} parsed from command line") +def _prepare_single_config_gpu(options): + if options.test_cpu: + return None + + gpu_ids = validate_gpu_options(options) + if len(gpu_ids) != 1: + raise ValueError( + f"single --api_config run supports exactly one GPU; got {len(gpu_ids)} GPUs: {gpu_ids}" + ) + os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_ids[0]) + return gpu_ids[0] + + +def _filter_sanitizer_output(output, returncode, sanitizer_error_exitcode): + if returncode != sanitizer_error_exitcode: + return output + return "\n".join(line for line in output.splitlines() if not line.startswith("[Pass]")) + + +def _validate_sanitizer_command(command): + try: + sanitizer_cmd = shlex.split(command) + except ValueError as err: + _print_argument(ARGUMENT_ERROR_PREFIX, f"invalid --sanitizer_command: {err}") + return None + if not sanitizer_cmd: + _print_argument( + ARGUMENT_ERROR_PREFIX, + "invalid --sanitizer_command: command cannot be empty", + ) + return None + if shutil.which(sanitizer_cmd[0]) is None: + _print_argument( + ARGUMENT_ERROR_PREFIX, + f"sanitizer executable not found: {sanitizer_cmd[0]}", + ) + return None + return sanitizer_cmd + + +def _run_single_config_with_sanitizer(options): + sanitizer_cmd = _validate_sanitizer_command(options.sanitizer_command) + if sanitizer_cmd is None: + return 2 + + try: + gpu_id = _prepare_single_config_gpu(options) + except ValueError as err: + _print_argument(ARGUMENT_ERROR_PREFIX, str(err)) + return 2 + + api_config = options.api_config.strip() + cmd = _build_sanitizer_case_command(api_config, options, sanitizer_cmd) + env = os.environ.copy() + if gpu_id is not None: + env["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + + result = subprocess.run( + cmd, + env=env, + check=False, + capture_output=True, + text=True, + encoding="utf-8", + errors="replace", + ) + output = _filter_sanitizer_output( + f"{result.stdout or ''}{result.stderr or ''}", + result.returncode, + options.sanitizer_error_exitcode, + ) + if output: + print(output, end="" if output.endswith("\n") else "\n", flush=True) + if result.returncode == options.sanitizer_error_exitcode: + print( + f"[error] compute-sanitizer reported errors for {api_config} (exit={result.returncode})", + flush=True, + ) + return result.returncode + + def check_gpu_memory(gpu_ids, num_workers_per_gpu, required_memory): # required_memory in GB assert isinstance(gpu_ids, tuple) and len(gpu_ids) > 0 available_gpus = [] @@ -1031,117 +1159,132 @@ def main(): except Exception: paddle_version = "unknown" - parser = argparse.ArgumentParser(description="API Test") - parser.add_argument("--api_config_file", default="") + parser = argparse.ArgumentParser(description="Run Paddle API test cases") + parser.add_argument( + "--api_config_file", + default="", + help=( + "Path to a config file. Mutually exclusive with " + "--api_config_file_pattern and --api_config." + ), + ) parser.add_argument( "--api_config_file_pattern", default="", - help="Pattern to match multiple config files (e.g., 'tester/api_config/api_config_support2torch_*.txt')", + help="Glob pattern(s) for config files; comma-separated patterns are supported.", + ) + parser.add_argument( + "--api_config", + default="", + help="Run one API config string directly. Single-case mode supports at most one GPU.", ) - parser.add_argument("--api_config", default="") parser.add_argument( "--paddle_only", type=parse_bool, default=False, - help="test paddle api only to figure out whether the api is supported", + help="Run Paddle-only API support checks.", ) parser.add_argument( "--paddle_cinn", type=parse_bool, default=False, - help="test paddle api in dynamic graph mode and cinn mode", + help="Run Paddle dynamic graph vs CINN checks.", ) parser.add_argument( "--accuracy", type=parse_bool, default=False, - help="test paddle api to corespoding torch api", + help="Run Paddle vs corresponding Torch accuracy checks.", ) parser.add_argument( "--paddle_gpu_performance", type=parse_bool, default=False, - help="test paddle api performance", + help="Run Paddle GPU performance checks.", ) parser.add_argument( "--torch_gpu_performance", type=parse_bool, default=False, - help="test torch api performance", + help="Run Torch GPU performance checks.", ) parser.add_argument( "--paddle_torch_gpu_performance", type=parse_bool, default=False, - help="test paddle and torch api performance", + help="Run Paddle and Torch GPU performance checks.", ) parser.add_argument( "--accuracy_stable", type=parse_bool, default=False, - help="test paddle api to corespoding torch api steadily", + help="Run stable Paddle vs corresponding Torch accuracy checks.", ) parser.add_argument( "--paddle_custom_device", type=parse_bool, default=False, - help="test paddle api on custom device vs CPU", + help="Run Paddle custom device vs CPU checks.", ) parser.add_argument( "--test_amp", type=parse_bool, default=False, - help="Whether to test in auto mixed precision (AMP) mode", + help="Enable auto mixed precision (AMP) checks.", ) parser.add_argument( "--num_gpus", type=int, default=-1, - help="Number of GPUs to use, -1 to use all available", + help="Number of GPUs to use. Use -1 for all selected GPUs.", ) parser.add_argument( "--num_workers_per_gpu", type=int, default=1, - help="Number of workers per GPU, -1 to maximize based on memory", + help="Workers per GPU. Use -1 to maximize workers based on free memory.", ) parser.add_argument( "--gpu_ids", type=str, default="", - help="GPU IDs to use ('-1' for all available). " - "Accepts comma-separated values and/or ranges (e.g., '0-3,6,7')", + help="GPU IDs to use, e.g. '0', '0,2', '0-3'. Use '-1' for all GPUs.", ) parser.add_argument( "--required_memory", type=float, default=10.0, - help="Required memory per worker in GB", + help="Minimum free memory required per worker, in GiB.", ) parser.add_argument( "--test_cpu", type=parse_bool, default=False, - help="Whether to test CPU mode", + help="Run Paddle in CPU mode.", + ) + parser.add_argument( + "--use_cached_numpy", + type=parse_bool, + default=False, + help="Reuse cached NumPy inputs when available.", ) - parser.add_argument("--use_cached_numpy", type=bool, default=False) parser.add_argument( "--log_dir", type=str, default="", - help="Log directory", + help="Directory for test logs.", ) parser.add_argument( "--atol", type=float, default=1e-2, - help="Absolute tolerance for accuracy tests", + help="Absolute tolerance for accuracy checks.", ) parser.add_argument( "--rtol", type=float, default=1e-2, - help="Relative tolerance for accuracy tests", + help="Relative tolerance for accuracy checks.", ) parser.add_argument( "--manual_threshold_config_file", @@ -1153,68 +1296,68 @@ def main(): "--test_tol", type=parse_bool, default=False, - help="Whether to test tolerance range in accuracy mode", + help="Enable tolerance range checks in accuracy mode.", ) parser.add_argument( "--test_backward", type=parse_bool, default=False, - help="Whether to test backward in paddle_cinn mode", + help="Enable backward checks in paddle_cinn mode.", ) parser.add_argument( "--timeout", type=int, default=1800, - help="Timeout setting for a single test case, in seconds", + help="Timeout per test case, in seconds.", ) parser.add_argument( "--show_runtime_status", type=parse_bool, default=True, - help="Whether to show the current test progress in real-time. If set to False, only failed cases will be output", + help="Show real-time progress; when False, only failed cases are printed.", ) parser.add_argument( "--random_seed", type=int, default=0, - help="The numpy random seed ", + help="NumPy random seed.", ) parser.add_argument( "--custom_device_vs_gpu", type=parse_bool, default=False, - help="test paddle api on custom device vs GPU", + help="Run Paddle custom device vs GPU checks.", ) parser.add_argument( "--custom_device_vs_gpu_mode", type=str, choices=["upload", "download"], default="upload", - help="operation mode for custom_device_vs_gpu: 'upload' or 'download'", + help="Operation mode for custom_device_vs_gpu.", ) parser.add_argument( "--bitwise_alignment", type=bool, default=False, - help="Whether to using bitwise alignment when run accuracy test", + help="Use bitwise alignment for accuracy checks.", ) parser.add_argument( "--generate_failed_tests", type=parse_bool, default=False, - help="Whether to generate reproducible test files for failed cases", + help="Generate reproducible test files for failed cases.", ) parser.add_argument( "--exit_on_error", type=parse_bool, default=False, - help="Whether to exit the process when a paddle_error occurs.", + help="Exit the process when a paddle_error occurs.", ) parser.add_argument( "--use_compute_sanitizer", type=parse_bool, default=False, - help="Run each worker case in a compute-sanitizer wrapped subprocess.", + help="Run each case in a compute-sanitizer wrapped subprocess.", ) parser.add_argument( "--sanitizer_command", @@ -1256,19 +1399,7 @@ def main(): options.custom_device_vs_gpu, ] if len([m for m in mode if m is True]) != 1: - print( - "Specify only one test mode:" - "--accuracy," - "--paddle_only," - "--paddle_cinn," - "--paddle_gpu_performance," - "--torch_gpu_performance," - "--paddle_torch_gpu_performance" - "--accuracy_stable" - "--paddle_custom_device" - "--custom_device_vs_gpu", - flush=True, - ) + _print_argument(ARGUMENT_ERROR_PREFIX, TEST_MODE_ERROR) return # 处理 custom_device_vs_gpu 模式的配置 @@ -1306,9 +1437,13 @@ def main(): return if options.test_tol and not options.accuracy: - print("--test_tol takes effect when --accuracy is True.", flush=True) + _print_argument( + ARGUMENT_WARNING_PREFIX, "--test_tol takes effect only when --accuracy=True" + ) if options.test_backward and not options.paddle_cinn: - print("--test_backward takes effect when --paddle_cinn is True.", flush=True) + _print_argument( + ARGUMENT_WARNING_PREFIX, "--test_backward takes effect only when --paddle_cinn=True" + ) os.environ["USE_CACHED_NUMPY"] = str(options.use_cached_numpy) if options.bitwise_alignment: options.atol = 0.0 @@ -1334,6 +1469,14 @@ def main(): return if options.api_config: + if options.use_compute_sanitizer: + sys.exit(_run_single_config_with_sanitizer(options)) + try: + _prepare_single_config_gpu(options) + except ValueError as err: + _print_argument(ARGUMENT_ERROR_PREFIX, str(err)) + return + # Single config execution # Load custom ops from paddlefleet to register _run_custom_op operators try: @@ -1512,18 +1655,11 @@ def main(): ) return - if options.use_compute_sanitizer: - try: - sanitizer_cmd = shlex.split(options.sanitizer_command) - except ValueError as err: - print(f"invalid sanitizer_command: {err}", flush=True) - return - if not sanitizer_cmd: - print("sanitizer_command cannot be empty", flush=True) - return - if shutil.which(sanitizer_cmd[0]) is None: - print(f"sanitizer command not found: {sanitizer_cmd[0]}", flush=True) - return + if ( + options.use_compute_sanitizer + and _validate_sanitizer_command(options.sanitizer_command) is None + ): + return total_workers = sum(max_workers_per_gpu.values()) print(