diff --git a/engine.py b/engine.py index dcb50080..7ec46a51 100644 --- a/engine.py +++ b/engine.py @@ -116,6 +116,12 @@ def main(): default=1e-2, help="Relative tolerance for accuracy tests", ) + parser.add_argument( + "--manual_threshold_config_file", + type=str, + default="", + help="YAML file with per-API manual accuracy thresholds", + ) parser.add_argument( "--exit_on_error", type=parse_bool, @@ -165,6 +171,7 @@ def main(): test_amp=options.test_amp, atol=options.atol, rtol=options.rtol, + manual_threshold_config_file=options.manual_threshold_config_file, exit_on_error=options.exit_on_error, ) else: @@ -202,6 +209,7 @@ def main(): test_amp=options.test_amp, atol=options.atol, rtol=options.rtol, + manual_threshold_config_file=options.manual_threshold_config_file, ) else: case = test_class(api_config, test_amp=options.test_amp) diff --git a/engineV2.py b/engineV2.py index e1ebc653..a9c80f6d 100644 --- a/engineV2.py +++ b/engineV2.py @@ -47,6 +47,7 @@ "test_backward", "atol", "rtol", + "manual_threshold_config_file", "test_tol", "operation_mode", "bos_path", @@ -654,6 +655,12 @@ def main(): default=1e-2, help="Relative tolerance for accuracy tests", ) + parser.add_argument( + "--manual_threshold_config_file", + type=str, + default="", + help="YAML file with per-API manual accuracy thresholds", + ) parser.add_argument( "--test_tol", type=parse_bool, @@ -873,6 +880,7 @@ def main(): test_amp=options.test_amp, atol=options.atol, rtol=options.rtol, + manual_threshold_config_file=options.manual_threshold_config_file, test_tol=options.test_tol, bitwise_alignment=options.bitwise_alignment, exit_on_error=options.exit_on_error, diff --git a/engineV4.py b/engineV4.py index 26119c76..cc961f73 100644 --- a/engineV4.py +++ b/engineV4.py @@ -51,6 +51,7 @@ "test_backward", "atol", "rtol", + "manual_threshold_config_file", "test_tol", "operation_mode", "bos_path", @@ -80,6 +81,7 @@ "required_memory", "atol", "rtol", + "manual_threshold_config_file", "test_tol", "test_backward", 
"show_runtime_status", @@ -1127,6 +1129,12 @@ def main(): default=1e-2, help="Relative tolerance for accuracy tests", ) + parser.add_argument( + "--manual_threshold_config_file", + type=str, + default="", + help="YAML file with per-API manual accuracy thresholds", + ) parser.add_argument( "--test_tol", type=parse_bool, @@ -1387,6 +1395,7 @@ def main(): test_amp=options.test_amp, atol=options.atol, rtol=options.rtol, + manual_threshold_config_file=options.manual_threshold_config_file, test_tol=options.test_tol, bitwise_alignment=options.bitwise_alignment, exit_on_error=options.exit_on_error, diff --git a/test_pipeline/V4/run_dsv4_v2_accuracy_manual_threshold.sh b/test_pipeline/V4/run_dsv4_v2_accuracy_manual_threshold.sh new file mode 100644 index 00000000..261df946 --- /dev/null +++ b/test_pipeline/V4/run_dsv4_v2_accuracy_manual_threshold.sh @@ -0,0 +1,250 @@ +#!/bin/bash +set -euo pipefail + +# ============================================================ +# PaddleAPITest 运行脚本 +# +# 使用方式: +# ./run.sh 正常启动(后台) +# ./run.sh --stop 终止上次启动的后台进程 +# ./run.sh --status 查看运行状态 +# +# 配置方法: 修改下方变量 / 注释切换即可 +# ============================================================ + +# ── 引擎选择 ────────────────────────────────────────────────── +ENGINE=engineV4 # engineV2 | engineV4 + +# ── 运行模式开关 ────────────────────────────────────────────── +FOREGROUND=true # true=前台运行(调试用,Ctrl+C终止) +DRY_RUN=false # true=只打印最终命令,不执行 + +# ── compute-sanitizer ───────────────────────────────────────── +# true=由 engineV4 为每个 worker slot 单独启动 compute-sanitizer 子进程,保留多 GPU/多 worker 并发 +USE_COMPUTE_SANITIZER=false +SANITIZER_COMMAND="compute-sanitizer --target-processes all --error-exitcode=86" +SANITIZER_ERROR_EXITCODE=86 + +# ── Paddle Flags ────────────────────────────────────────────── +# export FLAGS_use_system_allocator=true +# export FLAGS_check_cuda_error=true +# export FLAGS_alloc_fill_value=255 +# export FLAGS_check_nan_inf=true + +# ── 输入输出 ────────────────────────────────────────────────── +# 
#!/bin/bash
set -euo pipefail

# ============================================================
# PaddleAPITest launcher
#
# Usage:
#   ./run.sh            start a run (foreground or background, see FOREGROUND)
#   ./run.sh --stop     terminate the background run started last time
#   ./run.sh --status   show the state of the background run
#
# Configuration: edit the variables below / toggle the commented lines.
# ============================================================

# ── Engine selection ─────────────────────────────────────────
ENGINE=engineV4          # engineV2 | engineV4

# ── Run-mode switches ────────────────────────────────────────
FOREGROUND=true          # true = run in the foreground (debugging, Ctrl+C stops it)
DRY_RUN=false            # true = only print the final command, do not execute it

# ── compute-sanitizer ────────────────────────────────────────
# true = engineV4 starts a separate compute-sanitizer child process for every
# worker slot, keeping multi-GPU / multi-worker concurrency
USE_COMPUTE_SANITIZER=false
SANITIZER_COMMAND="compute-sanitizer --target-processes all --error-exitcode=86"
SANITIZER_ERROR_EXITCODE=86

# ── Paddle flags ─────────────────────────────────────────────
# export FLAGS_use_system_allocator=true
# export FLAGS_check_cuda_error=true
# export FLAGS_alloc_fill_value=255
# export FLAGS_check_nan_inf=true

# ── Input / output ───────────────────────────────────────────
# When NUM_GPUS != 0 the engine is not affected by an external "CUDA_VISIBLE_DEVICES"
FILE_INPUT="tester/api_config/monitor_config/dsv4_v2/test9/1M_9.txt"
FILE_THRESHOLD_CONFIG="tester/api_config/monitor_config/dsv4_v2/test9/manual_threshold_config_file.yaml"
# FILE_PATTERN="tester/api_config/5_accuracy/accuracy_*.txt"
LOG_DIR="tester/api_config/monitor_config/dsv4_v2/test9"

# ── GPU scheduling ───────────────────────────────────────────
NUM_GPUS=-1
NUM_WORKERS_PER_GPU=1
GPU_IDS="-1"
# REQUIRED_MEMORY=10
TIME_OUT=1200

# ── Test mode (uncomment to enable) ──────────────────────────
TEST_MODE_ARGS=(
    # --accuracy=True
    # --paddle_only=True
    # --paddle_cinn=True
    # --paddle_gpu_performance=True
    # --torch_gpu_performance=True
    # --paddle_torch_gpu_performance=True
    # --accuracy_stable=True
    --accuracy=True
    --manual_threshold_config_file="$FILE_THRESHOLD_CONFIG"
    # --test_amp=True
    # --test_cpu=True
    --use_cached_numpy=True
    # --atol=1e-2
    # --rtol=1e-2
    # --test_tol=True
    # --test_backward=True
)

# ============================================================
# ===== Run logic below; normally no need to modify it =======
# ============================================================

# The PID file lives next to the script, but the engine and tester/ are
# resolved relative to the current directory, so the script must be invoked
# from the project root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ ! -f "$ENGINE.py" || ! -d "tester" ]]; then
    echo "错误: 请在 PaddleAPITest 项目根目录执行此脚本"
    exit 1
fi
SCRIPT_NAME="${BASH_SOURCE[0]##*/}"
SCRIPT_NAME="${SCRIPT_NAME%.sh}"
PID_FILE="${SCRIPT_DIR}/.${SCRIPT_NAME}.pid"

# ── Operational sub-commands ──
case "${1:-}" in
    --stop)
        if [[ -f "$PID_FILE" ]]; then
            pid=$(cat "$PID_FILE")
            if kill -0 "$pid" 2>/dev/null; then
                # Kill entire process group (main + all workers). The trailing
                # `|| true` keeps `set -e` from aborting before the PID file
                # is removed when the process exits between the two checks.
                kill -- -"$pid" 2>/dev/null || kill "$pid" 2>/dev/null || true
                echo "已终止进程组 PGID=$pid"
            else
                echo "进程 PID=$pid 已不存在"
            fi
            rm -f "$PID_FILE"
        else
            echo "未找到 PID 文件,没有正在运行的任务"
        fi
        exit 0
        ;;
    --status)
        if [[ -f "$PID_FILE" ]]; then
            pid=$(cat "$PID_FILE")
            if kill -0 "$pid" 2>/dev/null; then
                echo -e "\033[32m运行中\033[0m PID=$pid 引擎=$ENGINE"
                # Number of child (worker) processes. pgrep exits 1 when there
                # are none, which under `pipefail` + `set -e` would abort the
                # script, hence the `|| true` inside the pipeline.
                children=$( { pgrep -P "$pid" 2>/dev/null || true; } | wc -l)
                echo " Worker 进程数: $children"
                # Elapsed run time; tolerate the process vanishing meanwhile.
                elapsed=$(ps -o etime= -p "$pid" 2>/dev/null | xargs || true)
                echo " 已运行: ${elapsed:-unknown}"
                # Most recent log file; `ls` fails when no log exists yet.
                log=$(ls -t "$LOG_DIR"/log_*.log 2>/dev/null | head -1 || true)
                if [[ -n "${log:-}" ]]; then
                    echo " 最新日志: $log"
                fi
            else
                echo -e "\033[31m已结束\033[0m PID=$pid (进程不存在)"
                rm -f "$PID_FILE"
            fi
        else
            echo "无运行记录(PID 文件不存在)"
        fi
        exit 0
        ;;
    --help|-h)
        echo "Usage: ./run.sh [--stop|--status|--help]"
        echo ""
        echo " (无参数) 启动测试任务"
        echo " --stop 终止后台任务"
        echo " --status 查看运行状态"
        echo ""
        echo "配置方法: 编辑脚本顶部变量,注释/取消注释切换参数"
        exit 0
        ;;
    "") ;;  # normal start
    *)
        echo "未知参数: $1 (使用 --help 查看帮助)"
        exit 1
        ;;
esac

# ── Guard against starting twice ──
if [[ -f "$PID_FILE" ]]; then
    old_pid=$(cat "$PID_FILE")
    if kill -0 "$old_pid" 2>/dev/null; then
        echo -e "\033[33m警告: 已有运行中的任务 PID=$old_pid\033[0m"
        echo "使用 ./run.sh --stop 终止后再启动,或删除 $PID_FILE 强制启动"
        exit 1
    fi
    # Stale PID file left by a run that is no longer alive.
    rm -f "$PID_FILE"
fi

# ── Assemble arguments ──
IN_OUT_ARGS=(
    --api_config_file="$FILE_INPUT"
    # --api_config_file_pattern="$FILE_PATTERN"
    --log_dir="$LOG_DIR"
)

PARALLEL_ARGS=(
    --num_gpus="$NUM_GPUS"
    --num_workers_per_gpu="$NUM_WORKERS_PER_GPU"
    --gpu_ids="$GPU_IDS"
    # --required_memory="$REQUIRED_MEMORY"
)

TIME_OUT_ARGS=(
    --timeout="$TIME_OUT"
)

SANITIZER_ARGS=(
    --use_compute_sanitizer="$USE_COMPUTE_SANITIZER"
    --sanitizer_command="$SANITIZER_COMMAND"
    --sanitizer_error_exitcode="$SANITIZER_ERROR_EXITCODE"
)

ALL_ARGS=(
    # TEST_MODE_ARGS is the one array users may comment out completely; the
    # `${arr[@]+...}` form keeps `set -u` from failing on an empty array in
    # bash versions older than 4.4.
    ${TEST_MODE_ARGS[@]+"${TEST_MODE_ARGS[@]}"}
    "${IN_OUT_ARGS[@]}"
    "${PARALLEL_ARGS[@]}"
    "${TIME_OUT_ARGS[@]}"
    "${SANITIZER_ARGS[@]}"
)

# ── Print the effective configuration ──
echo "── PaddleAPITest ──────────────────────────"
echo " 引擎: $ENGINE.py"
echo " 输入: $FILE_INPUT"
echo " 日志: $LOG_DIR"
echo " GPU: ids=$GPU_IDS workers/gpu=$NUM_WORKERS_PER_GPU"
echo " 超时: ${TIME_OUT}s"
echo " 模式: ${TEST_MODE_ARGS[*]:-<无>}"
echo " Sanitizer: enabled=$USE_COMPUTE_SANITIZER exitcode=$SANITIZER_ERROR_EXITCODE command='$SANITIZER_COMMAND'"
echo "────────────────────────────────────────────"

# ── Dry-run mode ──
if [[ "$DRY_RUN" == "true" ]]; then
    echo ""
    echo "[DRY-RUN] 最终命令:"
    # %q shell-quotes every argument so the printed command can be pasted
    # back verbatim (the sanitizer command contains spaces).
    printf ' python %q' "$ENGINE.py"
    printf ' %q' "${ALL_ARGS[@]}"
    printf '\n'
    exit 0
fi

# ── Create the log directory ──
mkdir -p "$LOG_DIR" || {
    echo "错误: 无法创建日志目录 '$LOG_DIR'"
    exit 1
}

LOG_FILE="$LOG_DIR/log_$(date +%Y%m%d_%H%M%S).log"

# ── Launch ──
if [[ "$FOREGROUND" == "true" ]]; then
    echo -e "\n\033[36m[前台模式] Ctrl+C 终止\033[0m"
    echo "日志同时写入: $LOG_FILE"
    echo ""
    # With `pipefail` the script's exit status reflects the engine's status,
    # not tee's.
    python "$ENGINE.py" "${ALL_ARGS[@]}" 2>&1 | tee "$LOG_FILE"
else
    # setsid gives the engine its own session so `--stop` can signal it via
    # a negative PID (process-group kill).
    nohup setsid python "$ENGINE.py" "${ALL_ARGS[@]}" >> "$LOG_FILE" 2>&1 &
    PYTHON_PID=$!
    echo "$PYTHON_PID" > "$PID_FILE"

    # Remove the PID file once the run ends on its own; if a newer run has
    # been started in the meantime, leave its PID file alone.
    (
        while kill -0 "$PYTHON_PID" 2>/dev/null; do
            sleep 5
        done

        recorded_pid="$(cat "$PID_FILE" 2>/dev/null || true)"
        if [[ "$recorded_pid" == "$PYTHON_PID" ]]; then
            rm -f "$PID_FILE"
        fi
    ) >/dev/null 2>&1 &

    # Give the engine a moment, then verify it did not die immediately.
    sleep 1
    if ! kill -0 "$PYTHON_PID" 2>/dev/null; then
        echo -e "\033[31m错误: $ENGINE 启动失败\033[0m"
        echo "查看日志: tail -50 $LOG_FILE"
        rm -f "$PID_FILE"
        exit 1
    fi

    echo ""
    echo -e "\033[32m启动成功! PID=$PYTHON_PID\033[0m"
    echo ""
    echo "常用操作:"
    echo " 查看状态: ./run.sh --status"
    echo " 终止任务: ./run.sh --stop"
    echo " 跟踪日志: tail -f $LOG_FILE"
    echo " GPU监控: watch -n 1 nvidia-smi"
    echo ""
    echo "进程已在后台运行,关闭终端不影响执行"
fi
def _load_manual_threshold_config(self, manual_threshold_config_file):
    """Load and validate per-API manual accuracy thresholds from a YAML file.

    The file is expected to hold a top-level ``manual_threshold_config``
    mapping of ``api_name -> [atol, rtol]``.

    Args:
        manual_threshold_config_file: Path of the YAML file. An empty value
            disables manual thresholds.

    Returns:
        A dict mapping API name to an ``(atol, rtol)`` tuple of floats.
        Empty when no file is given or the section is missing/empty.
        Entries whose value is null are omitted, so those APIs keep using
        the global tolerances.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the document, the section, or an entry does not have
            the expected shape. Failing here gives one clear error instead
            of an opaque failure later inside the comparison.
    """
    if not manual_threshold_config_file:
        return {}
    with open(manual_threshold_config_file, encoding="utf-8") as f:
        config = yaml.safe_load(f) or {}
    if not isinstance(config, dict):
        raise ValueError(
            f"{manual_threshold_config_file}: expected a YAML mapping at the "
            f"top level, got {type(config).__name__}"
        )
    raw_thresholds = config.get("manual_threshold_config") or {}
    if not isinstance(raw_thresholds, dict):
        raise ValueError(
            f"{manual_threshold_config_file}: 'manual_threshold_config' must "
            f"be a mapping, got {type(raw_thresholds).__name__}"
        )

    thresholds = {}
    for api_name, threshold in raw_thresholds.items():
        if threshold is None:
            continue
        try:
            # float() also repairs values such as ``1e-2``: PyYAML resolves
            # exponent notation without a decimal point as a string.
            thresholds[api_name] = (float(threshold[0]), float(threshold[1]))
        except (TypeError, ValueError, IndexError, KeyError) as err:
            raise ValueError(
                f"{manual_threshold_config_file}: threshold for {api_name!r} "
                f"must be [atol, rtol], got {threshold!r}"
            ) from err
    return thresholds


def get_manual_threshold(self):
    """Return the manual ``(atol, rtol)`` for the API under test, if any.

    For ``paddle._C_ops._run_custom_op`` the lookup key is
    ``self.paddle_args[0]`` (presumably the custom op's name; confirm
    against the caller) instead of the generic API name.

    Returns:
        A tuple ``(atol, rtol)`` of floats, or ``None`` when the loaded
        configuration has no entry (or a null entry) for this API.
    """
    api_name = self.api_config.api_name
    if api_name == "paddle._C_ops._run_custom_op":
        api_name = self.paddle_args[0]
    threshold = self.manual_threshold_config.get(api_name)
    if threshold is None:
        return None
    # Coerce here as well so a configuration assigned directly (not via the
    # loader) still yields numeric tolerances.
    return float(threshold[0]), float(threshold[1])


def get_atol(self):
    """Return the absolute tolerance: the manual override or ``self.atol``."""
    threshold = self.get_manual_threshold()
    return self.atol if threshold is None else threshold[0]


def get_rtol(self):
    """Return the relative tolerance: the manual override or ``self.rtol``."""
    threshold = self.get_manual_threshold()
    return self.rtol if threshold is None else threshold[1]