# diff --git a/clean.sh b/clean.sh
# new file mode 100644
# index 0000000..0040eed
# --- /dev/null
# +++ b/clean.sh
# @@ -0,0 +1,11 @@

# clean.sh - remove generated inputs, dump data and comparison outputs.
#
# All paths are relative to the directory the script is launched from.
#
# Every target is removed by its explicit path. Do not rewrite this as
# `cd DIR` followed by `rm -rf *`: when DIR is missing the `cd` fails, the
# shell stays where it is, and the wildcard then expands in (and wipes) the
# wrong directory.

# Contents of the generated-input directory (the directory itself is kept).
rm -rf FC_INPUT/*

# Dumped API information.
rm -rf dump_info/

# Forward and backward outputs (the directories themselves are kept).
rm -rf XPU/output/*
rm -rf XPU/output_backward/*

# Per-backend result directories created next to the apex scripts.
rm -rf paddleapex/apex/XPU
rm -rf paddleapex/apex/GPU

# diff --git a/paddleapex/apex/acc_direct_paddle.py b/paddleapex/apex/acc_direct_paddle.py
# new file mode 100644
# index 0000000..8bf3b62
# --- /dev/null
# +++ b/paddleapex/apex/acc_direct_paddle.py
# @@ -0,0 +1,249 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import csv
import argparse
import sys
import time
import paddle
import tqdm
import pandas as pd

import paddle.distributed as dist

from compare_utils.compare import Comparator
from compare_utils.compare_dependency import print_info_log, FileOpen

# The timestamp and the process rank make the result file names unique per run
# and per launched process.
current_time = time.strftime("%Y%m%d%H%M%S")
rank = dist.get_rank()

RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + "_" + str(rank) + ".csv"
DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + "_" + str(rank) + ".csv"

# Keyword arguments forwarded to tqdm.tqdm for the progress bar.
tqdm_params = {
    "smoothing": 0,  # smoothing of the remaining-time estimate, in [0, 1]
    "desc": "Processing",  # text shown in front of the bar
    "leave": True,  # keep the bar on screen once the iteration is finished
    "ncols": 75,  # fixed width of the bar
    "mininterval": 0.1,  # minimum number of seconds between refreshes
    "maxinterval": 1.0,  # maximum number of seconds between refreshes
    "miniters": 1,  # minimum number of iterations between refreshes
    "ascii": None,  # pick ASCII or Unicode characters automatically
    "unit": "it",  # iteration unit
    "unit_scale": True,  # scale the unit automatically
    "dynamic_ncols": True,  # adapt the bar width to the console
    "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",  # custom output layout
}


def _compare_parser(parser):
    """Register the command-line options of the comparison tool on *parser*.

    Args:
        parser: an ``argparse.ArgumentParser``; it is modified in place.

    The options are stored as ``bench_dir`` (required), ``device_dir``
    (required) and ``out_path`` (defaults to the empty string).
    """
    parser.add_argument(
        "-bench",
        "--benchmark",
        dest="bench_dir",
        type=str,
        help="The executed output api tensor path directory on BENCH",
        required=True,
    )
    parser.add_argument(
        "-device",
        "--device",
        dest="device_dir",
        type=str,
        help="The executed output api tensor path directory on DEVICE",
        required=True,
    )
    parser.add_argument(
        "-o",
        "--output_path",
        dest="out_path",
        default="",
        type=str,
        help=" The result out path",
    )


def compare_command(args):
    """Resolve all paths from the parsed arguments and run the comparison.

    Args:
        args: namespace providing ``bench_dir``, ``device_dir`` and
            ``out_path`` (see ``_compare_parser``).

    The output directory is created when missing; an empty ``out_path`` means
    the current directory.
    """
    out_path = os.path.realpath(args.out_path) if args.out_path else "./"
    os.makedirs(out_path, exist_ok=True)
    result_csv_path = os.path.join(out_path, RESULT_FILE_NAME)
    details_csv_path = os.path.join(out_path, DETAILS_FILE_NAME)
    print_info_log(f"Compare task result will be saved in {result_csv_path}")
    print_info_log(f"Compare task details will be saved in {details_csv_path}")

    # Every process reads the same "output" / "output_backward" directories.
    # A per-rank layout ("<dir>/rank_<rank>/output") is not used here.
    bench_dir = os.path.join(args.bench_dir, "./output")
    device_dir = os.path.join(args.device_dir, "./output")
    bench_back_dir = os.path.join(args.bench_dir, "./output_backward")
    device_back_dir = os.path.join(args.device_dir, "./output_backward")

    compare_device_bench(
        result_csv_path,
        details_csv_path,
        bench_dir,
        device_dir,
        out_path,
        bench_back_dir,
        device_back_dir,
    )


def compare_device_bench(
    result_csv_path,
    details_csv_path,
    bench_dir,
    device_dir,
    out_path,
    bench_grad_dir=None,
    device_grad_dir=None,
):
    """Compare every saved API output of the benchmark and device runs.

    The union of the file names found in *bench_dir* and *device_dir* is
    walked in sorted order. For each name the forward outputs and, when both
    gradient directories are given, the backward outputs are loaded with
    ``paddle.load`` and handed to ``compare_result``.

    Args:
        result_csv_path: path passed to ``Comparator`` and re-opened here to
            read its first row.
        details_csv_path: path passed to ``Comparator``.
        bench_dir: directory holding the benchmark forward outputs.
        device_dir: directory holding the device forward outputs.
        out_path: directory that receives ``compare_warning.txt``.
        bench_grad_dir: directory holding the benchmark backward outputs.
        device_grad_dir: directory holding the device backward outputs.

    Side effects:
        Writes ``log/rank<rank>_backward_output.csv`` and
        ``log/rank<rank>_forward_output.csv`` relative to the current working
        directory (the ``log`` directory is created when missing), and
        ``compare_warning.txt`` inside *out_path*.
    """
    Warning_list = []
    # The instance itself is not used below; it is built for its side effects.
    # NOTE(review): presumably this creates both CSV files with a header row
    # -- confirm against compare_utils.compare.Comparator.
    Comparator(result_csv_path, details_csv_path, False)
    with FileOpen(result_csv_path, "r") as file:
        csv_reader = csv.reader(file)
        # Consume the first row of the result file.
        next(csv_reader)

    api_pt_files_all = sorted(set(os.listdir(bench_dir)) | set(os.listdir(device_dir)))

    errors_forward_info = []
    errors_bacward_info = []
    for api_file in tqdm.tqdm(api_pt_files_all, **tqdm_params):
        bench_out_tensor, device_out_tensor = None, None
        bench_grad_tensor_list, device_grad_tensor_list = None, None
        # Best effort per file: one broken file must not stop the whole run.
        try:
            print("=" * 100)
            bench_pt_path = os.path.join(bench_dir, api_file)
            device_pt_path = os.path.join(device_dir, api_file)
            if os.path.exists(bench_pt_path) and os.path.exists(device_pt_path):
                print(f"Loading {bench_pt_path} & {device_pt_path}")
                # Each file holds a two-item sequence; only the second item
                # (the saved output) is compared.
                _, bench_out_tensor = paddle.load(bench_pt_path)
                _, device_out_tensor = paddle.load(device_pt_path)
            elif os.path.exists(bench_pt_path) or os.path.exists(device_pt_path):
                msg = f"{api_file} One framework has No output!"
                Warning_list.append(msg)
                print(msg)
                continue
            else:
                msg = f"{api_file} has no output, please refer to run_ut warning log info."
                Warning_list.append(msg)
                print(msg)
                continue
            print(bench_grad_dir)
            print(device_grad_dir)
            if bench_grad_dir and device_grad_dir:
                bench_grad_path = os.path.join(bench_grad_dir, api_file)
                device_grad_path = os.path.join(device_grad_dir, api_file)
                if os.path.exists(bench_grad_path) and os.path.exists(device_grad_path):
                    _, bench_grad_tensor_list = paddle.load(bench_grad_path)
                    _, device_grad_tensor_list = paddle.load(device_grad_path)
                    print(f"Loading {bench_grad_path} & {device_grad_path}")
                elif os.path.exists(bench_grad_path) or os.path.exists(
                    device_grad_path
                ):
                    msg = f"{api_file} One framework has No gard output!"
                    Warning_list.append(msg)
                    print(msg)
                else:
                    msg = f"{api_file} has no grad output, please refer to run_ut warning log info."
                    Warning_list.append(msg)
                    print(msg)

            error_i = []
            msg = f"{api_file} forward -------------"
            Warning_list.append(msg)
            print(msg)
            compare_result(bench_out_tensor, device_out_tensor, error_i, api_file + " forward")
            errors_forward_info.extend(error_i)

            error_i = []
            msg = f"{api_file} backward -------------"
            Warning_list.append(msg)
            print(msg)
            # When no gradients were loaded both arguments are None and
            # compare_result does nothing.
            compare_result(bench_grad_tensor_list, device_grad_tensor_list, error_i, api_file + " backward")
            errors_bacward_info.extend(error_i)
        except Exception as err:
            # Keep the failure, together with the file it belongs to, in the
            # warning log instead of only printing the bare exception.
            msg = f"{api_file} compare Error: {err}"
            Warning_list.append(msg)
            print(msg)

    # Worst pass rate (second tuple item) first.
    errors_bacward_info.sort(key=lambda x: x[1])
    errors_forward_info.sort(key=lambda x: x[1])
    columns = ["operator_name", "error<0.001", "bench_data", "device_data", "diff_index"]
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs("log", exist_ok=True)
    df = pd.DataFrame(errors_bacward_info, columns=columns)
    df.to_csv("log/rank" + str(dist.get_rank()) + "_backward_output.csv", index=False)
    df = pd.DataFrame(errors_forward_info, columns=columns)
    df.to_csv("log/rank" + str(dist.get_rank()) + "_forward_output.csv", index=False)

    warning_log_pth = os.path.join(out_path, "./compare_warning.txt")
    with open(warning_log_pth, "w") as warning_file:
        warning_file.writelines(item + "\n" for item in Warning_list)


def normalize_t(tensor0, tensor1):
    """Rescale two tensors with one shared scale so they stay comparable.

    Args:
        tensor0: first tensor (one-dimensional at the call site in this file).
        tensor1: second tensor, same layout as *tensor0*.

    Returns:
        A pair of tensors:
        * single-element input: both divided by the shared maximum, or
          returned unchanged when that maximum is zero;
        * all values identical: two tensors of ones;
        * otherwise: both min-max scaled with the shared minimum and maximum.
    """
    min_val0, min_val1 = paddle.min(tensor0), paddle.min(tensor1)
    max_val0, max_val1 = paddle.max(tensor0), paddle.max(tensor1)
    min_val = min(min_val0, min_val1)
    max_val = max(max_val0, max_val1)
    if len(tensor0) == 1:
        if max_val == 0:
            # Dividing by zero would turn equal zero outputs into NaN, which
            # can never satisfy a "diff < threshold" test.
            return tensor0, tensor1
        return tensor0 / max_val, tensor1 / max_val
    if min_val == max_val:
        return paddle.ones_like(tensor0), paddle.ones_like(tensor1)
    return (tensor0 - min_val) / (max_val - min_val), (tensor1 - min_val) / (max_val - min_val)


def compare_result(bench_output, device_output, errors, name):
    """Compare a benchmark output with a device output and record mismatches.

    Lists and tuples are walked pairwise and recursively. Tensors are
    flattened, normalised with ``normalize_t`` and compared element-wise.
    Any other type (for example ``None``) is ignored.

    Args:
        bench_output: tensor, or (nested) list/tuple of tensors, from the
            benchmark run.
        device_output: matching structure from the device run.
        errors: list extended in place with one tuple
            ``(name, pass_rate_below_0.001, bench_values, device_values,
            indices)`` per reported tensor; the values and indices are the
            (up to ten) largest differences, stored as strings.
        name: label stored in the first tuple item.

    A tensor is reported when fewer than all normalised differences are below
    0.001, and always when it has a single element.
    """
    if isinstance(bench_output, (list, tuple)):
        for b_out_i, n_out_i in zip(bench_output, device_output):
            compare_result(b_out_i, n_out_i, errors, name)
        return
    if not isinstance(bench_output, paddle.Tensor):
        return

    bench_output_o = bench_output.reshape([-1])
    device_output_o = device_output.reshape([-1])
    if len(bench_output_o) == 0:
        # Nothing to compare; min/max and the ratios below are undefined for
        # an empty tensor.
        return
    bench_output, device_output = normalize_t(bench_output_o, device_output_o)
    diff = paddle.cast((bench_output - device_output).abs(), "float")
    num = len(diff)
    # Share of elements whose normalised difference is below each threshold.
    diff005 = (diff < 0.05).sum() / num
    diff001 = (diff < 0.01).sum() / num
    diff0005 = (diff < 0.005).sum() / num
    diff0001 = (diff < 0.001).sum() / num
    diff00005 = (diff < 0.0005).sum() / num
    if diff0001 < 1 or len(bench_output) == 1:
        _, diff_index = paddle.topk(diff, k=min(10, num))
        error_info = diff0001.numpy()
        # Report the original (un-normalised) values at the worst positions.
        bench_n = paddle.cast(bench_output_o[diff_index], "float").numpy().tolist()
        device_n = paddle.cast(device_output_o[diff_index], "float").numpy().tolist()
        diff_index_n = diff_index.numpy().tolist()
        errors.append((name, error_info, str(bench_n), str(device_n), str(diff_index_n)))
        print("diff is too large---------------------------- erorr Erorr ERORR----------------------------")
        print("bench_output----------")
        print(bench_output_o[diff_index])
        print("device_output---------")
        print(device_output_o[diff_index])
        print("diff < 0.05: ", diff005.numpy())
        print("diff < 0.01: ", diff001.numpy())
        print("diff < 0.005: ", diff0005.numpy())
        print("diff < 0.001: ", diff0001.numpy())
        print("diff < 0.0005: ", diff00005.numpy())


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    _compare_parser(parser)
    args = parser.parse_args(sys.argv[1:])
    compare_command(args)

# Patch boundary kept from the original text (start of the next file):
# diff --git a/paddleapex/apex/combine_file.py b/paddleapex/apex/combine_file.py new file mode 100644 index 0000000..78dc668 --- /dev/null +++ b/paddleapex/apex/combine_file.py @@ -0,0 +1,45 @@
+import pandas as pd +import glob +import os + +# 定义包含 CSV 文件的目录 +csv_dir = 'log/' + +# 使用 glob 模块查找目录中所有的 CSV 文件 +csv_files = glob.glob(os.path.join(csv_dir, '*forward*.csv')) +dataframes = [] +for file in csv_files: + df = pd.read_csv(file) + dataframes.append(df) + +combined_df = pd.concat(dataframes, axis=0, ignore_index=True) + +# 假设所有 CSV 的列名和顺序相同,按第二列排序 +# 使用 iloc[:, 1] 获取第二列的列名 +second_column_name = combined_df.columns[1] +# 按第二列排序 +sorted_df = combined_df.sort_values(by=second_column_name) +# 输出排序后的 DataFrame +print(sorted_df) +# 可选:将排序后的 DataFrame 保存为新的 CSV 文件 +sorted_df.to_csv('sorted_combined_forward.csv', index=False) + + +# 使用 glob 模块查找目录中所有的 CSV 文件 +csv_files = glob.glob(os.path.join(csv_dir, '*backward*.csv')) +dataframes = [] +for file in csv_files: + df = pd.read_csv(file) + dataframes.append(df) + +combined_df = pd.concat(dataframes, axis=0, ignore_index=True) + +# 假设所有 CSV 的列名和顺序相同,按第二列排序 +# 使用 iloc[:, 1] 获取第二列的列名 +second_column_name = combined_df.columns[1] +# 按第二列排序 +sorted_df = combined_df.sort_values(by=second_column_name) +# 输出排序后的 DataFrame +print(sorted_df) +# 可选:将排序后的 DataFrame 保存为新的 CSV 文件 +sorted_df.to_csv('sorted_combined_backward.csv', index=False) diff --git a/paddleapex/apex/run_llama10b_xpu.sh b/paddleapex/apex/run_llama10b_xpu.sh new file mode 100644 index 0000000..875b1a4 --- /dev/null +++ b/paddleapex/apex/run_llama10b_xpu.sh @@ -0,0 +1,61 @@ +task_name_or_path="llama-10b" +export XPU_FORCE_USERMODE_LAUNCH=1 +export PYTHONPATH=$PYTHONPATH:/work/APEX/PaddleAPEX:/work/APEX/PaddleNLP +export XPUAPI_DEBUG=0x1 +runtime_location=/workspace/so-runtime +bkcl_location=/workspace/so-bkcl +export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:$LD_LIBRARY_PATH + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 + +export BKCL_ENABLE_XDR=1 +export 
BKCL_RDMA_FORCE_TREE=1 +export BKCL_TREE_THRESHOLD=0 +export BKCL_FORCE_L3_RDMA=0 +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +echo "bkcl version:" +strings ${bkcl_location}/libbkcl.so | grep COM +master_ip=$POD_0_IP +nnodes=$PADDLE_TRAINERS_NUM +echo "master ip:" +echo $master_ip + +export CUDA_DEVICE_MAX_CONNECTIONS=8 + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) +echo "PaddleNLP_DIR: "$PaddleNLP_DIR + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 + +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 +export XPU_PADDLE_FC_LOCAL_INT16=1 + + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_paddle.py \ + -json \ + "/work/APEX/PaddleNLP/dump_info/rank0_step5/forward_rank0_all.json /work/APEX/PaddleNLP/dump_info/rank1_step5/forward_rank1_all.json /work/APEX/PaddleNLP/dump_info/rank2_step5/forward_rank2_all.json /work/APEX/PaddleNLP/dump_info/rank3_step5/forward_rank3_all.json /work/APEX/PaddleNLP/dump_info/rank4_step5/forward_rank4_all.json /work/APEX/PaddleNLP/dump_info/rank5_step5/forward_rank5_all.json /work/APEX/PaddleNLP/dump_info/rank6_step5/forward_rank6_all.json /work/APEX/PaddleNLP/dump_info/rank7_step5/forward_rank7_all.json" \ + -backend xpu \ + -real \ + "/work/APEX/PaddleNLP/dump_info/rank0_step5/ /work/APEX/PaddleNLP/dump_info/rank1_step5/ /work/APEX/PaddleNLP/dump_info/rank2_step5/ /work/APEX/PaddleNLP/dump_info/rank3_step5/ /work/APEX/PaddleNLP/dump_info/rank4_step5/ /work/APEX/PaddleNLP/dump_info/rank5_step5/ /work/APEX/PaddleNLP/dump_info/rank6_step5/ /work/APEX/PaddleNLP/dump_info/rank7_step5/" \ + -out /work/APEX/PaddleNLP/result/ -mode acc -class 1 -dist 1 + diff --git a/paddleapex/apex/run_llama20b_xpu.sh b/paddleapex/apex/run_llama20b_xpu.sh new file mode 100644 index 0000000..cd24622 --- /dev/null +++ 
b/paddleapex/apex/run_llama20b_xpu.sh @@ -0,0 +1,77 @@ +#!/bin/bash +task_name_or_path="llama-20b" +#export XPUAPI_DEBUG=0x1 +#export XPURT_DISPATCH_MODE=PROFILING +export XPU_FORCE_USERMODE_LAUNCH=1 +export PYTHONPATH=$PYTHONPATH:/ssd3/zhouxiangquan/PaddleAPEX:/ssd3/zhouxiangquan/PaddleNLP + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 +export XPU_PADDLE_FC_LOCAL_INT16=1 + +# BKCL +# export BKCL_DEBUG=1 +# Multi-computer RDMA +#export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_FORCE_TREE=0 +export BKCL_TREE_THRESHOLD=0 +#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 +#export BKCL_SOCKET_IFNAME=eth0 +export BKCL_FORCE_L3_RDMA=0 + +export CUDA_DEVICE_MAX_CONNECTIONS=8 +export CUDA_DEVICE_ORDER=OAM_ID +export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 +export XPU_AUTO_BF16_TF32_RADIO=1 +export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 +export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 + + +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" acc_direct_cmp_zxq.py --bench /ssd3/zhouxiangquan/llama20b/GPU/ --device /ssd3/zhouxiangquan/llama20b/result/rank_0/ -o /ssd3/zhouxiangquan/llama20b/ +#python lot_t.py + +#python run_paddle.py -json /ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/test.json -backend xpu -real /ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_class.py \ + -json \ + "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/forward_rank0_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/forward_rank1_all.json 
/ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/forward_rank2_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/forward_rank3_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/forward_rank4_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/forward_rank5_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/forward_rank6_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/forward_rank7_all.json" \ + -backend xpu \ + -real \ + "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/" \ + -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc # -class 1 -dist 1 + +# +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ +# -json \ +# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/distributed.json" \ +# -backend xpu \ +# -real \ +# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/ 
/ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/" \ +# -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc +# +# +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_without_distributed.py \ +# -json \ +# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/common.json" \ +# -backend xpu \ +# -real \ +# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/" \ +# -out /ssd3/zhouxiangquan/llama20b/result/rank_0/ -mode acc +# + diff --git a/paddleapex/apex/run_moe_xpu.sh b/paddleapex/apex/run_moe_xpu.sh new file mode 100644 index 0000000..d62ae40 --- /dev/null +++ b/paddleapex/apex/run_moe_xpu.sh @@ -0,0 +1,46 @@ +ask_name_or_path="llama-moe" +export XPU_FORCE_USERMODE_LAUNCH=1 +export PYTHONPATH=$PYTHONPATH:/ssd3/zhouxiangquan/PaddleAPEX:/ssd3/zhouxiangquan/PaddleNLP + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 + +# BKCL +export BKCL_TREE_THRESHOLD=0 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" + +export CUDA_DEVICE_MAX_CONNECTIONS=8 +#export XPUAPI_DEBUG=0x1 +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) + +export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 +export XPU_PADDLE_FC_LOCAL_INT16=1 +# --resume_from_checkpoint "/workspace/mnt/moe_workspace/llama-moe-gpu-checkpoint-2" \ + + +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" acc_direct_paddle.py --bench /ssd3/zhouxiangquan/moe/GPU/ --device /ssd3/zhouxiangquan/moe/result/ -o /ssd3/zhouxiangquan/moe/ + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_paddle.py \ + -json \ + "/ssd3/zhouxiangquan/moe/dump_info/rank0_step0/forward_rank0_all.json 
/ssd3/zhouxiangquan/moe/dump_info/rank1_step0/forward_rank1_all.json /ssd3/zhouxiangquan/moe/dump_info/rank2_step0/forward_rank2_all.json /ssd3/zhouxiangquan/moe/dump_info/rank3_step0/forward_rank3_all.json /ssd3/zhouxiangquan/moe/dump_info/rank4_step0/forward_rank4_all.json /ssd3/zhouxiangquan/moe/dump_info/rank5_step0/forward_rank5_all.json /ssd3/zhouxiangquan/moe/dump_info/rank6_step0/forward_rank6_all.json /ssd3/zhouxiangquan/moe/dump_info/rank7_step0/forward_rank7_all.json" \ + -backend xpu \ + -real \ + "/ssd3/zhouxiangquan/moe/dump_info/rank0_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank1_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank2_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank3_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank4_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank5_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank6_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank7_step0/" \ + -out /ssd3/zhouxiangquan/moe/result/ -mode pro -class 1 -dist 1 + diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 2fa51b3..037f73d 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -12,18 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+#import paddlenlp # if you wanna test nlp fusion operations import argparse import os +from importlib import import_module import shutil import time import copy +import json +import yaml +import re from tqdm import tqdm +import pickle import paddle +import paddle.distributed as dist +from paddle.distributed import fleet from paddle import framework from paddle.base import core from utils import ( print_info_log, gen_api_params, + create_model, api_json_read, check_grad_list, rand_like, @@ -31,13 +40,35 @@ print_warn_log, ) +os.environ["USE_CASUAL_MASK"] = "True" + type_map = { "FP16": paddle.float16, "FP32": paddle.float32, "BF16": paddle.bfloat16, } + +yaml_path = "../api_tracer/configs/op_target.yaml" +f = open(yaml_path, "r") +Ops = yaml.safe_load(f) +target_op = Ops.get("target_op") +ignored_op = Ops.get("ignored_op") +target_class = Ops.get("target_class") +distributed_op = Ops.get("distributed_op") +if target_op is None: + target_op = [] +if ignored_op is None: + ignored_op = [] +if target_class is None: + target_class = [] +if distributed_op is None: + distributed_op = [] +f.close() + Warning_list = [] +IGNORED_LIST = ["paddle._C_ops.gaussian"] + current_time = time.strftime("%Y%m%d%H%M%S") tqdm_params = { @@ -54,7 +85,10 @@ "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 } -PROFILE_RUN_TIMES = 100 + +PROFILE_WARM_TIMES = 5 +PROFILE_RUN_TIMES = 5 + def recursive_delete_arg(arg_in): if isinstance(arg_in, (list, tuple)): @@ -107,9 +141,13 @@ def convert_out2fp32(arg_in): return flag, res elif isinstance(arg_in, paddle.Tensor): if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": - arg_in = arg_in.cast("float32") - flag = True - return flag, arg_in + try: + arg_in = arg_in.cast("float32") + flag = True + except Exception as err: + print(arg_in) + return False, arg_in + return flag, arg_in def recursive_arg_to_cpu(arg_in): @@ -137,11 +175,11 @@ def 
recursive_arg_to_device(arg_in, backend, enforce_dtype=None): arg_in = arg_in.cuda() if "cpu" in backend: arg_in = arg_in.cpu() - if arg_in.dtype.name == "BF16": + if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": arg_in = arg_in.cast("float32") else: arg_in = arg_in.to(backend) - if enforce_dtype and arg_in.dtype.name in ["BF16", "FP16", "FP32"]: + if enforce_dtype and arg_in.dtype.name in ["BF16", "BFLOAT16", "FP16", "FP32"]: arg_in = arg_in.cast(enforce_dtype) arg_in.stop_gradient = grad_status return arg_in @@ -150,6 +188,8 @@ def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): + if not dist.get_rank() == 0: + return if dtype_name == "": bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) fwd_output_dir = os.path.abspath(os.path.join(out_path, "output")) @@ -162,12 +202,28 @@ def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=" bwd_output_path = os.path.join(bwd_output_dir, api_call_name) os.makedirs(fwd_output_dir, exist_ok=True) os.makedirs(bwd_output_dir, exist_ok=True) - if not isinstance(forward_res, type(None)): - fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) - paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) - if not isinstance(backward_res, type(None)): - bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) - paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) + if isinstance(forward_res, (type(None), list, tuple, paddle.Tensor)): + try: + fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) + paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) + except Exception as err: + msg = "save_forward Error: %s" % str(err) + print_warn_log(msg) + return + else: + print(forward_res) + print_warn_log("forward_res not supported!") + if isinstance(backward_res, (type(None), list, tuple, paddle.Tensor)): + try: + bwd_BF16_flag, backward_res = 
convert_out2fp32(backward_res) + paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) + except Exception as err: + msg = "save_bacward Error: %s" % str(err) + print_warn_log(msg) + return + else: + print(backward_res) + print_warn_log("bacward_res not supported!") def evoke_related_test_func(test_mode): @@ -200,6 +256,9 @@ def ut_case_parsing(forward_content, cfg): for i, (api_call_name, api_info_dict) in enumerate( tqdm(forward_content.items(), **tqdm_params) ): + api_call_stack = api_call_name.rsplit("*")[0] + if api_call_stack in IGNORED_LIST: + continue if debug_mode and api_call_name not in debug_case: continue if len(multi_dtype_ut) > 0: @@ -213,7 +272,7 @@ def ut_case_parsing(forward_content, cfg): else: print(api_call_name) args = api_call_name, api_info_dict, backend, out_path - kwargs = {"enforce_dtype": None, "debug_case": debug_case} + kwargs = {"enforce_dtype": None, "debug_case": debug_case, "real_data_path": cfg.real_data} if isinstance(run_case_funcs, list): for run_case in run_case_funcs: run_case(*args, **kwargs) @@ -246,8 +305,8 @@ def run_forward(api_call_name, device_args, device_kwargs): api_call_stack = api_call_name.rsplit("*")[0] try: device_out = eval(api_call_stack)(*device_args, **device_kwargs) + paddle.device.synchronize() return device_out - except Exception as err: msg = f"Run API {api_call_name} Forward Error: %s" % str(err) print_warn_log(msg) @@ -274,10 +333,57 @@ def get_grad_tensor(args, kwargs): return device_grad_out -def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=None): +def get_need_grad_out(args): + device_grad_out = [] + if isinstance(args, paddle.Tensor): + device_grad_out.append(args) + if isinstance(args, (list, tuple)): + for x in args: + if isinstance(x, paddle.Tensor) and x.stop_gradient == False: + device_grad_out.append(x) + return device_grad_out + + +def print_tensor_name(args): + if isinstance(args, paddle.Tensor): + print(args.name) + if isinstance(args, (list, tuple)): + 
for x in args: + print_tensor_name(x) + + +def get_dout_sequence(dout_info_dict, order): + if isinstance(dout_info_dict, dict): + rel_data_path = dout_info_dict.get("real_data_path") + match = re.search(r'grad_(\d+)\.pt$', rel_data_path) + if match: + order.append(int(match.group(1))) + else: + print("match faile, check it!!!!!!") + elif isinstance(dout_info_dict, (list, tuple)): + for info in dout_info_dict: + get_dout_sequence(info, order) + else: + print("match faile, check it!!!!!!") + + +def reorder_dout(dout_info_dict, dout): + if dout_info_dict[0] == "Failed": + return dout + order = [] + get_dout_sequence(dout_info_dict, order) + ordered_out = [None] * len(dout) + for i in range(len(order)): + ordered_out[order[i]] = dout[i] + return ordered_out + + +def run_backward(dout_info_dict, api_call_name, device_out, dout, args, kwargs, need_backward=None): if need_backward: try: - paddle.autograd.backward([device_out], dout) + device_out = get_need_grad_out(device_out) + dout = reorder_dout(dout_info_dict, dout) + paddle.autograd.backward(device_out, dout) device_grad_out = get_grad_tensor(args, kwargs) device_grad_out = check_grad_list(device_grad_out) if device_grad_out is None: @@ -296,34 +402,71 @@ def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=No return None +def run_model_forward(model, device_args, device_kwargs): + try: + device_out = model(*device_args, **device_kwargs) + paddle.device.synchronize() + return device_out + except Exception as err: + msg = f"Run Forward Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return None + + def run_acc_case( api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None ): + api_call_stack = api_call_name.rsplit("*")[0] api_info_dict_copy = copy.deepcopy(api_info_dict) device_args, device_kwargs, need_backward = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) + api_info_dict_copy, 
backend, enforce_dtype, real_data_path) print(f"Running {api_call_name} acc test!") - if api_call_name in debug_case: + if api_call_name in debug_case: x = [device_args, device_kwargs] out_path = os.path.realpath(out_path) if out_path else "./" save_pth = os.path.join(out_path, "input_data", api_call_name) paddle.save(x, save_pth) - try: - device_out = run_forward(api_call_name, device_args, device_kwargs) - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return + # if this case is class + if api_call_stack in target_class: + if real_data_path == None: + msg = (f"Running {api_call_name} acc Failed! Don't support run class without real_data_path!") + print_warn_log(msg) + Warning_list.append(msg) + return + else: + try: + model = create_model(api_call_name.rsplit("*")[0], real_data_path + api_call_name) + device_out = run_model_forward(model, device_args, device_kwargs) + except Exception as err: + msg = "Run_class_forward Error: %s" % str(err) + print_warn_log(msg) + return + else: + try: + device_out = run_forward(api_call_name, device_args, device_kwargs) + if api_call_stack in distributed_op: + from paddle.base.libpaddle import task + if type(device_out) is task: + print('this is distributed op: ', api_call_name) + device_out = device_args + except Exception as err: + msg = "Run_op_forward Error: %s" % str(err) + print_warn_log(msg) + return + try: device_grad_out = [] - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - device_grad_out = run_backward( - api_call_name, device_out, dout, device_args, device_kwargs, need_backward - ) + if api_info_dict["dout_list"][0] != "Failed": + dout = create_dout( + api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path + ) + device_grad_out = run_backward( + api_info_dict["dout_list"], api_call_name, device_out, dout, device_args, device_kwargs, need_backward + ) + else: + device_grad_out = None except 
Exception as err: msg = "Run_backward Error: %s" % str(err) print_warn_log(msg) @@ -346,6 +489,7 @@ def run_acc_case( def run_profile_case( api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None ): + api_call_stack = api_call_name.rsplit("*")[0] print(f"Running {api_call_name} profile test!") api_info_dict_copy = copy.deepcopy(api_info_dict) device_args, device_kwargs, need_backward = create_input_args( @@ -356,56 +500,89 @@ def run_profile_case( out_path = os.path.realpath(out_path) if out_path else "./" save_pth = os.path.join(out_path, "input_data", api_call_name) paddle.save(x, save_pth) - # device warmming up - try: - device_out = run_forward(api_call_name, device_args, device_kwargs) - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - paddle.autograd.backward([device_out], dout) - except Exception as err: - msg = "Failed in device warming up: %s" % str(err) - print_warn_log(msg) - return + + if api_info_dict["dout_list"][0] == "Failed": + need_backward = False input_shape1 = get_shape(device_args) input_shape2 = get_shape(device_kwargs) input_shape_lst = merge_two_lists(input_shape1, input_shape2) - output_shape_lst = get_shape(device_out) + output_shape_lst = [] def profile_inner_loop_(): + is_model = False try: + if api_call_stack in target_class: + if real_data_path == None: + msg = (f"Running {api_call_name} acc Failed! 
Don't support run class without real_data_path!") + print_warn_log(msg) + Warning_list.append(msg) + return -1, -1, output_shape_lst + else: + model = create_model(api_call_name.rsplit("*")[0], real_data_path + api_call_name) + is_model = True paddle.device.synchronize() - fwd_start_time = time.time() - for _ in range(PROFILE_RUN_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) - paddle.device.synchronize() - fwd_end_time = time.time() + fwd_start_time = 0 + fwd_end_time = 0 + if is_model: + for _ in range(PROFILE_WARM_TIMES): + device_out = model(*device_args, **device_kwargs) + output_shape_lst = get_shape(device_out) + paddle.device.synchronize() + fwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + device_out = model(*device_args, **device_kwargs) + paddle.device.synchronize() + fwd_end_time = time.time() + else: + for _ in range(PROFILE_WARM_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + output_shape_lst = get_shape(device_out) + paddle.device.synchronize() + fwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + paddle.device.synchronize() + fwd_end_time = time.time() fwd_time = fwd_end_time - fwd_start_time fwd_time = fwd_time * 1000000 / float(PROFILE_RUN_TIMES) # fwd_time is in us except Exception as err: msg = "Run_forward Error: %s" % str(err) print_warn_log(msg) - return -1, -1 + return -1, -1, output_shape_lst try: if not need_backward: - return fwd_time, -1 + return fwd_time, -1, output_shape_lst + bwd_start_time = 0 + bwd_end_time = 0 + dout = create_dout(api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path) + dout = reorder_dout(api_info_dict["dout_list"], dout) + device_out_list = [] paddle.device.synchronize() - bwd_start_time = time.time() for _ in range(PROFILE_RUN_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) - 
paddle.autograd.backward([device_out], dout) + if is_model: + output = model(*device_args, **device_kwargs) + else: + output = run_forward(api_call_name, device_args, device_kwargs) + output = get_need_grad_out(output) + if len(output) == 0: + return fwd_time, -1, output_shape_lst + device_out_list.append(output) + + paddle.device.synchronize() + bwd_start_time = time.time() + for i in range(PROFILE_RUN_TIMES): + paddle.autograd.backward(device_out_list[i], dout) paddle.device.synchronize() bwd_end_time = time.time() bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us - bwd_time = bwd_time - fwd_time except Exception as err: msg = "Run_backward Error: %s" % str(err) print_warn_log(msg) - return fwd_time, -1 - return fwd_time, bwd_time + return fwd_time, -1, output_shape_lst + return fwd_time, bwd_time, output_shape_lst try: - fwd_time, bwd_time = profile_inner_loop_() + fwd_time, bwd_time, output_shape_lst = profile_inner_loop_() except Exception as err: msg = f"Run {api_call_name} profile Error: %s" % str(err) print_warn_log(msg) @@ -543,16 +720,138 @@ def arg_parser(parser): help="debug_op name", required=False, ) + parser.add_argument( + "-class", + "--class_op", + dest="test_class", + default=False, + type=bool, + help="test class op", + required=False, + ) + parser.add_argument( + "-class_type", + "--class_type", + dest="class_default_type", + default="bfloat16", + type=str, + help="the default type of class", + required=False, + ) + parser.add_argument( + "-dp", + "--dp_degree", + dest="dp_degree", + default=1, + type=int, + help="dp_degree", + required=False, + ) + parser.add_argument( + "-mp", + "--mp_degree", + dest="mp_degree", + default=8, + type=int, + help="mp_degree", + required=False, + ) + parser.add_argument( + "-pp", + "--pp_degree", + dest="pp_degree", + default=1, + type=int, + help="pp_degree", + required=False, + ) + parser.add_argument( + "-sd", + 
def check_json(json_list):
    """Check that all per-rank JSON dumps list the same ops in the same order.

    Args:
        json_list: Paths of JSON files whose top level is an object; the
            first file is the reference the others are compared against.

    Returns:
        True when every file has the same key sequence as the first one
        (also for an empty ``json_list``). False at the first difference,
        which is printed before returning.
    """
    key_lists = []
    for json_file in json_list:
        # The context manager closes the handle even when json.load raises.
        with open(json_file, 'r', encoding='utf-8') as f:
            key_lists.append(list(json.load(f).keys()))

    if not key_lists:
        return True

    reference = key_lists[0]
    others = key_lists[1:]

    # A rank with a different number of ops can never match; report it
    # instead of raising IndexError (shorter) or silently passing (longer).
    for rank, keys in enumerate(others, start=1):
        if len(keys) != len(reference):
            print(
                "op count: rank0: " + str(len(reference))
                + " rank" + str(rank) + ": " + str(len(keys))
            )
            return False

    for i, key in enumerate(reference):
        for rank, keys in enumerate(others, start=1):
            if keys[i] != key:
                print("op: rank0: " + str(key) + " rank" + str(rank) + ": " + str(keys[i]))
                return False
    return True
cfg.real_data = data_path_list[local_rank] + cfg.backend = cfg.backend + ":" + str(local_rank) + print(cfg) + + out_path = out_path + "/rank_" + str(local_rank) + "/" + if not os.path.exists(out_path): + os.makedirs(out_path, exist_ok=True) + cfg.out_path = out_path + + forward_content = api_json_read(cfg.json_path) ut_case_parsing(forward_content, cfg) print_info_log("UT save completed") warning_log_pth = os.path.join(out_path, "./warning_log.txt") diff --git a/paddleapex/apex/train_mmdit_xpu.sh b/paddleapex/apex/train_mmdit_xpu.sh new file mode 100644 index 0000000..fffa3cd --- /dev/null +++ b/paddleapex/apex/train_mmdit_xpu.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +mpi_rank=${OMPI_COMM_WORLD_RANK:-0} +node_rank=$((mpi_rank+offset)) +mpi_node=${OMPI_COMM_WORLD_SIZE:-1} +echo "MPI status:${mpi_rank}/${mpi_node}" +nnode_train=${nnode_set:-${mpi_node}} +master_train=${master:-localhost} + +echo "Distributed Training ${node_rank}/${nnode_train} master=${master_train}" +set -x + +nnodes=$PADDLE_TRAINERS_NUM +rank=$PADDLE_TRAINER_ID + +#source ./script/utils.sh +for name in `env | grep -E 'PADDLE|ENDPOINT' | awk -F'=' '{print $1}'`; do + unset ${name} +done + +unset PADDLE_ELASTIC_JOB_ID +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +unset FLAGS_START_PORT +unset PADDLE_ELASTIC_TIMEOUT +unset PADDLE_TRAINERS_NUM +unset PADDLE_TRAINER_ID +unset PADDLE_WORKERS_IP_PORT_LIST +unset PADDLE_TRAINERS +unset PADDLE_NUM_GRADIENT_SERVERS + +export XPU_FORCE_USERMODE_LAUNCH=1 + +runtime_location=/workspace/so-runtime +bkcl_location=/workspace/so-bkcl +fast_paddle_location=/workspace/so-fast_paddle +export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:${fast_paddle_location}/:$LD_LIBRARY_PATH + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 +#export 
XPUAPI_DEFAULT_SIZE0=1502653248 +#export XPUAPI_DEFAULT_SIZE1=380265324 +export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 + +# BKCL +# Multi-computer RDMA +export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_FORCE_TREE=1 +export BKCL_TREE_THRESHOLD=0 +export BKCL_FORCE_L3_RDMA=0 +export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 +export BKCL_SOCKET_IFNAME=eth0 +echo "bkcl version:" +strings ${bkcl_location}/libbkcl.so | grep COM + +export CUDA_DEVICE_MAX_CONNECTIONS=8 +export BKCL_FLAT_RING=1 + +master=`cat /root/paddlejob/workspace/hostfile | head -n 1 | awk '{print $1}'` +port=36677 + +export PYTHONPATH=/workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/PaddleAPEX:/workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/PaddleMIX/ppdiffusers:/workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/PaddleMIX/PaddleNLP:$PYTHONPATH + +tp2pp4=${tp2pp4:-"False"} +if [ ${tp2pp4} == "True" ];then + unset BKCL_RDMA_NICS + unset CUDA_DEVICE_ORDER + unset XPULINK_VISIBLE_DEVICES + + export CUDA_DEVICE_ORDER=OAM_ID + export XPULINK_VISIBLE_DEVICES=2,3,0,1,4,5,6,7 + export BKCL_RDMA_NICS=eth2,eth2,eth1,eth1,eth3,eth3,eth4,eth4 +fi + +export BKCL_USE_AR=1 +# export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +# accuracy improve: matmul with fp32 input will use fp32 to calc instead of using int16 +# export XPU_PADDLE_FC_LOCAL_INT16=1 +export XPU_AUTO_BF16_TF32=1 +export XPU_PADDLE_FC_TF32=1 + +# memory improve +# export XPU_INPLACE_SHARING_BF16_FP16_CACHE=1 + +export CUDA_DISABLE_PRINTF=1 +export BCCL_TRACE_HANG_ENABLE=1 +export BCCL_HANG_DETECT_INTERVAL=5 +export BCCL_UNIX_SOCKET_PATH=/var/run +export BCCL_ERROR_FILE=/root/paddlejob/workspace/log/err.%h.%p.log + +if [[ $rank -ge $nnodes ]]; then + exit 0 +fi + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +# open it when debug +export XPUAPI_DEBUG=0x1 +#export XPURT_DISPATCH_MODE="PROFILING" +#export GLOG_v=10 + +python -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_paddle.py -json \ + 
"/workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank0_step5/forward_rank0_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank1_step5/forward_rank1_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank2_step5/forward_rank2_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank3_step5/forward_rank3_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank4_step5/forward_rank4_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank5_step5/forward_rank5_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank6_step5/forward_rank6_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank7_step5/forward_rank7_all.json" \ + -backend xpu \ + -real \ + "/workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank0_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank1_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank2_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank3_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank4_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank5_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank6_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank7_step5/" \ + -out result_xpu/ -mode acc -class 1 -class_type float16 -dist 1 + diff --git a/paddleapex/apex/utils/__init__.py b/paddleapex/apex/utils/__init__.py index ed900ef..fb3d28a 100644 --- a/paddleapex/apex/utils/__init__.py +++ b/paddleapex/apex/utils/__init__.py @@ -29,7 +29,7 @@ seed_all, api_json_read, ) -from .data_generate import gen_api_params, rand_like, gen_args +from .data_generate import gen_api_params, create_model, rand_like, gen_args from .file_check_util import ( FileCheckException, FileChecker, diff --git 
def load_params(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE(review): ``pickle.load`` can execute arbitrary code from the file;
    this assumes the file was written by a trusted tracer run -- confirm
    before pointing it at dumps from elsewhere.
    """
    with open(filename, 'rb') as f:
        return pickle.load(f)


def create_model(api_call_stack, real_data_path):
    """Rebuild a dumped class instance and restore its weights.

    Args:
        api_call_stack: Dotted path ``package.module.ClassName`` of the
            class to instantiate.
        real_data_path: Path prefix of the dump; ``.init_params`` and
            ``.state_dict`` are appended to locate the two files.

    Returns:
        The constructed object with its state dict applied, or ``None`` when
        importing, constructing or ``set_state_dict`` fails (a warning is
        logged). Errors while reading the two dump files are not caught here.
    """
    init_path = real_data_path + ".init_params"
    state_path = real_data_path + ".state_dict"
    init_params = load_params(init_path)
    if isinstance(init_params, dict):
        # A name -> value mapping (the form a dict-producing dumper writes)
        # is passed entirely as keyword arguments.
        args, kwargs = [], init_params
    else:
        # Otherwise the payload is a two-item [args, kwargs] sequence.
        args, kwargs = init_params
    state_para = paddle.load(state_path)
    parent_package, class_n = api_call_stack.rsplit(".", maxsplit=1)
    try:
        MODULE = import_module(parent_package)
        class_model = getattr(MODULE, class_n)
        model = class_model(*args, **kwargs)
        model.set_state_dict(state_para)
        return model
    except Exception as err:
        # Print everything that went into the failed construction for debugging.
        print(init_path)
        print(args)
        print(kwargs)
        print(state_path)
        print(state_para)
        msg = "Create Model Error: %s" % str(err)
        print_warn_log(msg)
        return None


def create_config(api_call_stack, real_data_path):
    """Load the pickled config stored at ``real_data_path + '.config'``.

    ``api_call_stack`` is accepted for signature parity with
    ``create_model`` but is not used.
    """
    config_path = real_data_path + '.config'
    config = load_params(config_path)
    return config
data_pth) else: data = info.get("value") if info.get("type") == "slice": @@ -290,11 +335,11 @@ def rand_like(data, seed=1234): os.environ["PYTHONHASHSEED"] = str(seed) np.random.seed(seed) if isinstance(data, paddle.Tensor): - if data.dtype.name in ["BF16", "FP16"]: + if data.dtype.name in ["BF16", "FP16", "BFLOAT16", "FLOAT16"]: random_normals = numpy.random.randn(*data.shape) x = paddle.to_tensor(random_normals, dtype=data.dtype) return x - elif data.dtype.name in ["FP32", "FP64"]: + elif data.dtype.name in ["FP32", "FP64", "FLOAT32", "FLOAT64"]: random_normals = numpy.random.randn(*data.shape) x = paddle.to_tensor(random_normals, dtype=data.dtype) return x diff --git a/paddleapex/api_tracer/Dump.py b/paddleapex/api_tracer/Dump.py index 28787e4..826c31a 100644 --- a/paddleapex/api_tracer/Dump.py +++ b/paddleapex/api_tracer/Dump.py @@ -70,6 +70,7 @@ def __init__(self, mode="real_data", Async_save=cfg.Async_dump): self.rank = None self.dump_api_dict = None self.dump_api_dict_half = None + self.dump_api_dict_distributed = None self.dump_api_dict_other = None self.Async_save = Async_save @@ -103,11 +104,12 @@ def dump_real_data(self, api_args, tensor, rank): save_tensor(tensor, file_path) return f"{api_args}.pt" + """ Get Api_info dict, update self.dump_api_dict """ - def update_api_dict(self, api_info_dict, rank, is_half_precision = False): + def update_api_dict(self, api_info_dict, rank, is_half_precision = False, is_distributed = False): self.rank = rank if self.dump_api_dict is None: self.dump_api_dict = api_info_dict.copy() @@ -115,6 +117,11 @@ def update_api_dict(self, api_info_dict, rank, is_half_precision = False): self.dump_api_dict.update(api_info_dict) if cfg.split_dump: + if is_distributed: + if self.dump_api_dict_distributed is None: + self.dump_api_dict_distributed = api_info_dict.copy() + else: + self.dump_api_dict_distributed.update(api_info_dict) if is_half_precision: if self.dump_api_dict_half is None: self.dump_api_dict_half = api_info_dict.copy() 
@@ -148,11 +155,13 @@ def dump(self): write_json(directory, self.dump_api_dict, rank=self.rank, mode="forward", split_type="all") if cfg.split_dump: write_json(directory, self.dump_api_dict_half, rank=self.rank, mode="forward", split_type="half") + write_json(directory, self.dump_api_dict_distributed, rank=self.rank, mode="forward", split_type="distributed") write_json(directory, self.dump_api_dict_other, rank=self.rank, mode="forward", split_type="other") else: write_json(directory, self.dump_api_dict, rank=None, mode="forward", split_type="all") if cfg.split_dump: write_json(directory, self.dump_api_dict_half, rank=None, mode="forward", split_type="half") + write_json(directory, self.dump_api_dict_distributed, rank=None, mode="forward", split_type="distributed") write_json(directory, self.dump_api_dict_other, rank=None, mode="forward", split_type="other") diff --git a/paddleapex/api_tracer/Tracer.py b/paddleapex/api_tracer/Tracer.py index 9efe197..55abc98 100644 --- a/paddleapex/api_tracer/Tracer.py +++ b/paddleapex/api_tracer/Tracer.py @@ -16,19 +16,36 @@ from paddleapex.api_tracer.Dump import dump_util from paddleapex.api_tracer.wrap_op.hijack_tool import hijack_api from paddleapex.api_tracer.config import cfg - +from paddleapex.apex.utils import print_info_log class Tracer: - def __init__(self): + # def __init__(self): + # hijack_api() + + def register_op(self): hijack_api() def start(self): # Evoke stop implicity. - if cfg.dump_state: - dump_util.dump() + # dump_util.dump() # global step counting. 
def get_init_params(instance):
    """Recover constructor arguments of ``instance`` from its attributes.

    For every parameter of ``instance.__init__`` the value is the attribute
    of the same name on the instance when it exists, otherwise the
    parameter's declared default.

    NOTE(review): a parameter with neither a matching attribute nor a
    default (including ``*args`` / ``**kwargs``) maps to
    ``inspect.Parameter.empty`` -- confirm consumers can cope with that.

    Returns:
        dict mapping parameter name to the recovered value.
    """
    sig = signature(instance.__init__)
    return {
        param.name: getattr(instance, param.name, param.default)
        for param in sig.parameters.values()
        if param.name != 'self'
    }


def get_file_path(rank):
    """Return the dump directory ``<dump_root>/rank<rank>_step<global_step>``."""
    data_route = cfg.dump_root_path
    directory = os.path.join(data_route, f"rank{rank}_step{cfg.global_step}")
    return directory


def save_init_params(init_params, name, rank):
    """Pickle ``init_params`` to ``<dump dir>/<name>.init_params``."""
    directory = get_file_path(rank)
    # Nothing in this function's callers guarantees the directory exists yet.
    os.makedirs(directory, exist_ok=True)
    file_path = os.path.join(directory, f"{name}.init_params")
    with open(file_path, 'wb') as f:
        pickle.dump(init_params, f)


def save_weight(state_dict, name, rank):
    """Save ``state_dict`` with ``paddle.save`` to ``<dump dir>/<name>.state_dict``."""
    directory = get_file_path(rank)
    os.makedirs(directory, exist_ok=True)
    paddle.save(state_dict, os.path.join(directory, f"{name}.state_dict"))


def save_init_params_and_weight(init_params, state_dict, name, rank):
    """Write both the ``.init_params`` and ``.state_dict`` files for ``name``.

    Delegates to the two single-purpose savers so the file naming lives in
    one place.
    """
    save_init_params(init_params, name, rank)
    save_weight(state_dict, name, rank)
def update_output(self, output):
    """Tag a tensor output with the name of the op that produced it.

    Only ``paddle.Tensor`` outputs are tagged; any other output is ignored.

    Args:
        output: Value returned by the traced op.
    """
    if isinstance(output, paddle.Tensor):
        # Tag the argument itself; the previous code used the undefined name
        # `tensor`, which raised NameError for every tensor output.
        setattr(output, 'description', self.op_name)
    # NOTE: recording the outputs into "out_list" is not implemented yet.
def effi_analyze_tensor(self, arg):
    """Describe a tensor argument for the profile-mode dump.

    Always records type, dtype name, shape and ``stop_gradient``. In
    ``real_data`` mode the tensor is dumped to disk and its path recorded;
    otherwise the max/min values are recorded.

    Args:
        arg: The tensor to describe.

    Returns:
        dict with the recorded fields.
    """
    single_arg = {}
    single_arg.update({"type": "paddle.Tensor"})
    single_arg.update({"dtype": str(arg.dtype.name)})
    single_arg.update({"shape": arg.shape})
    single_arg.update({"stop_gradient": arg.stop_gradient})

    if self.mode == "real_data":
        api_args = self.op_name + "." + str(self.args_num)
        pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank)
        self.args_num += 1
        single_arg.update({"real_data_path": pt_path})
    else:
        try:
            with paddle.no_grad():
                max_ = paddle.max(arg).item()
                min_ = paddle.min(arg).item()
        except Exception:
            # Best effort: fall back to placeholder bounds when max/min
            # cannot be computed. `except Exception` keeps KeyboardInterrupt
            # and SystemExit propagating, unlike a bare `except:`.
            max_ = 1
            min_ = 0
        if cfg.dump_unique and arg.dtype.name != "BOOL":
            ori_max_ = max_
            ori_min_ = min_
            if math.isinf(ori_max_) or math.isnan(ori_max_):
                msg = "warning, for max_result, where is a inf or nan, need to notice"
                print(msg)
            if math.isinf(ori_min_) or math.isnan(ori_min_):
                msg = "warning, for min_result, where is a inf or nan, need to notice"
                print(msg)
            max_ = get_rounded_num(ori_max_, True)
            # A constant tensor keeps identical rounded bounds.
            min_ = get_rounded_num(ori_min_, False) if ori_min_ != ori_max_ else max_
        single_arg.update({"Max": max_})
        single_arg.update({"Max_origin": max_})
        single_arg.update({"Min": min_})
        single_arg.update({"Min_origin": min_})
    return single_arg
+ str(self.args_num) - pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank) - self.args_num += 1 - single_arg.update({"real_data_path": pt_path}) return single_arg def _analyze_builtin(self, arg): diff --git a/paddleapex/api_tracer/config.py b/paddleapex/api_tracer/config.py index 6c40c73..ebacaca 100644 --- a/paddleapex/api_tracer/config.py +++ b/paddleapex/api_tracer/config.py @@ -40,8 +40,8 @@ def __init__(self) -> None: print(f"You are using Apex Toolkit, Dump mode : {self.dump_mode}, Target step : {self.target_step}, profile mode : {self.profile_mode}") print("*" * 100) time.sleep(1) - self.global_step = -1 - self.dump_state = False + self.global_step = 0 + self.dump_state = True self.Op_count = {} self.prefix_op_name_ = None @@ -54,5 +54,18 @@ def new_step(self): self.Op_count = {} self.dump_state = False + def new_step_in_training(self, global_step): + if global_step in self.target_step: + self.global_step = global_step + self.Op_count = {} + self.dump_state = True + return True + return False + + def reset_step_in_training(self, global_step): + if global_step in self.target_step: + self.dump_state = False + return True + return False cfg = Config() diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index d05646b..38bb393 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -1,877 +1,966 @@ +target_class: + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXSafeConv3d + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXCausalConv3d + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXSpatialNorm3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXResnetBlock3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXDownBlock3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXMidBlock3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXUpBlock3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXEncoder3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXDecoder3D + 
- vaes.cogx_vae_ppdiffusers_ver_new.AutoencoderKLCogVideoX + # - paddlenlp.transformers.llama.modeling.LlamaLMHead + # - paddlenlp.transformers.llama.modeling.LlamaRMSNorm + # - paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaLinearScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaNTKScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaDynamicNTKScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.Llama3RotaryEmbedding + # - paddlenlp.transformers.llama.modeling.MoEAllToAll + # - paddlenlp.transformers.llama.modeling.MoEGateCombine + # - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler + # - paddlenlp.transformers.llama.modeling.LlamaMoEGate + # - paddlenlp.transformers.llama.modeling.LlamaMoEMLP + # - paddlenlp.transformers.llama.modeling.LlamaAttention + # - paddlenlp.transformers.llama.modeling.LlamaDecoderLayer + # - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss + # - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler + # - paddlenlp.transformers.llama.modeling.LlamaPretrainedModel + # - paddlenlp.transformers.llama.modeling.LlamaPretrainingCriterion + # - paddlenlp.transformers.llama.modeling.LlamaForCausalLM + # - paddlenlp.transformers.llama.modeling.LlamaMLP + # - paddlenlp.transformers.llama.modeling.LlamaModel ignored_op: - paddle._C_ops.max - paddle._C_ops.min - -target_op: - # Special op, paddle has wrapped op in framework. 
#noqa - - paddle._C_ops.layer_norm #noqa - - paddle.nn.functional.adaptive_avg_pool1d - - paddle.nn.functional.adaptive_avg_pool2d - - paddle.nn.functional.adaptive_avg_pool3d - - paddle.nn.functional.adaptive_max_pool1d - - paddle.nn.functional.adaptive_max_pool2d - - paddle.nn.functional.adaptive_max_pool3d - - paddle.nn.functional.affine_grid - - paddle.nn.functional.alpha_dropout - - paddle.nn.functional.avg_pool1d - - paddle.nn.functional.avg_pool2d - - paddle.nn.functional.avg_pool3d - - paddle.nn.functional.batch_norm - - paddle.nn.functional.bilinear - - paddle.nn.functional.binary_cross_entropy - - paddle.nn.functional.binary_cross_entropy_with_logits - - paddle.nn.functional.celu - - paddle.nn.functional.channel_shuffle - - paddle.nn.functional.class_center_sample - - paddle.nn.functional.common - - paddle.nn.functional.conv1d - - paddle.nn.functional.conv1d_transpose - - paddle.nn.functional.conv2d - - paddle.nn.functional.conv2d_transpose - - paddle.nn.functional.conv3d - - paddle.nn.functional.conv3d_transpose - - paddle.nn.functional.cosine_embedding_loss - - paddle.nn.functional.cosine_similarity - - paddle.nn.functional.cross_entropy - - paddle.nn.functional.ctc_loss - - paddle.nn.functional.diag_embed - - paddle.nn.functional.dice_loss - - paddle.nn.functional.distance - - paddle.nn.functional.dropout - - paddle.nn.functional.dropout2d - - paddle.nn.functional.dropout3d - - paddle.nn.functional.elu - - paddle.nn.functional.elu_ - # - paddle.nn.functional.embedding - - paddle.nn.functional.extension - - paddle.nn.functional.flash_attention - - paddle.nn.functional.flash_attention_with_sparse_mask - - paddle.nn.functional.fractional_max_pool2d - - paddle.nn.functional.fractional_max_pool3d - - paddle.nn.functional.fold - - paddle.nn.functional.gather_tree - - paddle.nn.functional.gaussian_nll_loss - - paddle.nn.functional.gelu - - paddle.nn.functional.glu - - paddle.nn.functional.grid_sample - - paddle.nn.functional.gumbel_softmax - - 
paddle.nn.functional.hardshrink - - paddle.nn.functional.hardsigmoid - - paddle.nn.functional.hardswish - - paddle.nn.functional.hardtanh - - paddle.nn.functional.hardtanh_ - - paddle.nn.functional.hinge_embedding_loss - - paddle.nn.functional.hsigmoid_loss - - paddle.nn.functional.instance_norm - - paddle.nn.functional.interpolate - - paddle.nn.functional.kl_div - - paddle.nn.functional.l1_loss - - paddle.nn.functional.label_smooth - - paddle.nn.functional.layer_norm - - paddle.nn.functional.leaky_relu - - paddle.nn.functional.leaky_relu_ - - paddle.nn.functional.linear - - paddle.nn.functional.local_response_norm - - paddle.nn.functional.log_loss - - paddle.nn.functional.log_sigmoid - - paddle.nn.functional.log_softmax - - paddle.nn.functional.margin_cross_entropy - - paddle.nn.functional.margin_ranking_loss - - paddle.nn.functional.max_pool1d - - paddle.nn.functional.max_pool2d - - paddle.nn.functional.max_pool3d - - paddle.nn.functional.max_unpool1d - - paddle.nn.functional.max_unpool2d - - paddle.nn.functional.max_unpool3d - - paddle.nn.functional.maxout - - paddle.nn.functional.mish - - paddle.nn.functional.mse_loss - - paddle.nn.functional.multi_label_soft_margin_loss - - paddle.nn.functional.multi_margin_loss - - paddle.nn.functional.nll_loss - - paddle.nn.functional.norm - - paddle.nn.functional.normalize - - paddle.nn.functional.npair_loss - - paddle.nn.functional.one_hot - - paddle.nn.functional.pad - - paddle.nn.functional.pairwise_distance - - paddle.nn.functional.pdist - - paddle.nn.functional.pixel_shuffle - - paddle.nn.functional.pixel_unshuffle - - paddle.nn.functional.poisson_nll_loss - - paddle.nn.functional.pooling - - paddle.nn.functional.prelu - - paddle.nn.functional.relu - - paddle.nn.functional.relu6 - - paddle.nn.functional.relu_ - - paddle.nn.functional.rnnt_loss - - paddle.nn.functional.rrelu - - paddle.nn.functional.scaled_dot_product_attention - - paddle.nn.functional.sdp_kernel - - paddle.nn.functional.selu - - 
paddle.nn.functional.sequence_mask - - paddle.nn.functional.sigmoid - - paddle.nn.functional.sigmoid_focal_loss - - paddle.nn.functional.silu - - paddle.nn.functional.smooth_l1_loss - - paddle.nn.functional.soft_margin_loss - - paddle.nn.functional.softmax - - paddle.nn.functional.softmax_ - - paddle.nn.functional.softmax_with_cross_entropy - - paddle.nn.functional.softplus - - paddle.nn.functional.softshrink - - paddle.nn.functional.softsign - - paddle.nn.functional.sparse_attention - - paddle.nn.functional.square_error_cost - - paddle.nn.functional.swish - - paddle.nn.functional.tanh - - paddle.nn.functional.tanh_ - - paddle.nn.functional.tanhshrink - - paddle.nn.functional.temporal_shift - - paddle.nn.functional.thresholded_relu - - paddle.nn.functional.thresholded_relu_ - - paddle.nn.functional.triplet_margin_loss - - paddle.nn.functional.triplet_margin_with_distance_loss - - paddle.nn.functional.unfold - - paddle.nn.functional.upsample - - paddle.nn.functional.zeropad2d - - paddle.abs - - paddle.abs_ - - paddle.acos - - paddle.acos_ - - paddle.acosh - - paddle.acosh_ - - paddle.add - - paddle.add_n - - paddle.addmm - - paddle.addmm_ - - paddle.all - - paddle.allclose - - paddle.amax - - paddle.amin - - paddle.angle - - paddle.any - - paddle.arange - - paddle.argmax - - paddle.argmin - - paddle.argsort - - paddle.as_complex - - paddle.as_real - - paddle.as_strided - - paddle.asin - - paddle.asin_ - - paddle.asinh - - paddle.asinh_ - - paddle.assign - - paddle.atan - - paddle.atan2 - - paddle.atan_ - - paddle.atanh - - paddle.atanh_ - - paddle.atleast_1d - - paddle.atleast_2d - - paddle.atleast_3d - - paddle.bernoulli - - paddle.bincount - - paddle.binomial - - paddle.bitwise_and - - paddle.bitwise_and_ - - paddle.bitwise_not - - paddle.bitwise_not_ - - paddle.bitwise_or - - paddle.bitwise_or_ - - paddle.bitwise_xor - - paddle.bitwise_xor_ - - paddle.bmm - - paddle.broadcast_shape - - paddle.broadcast_tensors - - paddle.broadcast_to - - paddle.cauchy_ - # - 
paddle.cast - - paddle.cdist - - paddle.ceil - - paddle.cholesky - - paddle.chunk - - paddle.clip - - paddle.column_stack - - paddle.combinations - - paddle.concat - - paddle.conj - - paddle.copysign - - paddle.copysign_ - - paddle.cos - - paddle.cos_ - - paddle.cosh - - paddle.cosh_ - - paddle.count_nonzero - - paddle.crop - - paddle.cross - - paddle.cummax - - paddle.cummin - - paddle.cumprod - - paddle.cumprod_ - - paddle.cumsum - - paddle.cumsum_ - - paddle.cumulative_trapezoid - - paddle.decomposition - - paddle.deg2rad - - paddle.diag - - paddle.diag_embed - - paddle.diagflat - - paddle.diagonal - - paddle.diagonal_scatter - - paddle.diff - - paddle.digamma - - paddle.digamma_ - - paddle.divide - - paddle.divide_ - - paddle.dot - - paddle.dsplit - - paddle.dstack - - paddle.eigvalsh - - paddle.einsum - paddle.empty - paddle.empty_like - - paddle.equal - - paddle.equal_all - - paddle.erf - - paddle.erf_ - - paddle.erfinv - - paddle.exp - - paddle.expand - - paddle.expand_as - - paddle.expm1 - - paddle.expm1_ - - paddle.eye - - paddle.fft - - paddle.flatten - - paddle.flatten_ - - paddle.flip - - paddle.floor - - paddle.floor_divide - - paddle.floor_divide_ - - paddle.floor_mod - - paddle.floor_mod_ - - paddle.fmax - - paddle.fmin - - paddle.frac - - paddle.frac_ - - paddle.frexp - - paddle.full - - paddle.full_like - - paddle.gather - - paddle.gather_nd - - paddle.gcd - - paddle.gcd_ - - paddle.greater_equal - - paddle.greater_equal_ - - paddle.greater_than - - paddle.greater_than_ - - paddle.heaviside - - paddle.histogram - - paddle.histogramdd - - paddle.hsplit - - paddle.hstack - - paddle.hypot - - paddle.hypot_ - - paddle.i0 - - paddle.i0_ - - paddle.i0e - - paddle.i1 - - paddle.i1e - - paddle.imag - - paddle.increment - - paddle.index_add - - paddle.index_add_ - - paddle.index_fill - - paddle.index_fill_ - - paddle.index_put - - paddle.index_put_ - - paddle.index_sample - - paddle.index_select - - paddle.inner - - paddle.kron - - paddle.kthvalue - - 
paddle.lcm - - paddle.lcm_ - - paddle.ldexp - - paddle.ldexp_ - - paddle.lerp - - paddle.less_equal - - paddle.less_equal_ - - paddle.less_than - - paddle.less_than_ - - paddle.lgamma - - paddle.lgamma_ - - paddle.linalg - - paddle.linspace - - paddle.log - - paddle.log10 - - paddle.log10_ - - paddle.log1p - - paddle.log1p_ - - paddle.log2 - - paddle.log2_ - - paddle.log_ - - paddle.logaddexp - - paddle.logcumsumexp - - paddle.logical_and - - paddle.logical_and_ - - paddle.logical_not - - paddle.logical_not_ - - paddle.logical_or - - paddle.logical_or_ - - paddle.logical_xor - - paddle.logical_xor_ - - paddle.logit - - paddle.logit_ - - paddle.logspace - - paddle.logsumexp - - paddle.masked_fill - - paddle.masked_fill_ - - paddle.masked_scatter - - paddle.masked_scatter_ - - paddle.masked_select - - paddle.matmul - - paddle.max - - paddle.maximum - - paddle.mean - - paddle.median - - paddle.meshgrid - - paddle.min - - paddle.minimum - - paddle.mm - - paddle.mod - - paddle.mod_ - - paddle.mode - - paddle.moveaxis - - paddle.multigammaln - - paddle.multigammaln_ - - paddle.multinomial - - paddle.multiplex - - paddle.multiply - - paddle.multiply_ - - paddle.mv - - paddle.nan_to_num - - paddle.nan_to_num_ - - paddle.nanmean - - paddle.nanmedian - - paddle.nanquantile - - paddle.nansum - - paddle.neg - - paddle.neg_ - - paddle.nextafter - - paddle.nonzero - - paddle.normal - - paddle.normal_ - - paddle.not_equal - - paddle.not_equal_ - - paddle.numel - - paddle.ones - - paddle.ones_like - - paddle.outer - - paddle.pdist - - paddle.poisson - - paddle.polar - - paddle.polygamma - - paddle.polygamma_ - - paddle.pow - - paddle.pow_ - - paddle.prod - - paddle.put_along_axis - - paddle.quantile - - paddle.rad2deg - - paddle.rand - - paddle.randint - - paddle.randint_like - - paddle.randn - - paddle.randperm - - paddle.reader - - paddle.real - - paddle.reciprocal - - paddle.regularizer - - paddle.remainder - - paddle.remainder_ - - paddle.renorm - - paddle.renorm_ - - 
paddle.repeat_interleave - paddle.reshape - paddle.reshape_ - - paddle.roll - - paddle.rot90 - - paddle.round - - paddle.row_stack - - paddle.rsqrt - - paddle.scale - - paddle.scatter - - paddle.scatter_ - # - paddle.scatter_nd # cause CUDA_ERROR ignored. - # - paddle.scatter_nd_add - - paddle.searchsorted - - paddle.select_scatter - - paddle.sgn - - paddle.shard_index - - paddle.sign - - paddle.signal - - paddle.signbit - - paddle.sin - - paddle.sin_ - - paddle.sinh - - paddle.sinh_ - - paddle.slice - # - paddle.slice_scatter - - paddle.sort - - paddle.split - - paddle.sqrt - - paddle.square - - paddle.square_ - - paddle.squeeze - - paddle.squeeze_ - - paddle.stack - - paddle.standard_gamma - - paddle.standard_normal - - paddle.stanh - - paddle.strided_slice - - paddle.subtract - - paddle.sum - - paddle.t - - paddle.t_ - - paddle.take - - paddle.take_along_axis - - paddle.tan - - paddle.tan_ - - paddle.tanh - - paddle.tanh_ - - paddle.tensordot - - paddle.tile - - paddle.topk - - paddle.trace - - paddle.transpose - - paddle.transpose_ - - paddle.trapezoid - - paddle.tril - - paddle.tril_ - - paddle.tril_indices - - paddle.triu - - paddle.triu_ - - paddle.triu_indices - - paddle.trunc - - paddle.trunc_ - - paddle.unbind - - paddle.unflatten - - paddle.unfold - - paddle.uniform - - paddle.unique - - paddle.unique_consecutive - paddle.unsqueeze - paddle.unsqueeze_ - - paddle.unstack - - paddle.vander - - paddle.var - - paddle.view - - paddle.view_as - - paddle.vsplit - - paddle.where - - paddle.where_ - - paddle.zeros - - paddle.zeros_like - # - paddle.Tensor.T - - paddle.Tensor.__add__ - - paddle.Tensor.__and__ - - paddle.Tensor.__radd__ - - paddle.Tensor.__div__ - - paddle.Tensor.__eq__ - - paddle.Tensor.__floordiv__ - - paddle.Tensor.__ge__ - - paddle.Tensor.__gt__ - - paddle.Tensor.__le__ - - paddle.Tensor.__lt__ - - paddle.Tensor.__matmul__ - - paddle.Tensor.__mod__ - - paddle.Tensor.__mul__ - - paddle.Tensor.__ne__ - - paddle.Tensor.__neg__ - - 
paddle.Tensor.__nonzero__ - - paddle.Tensor.__or__ - - paddle.Tensor.__pow__ - - paddle.Tensor.__radd__ - - paddle.Tensor.__rdiv__ - - paddle.Tensor.__rmul__ - - paddle.Tensor.__rpow__ - - paddle.Tensor.__rsub__ - - paddle.Tensor.__rtruediv__ - - paddle.Tensor.__sub__ - - paddle.Tensor.__truediv__ - - paddle.Tensor.__xor__ - - paddle.Tensor.abs - - paddle.Tensor.abs_ - - paddle.Tensor.acos - - paddle.Tensor.acos_ - - paddle.Tensor.acosh - - paddle.Tensor.acosh_ - - paddle.Tensor.add - - paddle.Tensor.add_ - - paddle.Tensor.add_n - - paddle.Tensor.addmm - - paddle.Tensor.addmm_ - - paddle.Tensor.all - - paddle.Tensor.allclose - - paddle.Tensor.amax - - paddle.Tensor.amin - - paddle.Tensor.angle - - paddle.Tensor.any - - paddle.Tensor.argmax - - paddle.Tensor.argmin - - paddle.Tensor.argsort - - paddle.Tensor.as_complex - - paddle.Tensor.as_real - - paddle.Tensor.as_strided - - paddle.Tensor.asin - - paddle.Tensor.asin_ - - paddle.Tensor.asinh - - paddle.Tensor.asinh_ - - paddle.Tensor.atan - - paddle.Tensor.atan2 - - paddle.Tensor.atan_ - - paddle.Tensor.atanh - - paddle.Tensor.atanh_ - - paddle.Tensor.atleast_1d - - paddle.Tensor.atleast_2d - - paddle.Tensor.atleast_3d - - paddle.Tensor.bincount - - paddle.Tensor.bitwise_and - - paddle.Tensor.bitwise_and_ - - paddle.Tensor.bitwise_not - - paddle.Tensor.bitwise_not_ - - paddle.Tensor.bitwise_or - - paddle.Tensor.bitwise_or_ - - paddle.Tensor.bitwise_xor - - paddle.Tensor.bitwise_xor_ - - paddle.Tensor.bmm - - paddle.Tensor.broadcast_shape - - paddle.Tensor.broadcast_tensors - - paddle.Tensor.broadcast_to - - paddle.Tensor.cauchy_ - - paddle.Tensor.cdist - - paddle.Tensor.ceil - - paddle.Tensor.ceil_ - - paddle.Tensor.cholesky - - paddle.Tensor.cholesky_solve - - paddle.Tensor.clip - - paddle.Tensor.clip_ - - paddle.Tensor.coalesce - - paddle.Tensor.cols - - paddle.Tensor.combinations - - paddle.Tensor.concat - - paddle.Tensor.cond - - paddle.Tensor.conj - - paddle.Tensor.contiguous - - paddle.Tensor.corrcoef - - 
paddle.Tensor.cos - - paddle.Tensor.cos_ - - paddle.Tensor.cosh - - paddle.Tensor.cosh_ - - paddle.Tensor.count_nonzero - - paddle.Tensor.cov - - paddle.Tensor.cross - - paddle.Tensor.crows - - paddle.Tensor.cummax - - paddle.Tensor.cummin - - paddle.Tensor.cumprod - - paddle.Tensor.cumprod_ - - paddle.Tensor.cumsum - - paddle.Tensor.cumsum_ - - paddle.Tensor.cumulative_trapezoid - - paddle.Tensor.deg2rad - - paddle.Tensor.diag - - paddle.Tensor.diag_embed - - paddle.Tensor.diagflat - - paddle.Tensor.diagonal - - paddle.Tensor.diagonal_scatter - - paddle.Tensor.diff - - paddle.Tensor.digamma - - paddle.Tensor.digamma_ - - paddle.Tensor.divide - - paddle.Tensor.divide_ - - paddle.Tensor.dot - - paddle.Tensor.eig - - paddle.Tensor.eigvals - - paddle.Tensor.eigvalsh - - paddle.Tensor.equal - - paddle.Tensor.equal_all - - paddle.Tensor.erf - - paddle.Tensor.erfinv - - paddle.Tensor.erfinv_ - - paddle.Tensor.exp - - paddle.Tensor.exp_ - - paddle.Tensor.expand - - paddle.Tensor.expand_as - - paddle.Tensor.expm1 - - paddle.Tensor.exponential_ - - paddle.Tensor.fill_ - - paddle.Tensor.fill_diagonal_ - - paddle.Tensor.fill_diagonal_tensor - - paddle.Tensor.fill_diagonal_tensor_ - - paddle.Tensor.flatten - - paddle.Tensor.flatten_ - - paddle.Tensor.flip - - paddle.Tensor.floor - - paddle.Tensor.floor_ - - paddle.Tensor.floor_divide - - paddle.Tensor.floor_divide_ - - paddle.Tensor.floor_mod - - paddle.Tensor.floor_mod_ - - paddle.Tensor.fmax - - paddle.Tensor.fmin - - paddle.Tensor.frac - - paddle.Tensor.frac_ - - paddle.Tensor.frexp - - paddle.Tensor.gather - - paddle.Tensor.gather_nd - - paddle.Tensor.gcd - - paddle.Tensor.gcd_ - - paddle.Tensor.get_selected_rows - - paddle.Tensor.get_strides - - paddle.Tensor.greater_equal - - paddle.Tensor.greater_equal_ - - paddle.Tensor.greater_than - - paddle.Tensor.greater_than_ - - paddle.Tensor.heaviside - - paddle.Tensor.histogram - - paddle.Tensor.histogramdd - - paddle.Tensor.hsplit - - paddle.Tensor.hypot - - 
paddle.Tensor.hypot_ - - paddle.Tensor.i0 - - paddle.Tensor.i0_ - - paddle.Tensor.i0e - - paddle.Tensor.i1 - - paddle.Tensor.i1e - - paddle.Tensor.imag - - paddle.Tensor.increment - - paddle.Tensor.index_add - - paddle.Tensor.index_add_ - - paddle.Tensor.index_fill - - paddle.Tensor.index_fill_ - - paddle.Tensor.index_put - - paddle.Tensor.index_put_ - - paddle.Tensor.index_sample - - paddle.Tensor.index_select - - paddle.Tensor.inner - - paddle.Tensor.kron - - paddle.Tensor.kthvalue - - paddle.Tensor.layout - - paddle.Tensor.lcm - - paddle.Tensor.lcm_ - - paddle.Tensor.ldexp - - paddle.Tensor.ldexp_ - - paddle.Tensor.lerp - - paddle.Tensor.lerp_ - - paddle.Tensor.less_equal - - paddle.Tensor.less_equal_ - - paddle.Tensor.less_than - - paddle.Tensor.less_than_ - - paddle.Tensor.lgamma - - paddle.Tensor.lgamma_ - - paddle.Tensor.log - - paddle.Tensor.log10 - - paddle.Tensor.log10_ - - paddle.Tensor.log1p - - paddle.Tensor.log1p_ - - paddle.Tensor.log2 - - paddle.Tensor.log2_ - - paddle.Tensor.log_ - - paddle.Tensor.logaddexp - - paddle.Tensor.logcumsumexp - - paddle.Tensor.logical_and - - paddle.Tensor.logical_and_ - - paddle.Tensor.logical_not - - paddle.Tensor.logical_not_ - - paddle.Tensor.logical_or - - paddle.Tensor.logical_or_ - - paddle.Tensor.logical_xor - - paddle.Tensor.logical_xor_ - - paddle.Tensor.logit - - paddle.Tensor.logit_ - - paddle.Tensor.logsumexp - - paddle.Tensor.lstsq - - paddle.Tensor.lu - - paddle.Tensor.lu_unpack - - paddle.Tensor.masked_fill - - paddle.Tensor.masked_fill_ - - paddle.Tensor.masked_select - - paddle.Tensor.masked_scatter - - paddle.Tensor.masked_scatter_ - - paddle.Tensor.matmul - - paddle.Tensor.matrix_power - - paddle.Tensor.max - - paddle.Tensor.maximum - - paddle.Tensor.mean - - paddle.Tensor.median - - paddle.Tensor.min - - paddle.Tensor.minimum - - paddle.Tensor.mm - - paddle.Tensor.mod - - paddle.Tensor.mod_ - - paddle.Tensor.mode - - paddle.Tensor.moveaxis - - paddle.Tensor.multi_dot - - paddle.Tensor.multigammaln - 
- paddle.Tensor.multigammaln_ - - paddle.Tensor.multinomial - - paddle.Tensor.multiplex - - paddle.Tensor.multiply - - paddle.Tensor.multiply_ - - paddle.Tensor.mv - - paddle.Tensor.nan_to_num - - paddle.Tensor.nan_to_num_ - - paddle.Tensor.nanmean - - paddle.Tensor.nanmedian - - paddle.Tensor.nanquantile - - paddle.Tensor.nansum - - paddle.Tensor.ndimension - - paddle.Tensor.neg - - paddle.Tensor.neg_ - - paddle.Tensor.nnz - - paddle.Tensor.nonzero - - paddle.Tensor.norm - - paddle.Tensor.normal_ - - paddle.Tensor.not_equal - - paddle.Tensor.not_equal_ - - paddle.Tensor.numel - - paddle.Tensor.offset - - paddle.Tensor.outer - - paddle.Tensor.pca_lowrank - - paddle.Tensor.pinv - - paddle.Tensor.polar - - paddle.Tensor.polygamma - - paddle.Tensor.polygamma_ - - paddle.Tensor.pow - - paddle.Tensor.pow_ - - paddle.Tensor.process_mesh - - paddle.Tensor.prod - - paddle.Tensor.put_along_axis - - paddle.Tensor.put_along_axis_ - - paddle.Tensor.qr - - paddle.Tensor.quantile - - paddle.Tensor.rad2deg - - paddle.Tensor.remainder - - paddle.Tensor.remainder_ - - paddle.Tensor.renorm - - paddle.Tensor.renorm_ - - paddle.Tensor.repeat_interleave - - paddle.Tensor.reverse - - paddle.Tensor.roll - - paddle.Tensor.rot90 - - paddle.Tensor.round - - paddle.Tensor.round_ - - paddle.Tensor.rows - - paddle.Tensor.rsqrt - - paddle.Tensor.rsqrt_ - - paddle.Tensor.scale - - paddle.Tensor.scale_ - - paddle.Tensor.scatter - - paddle.Tensor.scatter_ - # - paddle.Tensor.scatter_nd - # - paddle.Tensor.scatter_nd_add - - paddle.Tensor.select_scatter - - paddle.Tensor.sgn - - paddle.Tensor.shard_index - - paddle.Tensor.sigmoid - - paddle.Tensor.sigmoid_ - - paddle.Tensor.sign - - paddle.Tensor.sin - - paddle.Tensor.sin_ - - paddle.Tensor.sinh - - paddle.Tensor.sinh_ - - paddle.Tensor.size - - paddle.Tensor.slice - - paddle.Tensor.solve - - paddle.Tensor.sort - - paddle.Tensor.split - - paddle.Tensor.sqrt - - paddle.Tensor.sqrt_ - - paddle.Tensor.square - paddle.Tensor.squeeze - 
paddle.Tensor.squeeze_ - - paddle.Tensor.stack - - paddle.Tensor.stanh - - paddle.Tensor.std - - paddle.Tensor.stft - - paddle.Tensor.strided_slice - - paddle.Tensor.strides - - paddle.Tensor.subtract - - paddle.Tensor.subtract_ - - paddle.Tensor.sum - - paddle.Tensor.t - - paddle.Tensor.t_ - - paddle.Tensor.take - - paddle.Tensor.take_along_axis - - paddle.Tensor.tan - - paddle.Tensor.tan_ - - paddle.Tensor.tanh - - paddle.Tensor.tanh_ - - paddle.Tensor.tensordot - - paddle.Tensor.tile - - paddle.Tensor.top_p_sampling - - paddle.Tensor.topk - - paddle.Tensor.trace - - paddle.Tensor.transpose - - paddle.Tensor.transpose_ - - paddle.Tensor.trapezoid - - paddle.Tensor.tril - - paddle.Tensor.tril_ - - paddle.Tensor.triu - - paddle.Tensor.triu_ - - paddle.Tensor.trunc - - paddle.Tensor.trunc_ - - paddle.Tensor.unbind - - paddle.Tensor.unflatten - - paddle.Tensor.unfold - - paddle.Tensor.uniform_ - - paddle.Tensor.unique - - paddle.Tensor.unique_consecutive - paddle.Tensor.unsqueeze - paddle.Tensor.unsqueeze_ - - paddle.Tensor.unstack - - paddle.Tensor.vander - - paddle.Tensor.var - - paddle.Tensor.view - - paddle.Tensor.view_as - - paddle.Tensor.vsplit - - paddle.Tensor.where - - paddle.Tensor.where_ + - paddle.squeeze_ + - paddle.ones + - paddle.ones_like + - paddle.split - paddle.Tensor.zero_ - #### experiment op: - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy - - paddle._C_ops.fused_gemm_epilogue - - paddle._legacy_C_ops.fused_gemm_epilogue - - paddle.incubate.nn.functional.fused_multi_head_attention - - paddle.incubate.nn.functional.fused_feedforward - - paddle.incubate.nn.functional.fused_multi_transformer - - paddle.incubate.nn.functional.fused_linear - - paddle.incubate.nn.functional.fused_linear_activation - - paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm - - 
paddle.incubate.nn.functional.fused_ec_moe - - paddle.incubate.nn.functional.fused_dropout_add - - paddle.incubate.nn.functional.fused_rotary_position_embedding - - paddle.incubate.nn.functional.variable_length_memory_efficient_attention - - paddle.incubate.nn.functional.fused_rms_norm - - paddle.incubate.nn.functional.fused_layer_norm - - paddle.incubate.nn.functional.masked_multihead_attention - - paddle.incubate.nn.functional.block_multihead_attention - - paddle.incubate.nn.functional.swiglu - - paddle.incubate.nn.functional.fused_matmul_bias - - paddle.tensor.fill_constant - - paddle.nn.clip._squared_l2_norm - - paddle.uniform + - paddle.stack + - paddle.zeros + - paddle.zeros_like +distributed_op: + - paddle.distributed.communication.stream.alltoall_single + - paddle.distributed.barrier + - paddle.distributed.broadcast_object_list + - paddle.distributed.communication.stream.broadcast + - paddle.distributed.communication.stream.gather + - paddle.distributed.communication.stream.recv + - paddle.distributed.communication.stream.reduce + - paddle.distributed.communication.stream.reduce_scatter + - paddle.distributed.communication.stream.scatter + - paddle.distributed.communication.stream.send + - paddle.distributed.all_gather + - paddle.distributed.all_gather_object + - paddle.distributed.all_reduce + - paddle.distributed.alltoall + - paddle.distributed.alltoall_single + - paddle.distributed.broadcast + - paddle.distributed.communication.stream.all_gather + - paddle.distributed.communication.stream.all_reduce + - paddle.distributed.communication.stream.alltoall + - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity + - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table + - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy + - paddle.distributed.fleet.layers.mpu.mp_ops. 
+target_op: + - paddle.nn.functional.conv.conv1d + - paddle.nn.functional.conv.conv1d_transpose + - paddle.nn.functional.conv.conv2d + - paddle.nn.functional.conv.conv2d_transpose + - paddle.nn.functional.conv.conv3d + - paddle.nn.functional.conv.conv3d_transpose + # - paddle._C_ops.min + # - paddle._C_ops.min + # - paddle._C_ops.max + # - paddle.empty + # - paddle.empty_like + # - paddle.reshape + # - paddle.reshape_ + # - paddle.unsqueeze + # - paddle.unsqueeze_ + # - paddle.Tensor.squeeze + # - paddle.Tensor.squeeze_ + # - paddle.Tensor.unsqueeze + # - paddle.Tensor.unsqueeze_ + # - paddle.squeeze_ + # - paddle.ones + # - paddle.ones_like + # - paddle.split + # - paddle.Tensor.zero_ + # - paddle.stack + # - paddle.zeros + # - paddle.zeros_like + # - paddle.Tensor.__add__ + # - paddle._C_ops.layer_norm + # - paddle.multiply + # - paddle.multiply_ + # - paddle.Tensor.__mul__ + # - paddle.Tensor.__neg__ + # - paddle.Tensor.add_ + # - paddle._C_ops.adamw + # - paddle._C_ops.adamw_ + # - paddle.square_ + # - paddle.nn.functional.scaled_dot_product_attention + # - paddle._C_ops.layer_norm #noqa + # - paddle.nn.functional.adaptive_avg_pool1d + # - paddle.nn.functional.adaptive_avg_pool2d + # - paddle.nn.functional.adaptive_avg_pool3d + # - paddle.nn.functional.adaptive_max_pool1d + # - paddle.nn.functional.adaptive_max_pool2d + # - paddle.nn.functional.adaptive_max_pool3d + # - paddle.nn.functional.affine_grid + # - paddle.nn.functional.alpha_dropout + # - paddle.nn.functional.avg_pool1d + # - paddle.nn.functional.avg_pool2d + # - paddle.nn.functional.avg_pool3d + # - paddle.nn.functional.batch_norm + # - paddle.nn.functional.bilinear + # - paddle.nn.functional.binary_cross_entropy + # - paddle.nn.functional.binary_cross_entropy_with_logits + # - paddle.nn.functional.celu + # - paddle.nn.functional.channel_shuffle + # - paddle.nn.functional.class_center_sample + # - paddle.nn.functional.common + # - paddle.nn.functional.conv1d + # - 
paddle.nn.functional.conv1d_transpose + # - paddle.nn.functional.conv2d + # - paddle.nn.functional.conv2d_transpose + # - paddle.nn.functional.conv3d + # - paddle.nn.functional.conv3d_transpose + # - paddle.nn.functional.cosine_embedding_loss + # - paddle.nn.functional.cosine_similarity + # - paddle.nn.functional.cross_entropy + # - paddle.nn.functional.ctc_loss + # - paddle.nn.functional.diag_embed + # - paddle.nn.functional.dice_loss + # - paddle.nn.functional.distance + # - paddle.nn.functional.dropout + # - paddle.nn.functional.dropout2d + # - paddle.nn.functional.dropout3d + # - paddle.nn.functional.elu + # - paddle.nn.functional.elu_ + # - paddle.nn.functional.embedding + # - paddle.nn.functional.extension + # - paddle.nn.functional.flash_attention + # - paddle.nn.functional.flash_attention_with_sparse_mask + # - paddle.nn.functional.fractional_max_pool2d + # - paddle.nn.functional.fractional_max_pool3d + # - paddle.nn.functional.fold + # - paddle.nn.functional.gather_tree + # - paddle.nn.functional.gaussian_nll_loss + # - paddle.nn.functional.gelu + # - paddle.nn.functional.glu + # - paddle.nn.functional.grid_sample + # - paddle.nn.functional.gumbel_softmax + # - paddle.nn.functional.hardshrink + # - paddle.nn.functional.hardsigmoid + # - paddle.nn.functional.hardswish + # - paddle.nn.functional.hardtanh + # - paddle.nn.functional.hardtanh_ + # - paddle.nn.functional.hinge_embedding_loss + # - paddle.nn.functional.hsigmoid_loss + # - paddle.nn.functional.instance_norm + # - paddle.nn.functional.interpolate + # - paddle.nn.functional.kl_div + # - paddle.nn.functional.l1_loss + # - paddle.nn.functional.label_smooth + # - paddle.nn.functional.layer_norm + # - paddle.nn.functional.leaky_relu + # - paddle.nn.functional.leaky_relu_ + # - paddle.nn.functional.linear + # - paddle.nn.functional.local_response_norm + # - paddle.nn.functional.log_loss + # - paddle.nn.functional.log_sigmoid + # - paddle.nn.functional.log_softmax + # - 
paddle.nn.functional.margin_cross_entropy + # - paddle.nn.functional.margin_ranking_loss + # - paddle.nn.functional.max_pool1d + # - paddle.nn.functional.max_pool2d + # - paddle.nn.functional.max_pool3d + # - paddle.nn.functional.max_unpool1d + # - paddle.nn.functional.max_unpool2d + # - paddle.nn.functional.max_unpool3d + # - paddle.nn.functional.maxout + # - paddle.nn.functional.mish + # - paddle.nn.functional.mse_loss + # - paddle.nn.functional.multi_label_soft_margin_loss + # - paddle.nn.functional.multi_margin_loss + # - paddle.nn.functional.nll_loss + # - paddle.nn.functional.norm + # - paddle.nn.functional.normalize + # - paddle.nn.functional.npair_loss + # - paddle.nn.functional.one_hot + # - paddle.nn.functional.pad + # - paddle.nn.functional.pairwise_distance + # - paddle.nn.functional.pdist + # - paddle.nn.functional.pixel_shuffle + # - paddle.nn.functional.pixel_unshuffle + # - paddle.nn.functional.poisson_nll_loss + # - paddle.nn.functional.pooling + # - paddle.nn.functional.prelu + # - paddle.nn.functional.relu + # - paddle.nn.functional.relu6 + # - paddle.nn.functional.relu_ + # - paddle.nn.functional.rnnt_loss + # - paddle.nn.functional.rrelu + # - paddle.nn.functional.sdp_kernel + # - paddle.nn.functional.selu + # - paddle.nn.functional.sequence_mask + # - paddle.nn.functional.sigmoid + # - paddle.nn.functional.sigmoid_focal_loss + # - paddle.nn.functional.silu + # - paddle.nn.functional.smooth_l1_loss + # - paddle.nn.functional.soft_margin_loss + # - paddle.nn.functional.softmax + # - paddle.nn.functional.softmax_ + # - paddle.nn.functional.softmax_with_cross_entropy + # - paddle.nn.functional.softplus + # - paddle.nn.functional.softshrink + # - paddle.nn.functional.softsign + # - paddle.nn.functional.sparse_attention + # - paddle.nn.functional.square_error_cost + # - paddle.nn.functional.swish + # - paddle.nn.functional.tanh + # - paddle.nn.functional.tanh_ + # - paddle.nn.functional.tanhshrink + # - paddle.nn.functional.temporal_shift + # - 
paddle.nn.functional.thresholded_relu + # - paddle.nn.functional.thresholded_relu_ + # - paddle.nn.functional.triplet_margin_loss + # - paddle.nn.functional.triplet_margin_with_distance_loss + # - paddle.nn.functional.unfold + # - paddle.nn.functional.upsample + # - paddle.nn.functional.zeropad2d + # - paddle.abs + # - paddle.abs_ + # - paddle.acos + # - paddle.acos_ + # - paddle.acosh + # - paddle.acosh_ + # - paddle.add + # - paddle.add_n + # - paddle.addmm + # - paddle.addmm_ + # - paddle.all + # - paddle.allclose + # - paddle.amax + # - paddle.amin + # - paddle.angle + # - paddle.any + # - paddle.arange + # - paddle.argmax + # - paddle.argmin + # - paddle.argsort + # - paddle.as_complex + # - paddle.as_real + # - paddle.as_strided + # - paddle.asin + # - paddle.asin_ + # - paddle.asinh + # - paddle.asinh_ + # - paddle.assign + # - paddle.atan + # - paddle.atan2 + # - paddle.atan_ + # - paddle.atanh + # - paddle.atanh_ + # - paddle.atleast_1d + # - paddle.atleast_2d + # - paddle.atleast_3d + # - paddle.bernoulli + # - paddle.bincount + # - paddle.binomial + # - paddle.bitwise_and + # - paddle.bitwise_and_ + # - paddle.bitwise_not + # - paddle.bitwise_not_ + # - paddle.bitwise_or + # - paddle.bitwise_or_ + # - paddle.bitwise_xor + # - paddle.bitwise_xor_ + # - paddle.bmm + # - paddle.broadcast_shape + # - paddle.broadcast_tensors + # - paddle.broadcast_to + # - paddle.cauchy_ + # - paddle.cast + # - paddle.cdist + # - paddle.ceil + # - paddle.cholesky + # - paddle.chunk + # - paddle.clip + # - paddle.column_stack + # - paddle.combinations + # - paddle.concat + # - paddle.conj + # - paddle.copysign + # - paddle.copysign_ + # - paddle.cos + # - paddle.cos_ + # - paddle.cosh + # - paddle.cosh_ + # - paddle.count_nonzero + # - paddle.crop + # - paddle.cross + # - paddle.cummax + # - paddle.cummin + # - paddle.cumprod + # - paddle.cumprod_ + # - paddle.cumsum + # - paddle.cumsum_ + # - paddle.cumulative_trapezoid + # - paddle.decomposition + # - paddle.deg2rad + # - 
paddle.diag + # - paddle.diag_embed + # - paddle.diagflat + # - paddle.diagonal + # - paddle.diagonal_scatter + # - paddle.diff + # - paddle.digamma + # - paddle.digamma_ + # - paddle.divide + # - paddle.divide_ + # - paddle.dot + # - paddle.dsplit + # - paddle.dstack + # - paddle.eigvalsh + # - paddle.einsum + # - paddle.equal + # - paddle.equal_all + # - paddle.erf + # - paddle.erf_ + # - paddle.erfinv + # - paddle.exp + # - paddle.expand + # - paddle.expand_as + # - paddle.expm1 + # - paddle.expm1_ + # - paddle.eye + # - paddle.fft + # - paddle.flatten + # - paddle.flatten_ + # - paddle.flip + # - paddle.floor + # - paddle.floor_divide + # - paddle.floor_divide_ + # - paddle.floor_mod + # - paddle.floor_mod_ + # - paddle.fmax + # - paddle.fmin + # - paddle.frac + # - paddle.frac_ + # - paddle.frexp + # - paddle.full + # - paddle.full_like + # - paddle.gather + # - paddle.gather_nd + # - paddle.gcd + # - paddle.gcd_ + # - paddle.greater_equal + # - paddle.greater_equal_ + # - paddle.greater_than + # - paddle.greater_than_ + # - paddle.heaviside + # - paddle.histogram + # - paddle.histogramdd + # - paddle.hsplit + # - paddle.hstack + # - paddle.hypot + # - paddle.hypot_ + # - paddle.i0 + # - paddle.i0_ + # - paddle.i0e + # - paddle.i1 + # - paddle.i1e + # - paddle.imag + # - paddle.increment + # - paddle.index_add + # - paddle.index_add_ + # - paddle.index_fill + # - paddle.index_fill_ + # - paddle.index_put + # - paddle.index_put_ + # - paddle.index_sample + # - paddle.index_select + # - paddle.inner + # - paddle.kron + # - paddle.kthvalue + # - paddle.lcm + # - paddle.lcm_ + # - paddle.ldexp + # - paddle.ldexp_ + # - paddle.lerp + # - paddle.less_equal + # - paddle.less_equal_ + # - paddle.less_than + # - paddle.less_than_ + # - paddle.lgamma + # - paddle.lgamma_ + # - paddle.linalg + # - paddle.linspace + # - paddle.log + # - paddle.log10 + # - paddle.log10_ + # - paddle.log1p + # - paddle.log1p_ + # - paddle.log2 + # - paddle.log2_ + # - paddle.log_ + # - 
paddle.logaddexp + # - paddle.logcumsumexp + # - paddle.logical_and + # - paddle.logical_and_ + # - paddle.logical_not + # - paddle.logical_not_ + # - paddle.logical_or + # - paddle.logical_or_ + # - paddle.logical_xor + # - paddle.logical_xor_ + # - paddle.logit + # - paddle.logit_ + # - paddle.logspace + # - paddle.logsumexp + # - paddle.masked_fill + # - paddle.masked_fill_ + # - paddle.masked_scatter + # - paddle.masked_scatter_ + # - paddle.masked_select + # - paddle.matmul + # - paddle.max + # - paddle.maximum + # - paddle.mean + # - paddle.median + # - paddle.meshgrid + # - paddle.min + # - paddle.minimum + # - paddle.mm + # - paddle.mod + # - paddle.mod_ + # - paddle.mode + # - paddle.moveaxis + # - paddle.multigammaln + # - paddle.multigammaln_ + # - paddle.multinomial + # - paddle.multiplex + # - paddle.multiply + # - paddle.multiply_ + # - paddle.mv + # - paddle.nan_to_num + # - paddle.nan_to_num_ + # - paddle.nanmean + # - paddle.nanmedian + # - paddle.nanquantile + # - paddle.nansum + # - paddle.neg + # - paddle.neg_ + # - paddle.nextafter + # - paddle.nonzero + # - paddle.normal + # - paddle.normal_ + # - paddle.not_equal + # - paddle.not_equal_ + # - paddle.numel + # - paddle.outer + # - paddle.pdist + # - paddle.poisson + # - paddle.polar + # - paddle.polygamma + # - paddle.polygamma_ + # - paddle.pow + # - paddle.pow_ + # - paddle.prod + # - paddle.put_along_axis + # - paddle.quantile + # - paddle.rad2deg + # - paddle.rand + # - paddle.randint + # - paddle.randint_like + # - paddle.randn + # - paddle.randperm + # - paddle.reader + # - paddle.real + # - paddle.reciprocal + # - paddle.regularizer + # - paddle.remainder + # - paddle.remainder_ + # - paddle.renorm + # - paddle.renorm_ + # - paddle.repeat_interleave + # - paddle.roll + # - paddle.rot90 + # - paddle.round + # - paddle.row_stack + # - paddle.rsqrt + # - paddle.scale + # - paddle.scatter + # - paddle.scatter_ + # # - paddle.scatter_nd # cause CUDA_ERROR ignored. 
+ # # - paddle.scatter_nd_add + # - paddle.searchsorted + # - paddle.select_scatter + # - paddle.sgn + # - paddle.shard_index + # - paddle.sign + # - paddle.signal + # - paddle.signbit + # - paddle.sin + # - paddle.sin_ + # - paddle.sinh + # - paddle.sinh_ + # - paddle.slice + # # - paddle.slice_scatter + # - paddle.sort + # - paddle.sqrt + # - paddle.square + # - paddle.standard_gamma + # - paddle.standard_normal + # - paddle.stanh + # - paddle.strided_slice + # - paddle.subtract + # - paddle.sum + # - paddle.t + # - paddle.t_ + # - paddle.take + # - paddle.take_along_axis + # - paddle.tan + # - paddle.tan_ + # - paddle.tanh + # - paddle.tanh_ + # - paddle.tensordot + # - paddle.tile + # - paddle.topk + # - paddle.trace + # - paddle.transpose + # - paddle.transpose_ + # - paddle.trapezoid + # - paddle.tril + # - paddle.tril_ + # - paddle.tril_indices + # - paddle.triu + # - paddle.triu_ + # - paddle.triu_indices + # - paddle.trunc + # - paddle.trunc_ + # - paddle.unbind + # - paddle.unflatten + # - paddle.unfold + # - paddle.uniform + # - paddle.unique + # - paddle.unique_consecutive + # - paddle.unstack + # - paddle.vander + # - paddle.var + # - paddle.view + # - paddle.view_as + # - paddle.vsplit + # - paddle.where + # - paddle.where_ + # - paddle.zeros + # - paddle.zeros_like + # - paddle.Tensor.T + # - paddle.Tensor.__add__ + # - paddle.Tensor.__and__ + # - paddle.Tensor.__radd__ + # - paddle.Tensor.__div__ + # - paddle.Tensor.__eq__ + # - paddle.Tensor.__floordiv__ + # - paddle.Tensor.__ge__ + # - paddle.Tensor.__gt__ + # - paddle.Tensor.__le__ + # - paddle.Tensor.__lt__ + # - paddle.Tensor.__matmul__ + # - paddle.Tensor.__mod__ + # - paddle.Tensor.__mul__ + # - paddle.Tensor.__ne__ + # - paddle.Tensor.__neg__ + # - paddle.Tensor.__nonzero__ + # - paddle.Tensor.__or__ + # - paddle.Tensor.__pow__ + # - paddle.Tensor.__radd__ + # - paddle.Tensor.__rdiv__ + # - paddle.Tensor.__rmul__ + # - paddle.Tensor.__rpow__ + # - paddle.Tensor.__rsub__ + # - 
paddle.Tensor.__rtruediv__ + # - paddle.Tensor.__sub__ + # - paddle.Tensor.__truediv__ + # - paddle.Tensor.__xor__ + # - paddle.Tensor.abs + # - paddle.Tensor.abs_ + # - paddle.Tensor.acos + # - paddle.Tensor.acos_ + # - paddle.Tensor.acosh + # - paddle.Tensor.acosh_ + # - paddle.Tensor.add + # - paddle.Tensor.add_ + # - paddle.Tensor.add_n + # - paddle.Tensor.addmm + # - paddle.Tensor.addmm_ + # - paddle.Tensor.all + # - paddle.Tensor.allclose + # - paddle.Tensor.amax + # - paddle.Tensor.amin + # - paddle.Tensor.angle + # - paddle.Tensor.any + # - paddle.Tensor.argmax + # - paddle.Tensor.argmin + # - paddle.Tensor.argsort + # - paddle.Tensor.as_complex + # - paddle.Tensor.as_real + # - paddle.Tensor.as_strided + # - paddle.Tensor.asin + # - paddle.Tensor.asin_ + # - paddle.Tensor.asinh + # - paddle.Tensor.asinh_ + # - paddle.Tensor.atan + # - paddle.Tensor.atan2 + # - paddle.Tensor.atan_ + # - paddle.Tensor.atanh + # - paddle.Tensor.atanh_ + # - paddle.Tensor.atleast_1d + # - paddle.Tensor.atleast_2d + # - paddle.Tensor.atleast_3d + # - paddle.Tensor.bincount + # - paddle.Tensor.bitwise_and + # - paddle.Tensor.bitwise_and_ + # - paddle.Tensor.bitwise_not + # - paddle.Tensor.bitwise_not_ + # - paddle.Tensor.bitwise_or + # - paddle.Tensor.bitwise_or_ + # - paddle.Tensor.bitwise_xor + # - paddle.Tensor.bitwise_xor_ + # - paddle.Tensor.bmm + # - paddle.Tensor.broadcast_shape + # - paddle.Tensor.broadcast_tensors + # - paddle.Tensor.broadcast_to + # - paddle.Tensor.cauchy_ + # - paddle.Tensor.cdist + # - paddle.Tensor.ceil + # - paddle.Tensor.ceil_ + # - paddle.Tensor.cholesky + # - paddle.Tensor.cholesky_solve + # - paddle.Tensor.clip + # - paddle.Tensor.clip_ + # - paddle.Tensor.coalesce + # - paddle.Tensor.cols + # - paddle.Tensor.combinations + # - paddle.Tensor.concat + # - paddle.Tensor.cond + # - paddle.Tensor.conj + # - paddle.Tensor.contiguous + # - paddle.Tensor.corrcoef + # - paddle.Tensor.cos + # - paddle.Tensor.cos_ + # - paddle.Tensor.cosh + # - 
paddle.Tensor.cosh_ + # - paddle.Tensor.count_nonzero + # - paddle.Tensor.cov + # - paddle.Tensor.cross + # - paddle.Tensor.crows + # - paddle.Tensor.cummax + # - paddle.Tensor.cummin + # - paddle.Tensor.cumprod + # - paddle.Tensor.cumprod_ + # - paddle.Tensor.cumsum + # - paddle.Tensor.cumsum_ + # - paddle.Tensor.cumulative_trapezoid + # - paddle.Tensor.deg2rad + # - paddle.Tensor.diag + # - paddle.Tensor.diag_embed + # - paddle.Tensor.diagflat + # - paddle.Tensor.diagonal + # - paddle.Tensor.diagonal_scatter + # - paddle.Tensor.diff + # - paddle.Tensor.digamma + # - paddle.Tensor.digamma_ + # - paddle.Tensor.divide + # - paddle.Tensor.divide_ + # - paddle.Tensor.dot + # - paddle.Tensor.eig + # - paddle.Tensor.eigvals + # - paddle.Tensor.eigvalsh + # - paddle.Tensor.equal + # - paddle.Tensor.equal_all + # - paddle.Tensor.erf + # - paddle.Tensor.erfinv + # - paddle.Tensor.erfinv_ + # - paddle.Tensor.exp + # - paddle.Tensor.exp_ + # - paddle.Tensor.expand + # - paddle.Tensor.expand_as + # - paddle.Tensor.expm1 + # - paddle.Tensor.exponential_ + # - paddle.Tensor.fill_ + # - paddle.Tensor.fill_diagonal_ + # - paddle.Tensor.fill_diagonal_tensor + # - paddle.Tensor.fill_diagonal_tensor_ + # - paddle.Tensor.flatten + # - paddle.Tensor.flatten_ + # - paddle.Tensor.flip + # - paddle.Tensor.floor + # - paddle.Tensor.floor_ + # - paddle.Tensor.floor_divide + # - paddle.Tensor.floor_divide_ + # - paddle.Tensor.floor_mod + # - paddle.Tensor.floor_mod_ + # - paddle.Tensor.fmax + # - paddle.Tensor.fmin + # - paddle.Tensor.frac + # - paddle.Tensor.frac_ + # - paddle.Tensor.frexp + # - paddle.Tensor.gather + # - paddle.Tensor.gather_nd + # - paddle.Tensor.gcd + # - paddle.Tensor.gcd_ + # - paddle.Tensor.get_selected_rows + # - paddle.Tensor.get_strides + # - paddle.Tensor.greater_equal + # - paddle.Tensor.greater_equal_ + # - paddle.Tensor.greater_than + # - paddle.Tensor.greater_than_ + # - paddle.Tensor.heaviside + # - paddle.Tensor.histogram + # - paddle.Tensor.histogramdd + # 
- paddle.Tensor.hsplit + # - paddle.Tensor.hypot + # - paddle.Tensor.hypot_ + # - paddle.Tensor.i0 + # - paddle.Tensor.i0_ + # - paddle.Tensor.i0e + # - paddle.Tensor.i1 + # - paddle.Tensor.i1e + # - paddle.Tensor.imag + # - paddle.Tensor.increment + # - paddle.Tensor.index_add + # - paddle.Tensor.index_add_ + # - paddle.Tensor.index_fill + # - paddle.Tensor.index_fill_ + # - paddle.Tensor.index_put + # - paddle.Tensor.index_put_ + # - paddle.Tensor.index_sample + # - paddle.Tensor.index_select + # - paddle.Tensor.inner + # - paddle.Tensor.kron + # - paddle.Tensor.kthvalue + # - paddle.Tensor.layout + # - paddle.Tensor.lcm + # - paddle.Tensor.lcm_ + # - paddle.Tensor.ldexp + # - paddle.Tensor.ldexp_ + # - paddle.Tensor.lerp + # - paddle.Tensor.lerp_ + # - paddle.Tensor.less_equal + # - paddle.Tensor.less_equal_ + # - paddle.Tensor.less_than + # - paddle.Tensor.less_than_ + # - paddle.Tensor.lgamma + # - paddle.Tensor.lgamma_ + # - paddle.Tensor.log + # - paddle.Tensor.log10 + # - paddle.Tensor.log10_ + # - paddle.Tensor.log1p + # - paddle.Tensor.log1p_ + # - paddle.Tensor.log2 + # - paddle.Tensor.log2_ + # - paddle.Tensor.log_ + # - paddle.Tensor.logaddexp + # - paddle.Tensor.logcumsumexp + # - paddle.Tensor.logical_and + # - paddle.Tensor.logical_and_ + # - paddle.Tensor.logical_not + # - paddle.Tensor.logical_not_ + # - paddle.Tensor.logical_or + # - paddle.Tensor.logical_or_ + # - paddle.Tensor.logical_xor + # - paddle.Tensor.logical_xor_ + # - paddle.Tensor.logit + # - paddle.Tensor.logit_ + # - paddle.Tensor.logsumexp + # - paddle.Tensor.lstsq + # - paddle.Tensor.lu + # - paddle.Tensor.lu_unpack + # - paddle.Tensor.masked_fill + # - paddle.Tensor.masked_fill_ + # - paddle.Tensor.masked_select + # - paddle.Tensor.masked_scatter + # - paddle.Tensor.masked_scatter_ + # - paddle.Tensor.matmul + # - paddle.Tensor.matrix_power + # - paddle.Tensor.max + # - paddle.Tensor.maximum + # - paddle.Tensor.mean + # - paddle.Tensor.median + # - paddle.Tensor.min + # - 
paddle.Tensor.minimum + # - paddle.Tensor.mm + # - paddle.Tensor.mod + # - paddle.Tensor.mod_ + # - paddle.Tensor.mode + # - paddle.Tensor.moveaxis + # - paddle.Tensor.multi_dot + # - paddle.Tensor.multigammaln + # - paddle.Tensor.multigammaln_ + # - paddle.Tensor.multinomial + # - paddle.Tensor.multiplex + # - paddle.Tensor.multiply + # - paddle.Tensor.multiply_ + # - paddle.Tensor.mv + # - paddle.Tensor.nan_to_num + # - paddle.Tensor.nan_to_num_ + # - paddle.Tensor.nanmean + # - paddle.Tensor.nanmedian + # - paddle.Tensor.nanquantile + # - paddle.Tensor.nansum + # - paddle.Tensor.ndimension + # - paddle.Tensor.neg + # - paddle.Tensor.neg_ + # - paddle.Tensor.nnz + # - paddle.Tensor.nonzero + # - paddle.Tensor.norm + # - paddle.Tensor.normal_ + # - paddle.Tensor.not_equal + # - paddle.Tensor.not_equal_ + # - paddle.Tensor.numel + # - paddle.Tensor.offset + # - paddle.Tensor.outer + # - paddle.Tensor.pca_lowrank + # - paddle.Tensor.pinv + # - paddle.Tensor.polar + # - paddle.Tensor.polygamma + # - paddle.Tensor.polygamma_ + # - paddle.Tensor.pow + # - paddle.Tensor.pow_ + # - paddle.Tensor.process_mesh + # - paddle.Tensor.prod + # - paddle.Tensor.put_along_axis + # - paddle.Tensor.put_along_axis_ + # - paddle.Tensor.qr + # - paddle.Tensor.quantile + # - paddle.Tensor.rad2deg + # - paddle.Tensor.remainder + # - paddle.Tensor.remainder_ + # - paddle.Tensor.renorm + # - paddle.Tensor.renorm_ + # - paddle.Tensor.repeat_interleave + # - paddle.Tensor.reverse + # - paddle.Tensor.roll + # - paddle.Tensor.rot90 + # - paddle.Tensor.round + # - paddle.Tensor.round_ + # - paddle.Tensor.rows + # - paddle.Tensor.rsqrt + # - paddle.Tensor.rsqrt_ + # - paddle.Tensor.scale + # - paddle.Tensor.scale_ + # - paddle.Tensor.scatter + # - paddle.Tensor.scatter_ + # - paddle.Tensor.scatter_nd + # - paddle.Tensor.scatter_nd_add + # - paddle.Tensor.select_scatter + # - paddle.Tensor.sgn + # - paddle.Tensor.shard_index + # - paddle.Tensor.sigmoid + # - paddle.Tensor.sigmoid_ + # - 
paddle.Tensor.sign + # - paddle.Tensor.sin + # - paddle.Tensor.sin_ + # - paddle.Tensor.sinh + # - paddle.Tensor.sinh_ + # - paddle.Tensor.size + # - paddle.Tensor.slice + # - paddle.Tensor.solve + # - paddle.Tensor.sort + # - paddle.Tensor.split + # - paddle.Tensor.sqrt + # - paddle.Tensor.sqrt_ + # - paddle.Tensor.square + # - paddle.Tensor.stack + # - paddle.Tensor.stanh + # - paddle.Tensor.std + # - paddle.Tensor.stft + # - paddle.Tensor.strided_slice + # - paddle.Tensor.strides + # - paddle.Tensor.subtract + # - paddle.Tensor.subtract_ + # - paddle.Tensor.sum + # - paddle.Tensor.t + # - paddle.Tensor.t_ + # - paddle.Tensor.take + # - paddle.Tensor.take_along_axis + # - paddle.Tensor.tan + # - paddle.Tensor.tan_ + # - paddle.Tensor.tanh + # - paddle.Tensor.tanh_ + # - paddle.Tensor.tensordot + # - paddle.Tensor.tile + # - paddle.Tensor.top_p_sampling + # - paddle.Tensor.topk + # - paddle.Tensor.trace + # - paddle.Tensor.transpose + # - paddle.Tensor.transpose_ + # - paddle.Tensor.trapezoid + # - paddle.Tensor.tril + # - paddle.Tensor.tril_ + # - paddle.Tensor.triu + # - paddle.Tensor.triu_ + # - paddle.Tensor.trunc + # - paddle.Tensor.trunc_ + # - paddle.Tensor.unbind + # - paddle.Tensor.unflatten + # - paddle.Tensor.unfold + # - paddle.Tensor.uniform_ + # - paddle.Tensor.unique + # - paddle.Tensor.unique_consecutive + # - paddle.Tensor.unstack + # - paddle.Tensor.vander + # - paddle.Tensor.var + # - paddle.Tensor.view + # - paddle.Tensor.view_as + # - paddle.Tensor.vsplit + # - paddle.Tensor.where + # - paddle.Tensor.where_ + # - paddle._C_ops.fused_gemm_epilogue + # - paddle.optimizer.Adam + # - paddle.optimizer.AdamW + # - paddle._C_ops.adamw + # - paddle._C_ops.adamw_ + # - paddle._legacy_C_ops.fused_gemm_epilogue + # - paddle.incubate.nn.functional.fused_multi_head_attention + # - paddle.incubate.nn.functional.fused_feedforward + # - paddle.incubate.nn.functional.fused_multi_transformer + # - paddle.incubate.nn.functional.fused_linear + # - 
paddle.incubate.nn.functional.fused_linear_activation + # - paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm + # - paddle.incubate.nn.functional.fused_ec_moe + # - paddle.incubate.nn.functional.fused_dropout_add + # - paddle.incubate.nn.functional.fused_rotary_position_embedding + # - paddle.incubate.nn.functional.variable_length_memory_efficient_attention + # - paddle.incubate.nn.functional.fused_rms_norm + # - paddle.incubate.nn.functional.fused_layer_norm + # - paddle.incubate.nn.functional.masked_multihead_attention + # - paddle.incubate.nn.functional.block_multihead_attention + # - paddle.incubate.nn.functional.swiglu + # - paddle.incubate.nn.functional.fused_matmul_bias + # - paddle.tensor.fill_constant + # - paddle.nn.clip._squared_l2_norm + # - paddle.uniform # - paddle._C_ops.gaussian # - paddle._legacy_C_ops.c_identity - # - paddle.distributed.fleet.layers.mpu.mp_ops. - # - paddlenlp.transformers.llama.fusion_ops.fused_rotary_position_embedding - # - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention - # - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm - # - paddlenlp.transformers.llama.fusion_ops.fusion_rope - # - paddlenlp.transformers.llama.fusion_ops.swiglu +#fusion_ops: + - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention + - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm + - paddlenlp.transformers.llama.fusion_ops.fusion_rope diff --git a/paddleapex/api_tracer/configs/op_target.yaml.all b/paddleapex/api_tracer/configs/op_target.yaml.all new file mode 100644 index 0000000..5fcad26 --- /dev/null +++ b/paddleapex/api_tracer/configs/op_target.yaml.all @@ -0,0 +1,901 @@ +ignored_op: + - paddle._C_ops.max + - paddle._C_ops.min + +target_op: + # Special op, paddle has wrapped op in framework. 
#noqa + - paddle._C_ops.layer_norm #noqa + - paddle.nn.functional.adaptive_avg_pool1d + - paddle.nn.functional.adaptive_avg_pool2d + - paddle.nn.functional.adaptive_avg_pool3d + - paddle.nn.functional.adaptive_max_pool1d + - paddle.nn.functional.adaptive_max_pool2d + - paddle.nn.functional.adaptive_max_pool3d + - paddle.nn.functional.affine_grid + - paddle.nn.functional.alpha_dropout + - paddle.nn.functional.avg_pool1d + - paddle.nn.functional.avg_pool2d + - paddle.nn.functional.avg_pool3d + - paddle.nn.functional.batch_norm + - paddle.nn.functional.bilinear + - paddle.nn.functional.binary_cross_entropy + - paddle.nn.functional.binary_cross_entropy_with_logits + - paddle.nn.functional.celu + - paddle.nn.functional.channel_shuffle + - paddle.nn.functional.class_center_sample + - paddle.nn.functional.common + - paddle.nn.functional.conv1d + - paddle.nn.functional.conv1d_transpose + - paddle.nn.functional.conv2d + - paddle.nn.functional.conv2d_transpose + - paddle.nn.functional.conv3d + - paddle.nn.functional.conv3d_transpose + - paddle.nn.functional.cosine_embedding_loss + - paddle.nn.functional.cosine_similarity + - paddle.nn.functional.cross_entropy + - paddle.nn.functional.ctc_loss + - paddle.nn.functional.diag_embed + - paddle.nn.functional.dice_loss + - paddle.nn.functional.distance + - paddle.nn.functional.dropout + - paddle.nn.functional.dropout2d + - paddle.nn.functional.dropout3d + - paddle.nn.functional.elu + - paddle.nn.functional.elu_ + - paddle.nn.functional.embedding + - paddle.nn.functional.extension + - paddle.nn.functional.flash_attention + - paddle.nn.functional.flash_attention_with_sparse_mask + - paddle.nn.functional.fractional_max_pool2d + - paddle.nn.functional.fractional_max_pool3d + - paddle.nn.functional.fold + - paddle.nn.functional.gather_tree + - paddle.nn.functional.gaussian_nll_loss + - paddle.nn.functional.gelu + - paddle.nn.functional.glu + - paddle.nn.functional.grid_sample + - paddle.nn.functional.gumbel_softmax + - 
paddle.nn.functional.hardshrink + - paddle.nn.functional.hardsigmoid + - paddle.nn.functional.hardswish + - paddle.nn.functional.hardtanh + - paddle.nn.functional.hardtanh_ + - paddle.nn.functional.hinge_embedding_loss + - paddle.nn.functional.hsigmoid_loss + - paddle.nn.functional.instance_norm + - paddle.nn.functional.interpolate + - paddle.nn.functional.kl_div + - paddle.nn.functional.l1_loss + - paddle.nn.functional.label_smooth + - paddle.nn.functional.layer_norm + - paddle.nn.functional.leaky_relu + - paddle.nn.functional.leaky_relu_ + - paddle.nn.functional.linear + - paddle.nn.functional.local_response_norm + - paddle.nn.functional.log_loss + - paddle.nn.functional.log_sigmoid + - paddle.nn.functional.log_softmax + - paddle.nn.functional.margin_cross_entropy + - paddle.nn.functional.margin_ranking_loss + - paddle.nn.functional.max_pool1d + - paddle.nn.functional.max_pool2d + - paddle.nn.functional.max_pool3d + - paddle.nn.functional.max_unpool1d + - paddle.nn.functional.max_unpool2d + - paddle.nn.functional.max_unpool3d + - paddle.nn.functional.maxout + - paddle.nn.functional.mish + - paddle.nn.functional.mse_loss + - paddle.nn.functional.multi_label_soft_margin_loss + - paddle.nn.functional.multi_margin_loss + - paddle.nn.functional.nll_loss + - paddle.nn.functional.norm + - paddle.nn.functional.normalize + - paddle.nn.functional.npair_loss + - paddle.nn.functional.one_hot + - paddle.nn.functional.pad + - paddle.nn.functional.pairwise_distance + - paddle.nn.functional.pdist + - paddle.nn.functional.pixel_shuffle + - paddle.nn.functional.pixel_unshuffle + - paddle.nn.functional.poisson_nll_loss + - paddle.nn.functional.pooling + - paddle.nn.functional.prelu + - paddle.nn.functional.relu + - paddle.nn.functional.relu6 + - paddle.nn.functional.relu_ + - paddle.nn.functional.rnnt_loss + - paddle.nn.functional.rrelu + - paddle.nn.functional.scaled_dot_product_attention + - paddle.nn.functional.sdp_kernel + - paddle.nn.functional.selu + - 
paddle.nn.functional.sequence_mask + - paddle.nn.functional.sigmoid + - paddle.nn.functional.sigmoid_focal_loss + - paddle.nn.functional.silu + - paddle.nn.functional.smooth_l1_loss + - paddle.nn.functional.soft_margin_loss + - paddle.nn.functional.softmax + - paddle.nn.functional.softmax_ + - paddle.nn.functional.softmax_with_cross_entropy + - paddle.nn.functional.softplus + - paddle.nn.functional.softshrink + - paddle.nn.functional.softsign + - paddle.nn.functional.sparse_attention + - paddle.nn.functional.square_error_cost + - paddle.nn.functional.swish + - paddle.nn.functional.tanh + - paddle.nn.functional.tanh_ + - paddle.nn.functional.tanhshrink + - paddle.nn.functional.temporal_shift + - paddle.nn.functional.thresholded_relu + - paddle.nn.functional.thresholded_relu_ + - paddle.nn.functional.triplet_margin_loss + - paddle.nn.functional.triplet_margin_with_distance_loss + - paddle.nn.functional.unfold + - paddle.nn.functional.upsample + - paddle.nn.functional.zeropad2d + - paddle.abs + - paddle.abs_ + - paddle.acos + - paddle.acos_ + - paddle.acosh + - paddle.acosh_ + - paddle.add + - paddle.add_n + - paddle.addmm + - paddle.addmm_ + - paddle.all + - paddle.allclose + - paddle.amax + - paddle.amin + - paddle.angle + - paddle.any + - paddle.arange + - paddle.argmax + - paddle.argmin + - paddle.argsort + - paddle.as_complex + - paddle.as_real + - paddle.as_strided + - paddle.asin + - paddle.asin_ + - paddle.asinh + - paddle.asinh_ + - paddle.assign + - paddle.atan + - paddle.atan2 + - paddle.atan_ + - paddle.atanh + - paddle.atanh_ + - paddle.atleast_1d + - paddle.atleast_2d + - paddle.atleast_3d + - paddle.bernoulli + - paddle.bincount + - paddle.binomial + - paddle.bitwise_and + - paddle.bitwise_and_ + - paddle.bitwise_not + - paddle.bitwise_not_ + - paddle.bitwise_or + - paddle.bitwise_or_ + - paddle.bitwise_xor + - paddle.bitwise_xor_ + - paddle.bmm + - paddle.broadcast_shape + - paddle.broadcast_tensors + - paddle.broadcast_to + - paddle.cauchy_ + - 
paddle.cast + - paddle.cdist + - paddle.ceil + - paddle.cholesky + - paddle.chunk + - paddle.clip + - paddle.column_stack + - paddle.combinations + - paddle.concat + - paddle.conj + - paddle.copysign + - paddle.copysign_ + - paddle.cos + - paddle.cos_ + - paddle.cosh + - paddle.cosh_ + - paddle.count_nonzero + - paddle.crop + - paddle.cross + - paddle.cummax + - paddle.cummin + - paddle.cumprod + - paddle.cumprod_ + - paddle.cumsum + - paddle.cumsum_ + - paddle.cumulative_trapezoid + - paddle.decomposition + - paddle.deg2rad + - paddle.diag + - paddle.diag_embed + - paddle.diagflat + - paddle.diagonal + - paddle.diagonal_scatter + - paddle.diff + - paddle.digamma + - paddle.digamma_ + - paddle.divide + - paddle.divide_ + - paddle.dot + - paddle.dsplit + - paddle.dstack + - paddle.eigvalsh + - paddle.einsum + - paddle.empty + - paddle.empty_like + - paddle.equal + - paddle.equal_all + - paddle.erf + - paddle.erf_ + - paddle.erfinv + - paddle.exp + - paddle.expand + - paddle.expand_as + - paddle.expm1 + - paddle.expm1_ + - paddle.eye + - paddle.fft + - paddle.flatten + - paddle.flatten_ + - paddle.flip + - paddle.floor + - paddle.floor_divide + - paddle.floor_divide_ + - paddle.floor_mod + - paddle.floor_mod_ + - paddle.fmax + - paddle.fmin + - paddle.frac + - paddle.frac_ + - paddle.frexp + - paddle.full + - paddle.full_like + - paddle.gather + - paddle.gather_nd + - paddle.gcd + - paddle.gcd_ + - paddle.greater_equal + - paddle.greater_equal_ + - paddle.greater_than + - paddle.greater_than_ + - paddle.heaviside + - paddle.histogram + - paddle.histogramdd + - paddle.hsplit + - paddle.hstack + - paddle.hypot + - paddle.hypot_ + - paddle.i0 + - paddle.i0_ + - paddle.i0e + - paddle.i1 + - paddle.i1e + - paddle.imag + - paddle.increment + - paddle.index_add + - paddle.index_add_ + - paddle.index_fill + - paddle.index_fill_ + - paddle.index_put + - paddle.index_put_ + - paddle.index_sample + - paddle.index_select + - paddle.inner + - paddle.kron + - paddle.kthvalue + - 
paddle.lcm + - paddle.lcm_ + - paddle.ldexp + - paddle.ldexp_ + - paddle.lerp + - paddle.less_equal + - paddle.less_equal_ + - paddle.less_than + - paddle.less_than_ + - paddle.lgamma + - paddle.lgamma_ + - paddle.linalg + - paddle.linspace + - paddle.log + - paddle.log10 + - paddle.log10_ + - paddle.log1p + - paddle.log1p_ + - paddle.log2 + - paddle.log2_ + - paddle.log_ + - paddle.logaddexp + - paddle.logcumsumexp + - paddle.logical_and + - paddle.logical_and_ + - paddle.logical_not + - paddle.logical_not_ + - paddle.logical_or + - paddle.logical_or_ + - paddle.logical_xor + - paddle.logical_xor_ + - paddle.logit + - paddle.logit_ + - paddle.logspace + - paddle.logsumexp + - paddle.masked_fill + - paddle.masked_fill_ + - paddle.masked_scatter + - paddle.masked_scatter_ + - paddle.masked_select + - paddle.matmul + - paddle.max + - paddle.maximum + - paddle.mean + - paddle.median + - paddle.meshgrid + - paddle.min + - paddle.minimum + - paddle.mm + - paddle.mod + - paddle.mod_ + - paddle.mode + - paddle.moveaxis + - paddle.multigammaln + - paddle.multigammaln_ + - paddle.multinomial + - paddle.multiplex + - paddle.multiply + - paddle.multiply_ + - paddle.mv + - paddle.nan_to_num + - paddle.nan_to_num_ + - paddle.nanmean + - paddle.nanmedian + - paddle.nanquantile + - paddle.nansum + - paddle.neg + - paddle.neg_ + - paddle.nextafter + - paddle.nonzero + - paddle.normal + - paddle.normal_ + - paddle.not_equal + - paddle.not_equal_ + - paddle.numel + - paddle.ones + - paddle.ones_like + - paddle.outer + - paddle.pdist + - paddle.poisson + - paddle.polar + - paddle.polygamma + - paddle.polygamma_ + - paddle.pow + - paddle.pow_ + - paddle.prod + - paddle.put_along_axis + - paddle.quantile + - paddle.rad2deg + - paddle.rand + - paddle.randint + - paddle.randint_like + - paddle.randn + - paddle.randperm + - paddle.reader + - paddle.real + - paddle.reciprocal + - paddle.regularizer + - paddle.remainder + - paddle.remainder_ + - paddle.renorm + - paddle.renorm_ + - 
paddle.repeat_interleave + - paddle.reshape + - paddle.reshape_ + - paddle.roll + - paddle.rot90 + - paddle.round + - paddle.row_stack + - paddle.rsqrt + - paddle.scale + - paddle.scatter + - paddle.scatter_ + # - paddle.scatter_nd # cause CUDA_ERROR ignored. + # - paddle.scatter_nd_add + - paddle.searchsorted + - paddle.select_scatter + - paddle.sgn + - paddle.shard_index + - paddle.sign + - paddle.signal + - paddle.signbit + - paddle.sin + - paddle.sin_ + - paddle.sinh + - paddle.sinh_ + - paddle.slice + # - paddle.slice_scatter + - paddle.sort + - paddle.split + - paddle.sqrt + - paddle.square + - paddle.square_ + - paddle.squeeze + - paddle.squeeze_ + - paddle.stack + - paddle.standard_gamma + - paddle.standard_normal + - paddle.stanh + - paddle.strided_slice + - paddle.subtract + - paddle.sum + - paddle.t + - paddle.t_ + - paddle.take + - paddle.take_along_axis + - paddle.tan + - paddle.tan_ + - paddle.tanh + - paddle.tanh_ + - paddle.tensordot + - paddle.tile + - paddle.topk + - paddle.trace + - paddle.transpose + - paddle.transpose_ + - paddle.trapezoid + - paddle.tril + - paddle.tril_ + - paddle.tril_indices + - paddle.triu + - paddle.triu_ + - paddle.triu_indices + - paddle.trunc + - paddle.trunc_ + - paddle.unbind + - paddle.unflatten + - paddle.unfold + - paddle.uniform + - paddle.unique + - paddle.unique_consecutive + - paddle.unsqueeze + - paddle.unsqueeze_ + - paddle.unstack + - paddle.vander + - paddle.var + - paddle.view + - paddle.view_as + - paddle.vsplit + - paddle.where + - paddle.where_ + - paddle.zeros + - paddle.zeros_like + # - paddle.Tensor.T + - paddle.Tensor.__add__ + - paddle.Tensor.__and__ + - paddle.Tensor.__radd__ + - paddle.Tensor.__div__ + - paddle.Tensor.__eq__ + - paddle.Tensor.__floordiv__ + - paddle.Tensor.__ge__ + - paddle.Tensor.__gt__ + - paddle.Tensor.__le__ + - paddle.Tensor.__lt__ + - paddle.Tensor.__matmul__ + - paddle.Tensor.__mod__ + - paddle.Tensor.__mul__ + - paddle.Tensor.__ne__ + - paddle.Tensor.__neg__ + - 
paddle.Tensor.__nonzero__ + - paddle.Tensor.__or__ + - paddle.Tensor.__pow__ + - paddle.Tensor.__radd__ + - paddle.Tensor.__rdiv__ + - paddle.Tensor.__rmul__ + - paddle.Tensor.__rpow__ + - paddle.Tensor.__rsub__ + - paddle.Tensor.__rtruediv__ + - paddle.Tensor.__sub__ + - paddle.Tensor.__truediv__ + - paddle.Tensor.__xor__ + - paddle.Tensor.abs + - paddle.Tensor.abs_ + - paddle.Tensor.acos + - paddle.Tensor.acos_ + - paddle.Tensor.acosh + - paddle.Tensor.acosh_ + - paddle.Tensor.add + - paddle.Tensor.add_ + - paddle.Tensor.add_n + - paddle.Tensor.addmm + - paddle.Tensor.addmm_ + - paddle.Tensor.all + - paddle.Tensor.allclose + - paddle.Tensor.amax + - paddle.Tensor.amin + - paddle.Tensor.angle + - paddle.Tensor.any + - paddle.Tensor.argmax + - paddle.Tensor.argmin + - paddle.Tensor.argsort + - paddle.Tensor.as_complex + - paddle.Tensor.as_real + - paddle.Tensor.as_strided + - paddle.Tensor.asin + - paddle.Tensor.asin_ + - paddle.Tensor.asinh + - paddle.Tensor.asinh_ + - paddle.Tensor.atan + - paddle.Tensor.atan2 + - paddle.Tensor.atan_ + - paddle.Tensor.atanh + - paddle.Tensor.atanh_ + - paddle.Tensor.atleast_1d + - paddle.Tensor.atleast_2d + - paddle.Tensor.atleast_3d + - paddle.Tensor.bincount + - paddle.Tensor.bitwise_and + - paddle.Tensor.bitwise_and_ + - paddle.Tensor.bitwise_not + - paddle.Tensor.bitwise_not_ + - paddle.Tensor.bitwise_or + - paddle.Tensor.bitwise_or_ + - paddle.Tensor.bitwise_xor + - paddle.Tensor.bitwise_xor_ + - paddle.Tensor.bmm + - paddle.Tensor.broadcast_shape + - paddle.Tensor.broadcast_tensors + - paddle.Tensor.broadcast_to + - paddle.Tensor.cauchy_ + - paddle.Tensor.cdist + - paddle.Tensor.ceil + - paddle.Tensor.ceil_ + - paddle.Tensor.cholesky + - paddle.Tensor.cholesky_solve + - paddle.Tensor.clip + - paddle.Tensor.clip_ + - paddle.Tensor.coalesce + - paddle.Tensor.cols + - paddle.Tensor.combinations + - paddle.Tensor.concat + - paddle.Tensor.cond + - paddle.Tensor.conj + - paddle.Tensor.contiguous + - paddle.Tensor.corrcoef + - 
paddle.Tensor.cos + - paddle.Tensor.cos_ + - paddle.Tensor.cosh + - paddle.Tensor.cosh_ + - paddle.Tensor.count_nonzero + - paddle.Tensor.cov + - paddle.Tensor.cross + - paddle.Tensor.crows + - paddle.Tensor.cummax + - paddle.Tensor.cummin + - paddle.Tensor.cumprod + - paddle.Tensor.cumprod_ + - paddle.Tensor.cumsum + - paddle.Tensor.cumsum_ + - paddle.Tensor.cumulative_trapezoid + - paddle.Tensor.deg2rad + - paddle.Tensor.diag + - paddle.Tensor.diag_embed + - paddle.Tensor.diagflat + - paddle.Tensor.diagonal + - paddle.Tensor.diagonal_scatter + - paddle.Tensor.diff + - paddle.Tensor.digamma + - paddle.Tensor.digamma_ + - paddle.Tensor.divide + - paddle.Tensor.divide_ + - paddle.Tensor.dot + - paddle.Tensor.eig + - paddle.Tensor.eigvals + - paddle.Tensor.eigvalsh + - paddle.Tensor.equal + - paddle.Tensor.equal_all + - paddle.Tensor.erf + - paddle.Tensor.erfinv + - paddle.Tensor.erfinv_ + - paddle.Tensor.exp + - paddle.Tensor.exp_ + - paddle.Tensor.expand + - paddle.Tensor.expand_as + - paddle.Tensor.expm1 + - paddle.Tensor.exponential_ + - paddle.Tensor.fill_ + - paddle.Tensor.fill_diagonal_ + - paddle.Tensor.fill_diagonal_tensor + - paddle.Tensor.fill_diagonal_tensor_ + - paddle.Tensor.flatten + - paddle.Tensor.flatten_ + - paddle.Tensor.flip + - paddle.Tensor.floor + - paddle.Tensor.floor_ + - paddle.Tensor.floor_divide + - paddle.Tensor.floor_divide_ + - paddle.Tensor.floor_mod + - paddle.Tensor.floor_mod_ + - paddle.Tensor.fmax + - paddle.Tensor.fmin + - paddle.Tensor.frac + - paddle.Tensor.frac_ + - paddle.Tensor.frexp + - paddle.Tensor.gather + - paddle.Tensor.gather_nd + - paddle.Tensor.gcd + - paddle.Tensor.gcd_ + - paddle.Tensor.get_selected_rows + - paddle.Tensor.get_strides + - paddle.Tensor.greater_equal + - paddle.Tensor.greater_equal_ + - paddle.Tensor.greater_than + - paddle.Tensor.greater_than_ + - paddle.Tensor.heaviside + - paddle.Tensor.histogram + - paddle.Tensor.histogramdd + - paddle.Tensor.hsplit + - paddle.Tensor.hypot + - 
paddle.Tensor.hypot_ + - paddle.Tensor.i0 + - paddle.Tensor.i0_ + - paddle.Tensor.i0e + - paddle.Tensor.i1 + - paddle.Tensor.i1e + - paddle.Tensor.imag + - paddle.Tensor.increment + - paddle.Tensor.index_add + - paddle.Tensor.index_add_ + - paddle.Tensor.index_fill + - paddle.Tensor.index_fill_ + - paddle.Tensor.index_put + - paddle.Tensor.index_put_ + - paddle.Tensor.index_sample + - paddle.Tensor.index_select + - paddle.Tensor.inner + - paddle.Tensor.kron + - paddle.Tensor.kthvalue + - paddle.Tensor.layout + - paddle.Tensor.lcm + - paddle.Tensor.lcm_ + - paddle.Tensor.ldexp + - paddle.Tensor.ldexp_ + - paddle.Tensor.lerp + - paddle.Tensor.lerp_ + - paddle.Tensor.less_equal + - paddle.Tensor.less_equal_ + - paddle.Tensor.less_than + - paddle.Tensor.less_than_ + - paddle.Tensor.lgamma + - paddle.Tensor.lgamma_ + - paddle.Tensor.log + - paddle.Tensor.log10 + - paddle.Tensor.log10_ + - paddle.Tensor.log1p + - paddle.Tensor.log1p_ + - paddle.Tensor.log2 + - paddle.Tensor.log2_ + - paddle.Tensor.log_ + - paddle.Tensor.logaddexp + - paddle.Tensor.logcumsumexp + - paddle.Tensor.logical_and + - paddle.Tensor.logical_and_ + - paddle.Tensor.logical_not + - paddle.Tensor.logical_not_ + - paddle.Tensor.logical_or + - paddle.Tensor.logical_or_ + - paddle.Tensor.logical_xor + - paddle.Tensor.logical_xor_ + - paddle.Tensor.logit + - paddle.Tensor.logit_ + - paddle.Tensor.logsumexp + - paddle.Tensor.lstsq + - paddle.Tensor.lu + - paddle.Tensor.lu_unpack + - paddle.Tensor.masked_fill + - paddle.Tensor.masked_fill_ + - paddle.Tensor.masked_select + - paddle.Tensor.masked_scatter + - paddle.Tensor.masked_scatter_ + - paddle.Tensor.matmul + - paddle.Tensor.matrix_power + - paddle.Tensor.max + - paddle.Tensor.maximum + - paddle.Tensor.mean + - paddle.Tensor.median + - paddle.Tensor.min + - paddle.Tensor.minimum + - paddle.Tensor.mm + - paddle.Tensor.mod + - paddle.Tensor.mod_ + - paddle.Tensor.mode + - paddle.Tensor.moveaxis + - paddle.Tensor.multi_dot + - paddle.Tensor.multigammaln + 
- paddle.Tensor.multigammaln_ + - paddle.Tensor.multinomial + - paddle.Tensor.multiplex + - paddle.Tensor.multiply + - paddle.Tensor.multiply_ + - paddle.Tensor.mv + - paddle.Tensor.nan_to_num + - paddle.Tensor.nan_to_num_ + - paddle.Tensor.nanmean + - paddle.Tensor.nanmedian + - paddle.Tensor.nanquantile + - paddle.Tensor.nansum + - paddle.Tensor.ndimension + - paddle.Tensor.neg + - paddle.Tensor.neg_ + - paddle.Tensor.nnz + - paddle.Tensor.nonzero + - paddle.Tensor.norm + - paddle.Tensor.normal_ + - paddle.Tensor.not_equal + - paddle.Tensor.not_equal_ + - paddle.Tensor.numel + - paddle.Tensor.offset + - paddle.Tensor.outer + - paddle.Tensor.pca_lowrank + - paddle.Tensor.pinv + - paddle.Tensor.polar + - paddle.Tensor.polygamma + - paddle.Tensor.polygamma_ + - paddle.Tensor.pow + - paddle.Tensor.pow_ + - paddle.Tensor.process_mesh + - paddle.Tensor.prod + - paddle.Tensor.put_along_axis + - paddle.Tensor.put_along_axis_ + - paddle.Tensor.qr + - paddle.Tensor.quantile + - paddle.Tensor.rad2deg + - paddle.Tensor.remainder + - paddle.Tensor.remainder_ + - paddle.Tensor.renorm + - paddle.Tensor.renorm_ + - paddle.Tensor.repeat_interleave + - paddle.Tensor.reverse + - paddle.Tensor.roll + - paddle.Tensor.rot90 + - paddle.Tensor.round + - paddle.Tensor.round_ + - paddle.Tensor.rows + - paddle.Tensor.rsqrt + - paddle.Tensor.rsqrt_ + - paddle.Tensor.scale + - paddle.Tensor.scale_ + - paddle.Tensor.scatter + - paddle.Tensor.scatter_ + - paddle.Tensor.scatter_nd + - paddle.Tensor.scatter_nd_add + - paddle.Tensor.select_scatter + - paddle.Tensor.sgn + - paddle.Tensor.shard_index + - paddle.Tensor.sigmoid + - paddle.Tensor.sigmoid_ + - paddle.Tensor.sign + - paddle.Tensor.sin + - paddle.Tensor.sin_ + - paddle.Tensor.sinh + - paddle.Tensor.sinh_ + - paddle.Tensor.size + - paddle.Tensor.slice + - paddle.Tensor.solve + - paddle.Tensor.sort + - paddle.Tensor.split + - paddle.Tensor.sqrt + - paddle.Tensor.sqrt_ + - paddle.Tensor.square + - paddle.Tensor.squeeze + - 
paddle.Tensor.squeeze_ + - paddle.Tensor.stack + - paddle.Tensor.stanh + - paddle.Tensor.std + - paddle.Tensor.stft + - paddle.Tensor.strided_slice + - paddle.Tensor.strides + - paddle.Tensor.subtract + - paddle.Tensor.subtract_ + - paddle.Tensor.sum + - paddle.Tensor.t + - paddle.Tensor.t_ + - paddle.Tensor.take + - paddle.Tensor.take_along_axis + - paddle.Tensor.tan + - paddle.Tensor.tan_ + - paddle.Tensor.tanh + - paddle.Tensor.tanh_ + - paddle.Tensor.tensordot + - paddle.Tensor.tile + - paddle.Tensor.top_p_sampling + - paddle.Tensor.topk + - paddle.Tensor.trace + - paddle.Tensor.transpose + - paddle.Tensor.transpose_ + - paddle.Tensor.trapezoid + - paddle.Tensor.tril + - paddle.Tensor.tril_ + - paddle.Tensor.triu + - paddle.Tensor.triu_ + - paddle.Tensor.trunc + - paddle.Tensor.trunc_ + - paddle.Tensor.unbind + - paddle.Tensor.unflatten + - paddle.Tensor.unfold + - paddle.Tensor.uniform_ + - paddle.Tensor.unique + - paddle.Tensor.unique_consecutive + - paddle.Tensor.unsqueeze + - paddle.Tensor.unsqueeze_ + - paddle.Tensor.unstack + - paddle.Tensor.vander + - paddle.Tensor.var + - paddle.Tensor.view + - paddle.Tensor.view_as + - paddle.Tensor.vsplit + - paddle.Tensor.where + - paddle.Tensor.where_ + - paddle.Tensor.zero_ + #### experiment op: + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy + - paddle._C_ops.fused_gemm_epilogue + - paddle.optimizer.Adam + - paddle.optimizer.AdamW + - paddle._C_ops.adamw + - paddle._C_ops.adamw_ + - paddle._legacy_C_ops.fused_gemm_epilogue + - paddle.incubate.nn.functional.fused_multi_head_attention + - paddle.incubate.nn.functional.fused_feedforward + - paddle.incubate.nn.functional.fused_multi_transformer + - paddle.incubate.nn.functional.fused_linear + - paddle.incubate.nn.functional.fused_linear_activation + - 
paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm + - paddle.incubate.nn.functional.fused_ec_moe + - paddle.incubate.nn.functional.fused_dropout_add + - paddle.incubate.nn.functional.fused_rotary_position_embedding + - paddle.incubate.nn.functional.variable_length_memory_efficient_attention + - paddle.incubate.nn.functional.fused_rms_norm + - paddle.incubate.nn.functional.fused_layer_norm + - paddle.incubate.nn.functional.masked_multihead_attention + - paddle.incubate.nn.functional.block_multihead_attention + - paddle.incubate.nn.functional.swiglu + - paddle.incubate.nn.functional.fused_matmul_bias + - paddle.tensor.fill_constant + - paddle.nn.clip._squared_l2_norm + - paddle.uniform + - paddle._C_ops.gaussian + - paddle._legacy_C_ops.c_identity + - paddle.distributed.fleet.layers.mpu.mp_ops. + - paddlenlp.transformers.llama.fusion_ops.fused_rotary_position_embedding + - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention + - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm + - paddlenlp.transformers.llama.fusion_ops.fusion_rope + # - paddlenlp.transformers.llama.fusion_ops.swiglu +# distributed + - paddle.distributed.all_gather + - paddle.distributed.all_gather_object + - paddle.distributed.all_reduce + - paddle.distributed.alltoall + - paddle.distributed.alltoall_single + - paddle.distributed.barrier + - paddle.distributed.broadcast + - paddle.distributed.broadcast_object_list + - paddle.distributed.communication.stream.all_gather + - paddle.distributed.communication.stream.all_reduce + - paddle.distributed.communication.stream.alltoall + - paddle.distributed.communication.stream.alltoall_single + - paddle.distributed.communication.stream.broadcast + - paddle.distributed.communication.stream.gather + - paddle.distributed.communication.stream.recv + - paddle.distributed.communication.stream.reduce + - paddle.distributed.communication.stream.reduce_scatter + - paddle.distributed.communication.stream.scatter + - 
paddle.distributed.communication.stream.send diff --git a/paddleapex/api_tracer/configs/tool_config.yaml b/paddleapex/api_tracer/configs/tool_config.yaml index 30375b1..07e8f37 100644 --- a/paddleapex/api_tracer/configs/tool_config.yaml +++ b/paddleapex/api_tracer/configs/tool_config.yaml @@ -10,17 +10,17 @@ remote_path: "/root/paddlejob/workspace/PaddleAPEX_dump/" Async_dump: False # mode must be chosen from ["real_data", "random"] -dump_mode: "random" +dump_mode: "real_data" # acclerate dump process by getting extremum value on device side. # In profile_mode, the speed of dump is 75% of vanilla speed. profile_mode: True # target_step is a list, dump api function will turn on at the specific step -target_step: [0] +target_step: [5] # Remove duplicate apis from dump_info and keep only one api in the same value range. -dump_unique: True +dump_unique: False # Split dump_info into half-precision operators and other operators when saving json files split_dump: True diff --git a/paddleapex/api_tracer/wrap_op/OPTemplate.py b/paddleapex/api_tracer/wrap_op/OPTemplate.py index f9e8833..1652a99 100644 --- a/paddleapex/api_tracer/wrap_op/OPTemplate.py +++ b/paddleapex/api_tracer/wrap_op/OPTemplate.py @@ -14,7 +14,9 @@ import paddle.distributed as dist import paddle from .. 
import config -from ..api_info import API +from ..api_info import API, get_init_params, save_init_params_and_weight, save_init_params, save_weight +import os +from paddleapex.api_tracer.Dump import dump_util class HookOp: @@ -24,12 +26,91 @@ class HookOp: cfg = config.cfg +def hijack_init(self, *args, **kwargs): + print("args", args) + print("kwargs", kwargs) + self.__init__(*args, **kwargs) + + +def create_hook_with_info(tensor, attr_index, api): + def grad_hook(grad): + if grad is not None: + single_arg = {} + single_arg.update({"type": "paddle.Tensor"}) + single_arg.update({"dtype": str(grad.dtype.name)}) + single_arg.update({"shape": grad.shape}) + single_arg.update({"stop_gradient": grad.stop_gradient}) + api_args = api.op_name + ".grad_" + str(attr_index) + pt_path = dump_util.dump_real_data(api_args, grad.detach().cpu(), api.rank) + single_arg.update({"real_data_path": pt_path}) + + api.dout_list.append(single_arg) + api.output_num -= 1 + if api.output_num == 0: + api.api_info_struct[api.op_name].update({"dout_list": api.dout_list}) + if api.mode == "real_data": + return grad_hook + else: + return api.record_dout + + +def create_output_attr(tensor, num): + setattr(tensor, 'id_apex', num) + return 'id_apex', num + + +def hijack_call(self, *args, **kwargs): + cls = self.__class__ + # init_params = get_init_params(self) + cfg.prefix_op_name_ = self.prefix_op_name_ + "*" + if self.__class__.__name__ not in cfg.Op_count: + cfg.Op_count[self.__class__.__name__] = 1 + cfg.prefix_op_name_ += "0" + else: + cfg.Op_count[self.__class__.__name__] += 1 + cfg.prefix_op_name_ += str(cfg.Op_count[self.__class__.__name__] - 1) + if cfg.dump_state: + api_recorder = API(cfg.dump_mode) + rank = dist.get_rank() + api_recorder.update_APIInfo(cfg.prefix_op_name_, rank) + api_recorder.update_real_data(args, kwargs) + # save_weight(self.state_dict(), cfg.prefix_op_name_, rank) + # save_init_params_and_weight(init_params, self.state_dict(), cfg.prefix_op_name_, rank) + 
save_init_params_and_weight(self.apex_init_params, self.state_dict(), cfg.prefix_op_name_, rank) + output = self.forward(*args, **kwargs) + try: + out_num = 0 + if isinstance(output, paddle.Tensor): + if not output.stop_gradient: + output.register_hook(create_hook_with_info(output, api_recorder.output_num, api_recorder)) + #output.register_hook(api_recorder.record_dout) + api_recorder.output_num = 1 + else: + api_recorder.record_dout(None) + if isinstance(output, (list, tuple)): + need_record = False + for item in output: + if isinstance(item, paddle.Tensor) and not item.stop_gradient: + item.register_hook(create_hook_with_info(item, api_recorder.output_num, api_recorder)) + api_recorder.output_num += 1 + need_record = True + if not need_record: + api_recorder.record_dout(None) + except Exception as e: + print(self.__class__.__name__, " register hook failed. Due to :", e) + api_recorder.record_dout(None) + else: + output = self.forward(*args, **kwargs) + return output + + class OPTemplate: def __init__(self, op_name): self.op_name_ = op_name cfg.prefix_op_name_ = self.op_name_ + "*" def forward(self, *args, **kwargs): + print("OPTemplate", self.op_name_) if self.op_name_ not in cfg.Op_count: cfg.Op_count[self.op_name_] = 1 cfg.prefix_op_name_ += "0" @@ -39,13 +120,15 @@ def forward(self, *args, **kwargs): if cfg.dump_state: api_recorder = API(cfg.dump_mode) rank = dist.get_rank() + print("rank", rank) api_recorder.update_APIInfo(cfg.prefix_op_name_, rank) api_recorder.update_real_data(args, kwargs) output = getattr(HookOp, "wrap_" + str(self.op_name_))(*args, **kwargs) try: if isinstance(output, paddle.Tensor): if not output.stop_gradient: - output.register_hook(api_recorder.record_dout) + #output.register_hook(api_recorder.record_dout) + output.register_hook(create_hook_with_info(output, api_recorder.output_num, api_recorder)) api_recorder.output_num = 1 else: api_recorder.record_dout(None) @@ -53,9 +136,10 @@ def forward(self, *args, **kwargs): need_record = 
False for item in output: if isinstance(item, paddle.Tensor) and not item.stop_gradient: - api_recorder.output_num += 1 need_record = True - item.register_hook(api_recorder.record_dout) + #item.register_hook(api_recorder.record_dout) + item.register_hook(create_hook_with_info(item, api_recorder.output_num, api_recorder)) + api_recorder.output_num += 1 if not need_record: api_recorder.record_dout(None) except Exception as e: diff --git a/paddleapex/api_tracer/wrap_op/get_target_op.py b/paddleapex/api_tracer/wrap_op/get_target_op.py index 9b5c2f0..04c7625 100644 --- a/paddleapex/api_tracer/wrap_op/get_target_op.py +++ b/paddleapex/api_tracer/wrap_op/get_target_op.py @@ -25,10 +25,18 @@ def __init__(self, yaml_path): Ops = yaml.safe_load(f) self.target_op = Ops.get("target_op") self.ignored_op = Ops.get("ignored_op") + self.target_class = Ops.get("target_class") + self.distributed_op = Ops.get("distributed_op") f.close() if self.ignored_op is None: self.ignored_op = [] - self.api_to_catch = set(self.target_op) - set(self.ignored_op) + if self.target_op is None: + self.target_op = [] + if self.target_class is None: + self.target_class = [] + if self.distributed_op is None: + self.distributed_op = [] + self.api_to_catch = set(self.target_op).union(set(self.distributed_op)) - set(self.ignored_op) def check_api_stack(self): for api in self.api_to_catch: @@ -43,8 +51,10 @@ def check_api_stack(self): print(f"For api: {api} ", str(err)) def get_target_ops(self): - self.api_to_catch = set(self.target_op) - set(self.ignored_op) if cfg.profile_mode: self.api_to_catch -= set(["paddle.max", "paddle.min"]) self.check_api_stack() return self.api_to_catch + + def get_target_class(self): + return self.target_class diff --git a/paddleapex/api_tracer/wrap_op/hijack_tool.py b/paddleapex/api_tracer/wrap_op/hijack_tool.py index 8dad0f2..a5b61a3 100644 --- a/paddleapex/api_tracer/wrap_op/hijack_tool.py +++ b/paddleapex/api_tracer/wrap_op/hijack_tool.py @@ -12,15 +12,15 @@ # See the License 
for the specific language governing permissions and # limitations under the License. - from .. import config from ...utils import try_import from .get_target_op import GetTargetOP -from .OPTemplate import OPTemplate, HookOp +from .OPTemplate import OPTemplate, HookOp, hijack_call +import paddle.distributed as dist +from ..api_info import save_init_params cfg = config.cfg - def wrapped_op(op_name): def op_template(*args, **kwargs): return OPTemplate(op_name)(*args, **kwargs) @@ -31,6 +31,8 @@ def op_template(*args, **kwargs): def hijack_api(): op = GetTargetOP(cfg.op_target_pth) target_op = op.get_target_ops() + target_class = op.get_target_class() + print("hijack api") for op_name in target_op: parent_package, method_name = op_name.rsplit(".", maxsplit=1) try: @@ -42,8 +44,29 @@ def hijack_api(): ) except Exception as err: print(op_name, str(err)) - + + print("hijack api") for attr_name in dir(HookOp): if attr_name.startswith("wrap_"): parent_package, method_name = attr_name[5:].rsplit(".", maxsplit=1) setattr(eval(parent_package), method_name, wrapped_op(attr_name[5:])) + + print("hijack class") + for class_in in target_class: + parent_package, class_n = class_in.rsplit(".", maxsplit=1) + + try: + class_name, model = try_import(parent_package) + model = getattr(model, class_n) + model.prefix_op_name_ = class_in + model.__call__ = hijack_call + ori__init__ = model.__init__ + def hijack_init(self, *args, **kwargs): + self.apex_init_params = [args, kwargs] + ori__init__(self, *args, **kwargs) + model.__init__ = hijack_init + + except Exception as err: + print(class_in, str(err)) + + diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..83bb84f --- /dev/null +++ b/run.sh @@ -0,0 +1,8 @@ +export XPUAPI_DEBUG=0x1 +export CUDA_VISIBLE_DEVICES=7 +XPU_AUTO_BF16_TF32=1 XPU_PADDLE_FC_TF32=1 python test_fc.py +python test_fa.py +cd paddleapex/apex/ +python run_paddle.py -json ../../dump_info/rank0_step0/forward_rank0_all.json -real ../../dump_info/rank0_step0/ -out 
XPU -backend XPU -mode acc +cd ../../../linux-bcecmd-0.3.0 +./bcecmd --conf-path ./ bos sync ../PaddleAPEX bos:/baidu-kunlun-customer/NEW_FA_0312/ diff --git a/test.py b/test.py new file mode 100644 index 0000000..1f372a6 --- /dev/null +++ b/test.py @@ -0,0 +1,63 @@ +import paddle +import paddle.nn as nn + +import os + +def generate_true_random_number(num_bytes): + """ 生成真随机数,从 /dev/urandom 读取指定数量的字节 """ + # 从 /dev/urandom 读取 num_bytes 个字节 + random_bytes = os.urandom(num_bytes) + # 将字节转换为整数 + return int.from_bytes(random_bytes, 'big') + +# 生成一个真随机数 +#random_number = generate_true_random_number(4) # 读取 4 bytes +#print(random_number) + +from paddleapex import Tracer +checker = Tracer() +checker.register_op() + +checker.start() + +paddle.seed(42) +q_len = [131328, 147728, 106128, 128128, 120128] +#q_len = [10880, 12224, 1280, 1424, 15104, 1520, 1664, 16976, 18224, 1856, 1952, 20096, 22592, 23840, 24320, 2432, 27344, 29360, 30080, 8192, 9200, 9872] +#for i in q_len: +for i in range(20): + paddle.seed(int(generate_true_random_number(4))) + q = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) + paddle.seed(int(generate_true_random_number(6))) + k = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) + paddle.seed(int(generate_true_random_number(5))) + v = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) + q.stop_gradient = False + k.stop_gradient = False + v.stop_gradient = False + + output = paddle.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False, True) + output.stop_gradient = False + output.backward() + + +#q = paddle.rand((80, 1408, 32, 64), dtype=paddle.bfloat16) +#k = paddle.rand((80, 1408, 32, 64), dtype=paddle.bfloat16) +#v = paddle.rand((80, 1408, 32, 64), dtype=paddle.bfloat16) +#q.stop_gradient = False +#k.stop_gradient = False +#v.stop_gradient = False +# +#output = paddle.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False, True) +# +#dout = paddle.zeros_like(output) +#output.backward(dout) + +#q_len = 
[131328, 147728, 106128, 128128, 120128] +#for i in q_len: +# q = paddle.rand((1, i, 30, 64), dtype=paddle.bfloat16) +# k = paddle.rand((1, i, 30, 64), dtype=paddle.bfloat16) +# v = paddle.rand((1, i, 30, 64), dtype=paddle.bfloat16) +# output = paddle.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False, True) +# +checker.stop() +print(output) diff --git a/test_conv.py b/test_conv.py new file mode 100644 index 0000000..26db349 --- /dev/null +++ b/test_conv.py @@ -0,0 +1,163 @@ +import paddle +import paddle.nn as nn +import json +import numpy as np +import os +import re + + +dtype = paddle.bfloat16 +paddle.set_default_dtype(dtype) + +log_file_path = 'conv.log' +pattern = re.compile(r".*?cases(.*)") + +# 初始化列表 +list_n = [] +list_c = [] +list_xh = [] +list_xw = [] +list_f = [] +list_ksize = [] +list_stride = [] +list_pad = [] +list_dilation = [] +list_group = [] +list_nchw = [] + +n = 0 +try: + with open(log_file_path, 'r') as file: + for line in file: + n = n + 1 + print(n) + entry = json.loads(line) + # 检查是否为"fc_fusion"操作 + if entry.get("op") == "cudnn_conv2d_grad": + # 提取参数 + params = entry.get("params", {}) + types = entry.get("desc", {}) + + if 'n' in params: + list_n.append(params['n']) + if 'c' in params: + list_c.append(params['c']) + if 'xh' in params: + list_xh.append(params['xh']) + elif 'h' in params: + list_xh.append(params['h']) + if 'xw' in params: + list_xw.append(params['xw']) + elif 'w' in params: + list_xw.append(params['w']) + if 'f' in params: + list_f.append(params['f']) + if '_ksize' in params: + list_ksize.append(params['_ksize']) + if '_stride' in params: + list_stride.append(params['_stride']) + if '_pad' in params: + list_pad.append(params['_pad']) + if '_dilation' in params: + list_dilation.append(params['_dilation']) + if 'group' in params: + list_group.append(params['group']) + if 'is_nchw' in params: + list_nchw.append(params['is_nchw']) +except FileNotFoundError: + print(f"Error: The file {log_file_path} does not 
exist.") +except IOError as e: + print(f"Error: An I/O error occurred while reading {log_file_path}: {str(e)}") + +def generate_true_random_number(num_bytes): + """ 生成真随机数,从 /dev/urandom 读取指定数量的字节 """ + # 从 /dev/urandom 读取 num_bytes 个字节 + random_bytes = os.urandom(num_bytes) + # 将字节转换为整数 + #print(int.from_bytes(random_bytes, 'big')) + return int.from_bytes(random_bytes, 'big') + +CREATE_DATA = False +DEVICE = 'XPU' +#DEVICE = 'GPU' + +# 输出各个列表 +print("List n:", list_n) +print("List c:", list_c) +print("List xh:", list_xh) +print("List xw:", list_xw) +print("List f:", list_f) +print("List ksize:", list_ksize) +print("List stride:", list_stride) +print("List pad:", list_pad) +print("List dilation:", list_dilation) +print("List group:", list_group) +print("List nchw:", list_nchw) + +x_shape = [] +f_shape = [] +y_shape = [] +for i in range(len(list_n)): + n = list_n[i] + c = list_c[i] + xh = list_xh[i] + xw = list_xw[i] + f = list_f[i] + ksize = list_ksize[i] + pad = list_pad[i] + dilation = list_dilation[i] + stride = list_stride[i] + + x_shape.append([n, c, xh, xw]) + f_shape.append([ksize[0], ksize[1], c, f]) + h_out = (xh + 2 * pad[0] - (dilation[0] * (ksize[0] - 1) + 1)) / stride[0] + 1 + w_out = (xw + 2 * pad[1] - (dilation[1] * (ksize[1] - 1) + 1)) / stride[1] + 1 + y_shape.append([n, f, int(h_out), int(w_out)]) + +print(x_shape) +print(f_shape) +print(y_shape) + +dtype = paddle.bfloat16 +paddle.set_default_dtype(dtype) +# +if DEVICE == 'XPU' and CREATE_DATA: + for i in range(len(list_n)): + #for i in range(1): + paddle.seed(int(generate_true_random_number(4))) + x = np.random.uniform(-1, 1, x_shape[i]).astype("float32") + paddle.seed(int(generate_true_random_number(5))) + f = np.random.uniform(-1, 1, f_shape[i]).astype("float32") + paddle.seed(int(generate_true_random_number(6))) + out_grad = np.random.uniform(-1, 1, y_shape[i]).astype("float32") + x = paddle.to_tensor(x, stop_gradient=False).cast(dtype) + f = paddle.to_tensor(f, 
stop_gradient=False).cast(dtype) + out_grad = paddle.to_tensor(out_grad, stop_gradient=True).cast(dtype) + paddle.save([x, f, out_grad], 'CONV_INPUT/conv_' + str(i)) + +for i in range(len(list_n)): +#for i in range(1): + x, f, out_grad = paddle.load('CONV_INPUT/conv_' + str(i)) + conv = nn.Conv2D(list_c[i], list_f[i], kernel_size=list_ksize[i], + stride=list_stride[i], padding=list_pad[i], dilation=list_dilation[i], + groups=list_group[i]) + if list_group[i] == 1: + conv.weight.set_value(f) + conv.train() + + out = conv(x) + #paddle.save([True, [out]], 'XPU/output/conv_' + str(i)) + #out = paddle.cast(out, "float32") + paddle.autograd.backward(tensors=[out], grad_tensors=[out_grad]) + #paddle.save([True, [linear.weight.grad, linear.bias.grad]], 'XPU/output_backward/linear_' + str(i)) +# +# +#x = paddle.uniform((1, 1, 1, 51200), dtype=dtype, min=-1., max=1.) +#x.stop_gradient = False +#conv = nn.Conv2D(1, 512, kernel_size=[1, 10], stride=[1, 5], padding=[0,0,0,0], dilation=[1,1], groups=1) +#y = conv(x) +#print(y.shape) +#y.stop_gradient = False +#out_grad = paddle.uniform(y.shape, dtype=dtype, min=-1., max=1.) 
+#paddle.autograd.backward(tensors=[y], grad_tensors=[out_grad]) +#print(x.grad) diff --git a/test_fa.py b/test_fa.py new file mode 100644 index 0000000..20b03d0 --- /dev/null +++ b/test_fa.py @@ -0,0 +1,112 @@ +import paddle +import paddle.nn as nn +import json + +import os + +def generate_true_random_number(num_bytes): + """ 生成真随机数,从 /dev/urandom 读取指定数量的字节 """ + # 从 /dev/urandom 读取 num_bytes 个字节 + random_bytes = os.urandom(num_bytes) + # 将字节转换为整数 + #print(int.from_bytes(random_bytes, 'big')) + return int.from_bytes(random_bytes, 'big') + + +# 路径到你的日志文件 +log_file_path = 'fa.log' + +# 初始化存储列表 +lod_seqlens_q_list = [] +head_num_list = [] +head_dim_list = [] +max_seqlen_q_list = [] + +def convert_to_list(s): + # 去除字符串中的花括号 + s = s.strip('{}') + # 分割字符串 + elements = s.split(',') + # 将字符串元素转换为整数,并返回列表 + return [int(element.strip()) for element in elements] + +# 读取和解析日志文件 +try: + with open(log_file_path, 'r') as file: + for line in file: + try: + # 将每行转换为JSON对象 + data = json.loads(line) + # 检查是否是目标操作 + if data.get('op') == 'mha_varlen_bwd': + params = data.get('params', {}) + seqlens_q = params.get('lod_seqlens_q', []) + #lod_seqlens_q_list.append(params.get('lod_seqlens_q', [])) + lod_seqlens_q_list.append(convert_to_list(seqlens_q)) + head_num_list.append(params.get('head_num', None)) + head_dim_list.append(params.get('head_dim', None)) + max_seqlen_q_list.append(params.get('max_seqlen_q', None)) + except json.JSONDecodeError: + print("Warning: Failed to decode JSON from line:", line) +except FileNotFoundError: + print(f"Error: The file {log_file_path} does not exist.") +except IOError as e: + print(f"Error: An I/O error occurred while reading {log_file_path}: {str(e)}") + + + +from paddleapex import Tracer +checker = Tracer() +checker.register_op() + +paddle.seed(46) +B = [] +for b in lod_seqlens_q_list: + if len(b) == 2: + B.append(1) + if len(b) > 2: + B.append(int(b[-1] / b[1])) +L = max_seqlen_q_list +H = head_num_list +D = head_dim_list + +# 输出结果 +print("LOD 
Sequence Lengths Q:", B) +print("Head Numbers:", H) +print("Head Dimensions:", D) +print("Max Sequence Length Q:", L) + + +for i in range(len(B)): + paddle.seed(int(generate_true_random_number(4))) + q = paddle.rand((B[i], L[i], H[i], D[i]), dtype=paddle.bfloat16) + paddle.seed(int(generate_true_random_number(4))) + k = paddle.rand((B[i], L[i], H[i], D[i]), dtype=paddle.bfloat16) + paddle.seed(int(generate_true_random_number(4))) + v = paddle.rand((B[i], L[i], H[i], D[i]), dtype=paddle.bfloat16) + q.stop_gradient = False + k.stop_gradient = False + v.stop_gradient = False + + output = paddle.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False, True) + output.stop_gradient = False + output.backward() + +#for i in range(10): +# paddle.seed(int(generate_true_random_number(4))) +# q = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) +# paddle.seed(int(generate_true_random_number(6))) +# k = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) +# paddle.seed(int(generate_true_random_number(5))) +# v = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) +# q.stop_gradient = False +# k.stop_gradient = False +# v.stop_gradient = False +# +# output = paddle.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False, True) +# output.stop_gradient = False +# output.backward() + +checker.stop() + + diff --git a/test_fc.py b/test_fc.py new file mode 100644 index 0000000..d3a420f --- /dev/null +++ b/test_fc.py @@ -0,0 +1,233 @@ +import paddle +import paddle.nn as nn +import json +import numpy as np +import os +import re +import math + +from tools import * + +try: + from paddle_xpu.layers.nn import Linear +except ImportError: + from paddle.nn import Linear +#from paddle.nn import Linear + +#log_file_path = 'workerlog.0.fc_fa_mean_max' +log_file_path = 'fc.log' + +pattern = re.compile(r".*?cases(.*)") +mean_var = r"\[mean\](-?\d+\.\d+), \[max\](\d+\.\d+)" + + +input_file = 'FC_INPUT' +xpu_file = 'XPU' +cal_out_file = xpu_file + '/output' 
+cal_out_back_file = xpu_file + '/output_backward' +base_out_file = 'BASE/output' +base_out_back_file = 'BASE/output_backward' + +createDir(input_file) +createDir(xpu_file) + +CREATE_DATA = False +DEVICE = 'XPU' +#DEVICE = 'GPU' +base_type = paddle.float32 +dtype = paddle.bfloat16 +#calculate_type = paddle.bfloat16 +calculate_type = base_type + + +if calculate_type == base_type: + createDir(base_out_file) + out_file = base_out_file + createDir(base_out_back_file) + out_back_file = base_out_back_file +else: + createDir(cal_out_file) + out_file = cal_out_file + createDir(cal_out_back_file) + out_back_file = cal_out_back_file + +paddle.set_default_dtype(calculate_type) + + +# 初始化列表 +list_a = [] +list_b = [] +list_c = [] +list_d = [] +alpha_beta = [] +a_mean = [] +b_mean = [] +d_mean = [] +a_max = [] +b_max = [] +d_max = [] +n = 0 +try: + with open(log_file_path, 'r') as file: + log_lines = file.readlines() + + for i in range(len(log_lines)): + print("i", i) + line = log_lines[i] + if "kXPU3" not in log_lines[i]: + continue + if "float16" not in log_lines[i]: + continue + #if "fc_fusion" in log_lines[i]: + # run_mode = log_lines[i + 3] + # if "desc.run_mode" in run_mode: + # mean_match = re.search(mean_var, run_mode) + # if mean_match: + # if float(mean_match.group(1)) == 1: + # continue + # else: + # print('------------------error----------------') + #else: + # continue + + #a_line = log_lines[i + 1] + #match = re.search(mean_var, a_line) + #if match: + # a_mean.append(float(match.group(1))) + # a_max.append(float(match.group(2))) + #b_line = log_lines[i + 2] + #match = re.search(mean_var, b_line) + #if match: + # b_mean.append(float(match.group(1))) + # b_max.append(float(match.group(2))) + ##d_line = log_lines[i + 6] + ##match = re.search(mean_var, d_line) + ##if match: + ## d_mean.append(float(match.group(1))) + ## d_max.append(float(match.group(2))) + + if "fc_fusion" in log_lines[i]: + print(i) + #match = pattern.search(line) + #if 'cases' not in line: + # # 
将字符串转换为字典 + # entry = json.loads(line) + #else: + # if match: + # json_part = match.group(1) + # entry = json.loads(json_part) + entry = json.loads(line) + # 检查是否为"fc_fusion"操作 + if entry.get("op") == "fc_fusion": + # 提取参数 + params = entry.get("params", {}) + desc = entry.get("desc", {}) + + # 分别提取a, b, c, d的rows和cols,并存储在各自的列表中 + if 'a' in params: + if params['a']['trans']: + list_a.append([params['a']["cols"], params['a']["rows"]]) + else: + list_a.append([params['a']["rows"], params['a']["cols"]]) + if 'b' in params: + if params['b']['trans']: + list_b.append([params['b']["cols"], params['b']["rows"]]) + else: + list_b.append([params['b']["rows"], params['b']["cols"]]) + if 'c' in params: + if params['c']['trans']: + list_c.append([params['c']["cols"], params['c']["rows"]]) + else: + list_c.append([params['c']["rows"], params['c']["cols"]]) + if 'd' in params: + if params['d']['trans']: + list_d.append([params['d']["cols"], params['d']["rows"]]) + else: + list_d.append([params['d']["rows"], params['d']["cols"]]) + + # 提取alpha和beta,转换为float,并存储 + alpha = float(desc.get("alpha", 0)) + beta = float(desc.get("beta", 0)) + alpha_beta.append([alpha, beta]) +except FileNotFoundError: + print(f"Error: The file {log_file_path} does not exist.") +except IOError as e: + print(f"Error: An I/O error occurred while reading {log_file_path}: {str(e)}") + + +unique_string = [] +u_list_a = [] +u_list_b = [] +u_a_m = [] +u_a_v = [] +u_b_m = [] +u_b_v = [] + +for i in range(len(a_mean)): + a_m = get_rounded_num(a_mean[i]) + b_m = get_rounded_num(b_mean[i]) + a_v = get_rounded_num(a_max[i]) + b_v = get_rounded_num(b_max[i]) + new_string = str(list_a[i]) + str(list_b[i]) + str(a_m) + str(b_m) + str(a_v) + str(b_v) + if new_string not in unique_string: + unique_string.append(new_string) + u_list_a.append(list_a[i]) + u_list_b.append(list_b[i]) + u_a_m.append(a_m) + u_b_m.append(b_m) + u_a_v.append(a_v) + u_b_v.append(b_v) + +print("-------------------------------------------", 
len(u_list_a)) + +#list_a = u_list_a +#list_b = u_list_b +#a_mean = u_a_m +#b_mean = u_b_m +#a_max = u_a_v +#b_max = u_b_v + + +### 输出各个列表 +print("List A:", list_a) +print("List B:", list_b) +print("List C:", list_c) +print("List D:", list_d) +print("Alpha and Beta:", alpha_beta) +#print("a_mean", a_mean) +#print("b_mean", b_mean) +#print("d_mean", d_mean) +#print("a_max", a_max) +#print("b_max", b_max) +#print("d_max", d_max) +# +#print("-------------------------------------------", len(a_mean)) +# +if DEVICE == 'XPU' and CREATE_DATA: + for i in range(len(list_a)): + print(i) + #for i in range(1): + x = create_random_tensor(list_a[i], dtype, int(generate_true_random_number(4))) + w = create_random_tensor(list_b[i], dtype, int(generate_true_random_number(4))) + bias = create_random_tensor([list_b[i][1]], dtype, int(generate_true_random_number(4))) + out_grad = create_random_tensor([list_a[i][0], list_b[i][1]], dtype, int(generate_true_random_number(4))) + paddle.save([x, w, bias, out_grad], input_file + '/linear_' + str(i)) + + +for i in range(len(list_a)): +#for i in range(1): + print(i) + x, w, bias, out_grad = paddle.load('FC_INPUT/linear_' + str(i)) + w = w.cast(calculate_type) + bias = bias.cast(calculate_type) + x = x.cast(calculate_type) + linear = Linear(w.shape[0], w.shape[1], bias_attr=True) + linear.weight.set_value(w) + linear.bias.set_value(bias) + linear.train() + + out = linear(x) + paddle.save([True, [out]], out_file + '/linear_' + str(i)) + out = paddle.cast(out, "float32") + paddle.autograd.backward(tensors=[out], grad_tensors=[out_grad]) + paddle.save([True, [linear.weight.grad, linear.bias.grad]], out_back_file + '/linear_' + str(i)) diff --git a/test_fc_gpu.py b/test_fc_gpu.py new file mode 100644 index 0000000..b681608 --- /dev/null +++ b/test_fc_gpu.py @@ -0,0 +1,59 @@ +import paddle +import paddle.nn as nn +import json +import numpy as np +import os +import re + +try: + from paddle_xpu.layers.nn import Linear +except ImportError: + from 
# ---- Run configuration -----------------------------------------------------
# CREATE_DATA and DEVICE are not read anywhere in this script.
CREATE_DATA = False
DEVICE = 'GPU'  # the other value used in this project is 'XPU'

input_file = 'FC_INPUT'
xpu_file = 'XPU'
cal_out_file = xpu_file + '/output'
cal_out_back_file = xpu_file + '/output_backward'
base_out_file = 'BASE/output'
base_out_back_file = 'BASE/output_backward'

base_type = paddle.float32
dtype = paddle.bfloat16
calculate_type = base_type

# Results go to the BASE directories when computing in base_type, otherwise
# to the XPU directories.
if calculate_type == base_type:
    out_file, out_back_file = base_out_file, base_out_back_file
else:
    out_file, out_back_file = cal_out_file, cal_out_back_file

# createDir() is neither defined nor imported in this file, so calling it
# raised NameError.  os.makedirs(..., exist_ok=True) does the same job
# (create the directory if it is missing) with the already-imported os module.
for directory in (out_file, out_back_file):
    os.makedirs(directory, exist_ok=True)

paddle.set_default_dtype(calculate_type)

# Count only the saved cases; any other entry in the input directory would
# otherwise inflate the count and make the loop load a non-existent file.
# Cases are expected to be named linear_0 .. linear_{N-1}.
num_cases = sum(1 for entry in os.listdir(input_file) if entry.startswith('linear_'))

# Replay every saved case through a Linear layer and save the forward output
# and the weight/bias gradients.
for i in range(num_cases):
    print(i)
    case_suffix = '/linear_' + str(i)
    x, w, bias, out_grad = paddle.load(input_file + case_suffix)
    w = w.cast(calculate_type)
    bias = bias.cast(calculate_type)
    x = x.cast(calculate_type)

    # Build the layer from the weight's own shape and overwrite its parameters.
    linear = Linear(w.shape[0], w.shape[1], bias_attr=True)
    linear.weight.set_value(w)
    linear.bias.set_value(bias)
    linear.train()

    out = linear(x)
    paddle.save([True, [out]], out_file + case_suffix)
    # The output is cast to float32 before the backward pass is started.
    out = paddle.cast(out, "float32")
    paddle.autograd.backward(tensors=[out], grad_tensors=[out_grad])
    paddle.save([True, [linear.weight.grad, linear.bias.grad]], out_back_file + case_suffix)
def get_rounded_num(x, round_up=True):
    """Round ``x`` to a signed power of ten.

    Args:
        x: Number to round.
        round_up: When true, round towards +infinity (5 -> 10, -5 -> -1);
            when false, round towards -infinity (5 -> 1, -5 -> -10).

    Returns:
        ``x`` itself if it is inf or NaN (a warning is printed), ``0`` if
        ``abs(x) <= 1e-10``, otherwise ``+/-10**k`` for an integer ``k``.
    """
    # This module does not import math at file level, so import it here.
    import math

    if math.isinf(x) or math.isnan(x):
        msg = "warning, x is inf or nan"
        print(msg, x)
        return x
    if abs(x) <= 1e-10:
        return 0

    log_x = math.log10(abs(x))
    # Rounding up shrinks the magnitude of a negative number and grows the
    # magnitude of a positive one, hence the comparison with the sign of x.
    if bool(round_up) != (x > 0):
        round_log_x = math.floor(log_x)
    else:
        round_log_x = math.ceil(log_x)

    result = 10**round_log_x
    return result if x >= 0 else -result


def generate_random_array(mean, max_value, shape, seed=None):
    """Return a float32 random array with the requested mean and maximum.

    Standard-normal samples are centred and then mapped with a single affine
    transform so that the mean becomes ``mean`` and the largest element
    becomes ``max_value``.  (Scaling to the maximum first and shifting to the
    mean afterwards moves the maximum away from ``max_value``.)

    Args:
        mean: Target mean of the returned array.
        max_value: Target maximum; should be >= ``mean``, otherwise no array
            can satisfy both targets and only the mean is guaranteed.
        shape: Iterable of dimension sizes.
        seed: Optional seed; when given, NumPy's global generator is seeded
            with it before sampling.

    Returns:
        ``numpy.ndarray`` of dtype float32 and the given shape.  An empty
        array is returned unchanged, and an array whose samples are all equal
        (for example a single element) is filled with ``mean``.
    """
    # This module does not import numpy at file level, so import it here.
    import numpy as np

    if seed is not None:
        np.random.seed(seed)
    random_array = np.random.randn(*shape).astype(np.float32)
    if random_array.size == 0:
        return random_array

    # In-place arithmetic keeps the array in float32 throughout.
    random_array -= random_array.mean()
    peak = float(random_array.max())
    if peak > 0:
        # Map the centred maximum onto (max_value - mean); adding the mean
        # afterwards lands it exactly on max_value.
        random_array *= (max_value - mean) / peak
    random_array += mean
    return random_array


def create_random_tensor(shape, dtype, seed, min_v=-1, mean_v=None, max_v=1):
    """Create a random paddle tensor of the given shape and dtype.

    Args:
        shape: Shape of the tensor.
        dtype: Paddle dtype the tensor is cast to.
        seed: Seed for the random data.
        min_v: Lower bound of the uniform range (used when ``mean_v`` is None).
        mean_v: When None, values are drawn uniformly from
            ``[min_v, max_v)``; otherwise they come from
            ``generate_random_array(mean_v, max_v, shape, seed)``.
        max_v: Upper bound of the uniform range, or the target maximum.

    Returns:
        A paddle tensor created with ``stop_gradient=False`` and cast to
        ``dtype``.
    """
    # This module imports neither numpy nor paddle at file level.
    import numpy as np
    import paddle

    if mean_v is None:
        paddle.seed(seed)
        # The data comes from NumPy, so NumPy must be seeded as well;
        # otherwise ``seed`` has no effect on the generated values.
        np.random.seed(seed)
        data = np.random.uniform(min_v, max_v, shape).astype("float32")
    else:
        data = generate_random_array(mean_v, max_v, shape, seed)
    return paddle.to_tensor(data, stop_gradient=False).cast(dtype)