From 99cffe032daac1dd180b6406a1b6b022fcde71b9 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Sun, 17 Nov 2024 12:05:44 +0800 Subject: [PATCH 01/22] fix some bugs --- paddleapex/apex/run_paddle.py | 71 ++++++++++++++------ paddleapex/apex/utils/data_generate.py | 4 +- paddleapex/api_tracer/Tracer.py | 18 ++++- paddleapex/api_tracer/config.py | 15 ++++- paddleapex/api_tracer/configs/op_target.yaml | 46 ++++++++++--- 5 files changed, 117 insertions(+), 37 deletions(-) diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 2fa51b3..74de4e4 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +# import paddlenlp # if you wanna test nlp fusion operations import argparse import os import shutil @@ -54,7 +55,7 @@ "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 } -PROFILE_RUN_TIMES = 100 +PROFILE_RUN_TIMES = 1 def recursive_delete_arg(arg_in): if isinstance(arg_in, (list, tuple)): @@ -107,9 +108,13 @@ def convert_out2fp32(arg_in): return flag, res elif isinstance(arg_in, paddle.Tensor): if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": - arg_in = arg_in.cast("float32") - flag = True - return flag, arg_in + try: + arg_in = arg_in.cast("float32") + flag = True + except Exception as err: + print(arg_in) + return False, arg_in + return flag, arg_in def recursive_arg_to_cpu(arg_in): @@ -137,11 +142,11 @@ def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): arg_in = arg_in.cuda() if "cpu" in backend: arg_in = arg_in.cpu() - if arg_in.dtype.name == "BF16": + if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": arg_in = arg_in.cast("float32") else: arg_in = arg_in.to(backend) - if enforce_dtype and arg_in.dtype.name in ["BF16", "FP16", "FP32"]: + if enforce_dtype and arg_in.dtype.name in ["BF16", "BFLOAT16", "FP16", "FP32"]: arg_in = arg_in.cast(enforce_dtype) arg_in.stop_gradient = grad_status return arg_in @@ -162,12 +167,28 @@ def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=" bwd_output_path = os.path.join(bwd_output_dir, api_call_name) os.makedirs(fwd_output_dir, exist_ok=True) os.makedirs(bwd_output_dir, exist_ok=True) - if not isinstance(forward_res, type(None)): - fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) - paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) - if not isinstance(backward_res, type(None)): - bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) - paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) + if isinstance(forward_res, (type(None), list, tuple, paddle.Tensor)): + try: + fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) + paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) + except Exception as err: + msg = "save_forward Error: %s" % str(err) + print_warn_log(msg) + return + else: + print(forward_res) + print_warn_log("forward_res not supported!") + if isinstance(backward_res, (type(None), list, tuple, paddle.Tensor)): + try: + bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) + paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) + except Exception as err: + msg = "save_bacward Error: %s" % str(err) + print_warn_log(msg) + return + else: + print(backward_res) + print_warn_log("bacward_res not supported!") def evoke_related_test_func(test_mode): @@ -318,12 +339,15 @@ def run_acc_case( try: device_grad_out = [] - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - device_grad_out = run_backward( - api_call_name, device_out, dout, device_args, device_kwargs, need_backward - ) + if api_info_dict["dout_list"][0] != "Failed": + dout = create_dout( + api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path + ) + device_grad_out = run_backward( + api_call_name, device_out, dout, device_args, device_kwargs, need_backward + ) + else: + device_grad_out = None except Exception as err: msg = "Run_backward Error: %s" % str(err) print_warn_log(msg) @@ -359,10 +383,13 @@ def run_profile_case( # device warmming up try: device_out = run_forward(api_call_name, device_args, device_kwargs) - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - paddle.autograd.backward([device_out], dout) + if api_info_dict["dout_list"][0] != "Failed": + dout = create_dout( + api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path + ) + paddle.autograd.backward([device_out], dout) + else: + need_backward = False except Exception as err: msg = "Failed in device warming up: %s" % str(err) print_warn_log(msg) diff --git a/paddleapex/apex/utils/data_generate.py b/paddleapex/apex/utils/data_generate.py index 25c09ce..5c43b85 100644 --- a/paddleapex/apex/utils/data_generate.py +++ b/paddleapex/apex/utils/data_generate.py @@ -290,11 +290,11 @@ def rand_like(data, seed=1234): os.environ["PYTHONHASHSEED"] = str(seed) np.random.seed(seed) if isinstance(data, paddle.Tensor): - if data.dtype.name in ["BF16", "FP16"]: + if data.dtype.name in ["BF16", "FP16", "BFLOAT16", "FLOAT16"]: random_normals = numpy.random.randn(*data.shape) x = paddle.to_tensor(random_normals, dtype=data.dtype) return x - elif data.dtype.name in ["FP32", "FP64"]: + elif data.dtype.name in ["FP32", "FP64", "FLOAT32", "FLOAT64"]: random_normals = numpy.random.randn(*data.shape) x = paddle.to_tensor(random_normals, dtype=data.dtype) return x diff --git a/paddleapex/api_tracer/Tracer.py b/paddleapex/api_tracer/Tracer.py index 9efe197..7742d8f 100644 --- a/paddleapex/api_tracer/Tracer.py +++ b/paddleapex/api_tracer/Tracer.py @@ -16,7 +16,7 @@ from paddleapex.api_tracer.Dump import dump_util from paddleapex.api_tracer.wrap_op.hijack_tool import hijack_api from paddleapex.api_tracer.config import cfg - +from paddleapex.apex.utils import print_info_log class Tracer: def __init__(self): @@ -32,3 +32,19 @@ def start(self): def stop(self): if cfg.dump_state: dump_util.dump() + + def start_in_training(self, cur_step, acc): + self.acc = acc + self.global_step = cur_step // acc + self.inner_step = cur_step % acc + if self.inner_step == 0: + dump_signal = cfg.new_step_in_training(self.global_step) + if dump_signal: + print_info_log(f"Starting tracing step:{self.global_step}") + + def stop_in_training(self): + if self.inner_step == self.acc - 1: + dump_signal = cfg.reset_step_in_training(self.global_step) + if dump_signal: + print_info_log(f"Stopping tracing step:{self.global_step}") + dump_util.dump() diff --git a/paddleapex/api_tracer/config.py b/paddleapex/api_tracer/config.py index 6c40c73..6b9d1f3 100644 --- a/paddleapex/api_tracer/config.py +++ b/paddleapex/api_tracer/config.py @@ -40,7 +40,7 @@ def __init__(self) -> None: print(f"You are using Apex Toolkit, Dump mode : {self.dump_mode}, Target step : {self.target_step}, profile mode : {self.profile_mode}") print("*" * 100) time.sleep(1) - self.global_step = -1 + self.global_step = 2 self.dump_state = False self.Op_count = {} self.prefix_op_name_ = None @@ -54,5 +54,18 @@ def new_step(self): self.Op_count = {} self.dump_state = False + def new_step_in_training(self, global_step): + if global_step in self.target_step: + self.Op_count = {} + self.dump_state = True + return True + return False + + def reset_step_in_training(self, global_step): + if global_step in self.target_step: + self.global_step = global_step + self.dump_state = False + return True + return False cfg = Config() diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index d05646b..5fcad26 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -42,7 +42,7 @@ target_op: - paddle.nn.functional.dropout3d - paddle.nn.functional.elu - paddle.nn.functional.elu_ - # - paddle.nn.functional.embedding + - paddle.nn.functional.embedding - paddle.nn.functional.extension - paddle.nn.functional.flash_attention - paddle.nn.functional.flash_attention_with_sparse_mask @@ -187,7 +187,7 @@ target_op: - paddle.broadcast_tensors - paddle.broadcast_to - paddle.cauchy_ - # - paddle.cast + - paddle.cast - paddle.cdist - paddle.ceil - paddle.cholesky @@ -772,8 +772,8 @@ target_op: - paddle.Tensor.scale_ - paddle.Tensor.scatter - paddle.Tensor.scatter_ - # - paddle.Tensor.scatter_nd - # - paddle.Tensor.scatter_nd_add + - paddle.Tensor.scatter_nd + - paddle.Tensor.scatter_nd_add - paddle.Tensor.select_scatter - paddle.Tensor.sgn - paddle.Tensor.shard_index @@ -847,6 +847,10 @@ target_op: # - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table # - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy - paddle._C_ops.fused_gemm_epilogue + - paddle.optimizer.Adam + - paddle.optimizer.AdamW + - paddle._C_ops.adamw + - paddle._C_ops.adamw_ - paddle._legacy_C_ops.fused_gemm_epilogue - paddle.incubate.nn.functional.fused_multi_head_attention - paddle.incubate.nn.functional.fused_feedforward @@ -867,11 +871,31 @@ target_op: - paddle.tensor.fill_constant - paddle.nn.clip._squared_l2_norm - paddle.uniform - # - paddle._C_ops.gaussian - # - paddle._legacy_C_ops.c_identity - # - paddle.distributed.fleet.layers.mpu.mp_ops. - # - paddlenlp.transformers.llama.fusion_ops.fused_rotary_position_embedding - # - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention - # - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm - # - paddlenlp.transformers.llama.fusion_ops.fusion_rope + - paddle._C_ops.gaussian + - paddle._legacy_C_ops.c_identity + - paddle.distributed.fleet.layers.mpu.mp_ops. + - paddlenlp.transformers.llama.fusion_ops.fused_rotary_position_embedding + - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention + - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm + - paddlenlp.transformers.llama.fusion_ops.fusion_rope # - paddlenlp.transformers.llama.fusion_ops.swiglu +# distributed + - paddle.distributed.all_gather + - paddle.distributed.all_gather_object + - paddle.distributed.all_reduce + - paddle.distributed.alltoall + - paddle.distributed.alltoall_single + - paddle.distributed.barrier + - paddle.distributed.broadcast + - paddle.distributed.broadcast_object_list + - paddle.distributed.communication.stream.all_gather + - paddle.distributed.communication.stream.all_reduce + - paddle.distributed.communication.stream.alltoall + - paddle.distributed.communication.stream.alltoall_single + - paddle.distributed.communication.stream.broadcast + - paddle.distributed.communication.stream.gather + - paddle.distributed.communication.stream.recv + - paddle.distributed.communication.stream.reduce + - paddle.distributed.communication.stream.reduce_scatter + - paddle.distributed.communication.stream.scatter + - paddle.distributed.communication.stream.send From f362c6960811d4648ede304812ecf6486aea2aee Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Wed, 27 Nov 2024 10:48:14 +0800 Subject: [PATCH 02/22] support distributed op --- paddleapex/apex/run_distributed.py | 612 ++++++++++++++++++ paddleapex/apex/run_paddle.py | 5 +- paddleapex/api_tracer/api_info.py | 27 +- paddleapex/api_tracer/config.py | 2 +- paddleapex/api_tracer/configs/op_target.yaml | 86 +-- .../api_tracer/configs/tool_config.yaml | 8 +- 6 files changed, 688 insertions(+), 52 deletions(-) create mode 100644 paddleapex/apex/run_distributed.py diff --git a/paddleapex/apex/run_distributed.py b/paddleapex/apex/run_distributed.py new file mode 100644 index 0000000..6d35e57 --- /dev/null +++ b/paddleapex/apex/run_distributed.py @@ -0,0 +1,612 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddlenlp # if you wanna test nlp fusion operations +import argparse +import os +import shutil +import time +import copy +from tqdm import tqdm +import paddle +import paddle.distributed as dist +from paddle import framework +from paddle.base import core +from utils import ( + print_info_log, + gen_api_params, + api_json_read, + check_grad_list, + rand_like, + gen_args, + print_warn_log, +) + +type_map = { + "FP16": paddle.float16, + "FP32": paddle.float32, + "BF16": paddle.bfloat16, +} +Warning_list = [] + +current_time = time.strftime("%Y%m%d%H%M%S") + +tqdm_params = { + "smoothing": 0, # 平滑进度条的预计剩余时间,取值范围0到1 + "desc": "Processing", # 进度条前的描述文字 + "leave": True, # 迭代完成后保留进度条的显示 + "ncols": 75, # 进度条的固定宽度 + "mininterval": 0.1, # 更新进度条的最小间隔秒数 + "maxinterval": 1.0, # 更新进度条的最大间隔秒数 + "miniters": 1, # 更新进度条之间的最小迭代次数 + "ascii": None, # 根据环境自动使用ASCII或Unicode字符 + "unit": "it", # 迭代单位 + "unit_scale": True, # 自动根据单位缩放 + "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 + "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 +} +PROFILE_RUN_TIMES = 1 + +def recursive_delete_arg(arg_in): + if isinstance(arg_in, (list, tuple)): + for item in arg_in: + recursive_delete_arg(item) + return + elif isinstance(arg_in, paddle.Tensor): + del arg_in + return +def get_shape(arg_in): + if isinstance(arg_in, (list, tuple)): + res = [] + for item in arg_in: + ret_value = get_shape(item) + res.append(ret_value) + return res + elif isinstance(arg_in, paddle.Tensor): + shape = arg_in.shape + return shape + +def merge_two_lists(lst1, lst2): + merged_list = [] + if lst1 is None and lst2 is not None: + merged_list = lst2 + elif lst1 is not None and lst2 is None: + merged_list = lst1 + elif lst1 is None and lst2 is None: + merged_list = [] + else: + for item in lst1: + if item is None: + continue + else: + merged_list.append(item) + for item in lst2: + if item is None: + continue + else: + merged_list.append(item) + return merged_list + +def convert_out2fp32(arg_in): + flag = False + if isinstance(arg_in, (list, tuple)): + res = [] + for item in arg_in: + ret_flag, ret_value = convert_out2fp32(item) + res.append(ret_value) + flag = flag or ret_flag + return flag, res + elif isinstance(arg_in, paddle.Tensor): + if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": + try: + arg_in = arg_in.cast("float32") + flag = True + except Exception as err: + print(arg_in) + return False, arg_in + return flag, arg_in + + +def recursive_arg_to_cpu(arg_in): + if isinstance(arg_in, (list, tuple)): + res = [] + for item in arg_in: + res.append(recursive_arg_to_cpu(item)) + return res + elif isinstance(arg_in, paddle.Tensor): + arg_in = arg_in.to( + "cpu" + ) # avoid using .cpu(), which will cause the gradient to be lost + return arg_in + + +def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): + if isinstance(arg_in, (list, tuple)): + return type(arg_in)( + recursive_arg_to_device(arg, backend, enforce_dtype) for arg in arg_in + ) + elif isinstance(arg_in, paddle.Tensor): + grad_status = arg_in.stop_gradient + with paddle.no_grad(): + if "gpu" in backend: + arg_in = arg_in.cuda() + if "cpu" in backend: + arg_in = arg_in.cpu() + if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": + arg_in = arg_in.cast("float32") + else: + arg_in = arg_in.to(backend) + if enforce_dtype and arg_in.dtype.name in ["BF16", "BFLOAT16", "FP16", "FP32"]: + arg_in = arg_in.cast(enforce_dtype) + arg_in.stop_gradient = grad_status + return arg_in + else: + return arg_in + + +def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): + if not dist.get_rank() == 0: + return + if dtype_name == "": + bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) + fwd_output_dir = os.path.abspath(os.path.join(out_path, "output")) + else: + bwd_output_dir = os.path.abspath( + os.path.join(out_path, dtype_name, "output_backward") + ) + fwd_output_dir = os.path.abspath(os.path.join(out_path, dtype_name, "output")) + fwd_output_path = os.path.join(fwd_output_dir, api_call_name) + bwd_output_path = os.path.join(bwd_output_dir, api_call_name) + os.makedirs(fwd_output_dir, exist_ok=True) + os.makedirs(bwd_output_dir, exist_ok=True) + if isinstance(forward_res, (type(None), list, tuple, paddle.Tensor)): + try: + fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) + paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) + except Exception as err: + msg = "save_forward Error: %s" % str(err) + print_warn_log(msg) + return + else: + print(forward_res) + print_warn_log("forward_res not supported!") + if isinstance(backward_res, (type(None), list, tuple, paddle.Tensor)): + try: + bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) + paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) + except Exception as err: + msg = "save_bacward Error: %s" % str(err) + print_warn_log(msg) + return + else: + print(backward_res) + print_warn_log("bacward_res not supported!") + + +def evoke_related_test_func(test_mode): + func_method = [] + if "acc" in test_mode: + func_method.append(run_acc_case) + if "mem" in test_mode: + func_method.append(run_mem_case) + if "pro" in test_mode: + func_method.append(run_profile_case) + if test_mode == "all": + return [run_acc_case, run_mem_case, run_profile_case] + if len(func_method) == 0: + raise ValueError("test mode is not supported!") + return func_method + + +def ut_case_parsing(forward_content, cfg): + run_case_funcs = evoke_related_test_func(cfg.test_mode) + backend = cfg.backend + out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" + os.mkdir(out_path) if not os.path.exists(out_path) else None + multi_dtype_ut = cfg.multi_dtype_ut.split(",") if cfg.multi_dtype_ut else [] + debug_case = cfg.test_case_name.split(",") if cfg.test_case_name else [] + print("debug_case", debug_case) + debug_mode = False + paddle.set_device(cfg.backend) + if len(debug_case) > 0: + debug_mode = True + enforce_types = [type_map[item] for item in multi_dtype_ut] + for i, (api_call_name, api_info_dict) in enumerate( + tqdm(forward_content.items(), **tqdm_params) + ): + print(api_call_name) + if debug_mode and api_call_name not in debug_case: + continue + if len(multi_dtype_ut) > 0: + for enforce_dtype in enforce_types: + print(api_call_name + "*" + enforce_dtype.name) + args = api_call_name, api_info_dict, backend, out_path + kwargs = {"enforce_dtype": enforce_dtype, "debug_case": debug_case, "real_data_path": cfg.real_data} + for run_case in run_case_funcs: + run_case(*args, **kwargs) + print("*" * 100) + else: + print(api_call_name) + args = api_call_name, api_info_dict, backend, out_path + kwargs = {"enforce_dtype": None, "debug_case": debug_case, "real_data_path": cfg.real_data} + if isinstance(run_case_funcs, list): + for run_case in run_case_funcs: + run_case(*args, **kwargs) + else: + run_case_funcs(*args, **kwargs) + print("*" * 100) + + +def create_input_args(api_info, backend, enforce_dtype=None, real_data_path=None): + args, kwargs, need_backward = gen_api_params(api_info, real_data_path) + device_args = recursive_arg_to_device(args, backend, enforce_dtype) + device_kwargs = { + key: recursive_arg_to_device(value, backend, enforce_dtype) + for key, value in kwargs.items() + } + return device_args, device_kwargs, need_backward + + +def create_dout(dout_info_dict, device_out, backend, enforce_dtype=None, real_data_path=None): + if dout_info_dict[0] != "Failed": + dout, _ = gen_args(dout_info_dict, real_data_path) + else: + print("dout dump json is None!") + dout = rand_like(device_out) + dout = recursive_arg_to_device(dout, backend, enforce_dtype) + return dout + + +def run_forward(api_call_name, device_args, device_kwargs): + api_call_stack = api_call_name.rsplit("*")[0] + try: + device_out = eval(api_call_stack)(*device_args, **device_kwargs) + paddle.device.synchronize() + return device_out + + except Exception as err: + msg = f"Run API {api_call_name} Forward Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return None + + +def get_grad_tensor(args, kwargs): + device_grad_out = [] + for arg in args: + if isinstance(arg, paddle.Tensor): + device_grad_out.append(arg.grad) + if isinstance(arg, list): # op: concat/stack + for x in arg: + if isinstance(x, paddle.Tensor): + device_grad_out.append(x.grad) + for k, v in kwargs.items(): + if isinstance(v, paddle.Tensor): + device_grad_out.append(v.grad) + if isinstance(v, list): # op: concat/stack + for x in v: + if isinstance(x, paddle.Tensor): + device_grad_out.append(x.grad) + return device_grad_out + + +def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=None): + if need_backward: + try: + paddle.autograd.backward([device_out], dout) + device_grad_out = get_grad_tensor(args, kwargs) + device_grad_out = check_grad_list(device_grad_out) + if device_grad_out is None: + msg = f"{api_call_name} grad_list is None" + Warning_list.append(msg) + return device_grad_out + except Exception as err: + msg = f"Run API {api_call_name} backward Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return None + else: + msg = f"{api_call_name} has no tensor required grad, SKIP Backward" + print_warn_log(msg) + Warning_list.append(msg) + return None + + +def run_acc_case( + api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None +): + api_info_dict_copy = copy.deepcopy(api_info_dict) + if not dist.get_rank() == 0 or "distributed" not in api_call_name: + real_data_path = None + device_args, device_kwargs, need_backward = create_input_args( + api_info_dict_copy, backend, enforce_dtype, real_data_path + ) + print(f"Running {api_call_name} acc test!") + # if api_call_name in debug_case: + # x = [device_args, device_kwargs] + # out_path = os.path.realpath(out_path) if out_path else "./" + # save_pth = os.path.join(out_path, "input_data", api_call_name) + # paddle.save(x, save_pth) + try: + device_out = run_forward(api_call_name, device_args, device_kwargs) + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return + + try: + device_grad_out = [] + if api_info_dict["dout_list"][0] != "Failed": + dout = create_dout( + api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path + ) + device_grad_out = run_backward( + api_call_name, device_out, dout, device_args, device_kwargs, need_backward + ) + else: + device_grad_out = None + except Exception as err: + msg = "Run_backward Error: %s" % str(err) + print_warn_log(msg) + if enforce_dtype: + save_tensor( + device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name + ) + else: + save_tensor(device_out, device_grad_out, out_path, api_call_name) + return + if enforce_dtype: + save_tensor( + device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name + ) + else: + save_tensor(device_out, device_grad_out, out_path, api_call_name) + return + + +def run_profile_case( + api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None +): + print(f"Running {api_call_name} profile test!") + api_info_dict_copy = copy.deepcopy(api_info_dict) + device_args, device_kwargs, need_backward = create_input_args( + api_info_dict_copy, backend, enforce_dtype, real_data_path + ) + # if api_call_name in debug_case: + # x = [device_args, device_kwargs] + # out_path = os.path.realpath(out_path) if out_path else "./" + # save_pth = os.path.join(out_path, "input_data", api_call_name) + # paddle.save(x, save_pth) + # device warmming up + try: + device_out = run_forward(api_call_name, device_args, device_kwargs) + if api_info_dict["dout_list"][0] != "Failed": + dout = create_dout( + api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path + ) + paddle.autograd.backward([device_out], dout) + else: + need_backward = False + except Exception as err: + msg = "Failed in device warming up: %s" % str(err) + print_warn_log(msg) + return + input_shape1 = get_shape(device_args) + input_shape2 = get_shape(device_kwargs) + input_shape_lst = merge_two_lists(input_shape1, input_shape2) + output_shape_lst = get_shape(device_out) + def profile_inner_loop_(): + try: + paddle.device.synchronize() + fwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + paddle.device.synchronize() + fwd_end_time = time.time() + fwd_time = fwd_end_time - fwd_start_time + fwd_time = fwd_time * 1000000 / float(PROFILE_RUN_TIMES) # fwd_time is in us + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return -1, -1 + try: + if not need_backward: + return fwd_time, -1 + paddle.device.synchronize() + bwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + paddle.autograd.backward([device_out], dout) + paddle.device.synchronize() + bwd_end_time = time.time() + bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second + bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us + bwd_time = bwd_time - fwd_time + except Exception as err: + msg = "Run_backward Error: %s" % str(err) + print_warn_log(msg) + return fwd_time, -1 + return fwd_time, bwd_time + + try: + fwd_time, bwd_time = profile_inner_loop_() + except Exception as err: + msg = f"Run {api_call_name} profile Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return + + if not enforce_dtype: + log_path = os.path.join(out_path, "profile_analyze.log") + else: + log_path = os.path.join(out_path, enforce_dtype.name, "profile_analyze.log") + + F = open(log_path, "a") + dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" + op_fwd = api_call_name + dtype + ".forward" + op_bwd = api_call_name + dtype + ".backward" + print_info_log(f"{op_fwd}:\t{fwd_time}") + print_info_log(f"{op_bwd}:\t{bwd_time}") + dtype = "\t" if not enforce_dtype else f"\t{enforce_dtype.name}" + msg_fwd = f"{api_call_name}.forward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tforward\t{fwd_time}" + msg_bwd = f"{api_call_name}.backward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tbackward\t{bwd_time}" + + F.write(msg_fwd + "\n") + F.write(msg_bwd + "\n") + F.close() + return + + +def run_mem_case( + api_call_name, + api_info_dict, + backend, + out_path, + enforce_dtype=None, + debug_case=[], # noqa + real_data_path=None +): + print(f"Running {api_call_name} mem test!") + + activation_cost = None + place = framework._current_expected_place_() + device_id = place.get_device_id() + before_run_mem = core.device_memory_stat_current_value("Allocated", device_id) + api_info_dict_copy = copy.deepcopy(api_info_dict) + device_args, device_kwargs, _ = create_input_args( + api_info_dict_copy, backend, enforce_dtype, real_data_path + ) + try: + device_out = run_forward(api_call_name, device_args, device_kwargs) + recursive_delete_arg(device_args) + for _, value in device_kwargs.items(): + recursive_delete_arg(value) + _ = recursive_arg_to_cpu(device_out) + after_run_mem = core.device_memory_stat_current_value("Allocated", device_id) + activation_cost = after_run_mem - before_run_mem + + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return + if not enforce_dtype: + log_path = os.path.join(out_path, "memory_analyze.log") + else: + log_path = os.path.join(out_path, enforce_dtype.name, "memory_analyze.log") + + os.mkdir(out_path) if not os.path.exists(out_path) else None + F = open(log_path, "a") + dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" + op_name = api_call_name + dtype + ".forward" + F.write(f"{op_name}:\t{str(activation_cost)}\n") + F.close() + return + + +def arg_parser(parser): + parser.add_argument( + "-json_file", + "--json_file", + dest="json_path", + default="", + type=str, + help="Dump json file path", + required=True, + ) + parser.add_argument( + "-out", + "--dump_path", + dest="out_path", + default="./paddle/", + type=str, + help=" The ut task result out path.", + required=False, + ) + parser.add_argument( + "-backend", + "--backend", + dest="backend", + default="gpu", + type=str, + help=" The running device DEVICE or BENCH.", + required=False, + ) + parser.add_argument( + "-dtype", + "--enforce-dtype", + dest="multi_dtype_ut", + default="", + type=str, + help="", + required=False, + ) + parser.add_argument( + "-real", + "--real_data", + dest="real_data", + default="", + type=str, + help="", + required=False, + ) + parser.add_argument( + "-op", + "--op_name", + dest="test_case_name", + default="", + type=str, + help="debug_op name", + required=False, + ) + parser.add_argument( + "-mode", + "--mode", + dest="test_mode", + default="all", + type=str, + help="debug_op name", + required=False, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + arg_parser(parser) + cfg = parser.parse_args() + print(cfg) + dist.init_parallel_env() + local_rank = dist.get_rank() + json_path = "/zhouxiangquan/llama10b/dump_info/rank" + str(local_rank) + "_step5/forward_rank" + str(local_rank) + "_all.json" + + cfg.backend = cfg.backend + ":" + str(local_rank) + cfg.json_path = json_path + + data_path = "/zhouxiangquan/llama10b/dump_info/rank" + str(local_rank) + "_step0/" + cfg.real_data = data_path + + forward_content = api_json_read(cfg.json_path) + out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" + if os.path.exists(out_path): + print_warn_log("The output path already exists and the file with the same name will be overwritten.") + out_path = out_path + "/rank_" + str(local_rank) + "/" + if not os.path.exists(out_path): + os.makedirs(out_path, exist_ok=True) + cfg.out_path = out_path + ut_case_parsing(forward_content, cfg) + print_info_log("UT save completed") + warning_log_pth = os.path.join(out_path, "./warning_log.txt") + File = open(warning_log_pth, "w") + for item in Warning_list: + File.write(item + "\n") + File.close() diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 74de4e4..8a8376c 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -# import paddlenlp # if you wanna test nlp fusion operations +#import paddlenlp # if you wanna test nlp fusion operations import argparse import os import shutil @@ -234,7 +234,7 @@ def ut_case_parsing(forward_content, cfg): else: print(api_call_name) args = api_call_name, api_info_dict, backend, out_path - kwargs = {"enforce_dtype": None, "debug_case": debug_case} + kwargs = {"enforce_dtype": None, "debug_case": debug_case, "real_data_path": cfg.real_data} if isinstance(run_case_funcs, list): for run_case in run_case_funcs: run_case(*args, **kwargs) @@ -576,6 +576,7 @@ def arg_parser(parser): parser = argparse.ArgumentParser() arg_parser(parser) cfg = parser.parse_args() + print(cfg) forward_content = api_json_read(cfg.json_path) out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" if os.path.exists(out_path): diff --git a/paddleapex/api_tracer/api_info.py b/paddleapex/api_tracer/api_info.py index c677845..6646b25 100644 --- a/paddleapex/api_tracer/api_info.py +++ b/paddleapex/api_tracer/api_info.py @@ -17,6 +17,7 @@ import numpy as np from paddleapex.api_tracer.Dump import dump_util from paddleapex.api_tracer.config import cfg +import paddle.distributed as dist Paddle_Type_Map = { "FP64": "paddle.float64", @@ -41,7 +42,12 @@ "FLOAT16", ] +# inf, nan def get_rounded_num(x, round_up=True): + if math.isinf(x) or math.isnan(x): + msg = f"warning, x is inf or nan" + print(msg, x) + return x if abs(x) <= 1e-10: return 0 @@ -79,6 +85,12 @@ def get_tensor_extremum(data): max_result = np.max(data_clone).item() min_result = np.min(data_clone).item() + if math.isinf(max_result) or math.isnan(max_result): + msg = f"warning, for max_result, where is a inf or nan, need to notice" + print(msg) + if math.isinf(min_result) or math.isnan(min_result): + msg = f"warning, for min_result, where is a inf or nan, need to notice" + print(msg) if cfg.dump_unique: ori_max_ = max_result ori_min_ = min_result @@ -153,7 +165,9 @@ def analyze_element(self, element): if element is None or isinstance(element, (bool, int, float, str, slice)): return self._analyze_builtin(element) - + + print(type(element)) + print(element) msg = f"In op:{self.op_name}, its args type {type(element)} is unsupported at analyze_element" print(msg) @@ -172,6 +186,12 @@ def effi_analyze_tensor(self, arg): if cfg.dump_unique and arg.dtype.name != "BOOL": ori_max_ = max_ ori_min_ = min_ + if math.isinf(ori_max_) or math.isnan(ori_max_): + msg = f"warning, for max_result, where is a inf or nan, need to notice" + print(msg) + if math.isinf(ori_min_) or math.isnan(ori_min_): + msg = f"warning, for min_result, where is a inf or nan, need to notice" + print(msg) max_ = get_rounded_num(ori_max_, True) min_ = get_rounded_num(ori_min_, False) if ori_min_ != ori_max_ else max_ single_arg.update({"Max": max_}) @@ -179,7 +199,7 @@ def effi_analyze_tensor(self, arg): single_arg.update({"Min": min_}) single_arg.update({"Min_origin": min_}) single_arg.update({"stop_gradient": arg.stop_gradient}) - if self.mode == "real_data": + if self.mode == "real_data" and (dist.get_rank() == 0 or "distributed" in self.op_name): api_args = self.op_name + "." + str(self.args_num) pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank) self.args_num += 1 @@ -204,7 +224,8 @@ def _analyze_tensor(self, arg): ) single_arg.update({"stop_gradient": arg.stop_gradient}) - if self.mode == "real_data": + # if self.mode == "real_data": + if self.mode == "real_data" and (dist.get_rank() == 0 or "distributed" in self.op_name): api_args = self.op_name + "." + str(self.args_num) pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank) self.args_num += 1 diff --git a/paddleapex/api_tracer/config.py b/paddleapex/api_tracer/config.py index 6b9d1f3..62ab99c 100644 --- a/paddleapex/api_tracer/config.py +++ b/paddleapex/api_tracer/config.py @@ -40,7 +40,7 @@ def __init__(self) -> None: print(f"You are using Apex Toolkit, Dump mode : {self.dump_mode}, Target step : {self.target_step}, profile mode : {self.profile_mode}") print("*" * 100) time.sleep(1) - self.global_step = 2 + self.global_step = 0 self.dump_state = False self.Op_count = {} self.prefix_op_name_ = None diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index 5fcad26..9ce1a89 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -1,8 +1,46 @@ ignored_op: - - paddle._C_ops.max - paddle._C_ops.min + - paddle._C_ops.max + - paddle.empty + - paddle.empty_like + - paddle.reshape + - paddle.reshape_ + - paddle.unsqueeze + - paddle.unsqueeze_ + - paddle.square_ + - paddle.Tensor.squeeze + - paddle.Tensor.squeeze_ + - paddle.Tensor.unsqueeze + - paddle.Tensor.unsqueeze_ + - paddle.squeeze_ + - paddle.ones + - paddle.ones_like + - paddle.split + - paddle.Tensor.zero_ + - paddle.stack + # distributed + - paddle.distributed.barrier + - paddle.distributed.broadcast_object_list + - paddle.distributed.communication.stream.alltoall_single + - paddle.distributed.communication.stream.broadcast + - paddle.distributed.communication.stream.gather + - paddle.distributed.communication.stream.recv + - paddle.distributed.communication.stream.reduce + - paddle.distributed.communication.stream.reduce_scatter + - paddle.distributed.communication.stream.scatter + - paddle.distributed.communication.stream.send + - paddle.distributed.all_gather + - paddle.distributed.all_gather_object + - paddle.distributed.all_reduce + - paddle.distributed.alltoall + - paddle.distributed.alltoall_single + - paddle.distributed.broadcast + - paddle.distributed.communication.stream.all_gather + - paddle.distributed.communication.stream.all_reduce + - paddle.distributed.communication.stream.alltoall target_op: + - paddle.nn.functional.scaled_dot_product_attention # Special op, paddle has wrapped op in framework. #noqa - paddle._C_ops.layer_norm #noqa - paddle.nn.functional.adaptive_avg_pool1d @@ -106,7 +144,6 @@ target_op: - paddle.nn.functional.relu_ - paddle.nn.functional.rnnt_loss - paddle.nn.functional.rrelu - - paddle.nn.functional.scaled_dot_product_attention - paddle.nn.functional.sdp_kernel - paddle.nn.functional.selu - paddle.nn.functional.sequence_mask @@ -230,8 +267,6 @@ target_op: - paddle.dstack - paddle.eigvalsh - paddle.einsum - - paddle.empty - - paddle.empty_like - paddle.equal - paddle.equal_all - paddle.erf @@ -367,8 +402,6 @@ target_op: - paddle.not_equal - paddle.not_equal_ - paddle.numel - - paddle.ones - - paddle.ones_like - paddle.outer - paddle.pdist - paddle.poisson @@ -395,8 +428,6 @@ target_op: - paddle.renorm - paddle.renorm_ - paddle.repeat_interleave - - paddle.reshape - - paddle.reshape_ - paddle.roll - paddle.rot90 - paddle.round @@ -421,13 +452,8 @@ target_op: - paddle.slice # - paddle.slice_scatter - paddle.sort - - paddle.split - paddle.sqrt - paddle.square - - paddle.square_ - - paddle.squeeze - - paddle.squeeze_ - - paddle.stack - paddle.standard_gamma - paddle.standard_normal - paddle.stanh @@ -463,8 +489,6 @@ target_op: - paddle.uniform - paddle.unique - paddle.unique_consecutive - - paddle.unsqueeze - - paddle.unsqueeze_ - paddle.unstack - paddle.vander - paddle.var @@ -792,8 +816,6 @@ target_op: - paddle.Tensor.sqrt - paddle.Tensor.sqrt_ - paddle.Tensor.square - - paddle.Tensor.squeeze - - paddle.Tensor.squeeze_ - paddle.Tensor.stack - paddle.Tensor.stanh - paddle.Tensor.std @@ -831,8 +853,6 @@ target_op: - paddle.Tensor.uniform_ - paddle.Tensor.unique - paddle.Tensor.unique_consecutive - - paddle.Tensor.unsqueeze - - paddle.Tensor.unsqueeze_ - paddle.Tensor.unstack - paddle.Tensor.vander - paddle.Tensor.var @@ -841,11 +861,10 @@ target_op: - paddle.Tensor.vsplit - paddle.Tensor.where - paddle.Tensor.where_ - - paddle.Tensor.zero_ #### experiment op: - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy + - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity + - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table + - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy - paddle._C_ops.fused_gemm_epilogue - paddle.optimizer.Adam - paddle.optimizer.AdamW @@ -879,23 +898,4 @@ target_op: - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm - paddlenlp.transformers.llama.fusion_ops.fusion_rope # - paddlenlp.transformers.llama.fusion_ops.swiglu -# distributed - - paddle.distributed.all_gather - - paddle.distributed.all_gather_object - - paddle.distributed.all_reduce - - paddle.distributed.alltoall - - paddle.distributed.alltoall_single - - paddle.distributed.barrier - - paddle.distributed.broadcast - - paddle.distributed.broadcast_object_list - - paddle.distributed.communication.stream.all_gather - - paddle.distributed.communication.stream.all_reduce - - paddle.distributed.communication.stream.alltoall - - paddle.distributed.communication.stream.alltoall_single - - paddle.distributed.communication.stream.broadcast - - paddle.distributed.communication.stream.gather - - paddle.distributed.communication.stream.recv - - paddle.distributed.communication.stream.reduce - - paddle.distributed.communication.stream.reduce_scatter - - paddle.distributed.communication.stream.scatter - - paddle.distributed.communication.stream.send + diff --git a/paddleapex/api_tracer/configs/tool_config.yaml b/paddleapex/api_tracer/configs/tool_config.yaml index 30375b1..6baa265 100644 --- a/paddleapex/api_tracer/configs/tool_config.yaml +++ b/paddleapex/api_tracer/configs/tool_config.yaml @@ -10,17 +10,19 @@ remote_path: "/root/paddlejob/workspace/PaddleAPEX_dump/" Async_dump: False # mode must be chosen from ["real_data", "random"] -dump_mode: "random" +dump_mode: "real_data" +# dump_mode: "random" # acclerate dump process by getting extremum value on device side. # In profile_mode, the speed of dump is 75% of vanilla speed. profile_mode: True # target_step is a list, dump api function will turn on at the specific step -target_step: [0] +target_step: [5] # Remove duplicate apis from dump_info and keep only one api in the same value range. -dump_unique: True +# dump_unique: True +dump_unique: False # Split dump_info into half-precision operators and other operators when saving json files split_dump: True From 78ac286dec06d8a50e95c7e7caa27079d0e554de Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Wed, 27 Nov 2024 14:16:51 +0800 Subject: [PATCH 03/22] support save distributed op result --- paddleapex/apex/run_distributed.py | 32 +- paddleapex/api_tracer/configs/op_target.yaml | 25 +- .../api_tracer/configs/op_target.yaml.all | 901 ++++++++++++++++++ 3 files changed, 952 insertions(+), 6 deletions(-) create mode 100644 paddleapex/api_tracer/configs/op_target.yaml.all diff --git a/paddleapex/apex/run_distributed.py b/paddleapex/apex/run_distributed.py index 6d35e57..96d9650 100644 --- a/paddleapex/apex/run_distributed.py +++ b/paddleapex/apex/run_distributed.py @@ -40,6 +40,26 @@ } Warning_list = [] +distributed_op = ["paddle.distributed.broadcast_object_list", + "paddle.distributed.barrier", + "paddle.distributed.communication.stream.alltoall_single", + "paddle.distributed.communication.stream.broadcast", + "paddle.distributed.communication.stream.gather", + "paddle.distributed.communication.stream.recv", + "paddle.distributed.communication.stream.reduce", + "paddle.distributed.communication.stream.reduce_scatter", + "paddle.distributed.communication.stream.scatter", + "paddle.distributed.communication.stream.send", + "paddle.distributed.all_gather", + "paddle.distributed.all_gather_object", + "paddle.distributed.all_reduce", + "paddle.distributed.alltoall", + "paddle.distributed.alltoall_single", + "paddle.distributed.broadcast", + "paddle.distributed.communication.stream.all_gather", + "paddle.distributed.communication.stream.all_reduce", + "paddle.distributed.communication.stream.alltoall"] + current_time = time.strftime("%Y%m%d%H%M%S") tqdm_params = { @@ -156,7 +176,7 @@ def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): - if not dist.get_rank() == 0: + if not dist.get_rank() == 0 and "distributed" not in api_call_name: return if dtype_name == "": bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) @@ -355,6 +375,9 @@ def run_acc_case( api_call_name, device_out, dout, device_args, device_kwargs, need_backward ) else: + if api_call_name.rsplit("*")[0] in distributed_op: + print('this is distributed op: ', api_call_name) + device_out = device_args device_grad_out = None except Exception as err: msg = "Run_backward Error: %s" % str(err) @@ -587,14 +610,15 @@ def arg_parser(parser): print(cfg) dist.init_parallel_env() local_rank = dist.get_rank() - json_path = "/zhouxiangquan/llama10b/dump_info/rank" + str(local_rank) + "_step5/forward_rank" + str(local_rank) + "_all.json" + json_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step5/forward_rank" + str(local_rank) + "_all.json" cfg.backend = cfg.backend + ":" + str(local_rank) cfg.json_path = json_path - data_path = "/zhouxiangquan/llama10b/dump_info/rank" + str(local_rank) + "_step0/" + data_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step0/" cfg.real_data = data_path - + cfg.real_data = None + forward_content = api_json_read(cfg.json_path) out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" if os.path.exists(out_path): diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index 9ce1a89..292942a 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -19,6 +19,28 @@ ignored_op: - paddle.Tensor.zero_ - paddle.stack # distributed +# - paddle.distributed.barrier +# - paddle.distributed.broadcast_object_list +# - paddle.distributed.communication.stream.alltoall_single +# - paddle.distributed.communication.stream.broadcast +# - paddle.distributed.communication.stream.gather +# - paddle.distributed.communication.stream.recv +# - paddle.distributed.communication.stream.reduce +# - paddle.distributed.communication.stream.reduce_scatter +# - paddle.distributed.communication.stream.scatter +# - paddle.distributed.communication.stream.send +# - paddle.distributed.all_gather +# - paddle.distributed.all_gather_object +# - paddle.distributed.all_reduce +# - paddle.distributed.alltoall +# - paddle.distributed.alltoall_single +# - paddle.distributed.broadcast +# - paddle.distributed.communication.stream.all_gather +# - paddle.distributed.communication.stream.all_reduce +# - paddle.distributed.communication.stream.alltoall + +target_op: + # distributed - paddle.distributed.barrier - paddle.distributed.broadcast_object_list - paddle.distributed.communication.stream.alltoall_single @@ -38,8 +60,7 @@ ignored_op: - paddle.distributed.communication.stream.all_gather - paddle.distributed.communication.stream.all_reduce - paddle.distributed.communication.stream.alltoall - -target_op: + # # - paddle.nn.functional.scaled_dot_product_attention # Special op, paddle has wrapped op in framework. #noqa - paddle._C_ops.layer_norm #noqa diff --git a/paddleapex/api_tracer/configs/op_target.yaml.all b/paddleapex/api_tracer/configs/op_target.yaml.all new file mode 100644 index 0000000..5fcad26 --- /dev/null +++ b/paddleapex/api_tracer/configs/op_target.yaml.all @@ -0,0 +1,901 @@ +ignored_op: + - paddle._C_ops.max + - paddle._C_ops.min + +target_op: + # Special op, paddle has wrapped op in framework. #noqa + - paddle._C_ops.layer_norm #noqa + - paddle.nn.functional.adaptive_avg_pool1d + - paddle.nn.functional.adaptive_avg_pool2d + - paddle.nn.functional.adaptive_avg_pool3d + - paddle.nn.functional.adaptive_max_pool1d + - paddle.nn.functional.adaptive_max_pool2d + - paddle.nn.functional.adaptive_max_pool3d + - paddle.nn.functional.affine_grid + - paddle.nn.functional.alpha_dropout + - paddle.nn.functional.avg_pool1d + - paddle.nn.functional.avg_pool2d + - paddle.nn.functional.avg_pool3d + - paddle.nn.functional.batch_norm + - paddle.nn.functional.bilinear + - paddle.nn.functional.binary_cross_entropy + - paddle.nn.functional.binary_cross_entropy_with_logits + - paddle.nn.functional.celu + - paddle.nn.functional.channel_shuffle + - paddle.nn.functional.class_center_sample + - paddle.nn.functional.common + - paddle.nn.functional.conv1d + - paddle.nn.functional.conv1d_transpose + - paddle.nn.functional.conv2d + - paddle.nn.functional.conv2d_transpose + - paddle.nn.functional.conv3d + - paddle.nn.functional.conv3d_transpose + - paddle.nn.functional.cosine_embedding_loss + - paddle.nn.functional.cosine_similarity + - paddle.nn.functional.cross_entropy + - paddle.nn.functional.ctc_loss + - paddle.nn.functional.diag_embed + - paddle.nn.functional.dice_loss + - paddle.nn.functional.distance + - paddle.nn.functional.dropout + - paddle.nn.functional.dropout2d + - paddle.nn.functional.dropout3d + - paddle.nn.functional.elu + - paddle.nn.functional.elu_ + - paddle.nn.functional.embedding + - paddle.nn.functional.extension + - paddle.nn.functional.flash_attention + - paddle.nn.functional.flash_attention_with_sparse_mask + - paddle.nn.functional.fractional_max_pool2d + - paddle.nn.functional.fractional_max_pool3d + - paddle.nn.functional.fold + - paddle.nn.functional.gather_tree + - paddle.nn.functional.gaussian_nll_loss + - paddle.nn.functional.gelu + - paddle.nn.functional.glu + - paddle.nn.functional.grid_sample + - paddle.nn.functional.gumbel_softmax + - paddle.nn.functional.hardshrink + - paddle.nn.functional.hardsigmoid + - paddle.nn.functional.hardswish + - paddle.nn.functional.hardtanh + - paddle.nn.functional.hardtanh_ + - paddle.nn.functional.hinge_embedding_loss + - paddle.nn.functional.hsigmoid_loss + - paddle.nn.functional.instance_norm + - paddle.nn.functional.interpolate + - paddle.nn.functional.kl_div + - paddle.nn.functional.l1_loss + - paddle.nn.functional.label_smooth + - paddle.nn.functional.layer_norm + - paddle.nn.functional.leaky_relu + - paddle.nn.functional.leaky_relu_ + - paddle.nn.functional.linear + - paddle.nn.functional.local_response_norm + - paddle.nn.functional.log_loss + - paddle.nn.functional.log_sigmoid + - paddle.nn.functional.log_softmax + - paddle.nn.functional.margin_cross_entropy + - paddle.nn.functional.margin_ranking_loss + - paddle.nn.functional.max_pool1d + - paddle.nn.functional.max_pool2d + - paddle.nn.functional.max_pool3d + - paddle.nn.functional.max_unpool1d + - paddle.nn.functional.max_unpool2d + - paddle.nn.functional.max_unpool3d + - paddle.nn.functional.maxout + - paddle.nn.functional.mish + - paddle.nn.functional.mse_loss + - paddle.nn.functional.multi_label_soft_margin_loss + - paddle.nn.functional.multi_margin_loss + - paddle.nn.functional.nll_loss + - paddle.nn.functional.norm + - paddle.nn.functional.normalize + - paddle.nn.functional.npair_loss + - paddle.nn.functional.one_hot + - paddle.nn.functional.pad + - paddle.nn.functional.pairwise_distance + - paddle.nn.functional.pdist + - paddle.nn.functional.pixel_shuffle + - paddle.nn.functional.pixel_unshuffle + - paddle.nn.functional.poisson_nll_loss + - paddle.nn.functional.pooling + - paddle.nn.functional.prelu + - paddle.nn.functional.relu + - paddle.nn.functional.relu6 + - paddle.nn.functional.relu_ + - paddle.nn.functional.rnnt_loss + - paddle.nn.functional.rrelu + - paddle.nn.functional.scaled_dot_product_attention + - paddle.nn.functional.sdp_kernel + - paddle.nn.functional.selu + - paddle.nn.functional.sequence_mask + - paddle.nn.functional.sigmoid + - paddle.nn.functional.sigmoid_focal_loss + - paddle.nn.functional.silu + - paddle.nn.functional.smooth_l1_loss + - paddle.nn.functional.soft_margin_loss + - paddle.nn.functional.softmax + - paddle.nn.functional.softmax_ + - paddle.nn.functional.softmax_with_cross_entropy + - paddle.nn.functional.softplus + - paddle.nn.functional.softshrink + - paddle.nn.functional.softsign + - paddle.nn.functional.sparse_attention + - paddle.nn.functional.square_error_cost + - paddle.nn.functional.swish + - paddle.nn.functional.tanh + - paddle.nn.functional.tanh_ + - paddle.nn.functional.tanhshrink + - paddle.nn.functional.temporal_shift + - paddle.nn.functional.thresholded_relu + - paddle.nn.functional.thresholded_relu_ + - paddle.nn.functional.triplet_margin_loss + - paddle.nn.functional.triplet_margin_with_distance_loss + - paddle.nn.functional.unfold + - paddle.nn.functional.upsample + - paddle.nn.functional.zeropad2d + - paddle.abs + - paddle.abs_ + - paddle.acos + - paddle.acos_ + - paddle.acosh + - paddle.acosh_ + - paddle.add + - paddle.add_n + - paddle.addmm + - paddle.addmm_ + - paddle.all + - paddle.allclose + - paddle.amax + - paddle.amin + - paddle.angle + - paddle.any + - paddle.arange + - paddle.argmax + - paddle.argmin + - paddle.argsort + - paddle.as_complex + - paddle.as_real + - paddle.as_strided + - paddle.asin + - paddle.asin_ + - paddle.asinh + - paddle.asinh_ + - paddle.assign + - paddle.atan + - paddle.atan2 + - paddle.atan_ + - paddle.atanh + - paddle.atanh_ + - paddle.atleast_1d + - paddle.atleast_2d + - paddle.atleast_3d + - paddle.bernoulli + - paddle.bincount + - paddle.binomial + - paddle.bitwise_and + - paddle.bitwise_and_ + - paddle.bitwise_not + - paddle.bitwise_not_ + - paddle.bitwise_or + - paddle.bitwise_or_ + - paddle.bitwise_xor + - paddle.bitwise_xor_ + - paddle.bmm + - paddle.broadcast_shape + - paddle.broadcast_tensors + - paddle.broadcast_to + - paddle.cauchy_ + - paddle.cast + - paddle.cdist + - paddle.ceil + - paddle.cholesky + - paddle.chunk + - paddle.clip + - paddle.column_stack + - paddle.combinations + - paddle.concat + - paddle.conj + - paddle.copysign + - paddle.copysign_ + - paddle.cos + - paddle.cos_ + - paddle.cosh + - paddle.cosh_ + - paddle.count_nonzero + - paddle.crop + - paddle.cross + - paddle.cummax + - paddle.cummin + - paddle.cumprod + - paddle.cumprod_ + - paddle.cumsum + - paddle.cumsum_ + - paddle.cumulative_trapezoid + - paddle.decomposition + - paddle.deg2rad + - paddle.diag + - paddle.diag_embed + - paddle.diagflat + - paddle.diagonal + - paddle.diagonal_scatter + - paddle.diff + - paddle.digamma + - paddle.digamma_ + - paddle.divide + - paddle.divide_ + - paddle.dot + - paddle.dsplit + - paddle.dstack + - paddle.eigvalsh + - paddle.einsum + - paddle.empty + - paddle.empty_like + - paddle.equal + - paddle.equal_all + - paddle.erf + - paddle.erf_ + - paddle.erfinv + - paddle.exp + - paddle.expand + - paddle.expand_as + - paddle.expm1 + - paddle.expm1_ + - paddle.eye + - paddle.fft + - paddle.flatten + - paddle.flatten_ + - paddle.flip + - paddle.floor + - paddle.floor_divide + - paddle.floor_divide_ + - paddle.floor_mod + - paddle.floor_mod_ + - paddle.fmax + - paddle.fmin + - paddle.frac + - paddle.frac_ + - paddle.frexp + - paddle.full + - paddle.full_like + - paddle.gather + - paddle.gather_nd + - paddle.gcd + - paddle.gcd_ + - paddle.greater_equal + - paddle.greater_equal_ + - paddle.greater_than + - paddle.greater_than_ + - paddle.heaviside + - paddle.histogram + - paddle.histogramdd + - paddle.hsplit + - paddle.hstack + - paddle.hypot + - paddle.hypot_ + - paddle.i0 + - paddle.i0_ + - paddle.i0e + - paddle.i1 + - paddle.i1e + - paddle.imag + - paddle.increment + - paddle.index_add + - paddle.index_add_ + - paddle.index_fill + - paddle.index_fill_ + - paddle.index_put + - paddle.index_put_ + - paddle.index_sample + - paddle.index_select + - paddle.inner + - paddle.kron + - paddle.kthvalue + - paddle.lcm + - paddle.lcm_ + - paddle.ldexp + - paddle.ldexp_ + - paddle.lerp + - paddle.less_equal + - paddle.less_equal_ + - paddle.less_than + - paddle.less_than_ + - paddle.lgamma + - paddle.lgamma_ + - paddle.linalg + - paddle.linspace + - paddle.log + - paddle.log10 + - paddle.log10_ + - paddle.log1p + - paddle.log1p_ + - paddle.log2 + - paddle.log2_ + - paddle.log_ + - paddle.logaddexp + - paddle.logcumsumexp + - paddle.logical_and + - paddle.logical_and_ + - paddle.logical_not + - paddle.logical_not_ + - paddle.logical_or + - paddle.logical_or_ + - paddle.logical_xor + - paddle.logical_xor_ + - paddle.logit + - paddle.logit_ + - paddle.logspace + - paddle.logsumexp + - paddle.masked_fill + - paddle.masked_fill_ + - paddle.masked_scatter + - paddle.masked_scatter_ + - paddle.masked_select + - paddle.matmul + - paddle.max + - paddle.maximum + - paddle.mean + - paddle.median + - paddle.meshgrid + - paddle.min + - paddle.minimum + - paddle.mm + - paddle.mod + - paddle.mod_ + - paddle.mode + - paddle.moveaxis + - paddle.multigammaln + - paddle.multigammaln_ + - paddle.multinomial + - paddle.multiplex + - paddle.multiply + - paddle.multiply_ + - paddle.mv + - paddle.nan_to_num + - paddle.nan_to_num_ + - paddle.nanmean + - paddle.nanmedian + - paddle.nanquantile + - paddle.nansum + - paddle.neg + - paddle.neg_ + - paddle.nextafter + - paddle.nonzero + - paddle.normal + - paddle.normal_ + - paddle.not_equal + - paddle.not_equal_ + - paddle.numel + - paddle.ones + - paddle.ones_like + - paddle.outer + - paddle.pdist + - paddle.poisson + - paddle.polar + - paddle.polygamma + - paddle.polygamma_ + - paddle.pow + - paddle.pow_ + - paddle.prod + - paddle.put_along_axis + - paddle.quantile + - paddle.rad2deg + - paddle.rand + - paddle.randint + - paddle.randint_like + - paddle.randn + - paddle.randperm + - paddle.reader + - paddle.real + - paddle.reciprocal + - paddle.regularizer + - paddle.remainder + - paddle.remainder_ + - paddle.renorm + - paddle.renorm_ + - paddle.repeat_interleave + - paddle.reshape + - paddle.reshape_ + - paddle.roll + - paddle.rot90 + - paddle.round + - paddle.row_stack + - paddle.rsqrt + - paddle.scale + - paddle.scatter + - paddle.scatter_ + # - paddle.scatter_nd # cause CUDA_ERROR ignored. + # - paddle.scatter_nd_add + - paddle.searchsorted + - paddle.select_scatter + - paddle.sgn + - paddle.shard_index + - paddle.sign + - paddle.signal + - paddle.signbit + - paddle.sin + - paddle.sin_ + - paddle.sinh + - paddle.sinh_ + - paddle.slice + # - paddle.slice_scatter + - paddle.sort + - paddle.split + - paddle.sqrt + - paddle.square + - paddle.square_ + - paddle.squeeze + - paddle.squeeze_ + - paddle.stack + - paddle.standard_gamma + - paddle.standard_normal + - paddle.stanh + - paddle.strided_slice + - paddle.subtract + - paddle.sum + - paddle.t + - paddle.t_ + - paddle.take + - paddle.take_along_axis + - paddle.tan + - paddle.tan_ + - paddle.tanh + - paddle.tanh_ + - paddle.tensordot + - paddle.tile + - paddle.topk + - paddle.trace + - paddle.transpose + - paddle.transpose_ + - paddle.trapezoid + - paddle.tril + - paddle.tril_ + - paddle.tril_indices + - paddle.triu + - paddle.triu_ + - paddle.triu_indices + - paddle.trunc + - paddle.trunc_ + - paddle.unbind + - paddle.unflatten + - paddle.unfold + - paddle.uniform + - paddle.unique + - paddle.unique_consecutive + - paddle.unsqueeze + - paddle.unsqueeze_ + - paddle.unstack + - paddle.vander + - paddle.var + - paddle.view + - paddle.view_as + - paddle.vsplit + - paddle.where + - paddle.where_ + - paddle.zeros + - paddle.zeros_like + # - paddle.Tensor.T + - paddle.Tensor.__add__ + - paddle.Tensor.__and__ + - paddle.Tensor.__radd__ + - paddle.Tensor.__div__ + - paddle.Tensor.__eq__ + - paddle.Tensor.__floordiv__ + - paddle.Tensor.__ge__ + - paddle.Tensor.__gt__ + - paddle.Tensor.__le__ + - paddle.Tensor.__lt__ + - paddle.Tensor.__matmul__ + - paddle.Tensor.__mod__ + - paddle.Tensor.__mul__ + - paddle.Tensor.__ne__ + - paddle.Tensor.__neg__ + - paddle.Tensor.__nonzero__ + - paddle.Tensor.__or__ + - paddle.Tensor.__pow__ + - paddle.Tensor.__radd__ + - paddle.Tensor.__rdiv__ + - paddle.Tensor.__rmul__ + - paddle.Tensor.__rpow__ + - paddle.Tensor.__rsub__ + - paddle.Tensor.__rtruediv__ + - paddle.Tensor.__sub__ + - paddle.Tensor.__truediv__ + - paddle.Tensor.__xor__ + - paddle.Tensor.abs + - paddle.Tensor.abs_ + - paddle.Tensor.acos + - paddle.Tensor.acos_ + - paddle.Tensor.acosh + - paddle.Tensor.acosh_ + - paddle.Tensor.add + - paddle.Tensor.add_ + - paddle.Tensor.add_n + - paddle.Tensor.addmm + - paddle.Tensor.addmm_ + - paddle.Tensor.all + - paddle.Tensor.allclose + - paddle.Tensor.amax + - paddle.Tensor.amin + - paddle.Tensor.angle + - paddle.Tensor.any + - paddle.Tensor.argmax + - paddle.Tensor.argmin + - paddle.Tensor.argsort + - paddle.Tensor.as_complex + - paddle.Tensor.as_real + - paddle.Tensor.as_strided + - paddle.Tensor.asin + - paddle.Tensor.asin_ + - paddle.Tensor.asinh + - paddle.Tensor.asinh_ + - paddle.Tensor.atan + - paddle.Tensor.atan2 + - paddle.Tensor.atan_ + - paddle.Tensor.atanh + - paddle.Tensor.atanh_ + - paddle.Tensor.atleast_1d + - paddle.Tensor.atleast_2d + - paddle.Tensor.atleast_3d + - paddle.Tensor.bincount + - paddle.Tensor.bitwise_and + - paddle.Tensor.bitwise_and_ + - paddle.Tensor.bitwise_not + - paddle.Tensor.bitwise_not_ + - paddle.Tensor.bitwise_or + - paddle.Tensor.bitwise_or_ + - paddle.Tensor.bitwise_xor + - paddle.Tensor.bitwise_xor_ + - paddle.Tensor.bmm + - paddle.Tensor.broadcast_shape + - paddle.Tensor.broadcast_tensors + - paddle.Tensor.broadcast_to + - paddle.Tensor.cauchy_ + - paddle.Tensor.cdist + - paddle.Tensor.ceil + - paddle.Tensor.ceil_ + - paddle.Tensor.cholesky + - paddle.Tensor.cholesky_solve + - paddle.Tensor.clip + - paddle.Tensor.clip_ + - paddle.Tensor.coalesce + - paddle.Tensor.cols + - paddle.Tensor.combinations + - paddle.Tensor.concat + - paddle.Tensor.cond + - paddle.Tensor.conj + - paddle.Tensor.contiguous + - paddle.Tensor.corrcoef + - paddle.Tensor.cos + - paddle.Tensor.cos_ + - paddle.Tensor.cosh + - paddle.Tensor.cosh_ + - paddle.Tensor.count_nonzero + - paddle.Tensor.cov + - paddle.Tensor.cross + - paddle.Tensor.crows + - paddle.Tensor.cummax + - paddle.Tensor.cummin + - paddle.Tensor.cumprod + - paddle.Tensor.cumprod_ + - paddle.Tensor.cumsum + - paddle.Tensor.cumsum_ + - paddle.Tensor.cumulative_trapezoid + - paddle.Tensor.deg2rad + - paddle.Tensor.diag + - paddle.Tensor.diag_embed + - paddle.Tensor.diagflat + - paddle.Tensor.diagonal + - paddle.Tensor.diagonal_scatter + - paddle.Tensor.diff + - paddle.Tensor.digamma + - paddle.Tensor.digamma_ + - paddle.Tensor.divide + - paddle.Tensor.divide_ + - paddle.Tensor.dot + - paddle.Tensor.eig + - paddle.Tensor.eigvals + - paddle.Tensor.eigvalsh + - paddle.Tensor.equal + - paddle.Tensor.equal_all + - paddle.Tensor.erf + - paddle.Tensor.erfinv + - paddle.Tensor.erfinv_ + - paddle.Tensor.exp + - paddle.Tensor.exp_ + - paddle.Tensor.expand + - paddle.Tensor.expand_as + - paddle.Tensor.expm1 + - paddle.Tensor.exponential_ + - paddle.Tensor.fill_ + - paddle.Tensor.fill_diagonal_ + - paddle.Tensor.fill_diagonal_tensor + - paddle.Tensor.fill_diagonal_tensor_ + - paddle.Tensor.flatten + - paddle.Tensor.flatten_ + - paddle.Tensor.flip + - paddle.Tensor.floor + - paddle.Tensor.floor_ + - paddle.Tensor.floor_divide + - paddle.Tensor.floor_divide_ + - paddle.Tensor.floor_mod + - paddle.Tensor.floor_mod_ + - paddle.Tensor.fmax + - paddle.Tensor.fmin + - paddle.Tensor.frac + - paddle.Tensor.frac_ + - paddle.Tensor.frexp + - paddle.Tensor.gather + - paddle.Tensor.gather_nd + - paddle.Tensor.gcd + - paddle.Tensor.gcd_ + - paddle.Tensor.get_selected_rows + - paddle.Tensor.get_strides + - paddle.Tensor.greater_equal + - paddle.Tensor.greater_equal_ + - paddle.Tensor.greater_than + - paddle.Tensor.greater_than_ + - paddle.Tensor.heaviside + - paddle.Tensor.histogram + - paddle.Tensor.histogramdd + - paddle.Tensor.hsplit + - paddle.Tensor.hypot + - paddle.Tensor.hypot_ + - paddle.Tensor.i0 + - paddle.Tensor.i0_ + - paddle.Tensor.i0e + - paddle.Tensor.i1 + - paddle.Tensor.i1e + - paddle.Tensor.imag + - paddle.Tensor.increment + - paddle.Tensor.index_add + - paddle.Tensor.index_add_ + - paddle.Tensor.index_fill + - paddle.Tensor.index_fill_ + - paddle.Tensor.index_put + - paddle.Tensor.index_put_ + - paddle.Tensor.index_sample + - paddle.Tensor.index_select + - paddle.Tensor.inner + - paddle.Tensor.kron + - paddle.Tensor.kthvalue + - paddle.Tensor.layout + - paddle.Tensor.lcm + - paddle.Tensor.lcm_ + - paddle.Tensor.ldexp + - paddle.Tensor.ldexp_ + - paddle.Tensor.lerp + - paddle.Tensor.lerp_ + - paddle.Tensor.less_equal + - paddle.Tensor.less_equal_ + - paddle.Tensor.less_than + - paddle.Tensor.less_than_ + - paddle.Tensor.lgamma + - paddle.Tensor.lgamma_ + - paddle.Tensor.log + - paddle.Tensor.log10 + - paddle.Tensor.log10_ + - paddle.Tensor.log1p + - paddle.Tensor.log1p_ + - paddle.Tensor.log2 + - paddle.Tensor.log2_ + - paddle.Tensor.log_ + - paddle.Tensor.logaddexp + - paddle.Tensor.logcumsumexp + - paddle.Tensor.logical_and + - paddle.Tensor.logical_and_ + - paddle.Tensor.logical_not + - paddle.Tensor.logical_not_ + - paddle.Tensor.logical_or + - paddle.Tensor.logical_or_ + - paddle.Tensor.logical_xor + - paddle.Tensor.logical_xor_ + - paddle.Tensor.logit + - paddle.Tensor.logit_ + - paddle.Tensor.logsumexp + - paddle.Tensor.lstsq + - paddle.Tensor.lu + - paddle.Tensor.lu_unpack + - paddle.Tensor.masked_fill + - paddle.Tensor.masked_fill_ + - paddle.Tensor.masked_select + - paddle.Tensor.masked_scatter + - paddle.Tensor.masked_scatter_ + - paddle.Tensor.matmul + - paddle.Tensor.matrix_power + - paddle.Tensor.max + - paddle.Tensor.maximum + - paddle.Tensor.mean + - paddle.Tensor.median + - paddle.Tensor.min + - paddle.Tensor.minimum + - paddle.Tensor.mm + - paddle.Tensor.mod + - paddle.Tensor.mod_ + - paddle.Tensor.mode + - paddle.Tensor.moveaxis + - paddle.Tensor.multi_dot + - paddle.Tensor.multigammaln + - paddle.Tensor.multigammaln_ + - paddle.Tensor.multinomial + - paddle.Tensor.multiplex + - paddle.Tensor.multiply + - paddle.Tensor.multiply_ + - paddle.Tensor.mv + - paddle.Tensor.nan_to_num + - paddle.Tensor.nan_to_num_ + - paddle.Tensor.nanmean + - paddle.Tensor.nanmedian + - paddle.Tensor.nanquantile + - paddle.Tensor.nansum + - paddle.Tensor.ndimension + - paddle.Tensor.neg + - paddle.Tensor.neg_ + - paddle.Tensor.nnz + - paddle.Tensor.nonzero + - paddle.Tensor.norm + - paddle.Tensor.normal_ + - paddle.Tensor.not_equal + - paddle.Tensor.not_equal_ + - paddle.Tensor.numel + - paddle.Tensor.offset + - paddle.Tensor.outer + - paddle.Tensor.pca_lowrank + - paddle.Tensor.pinv + - paddle.Tensor.polar + - paddle.Tensor.polygamma + - paddle.Tensor.polygamma_ + - paddle.Tensor.pow + - paddle.Tensor.pow_ + - paddle.Tensor.process_mesh + - paddle.Tensor.prod + - paddle.Tensor.put_along_axis + - paddle.Tensor.put_along_axis_ + - paddle.Tensor.qr + - paddle.Tensor.quantile + - paddle.Tensor.rad2deg + - paddle.Tensor.remainder + - paddle.Tensor.remainder_ + - paddle.Tensor.renorm + - paddle.Tensor.renorm_ + - paddle.Tensor.repeat_interleave + - paddle.Tensor.reverse + - paddle.Tensor.roll + - paddle.Tensor.rot90 + - paddle.Tensor.round + - paddle.Tensor.round_ + - paddle.Tensor.rows + - paddle.Tensor.rsqrt + - paddle.Tensor.rsqrt_ + - paddle.Tensor.scale + - paddle.Tensor.scale_ + - paddle.Tensor.scatter + - paddle.Tensor.scatter_ + - paddle.Tensor.scatter_nd + - paddle.Tensor.scatter_nd_add + - paddle.Tensor.select_scatter + - paddle.Tensor.sgn + - paddle.Tensor.shard_index + - paddle.Tensor.sigmoid + - paddle.Tensor.sigmoid_ + - paddle.Tensor.sign + - paddle.Tensor.sin + - paddle.Tensor.sin_ + - paddle.Tensor.sinh + - paddle.Tensor.sinh_ + - paddle.Tensor.size + - paddle.Tensor.slice + - paddle.Tensor.solve + - paddle.Tensor.sort + - paddle.Tensor.split + - paddle.Tensor.sqrt + - paddle.Tensor.sqrt_ + - paddle.Tensor.square + - paddle.Tensor.squeeze + - paddle.Tensor.squeeze_ + - paddle.Tensor.stack + - paddle.Tensor.stanh + - paddle.Tensor.std + - paddle.Tensor.stft + - paddle.Tensor.strided_slice + - paddle.Tensor.strides + - paddle.Tensor.subtract + - paddle.Tensor.subtract_ + - paddle.Tensor.sum + - paddle.Tensor.t + - paddle.Tensor.t_ + - paddle.Tensor.take + - paddle.Tensor.take_along_axis + - paddle.Tensor.tan + - paddle.Tensor.tan_ + - paddle.Tensor.tanh + - paddle.Tensor.tanh_ + - paddle.Tensor.tensordot + - paddle.Tensor.tile + - paddle.Tensor.top_p_sampling + - paddle.Tensor.topk + - paddle.Tensor.trace + - paddle.Tensor.transpose + - paddle.Tensor.transpose_ + - paddle.Tensor.trapezoid + - paddle.Tensor.tril + - paddle.Tensor.tril_ + - paddle.Tensor.triu + - paddle.Tensor.triu_ + - paddle.Tensor.trunc + - paddle.Tensor.trunc_ + - paddle.Tensor.unbind + - paddle.Tensor.unflatten + - paddle.Tensor.unfold + - paddle.Tensor.uniform_ + - paddle.Tensor.unique + - paddle.Tensor.unique_consecutive + - paddle.Tensor.unsqueeze + - paddle.Tensor.unsqueeze_ + - paddle.Tensor.unstack + - paddle.Tensor.vander + - paddle.Tensor.var + - paddle.Tensor.view + - paddle.Tensor.view_as + - paddle.Tensor.vsplit + - paddle.Tensor.where + - paddle.Tensor.where_ + - paddle.Tensor.zero_ + #### experiment op: + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy + - paddle._C_ops.fused_gemm_epilogue + - paddle.optimizer.Adam + - paddle.optimizer.AdamW + - paddle._C_ops.adamw + - paddle._C_ops.adamw_ + - paddle._legacy_C_ops.fused_gemm_epilogue + - paddle.incubate.nn.functional.fused_multi_head_attention + - paddle.incubate.nn.functional.fused_feedforward + - paddle.incubate.nn.functional.fused_multi_transformer + - paddle.incubate.nn.functional.fused_linear + - paddle.incubate.nn.functional.fused_linear_activation + - paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm + - paddle.incubate.nn.functional.fused_ec_moe + - paddle.incubate.nn.functional.fused_dropout_add + - paddle.incubate.nn.functional.fused_rotary_position_embedding + - paddle.incubate.nn.functional.variable_length_memory_efficient_attention + - paddle.incubate.nn.functional.fused_rms_norm + - paddle.incubate.nn.functional.fused_layer_norm + - paddle.incubate.nn.functional.masked_multihead_attention + - paddle.incubate.nn.functional.block_multihead_attention + - paddle.incubate.nn.functional.swiglu + - paddle.incubate.nn.functional.fused_matmul_bias + - paddle.tensor.fill_constant + - paddle.nn.clip._squared_l2_norm + - paddle.uniform + - paddle._C_ops.gaussian + - paddle._legacy_C_ops.c_identity + - paddle.distributed.fleet.layers.mpu.mp_ops. + - paddlenlp.transformers.llama.fusion_ops.fused_rotary_position_embedding + - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention + - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm + - paddlenlp.transformers.llama.fusion_ops.fusion_rope + # - paddlenlp.transformers.llama.fusion_ops.swiglu +# distributed + - paddle.distributed.all_gather + - paddle.distributed.all_gather_object + - paddle.distributed.all_reduce + - paddle.distributed.alltoall + - paddle.distributed.alltoall_single + - paddle.distributed.barrier + - paddle.distributed.broadcast + - paddle.distributed.broadcast_object_list + - paddle.distributed.communication.stream.all_gather + - paddle.distributed.communication.stream.all_reduce + - paddle.distributed.communication.stream.alltoall + - paddle.distributed.communication.stream.alltoall_single + - paddle.distributed.communication.stream.broadcast + - paddle.distributed.communication.stream.gather + - paddle.distributed.communication.stream.recv + - paddle.distributed.communication.stream.reduce + - paddle.distributed.communication.stream.reduce_scatter + - paddle.distributed.communication.stream.scatter + - paddle.distributed.communication.stream.send From 88883a0523160748c0053c7a423b90cea4c129c0 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Thu, 28 Nov 2024 17:05:49 +0800 Subject: [PATCH 04/22] fix some bugs of read real data --- paddleapex/apex/run_distributed.py | 20 ++++--- paddleapex/apex/run_llama10b_xpu.sh | 64 ++++++++++++++++++++++ paddleapex/apex/run_llama10b_xpu_new.sh | 73 +++++++++++++++++++++++++ paddleapex/apex/run_llama20b_xpu.sh | 44 +++++++++++++++ 4 files changed, 194 insertions(+), 7 deletions(-) create mode 100755 paddleapex/apex/run_llama10b_xpu.sh create mode 100755 paddleapex/apex/run_llama10b_xpu_new.sh create mode 100755 paddleapex/apex/run_llama20b_xpu.sh diff --git a/paddleapex/apex/run_distributed.py b/paddleapex/apex/run_distributed.py index 96d9650..d95e304 100644 --- a/paddleapex/apex/run_distributed.py +++ b/paddleapex/apex/run_distributed.py @@ -347,7 +347,7 @@ def run_acc_case( api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None ): api_info_dict_copy = copy.deepcopy(api_info_dict) - if not dist.get_rank() == 0 or "distributed" not in api_call_name: + if not dist.get_rank() == 0 and "distributed" not in api_call_name: real_data_path = None device_args, device_kwargs, need_backward = create_input_args( api_info_dict_copy, backend, enforce_dtype, real_data_path @@ -610,14 +610,20 @@ def arg_parser(parser): print(cfg) dist.init_parallel_env() local_rank = dist.get_rank() - json_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step5/forward_rank" + str(local_rank) + "_all.json" - + # json_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step5/forward_rank" + str(local_rank) + "_all.json" + json_path_list = cfg.json_path.split(' ') + data_path_list = cfg.real_data.split(' ') + + print("json_path_list", json_path_list) + print("data_path_list", data_path_list) + + cfg.json_path = json_path_list[local_rank] + cfg.real_data = data_path_list[local_rank] cfg.backend = cfg.backend + ":" + str(local_rank) - cfg.json_path = json_path - data_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step0/" - cfg.real_data = data_path - cfg.real_data = None + print(cfg) + # data_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step0/" + # cfg.real_data = None forward_content = api_json_read(cfg.json_path) out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" diff --git a/paddleapex/apex/run_llama10b_xpu.sh b/paddleapex/apex/run_llama10b_xpu.sh new file mode 100755 index 0000000..7db7f5e --- /dev/null +++ b/paddleapex/apex/run_llama10b_xpu.sh @@ -0,0 +1,64 @@ +#!/bin/bash +task_name_or_path="llama-10b" +#export XPUAPI_DEBUG=0x1 +#export XPURT_DISPATCH_MODE=PROFILING +export XPU_FORCE_USERMODE_LAUNCH=1 +export PYTHONPATH=$PYTHONPATH:/workspace/APEX/PaddleAPEX:/workspace/APEX/PaddleNLP + +runtime_location=/workspace/so-runtime +bkcl_location=/workspace/so-bkcl +export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:$LD_LIBRARY_PATH + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +# export XPU_CDNN_CLUSTER_PARALLEL=1 +# export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 + +export XPUAPI_DEBUG=0x1 + +# BKCL +# export BKCL_DEBUG=1 +# Multi-computer RDMA +export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_FORCE_TREE=1 +export BKCL_TREE_THRESHOLD=0 +#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 +#export BKCL_SOCKET_IFNAME=eth0 +export BKCL_FORCE_L3_RDMA=0 +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +echo "bkcl version:" +strings ${bkcl_location}/libbkcl.so | grep COM +master_ip=$POD_0_IP +nnodes=$PADDLE_TRAINERS_NUM +echo "master ip:" +echo $master_ip + +export CUDA_DEVICE_MAX_CONNECTIONS=8 + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) +echo "PaddleNLP_DIR: "$PaddleNLP_DIR + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 + +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 +export XPU_PADDLE_FC_LOCAL_INT16=1 + + +python run_paddle.py -json /workspace/APEX/llama10b/dump_info/rank0_step0/forward_rank0_all.json -backend xpu -out /workspace/APEX/llama10b/ -mode acc diff --git a/paddleapex/apex/run_llama10b_xpu_new.sh b/paddleapex/apex/run_llama10b_xpu_new.sh new file mode 100755 index 0000000..7e86f40 --- /dev/null +++ b/paddleapex/apex/run_llama10b_xpu_new.sh @@ -0,0 +1,73 @@ +#!/bin/bash +task_name_or_path="llama-10b" +export XPUAPI_DEBUG=0x1 +#export XPURT_DISPATCH_MODE=PROFILING +export XPU_FORCE_USERMODE_LAUNCH=1 + +runtime_location=/workspace/so-runtime +bkcl_location=/workspace/so-bkcl +export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:$LD_LIBRARY_PATH + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +# export XPU_CDNN_CLUSTER_PARALLEL=1 +# export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 + +# BKCL +# export BKCL_DEBUG=1 +# Multi-computer RDMA +export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_FORCE_TREE=1 +export BKCL_TREE_THRESHOLD=0 +#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 +#export BKCL_SOCKET_IFNAME=eth0 +export BKCL_FORCE_L3_RDMA=0 +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +echo "bkcl version:" +strings ${bkcl_location}/libbkcl.so | grep COM +master_ip=$POD_0_IP +nnodes=$PADDLE_TRAINERS_NUM +echo "master ip:" +echo $master_ip + +export CUDA_DEVICE_MAX_CONNECTIONS=8 + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) +echo "PaddleNLP_DIR: "$PaddleNLP_DIR + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 + +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 +export XPU_PADDLE_FC_LOCAL_INT16=1 +export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 + +export XPU_AUTO_BF16_TF32_RADIO=1 # 设置比例 0.001, XPU_AUTO_BF16_TF32_RADIO/1000 +export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 +export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 + +export PYTHONPATH=$PYTHONPATH:/workspace/APEX/PaddleNLP:/workspace/AA/PaddleAPEX + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ + -json \ + "/workspace/APEX/PaddleNLP/dump_info/rank0_step5/forward_rank0_all.json /workspace/APEX/PaddleNLP/dump_info/rank1_step5/forward_rank1_all.json /workspace/APEX/PaddleNLP/dump_info/rank2_step5/forward_rank2_all.json /workspace/APEX/PaddleNLP/dump_info/rank3_step5/forward_rank3_all.json /workspace/APEX/PaddleNLP/dump_info/rank4_step5/forward_rank4_all.json /workspace/APEX/PaddleNLP/dump_info/rank5_step5/forward_rank5_all.json /workspace/APEX/PaddleNLP/dump_info/rank6_step5/forward_rank6_all.json /workspace/APEX/PaddleNLP/dump_info/rank7_step5/forward_rank7_all.json" \ + -backend xpu \ + -real \ + "/workspace/APEX/PaddleNLP/dump_info/rank0_step0/ /workspace/APEX/PaddleNLP/dump_info/rank1_step0/ /workspace/APEX/PaddleNLP/dump_info/rank2_step0/ /workspace/APEX/PaddleNLP/dump_info/rank3_step0/ /workspace/APEX/PaddleNLP/dump_info/rank4_step0/ /workspace/APEX/PaddleNLP/dump_info/rank5_step0/ /workspace/APEX/PaddleNLP/dump_info/rank6_step0/ /workspace/APEX/PaddleNLP/dump_info/rank7_step0/" \ + -out /workspace/APEX/llama10b/result/ -mode acc diff --git a/paddleapex/apex/run_llama20b_xpu.sh b/paddleapex/apex/run_llama20b_xpu.sh new file mode 100755 index 0000000..53d36b0 --- /dev/null +++ b/paddleapex/apex/run_llama20b_xpu.sh @@ -0,0 +1,44 @@ +#!/bin/bash +task_name_or_path="llama-20b" +export XPUAPI_DEBUG=0x1 +#export XPURT_DISPATCH_MODE=PROFILING +export XPU_FORCE_USERMODE_LAUNCH=1 +export PYTHONPATH=$PYTHONPATH:/workspace/APEX/PaddleAPEX:/workspace/APEX/PaddleNLP + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +#export XPU_CDNN_CLUSTER_PARALLEL=1 +#export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 +export XPU_PADDLE_FC_LOCAL_INT16=1 + +# BKCL +# export BKCL_DEBUG=1 +# Multi-computer RDMA +#export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_FORCE_TREE=0 +export BKCL_TREE_THRESHOLD=0 +#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 +#export BKCL_SOCKET_IFNAME=eth0 +export BKCL_FORCE_L3_RDMA=0 + +export CUDA_DEVICE_MAX_CONNECTIONS=8 +export CUDA_DEVICE_ORDER=OAM_ID + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py -json ./ -backend xpu -out /workspace/APEX/llama20b/distributed/ -mode pro +# python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py -json ./ -backend xpu -out /workspace/APEX/llama20b/distributed/ -mode acc +# python run_paddle.py -json /workspace/APEX/llama20b/dump_info/rank0_step0/forward_rank0_all.json -backend xpu -out /workspace/APEX/llama20b/ -mode acc +# python run_paddle.py -real /workspace/APEX/scaled_dot_product_attention/dump_info/rank0_step0/ -json /workspace/APEX/scaled_dot_product_attention/dump_info/rank0_step5/forward_rank0_all.json -backend xpu -out /workspace/APEX/scaled_dot_product_attention/ -mode acc From 2d50e4375c0fc1b199f666480723fdbc7d97f9c5 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Fri, 29 Nov 2024 16:17:07 +0800 Subject: [PATCH 05/22] accelerate computation of run_acc and acc_cmp --- paddleapex/apex/acc_direct_cmp_zxq.py | 181 +++++ paddleapex/apex/run_distributed.py | 26 +- paddleapex/apex/run_llama10b_xpu_32k.sh | 73 ++ .../apex/run_llama10b_xpu_distributed.sh | 80 +++ paddleapex/apex/run_without_distributed.py | 633 ++++++++++++++++++ paddleapex/apex/split_distributed.py | 60 ++ paddleapex/api_tracer/Dump.py | 10 +- paddleapex/api_tracer/api_info.py | 2 +- paddleapex/api_tracer/configs/op_target.yaml | 13 +- 9 files changed, 1066 insertions(+), 12 deletions(-) create mode 100644 paddleapex/apex/acc_direct_cmp_zxq.py create mode 100755 paddleapex/apex/run_llama10b_xpu_32k.sh create mode 100755 paddleapex/apex/run_llama10b_xpu_distributed.sh create mode 100644 paddleapex/apex/run_without_distributed.py create mode 100644 paddleapex/apex/split_distributed.py diff --git a/paddleapex/apex/acc_direct_cmp_zxq.py b/paddleapex/apex/acc_direct_cmp_zxq.py new file mode 100644 index 0000000..1b87221 --- /dev/null +++ b/paddleapex/apex/acc_direct_cmp_zxq.py @@ -0,0 +1,181 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import csv +import argparse +import sys +import time +import paddle +import tqdm + +import paddle.distributed as dist + +from compare_utils.compare import Comparator +from compare_utils.compare_dependency import print_info_log, FileOpen + +current_time = time.strftime("%Y%m%d%H%M%S") +rank = dist.get_rank() + +RESULT_FILE_NAME = "accuracy_checking_result_" + current_time + "_" + str(rank) + ".csv" +DETAILS_FILE_NAME = "accuracy_checking_details_" + current_time + "_" + str(rank) + ".csv" + +tqdm_params = { + "smoothing": 0, # 平滑进度条的预计剩余时间,取值范围0到1 + "desc": "Processing", # 进度条前的描述文字 + "leave": True, # 迭代完成后保留进度条的显示 + "ncols": 75, # 进度条的固定宽度 + "mininterval": 0.1, # 更新进度条的最小间隔秒数 + "maxinterval": 1.0, # 更新进度条的最大间隔秒数 + "miniters": 1, # 更新进度条之间的最小迭代次数 + "ascii": None, # 根据环境自动使用ASCII或Unicode字符 + "unit": "it", # 迭代单位 + "unit_scale": True, # 自动根据单位缩放 + "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 + "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出格式 +} + + +def _compare_parser(parser): + parser.add_argument( + "-bench", + "--benchmark", + dest="bench_dir", + type=str, + help="The executed output api tensor path directory on BENCH", + required=True, + ) + parser.add_argument( + "-device", + "--device", + dest="device_dir", + type=str, + help="The executed output api tensor path directory on DEVICE", + required=True, + ) + parser.add_argument( + "-o", + "--output_path", + dest="out_path", + default="", + type=str, + help=" The result out path", + ) + + +def compare_command(args): + out_path = os.path.realpath(args.out_path) if args.out_path else "./" + os.makedirs(out_path, exist_ok=True) + result_csv_path = os.path.join(out_path, RESULT_FILE_NAME) + details_csv_path = os.path.join(out_path, DETAILS_FILE_NAME) + print_info_log(f"Compare task result will be saved in {result_csv_path}") + print_info_log(f"Compare task details will be saved in {details_csv_path}") + bench_dir = os.path.join(args.bench_dir, "./output") + device_dir = os.path.join(args.device_dir, "./output") + bench_back_dir = os.path.join(args.bench_dir, "./output_backward") + device_back_dir = os.path.join(args.device_dir, "./output_backward") + + compare_device_bench( + result_csv_path, + details_csv_path, + bench_dir, + device_dir, + out_path, + bench_back_dir, + device_back_dir, + ) + + +def compare_device_bench( + result_csv_path, + details_csv_path, + bench_dir, + device_dir, + out_path, + bench_grad_dir=None, + device_grad_dir=None, +): + Warning_list = [] + compare = Comparator(result_csv_path, details_csv_path, False) + with FileOpen(result_csv_path, "r") as file: + csv_reader = csv.reader(file) + next(csv_reader) + api_pt_files_bench = os.listdir(bench_dir) + api_pt_files_device = os.listdir(device_dir) + api_pt_files_all = list(set(api_pt_files_bench + api_pt_files_device)) + api_pt_files_all = sorted(api_pt_files_all) + + for i, api_file in enumerate(tqdm.tqdm(api_pt_files_all, **tqdm_params)): + if not i % dist.get_world_size() == dist.get_rank(): + continue + try: + print("=" * 100) + bench_pt_path = os.path.join(bench_dir, api_file) + device_pt_path = os.path.join(device_dir, api_file) + if os.path.exists(bench_pt_path) and os.path.exists(device_pt_path): + print(f"Loading {bench_pt_path} & {device_pt_path}") + bench_BF16_flag, bench_out_tensor = paddle.load(bench_pt_path) + device_BF16_flag, device_out_tensor = paddle.load(device_pt_path) + elif os.path.exists(bench_pt_path) or os.path.exists(device_pt_path): + msg = f"{api_file} One framework has No output!" + Warning_list.append(msg) + print(msg) + continue + else: + msg = f"{api_file} has no output, please refer to run_ut warning log info." + Warning_list.append(msg) + print(msg) + continue + + bench_grad_tensor_list, device_grad_tensor_list = None, None + if bench_grad_dir and device_grad_dir: + bench_grad_path = os.path.join(bench_grad_dir, api_file) + device_grad_path = os.path.join(device_grad_dir, api_file) + if os.path.exists(bench_grad_path) and os.path.exists(device_grad_path): + _, bench_grad_tensor_list = paddle.load(bench_grad_path) + _, device_grad_tensor_list = paddle.load(device_grad_path) + print(f"Loading {bench_grad_path} & {device_grad_path}") + elif os.path.exists(bench_grad_path) or os.path.exists( + device_grad_path + ): + msg = f"{api_file} One framework has No gard output!" + Warning_list.append(msg) + print(msg) + else: + msg = f"{api_file} has no grad output, please refer to run_ut warning log info." + Warning_list.append(msg) + print(msg) + + compare.compare_output( + api_file, + bench_out_tensor, + device_out_tensor, + bench_grad_tensor_list, + device_grad_tensor_list, + bench_BF16_flag, + device_BF16_flag, # BF16 convert flag + ) + except Exception as err: + print(err) + warning_log_pth = os.path.join(out_path, "./compare_warning.txt") + File = open(warning_log_pth, "w") + for item in Warning_list: + File.write(item + "\n") + File.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + _compare_parser(parser) + args = parser.parse_args(sys.argv[1:]) + compare_command(args) diff --git a/paddleapex/apex/run_distributed.py b/paddleapex/apex/run_distributed.py index 96d9650..4511002 100644 --- a/paddleapex/apex/run_distributed.py +++ b/paddleapex/apex/run_distributed.py @@ -269,6 +269,7 @@ def ut_case_parsing(forward_content, cfg): def create_input_args(api_info, backend, enforce_dtype=None, real_data_path=None): + print(real_data_path) args, kwargs, need_backward = gen_api_params(api_info, real_data_path) device_args = recursive_arg_to_device(args, backend, enforce_dtype) device_kwargs = { @@ -291,6 +292,7 @@ def create_dout(dout_info_dict, device_out, backend, enforce_dtype=None, real_da def run_forward(api_call_name, device_args, device_kwargs): api_call_stack = api_call_name.rsplit("*")[0] try: + # paddle.distributed.barrier() device_out = eval(api_call_stack)(*device_args, **device_kwargs) paddle.device.synchronize() return device_out @@ -347,7 +349,7 @@ def run_acc_case( api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None ): api_info_dict_copy = copy.deepcopy(api_info_dict) - if not dist.get_rank() == 0 or "distributed" not in api_call_name: + if not dist.get_rank() == 0 and "distributed" not in api_call_name: real_data_path = None device_args, device_kwargs, need_backward = create_input_args( api_info_dict_copy, backend, enforce_dtype, real_data_path @@ -359,6 +361,8 @@ def run_acc_case( # save_pth = os.path.join(out_path, "input_data", api_call_name) # paddle.save(x, save_pth) try: + # if "distributed" in api_call_name: + # paddle.distributed.barrier() device_out = run_forward(api_call_name, device_args, device_kwargs) except Exception as err: msg = "Run_forward Error: %s" % str(err) @@ -395,6 +399,8 @@ def run_acc_case( ) else: save_tensor(device_out, device_grad_out, out_path, api_call_name) + + # paddle.distributed.barrier() return @@ -610,14 +616,20 @@ def arg_parser(parser): print(cfg) dist.init_parallel_env() local_rank = dist.get_rank() - json_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step5/forward_rank" + str(local_rank) + "_all.json" - + # json_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step5/forward_rank" + str(local_rank) + "_all.json" + json_path_list = cfg.json_path.split(' ') + data_path_list = cfg.real_data.split(' ') + + print("json_path_list", json_path_list) + print("data_path_list", data_path_list) + + cfg.json_path = json_path_list[local_rank] + cfg.real_data = data_path_list[local_rank] cfg.backend = cfg.backend + ":" + str(local_rank) - cfg.json_path = json_path - data_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step0/" - cfg.real_data = data_path - cfg.real_data = None + print(cfg) + # data_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step0/" + # cfg.real_data = None forward_content = api_json_read(cfg.json_path) out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" diff --git a/paddleapex/apex/run_llama10b_xpu_32k.sh b/paddleapex/apex/run_llama10b_xpu_32k.sh new file mode 100755 index 0000000..b240ca0 --- /dev/null +++ b/paddleapex/apex/run_llama10b_xpu_32k.sh @@ -0,0 +1,73 @@ +#!/bin/bash +task_name_or_path="llama-10b" +export XPUAPI_DEBUG=0x1 +#export XPURT_DISPATCH_MODE=PROFILING +export XPU_FORCE_USERMODE_LAUNCH=1 +export PYTHONPATH=$PYTHONPATH:/zhouxiangquan/PaddleAPEX:/zhouxiangquan/PaddleNLP + +runtime_location=/workspace/so-runtime +bkcl_location=/workspace/so-bkcl +export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:$LD_LIBRARY_PATH + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 + +# BKCL +# export BKCL_DEBUG=1 +# Multi-computer RDMA +export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_FORCE_TREE=0 +export BKCL_TREE_THRESHOLD=0 +#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 +#export BKCL_SOCKET_IFNAME=eth0 +export BKCL_FORCE_L3_RDMA=0 +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +echo "bkcl version:" +strings ${bkcl_location}/libbkcl.so | grep COM +master_ip=$POD_0_IP +nnodes=$PADDLE_TRAINERS_NUM +echo "master ip:" +echo $master_ip + +export CUDA_DEVICE_MAX_CONNECTIONS=8 + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) +echo "PaddleNLP_DIR: "$PaddleNLP_DIR + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 + +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 +export XPU_PADDLE_FC_LOCAL_INT16=1 +export CUDA_DEVICE_ORDER=OAM_ID +export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 +export XPU_AUTO_BF16_TF32_RADIO=1 +export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 +export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 + + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ + -json \ + "/zhouxiangquan/llama10b/dump_info/rank0_step5/forward_rank0_all.json /zhouxiangquan/llama10b/dump_info/rank1_step5/forward_rank1_all.json /zhouxiangquan/llama10b/dump_info/rank2_step5/forward_rank2_all.json /zhouxiangquan/llama10b/dump_info/rank3_step5/forward_rank3_all.json /zhouxiangquan/llama10b/dump_info/rank4_step5/forward_rank4_all.json /zhouxiangquan/llama10b/dump_info/rank5_step5/forward_rank5_all.json /zhouxiangquan/llama10b/dump_info/rank6_step5/forward_rank6_all.json /zhouxiangquan/llama10b/dump_info/rank7_step5/forward_rank7_all.json" \ + -backend xpu \ + -real \ + "/zhouxiangquan/llama10b/dump_info/rank0_step0/ /zhouxiangquan/llama10b/dump_info/rank1_step0/ /zhouxiangquan/llama10b/dump_info/rank2_step0/ /zhouxiangquan/llama10b/dump_info/rank3_step0/ /zhouxiangquan/llama10b/dump_info/rank4_step0/ /zhouxiangquan/llama10b/dump_info/rank5_step0/ /zhouxiangquan/llama10b/dump_info/rank6_step0/ /zhouxiangquan/llama10b/dump_info/rank7_step0/" \ + -out /zhouxiangquan/llama10b/result_32k/ -mode acc diff --git a/paddleapex/apex/run_llama10b_xpu_distributed.sh b/paddleapex/apex/run_llama10b_xpu_distributed.sh new file mode 100755 index 0000000..ef16466 --- /dev/null +++ b/paddleapex/apex/run_llama10b_xpu_distributed.sh @@ -0,0 +1,80 @@ +#!/bin/bash +task_name_or_path="llama-10b" +export XPUAPI_DEBUG=0x1 +#export XPURT_DISPATCH_MODE=PROFILING +export XPU_FORCE_USERMODE_LAUNCH=1 +export PYTHONPATH=$PYTHONPATH:/zhouxiangquan/PaddleAPEX:/zhouxiangquan/PaddleNLP + +runtime_location=/workspace/so-runtime +bkcl_location=/workspace/so-bkcl +export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:$LD_LIBRARY_PATH + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 + +# BKCL +# export BKCL_DEBUG=1 +# Multi-computer RDMA +export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_FORCE_TREE=0 +export BKCL_TREE_THRESHOLD=0 +#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 +#export BKCL_SOCKET_IFNAME=eth0 +export BKCL_FORCE_L3_RDMA=0 +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +echo "bkcl version:" +strings ${bkcl_location}/libbkcl.so | grep COM +master_ip=$POD_0_IP +nnodes=$PADDLE_TRAINERS_NUM +echo "master ip:" +echo $master_ip + +export CUDA_DEVICE_MAX_CONNECTIONS=8 + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) +echo "PaddleNLP_DIR: "$PaddleNLP_DIR + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 + +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 +export XPU_PADDLE_FC_LOCAL_INT16=1 +export CUDA_DEVICE_ORDER=OAM_ID +export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 +export XPU_AUTO_BF16_TF32_RADIO=1 +export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 +export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 + +# "llama10b/rand0_distributed.json llama10b/rand1_distributed.json llama10b/rand2_distributed.json llama10b/rand3_distributed.json llama10b/rand4_distributed.json llama10b/rand5_distributed.json llama10b/rand6_distributed.json llama10b/rand7_distributed.json" + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_without_distributed.py \ + -json "llama10b/rand0_without_distributed.json" \ + -backend xpu \ + -real "/zhouxiangquan/llama10b/dump_info/rank0_step0/" \ + -out /zhouxiangquan/llama10b/result_32k/rank_0/ -mode acc + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ + -json \ + "llama10b/rand0_distributed.json llama10b/rand1_distributed.json llama10b/rand2_distributed.json llama10b/rand3_distributed.json llama10b/rand4_distributed.json llama10b/rand5_distributed.json llama10b/rand6_distributed.json llama10b/rand7_distributed.json" \ + -backend xpu \ + -real \ + "/zhouxiangquan/llama10b/dump_info/rank0_step0/ /zhouxiangquan/llama10b/dump_info/rank1_step0/ /zhouxiangquan/llama10b/dump_info/rank2_step0/ /zhouxiangquan/llama10b/dump_info/rank3_step0/ /zhouxiangquan/llama10b/dump_info/rank4_step0/ /zhouxiangquan/llama10b/dump_info/rank5_step0/ /zhouxiangquan/llama10b/dump_info/rank6_step0/ /zhouxiangquan/llama10b/dump_info/rank7_step0/" \ + -out /zhouxiangquan/llama10b/result_32k/ -mode acc diff --git a/paddleapex/apex/run_without_distributed.py b/paddleapex/apex/run_without_distributed.py new file mode 100644 index 0000000..46df68a --- /dev/null +++ b/paddleapex/apex/run_without_distributed.py @@ -0,0 +1,633 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddlenlp # if you wanna test nlp fusion operations +import argparse +import os +import shutil +import time +import copy +from tqdm import tqdm +import paddle +import paddle.distributed as dist +from paddle import framework +from paddle.base import core +from utils import ( + print_info_log, + gen_api_params, + api_json_read, + check_grad_list, + rand_like, + gen_args, + print_warn_log, +) + +type_map = { + "FP16": paddle.float16, + "FP32": paddle.float32, + "BF16": paddle.bfloat16, +} +Warning_list = [] + +distributed_op = ["paddle.distributed.broadcast_object_list", + "paddle.distributed.barrier", + "paddle.distributed.communication.stream.alltoall_single", + "paddle.distributed.communication.stream.broadcast", + "paddle.distributed.communication.stream.gather", + "paddle.distributed.communication.stream.recv", + "paddle.distributed.communication.stream.reduce", + "paddle.distributed.communication.stream.reduce_scatter", + "paddle.distributed.communication.stream.scatter", + "paddle.distributed.communication.stream.send", + "paddle.distributed.all_gather", + "paddle.distributed.all_gather_object", + "paddle.distributed.all_reduce", + "paddle.distributed.alltoall", + "paddle.distributed.alltoall_single", + "paddle.distributed.broadcast", + "paddle.distributed.communication.stream.all_gather", + "paddle.distributed.communication.stream.all_reduce", + "paddle.distributed.communication.stream.alltoall"] + +current_time = time.strftime("%Y%m%d%H%M%S") + +tqdm_params = { + "smoothing": 0, # 平滑进度条的预计剩余时间,取值范围0到1 + "desc": "Processing", # 进度条前的描述文字 + "leave": True, # 迭代完成后保留进度条的显示 + "ncols": 75, # 进度条的固定宽度 + "mininterval": 0.1, # 更新进度条的最小间隔秒数 + "maxinterval": 1.0, # 更新进度条的最大间隔秒数 + "miniters": 1, # 更新进度条之间的最小迭代次数 + "ascii": None, # 根据环境自动使用ASCII或Unicode字符 + "unit": "it", # 迭代单位 + "unit_scale": True, # 自动根据单位缩放 + "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 + "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 +} +PROFILE_RUN_TIMES = 1 + +def recursive_delete_arg(arg_in): + if isinstance(arg_in, (list, tuple)): + for item in arg_in: + recursive_delete_arg(item) + return + elif isinstance(arg_in, paddle.Tensor): + del arg_in + return +def get_shape(arg_in): + if isinstance(arg_in, (list, tuple)): + res = [] + for item in arg_in: + ret_value = get_shape(item) + res.append(ret_value) + return res + elif isinstance(arg_in, paddle.Tensor): + shape = arg_in.shape + return shape + +def merge_two_lists(lst1, lst2): + merged_list = [] + if lst1 is None and lst2 is not None: + merged_list = lst2 + elif lst1 is not None and lst2 is None: + merged_list = lst1 + elif lst1 is None and lst2 is None: + merged_list = [] + else: + for item in lst1: + if item is None: + continue + else: + merged_list.append(item) + for item in lst2: + if item is None: + continue + else: + merged_list.append(item) + return merged_list + +def convert_out2fp32(arg_in): + flag = False + if isinstance(arg_in, (list, tuple)): + res = [] + for item in arg_in: + ret_flag, ret_value = convert_out2fp32(item) + res.append(ret_value) + flag = flag or ret_flag + return flag, res + elif isinstance(arg_in, paddle.Tensor): + if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": + try: + arg_in = arg_in.cast("float32") + flag = True + except Exception as err: + print(arg_in) + return False, arg_in + return flag, arg_in + + +def recursive_arg_to_cpu(arg_in): + if isinstance(arg_in, (list, tuple)): + res = [] + for item in arg_in: + res.append(recursive_arg_to_cpu(item)) + return res + elif isinstance(arg_in, paddle.Tensor): + arg_in = arg_in.to( + "cpu" + ) # avoid using .cpu(), which will cause the gradient to be lost + return arg_in + + +def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): + if isinstance(arg_in, (list, tuple)): + return type(arg_in)( + recursive_arg_to_device(arg, backend, enforce_dtype) for arg in arg_in + ) + elif isinstance(arg_in, paddle.Tensor): + grad_status = arg_in.stop_gradient + with paddle.no_grad(): + if "gpu" in backend: + arg_in = arg_in.cuda() + if "cpu" in backend: + arg_in = arg_in.cpu() + if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": + arg_in = arg_in.cast("float32") + else: + arg_in = arg_in.to(backend) + if enforce_dtype and arg_in.dtype.name in ["BF16", "BFLOAT16", "FP16", "FP32"]: + arg_in = arg_in.cast(enforce_dtype) + arg_in.stop_gradient = grad_status + return arg_in + else: + return arg_in + + +def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): + if dtype_name == "": + bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) + fwd_output_dir = os.path.abspath(os.path.join(out_path, "output")) + else: + bwd_output_dir = os.path.abspath( + os.path.join(out_path, dtype_name, "output_backward") + ) + fwd_output_dir = os.path.abspath(os.path.join(out_path, dtype_name, "output")) + fwd_output_path = os.path.join(fwd_output_dir, api_call_name) + bwd_output_path = os.path.join(bwd_output_dir, api_call_name) + os.makedirs(fwd_output_dir, exist_ok=True) + os.makedirs(bwd_output_dir, exist_ok=True) + if isinstance(forward_res, (type(None), list, tuple, paddle.Tensor)): + try: + fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) + paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) + except Exception as err: + msg = "save_forward Error: %s" % str(err) + print_warn_log(msg) + return + else: + print(forward_res) + print_warn_log("forward_res not supported!") + if isinstance(backward_res, (type(None), list, tuple, paddle.Tensor)): + try: + bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) + paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) + except Exception as err: + msg = "save_bacward Error: %s" % str(err) + print_warn_log(msg) + return + else: + print(backward_res) + print_warn_log("bacward_res not supported!") + + +def evoke_related_test_func(test_mode): + func_method = [] + if "acc" in test_mode: + func_method.append(run_acc_case) + if "mem" in test_mode: + func_method.append(run_mem_case) + if "pro" in test_mode: + func_method.append(run_profile_case) + if test_mode == "all": + return [run_acc_case, run_mem_case, run_profile_case] + if len(func_method) == 0: + raise ValueError("test mode is not supported!") + return func_method + + +def ut_case_parsing(forward_content, cfg): + run_case_funcs = evoke_related_test_func(cfg.test_mode) + backend = cfg.backend + out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" + os.mkdir(out_path) if not os.path.exists(out_path) else None + multi_dtype_ut = cfg.multi_dtype_ut.split(",") if cfg.multi_dtype_ut else [] + debug_case = cfg.test_case_name.split(",") if cfg.test_case_name else [] + print("debug_case", debug_case) + debug_mode = False + paddle.set_device(cfg.backend) + if len(debug_case) > 0: + debug_mode = True + enforce_types = [type_map[item] for item in multi_dtype_ut] + for i, (api_call_name, api_info_dict) in enumerate( + tqdm(forward_content.items(), **tqdm_params) + ): + if not i % dist.get_world_size() == dist.get_rank(): + continue + print(api_call_name) + if debug_mode and api_call_name not in debug_case: + continue + if len(multi_dtype_ut) > 0: + for enforce_dtype in enforce_types: + print(api_call_name + "*" + enforce_dtype.name) + args = api_call_name, api_info_dict, backend, out_path + kwargs = {"enforce_dtype": enforce_dtype, "debug_case": debug_case, "real_data_path": cfg.real_data} + for run_case in run_case_funcs: + run_case(*args, **kwargs) + print("*" * 100) + else: + print(api_call_name) + args = api_call_name, api_info_dict, backend, out_path + kwargs = {"enforce_dtype": None, "debug_case": debug_case, "real_data_path": cfg.real_data} + if isinstance(run_case_funcs, list): + for run_case in run_case_funcs: + run_case(*args, **kwargs) + else: + run_case_funcs(*args, **kwargs) + print("*" * 100) + + +def create_input_args(api_info, backend, enforce_dtype=None, real_data_path=None): + print(real_data_path) + args, kwargs, need_backward = gen_api_params(api_info, real_data_path) + device_args = recursive_arg_to_device(args, backend, enforce_dtype) + device_kwargs = { + key: recursive_arg_to_device(value, backend, enforce_dtype) + for key, value in kwargs.items() + } + return device_args, device_kwargs, need_backward + + +def create_dout(dout_info_dict, device_out, backend, enforce_dtype=None, real_data_path=None): + if dout_info_dict[0] != "Failed": + dout, _ = gen_args(dout_info_dict, real_data_path) + else: + print("dout dump json is None!") + dout = rand_like(device_out) + dout = recursive_arg_to_device(dout, backend, enforce_dtype) + return dout + + +def run_forward(api_call_name, device_args, device_kwargs): + api_call_stack = api_call_name.rsplit("*")[0] + try: + # paddle.distributed.barrier() + device_out = eval(api_call_stack)(*device_args, **device_kwargs) + paddle.device.synchronize() + return device_out + + except Exception as err: + msg = f"Run API {api_call_name} Forward Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return None + + +def get_grad_tensor(args, kwargs): + device_grad_out = [] + for arg in args: + if isinstance(arg, paddle.Tensor): + device_grad_out.append(arg.grad) + if isinstance(arg, list): # op: concat/stack + for x in arg: + if isinstance(x, paddle.Tensor): + device_grad_out.append(x.grad) + for k, v in kwargs.items(): + if isinstance(v, paddle.Tensor): + device_grad_out.append(v.grad) + if isinstance(v, list): # op: concat/stack + for x in v: + if isinstance(x, paddle.Tensor): + device_grad_out.append(x.grad) + return device_grad_out + + +def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=None): + if need_backward: + try: + paddle.autograd.backward([device_out], dout) + device_grad_out = get_grad_tensor(args, kwargs) + device_grad_out = check_grad_list(device_grad_out) + if device_grad_out is None: + msg = f"{api_call_name} grad_list is None" + Warning_list.append(msg) + return device_grad_out + except Exception as err: + msg = f"Run API {api_call_name} backward Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return None + else: + msg = f"{api_call_name} has no tensor required grad, SKIP Backward" + print_warn_log(msg) + Warning_list.append(msg) + return None + + +def run_acc_case( + api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None +): + api_info_dict_copy = copy.deepcopy(api_info_dict) + # if not dist.get_rank() == 0 and "distributed" not in api_call_name: + # real_data_path = None + device_args, device_kwargs, need_backward = create_input_args( + api_info_dict_copy, backend, enforce_dtype, real_data_path + ) + print(f"Running {api_call_name} acc test!") + # if api_call_name in debug_case: + # x = [device_args, device_kwargs] + # out_path = os.path.realpath(out_path) if out_path else "./" + # save_pth = os.path.join(out_path, "input_data", api_call_name) + # paddle.save(x, save_pth) + try: + # if "distributed" in api_call_name: + # paddle.distributed.barrier() + device_out = run_forward(api_call_name, device_args, device_kwargs) + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return + + try: + device_grad_out = [] + if api_info_dict["dout_list"][0] != "Failed": + dout = create_dout( + api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path + ) + device_grad_out = run_backward( + api_call_name, device_out, dout, device_args, device_kwargs, need_backward + ) + else: + if api_call_name.rsplit("*")[0] in distributed_op: + print('this is distributed op: ', api_call_name) + device_out = device_args + device_grad_out = None + except Exception as err: + msg = "Run_backward Error: %s" % str(err) + print_warn_log(msg) + if enforce_dtype: + save_tensor( + device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name + ) + else: + save_tensor(device_out, device_grad_out, out_path, api_call_name) + return + if enforce_dtype: + save_tensor( + device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name + ) + else: + save_tensor(device_out, device_grad_out, out_path, api_call_name) + + # paddle.distributed.barrier() + return + + +def run_profile_case( + api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None +): + print(f"Running {api_call_name} profile test!") + api_info_dict_copy = copy.deepcopy(api_info_dict) + device_args, device_kwargs, need_backward = create_input_args( + api_info_dict_copy, backend, enforce_dtype, real_data_path + ) + # if api_call_name in debug_case: + # x = [device_args, device_kwargs] + # out_path = os.path.realpath(out_path) if out_path else "./" + # save_pth = os.path.join(out_path, "input_data", api_call_name) + # paddle.save(x, save_pth) + # device warmming up + try: + device_out = run_forward(api_call_name, device_args, device_kwargs) + if api_info_dict["dout_list"][0] != "Failed": + dout = create_dout( + api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path + ) + paddle.autograd.backward([device_out], dout) + else: + need_backward = False + except Exception as err: + msg = "Failed in device warming up: %s" % str(err) + print_warn_log(msg) + return + input_shape1 = get_shape(device_args) + input_shape2 = get_shape(device_kwargs) + input_shape_lst = merge_two_lists(input_shape1, input_shape2) + output_shape_lst = get_shape(device_out) + def profile_inner_loop_(): + try: + paddle.device.synchronize() + fwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + paddle.device.synchronize() + fwd_end_time = time.time() + fwd_time = fwd_end_time - fwd_start_time + fwd_time = fwd_time * 1000000 / float(PROFILE_RUN_TIMES) # fwd_time is in us + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return -1, -1 + try: + if not need_backward: + return fwd_time, -1 + paddle.device.synchronize() + bwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + paddle.autograd.backward([device_out], dout) + paddle.device.synchronize() + bwd_end_time = time.time() + bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second + bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us + bwd_time = bwd_time - fwd_time + except Exception as err: + msg = "Run_backward Error: %s" % str(err) + print_warn_log(msg) + return fwd_time, -1 + return fwd_time, bwd_time + + try: + fwd_time, bwd_time = profile_inner_loop_() + except Exception as err: + msg = f"Run {api_call_name} profile Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return + + if not enforce_dtype: + log_path = os.path.join(out_path, "profile_analyze.log") + else: + log_path = os.path.join(out_path, enforce_dtype.name, "profile_analyze.log") + + F = open(log_path, "a") + dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" + op_fwd = api_call_name + dtype + ".forward" + op_bwd = api_call_name + dtype + ".backward" + print_info_log(f"{op_fwd}:\t{fwd_time}") + print_info_log(f"{op_bwd}:\t{bwd_time}") + dtype = "\t" if not enforce_dtype else f"\t{enforce_dtype.name}" + msg_fwd = f"{api_call_name}.forward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tforward\t{fwd_time}" + msg_bwd = f"{api_call_name}.backward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tbackward\t{bwd_time}" + + F.write(msg_fwd + "\n") + F.write(msg_bwd + "\n") + F.close() + return + + +def run_mem_case( + api_call_name, + api_info_dict, + backend, + out_path, + enforce_dtype=None, + debug_case=[], # noqa + real_data_path=None +): + print(f"Running {api_call_name} mem test!") + + activation_cost = None + place = framework._current_expected_place_() + device_id = place.get_device_id() + before_run_mem = core.device_memory_stat_current_value("Allocated", device_id) + api_info_dict_copy = copy.deepcopy(api_info_dict) + device_args, device_kwargs, _ = create_input_args( + api_info_dict_copy, backend, enforce_dtype, real_data_path + ) + try: + device_out = run_forward(api_call_name, device_args, device_kwargs) + recursive_delete_arg(device_args) + for _, value in device_kwargs.items(): + recursive_delete_arg(value) + _ = recursive_arg_to_cpu(device_out) + after_run_mem = core.device_memory_stat_current_value("Allocated", device_id) + activation_cost = after_run_mem - before_run_mem + + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return + if not enforce_dtype: + log_path = os.path.join(out_path, "memory_analyze.log") + else: + log_path = os.path.join(out_path, enforce_dtype.name, "memory_analyze.log") + + os.mkdir(out_path) if not os.path.exists(out_path) else None + F = open(log_path, "a") + dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" + op_name = api_call_name + dtype + ".forward" + F.write(f"{op_name}:\t{str(activation_cost)}\n") + F.close() + return + + +def arg_parser(parser): + parser.add_argument( + "-json_file", + "--json_file", + dest="json_path", + default="", + type=str, + help="Dump json file path", + required=True, + ) + parser.add_argument( + "-out", + "--dump_path", + dest="out_path", + default="./paddle/", + type=str, + help=" The ut task result out path.", + required=False, + ) + parser.add_argument( + "-backend", + "--backend", + dest="backend", + default="gpu", + type=str, + help=" The running device DEVICE or BENCH.", + required=False, + ) + parser.add_argument( + "-dtype", + "--enforce-dtype", + dest="multi_dtype_ut", + default="", + type=str, + help="", + required=False, + ) + parser.add_argument( + "-real", + "--real_data", + dest="real_data", + default="", + type=str, + help="", + required=False, + ) + parser.add_argument( + "-op", + "--op_name", + dest="test_case_name", + default="", + type=str, + help="debug_op name", + required=False, + ) + parser.add_argument( + "-mode", + "--mode", + dest="test_mode", + default="all", + type=str, + help="debug_op name", + required=False, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + arg_parser(parser) + cfg = parser.parse_args() + print(cfg) + dist.init_parallel_env() + local_rank = dist.get_rank() + cfg.backend = cfg.backend + ":" + str(local_rank) + + forward_content = api_json_read(cfg.json_path) + out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" + if os.path.exists(out_path): + print_warn_log("The output path already exists and the file with the same name will be overwritten.") + cfg.out_path = out_path + ut_case_parsing(forward_content, cfg) + print_info_log("UT save completed") + # warning_log_pth = os.path.join(out_path, "./warning_log.txt") + # File = open(warning_log_pth, "w") + # for item in Warning_list: + # File.write(item + "\n") + # File.close() + paddle.device.synchronize() diff --git a/paddleapex/apex/split_distributed.py b/paddleapex/apex/split_distributed.py new file mode 100644 index 0000000..84ed37e --- /dev/null +++ b/paddleapex/apex/split_distributed.py @@ -0,0 +1,60 @@ +import json + +def split_json_by_keyword(input_file, output_file_with, output_file_without, keyword): + # 读取 JSON 文件 + with open(input_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + # 分别存储包含和不包含关键字的项 + with_keyword = {} + without_keyword = {} + + # 遍历每个项并分类 + for key, value in data.items(): + if keyword in key: + with_keyword[key] = value + else: + without_keyword[key]= value + + # 将结果写入不同的文件 + with open(output_file_with, 'w', encoding='utf-8') as f_with: + json.dump(with_keyword, f_with, ensure_ascii=False, indent=4) + + with open(output_file_without, 'w', encoding='utf-8') as f_without: + json.dump(without_keyword, f_without, ensure_ascii=False, indent=4) + + print(f"Items with '{keyword}' written to {output_file_with}") + print(f"Items without '{keyword}' written to {output_file_without}") + + +input_json_files = ["/zhouxiangquan/llama10b/dump_info/rank0_step5/forward_rank0_all.json", + "/zhouxiangquan/llama10b/dump_info/rank1_step5/forward_rank1_all.json", + "/zhouxiangquan/llama10b/dump_info/rank2_step5/forward_rank2_all.json", + "/zhouxiangquan/llama10b/dump_info/rank3_step5/forward_rank3_all.json", + "/zhouxiangquan/llama10b/dump_info/rank4_step5/forward_rank4_all.json", + "/zhouxiangquan/llama10b/dump_info/rank5_step5/forward_rank5_all.json", + "/zhouxiangquan/llama10b/dump_info/rank6_step5/forward_rank6_all.json", + "/zhouxiangquan/llama10b/dump_info/rank7_step5/forward_rank7_all.json"] + +output_with_keyword = ["llama10b/rand0_distributed.json", + "llama10b/rand1_distributed.json", + "llama10b/rand2_distributed.json", + "llama10b/rand3_distributed.json", + "llama10b/rand4_distributed.json", + "llama10b/rand5_distributed.json", + "llama10b/rand6_distributed.json", + "llama10b/rand7_distributed.json"] + +output_without_keyword = ["llama10b/rand0_without_distributed.json", + "llama10b/rand1_without_distributed.json", + "llama10b/rand2_without_distributed.json", + "llama10b/rand3_without_distributed.json", + "llama10b/rand4_without_distributed.json", + "llama10b/rand5_without_distributed.json", + "llama10b/rand6_without_distributed.json", + "llama10b/rand7_without_distributed.json"] + +keyword = 'distributed' +for i in range(len(input_json_files)): + split_json_by_keyword(input_json_files[i], output_with_keyword[i], output_without_keyword[i], keyword) + diff --git a/paddleapex/api_tracer/Dump.py b/paddleapex/api_tracer/Dump.py index 28787e4..cc6a013 100644 --- a/paddleapex/api_tracer/Dump.py +++ b/paddleapex/api_tracer/Dump.py @@ -70,6 +70,7 @@ def __init__(self, mode="real_data", Async_save=cfg.Async_dump): self.rank = None self.dump_api_dict = None self.dump_api_dict_half = None + self.dump_api_dict_distributed = None self.dump_api_dict_other = None self.Async_save = Async_save @@ -107,7 +108,7 @@ def dump_real_data(self, api_args, tensor, rank): Get Api_info dict, update self.dump_api_dict """ - def update_api_dict(self, api_info_dict, rank, is_half_precision = False): + def update_api_dict(self, api_info_dict, rank, is_half_precision = False, is_distributed = False): self.rank = rank if self.dump_api_dict is None: self.dump_api_dict = api_info_dict.copy() @@ -115,6 +116,11 @@ def update_api_dict(self, api_info_dict, rank, is_half_precision = False): self.dump_api_dict.update(api_info_dict) if cfg.split_dump: + if is_distributed: + if self.dump_api_dict_distributed is None: + self.dump_api_dict_distributed = api_info_dict.copy() + else: + self.dump_api_dict_distributed.update(api_info_dict) if is_half_precision: if self.dump_api_dict_half is None: self.dump_api_dict_half = api_info_dict.copy() @@ -148,11 +154,13 @@ def dump(self): write_json(directory, self.dump_api_dict, rank=self.rank, mode="forward", split_type="all") if cfg.split_dump: write_json(directory, self.dump_api_dict_half, rank=self.rank, mode="forward", split_type="half") + write_json(directory, self.dump_api_dict_distributed, rank=self.rank, mode="forward", split_type="distributed") write_json(directory, self.dump_api_dict_other, rank=self.rank, mode="forward", split_type="other") else: write_json(directory, self.dump_api_dict, rank=None, mode="forward", split_type="all") if cfg.split_dump: write_json(directory, self.dump_api_dict_half, rank=None, mode="forward", split_type="half") + write_json(directory, self.dump_api_dict_distributed, rank=None, mode="forward", split_type="distributed") write_json(directory, self.dump_api_dict_other, rank=None, mode="forward", split_type="other") diff --git a/paddleapex/api_tracer/api_info.py b/paddleapex/api_tracer/api_info.py index 6646b25..0b8a5dc 100644 --- a/paddleapex/api_tracer/api_info.py +++ b/paddleapex/api_tracer/api_info.py @@ -126,7 +126,7 @@ def update_real_data(self, inputs, kwargs): self.api_info_struct = { self.op_name: {"args": args_info_list, "kwargs": kwargs_info_dict, "dout_list": ["Failed"]} } - dump_util.update_api_dict(self.api_info_struct, self.rank, self.is_half_precision) + dump_util.update_api_dict(self.api_info_struct, self.rank, self.is_half_precision, "distributed" in self.op_name) def record_dout(self, grad_value): if grad_value is not None: diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index 292942a..d03816c 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -18,7 +18,14 @@ ignored_op: - paddle.split - paddle.Tensor.zero_ - paddle.stack - # distributed + - paddle.zeros + - paddle.zeros_like +# self-confidence, arrogance + # - paddle.Tensor.__add__ + # - paddle.Tensor.__mul__ + # - paddle.Tensor.__neg__ + # - paddle.Tensor.add_ +# distributed # - paddle.distributed.barrier # - paddle.distributed.broadcast_object_list # - paddle.distributed.communication.stream.alltoall_single @@ -518,8 +525,8 @@ target_op: - paddle.vsplit - paddle.where - paddle.where_ - - paddle.zeros - - paddle.zeros_like + # - paddle.zeros + # - paddle.zeros_like # - paddle.Tensor.T - paddle.Tensor.__add__ - paddle.Tensor.__and__ From 3c87b520ca79008302df0eafbed45343223693be Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Wed, 4 Dec 2024 16:53:27 +0800 Subject: [PATCH 06/22] support catch class, and run_acc --- paddleapex/apex/run_class.py | 649 ++++++++++++++++++ paddleapex/apex/run_llama10b_xpu_new.sh | 4 +- paddleapex/api_tracer/Dump.py | 12 + paddleapex/api_tracer/api_info.py | 9 +- paddleapex/api_tracer/configs/op_target.yaml | 26 +- .../api_tracer/configs/tool_config.yaml | 2 +- paddleapex/api_tracer/wrap_op/OPTemplate.py | 81 +++ .../api_tracer/wrap_op/get_target_op.py | 4 + paddleapex/api_tracer/wrap_op/hijack_tool.py | 26 +- 9 files changed, 804 insertions(+), 9 deletions(-) create mode 100644 paddleapex/apex/run_class.py diff --git a/paddleapex/apex/run_class.py b/paddleapex/apex/run_class.py new file mode 100644 index 0000000..9b38d65 --- /dev/null +++ b/paddleapex/apex/run_class.py @@ -0,0 +1,649 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddlenlp # if you wanna test nlp fusion operations +import argparse +import os +from importlib import import_module +import pickle +import shutil +import time +import copy +from tqdm import tqdm +import paddle +import paddle.distributed as dist +from paddle import framework +from paddle.base import core +from utils import ( + print_info_log, + gen_api_params, + api_json_read, + check_grad_list, + rand_like, + gen_args, + print_warn_log, +) + +type_map = { + "FP16": paddle.float16, + "FP32": paddle.float32, + "BF16": paddle.bfloat16, +} +Warning_list = [] + +current_time = time.strftime("%Y%m%d%H%M%S") + +tqdm_params = { + "smoothing": 0, # 平滑进度条的预计剩余时间,取值范围0到1 + "desc": "Processing", # 进度条前的描述文字 + "leave": True, # 迭代完成后保留进度条的显示 + "ncols": 75, # 进度条的固定宽度 + "mininterval": 0.1, # 更新进度条的最小间隔秒数 + "maxinterval": 1.0, # 更新进度条的最大间隔秒数 + "miniters": 1, # 更新进度条之间的最小迭代次数 + "ascii": None, # 根据环境自动使用ASCII或Unicode字符 + "unit": "it", # 迭代单位 + "unit_scale": True, # 自动根据单位缩放 + "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 + "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 +} +PROFILE_RUN_TIMES = 1 + +from paddle.distributed import fleet +strategy = fleet.DistributedStrategy() +strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 8, + "pp_degree": 1, + "sharding_degree": 1, +} +fleet.init(is_collective=True, strategy=strategy) +paddle.set_default_dtype("bfloat16") + +def recursive_delete_arg(arg_in): + if isinstance(arg_in, (list, tuple)): + for item in arg_in: + recursive_delete_arg(item) + return + elif isinstance(arg_in, paddle.Tensor): + del arg_in + return +def get_shape(arg_in): + if isinstance(arg_in, (list, tuple)): + res = [] + for item in arg_in: + ret_value = get_shape(item) + res.append(ret_value) + return res + elif isinstance(arg_in, paddle.Tensor): + shape = arg_in.shape + return shape + +def merge_two_lists(lst1, lst2): + merged_list = [] + if lst1 is None and lst2 is not None: + merged_list = lst2 + elif lst1 is not None and lst2 is None: + merged_list = lst1 + elif lst1 is None and lst2 is None: + merged_list = [] + else: + for item in lst1: + if item is None: + continue + else: + merged_list.append(item) + for item in lst2: + if item is None: + continue + else: + merged_list.append(item) + return merged_list + +def convert_out2fp32(arg_in): + flag = False + if isinstance(arg_in, (list, tuple)): + res = [] + for item in arg_in: + ret_flag, ret_value = convert_out2fp32(item) + res.append(ret_value) + flag = flag or ret_flag + return flag, res + elif isinstance(arg_in, paddle.Tensor): + if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": + try: + arg_in = arg_in.cast("float32") + flag = True + except Exception as err: + print(arg_in) + return False, arg_in + return flag, arg_in + + +def recursive_arg_to_cpu(arg_in): + if isinstance(arg_in, (list, tuple)): + res = [] + for item in arg_in: + res.append(recursive_arg_to_cpu(item)) + return res + elif isinstance(arg_in, paddle.Tensor): + arg_in = arg_in.to( + "cpu" + ) # avoid using .cpu(), which will cause the gradient to be lost + return arg_in + + +def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): + if isinstance(arg_in, (list, tuple)): + return type(arg_in)( + recursive_arg_to_device(arg, backend, enforce_dtype) for arg in arg_in + ) + elif isinstance(arg_in, paddle.Tensor): + grad_status = arg_in.stop_gradient + with paddle.no_grad(): + if "gpu" in backend: + arg_in = arg_in.cuda() + if "cpu" in backend: + arg_in = arg_in.cpu() + if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": + arg_in = arg_in.cast("float32") + else: + arg_in = arg_in.to(backend) + if enforce_dtype and arg_in.dtype.name in ["BF16", "BFLOAT16", "FP16", "FP32"]: + arg_in = arg_in.cast(enforce_dtype) + arg_in.stop_gradient = grad_status + return arg_in + else: + return arg_in + + +def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): + if dtype_name == "": + bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) + fwd_output_dir = os.path.abspath(os.path.join(out_path, "output")) + else: + bwd_output_dir = os.path.abspath( + os.path.join(out_path, dtype_name, "output_backward") + ) + fwd_output_dir = os.path.abspath(os.path.join(out_path, dtype_name, "output")) + fwd_output_path = os.path.join(fwd_output_dir, api_call_name) + bwd_output_path = os.path.join(bwd_output_dir, api_call_name) + os.makedirs(fwd_output_dir, exist_ok=True) + os.makedirs(bwd_output_dir, exist_ok=True) + if isinstance(forward_res, (type(None), list, tuple, paddle.Tensor)): + try: + fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) + paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) + except Exception as err: + msg = "save_forward Error: %s" % str(err) + print_warn_log(msg) + return + else: + print(forward_res) + print_warn_log("forward_res not supported!") + if isinstance(backward_res, (type(None), list, tuple, paddle.Tensor)): + try: + bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) + paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) + except Exception as err: + msg = "save_bacward Error: %s" % str(err) + print_warn_log(msg) + return + else: + print(backward_res) + print_warn_log("bacward_res not supported!") + + +def evoke_related_test_func(test_mode): + func_method = [] + if "acc" in test_mode: + func_method.append(run_acc_case) + if "mem" in test_mode: + func_method.append(run_mem_case) + if "pro" in test_mode: + func_method.append(run_profile_case) + if test_mode == "all": + return [run_acc_case, run_mem_case, run_profile_case] + if len(func_method) == 0: + raise ValueError("test mode is not supported!") + return func_method + + +def ut_case_parsing(forward_content, cfg): + run_case_funcs = evoke_related_test_func(cfg.test_mode) + backend = cfg.backend + out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" + os.mkdir(out_path) if not os.path.exists(out_path) else None + multi_dtype_ut = cfg.multi_dtype_ut.split(",") if cfg.multi_dtype_ut else [] + debug_case = cfg.test_case_name.split(",") if cfg.test_case_name else [] + print("debug_case", debug_case) + debug_mode = False + paddle.set_device(cfg.backend) + if len(debug_case) > 0: + debug_mode = True + enforce_types = [type_map[item] for item in multi_dtype_ut] + for i, (api_call_name, api_info_dict) in enumerate( + tqdm(forward_content.items(), **tqdm_params) + ): + print(api_call_name) + if debug_mode and api_call_name not in debug_case: + continue + if len(multi_dtype_ut) > 0: + for enforce_dtype in enforce_types: + print(api_call_name + "*" + enforce_dtype.name) + args = api_call_name, api_info_dict, backend, out_path + kwargs = {"enforce_dtype": enforce_dtype, "debug_case": debug_case, "real_data_path": cfg.real_data} + for run_case in run_case_funcs: + run_case(*args, **kwargs) + print("*" * 100) + else: + print(api_call_name) + args = api_call_name, api_info_dict, backend, out_path + kwargs = {"enforce_dtype": None, "debug_case": debug_case, "real_data_path": cfg.real_data} + if isinstance(run_case_funcs, list): + for run_case in run_case_funcs: + run_case(*args, **kwargs) + else: + run_case_funcs(*args, **kwargs) + print("*" * 100) + + +def create_input_args(api_info, backend, enforce_dtype=None, real_data_path=None): + print(real_data_path) + args, kwargs, need_backward = gen_api_params(api_info, real_data_path) + device_args = recursive_arg_to_device(args, backend, enforce_dtype) + device_kwargs = { + key: recursive_arg_to_device(value, backend, enforce_dtype) + for key, value in kwargs.items() + } + return device_args, device_kwargs, need_backward + + +def create_dout(dout_info_dict, device_out, backend, enforce_dtype=None, real_data_path=None): + if dout_info_dict[0] != "Failed": + dout, _ = gen_args(dout_info_dict, real_data_path) + else: + print("dout dump json is None!") + dout = rand_like(device_out) + dout = recursive_arg_to_device(dout, backend, enforce_dtype) + return dout + + +def run_forward(api_call_name, device_args, device_kwargs): + try: + # paddle.distributed.barrier() + device_out = eval(api_call_stack)(*device_args, **device_kwargs) + paddle.device.synchronize() + return device_out + + except Exception as err: + msg = f"Run API {api_call_name} Forward Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return None + + +def get_grad_tensor(args, kwargs): + device_grad_out = [] + for arg in args: + if isinstance(arg, paddle.Tensor): + device_grad_out.append(arg.grad) + if isinstance(arg, list): # op: concat/stack + for x in arg: + if isinstance(x, paddle.Tensor): + device_grad_out.append(x.grad) + for k, v in kwargs.items(): + if isinstance(v, paddle.Tensor): + device_grad_out.append(v.grad) + if isinstance(v, list): # op: concat/stack + for x in v: + if isinstance(x, paddle.Tensor): + device_grad_out.append(x.grad) + return device_grad_out + + +def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=None): + if need_backward: + try: + paddle.autograd.backward([device_out], dout) + device_grad_out = get_grad_tensor(args, kwargs) + device_grad_out = check_grad_list(device_grad_out) + if device_grad_out is None: + msg = f"{api_call_name} grad_list is None" + Warning_list.append(msg) + return device_grad_out + except Exception as err: + msg = f"Run API {api_call_name} backward Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return None + else: + msg = f"{api_call_name} has no tensor required grad, SKIP Backward" + print_warn_log(msg) + Warning_list.append(msg) + return None + + +def load_params(filename): + with open(filename, 'rb') as f: + return pickle.load(f) + + +def run_acc_case( + api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None +): + api_info_dict_copy = copy.deepcopy(api_info_dict) + device_args, device_kwargs, need_backward = create_input_args( + api_info_dict_copy, backend, enforce_dtype, real_data_path + ) + print(f"Running {api_call_name} acc test!") + if real_data_path is None: + print("do not support!!!!!!!!!!!!") + return + init_path = real_data_path + api_call_name + ".init_params" + print(init_path) + state_path = real_data_path + api_call_name + ".state_dict" + print(state_path) + init_para = load_params(init_path) + api_call_stack = api_call_name.rsplit("*")[0] + parent_package, class_n = api_call_stack.rsplit(".", maxsplit=1) + try: + MODULE = import_module(parent_package) + class_model = getattr(MODULE, class_n) + model = class_model(**init_para) + model.set_state_dict(paddle.load(state_path)) + device_out = model(*device_args, **device_kwargs) + paddle.device.synchronize() + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return + + try: + device_grad_out = [] + if api_info_dict["dout_list"][0] != "Failed": + dout = create_dout( + api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path + ) + device_grad_out = run_backward( + api_call_name, device_out, dout, device_args, device_kwargs, need_backward + ) + else: + if api_call_name.rsplit("*")[0] in distributed_op: + print('this is distributed op: ', api_call_name) + device_out = device_args + device_grad_out = None + except Exception as err: + msg = "Run_backward Error: %s" % str(err) + print_warn_log(msg) + if enforce_dtype: + save_tensor( + device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name + ) + else: + save_tensor(device_out, device_grad_out, out_path, api_call_name) + return + if enforce_dtype: + save_tensor( + device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name + ) + else: + save_tensor(device_out, device_grad_out, out_path, api_call_name) + + # paddle.distributed.barrier() + return + + +def run_profile_case( + api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None +): + print(f"Running {api_call_name} profile test!") + api_info_dict_copy = copy.deepcopy(api_info_dict) + device_args, device_kwargs, need_backward = create_input_args( + api_info_dict_copy, backend, enforce_dtype, real_data_path + ) + # if api_call_name in debug_case: + # x = [device_args, device_kwargs] + # out_path = os.path.realpath(out_path) if out_path else "./" + # save_pth = os.path.join(out_path, "input_data", api_call_name) + # paddle.save(x, save_pth) + # device warmming up + try: + device_out = run_forward(api_call_name, device_args, device_kwargs) + if api_info_dict["dout_list"][0] != "Failed": + dout = create_dout( + api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path + ) + paddle.autograd.backward([device_out], dout) + else: + need_backward = False + except Exception as err: + msg = "Failed in device warming up: %s" % str(err) + print_warn_log(msg) + return + input_shape1 = get_shape(device_args) + input_shape2 = get_shape(device_kwargs) + input_shape_lst = merge_two_lists(input_shape1, input_shape2) + output_shape_lst = get_shape(device_out) + def profile_inner_loop_(): + try: + paddle.device.synchronize() + fwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + paddle.device.synchronize() + fwd_end_time = time.time() + fwd_time = fwd_end_time - fwd_start_time + fwd_time = fwd_time * 1000000 / float(PROFILE_RUN_TIMES) # fwd_time is in us + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return -1, -1 + try: + if not need_backward: + return fwd_time, -1 + paddle.device.synchronize() + bwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + paddle.autograd.backward([device_out], dout) + paddle.device.synchronize() + bwd_end_time = time.time() + bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second + bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us + bwd_time = bwd_time - fwd_time + except Exception as err: + msg = "Run_backward Error: %s" % str(err) + print_warn_log(msg) + return fwd_time, -1 + return fwd_time, bwd_time + + try: + fwd_time, bwd_time = profile_inner_loop_() + except Exception as err: + msg = f"Run {api_call_name} profile Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return + + if not enforce_dtype: + log_path = os.path.join(out_path, "profile_analyze.log") + else: + log_path = os.path.join(out_path, enforce_dtype.name, "profile_analyze.log") + + F = open(log_path, "a") + dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" + op_fwd = api_call_name + dtype + ".forward" + op_bwd = api_call_name + dtype + ".backward" + print_info_log(f"{op_fwd}:\t{fwd_time}") + print_info_log(f"{op_bwd}:\t{bwd_time}") + dtype = "\t" if not enforce_dtype else f"\t{enforce_dtype.name}" + msg_fwd = f"{api_call_name}.forward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tforward\t{fwd_time}" + msg_bwd = f"{api_call_name}.backward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tbackward\t{bwd_time}" + + F.write(msg_fwd + "\n") + F.write(msg_bwd + "\n") + F.close() + return + + +def run_mem_case( + api_call_name, + api_info_dict, + backend, + out_path, + enforce_dtype=None, + debug_case=[], # noqa + real_data_path=None +): + print(f"Running {api_call_name} mem test!") + + activation_cost = None + place = framework._current_expected_place_() + device_id = place.get_device_id() + before_run_mem = core.device_memory_stat_current_value("Allocated", device_id) + api_info_dict_copy = copy.deepcopy(api_info_dict) + device_args, device_kwargs, _ = create_input_args( + api_info_dict_copy, backend, enforce_dtype, real_data_path + ) + try: + device_out = run_forward(api_call_name, device_args, device_kwargs) + recursive_delete_arg(device_args) + for _, value in device_kwargs.items(): + recursive_delete_arg(value) + _ = recursive_arg_to_cpu(device_out) + after_run_mem = core.device_memory_stat_current_value("Allocated", device_id) + activation_cost = after_run_mem - before_run_mem + + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return + if not enforce_dtype: + log_path = os.path.join(out_path, "memory_analyze.log") + else: + log_path = os.path.join(out_path, enforce_dtype.name, "memory_analyze.log") + + os.mkdir(out_path) if not os.path.exists(out_path) else None + F = open(log_path, "a") + dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" + op_name = api_call_name + dtype + ".forward" + F.write(f"{op_name}:\t{str(activation_cost)}\n") + F.close() + return + + +def arg_parser(parser): + parser.add_argument( + "-json_file", + "--json_file", + dest="json_path", + default="", + type=str, + help="Dump json file path", + required=True, + ) + parser.add_argument( + "-out", + "--dump_path", + dest="out_path", + default="./paddle/", + type=str, + help=" The ut task result out path.", + required=False, + ) + parser.add_argument( + "-backend", + "--backend", + dest="backend", + default="gpu", + type=str, + help=" The running device DEVICE or BENCH.", + required=False, + ) + parser.add_argument( + "-dtype", + "--enforce-dtype", + dest="multi_dtype_ut", + default="", + type=str, + help="", + required=False, + ) + parser.add_argument( + "-real", + "--real_data", + dest="real_data", + default="", + type=str, + help="", + required=False, + ) + parser.add_argument( + "-op", + "--op_name", + dest="test_case_name", + default="", + type=str, + help="debug_op name", + required=False, + ) + parser.add_argument( + "-mode", + "--mode", + dest="test_mode", + default="all", + type=str, + help="debug_op name", + required=False, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + arg_parser(parser) + cfg = parser.parse_args() + print(cfg) + dist.init_parallel_env() + local_rank = dist.get_rank() + # json_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step5/forward_rank" + str(local_rank) + "_all.json" + json_path_list = cfg.json_path.split(' ') + data_path_list = cfg.real_data.split(' ') + + print("json_path_list", json_path_list) + print("data_path_list", data_path_list) + + cfg.json_path = json_path_list[local_rank] + cfg.real_data = data_path_list[local_rank] + cfg.backend = cfg.backend + ":" + str(local_rank) + + print(cfg) + # data_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step0/" + # cfg.real_data = None + + forward_content = api_json_read(cfg.json_path) + out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" + if os.path.exists(out_path): + print_warn_log("The output path already exists and the file with the same name will be overwritten.") + out_path = out_path + "/rank_" + str(local_rank) + "/" + if not os.path.exists(out_path): + os.makedirs(out_path, exist_ok=True) + cfg.out_path = out_path + ut_case_parsing(forward_content, cfg) + print_info_log("UT save completed") + warning_log_pth = os.path.join(out_path, "./warning_log.txt") + File = open(warning_log_pth, "w") + for item in Warning_list: + File.write(item + "\n") + File.close() diff --git a/paddleapex/apex/run_llama10b_xpu_new.sh b/paddleapex/apex/run_llama10b_xpu_new.sh index 7e86f40..9bdec5a 100755 --- a/paddleapex/apex/run_llama10b_xpu_new.sh +++ b/paddleapex/apex/run_llama10b_xpu_new.sh @@ -64,9 +64,9 @@ export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 export PYTHONPATH=$PYTHONPATH:/workspace/APEX/PaddleNLP:/workspace/AA/PaddleAPEX -python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_class.py \ -json \ - "/workspace/APEX/PaddleNLP/dump_info/rank0_step5/forward_rank0_all.json /workspace/APEX/PaddleNLP/dump_info/rank1_step5/forward_rank1_all.json /workspace/APEX/PaddleNLP/dump_info/rank2_step5/forward_rank2_all.json /workspace/APEX/PaddleNLP/dump_info/rank3_step5/forward_rank3_all.json /workspace/APEX/PaddleNLP/dump_info/rank4_step5/forward_rank4_all.json /workspace/APEX/PaddleNLP/dump_info/rank5_step5/forward_rank5_all.json /workspace/APEX/PaddleNLP/dump_info/rank6_step5/forward_rank6_all.json /workspace/APEX/PaddleNLP/dump_info/rank7_step5/forward_rank7_all.json" \ + "/workspace/APEX/PaddleNLP/dump_info/rank0_step1/forward_rank0_all.json /workspace/APEX/PaddleNLP/dump_info/rank1_step1/forward_rank1_all.json /workspace/APEX/PaddleNLP/dump_info/rank2_step1/forward_rank2_all.json /workspace/APEX/PaddleNLP/dump_info/rank3_step1/forward_rank3_all.json /workspace/APEX/PaddleNLP/dump_info/rank4_step1/forward_rank4_all.json /workspace/APEX/PaddleNLP/dump_info/rank5_step1/forward_rank5_all.json /workspace/APEX/PaddleNLP/dump_info/rank6_step1/forward_rank6_all.json /workspace/APEX/PaddleNLP/dump_info/rank7_step1/forward_rank7_all.json" \ -backend xpu \ -real \ "/workspace/APEX/PaddleNLP/dump_info/rank0_step0/ /workspace/APEX/PaddleNLP/dump_info/rank1_step0/ /workspace/APEX/PaddleNLP/dump_info/rank2_step0/ /workspace/APEX/PaddleNLP/dump_info/rank3_step0/ /workspace/APEX/PaddleNLP/dump_info/rank4_step0/ /workspace/APEX/PaddleNLP/dump_info/rank5_step0/ /workspace/APEX/PaddleNLP/dump_info/rank6_step0/ /workspace/APEX/PaddleNLP/dump_info/rank7_step0/" \ diff --git a/paddleapex/api_tracer/Dump.py b/paddleapex/api_tracer/Dump.py index cc6a013..d95d716 100644 --- a/paddleapex/api_tracer/Dump.py +++ b/paddleapex/api_tracer/Dump.py @@ -104,6 +104,18 @@ def dump_real_data(self, api_args, tensor, rank): save_tensor(tensor, file_path) return f"{api_args}.pt" + # def dump_model(self, model, model_name): + # directory = os.path.join(self.data_route, f"rank{rank}_step{cfg.global_step}") + # file_path = os.path.join(directory, f"{model_name}") + # create_directory(directory) + # if os.path.exists(file_path): + # os.remove(file_path) + # print( + # f"File {file_path} already exists, tool has overwritten it automatically." + # ) + # paddle.jit.save(layer=model, file_path) + + """ Get Api_info dict, update self.dump_api_dict """ diff --git a/paddleapex/api_tracer/api_info.py b/paddleapex/api_tracer/api_info.py index 0b8a5dc..9407228 100644 --- a/paddleapex/api_tracer/api_info.py +++ b/paddleapex/api_tracer/api_info.py @@ -109,6 +109,7 @@ def __init__(self, mode): self.output_num = 0 self.dout_list = [] self.is_half_precision = False + self.is_distributed = False if cfg.profile_mode: self.tensor_analyzer_ = self.effi_analyze_tensor else: @@ -118,6 +119,8 @@ def update_APIInfo(self, op_name, rank): print("dump api: ", op_name) self.op_name = op_name self.rank = rank + if "distributed" in self.op_name or "modeling" in self.op_name: + self.is_distributed = True def update_real_data(self, inputs, kwargs): self.is_half_precision = False @@ -126,7 +129,7 @@ def update_real_data(self, inputs, kwargs): self.api_info_struct = { self.op_name: {"args": args_info_list, "kwargs": kwargs_info_dict, "dout_list": ["Failed"]} } - dump_util.update_api_dict(self.api_info_struct, self.rank, self.is_half_precision, "distributed" in self.op_name) + dump_util.update_api_dict(self.api_info_struct, self.rank, self.is_half_precision, self.is_distributed) def record_dout(self, grad_value): if grad_value is not None: @@ -199,7 +202,7 @@ def effi_analyze_tensor(self, arg): single_arg.update({"Min": min_}) single_arg.update({"Min_origin": min_}) single_arg.update({"stop_gradient": arg.stop_gradient}) - if self.mode == "real_data" and (dist.get_rank() == 0 or "distributed" in self.op_name): + if self.mode == "real_data" and (dist.get_rank() == 0 or self.is_distributed): api_args = self.op_name + "." + str(self.args_num) pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank) self.args_num += 1 @@ -225,7 +228,7 @@ def _analyze_tensor(self, arg): single_arg.update({"stop_gradient": arg.stop_gradient}) # if self.mode == "real_data": - if self.mode == "real_data" and (dist.get_rank() == 0 or "distributed" in self.op_name): + if self.mode == "real_data" and (dist.get_rank() == 0 or self.is_distributed): api_args = self.op_name + "." + str(self.args_num) pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank) self.args_num += 1 diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index d03816c..0cab0fe 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -1,3 +1,27 @@ +target_class: + # - paddlenlp.transformers.llama.modeling.LlamaMLP + - paddlenlp.transformers.llama.modeling.LlamaLMHead + # - paddlenlp.transformers.llama.modeling.LlamaRMSNorm + # - paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaLinearScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaNTKScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaDynamicNTKScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.Llama3RotaryEmbedding + # - paddlenlp.transformers.llama.modeling.MoEAllToAll + # - paddlenlp.transformers.llama.modeling.MoEGateCombine + # - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler + # - paddlenlp.transformers.llama.modeling.LlamaMoEGate + # - paddlenlp.transformers.llama.modeling.LlamaMoEMLP + # - paddlenlp.transformers.llama.modeling.LlamaAttention + # - paddlenlp.transformers.llama.modeling.LlamaDecoderLayer + # - paddlenlp.transformers.llama.modeling.LlamaPretrainedModel + # - paddlenlp.transformers.llama.modeling.LlamaModel + # - paddlenlp.transformers.llama.modeling.LlamaPretrainingCriterion + # - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss + # - paddlenlp.transformers.llama.modeling.LlamaForCausalLM + +target_op: + - paddle.Tensor.__add__ ignored_op: - paddle._C_ops.min - paddle._C_ops.max @@ -46,7 +70,7 @@ ignored_op: # - paddle.distributed.communication.stream.all_reduce # - paddle.distributed.communication.stream.alltoall -target_op: +#target_op: # distributed - paddle.distributed.barrier - paddle.distributed.broadcast_object_list diff --git a/paddleapex/api_tracer/configs/tool_config.yaml b/paddleapex/api_tracer/configs/tool_config.yaml index 6baa265..1eb25d1 100644 --- a/paddleapex/api_tracer/configs/tool_config.yaml +++ b/paddleapex/api_tracer/configs/tool_config.yaml @@ -18,7 +18,7 @@ dump_mode: "real_data" profile_mode: True # target_step is a list, dump api function will turn on at the specific step -target_step: [5] +target_step: [1] # Remove duplicate apis from dump_info and keep only one api in the same value range. # dump_unique: True diff --git a/paddleapex/api_tracer/wrap_op/OPTemplate.py b/paddleapex/api_tracer/wrap_op/OPTemplate.py index f9e8833..57a4972 100644 --- a/paddleapex/api_tracer/wrap_op/OPTemplate.py +++ b/paddleapex/api_tracer/wrap_op/OPTemplate.py @@ -15,6 +15,9 @@ import paddle from .. import config from ..api_info import API +from inspect import signature +import os +import pickle class HookOp: @@ -24,6 +27,84 @@ class HookOp: cfg = config.cfg +def hijack_init(self, *args, **kwargs): + print("args", args) + print("kwargs", kwargs) + self.__init__(*args, **kwargs) + + +# 获取初始化参数的方法 +def get_init_params(instance): + sig = signature(instance.__init__) + # 获取参数名称及默认值 + bound_args = sig.bind_partial() + bound_args.apply_defaults() + + # 提取参数值 + init_params = {} + for param in sig.parameters.values(): + if param.name != 'self': + init_params[param.name] = getattr(instance, param.name, param.default) + + return init_params + + +def save_init_params_and_weight(init_params, state_dict, name, rank): + data_route = cfg.dump_root_path + directory = os.path.join(data_route, f"rank{rank}_step{cfg.global_step}") + file_path = os.path.join(directory, f"{name}.init_params") + with open(file_path, 'wb') as f: + pickle.dump(init_params, f) + # paddle.save(init_params, file_path) + paddle.save(state_dict, os.path.join(directory, f"{name}.state_dict")) + + +def hijack_call(self, *args, **kwargs): + cls = self.__class__ + init_params = get_init_params(self) + # print("init_params", init_params) + + # print("hijack_call", self.__class__.__name__) + cfg.prefix_op_name_ = self.prefix_op_name_ + "*" + if self.__class__.__name__ not in cfg.Op_count: + cfg.Op_count[self.__class__.__name__] = 1 + cfg.prefix_op_name_ += "0" + else: + cfg.Op_count[self.__class__.__name__] += 1 + cfg.prefix_op_name_ += str(cfg.Op_count[self.__class__.__name__] - 1) + if cfg.dump_state: + api_recorder = API(cfg.dump_mode) + rank = dist.get_rank() + api_recorder.update_APIInfo(cfg.prefix_op_name_, rank) + api_recorder.update_real_data(args, kwargs) + save_init_params_and_weight(init_params, self.state_dict(), cfg.prefix_op_name_, rank) + output = self.forward(*args, **kwargs) + # print("api_info_struct !!!!!!", api_recorder.api_info_struct) + try: + if isinstance(output, paddle.Tensor): + if not output.stop_gradient: + output.register_hook(api_recorder.record_dout) + api_recorder.output_num = 1 + else: + api_recorder.record_dout(None) + if isinstance(output, (list, tuple)): + need_record = False + for item in output: + if isinstance(item, paddle.Tensor) and not item.stop_gradient: + api_recorder.output_num += 1 + need_record = True + item.register_hook(api_recorder.record_dout) + if not need_record: + api_recorder.record_dout(None) + except Exception as e: + print(self.__class__.__name__, " register hook failed. Due to :", e) + api_recorder.record_dout(None) + else: + output = self.forward(*args, **kwargs) + return output + + + class OPTemplate: def __init__(self, op_name): self.op_name_ = op_name diff --git a/paddleapex/api_tracer/wrap_op/get_target_op.py b/paddleapex/api_tracer/wrap_op/get_target_op.py index 9b5c2f0..2788e9d 100644 --- a/paddleapex/api_tracer/wrap_op/get_target_op.py +++ b/paddleapex/api_tracer/wrap_op/get_target_op.py @@ -25,6 +25,7 @@ def __init__(self, yaml_path): Ops = yaml.safe_load(f) self.target_op = Ops.get("target_op") self.ignored_op = Ops.get("ignored_op") + self.target_class = Ops.get("target_class") f.close() if self.ignored_op is None: self.ignored_op = [] @@ -48,3 +49,6 @@ def get_target_ops(self): self.api_to_catch -= set(["paddle.max", "paddle.min"]) self.check_api_stack() return self.api_to_catch + + def get_target_class(self): + return self.target_class diff --git a/paddleapex/api_tracer/wrap_op/hijack_tool.py b/paddleapex/api_tracer/wrap_op/hijack_tool.py index 8dad0f2..2430d5e 100644 --- a/paddleapex/api_tracer/wrap_op/hijack_tool.py +++ b/paddleapex/api_tracer/wrap_op/hijack_tool.py @@ -12,14 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. - from .. import config from ...utils import try_import from .get_target_op import GetTargetOP -from .OPTemplate import OPTemplate, HookOp +from .OPTemplate import OPTemplate, HookOp, hijack_call cfg = config.cfg +# from paddlenlp.test_model.test_model import SimpleModel +# from paddlenlp.transformers.llama.modeling import LlamaLMHead, LlamaMLP def wrapped_op(op_name): def op_template(*args, **kwargs): @@ -28,9 +29,18 @@ def op_template(*args, **kwargs): return op_template +# LlamaLMHead.prefix_op_name_ = "paddlenlp.transformers.llama.modeling.LlamaLMHead" +# LlamaLMHead.__call__ = hijack_call + +# LlamaMLP.prefix_op_name_ = "paddlenlp.transformers.llama.modeling.LlamaMLP" +# LlamaMLP.__call__ = hijack_call + + def hijack_api(): op = GetTargetOP(cfg.op_target_pth) target_op = op.get_target_ops() + target_class = op.get_target_class() + # target_op.add("paddlenlp.test_model.test_model.SimpleModel") for op_name in target_op: parent_package, method_name = op_name.rsplit(".", maxsplit=1) try: @@ -42,6 +52,18 @@ def hijack_api(): ) except Exception as err: print(op_name, str(err)) + + for class_in in target_class: + print("begin class --------------------------------", class_in) + parent_package, class_n = class_in.rsplit(".", maxsplit=1) + try: + class_name, model = try_import(parent_package) + model = getattr(model, class_n) + model.prefix_op_name_ = class_in + model.__call__ = hijack_call + # print("model---!!!!!!!!!!", model) + except Exception as err: + print(class_in, str(err)) for attr_name in dir(HookOp): if attr_name.startswith("wrap_"): From 3a55d69e3f808e1721a15144a925aa633a5134c2 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 9 Dec 2024 10:20:19 +0800 Subject: [PATCH 07/22] fix some bugs --- paddleapex/apex/acc_direct_cmp_zxq.py | 82 +++++++++++++-- paddleapex/apex/run_class.py | 14 ++- paddleapex/apex/split_distributed.py | 104 +++++++++++-------- paddleapex/api_tracer/configs/op_target.yaml | 43 ++++---- paddleapex/api_tracer/wrap_op/OPTemplate.py | 1 + 5 files changed, 161 insertions(+), 83 deletions(-) diff --git a/paddleapex/apex/acc_direct_cmp_zxq.py b/paddleapex/apex/acc_direct_cmp_zxq.py index 1b87221..1642767 100644 --- a/paddleapex/apex/acc_direct_cmp_zxq.py +++ b/paddleapex/apex/acc_direct_cmp_zxq.py @@ -114,10 +114,13 @@ def compare_device_bench( api_pt_files_device = os.listdir(device_dir) api_pt_files_all = list(set(api_pt_files_bench + api_pt_files_device)) api_pt_files_all = sorted(api_pt_files_all) - + + f = open(out_path + "compare_result.txt", 'a', encoding='utf-8') for i, api_file in enumerate(tqdm.tqdm(api_pt_files_all, **tqdm_params)): if not i % dist.get_world_size() == dist.get_rank(): continue + bench_out_tensor, device_out_tensor = None, None + bench_grad_tensor_list, device_grad_tensor_list = None, None try: print("=" * 100) bench_pt_path = os.path.join(bench_dir, api_file) @@ -137,7 +140,6 @@ def compare_device_bench( print(msg) continue - bench_grad_tensor_list, device_grad_tensor_list = None, None if bench_grad_dir and device_grad_dir: bench_grad_path = os.path.join(bench_grad_dir, api_file) device_grad_path = os.path.join(device_grad_dir, api_file) @@ -156,15 +158,19 @@ def compare_device_bench( Warning_list.append(msg) print(msg) - compare.compare_output( - api_file, - bench_out_tensor, - device_out_tensor, - bench_grad_tensor_list, - device_grad_tensor_list, - bench_BF16_flag, - device_BF16_flag, # BF16 convert flag - ) + print(api_file + " forward -------------") + compare_result(bench_out_tensor, device_out_tensor) + # print(api_file + " backward -------------") + # compare_result(bench_grad_tensor_list, device_grad_tensor_list) + #compare.compare_output( + # api_file, + # bench_out_tensor, + # device_out_tensor, + # bench_grad_tensor_list, + # device_grad_tensor_list, + # bench_BF16_flag, + # device_BF16_flag, # BF16 convert flag + #) except Exception as err: print(err) warning_log_pth = os.path.join(out_path, "./compare_warning.txt") @@ -173,6 +179,60 @@ def compare_device_bench( File.write(item + "\n") File.close() +def normalize_t(tensor0, tensor1): + min_val0, min_val1 = paddle.min(tensor0), paddle.min(tensor1) + max_val0, max_val1 = paddle.max(tensor0), paddle.max(tensor1) + min_val = min(min_val0, min_val1) + max_val = max(max_val0, max_val1) + if min_val == max_val: + return paddle.ones_like(tensor), paddle.ones_like(tensor) + return (tensor0 - min_val) / (max_val - min_val), (tensor1 - min_val) / (max_val - min_val) + # normalized_tensor_0_1 = (tensor0 - min_val) / (max_val - min_val) + # return normalized_tensor_0_1 + # normalized_tensor_neg1_1 = normalized_tensor_0_1 * 2 - 1 + # return normalized_tensor_neg1_1 + +def compare_result(bench_output, device_output): + if isinstance(bench_output, (list, tuple)): + for b_out_i, n_out_i in zip(bench_output, device_output): + compare_result(b_out_i, n_out_i) + if isinstance(bench_output, paddle.Tensor): + bench_output_o = bench_output.reshape([-1,]) + device_output_o = device_output.reshape([-1,]) + bench_output, device_output = normalize_t(bench_output_o, device_output_o) + # bench_output = paddle.cast(bench_output, "float") + # device_output = paddle.cast(device_output, "float") + diff = (bench_output - device_output).abs() + # abs_diff = ((bench_output - device_output) / bench_output).abs() + num = len(diff) + diff005 = (diff < 0.05).sum() / num + diff001 = (diff < 0.01).sum() / num + diff0005 = (diff < 0.005).sum() / num + diff0001 = (diff < 0.001).sum() / num + diff00005 = (diff < 0.0005).sum() / num + if diff0005 < 1: + print("diff is too large---------------------------- erorr Erorr ERORR----------------------------") + print("bench_output----------") + print(bench_output_o) + print("device_output---------") + print(device_output_o) + print("diff < 0.05: ", diff005.numpy()) + print("diff < 0.01: ", diff001.numpy()) + print("diff < 0.005: ", diff0005.numpy()) + print("diff < 0.001: ", diff0001.numpy()) + print("diff < 0.0005: ", diff00005.numpy()) + + # diff005 = (abs_diff < 0.05).sum() / num + # diff001 = (abs_diff < 0.01).sum() / num + # diff0005 = (abs_diff < 0.005).sum() / num + # diff0001 = (abs_diff < 0.001).sum() / num + # print("abs_diff < 0.05: ", diff005.numpy()) + # print("abs_diff < 0.01: ", diff001.numpy()) + # print("abs_diff < 0.005: ", diff0005.numpy()) + # print("abs_diff < 0.001: ", diff0001.numpy()) + + + if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/paddleapex/apex/run_class.py b/paddleapex/apex/run_class.py index 9b38d65..92db52b 100644 --- a/paddleapex/apex/run_class.py +++ b/paddleapex/apex/run_class.py @@ -169,6 +169,8 @@ def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): + if not dist.get_rank() == 0: + return if dtype_name == "": bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) fwd_output_dir = os.path.abspath(os.path.join(out_path, "output")) @@ -181,9 +183,10 @@ def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=" bwd_output_path = os.path.join(bwd_output_dir, api_call_name) os.makedirs(fwd_output_dir, exist_ok=True) os.makedirs(bwd_output_dir, exist_ok=True) + bwd_BF16_flag, fwd_BF16_flag = True, True if isinstance(forward_res, (type(None), list, tuple, paddle.Tensor)): try: - fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) + # fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) except Exception as err: msg = "save_forward Error: %s" % str(err) @@ -194,7 +197,7 @@ def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=" print_warn_log("forward_res not supported!") if isinstance(backward_res, (type(None), list, tuple, paddle.Tensor)): try: - bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) + # bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) except Exception as err: msg = "save_bacward Error: %s" % str(err) @@ -365,6 +368,7 @@ def run_acc_case( model.set_state_dict(paddle.load(state_path)) device_out = model(*device_args, **device_kwargs) paddle.device.synchronize() + # print(device_out) except Exception as err: msg = "Run_forward Error: %s" % str(err) print_warn_log(msg) @@ -380,9 +384,9 @@ def run_acc_case( api_call_name, device_out, dout, device_args, device_kwargs, need_backward ) else: - if api_call_name.rsplit("*")[0] in distributed_op: - print('this is distributed op: ', api_call_name) - device_out = device_args + # if api_call_name.rsplit("*")[0] in distributed_op: + # print('this is distributed op: ', api_call_name) + # device_out = device_args device_grad_out = None except Exception as err: msg = "Run_backward Error: %s" % str(err) diff --git a/paddleapex/apex/split_distributed.py b/paddleapex/apex/split_distributed.py index 84ed37e..82665e1 100644 --- a/paddleapex/apex/split_distributed.py +++ b/paddleapex/apex/split_distributed.py @@ -1,60 +1,76 @@ import json -def split_json_by_keyword(input_file, output_file_with, output_file_without, keyword): +def split_json_by_keyword(input_file, outfiles, keywords): # 读取 JSON 文件 with open(input_file, 'r', encoding='utf-8') as f: data = json.load(f) # 分别存储包含和不包含关键字的项 - with_keyword = {} - without_keyword = {} + out_list = [] + for i in range(len(keywords)): + out_data = {} + out_list.append(out_data) + without_keyword = {} + # 遍历每个项并分类 for key, value in data.items(): - if keyword in key: - with_keyword[key] = value - else: + have_key = False + for keyword, out_data in zip(keywords, out_list): + if keyword in key: + out_data[key] = value + have_key = True + if not have_key: without_keyword[key]= value - - # 将结果写入不同的文件 - with open(output_file_with, 'w', encoding='utf-8') as f_with: - json.dump(with_keyword, f_with, ensure_ascii=False, indent=4) - + + for i in range(len(keywords)): + output_file_with = outfiles[i] + with_keyword = out_list[i] + with open(output_file_with, 'w', encoding='utf-8') as f_with: + json.dump(with_keyword, f_with, ensure_ascii=False, indent=4) + + output_file_without = outfiles[-1] with open(output_file_without, 'w', encoding='utf-8') as f_without: json.dump(without_keyword, f_without, ensure_ascii=False, indent=4) - print(f"Items with '{keyword}' written to {output_file_with}") - print(f"Items without '{keyword}' written to {output_file_without}") - - -input_json_files = ["/zhouxiangquan/llama10b/dump_info/rank0_step5/forward_rank0_all.json", - "/zhouxiangquan/llama10b/dump_info/rank1_step5/forward_rank1_all.json", - "/zhouxiangquan/llama10b/dump_info/rank2_step5/forward_rank2_all.json", - "/zhouxiangquan/llama10b/dump_info/rank3_step5/forward_rank3_all.json", - "/zhouxiangquan/llama10b/dump_info/rank4_step5/forward_rank4_all.json", - "/zhouxiangquan/llama10b/dump_info/rank5_step5/forward_rank5_all.json", - "/zhouxiangquan/llama10b/dump_info/rank6_step5/forward_rank6_all.json", - "/zhouxiangquan/llama10b/dump_info/rank7_step5/forward_rank7_all.json"] - -output_with_keyword = ["llama10b/rand0_distributed.json", - "llama10b/rand1_distributed.json", - "llama10b/rand2_distributed.json", - "llama10b/rand3_distributed.json", - "llama10b/rand4_distributed.json", - "llama10b/rand5_distributed.json", - "llama10b/rand6_distributed.json", - "llama10b/rand7_distributed.json"] - -output_without_keyword = ["llama10b/rand0_without_distributed.json", - "llama10b/rand1_without_distributed.json", - "llama10b/rand2_without_distributed.json", - "llama10b/rand3_without_distributed.json", - "llama10b/rand4_without_distributed.json", - "llama10b/rand5_without_distributed.json", - "llama10b/rand6_without_distributed.json", - "llama10b/rand7_without_distributed.json"] - -keyword = 'distributed' + print("well done") + +input_json_files = ["/zhouxiangquan/llama10b/dump_info/rank0_step1/forward_rank0_all.json", + "/zhouxiangquan/llama10b/dump_info/rank1_step1/forward_rank1_all.json", + "/zhouxiangquan/llama10b/dump_info/rank2_step1/forward_rank2_all.json", + "/zhouxiangquan/llama10b/dump_info/rank3_step1/forward_rank3_all.json", + "/zhouxiangquan/llama10b/dump_info/rank4_step1/forward_rank4_all.json", + "/zhouxiangquan/llama10b/dump_info/rank5_step1/forward_rank5_all.json", + "/zhouxiangquan/llama10b/dump_info/rank6_step1/forward_rank6_all.json", + "/zhouxiangquan/llama10b/dump_info/rank7_step1/forward_rank7_all.json"] +distributed_keyword = ["/zhouxiangquan/llama10b/dump_info/rank0_step1/distributed.json", + "/zhouxiangquan/llama10b/dump_info/rank1_step1/distributed.json", + "/zhouxiangquan/llama10b/dump_info/rank2_step1/distributed.json", + "/zhouxiangquan/llama10b/dump_info/rank3_step1/distributed.json", + "/zhouxiangquan/llama10b/dump_info/rank4_step1/distributed.json", + "/zhouxiangquan/llama10b/dump_info/rank5_step1/distributed.json", + "/zhouxiangquan/llama10b/dump_info/rank6_step1/distributed.json", + "/zhouxiangquan/llama10b/dump_info/rank7_step1/distributed.json"] +model_keyword = ["/zhouxiangquan/llama10b/dump_info/rank0_step1/class.json", + "/zhouxiangquan/llama10b/dump_info/rank1_step1/class.json", + "/zhouxiangquan/llama10b/dump_info/rank2_step1/class.json", + "/zhouxiangquan/llama10b/dump_info/rank3_step1/class.json", + "/zhouxiangquan/llama10b/dump_info/rank4_step1/class.json", + "/zhouxiangquan/llama10b/dump_info/rank5_step1/class.json", + "/zhouxiangquan/llama10b/dump_info/rank6_step1/class.json", + "/zhouxiangquan/llama10b/dump_info/rank7_step1/class.json"] +common_keyword = ["/zhouxiangquan/llama10b/dump_info/rank0_step1/common.json", + "/zhouxiangquan/llama10b/dump_info/rank1_step1/common.json", + "/zhouxiangquan/llama10b/dump_info/rank2_step1/common.json", + "/zhouxiangquan/llama10b/dump_info/rank3_step1/common.json", + "/zhouxiangquan/llama10b/dump_info/rank4_step1/common.json", + "/zhouxiangquan/llama10b/dump_info/rank5_step1/common.json", + "/zhouxiangquan/llama10b/dump_info/rank6_step1/common.json", + "/zhouxiangquan/llama10b/dump_info/rank7_step1/common.json"] + + + +keyword = ['distributed', 'model'] for i in range(len(input_json_files)): - split_json_by_keyword(input_json_files[i], output_with_keyword[i], output_without_keyword[i], keyword) + split_json_by_keyword(input_json_files[i], [distributed_keyword[i], model_keyword[i], common_keyword[i]],keyword) diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index 0cab0fe..f7a7f53 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -1,27 +1,24 @@ target_class: - # - paddlenlp.transformers.llama.modeling.LlamaMLP + - paddlenlp.transformers.llama.modeling.LlamaMLP - paddlenlp.transformers.llama.modeling.LlamaLMHead - # - paddlenlp.transformers.llama.modeling.LlamaRMSNorm - # - paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding - # - paddlenlp.transformers.llama.modeling.LlamaLinearScalingRotaryEmbedding - # - paddlenlp.transformers.llama.modeling.LlamaNTKScalingRotaryEmbedding - # - paddlenlp.transformers.llama.modeling.LlamaDynamicNTKScalingRotaryEmbedding - # - paddlenlp.transformers.llama.modeling.Llama3RotaryEmbedding - # - paddlenlp.transformers.llama.modeling.MoEAllToAll - # - paddlenlp.transformers.llama.modeling.MoEGateCombine - # - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler - # - paddlenlp.transformers.llama.modeling.LlamaMoEGate - # - paddlenlp.transformers.llama.modeling.LlamaMoEMLP - # - paddlenlp.transformers.llama.modeling.LlamaAttention - # - paddlenlp.transformers.llama.modeling.LlamaDecoderLayer - # - paddlenlp.transformers.llama.modeling.LlamaPretrainedModel - # - paddlenlp.transformers.llama.modeling.LlamaModel - # - paddlenlp.transformers.llama.modeling.LlamaPretrainingCriterion - # - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss - # - paddlenlp.transformers.llama.modeling.LlamaForCausalLM - -target_op: - - paddle.Tensor.__add__ + - paddlenlp.transformers.llama.modeling.LlamaRMSNorm + - paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding + - paddlenlp.transformers.llama.modeling.LlamaLinearScalingRotaryEmbedding + - paddlenlp.transformers.llama.modeling.LlamaNTKScalingRotaryEmbedding + - paddlenlp.transformers.llama.modeling.LlamaDynamicNTKScalingRotaryEmbedding + - paddlenlp.transformers.llama.modeling.Llama3RotaryEmbedding + - paddlenlp.transformers.llama.modeling.MoEAllToAll + - paddlenlp.transformers.llama.modeling.MoEGateCombine + - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler + - paddlenlp.transformers.llama.modeling.LlamaMoEGate + - paddlenlp.transformers.llama.modeling.LlamaMoEMLP + - paddlenlp.transformers.llama.modeling.LlamaAttention + - paddlenlp.transformers.llama.modeling.LlamaDecoderLayer + - paddlenlp.transformers.llama.modeling.LlamaPretrainedModel + - paddlenlp.transformers.llama.modeling.LlamaModel + - paddlenlp.transformers.llama.modeling.LlamaPretrainingCriterion + - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss + - paddlenlp.transformers.llama.modeling.LlamaForCausalLM ignored_op: - paddle._C_ops.min - paddle._C_ops.max @@ -70,7 +67,7 @@ ignored_op: # - paddle.distributed.communication.stream.all_reduce # - paddle.distributed.communication.stream.alltoall -#target_op: +target_op: # distributed - paddle.distributed.barrier - paddle.distributed.broadcast_object_list diff --git a/paddleapex/api_tracer/wrap_op/OPTemplate.py b/paddleapex/api_tracer/wrap_op/OPTemplate.py index 57a4972..f9120ae 100644 --- a/paddleapex/api_tracer/wrap_op/OPTemplate.py +++ b/paddleapex/api_tracer/wrap_op/OPTemplate.py @@ -80,6 +80,7 @@ def hijack_call(self, *args, **kwargs): save_init_params_and_weight(init_params, self.state_dict(), cfg.prefix_op_name_, rank) output = self.forward(*args, **kwargs) # print("api_info_struct !!!!!!", api_recorder.api_info_struct) + # print(output) try: if isinstance(output, paddle.Tensor): if not output.stop_gradient: From 66e302b12ea346aaadeb420d3458c4db0d9b4777 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Fri, 13 Dec 2024 15:53:21 +0800 Subject: [PATCH 08/22] support class and common op --- paddleapex/apex/acc_direct_cmp_zxq.py | 63 ++++-- paddleapex/apex/lot_t.py | 45 ++++ paddleapex/apex/run_class.py | 70 +++--- paddleapex/apex/run_llama10b_xpu.sh | 0 paddleapex/apex/run_llama10b_xpu_32k.sh | 38 +++- .../apex/run_llama10b_xpu_distributed.sh | 80 ------- paddleapex/apex/run_llama10b_xpu_new.sh | 0 paddleapex/apex/run_llama20b_xpu.sh | 47 +++- paddleapex/apex/run_llama20b_xpu_pro.sh | 75 +++++++ paddleapex/apex/run_paddle.py | 210 ++++++++++++++++-- paddleapex/apex/run_without_distributed.py | 57 ++--- paddleapex/apex/split_distributed.py | 64 +++--- paddleapex/api_tracer/Dump.py | 1 + paddleapex/api_tracer/api_info.py | 22 +- paddleapex/api_tracer/configs/op_target.yaml | 101 +++++---- .../api_tracer/configs/tool_config.yaml | 2 +- paddleapex/api_tracer/wrap_op/OPTemplate.py | 3 +- paddleapex/api_tracer/wrap_op/hijack_tool.py | 14 +- 18 files changed, 619 insertions(+), 273 deletions(-) create mode 100644 paddleapex/apex/lot_t.py mode change 100755 => 100644 paddleapex/apex/run_llama10b_xpu.sh mode change 100755 => 100644 paddleapex/apex/run_llama10b_xpu_32k.sh delete mode 100755 paddleapex/apex/run_llama10b_xpu_distributed.sh mode change 100755 => 100644 paddleapex/apex/run_llama10b_xpu_new.sh mode change 100755 => 100644 paddleapex/apex/run_llama20b_xpu.sh create mode 100644 paddleapex/apex/run_llama20b_xpu_pro.sh diff --git a/paddleapex/apex/acc_direct_cmp_zxq.py b/paddleapex/apex/acc_direct_cmp_zxq.py index 1642767..c4753f2 100644 --- a/paddleapex/apex/acc_direct_cmp_zxq.py +++ b/paddleapex/apex/acc_direct_cmp_zxq.py @@ -18,6 +18,7 @@ import time import paddle import tqdm +import pandas as pd import paddle.distributed as dist @@ -115,7 +116,10 @@ def compare_device_bench( api_pt_files_all = list(set(api_pt_files_bench + api_pt_files_device)) api_pt_files_all = sorted(api_pt_files_all) - f = open(out_path + "compare_result.txt", 'a', encoding='utf-8') + # f = open(out_path + "compare_result.txt", 'a', encoding='utf-8') + errors = [] + errors_forward_info = [] + errors_bacward_info = [] for i, api_file in enumerate(tqdm.tqdm(api_pt_files_all, **tqdm_params)): if not i % dist.get_world_size() == dist.get_rank(): continue @@ -157,11 +161,23 @@ def compare_device_bench( msg = f"{api_file} has no grad output, please refer to run_ut warning log info." Warning_list.append(msg) print(msg) - + + error_i = [] print(api_file + " forward -------------") - compare_result(bench_out_tensor, device_out_tensor) - # print(api_file + " backward -------------") - # compare_result(bench_grad_tensor_list, device_grad_tensor_list) + compare_result(bench_out_tensor, device_out_tensor, error_i, api_file + " forward") + errors_forward_info = errors_forward_info + error_i + # for e in error_i: + # errors.append(float(e.split(" ")[0])) + # errors_info.append(api_file + " forward " + e) + + print(api_file + " backward -------------") + error_i = [] + compare_result(bench_grad_tensor_list, device_grad_tensor_list, error_i, api_file + " backward") + errors_bacward_info = errors_bacward_info + error_i + + # for e in error_i: + # errors.append(float(e.split(" ")[0])) + # errors_info.append(api_file + " backward " + e) #compare.compare_output( # api_file, # bench_out_tensor, @@ -173,6 +189,13 @@ def compare_device_bench( #) except Exception as err: print(err) + errors_bacward_info.sort(key=lambda x: x[1]) + errors_forward_info.sort(key=lambda x: x[1]) + df = pd.DataFrame(errors_bacward_info, columns=["operator_name", "error", "bench_info", "device_info"]) + df.to_csv("log/rank" + str(dist.get_rank()) + "_backward_output.csv", index=False) + df = pd.DataFrame(errors_forward_info, columns=["operator_name", "error", "bench_info", "device_info"]) + df.to_csv("log/rank" + str(dist.get_rank()) + "_forward_output.csv", index=False) + warning_log_pth = os.path.join(out_path, "./compare_warning.txt") File = open(warning_log_pth, "w") for item in Warning_list: @@ -184,25 +207,27 @@ def normalize_t(tensor0, tensor1): max_val0, max_val1 = paddle.max(tensor0), paddle.max(tensor1) min_val = min(min_val0, min_val1) max_val = max(max_val0, max_val1) + if len(tensor0) == 1: + return tensor0 / max_val, tensor1 / max_val if min_val == max_val: - return paddle.ones_like(tensor), paddle.ones_like(tensor) + return paddle.ones_like(tensor0), paddle.ones_like(tensor1) return (tensor0 - min_val) / (max_val - min_val), (tensor1 - min_val) / (max_val - min_val) # normalized_tensor_0_1 = (tensor0 - min_val) / (max_val - min_val) # return normalized_tensor_0_1 # normalized_tensor_neg1_1 = normalized_tensor_0_1 * 2 - 1 # return normalized_tensor_neg1_1 -def compare_result(bench_output, device_output): +def compare_result(bench_output, device_output, errors, name): if isinstance(bench_output, (list, tuple)): for b_out_i, n_out_i in zip(bench_output, device_output): - compare_result(b_out_i, n_out_i) + compare_result(b_out_i, n_out_i, errors, name) if isinstance(bench_output, paddle.Tensor): bench_output_o = bench_output.reshape([-1,]) device_output_o = device_output.reshape([-1,]) bench_output, device_output = normalize_t(bench_output_o, device_output_o) # bench_output = paddle.cast(bench_output, "float") # device_output = paddle.cast(device_output, "float") - diff = (bench_output - device_output).abs() + diff = paddle.cast((bench_output - device_output).abs(), "float") # abs_diff = ((bench_output - device_output) / bench_output).abs() num = len(diff) diff005 = (diff < 0.05).sum() / num @@ -210,16 +235,22 @@ def compare_result(bench_output, device_output): diff0005 = (diff < 0.005).sum() / num diff0001 = (diff < 0.001).sum() / num diff00005 = (diff < 0.0005).sum() / num - if diff0005 < 1: + if diff0001 < 1 or len(bench_output) == 1: + diff_value, diff_index = paddle.topk(diff, k=min(10, num)) + error_info = diff0001.numpy() + bench_n = paddle.cast(bench_output_o[diff_index], "float").numpy().tolist() + device_n = paddle.cast(device_output_o[diff_index], "float").numpy().tolist() + # error_info = error_info + " bench_value: " + str(bench_n) + " device_value: " + str(device_n) + errors.append((name, error_info, str(bench_n), str(device_n))) print("diff is too large---------------------------- erorr Erorr ERORR----------------------------") print("bench_output----------") - print(bench_output_o) + print(bench_output_o[diff_index]) print("device_output---------") - print(device_output_o) - print("diff < 0.05: ", diff005.numpy()) - print("diff < 0.01: ", diff001.numpy()) - print("diff < 0.005: ", diff0005.numpy()) - print("diff < 0.001: ", diff0001.numpy()) + print(device_output_o[diff_index]) + print("diff < 0.05: ", diff005.numpy()) + print("diff < 0.01: ", diff001.numpy()) + print("diff < 0.005: ", diff0005.numpy()) + print("diff < 0.001: ", diff0001.numpy()) print("diff < 0.0005: ", diff00005.numpy()) # diff005 = (abs_diff < 0.05).sum() / num diff --git a/paddleapex/apex/lot_t.py b/paddleapex/apex/lot_t.py new file mode 100644 index 0000000..78dc668 --- /dev/null +++ b/paddleapex/apex/lot_t.py @@ -0,0 +1,45 @@ +import pandas as pd +import glob +import os + +# 定义包含 CSV 文件的目录 +csv_dir = 'log/' + +# 使用 glob 模块查找目录中所有的 CSV 文件 +csv_files = glob.glob(os.path.join(csv_dir, '*forward*.csv')) +dataframes = [] +for file in csv_files: + df = pd.read_csv(file) + dataframes.append(df) + +combined_df = pd.concat(dataframes, axis=0, ignore_index=True) + +# 假设所有 CSV 的列名和顺序相同,按第二列排序 +# 使用 iloc[:, 1] 获取第二列的列名 +second_column_name = combined_df.columns[1] +# 按第二列排序 +sorted_df = combined_df.sort_values(by=second_column_name) +# 输出排序后的 DataFrame +print(sorted_df) +# 可选:将排序后的 DataFrame 保存为新的 CSV 文件 +sorted_df.to_csv('sorted_combined_forward.csv', index=False) + + +# 使用 glob 模块查找目录中所有的 CSV 文件 +csv_files = glob.glob(os.path.join(csv_dir, '*backward*.csv')) +dataframes = [] +for file in csv_files: + df = pd.read_csv(file) + dataframes.append(df) + +combined_df = pd.concat(dataframes, axis=0, ignore_index=True) + +# 假设所有 CSV 的列名和顺序相同,按第二列排序 +# 使用 iloc[:, 1] 获取第二列的列名 +second_column_name = combined_df.columns[1] +# 按第二列排序 +sorted_df = combined_df.sort_values(by=second_column_name) +# 输出排序后的 DataFrame +print(sorted_df) +# 可选:将排序后的 DataFrame 保存为新的 CSV 文件 +sorted_df.to_csv('sorted_combined_backward.csv', index=False) diff --git a/paddleapex/apex/run_class.py b/paddleapex/apex/run_class.py index 92db52b..bf73373 100644 --- a/paddleapex/apex/run_class.py +++ b/paddleapex/apex/run_class.py @@ -58,7 +58,11 @@ "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 } -PROFILE_RUN_TIMES = 1 + + +PROFILE_RUN_TIMES = 10 +PROFILE_WARM_TIMES = 10 + from paddle.distributed import fleet strategy = fleet.DistributedStrategy() @@ -417,35 +421,41 @@ def run_profile_case( device_args, device_kwargs, need_backward = create_input_args( api_info_dict_copy, backend, enforce_dtype, real_data_path ) + + init_path = real_data_path + api_call_name + ".init_params" + print(init_path) + state_path = real_data_path + api_call_name + ".state_dict" + print(state_path) + init_para = load_params(init_path) + api_call_stack = api_call_name.rsplit("*")[0] + parent_package, class_n = api_call_stack.rsplit(".", maxsplit=1) + MODULE = import_module(parent_package) + class_model = getattr(MODULE, class_n) + model = class_model(**init_para) + model.set_state_dict(paddle.load(state_path)) + # if api_call_name in debug_case: # x = [device_args, device_kwargs] # out_path = os.path.realpath(out_path) if out_path else "./" # save_pth = os.path.join(out_path, "input_data", api_call_name) # paddle.save(x, save_pth) # device warmming up - try: - device_out = run_forward(api_call_name, device_args, device_kwargs) - if api_info_dict["dout_list"][0] != "Failed": - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - paddle.autograd.backward([device_out], dout) - else: - need_backward = False - except Exception as err: - msg = "Failed in device warming up: %s" % str(err) - print_warn_log(msg) - return + if api_info_dict["dout_list"][0] == "Failed": + need_backward = False input_shape1 = get_shape(device_args) input_shape2 = get_shape(device_kwargs) input_shape_lst = merge_two_lists(input_shape1, input_shape2) - output_shape_lst = get_shape(device_out) + output_shape_lst = [] def profile_inner_loop_(): try: + paddle.device.synchronize() + for _ in range(PROFILE_WARM_TIMES): + device_out = model(*device_args, **device_kwargs) + output_shape_lst = get_shape(device_out) paddle.device.synchronize() fwd_start_time = time.time() for _ in range(PROFILE_RUN_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) + device_out = model(*device_args, **device_kwargs) paddle.device.synchronize() fwd_end_time = time.time() fwd_time = fwd_end_time - fwd_start_time @@ -453,28 +463,34 @@ def profile_inner_loop_(): except Exception as err: msg = "Run_forward Error: %s" % str(err) print_warn_log(msg) - return -1, -1 + return -1, -1, output_shape_lst try: if not need_backward: - return fwd_time, -1 + return fwd_time, -1, output_shape_lst + dout = create_dout(api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path) + print("create_dout------") paddle.device.synchronize() - bwd_start_time = time.time() + device_out_list = [] for _ in range(PROFILE_RUN_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) - paddle.autograd.backward([device_out], dout) + device_out_list.append(model(*device_args, **device_kwargs)) + paddle.device.synchronize() + print("Run_backward------") + bwd_start_time = time.time() + for i in range(PROFILE_RUN_TIMES): + paddle.autograd.backward([device_out_list[i]], dout) paddle.device.synchronize() bwd_end_time = time.time() bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us - bwd_time = bwd_time - fwd_time + # bwd_time = bwd_time - fwd_time except Exception as err: msg = "Run_backward Error: %s" % str(err) print_warn_log(msg) - return fwd_time, -1 - return fwd_time, bwd_time + return fwd_time, -1, output_shape_lst + return fwd_time, bwd_time, output_shape_lst try: - fwd_time, bwd_time = profile_inner_loop_() + fwd_time, bwd_time, output_shape_lst = profile_inner_loop_() except Exception as err: msg = f"Run {api_call_name} profile Error: %s" % str(err) print_warn_log(msg) @@ -482,9 +498,9 @@ def profile_inner_loop_(): return if not enforce_dtype: - log_path = os.path.join(out_path, "profile_analyze.log") + log_path = os.path.join(out_path, "profile_analyze" + str(dist.get_rank()) +".log") else: - log_path = os.path.join(out_path, enforce_dtype.name, "profile_analyze.log") + log_path = os.path.join(out_path, enforce_dtype.name, "profile_analyze" + str(dist.get_rank()) +".log") F = open(log_path, "a") dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" diff --git a/paddleapex/apex/run_llama10b_xpu.sh b/paddleapex/apex/run_llama10b_xpu.sh old mode 100755 new mode 100644 diff --git a/paddleapex/apex/run_llama10b_xpu_32k.sh b/paddleapex/apex/run_llama10b_xpu_32k.sh old mode 100755 new mode 100644 index b240ca0..84cf9c1 --- a/paddleapex/apex/run_llama10b_xpu_32k.sh +++ b/paddleapex/apex/run_llama10b_xpu_32k.sh @@ -1,6 +1,6 @@ #!/bin/bash task_name_or_path="llama-10b" -export XPUAPI_DEBUG=0x1 +#export XPUAPI_DEBUG=0x1 #export XPURT_DISPATCH_MODE=PROFILING export XPU_FORCE_USERMODE_LAUNCH=1 export PYTHONPATH=$PYTHONPATH:/zhouxiangquan/PaddleAPEX:/zhouxiangquan/PaddleNLP @@ -63,11 +63,33 @@ export XPU_AUTO_BF16_TF32_RADIO=1 export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" acc_direct_cmp_zxq.py --bench /ssd3/zhouxiangquan/llama10b/GPU/ --device /ssd3/zhouxiangquan/llama10b/result/rank_0/ -o /ssd3/zhouxiangquan/llama10b/ +python lot_t.py + +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_class.py \ +# -json \ +# "/zhouxiangquan/llama10b/dump_info/rank0_step1/class.json /zhouxiangquan/llama10b/dump_info/rank1_step1/class.json /zhouxiangquan/llama10b/dump_info/rank2_step1/class.json /zhouxiangquan/llama10b/dump_info/rank3_step1/class.json /zhouxiangquan/llama10b/dump_info/rank4_step1/class.json /zhouxiangquan/llama10b/dump_info/rank5_step1/class.json /zhouxiangquan/llama10b/dump_info/rank6_step1/class.json /zhouxiangquan/llama10b/dump_info/rank7_step1/class.json" \ +# -backend xpu \ +# -real \ +# "/zhouxiangquan/llama10b/dump_info/rank0_step0/ /zhouxiangquan/llama10b/dump_info/rank1_step0/ /zhouxiangquan/llama10b/dump_info/rank2_step0/ /zhouxiangquan/llama10b/dump_info/rank3_step0/ /zhouxiangquan/llama10b/dump_info/rank4_step0/ /zhouxiangquan/llama10b/dump_info/rank5_step0/ /zhouxiangquan/llama10b/dump_info/rank6_step0/ /zhouxiangquan/llama10b/dump_info/rank7_step0/" \ +# -out /zhouxiangquan/llama10b/result/ -mode acc +# +# +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ +# -json \ +# "/zhouxiangquan/llama10b/dump_info/rank0_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank1_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank2_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank3_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank4_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank5_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank6_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank7_step1/distributed.json" \ +# -backend xpu \ +# -real \ +# "/zhouxiangquan/llama10b/dump_info/rank0_step0/ /zhouxiangquan/llama10b/dump_info/rank1_step0/ /zhouxiangquan/llama10b/dump_info/rank2_step0/ /zhouxiangquan/llama10b/dump_info/rank3_step0/ /zhouxiangquan/llama10b/dump_info/rank4_step0/ /zhouxiangquan/llama10b/dump_info/rank5_step0/ /zhouxiangquan/llama10b/dump_info/rank6_step0/ /zhouxiangquan/llama10b/dump_info/rank7_step0/" \ +# -out /zhouxiangquan/llama10b/result/ -mode acc +# + +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_without_distributed.py \ +# -json \ +# "/zhouxiangquan/llama10b/dump_info/rank0_step1/common.json" \ +# -backend xpu \ +# -real \ +# "/zhouxiangquan/llama10b/dump_info/rank0_step0/" \ +# -out /zhouxiangquan/llama10b/result/rank_0/ -mode acc + -python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ - -json \ - "/zhouxiangquan/llama10b/dump_info/rank0_step5/forward_rank0_all.json /zhouxiangquan/llama10b/dump_info/rank1_step5/forward_rank1_all.json /zhouxiangquan/llama10b/dump_info/rank2_step5/forward_rank2_all.json /zhouxiangquan/llama10b/dump_info/rank3_step5/forward_rank3_all.json /zhouxiangquan/llama10b/dump_info/rank4_step5/forward_rank4_all.json /zhouxiangquan/llama10b/dump_info/rank5_step5/forward_rank5_all.json /zhouxiangquan/llama10b/dump_info/rank6_step5/forward_rank6_all.json /zhouxiangquan/llama10b/dump_info/rank7_step5/forward_rank7_all.json" \ - -backend xpu \ - -real \ - "/zhouxiangquan/llama10b/dump_info/rank0_step0/ /zhouxiangquan/llama10b/dump_info/rank1_step0/ /zhouxiangquan/llama10b/dump_info/rank2_step0/ /zhouxiangquan/llama10b/dump_info/rank3_step0/ /zhouxiangquan/llama10b/dump_info/rank4_step0/ /zhouxiangquan/llama10b/dump_info/rank5_step0/ /zhouxiangquan/llama10b/dump_info/rank6_step0/ /zhouxiangquan/llama10b/dump_info/rank7_step0/" \ - -out /zhouxiangquan/llama10b/result_32k/ -mode acc diff --git a/paddleapex/apex/run_llama10b_xpu_distributed.sh b/paddleapex/apex/run_llama10b_xpu_distributed.sh deleted file mode 100755 index ef16466..0000000 --- a/paddleapex/apex/run_llama10b_xpu_distributed.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash -task_name_or_path="llama-10b" -export XPUAPI_DEBUG=0x1 -#export XPURT_DISPATCH_MODE=PROFILING -export XPU_FORCE_USERMODE_LAUNCH=1 -export PYTHONPATH=$PYTHONPATH:/zhouxiangquan/PaddleAPEX:/zhouxiangquan/PaddleNLP - -runtime_location=/workspace/so-runtime -bkcl_location=/workspace/so-bkcl -export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:$LD_LIBRARY_PATH - -export XBLAS_FC_HBM_VERSION=40 - -# PaddlePaddle -export FLAGS_use_stride_kernel="0" -export XPU_CDNN_CLUSTER_PARALLEL=1 -export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 -export XPU_PADDLE_L3_SIZE0=1024 -export XPU_PADDLE_L3_SIZE1=1024 - -# BKCL -# export BKCL_DEBUG=1 -# Multi-computer RDMA -export BKCL_ENABLE_XDR=1 -export BKCL_RDMA_FORCE_TREE=0 -export BKCL_TREE_THRESHOLD=0 -#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 -#export BKCL_SOCKET_IFNAME=eth0 -export BKCL_FORCE_L3_RDMA=0 -export BKCL_USE_AR=1 -export BKCL_RING_OPT=1 -export BKCL_RING_HOSTID_USE_RANK=1 - -echo "bkcl version:" -strings ${bkcl_location}/libbkcl.so | grep COM -master_ip=$POD_0_IP -nnodes=$PADDLE_TRAINERS_NUM -echo "master ip:" -echo $master_ip - -export CUDA_DEVICE_MAX_CONNECTIONS=8 - -timestamp=$(date +%Y%m%d%H%M%S) -echo $timestamp - -PaddleNLP_DIR=$(pwd) -echo "PaddleNLP_DIR: "$PaddleNLP_DIR - -export USING_LAYERNORM=1 -export USING_GQA_NEOX=1 -export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 - -export BKCL_USE_AR=1 -export BKCL_RING_OPT=1 -export BKCL_RING_HOSTID_USE_RANK=1 - -export USING_LOGITS_PRINT=1 -export LOGITS_PRINT_INTERVAL=1 -export XPU_PADDLE_FC_LOCAL_INT16=1 -export CUDA_DEVICE_ORDER=OAM_ID -export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 -export XPU_AUTO_BF16_TF32_RADIO=1 -export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 -export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 - -# "llama10b/rand0_distributed.json llama10b/rand1_distributed.json llama10b/rand2_distributed.json llama10b/rand3_distributed.json llama10b/rand4_distributed.json llama10b/rand5_distributed.json llama10b/rand6_distributed.json llama10b/rand7_distributed.json" - -python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_without_distributed.py \ - -json "llama10b/rand0_without_distributed.json" \ - -backend xpu \ - -real "/zhouxiangquan/llama10b/dump_info/rank0_step0/" \ - -out /zhouxiangquan/llama10b/result_32k/rank_0/ -mode acc - -python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ - -json \ - "llama10b/rand0_distributed.json llama10b/rand1_distributed.json llama10b/rand2_distributed.json llama10b/rand3_distributed.json llama10b/rand4_distributed.json llama10b/rand5_distributed.json llama10b/rand6_distributed.json llama10b/rand7_distributed.json" \ - -backend xpu \ - -real \ - "/zhouxiangquan/llama10b/dump_info/rank0_step0/ /zhouxiangquan/llama10b/dump_info/rank1_step0/ /zhouxiangquan/llama10b/dump_info/rank2_step0/ /zhouxiangquan/llama10b/dump_info/rank3_step0/ /zhouxiangquan/llama10b/dump_info/rank4_step0/ /zhouxiangquan/llama10b/dump_info/rank5_step0/ /zhouxiangquan/llama10b/dump_info/rank6_step0/ /zhouxiangquan/llama10b/dump_info/rank7_step0/" \ - -out /zhouxiangquan/llama10b/result_32k/ -mode acc diff --git a/paddleapex/apex/run_llama10b_xpu_new.sh b/paddleapex/apex/run_llama10b_xpu_new.sh old mode 100755 new mode 100644 diff --git a/paddleapex/apex/run_llama20b_xpu.sh b/paddleapex/apex/run_llama20b_xpu.sh old mode 100755 new mode 100644 index 53d36b0..4b1c82a --- a/paddleapex/apex/run_llama20b_xpu.sh +++ b/paddleapex/apex/run_llama20b_xpu.sh @@ -1,16 +1,16 @@ #!/bin/bash task_name_or_path="llama-20b" -export XPUAPI_DEBUG=0x1 +#export XPUAPI_DEBUG=0x1 #export XPURT_DISPATCH_MODE=PROFILING export XPU_FORCE_USERMODE_LAUNCH=1 -export PYTHONPATH=$PYTHONPATH:/workspace/APEX/PaddleAPEX:/workspace/APEX/PaddleNLP +export PYTHONPATH=$PYTHONPATH:/ssd3/zhouxiangquan/PaddleAPEX:/ssd3/zhouxiangquan/PaddleNLP export XBLAS_FC_HBM_VERSION=40 # PaddlePaddle export FLAGS_use_stride_kernel="0" -#export XPU_CDNN_CLUSTER_PARALLEL=1 -#export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 export XPU_PADDLE_L3_SIZE0=1024 export XPU_PADDLE_L3_SIZE1=1024 export XPU_PADDLE_FC_LOCAL_INT16=1 @@ -27,6 +27,10 @@ export BKCL_FORCE_L3_RDMA=0 export CUDA_DEVICE_MAX_CONNECTIONS=8 export CUDA_DEVICE_ORDER=OAM_ID +export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 +export XPU_AUTO_BF16_TF32_RADIO=1 +export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 +export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 timestamp=$(date +%Y%m%d%H%M%S) echo $timestamp @@ -38,7 +42,34 @@ export USING_GQA_NEOX=1 export USING_LOGITS_PRINT=1 export LOGITS_PRINT_INTERVAL=1 -python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py -json ./ -backend xpu -out /workspace/APEX/llama20b/distributed/ -mode pro -# python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py -json ./ -backend xpu -out /workspace/APEX/llama20b/distributed/ -mode acc -# python run_paddle.py -json /workspace/APEX/llama20b/dump_info/rank0_step0/forward_rank0_all.json -backend xpu -out /workspace/APEX/llama20b/ -mode acc -# python run_paddle.py -real /workspace/APEX/scaled_dot_product_attention/dump_info/rank0_step0/ -json /workspace/APEX/scaled_dot_product_attention/dump_info/rank0_step5/forward_rank0_all.json -backend xpu -out /workspace/APEX/scaled_dot_product_attention/ -mode acc + +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" acc_direct_cmp_zxq.py --bench /ssd3/zhouxiangquan/llama20b/GPU/ --device /ssd3/zhouxiangquan/llama20b/result/rank_0/ -o /ssd3/zhouxiangquan/llama20b/ +#python lot_t.py + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_paddle.py \ + -json \ + "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/forward_rank0_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/forward_rank1_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/forward_rank2_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/forward_rank3_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/forward_rank4_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/forward_rank5_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/forward_rank6_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/forward_rank7_all.json" \ + -backend xpu \ + -real \ + "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/" \ + -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc + +# +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ +# -json \ +# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/distributed.json" \ +# -backend xpu \ +# -real \ +# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/" \ +# -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc +# +# +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_without_distributed.py \ +# -json \ +# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/common.json" \ +# -backend xpu \ +# -real \ +# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/" \ +# -out /ssd3/zhouxiangquan/llama20b/result/rank_0/ -mode acc +# + diff --git a/paddleapex/apex/run_llama20b_xpu_pro.sh b/paddleapex/apex/run_llama20b_xpu_pro.sh new file mode 100644 index 0000000..a1a75e4 --- /dev/null +++ b/paddleapex/apex/run_llama20b_xpu_pro.sh @@ -0,0 +1,75 @@ +#!/bin/bash +task_name_or_path="llama-20b" +#export XPUAPI_DEBUG=0x1 +#export XPURT_DISPATCH_MODE=PROFILING +export XPU_FORCE_USERMODE_LAUNCH=1 +export PYTHONPATH=$PYTHONPATH:/ssd3/zhouxiangquan/PaddleAPEX:/ssd3/zhouxiangquan/PaddleNLP + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 +export XPU_PADDLE_FC_LOCAL_INT16=1 + +# BKCL +# export BKCL_DEBUG=1 +# Multi-computer RDMA +#export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_FORCE_TREE=0 +export BKCL_TREE_THRESHOLD=0 +#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 +#export BKCL_SOCKET_IFNAME=eth0 +export BKCL_FORCE_L3_RDMA=0 + +export CUDA_DEVICE_MAX_CONNECTIONS=8 +export CUDA_DEVICE_ORDER=OAM_ID +export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 +export XPU_AUTO_BF16_TF32_RADIO=1 +export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 +export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 + + +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" acc_direct_cmp_zxq.py --bench /ssd3/zhouxiangquan/llama20b/GPU/ --device /ssd3/zhouxiangquan/llama20b/result/rank_0/ -o /ssd3/zhouxiangquan/llama20b/ +#python lot_t.py + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_class.py \ + -json \ + "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/class.json" \ + -backend xpu \ + -real \ + "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/" \ + -out /ssd3/zhouxiangquan/llama20b/result/ -mode pro +# +# +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ +# -json \ +# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/distributed.json" \ +# -backend xpu \ +# -real \ +# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/" \ +# -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc +# +# +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_without_distributed.py \ + -json \ + "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/common.json" \ + -backend xpu \ + -real \ + "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/" \ + -out /ssd3/zhouxiangquan/llama20b/result/rank_0/ -mode pro + + diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 8a8376c..52f638d 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -12,14 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -#import paddlenlp # if you wanna test nlp fusion operations +import paddlenlp # if you wanna test nlp fusion operations import argparse import os +from importlib import import_module import shutil import time import copy +import json +import yaml from tqdm import tqdm +import pickle import paddle +import paddle.distributed as dist +from paddle.distributed import fleet from paddle import framework from paddle.base import core from utils import ( @@ -37,6 +43,16 @@ "FP32": paddle.float32, "BF16": paddle.bfloat16, } + +yaml_path = "../api_tracer/configs/op_target.yaml" +f = open(yaml_path, "r") +Ops = yaml.safe_load(f) +target_op = Ops.get("target_op") +ignored_op = Ops.get("ignored_op") +target_class = Ops.get("target_class") +distributed_op = Ops.get("distributed_op") +f.close() + Warning_list = [] current_time = time.strftime("%Y%m%d%H%M%S") @@ -267,8 +283,8 @@ def run_forward(api_call_name, device_args, device_kwargs): api_call_stack = api_call_name.rsplit("*")[0] try: device_out = eval(api_call_stack)(*device_args, **device_kwargs) + paddle.device.synchronize() return device_out - except Exception as err: msg = f"Run API {api_call_name} Forward Error: %s" % str(err) print_warn_log(msg) @@ -317,26 +333,82 @@ def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=No return None +def load_params(filename): + with open(filename, 'rb') as f: + return pickle.load(f) + + +def create_model(api_call_name, real_data_path): + api_call_stack = api_call_name.rsplit("*")[0] + init_path = real_data_path + api_call_name + ".init_params" + state_path = real_data_path + api_call_name + ".state_dict" + init_para = load_params(init_path) + parent_package, class_n = api_call_stack.rsplit(".", maxsplit=1) + try: + MODULE = import_module(parent_package) + class_model = getattr(MODULE, class_n) + model = class_model(**init_para) + model.set_state_dict(paddle.load(state_path)) + return model + except Exception as err: + msg = "Create Model Error: %s" % str(err) + print_warn_log(msg) + return None + + +def run_model_forward(model, device_args, device_kwargs): + try: + # paddle.distributed.barrier() + device_out = model(*device_args, **device_kwargs) + paddle.device.synchronize() + return device_out + except Exception as err: + msg = f"Run Forward Error: %s" % str(err) + print_warn_log(msg) + Warning_list.append(msg) + return None + + def run_acc_case( api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None ): + api_call_stack = api_call_name.rsplit("*")[0] api_info_dict_copy = copy.deepcopy(api_info_dict) device_args, device_kwargs, need_backward = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) + api_info_dict_copy, backend, enforce_dtype, real_data_path) print(f"Running {api_call_name} acc test!") - if api_call_name in debug_case: + if api_call_name in debug_case: x = [device_args, device_kwargs] out_path = os.path.realpath(out_path) if out_path else "./" save_pth = os.path.join(out_path, "input_data", api_call_name) paddle.save(x, save_pth) - try: - device_out = run_forward(api_call_name, device_args, device_kwargs) - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return + # if this case is class + if api_call_stack in target_class: + if real_data_path == None: + msg = (f"Running {api_call_name} acc Failed! Don't support run class without real_data_path!") + print_warn_log(msg) + Warning_list.append(msg) + return + else: + try: + model = create_model(api_call_name, real_data_path) + device_out = run_model_forward(model, device_args, device_kwargs) + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return + else: + try: + device_out = run_forward(api_call_name, device_args, device_kwargs) + if api_call_stack in distributed_op and device_out is None: + print('this is distributed op: ', api_call_name) + device_out = device_args + except Exception as err: + msg = "Run_forward Error: %s" % str(err) + print_warn_log(msg) + return + try: device_grad_out = [] if api_info_dict["dout_list"][0] != "Failed": @@ -570,17 +642,129 @@ def arg_parser(parser): help="debug_op name", required=False, ) + parser.add_argument( + "-class", + "--class_op", + dest="test_class", + default=False, + type=bool, + help="test class op", + required=False, + ) + parser.add_argument( + "-class_type", + "--class_type", + dest="class_default_type", + default="bfloat16", + type=str, + help="the default type of class", + required=False, + ) + parser.add_argument( + "-dp", + "--dp_degree", + dest="dp_degree", + default=1, + type=int, + help="dp_degree", + required=False, + ) + parser.add_argument( + "-mp", + "--mp_degree", + dest="mp_degree", + default=8, + type=int, + help="mp_degree", + required=False, + ) + parser.add_argument( + "-pp", + "--pp_degree", + dest="pp_degree", + default=1, + type=int, + help="pp_degree", + required=False, + ) + parser.add_argument( + "-sd", + "--sharding_degree", + dest="sharding_degree", + default=1, + type=int, + help="sharding_degree", + required=False, + ) + parser.add_argument( + "-dist", + "--distributed_op", + dest="distributed_op", + default=False, + type=bool, + help="distributed_mode", + required=False, + ) + +def check_json(json_list): + data_list = [] + for json_file in json_list: + f = open(json_file, 'r', encoding='utf-8') + data = json.load(f) + keys = [] + for key, _ in data.items(): + keys.append(key) + data_list.append(keys) + f.close() + + for i in range(len(data_list[0])): + key = data_list[0][i] + for j in range(len(data_list) - 1): + key_j = data_list[j + 1][i] + if key != key_j: + print("op: rand0: " + str(key) + " rank" + str(j + 1) + ": " + str(key_j)) + return False + return True if __name__ == "__main__": parser = argparse.ArgumentParser() arg_parser(parser) cfg = parser.parse_args() - print(cfg) - forward_content = api_json_read(cfg.json_path) + out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" if os.path.exists(out_path): print_warn_log("The output path already exists and the file with the same name will be overwritten.") + + if cfg.test_class: + dist.init_parallel_env() + local_rank = dist.get_rank() + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": cfg.dp_degree, + "mp_degree": cfg.mp_degree, + "pp_degree": cfg.pp_degree, + "sharding_degree": cfg.sharding_degree} + fleet.init(is_collective=True, strategy=strategy) + paddle.set_default_dtype(cfg.class_type) + + json_path_list = cfg.json_path.split(' ') + data_path_list = cfg.real_data.split(' ') + + if not check_json(json_path_list): + raise Exception("Check json faile!!!") + else: + cfg.json_path = json_path_list[local_rank] + cfg.real_data = data_path_list[local_rank] + cfg.backend = cfg.backend + ":" + str(local_rank) + print(cfg) + + out_path = out_path + "/rank_" + str(local_rank) + "/" + if not os.path.exists(out_path): + os.makedirs(out_path, exist_ok=True) + cfg.out_path = out_path + + forward_content = api_json_read(cfg.json_path) ut_case_parsing(forward_content, cfg) print_info_log("UT save completed") warning_log_pth = os.path.join(out_path, "./warning_log.txt") diff --git a/paddleapex/apex/run_without_distributed.py b/paddleapex/apex/run_without_distributed.py index 46df68a..04866bb 100644 --- a/paddleapex/apex/run_without_distributed.py +++ b/paddleapex/apex/run_without_distributed.py @@ -76,7 +76,11 @@ "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 } -PROFILE_RUN_TIMES = 1 + + +PROFILE_RUN_TIMES = 100 +PROFILE_WARM_TIMES = 100 + def recursive_delete_arg(arg_in): if isinstance(arg_in, (list, tuple)): @@ -418,29 +422,22 @@ def run_profile_case( # save_pth = os.path.join(out_path, "input_data", api_call_name) # paddle.save(x, save_pth) # device warmming up - try: - device_out = run_forward(api_call_name, device_args, device_kwargs) - if api_info_dict["dout_list"][0] != "Failed": - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - paddle.autograd.backward([device_out], dout) - else: - need_backward = False - except Exception as err: - msg = "Failed in device warming up: %s" % str(err) - print_warn_log(msg) - return + if api_info_dict["dout_list"][0] == "Failed": + need_backward = False input_shape1 = get_shape(device_args) input_shape2 = get_shape(device_kwargs) input_shape_lst = merge_two_lists(input_shape1, input_shape2) - output_shape_lst = get_shape(device_out) + output_shape_lst = [] def profile_inner_loop_(): try: + paddle.device.synchronize() + for _ in range(PROFILE_WARM_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + output_shape_lst = get_shape(device_out) paddle.device.synchronize() fwd_start_time = time.time() for _ in range(PROFILE_RUN_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) + run_forward(api_call_name, device_args, device_kwargs) paddle.device.synchronize() fwd_end_time = time.time() fwd_time = fwd_end_time - fwd_start_time @@ -448,28 +445,34 @@ def profile_inner_loop_(): except Exception as err: msg = "Run_forward Error: %s" % str(err) print_warn_log(msg) - return -1, -1 + return -1, -1, output_shape_lst try: if not need_backward: - return fwd_time, -1 + return fwd_time, -1, output_shape_lst + dout = create_dout(api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path) + print("create_dout------") paddle.device.synchronize() - bwd_start_time = time.time() + device_out_list = [] for _ in range(PROFILE_RUN_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) - paddle.autograd.backward([device_out], dout) + device_out_list.append(run_forward(api_call_name, device_args, device_kwargs)) + paddle.device.synchronize() + print("Run_backward------") + bwd_start_time = time.time() + for i in range(PROFILE_RUN_TIMES): + paddle.autograd.backward([device_out_list[i]], dout) paddle.device.synchronize() bwd_end_time = time.time() bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us - bwd_time = bwd_time - fwd_time + # bwd_time = bwd_time - fwd_time except Exception as err: msg = "Run_backward Error: %s" % str(err) print_warn_log(msg) - return fwd_time, -1 - return fwd_time, bwd_time + return fwd_time, -1, output_shape_lst + return fwd_time, bwd_time, output_shape_lst try: - fwd_time, bwd_time = profile_inner_loop_() + fwd_time, bwd_time, output_shape_lst = profile_inner_loop_() except Exception as err: msg = f"Run {api_call_name} profile Error: %s" % str(err) print_warn_log(msg) @@ -477,9 +480,9 @@ def profile_inner_loop_(): return if not enforce_dtype: - log_path = os.path.join(out_path, "profile_analyze.log") + log_path = os.path.join(out_path, "profile_analyze" + str(dist.get_rank()) +".log") else: - log_path = os.path.join(out_path, enforce_dtype.name, "profile_analyze.log") + log_path = os.path.join(out_path, enforce_dtype.name, "profile_analyze" + str(dist.get_rank()) +".log") F = open(log_path, "a") dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" diff --git a/paddleapex/apex/split_distributed.py b/paddleapex/apex/split_distributed.py index 82665e1..1c0841c 100644 --- a/paddleapex/apex/split_distributed.py +++ b/paddleapex/apex/split_distributed.py @@ -35,38 +35,38 @@ def split_json_by_keyword(input_file, outfiles, keywords): print("well done") -input_json_files = ["/zhouxiangquan/llama10b/dump_info/rank0_step1/forward_rank0_all.json", - "/zhouxiangquan/llama10b/dump_info/rank1_step1/forward_rank1_all.json", - "/zhouxiangquan/llama10b/dump_info/rank2_step1/forward_rank2_all.json", - "/zhouxiangquan/llama10b/dump_info/rank3_step1/forward_rank3_all.json", - "/zhouxiangquan/llama10b/dump_info/rank4_step1/forward_rank4_all.json", - "/zhouxiangquan/llama10b/dump_info/rank5_step1/forward_rank5_all.json", - "/zhouxiangquan/llama10b/dump_info/rank6_step1/forward_rank6_all.json", - "/zhouxiangquan/llama10b/dump_info/rank7_step1/forward_rank7_all.json"] -distributed_keyword = ["/zhouxiangquan/llama10b/dump_info/rank0_step1/distributed.json", - "/zhouxiangquan/llama10b/dump_info/rank1_step1/distributed.json", - "/zhouxiangquan/llama10b/dump_info/rank2_step1/distributed.json", - "/zhouxiangquan/llama10b/dump_info/rank3_step1/distributed.json", - "/zhouxiangquan/llama10b/dump_info/rank4_step1/distributed.json", - "/zhouxiangquan/llama10b/dump_info/rank5_step1/distributed.json", - "/zhouxiangquan/llama10b/dump_info/rank6_step1/distributed.json", - "/zhouxiangquan/llama10b/dump_info/rank7_step1/distributed.json"] -model_keyword = ["/zhouxiangquan/llama10b/dump_info/rank0_step1/class.json", - "/zhouxiangquan/llama10b/dump_info/rank1_step1/class.json", - "/zhouxiangquan/llama10b/dump_info/rank2_step1/class.json", - "/zhouxiangquan/llama10b/dump_info/rank3_step1/class.json", - "/zhouxiangquan/llama10b/dump_info/rank4_step1/class.json", - "/zhouxiangquan/llama10b/dump_info/rank5_step1/class.json", - "/zhouxiangquan/llama10b/dump_info/rank6_step1/class.json", - "/zhouxiangquan/llama10b/dump_info/rank7_step1/class.json"] -common_keyword = ["/zhouxiangquan/llama10b/dump_info/rank0_step1/common.json", - "/zhouxiangquan/llama10b/dump_info/rank1_step1/common.json", - "/zhouxiangquan/llama10b/dump_info/rank2_step1/common.json", - "/zhouxiangquan/llama10b/dump_info/rank3_step1/common.json", - "/zhouxiangquan/llama10b/dump_info/rank4_step1/common.json", - "/zhouxiangquan/llama10b/dump_info/rank5_step1/common.json", - "/zhouxiangquan/llama10b/dump_info/rank6_step1/common.json", - "/zhouxiangquan/llama10b/dump_info/rank7_step1/common.json"] +input_json_files = ["/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/forward_rank0_all.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/forward_rank1_all.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/forward_rank2_all.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/forward_rank3_all.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/forward_rank4_all.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/forward_rank5_all.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/forward_rank6_all.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/forward_rank7_all.json"] +distributed_keyword = ["/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/distributed.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/distributed.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/distributed.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/distributed.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/distributed.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/distributed.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/distributed.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/distributed.json"] +model_keyword = ["/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/class.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/class.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/class.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/class.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/class.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/class.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/class.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/class.json"] +common_keyword = ["/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/common.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/common.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/common.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/common.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/common.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/common.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/common.json", + "/ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/common.json"] diff --git a/paddleapex/api_tracer/Dump.py b/paddleapex/api_tracer/Dump.py index d95d716..bd7c45b 100644 --- a/paddleapex/api_tracer/Dump.py +++ b/paddleapex/api_tracer/Dump.py @@ -101,6 +101,7 @@ def dump_real_data(self, api_args, tensor, rank): create_directory(remote_repo) self.pool.safe_parellel_save(tensor, file_path, remote_repo) else: + # print("sss----save----") save_tensor(tensor, file_path) return f"{api_args}.pt" diff --git a/paddleapex/api_tracer/api_info.py b/paddleapex/api_tracer/api_info.py index 9407228..808884e 100644 --- a/paddleapex/api_tracer/api_info.py +++ b/paddleapex/api_tracer/api_info.py @@ -108,6 +108,8 @@ def __init__(self, mode): self.embedding_num = 0 self.output_num = 0 self.dout_list = [] + self.out_list = [] + self.arg_index = 0 self.is_half_precision = False self.is_distributed = False if cfg.profile_mode: @@ -127,9 +129,13 @@ def update_real_data(self, inputs, kwargs): args_info_list = self.analyze_element(inputs) kwargs_info_dict = self.analyze_element(kwargs) self.api_info_struct = { - self.op_name: {"args": args_info_list, "kwargs": kwargs_info_dict, "dout_list": ["Failed"]} + self.op_name: {"args": args_info_list, "kwargs": kwargs_info_dict, "out_list": ["Failed"], "dout_list": ["Failed"]} } dump_util.update_api_dict(self.api_info_struct, self.rank, self.is_half_precision, self.is_distributed) + + def update_output(self, outputs): + self.out_list = self.analyze_element(outputs) + self.api_info_struct[self.op_name].update({"out_list": self.dout_list}) def record_dout(self, grad_value): if grad_value is not None: @@ -179,6 +185,12 @@ def effi_analyze_tensor(self, arg): single_arg.update({"type": "paddle.Tensor"}) single_arg.update({"dtype": str(arg.dtype.name)}) single_arg.update({"shape": arg.shape}) + arg_name = arg.name + exit_tensor = arg_name.startswith("APEX_") + if not exit_tensor: + arg.name = "APEX_" + self.op_name + "_" + str(self.arg_index) + single_arg.update({"name": arg.name}) + self.arg_index = self.arg_index + 1 try: with paddle.no_grad(): max_ = paddle.max(arg).item() @@ -202,8 +214,10 @@ def effi_analyze_tensor(self, arg): single_arg.update({"Min": min_}) single_arg.update({"Min_origin": min_}) single_arg.update({"stop_gradient": arg.stop_gradient}) - if self.mode == "real_data" and (dist.get_rank() == 0 or self.is_distributed): + # if self.mode == "real_data" and (dist.get_rank() == 0 or self.is_distributed): + if self.mode == "real_data": api_args = self.op_name + "." + str(self.args_num) + # if not exit_tensor: pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank) self.args_num += 1 single_arg.update({"real_data_path": pt_path}) @@ -227,8 +241,8 @@ def _analyze_tensor(self, arg): ) single_arg.update({"stop_gradient": arg.stop_gradient}) - # if self.mode == "real_data": - if self.mode == "real_data" and (dist.get_rank() == 0 or self.is_distributed): + # if self.mode == "real_data" and (dist.get_rank() == 0 or self.is_distributed): + if self.mode == "real_data": api_args = self.op_name + "." + str(self.args_num) pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank) self.args_num += 1 diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index f7a7f53..6fdf250 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -1,4 +1,5 @@ -target_class: +ignored_op: +#target_class: - paddlenlp.transformers.llama.modeling.LlamaMLP - paddlenlp.transformers.llama.modeling.LlamaLMHead - paddlenlp.transformers.llama.modeling.LlamaRMSNorm @@ -19,9 +20,11 @@ target_class: - paddlenlp.transformers.llama.modeling.LlamaPretrainingCriterion - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss - paddlenlp.transformers.llama.modeling.LlamaForCausalLM +target_class: + - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler ignored_op: - - paddle._C_ops.min - paddle._C_ops.max + - paddle._C_ops.min - paddle.empty - paddle.empty_like - paddle.reshape @@ -41,34 +44,8 @@ ignored_op: - paddle.stack - paddle.zeros - paddle.zeros_like -# self-confidence, arrogance - # - paddle.Tensor.__add__ - # - paddle.Tensor.__mul__ - # - paddle.Tensor.__neg__ - # - paddle.Tensor.add_ -# distributed -# - paddle.distributed.barrier -# - paddle.distributed.broadcast_object_list -# - paddle.distributed.communication.stream.alltoall_single -# - paddle.distributed.communication.stream.broadcast -# - paddle.distributed.communication.stream.gather -# - paddle.distributed.communication.stream.recv -# - paddle.distributed.communication.stream.reduce -# - paddle.distributed.communication.stream.reduce_scatter -# - paddle.distributed.communication.stream.scatter -# - paddle.distributed.communication.stream.send -# - paddle.distributed.all_gather -# - paddle.distributed.all_gather_object -# - paddle.distributed.all_reduce -# - paddle.distributed.alltoall -# - paddle.distributed.alltoall_single -# - paddle.distributed.broadcast -# - paddle.distributed.communication.stream.all_gather -# - paddle.distributed.communication.stream.all_reduce -# - paddle.distributed.communication.stream.alltoall - -target_op: - # distributed +distributed_op: + # distributed - paddle.distributed.barrier - paddle.distributed.broadcast_object_list - paddle.distributed.communication.stream.alltoall_single @@ -88,9 +65,47 @@ target_op: - paddle.distributed.communication.stream.all_gather - paddle.distributed.communication.stream.all_reduce - paddle.distributed.communication.stream.alltoall - # # + - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity + - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table + - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy + - paddle.distributed.fleet.layers.mpu.mp_ops. +target_op: + - paddle._C_ops.min +#ignored_op: + - paddle._C_ops.min + - paddle._C_ops.max +#target_op: + - paddle.empty + - paddle.empty_like + - paddle.reshape + - paddle.reshape_ + - paddle.unsqueeze + - paddle.unsqueeze_ + - paddle.square_ + - paddle.Tensor.squeeze + - paddle.Tensor.squeeze_ + - paddle.Tensor.unsqueeze + - paddle.Tensor.unsqueeze_ + - paddle.squeeze_ + - paddle.ones + - paddle.ones_like + - paddle.split + - paddle.Tensor.zero_ + - paddle.stack + - paddle.zeros + - paddle.zeros_like +# self-confidence, arrogance + - paddle.Tensor.__add__ + - paddle.multiply + - paddle.multiply_ + - paddle.Tensor.__mul__ + - paddle.Tensor.__neg__ + - paddle.Tensor.add_ + - paddle._C_ops.adamw + - paddle._C_ops.adamw_ + - paddle._C_ops.layer_norm +#target_op: - paddle.nn.functional.scaled_dot_product_attention - # Special op, paddle has wrapped op in framework. #noqa - paddle._C_ops.layer_norm #noqa - paddle.nn.functional.adaptive_avg_pool1d - paddle.nn.functional.adaptive_avg_pool2d @@ -546,9 +561,9 @@ target_op: - paddle.vsplit - paddle.where - paddle.where_ - # - paddle.zeros - # - paddle.zeros_like - # - paddle.Tensor.T + - paddle.zeros + - paddle.zeros_like + - paddle.Tensor.T - paddle.Tensor.__add__ - paddle.Tensor.__and__ - paddle.Tensor.__radd__ @@ -911,9 +926,9 @@ target_op: - paddle.Tensor.where - paddle.Tensor.where_ #### experiment op: - - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity - - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table - - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy - paddle._C_ops.fused_gemm_epilogue - paddle.optimizer.Adam - paddle.optimizer.AdamW @@ -941,10 +956,10 @@ target_op: - paddle.uniform - paddle._C_ops.gaussian - paddle._legacy_C_ops.c_identity - - paddle.distributed.fleet.layers.mpu.mp_ops. - - paddlenlp.transformers.llama.fusion_ops.fused_rotary_position_embedding - - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention - - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm - - paddlenlp.transformers.llama.fusion_ops.fusion_rope + # - paddle.distributed.fleet.layers.mpu.mp_ops. + # - paddlenlp.transformers.llama.fusion_ops.fused_rotary_position_embedding + # - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention + # - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm + # - paddlenlp.transformers.llama.fusion_ops.fusion_rope # - paddlenlp.transformers.llama.fusion_ops.swiglu diff --git a/paddleapex/api_tracer/configs/tool_config.yaml b/paddleapex/api_tracer/configs/tool_config.yaml index 1eb25d1..95df766 100644 --- a/paddleapex/api_tracer/configs/tool_config.yaml +++ b/paddleapex/api_tracer/configs/tool_config.yaml @@ -18,7 +18,7 @@ dump_mode: "real_data" profile_mode: True # target_step is a list, dump api function will turn on at the specific step -target_step: [1] +target_step: [0] # Remove duplicate apis from dump_info and keep only one api in the same value range. # dump_unique: True diff --git a/paddleapex/api_tracer/wrap_op/OPTemplate.py b/paddleapex/api_tracer/wrap_op/OPTemplate.py index f9120ae..82fb171 100644 --- a/paddleapex/api_tracer/wrap_op/OPTemplate.py +++ b/paddleapex/api_tracer/wrap_op/OPTemplate.py @@ -63,7 +63,6 @@ def hijack_call(self, *args, **kwargs): cls = self.__class__ init_params = get_init_params(self) # print("init_params", init_params) - # print("hijack_call", self.__class__.__name__) cfg.prefix_op_name_ = self.prefix_op_name_ + "*" if self.__class__.__name__ not in cfg.Op_count: @@ -79,6 +78,7 @@ def hijack_call(self, *args, **kwargs): api_recorder.update_real_data(args, kwargs) save_init_params_and_weight(init_params, self.state_dict(), cfg.prefix_op_name_, rank) output = self.forward(*args, **kwargs) + # api_recorder.update_output(output) # print("api_info_struct !!!!!!", api_recorder.api_info_struct) # print(output) try: @@ -124,6 +124,7 @@ def forward(self, *args, **kwargs): api_recorder.update_APIInfo(cfg.prefix_op_name_, rank) api_recorder.update_real_data(args, kwargs) output = getattr(HookOp, "wrap_" + str(self.op_name_))(*args, **kwargs) + # api_recorder.update_output(output) try: if isinstance(output, paddle.Tensor): if not output.stop_gradient: diff --git a/paddleapex/api_tracer/wrap_op/hijack_tool.py b/paddleapex/api_tracer/wrap_op/hijack_tool.py index 2430d5e..d70fe8e 100644 --- a/paddleapex/api_tracer/wrap_op/hijack_tool.py +++ b/paddleapex/api_tracer/wrap_op/hijack_tool.py @@ -19,9 +19,6 @@ cfg = config.cfg -# from paddlenlp.test_model.test_model import SimpleModel -# from paddlenlp.transformers.llama.modeling import LlamaLMHead, LlamaMLP - def wrapped_op(op_name): def op_template(*args, **kwargs): return OPTemplate(op_name)(*args, **kwargs) @@ -29,18 +26,10 @@ def op_template(*args, **kwargs): return op_template -# LlamaLMHead.prefix_op_name_ = "paddlenlp.transformers.llama.modeling.LlamaLMHead" -# LlamaLMHead.__call__ = hijack_call - -# LlamaMLP.prefix_op_name_ = "paddlenlp.transformers.llama.modeling.LlamaMLP" -# LlamaMLP.__call__ = hijack_call - - def hijack_api(): op = GetTargetOP(cfg.op_target_pth) target_op = op.get_target_ops() target_class = op.get_target_class() - # target_op.add("paddlenlp.test_model.test_model.SimpleModel") for op_name in target_op: parent_package, method_name = op_name.rsplit(".", maxsplit=1) try: @@ -54,14 +43,13 @@ def hijack_api(): print(op_name, str(err)) for class_in in target_class: - print("begin class --------------------------------", class_in) + # print("begin class --------------------------------", class_in) parent_package, class_n = class_in.rsplit(".", maxsplit=1) try: class_name, model = try_import(parent_package) model = getattr(model, class_n) model.prefix_op_name_ = class_in model.__call__ = hijack_call - # print("model---!!!!!!!!!!", model) except Exception as err: print(class_in, str(err)) From 48db07b2ea2bd0b12963b6c1649fc5d11ab34be2 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Tue, 17 Dec 2024 15:25:49 +0800 Subject: [PATCH 09/22] fix some bug and commine files --- paddleapex/apex/acc_direct_cmp_zxq.py | 49 +- paddleapex/apex/{lot_t.py => combine_file.py} | 0 paddleapex/apex/run_class.py | 669 ------------------ paddleapex/apex/run_distributed.py | 648 ----------------- paddleapex/apex/run_paddle.py | 117 +-- paddleapex/apex/run_without_distributed.py | 636 ----------------- 6 files changed, 87 insertions(+), 2032 deletions(-) rename paddleapex/apex/{lot_t.py => combine_file.py} (100%) delete mode 100644 paddleapex/apex/run_class.py delete mode 100644 paddleapex/apex/run_distributed.py delete mode 100644 paddleapex/apex/run_without_distributed.py diff --git a/paddleapex/apex/acc_direct_cmp_zxq.py b/paddleapex/apex/acc_direct_cmp_zxq.py index c4753f2..42c8765 100644 --- a/paddleapex/apex/acc_direct_cmp_zxq.py +++ b/paddleapex/apex/acc_direct_cmp_zxq.py @@ -163,37 +163,25 @@ def compare_device_bench( print(msg) error_i = [] - print(api_file + " forward -------------") + msg = f"{api_file} forward -------------" + Warning_list.append(msg) + print(msg) compare_result(bench_out_tensor, device_out_tensor, error_i, api_file + " forward") errors_forward_info = errors_forward_info + error_i - # for e in error_i: - # errors.append(float(e.split(" ")[0])) - # errors_info.append(api_file + " forward " + e) - print(api_file + " backward -------------") error_i = [] + msg = f"{api_file} backward -------------" + Warning_list.append(msg) + print(msg) compare_result(bench_grad_tensor_list, device_grad_tensor_list, error_i, api_file + " backward") errors_bacward_info = errors_bacward_info + error_i - - # for e in error_i: - # errors.append(float(e.split(" ")[0])) - # errors_info.append(api_file + " backward " + e) - #compare.compare_output( - # api_file, - # bench_out_tensor, - # device_out_tensor, - # bench_grad_tensor_list, - # device_grad_tensor_list, - # bench_BF16_flag, - # device_BF16_flag, # BF16 convert flag - #) except Exception as err: print(err) errors_bacward_info.sort(key=lambda x: x[1]) errors_forward_info.sort(key=lambda x: x[1]) - df = pd.DataFrame(errors_bacward_info, columns=["operator_name", "error", "bench_info", "device_info"]) + df = pd.DataFrame(errors_bacward_info, columns=["operator_name", "error<0.001", "bench_data", "device_data", "diff_value", "diff_index"]) df.to_csv("log/rank" + str(dist.get_rank()) + "_backward_output.csv", index=False) - df = pd.DataFrame(errors_forward_info, columns=["operator_name", "error", "bench_info", "device_info"]) + df = pd.DataFrame(errors_forward_info, columns=["operator_name", "error<0.001", "bench_data", "device_data", "diff_value", "diff_index"]) df.to_csv("log/rank" + str(dist.get_rank()) + "_forward_output.csv", index=False) warning_log_pth = os.path.join(out_path, "./compare_warning.txt") @@ -212,10 +200,6 @@ def normalize_t(tensor0, tensor1): if min_val == max_val: return paddle.ones_like(tensor0), paddle.ones_like(tensor1) return (tensor0 - min_val) / (max_val - min_val), (tensor1 - min_val) / (max_val - min_val) - # normalized_tensor_0_1 = (tensor0 - min_val) / (max_val - min_val) - # return normalized_tensor_0_1 - # normalized_tensor_neg1_1 = normalized_tensor_0_1 * 2 - 1 - # return normalized_tensor_neg1_1 def compare_result(bench_output, device_output, errors, name): if isinstance(bench_output, (list, tuple)): @@ -228,7 +212,6 @@ def compare_result(bench_output, device_output, errors, name): # bench_output = paddle.cast(bench_output, "float") # device_output = paddle.cast(device_output, "float") diff = paddle.cast((bench_output - device_output).abs(), "float") - # abs_diff = ((bench_output - device_output) / bench_output).abs() num = len(diff) diff005 = (diff < 0.05).sum() / num diff001 = (diff < 0.01).sum() / num @@ -240,8 +223,9 @@ def compare_result(bench_output, device_output, errors, name): error_info = diff0001.numpy() bench_n = paddle.cast(bench_output_o[diff_index], "float").numpy().tolist() device_n = paddle.cast(device_output_o[diff_index], "float").numpy().tolist() - # error_info = error_info + " bench_value: " + str(bench_n) + " device_value: " + str(device_n) - errors.append((name, error_info, str(bench_n), str(device_n))) + diff_index_n = diff_index.numpy().tolist() + diff_value_n = diff_value.numpy().tolist() + errors.append((name, error_info, str(bench_n), str(device_n), str(diff_value_n), str(diff_index_n))) print("diff is too large---------------------------- erorr Erorr ERORR----------------------------") print("bench_output----------") print(bench_output_o[diff_index]) @@ -253,17 +237,6 @@ def compare_result(bench_output, device_output, errors, name): print("diff < 0.001: ", diff0001.numpy()) print("diff < 0.0005: ", diff00005.numpy()) - # diff005 = (abs_diff < 0.05).sum() / num - # diff001 = (abs_diff < 0.01).sum() / num - # diff0005 = (abs_diff < 0.005).sum() / num - # diff0001 = (abs_diff < 0.001).sum() / num - # print("abs_diff < 0.05: ", diff005.numpy()) - # print("abs_diff < 0.01: ", diff001.numpy()) - # print("abs_diff < 0.005: ", diff0005.numpy()) - # print("abs_diff < 0.001: ", diff0001.numpy()) - - - if __name__ == "__main__": parser = argparse.ArgumentParser() diff --git a/paddleapex/apex/lot_t.py b/paddleapex/apex/combine_file.py similarity index 100% rename from paddleapex/apex/lot_t.py rename to paddleapex/apex/combine_file.py diff --git a/paddleapex/apex/run_class.py b/paddleapex/apex/run_class.py deleted file mode 100644 index bf73373..0000000 --- a/paddleapex/apex/run_class.py +++ /dev/null @@ -1,669 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddlenlp # if you wanna test nlp fusion operations -import argparse -import os -from importlib import import_module -import pickle -import shutil -import time -import copy -from tqdm import tqdm -import paddle -import paddle.distributed as dist -from paddle import framework -from paddle.base import core -from utils import ( - print_info_log, - gen_api_params, - api_json_read, - check_grad_list, - rand_like, - gen_args, - print_warn_log, -) - -type_map = { - "FP16": paddle.float16, - "FP32": paddle.float32, - "BF16": paddle.bfloat16, -} -Warning_list = [] - -current_time = time.strftime("%Y%m%d%H%M%S") - -tqdm_params = { - "smoothing": 0, # 平滑进度条的预计剩余时间,取值范围0到1 - "desc": "Processing", # 进度条前的描述文字 - "leave": True, # 迭代完成后保留进度条的显示 - "ncols": 75, # 进度条的固定宽度 - "mininterval": 0.1, # 更新进度条的最小间隔秒数 - "maxinterval": 1.0, # 更新进度条的最大间隔秒数 - "miniters": 1, # 更新进度条之间的最小迭代次数 - "ascii": None, # 根据环境自动使用ASCII或Unicode字符 - "unit": "it", # 迭代单位 - "unit_scale": True, # 自动根据单位缩放 - "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 - "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 -} - - -PROFILE_RUN_TIMES = 10 -PROFILE_WARM_TIMES = 10 - - -from paddle.distributed import fleet -strategy = fleet.DistributedStrategy() -strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": 8, - "pp_degree": 1, - "sharding_degree": 1, -} -fleet.init(is_collective=True, strategy=strategy) -paddle.set_default_dtype("bfloat16") - -def recursive_delete_arg(arg_in): - if isinstance(arg_in, (list, tuple)): - for item in arg_in: - recursive_delete_arg(item) - return - elif isinstance(arg_in, paddle.Tensor): - del arg_in - return -def get_shape(arg_in): - if isinstance(arg_in, (list, tuple)): - res = [] - for item in arg_in: - ret_value = get_shape(item) - res.append(ret_value) - return res - elif isinstance(arg_in, paddle.Tensor): - shape = arg_in.shape - return shape - -def merge_two_lists(lst1, lst2): - merged_list = [] - if lst1 is None and lst2 is not None: - merged_list = lst2 - elif lst1 is not None and lst2 is None: - merged_list = lst1 - elif lst1 is None and lst2 is None: - merged_list = [] - else: - for item in lst1: - if item is None: - continue - else: - merged_list.append(item) - for item in lst2: - if item is None: - continue - else: - merged_list.append(item) - return merged_list - -def convert_out2fp32(arg_in): - flag = False - if isinstance(arg_in, (list, tuple)): - res = [] - for item in arg_in: - ret_flag, ret_value = convert_out2fp32(item) - res.append(ret_value) - flag = flag or ret_flag - return flag, res - elif isinstance(arg_in, paddle.Tensor): - if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": - try: - arg_in = arg_in.cast("float32") - flag = True - except Exception as err: - print(arg_in) - return False, arg_in - return flag, arg_in - - -def recursive_arg_to_cpu(arg_in): - if isinstance(arg_in, (list, tuple)): - res = [] - for item in arg_in: - res.append(recursive_arg_to_cpu(item)) - return res - elif isinstance(arg_in, paddle.Tensor): - arg_in = arg_in.to( - "cpu" - ) # avoid using .cpu(), which will cause the gradient to be lost - return arg_in - - -def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): - if isinstance(arg_in, (list, tuple)): - return type(arg_in)( - recursive_arg_to_device(arg, backend, enforce_dtype) for arg in arg_in - ) - elif isinstance(arg_in, paddle.Tensor): - grad_status = arg_in.stop_gradient - with paddle.no_grad(): - if "gpu" in backend: - arg_in = arg_in.cuda() - if "cpu" in backend: - arg_in = arg_in.cpu() - if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": - arg_in = arg_in.cast("float32") - else: - arg_in = arg_in.to(backend) - if enforce_dtype and arg_in.dtype.name in ["BF16", "BFLOAT16", "FP16", "FP32"]: - arg_in = arg_in.cast(enforce_dtype) - arg_in.stop_gradient = grad_status - return arg_in - else: - return arg_in - - -def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): - if not dist.get_rank() == 0: - return - if dtype_name == "": - bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) - fwd_output_dir = os.path.abspath(os.path.join(out_path, "output")) - else: - bwd_output_dir = os.path.abspath( - os.path.join(out_path, dtype_name, "output_backward") - ) - fwd_output_dir = os.path.abspath(os.path.join(out_path, dtype_name, "output")) - fwd_output_path = os.path.join(fwd_output_dir, api_call_name) - bwd_output_path = os.path.join(bwd_output_dir, api_call_name) - os.makedirs(fwd_output_dir, exist_ok=True) - os.makedirs(bwd_output_dir, exist_ok=True) - bwd_BF16_flag, fwd_BF16_flag = True, True - if isinstance(forward_res, (type(None), list, tuple, paddle.Tensor)): - try: - # fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) - paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) - except Exception as err: - msg = "save_forward Error: %s" % str(err) - print_warn_log(msg) - return - else: - print(forward_res) - print_warn_log("forward_res not supported!") - if isinstance(backward_res, (type(None), list, tuple, paddle.Tensor)): - try: - # bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) - paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) - except Exception as err: - msg = "save_bacward Error: %s" % str(err) - print_warn_log(msg) - return - else: - print(backward_res) - print_warn_log("bacward_res not supported!") - - -def evoke_related_test_func(test_mode): - func_method = [] - if "acc" in test_mode: - func_method.append(run_acc_case) - if "mem" in test_mode: - func_method.append(run_mem_case) - if "pro" in test_mode: - func_method.append(run_profile_case) - if test_mode == "all": - return [run_acc_case, run_mem_case, run_profile_case] - if len(func_method) == 0: - raise ValueError("test mode is not supported!") - return func_method - - -def ut_case_parsing(forward_content, cfg): - run_case_funcs = evoke_related_test_func(cfg.test_mode) - backend = cfg.backend - out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" - os.mkdir(out_path) if not os.path.exists(out_path) else None - multi_dtype_ut = cfg.multi_dtype_ut.split(",") if cfg.multi_dtype_ut else [] - debug_case = cfg.test_case_name.split(",") if cfg.test_case_name else [] - print("debug_case", debug_case) - debug_mode = False - paddle.set_device(cfg.backend) - if len(debug_case) > 0: - debug_mode = True - enforce_types = [type_map[item] for item in multi_dtype_ut] - for i, (api_call_name, api_info_dict) in enumerate( - tqdm(forward_content.items(), **tqdm_params) - ): - print(api_call_name) - if debug_mode and api_call_name not in debug_case: - continue - if len(multi_dtype_ut) > 0: - for enforce_dtype in enforce_types: - print(api_call_name + "*" + enforce_dtype.name) - args = api_call_name, api_info_dict, backend, out_path - kwargs = {"enforce_dtype": enforce_dtype, "debug_case": debug_case, "real_data_path": cfg.real_data} - for run_case in run_case_funcs: - run_case(*args, **kwargs) - print("*" * 100) - else: - print(api_call_name) - args = api_call_name, api_info_dict, backend, out_path - kwargs = {"enforce_dtype": None, "debug_case": debug_case, "real_data_path": cfg.real_data} - if isinstance(run_case_funcs, list): - for run_case in run_case_funcs: - run_case(*args, **kwargs) - else: - run_case_funcs(*args, **kwargs) - print("*" * 100) - - -def create_input_args(api_info, backend, enforce_dtype=None, real_data_path=None): - print(real_data_path) - args, kwargs, need_backward = gen_api_params(api_info, real_data_path) - device_args = recursive_arg_to_device(args, backend, enforce_dtype) - device_kwargs = { - key: recursive_arg_to_device(value, backend, enforce_dtype) - for key, value in kwargs.items() - } - return device_args, device_kwargs, need_backward - - -def create_dout(dout_info_dict, device_out, backend, enforce_dtype=None, real_data_path=None): - if dout_info_dict[0] != "Failed": - dout, _ = gen_args(dout_info_dict, real_data_path) - else: - print("dout dump json is None!") - dout = rand_like(device_out) - dout = recursive_arg_to_device(dout, backend, enforce_dtype) - return dout - - -def run_forward(api_call_name, device_args, device_kwargs): - try: - # paddle.distributed.barrier() - device_out = eval(api_call_stack)(*device_args, **device_kwargs) - paddle.device.synchronize() - return device_out - - except Exception as err: - msg = f"Run API {api_call_name} Forward Error: %s" % str(err) - print_warn_log(msg) - Warning_list.append(msg) - return None - - -def get_grad_tensor(args, kwargs): - device_grad_out = [] - for arg in args: - if isinstance(arg, paddle.Tensor): - device_grad_out.append(arg.grad) - if isinstance(arg, list): # op: concat/stack - for x in arg: - if isinstance(x, paddle.Tensor): - device_grad_out.append(x.grad) - for k, v in kwargs.items(): - if isinstance(v, paddle.Tensor): - device_grad_out.append(v.grad) - if isinstance(v, list): # op: concat/stack - for x in v: - if isinstance(x, paddle.Tensor): - device_grad_out.append(x.grad) - return device_grad_out - - -def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=None): - if need_backward: - try: - paddle.autograd.backward([device_out], dout) - device_grad_out = get_grad_tensor(args, kwargs) - device_grad_out = check_grad_list(device_grad_out) - if device_grad_out is None: - msg = f"{api_call_name} grad_list is None" - Warning_list.append(msg) - return device_grad_out - except Exception as err: - msg = f"Run API {api_call_name} backward Error: %s" % str(err) - print_warn_log(msg) - Warning_list.append(msg) - return None - else: - msg = f"{api_call_name} has no tensor required grad, SKIP Backward" - print_warn_log(msg) - Warning_list.append(msg) - return None - - -def load_params(filename): - with open(filename, 'rb') as f: - return pickle.load(f) - - -def run_acc_case( - api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None -): - api_info_dict_copy = copy.deepcopy(api_info_dict) - device_args, device_kwargs, need_backward = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) - print(f"Running {api_call_name} acc test!") - if real_data_path is None: - print("do not support!!!!!!!!!!!!") - return - init_path = real_data_path + api_call_name + ".init_params" - print(init_path) - state_path = real_data_path + api_call_name + ".state_dict" - print(state_path) - init_para = load_params(init_path) - api_call_stack = api_call_name.rsplit("*")[0] - parent_package, class_n = api_call_stack.rsplit(".", maxsplit=1) - try: - MODULE = import_module(parent_package) - class_model = getattr(MODULE, class_n) - model = class_model(**init_para) - model.set_state_dict(paddle.load(state_path)) - device_out = model(*device_args, **device_kwargs) - paddle.device.synchronize() - # print(device_out) - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return - - try: - device_grad_out = [] - if api_info_dict["dout_list"][0] != "Failed": - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - device_grad_out = run_backward( - api_call_name, device_out, dout, device_args, device_kwargs, need_backward - ) - else: - # if api_call_name.rsplit("*")[0] in distributed_op: - # print('this is distributed op: ', api_call_name) - # device_out = device_args - device_grad_out = None - except Exception as err: - msg = "Run_backward Error: %s" % str(err) - print_warn_log(msg) - if enforce_dtype: - save_tensor( - device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name - ) - else: - save_tensor(device_out, device_grad_out, out_path, api_call_name) - return - if enforce_dtype: - save_tensor( - device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name - ) - else: - save_tensor(device_out, device_grad_out, out_path, api_call_name) - - # paddle.distributed.barrier() - return - - -def run_profile_case( - api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None -): - print(f"Running {api_call_name} profile test!") - api_info_dict_copy = copy.deepcopy(api_info_dict) - device_args, device_kwargs, need_backward = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) - - init_path = real_data_path + api_call_name + ".init_params" - print(init_path) - state_path = real_data_path + api_call_name + ".state_dict" - print(state_path) - init_para = load_params(init_path) - api_call_stack = api_call_name.rsplit("*")[0] - parent_package, class_n = api_call_stack.rsplit(".", maxsplit=1) - MODULE = import_module(parent_package) - class_model = getattr(MODULE, class_n) - model = class_model(**init_para) - model.set_state_dict(paddle.load(state_path)) - - # if api_call_name in debug_case: - # x = [device_args, device_kwargs] - # out_path = os.path.realpath(out_path) if out_path else "./" - # save_pth = os.path.join(out_path, "input_data", api_call_name) - # paddle.save(x, save_pth) - # device warmming up - if api_info_dict["dout_list"][0] == "Failed": - need_backward = False - input_shape1 = get_shape(device_args) - input_shape2 = get_shape(device_kwargs) - input_shape_lst = merge_two_lists(input_shape1, input_shape2) - output_shape_lst = [] - def profile_inner_loop_(): - try: - paddle.device.synchronize() - for _ in range(PROFILE_WARM_TIMES): - device_out = model(*device_args, **device_kwargs) - output_shape_lst = get_shape(device_out) - paddle.device.synchronize() - fwd_start_time = time.time() - for _ in range(PROFILE_RUN_TIMES): - device_out = model(*device_args, **device_kwargs) - paddle.device.synchronize() - fwd_end_time = time.time() - fwd_time = fwd_end_time - fwd_start_time - fwd_time = fwd_time * 1000000 / float(PROFILE_RUN_TIMES) # fwd_time is in us - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return -1, -1, output_shape_lst - try: - if not need_backward: - return fwd_time, -1, output_shape_lst - dout = create_dout(api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path) - print("create_dout------") - paddle.device.synchronize() - device_out_list = [] - for _ in range(PROFILE_RUN_TIMES): - device_out_list.append(model(*device_args, **device_kwargs)) - paddle.device.synchronize() - print("Run_backward------") - bwd_start_time = time.time() - for i in range(PROFILE_RUN_TIMES): - paddle.autograd.backward([device_out_list[i]], dout) - paddle.device.synchronize() - bwd_end_time = time.time() - bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second - bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us - # bwd_time = bwd_time - fwd_time - except Exception as err: - msg = "Run_backward Error: %s" % str(err) - print_warn_log(msg) - return fwd_time, -1, output_shape_lst - return fwd_time, bwd_time, output_shape_lst - - try: - fwd_time, bwd_time, output_shape_lst = profile_inner_loop_() - except Exception as err: - msg = f"Run {api_call_name} profile Error: %s" % str(err) - print_warn_log(msg) - Warning_list.append(msg) - return - - if not enforce_dtype: - log_path = os.path.join(out_path, "profile_analyze" + str(dist.get_rank()) +".log") - else: - log_path = os.path.join(out_path, enforce_dtype.name, "profile_analyze" + str(dist.get_rank()) +".log") - - F = open(log_path, "a") - dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" - op_fwd = api_call_name + dtype + ".forward" - op_bwd = api_call_name + dtype + ".backward" - print_info_log(f"{op_fwd}:\t{fwd_time}") - print_info_log(f"{op_bwd}:\t{bwd_time}") - dtype = "\t" if not enforce_dtype else f"\t{enforce_dtype.name}" - msg_fwd = f"{api_call_name}.forward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tforward\t{fwd_time}" - msg_bwd = f"{api_call_name}.backward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tbackward\t{bwd_time}" - - F.write(msg_fwd + "\n") - F.write(msg_bwd + "\n") - F.close() - return - - -def run_mem_case( - api_call_name, - api_info_dict, - backend, - out_path, - enforce_dtype=None, - debug_case=[], # noqa - real_data_path=None -): - print(f"Running {api_call_name} mem test!") - - activation_cost = None - place = framework._current_expected_place_() - device_id = place.get_device_id() - before_run_mem = core.device_memory_stat_current_value("Allocated", device_id) - api_info_dict_copy = copy.deepcopy(api_info_dict) - device_args, device_kwargs, _ = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) - try: - device_out = run_forward(api_call_name, device_args, device_kwargs) - recursive_delete_arg(device_args) - for _, value in device_kwargs.items(): - recursive_delete_arg(value) - _ = recursive_arg_to_cpu(device_out) - after_run_mem = core.device_memory_stat_current_value("Allocated", device_id) - activation_cost = after_run_mem - before_run_mem - - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return - if not enforce_dtype: - log_path = os.path.join(out_path, "memory_analyze.log") - else: - log_path = os.path.join(out_path, enforce_dtype.name, "memory_analyze.log") - - os.mkdir(out_path) if not os.path.exists(out_path) else None - F = open(log_path, "a") - dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" - op_name = api_call_name + dtype + ".forward" - F.write(f"{op_name}:\t{str(activation_cost)}\n") - F.close() - return - - -def arg_parser(parser): - parser.add_argument( - "-json_file", - "--json_file", - dest="json_path", - default="", - type=str, - help="Dump json file path", - required=True, - ) - parser.add_argument( - "-out", - "--dump_path", - dest="out_path", - default="./paddle/", - type=str, - help=" The ut task result out path.", - required=False, - ) - parser.add_argument( - "-backend", - "--backend", - dest="backend", - default="gpu", - type=str, - help=" The running device DEVICE or BENCH.", - required=False, - ) - parser.add_argument( - "-dtype", - "--enforce-dtype", - dest="multi_dtype_ut", - default="", - type=str, - help="", - required=False, - ) - parser.add_argument( - "-real", - "--real_data", - dest="real_data", - default="", - type=str, - help="", - required=False, - ) - parser.add_argument( - "-op", - "--op_name", - dest="test_case_name", - default="", - type=str, - help="debug_op name", - required=False, - ) - parser.add_argument( - "-mode", - "--mode", - dest="test_mode", - default="all", - type=str, - help="debug_op name", - required=False, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - arg_parser(parser) - cfg = parser.parse_args() - print(cfg) - dist.init_parallel_env() - local_rank = dist.get_rank() - # json_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step5/forward_rank" + str(local_rank) + "_all.json" - json_path_list = cfg.json_path.split(' ') - data_path_list = cfg.real_data.split(' ') - - print("json_path_list", json_path_list) - print("data_path_list", data_path_list) - - cfg.json_path = json_path_list[local_rank] - cfg.real_data = data_path_list[local_rank] - cfg.backend = cfg.backend + ":" + str(local_rank) - - print(cfg) - # data_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step0/" - # cfg.real_data = None - - forward_content = api_json_read(cfg.json_path) - out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" - if os.path.exists(out_path): - print_warn_log("The output path already exists and the file with the same name will be overwritten.") - out_path = out_path + "/rank_" + str(local_rank) + "/" - if not os.path.exists(out_path): - os.makedirs(out_path, exist_ok=True) - cfg.out_path = out_path - ut_case_parsing(forward_content, cfg) - print_info_log("UT save completed") - warning_log_pth = os.path.join(out_path, "./warning_log.txt") - File = open(warning_log_pth, "w") - for item in Warning_list: - File.write(item + "\n") - File.close() diff --git a/paddleapex/apex/run_distributed.py b/paddleapex/apex/run_distributed.py deleted file mode 100644 index 4511002..0000000 --- a/paddleapex/apex/run_distributed.py +++ /dev/null @@ -1,648 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddlenlp # if you wanna test nlp fusion operations -import argparse -import os -import shutil -import time -import copy -from tqdm import tqdm -import paddle -import paddle.distributed as dist -from paddle import framework -from paddle.base import core -from utils import ( - print_info_log, - gen_api_params, - api_json_read, - check_grad_list, - rand_like, - gen_args, - print_warn_log, -) - -type_map = { - "FP16": paddle.float16, - "FP32": paddle.float32, - "BF16": paddle.bfloat16, -} -Warning_list = [] - -distributed_op = ["paddle.distributed.broadcast_object_list", - "paddle.distributed.barrier", - "paddle.distributed.communication.stream.alltoall_single", - "paddle.distributed.communication.stream.broadcast", - "paddle.distributed.communication.stream.gather", - "paddle.distributed.communication.stream.recv", - "paddle.distributed.communication.stream.reduce", - "paddle.distributed.communication.stream.reduce_scatter", - "paddle.distributed.communication.stream.scatter", - "paddle.distributed.communication.stream.send", - "paddle.distributed.all_gather", - "paddle.distributed.all_gather_object", - "paddle.distributed.all_reduce", - "paddle.distributed.alltoall", - "paddle.distributed.alltoall_single", - "paddle.distributed.broadcast", - "paddle.distributed.communication.stream.all_gather", - "paddle.distributed.communication.stream.all_reduce", - "paddle.distributed.communication.stream.alltoall"] - -current_time = time.strftime("%Y%m%d%H%M%S") - -tqdm_params = { - "smoothing": 0, # 平滑进度条的预计剩余时间,取值范围0到1 - "desc": "Processing", # 进度条前的描述文字 - "leave": True, # 迭代完成后保留进度条的显示 - "ncols": 75, # 进度条的固定宽度 - "mininterval": 0.1, # 更新进度条的最小间隔秒数 - "maxinterval": 1.0, # 更新进度条的最大间隔秒数 - "miniters": 1, # 更新进度条之间的最小迭代次数 - "ascii": None, # 根据环境自动使用ASCII或Unicode字符 - "unit": "it", # 迭代单位 - "unit_scale": True, # 自动根据单位缩放 - "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 - "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 -} -PROFILE_RUN_TIMES = 1 - -def recursive_delete_arg(arg_in): - if isinstance(arg_in, (list, tuple)): - for item in arg_in: - recursive_delete_arg(item) - return - elif isinstance(arg_in, paddle.Tensor): - del arg_in - return -def get_shape(arg_in): - if isinstance(arg_in, (list, tuple)): - res = [] - for item in arg_in: - ret_value = get_shape(item) - res.append(ret_value) - return res - elif isinstance(arg_in, paddle.Tensor): - shape = arg_in.shape - return shape - -def merge_two_lists(lst1, lst2): - merged_list = [] - if lst1 is None and lst2 is not None: - merged_list = lst2 - elif lst1 is not None and lst2 is None: - merged_list = lst1 - elif lst1 is None and lst2 is None: - merged_list = [] - else: - for item in lst1: - if item is None: - continue - else: - merged_list.append(item) - for item in lst2: - if item is None: - continue - else: - merged_list.append(item) - return merged_list - -def convert_out2fp32(arg_in): - flag = False - if isinstance(arg_in, (list, tuple)): - res = [] - for item in arg_in: - ret_flag, ret_value = convert_out2fp32(item) - res.append(ret_value) - flag = flag or ret_flag - return flag, res - elif isinstance(arg_in, paddle.Tensor): - if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": - try: - arg_in = arg_in.cast("float32") - flag = True - except Exception as err: - print(arg_in) - return False, arg_in - return flag, arg_in - - -def recursive_arg_to_cpu(arg_in): - if isinstance(arg_in, (list, tuple)): - res = [] - for item in arg_in: - res.append(recursive_arg_to_cpu(item)) - return res - elif isinstance(arg_in, paddle.Tensor): - arg_in = arg_in.to( - "cpu" - ) # avoid using .cpu(), which will cause the gradient to be lost - return arg_in - - -def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): - if isinstance(arg_in, (list, tuple)): - return type(arg_in)( - recursive_arg_to_device(arg, backend, enforce_dtype) for arg in arg_in - ) - elif isinstance(arg_in, paddle.Tensor): - grad_status = arg_in.stop_gradient - with paddle.no_grad(): - if "gpu" in backend: - arg_in = arg_in.cuda() - if "cpu" in backend: - arg_in = arg_in.cpu() - if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": - arg_in = arg_in.cast("float32") - else: - arg_in = arg_in.to(backend) - if enforce_dtype and arg_in.dtype.name in ["BF16", "BFLOAT16", "FP16", "FP32"]: - arg_in = arg_in.cast(enforce_dtype) - arg_in.stop_gradient = grad_status - return arg_in - else: - return arg_in - - -def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): - if not dist.get_rank() == 0 and "distributed" not in api_call_name: - return - if dtype_name == "": - bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) - fwd_output_dir = os.path.abspath(os.path.join(out_path, "output")) - else: - bwd_output_dir = os.path.abspath( - os.path.join(out_path, dtype_name, "output_backward") - ) - fwd_output_dir = os.path.abspath(os.path.join(out_path, dtype_name, "output")) - fwd_output_path = os.path.join(fwd_output_dir, api_call_name) - bwd_output_path = os.path.join(bwd_output_dir, api_call_name) - os.makedirs(fwd_output_dir, exist_ok=True) - os.makedirs(bwd_output_dir, exist_ok=True) - if isinstance(forward_res, (type(None), list, tuple, paddle.Tensor)): - try: - fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) - paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) - except Exception as err: - msg = "save_forward Error: %s" % str(err) - print_warn_log(msg) - return - else: - print(forward_res) - print_warn_log("forward_res not supported!") - if isinstance(backward_res, (type(None), list, tuple, paddle.Tensor)): - try: - bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) - paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) - except Exception as err: - msg = "save_bacward Error: %s" % str(err) - print_warn_log(msg) - return - else: - print(backward_res) - print_warn_log("bacward_res not supported!") - - -def evoke_related_test_func(test_mode): - func_method = [] - if "acc" in test_mode: - func_method.append(run_acc_case) - if "mem" in test_mode: - func_method.append(run_mem_case) - if "pro" in test_mode: - func_method.append(run_profile_case) - if test_mode == "all": - return [run_acc_case, run_mem_case, run_profile_case] - if len(func_method) == 0: - raise ValueError("test mode is not supported!") - return func_method - - -def ut_case_parsing(forward_content, cfg): - run_case_funcs = evoke_related_test_func(cfg.test_mode) - backend = cfg.backend - out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" - os.mkdir(out_path) if not os.path.exists(out_path) else None - multi_dtype_ut = cfg.multi_dtype_ut.split(",") if cfg.multi_dtype_ut else [] - debug_case = cfg.test_case_name.split(",") if cfg.test_case_name else [] - print("debug_case", debug_case) - debug_mode = False - paddle.set_device(cfg.backend) - if len(debug_case) > 0: - debug_mode = True - enforce_types = [type_map[item] for item in multi_dtype_ut] - for i, (api_call_name, api_info_dict) in enumerate( - tqdm(forward_content.items(), **tqdm_params) - ): - print(api_call_name) - if debug_mode and api_call_name not in debug_case: - continue - if len(multi_dtype_ut) > 0: - for enforce_dtype in enforce_types: - print(api_call_name + "*" + enforce_dtype.name) - args = api_call_name, api_info_dict, backend, out_path - kwargs = {"enforce_dtype": enforce_dtype, "debug_case": debug_case, "real_data_path": cfg.real_data} - for run_case in run_case_funcs: - run_case(*args, **kwargs) - print("*" * 100) - else: - print(api_call_name) - args = api_call_name, api_info_dict, backend, out_path - kwargs = {"enforce_dtype": None, "debug_case": debug_case, "real_data_path": cfg.real_data} - if isinstance(run_case_funcs, list): - for run_case in run_case_funcs: - run_case(*args, **kwargs) - else: - run_case_funcs(*args, **kwargs) - print("*" * 100) - - -def create_input_args(api_info, backend, enforce_dtype=None, real_data_path=None): - print(real_data_path) - args, kwargs, need_backward = gen_api_params(api_info, real_data_path) - device_args = recursive_arg_to_device(args, backend, enforce_dtype) - device_kwargs = { - key: recursive_arg_to_device(value, backend, enforce_dtype) - for key, value in kwargs.items() - } - return device_args, device_kwargs, need_backward - - -def create_dout(dout_info_dict, device_out, backend, enforce_dtype=None, real_data_path=None): - if dout_info_dict[0] != "Failed": - dout, _ = gen_args(dout_info_dict, real_data_path) - else: - print("dout dump json is None!") - dout = rand_like(device_out) - dout = recursive_arg_to_device(dout, backend, enforce_dtype) - return dout - - -def run_forward(api_call_name, device_args, device_kwargs): - api_call_stack = api_call_name.rsplit("*")[0] - try: - # paddle.distributed.barrier() - device_out = eval(api_call_stack)(*device_args, **device_kwargs) - paddle.device.synchronize() - return device_out - - except Exception as err: - msg = f"Run API {api_call_name} Forward Error: %s" % str(err) - print_warn_log(msg) - Warning_list.append(msg) - return None - - -def get_grad_tensor(args, kwargs): - device_grad_out = [] - for arg in args: - if isinstance(arg, paddle.Tensor): - device_grad_out.append(arg.grad) - if isinstance(arg, list): # op: concat/stack - for x in arg: - if isinstance(x, paddle.Tensor): - device_grad_out.append(x.grad) - for k, v in kwargs.items(): - if isinstance(v, paddle.Tensor): - device_grad_out.append(v.grad) - if isinstance(v, list): # op: concat/stack - for x in v: - if isinstance(x, paddle.Tensor): - device_grad_out.append(x.grad) - return device_grad_out - - -def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=None): - if need_backward: - try: - paddle.autograd.backward([device_out], dout) - device_grad_out = get_grad_tensor(args, kwargs) - device_grad_out = check_grad_list(device_grad_out) - if device_grad_out is None: - msg = f"{api_call_name} grad_list is None" - Warning_list.append(msg) - return device_grad_out - except Exception as err: - msg = f"Run API {api_call_name} backward Error: %s" % str(err) - print_warn_log(msg) - Warning_list.append(msg) - return None - else: - msg = f"{api_call_name} has no tensor required grad, SKIP Backward" - print_warn_log(msg) - Warning_list.append(msg) - return None - - -def run_acc_case( - api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None -): - api_info_dict_copy = copy.deepcopy(api_info_dict) - if not dist.get_rank() == 0 and "distributed" not in api_call_name: - real_data_path = None - device_args, device_kwargs, need_backward = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) - print(f"Running {api_call_name} acc test!") - # if api_call_name in debug_case: - # x = [device_args, device_kwargs] - # out_path = os.path.realpath(out_path) if out_path else "./" - # save_pth = os.path.join(out_path, "input_data", api_call_name) - # paddle.save(x, save_pth) - try: - # if "distributed" in api_call_name: - # paddle.distributed.barrier() - device_out = run_forward(api_call_name, device_args, device_kwargs) - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return - - try: - device_grad_out = [] - if api_info_dict["dout_list"][0] != "Failed": - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - device_grad_out = run_backward( - api_call_name, device_out, dout, device_args, device_kwargs, need_backward - ) - else: - if api_call_name.rsplit("*")[0] in distributed_op: - print('this is distributed op: ', api_call_name) - device_out = device_args - device_grad_out = None - except Exception as err: - msg = "Run_backward Error: %s" % str(err) - print_warn_log(msg) - if enforce_dtype: - save_tensor( - device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name - ) - else: - save_tensor(device_out, device_grad_out, out_path, api_call_name) - return - if enforce_dtype: - save_tensor( - device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name - ) - else: - save_tensor(device_out, device_grad_out, out_path, api_call_name) - - # paddle.distributed.barrier() - return - - -def run_profile_case( - api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None -): - print(f"Running {api_call_name} profile test!") - api_info_dict_copy = copy.deepcopy(api_info_dict) - device_args, device_kwargs, need_backward = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) - # if api_call_name in debug_case: - # x = [device_args, device_kwargs] - # out_path = os.path.realpath(out_path) if out_path else "./" - # save_pth = os.path.join(out_path, "input_data", api_call_name) - # paddle.save(x, save_pth) - # device warmming up - try: - device_out = run_forward(api_call_name, device_args, device_kwargs) - if api_info_dict["dout_list"][0] != "Failed": - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - paddle.autograd.backward([device_out], dout) - else: - need_backward = False - except Exception as err: - msg = "Failed in device warming up: %s" % str(err) - print_warn_log(msg) - return - input_shape1 = get_shape(device_args) - input_shape2 = get_shape(device_kwargs) - input_shape_lst = merge_two_lists(input_shape1, input_shape2) - output_shape_lst = get_shape(device_out) - def profile_inner_loop_(): - try: - paddle.device.synchronize() - fwd_start_time = time.time() - for _ in range(PROFILE_RUN_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) - paddle.device.synchronize() - fwd_end_time = time.time() - fwd_time = fwd_end_time - fwd_start_time - fwd_time = fwd_time * 1000000 / float(PROFILE_RUN_TIMES) # fwd_time is in us - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return -1, -1 - try: - if not need_backward: - return fwd_time, -1 - paddle.device.synchronize() - bwd_start_time = time.time() - for _ in range(PROFILE_RUN_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) - paddle.autograd.backward([device_out], dout) - paddle.device.synchronize() - bwd_end_time = time.time() - bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second - bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us - bwd_time = bwd_time - fwd_time - except Exception as err: - msg = "Run_backward Error: %s" % str(err) - print_warn_log(msg) - return fwd_time, -1 - return fwd_time, bwd_time - - try: - fwd_time, bwd_time = profile_inner_loop_() - except Exception as err: - msg = f"Run {api_call_name} profile Error: %s" % str(err) - print_warn_log(msg) - Warning_list.append(msg) - return - - if not enforce_dtype: - log_path = os.path.join(out_path, "profile_analyze.log") - else: - log_path = os.path.join(out_path, enforce_dtype.name, "profile_analyze.log") - - F = open(log_path, "a") - dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" - op_fwd = api_call_name + dtype + ".forward" - op_bwd = api_call_name + dtype + ".backward" - print_info_log(f"{op_fwd}:\t{fwd_time}") - print_info_log(f"{op_bwd}:\t{bwd_time}") - dtype = "\t" if not enforce_dtype else f"\t{enforce_dtype.name}" - msg_fwd = f"{api_call_name}.forward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tforward\t{fwd_time}" - msg_bwd = f"{api_call_name}.backward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tbackward\t{bwd_time}" - - F.write(msg_fwd + "\n") - F.write(msg_bwd + "\n") - F.close() - return - - -def run_mem_case( - api_call_name, - api_info_dict, - backend, - out_path, - enforce_dtype=None, - debug_case=[], # noqa - real_data_path=None -): - print(f"Running {api_call_name} mem test!") - - activation_cost = None - place = framework._current_expected_place_() - device_id = place.get_device_id() - before_run_mem = core.device_memory_stat_current_value("Allocated", device_id) - api_info_dict_copy = copy.deepcopy(api_info_dict) - device_args, device_kwargs, _ = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) - try: - device_out = run_forward(api_call_name, device_args, device_kwargs) - recursive_delete_arg(device_args) - for _, value in device_kwargs.items(): - recursive_delete_arg(value) - _ = recursive_arg_to_cpu(device_out) - after_run_mem = core.device_memory_stat_current_value("Allocated", device_id) - activation_cost = after_run_mem - before_run_mem - - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return - if not enforce_dtype: - log_path = os.path.join(out_path, "memory_analyze.log") - else: - log_path = os.path.join(out_path, enforce_dtype.name, "memory_analyze.log") - - os.mkdir(out_path) if not os.path.exists(out_path) else None - F = open(log_path, "a") - dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" - op_name = api_call_name + dtype + ".forward" - F.write(f"{op_name}:\t{str(activation_cost)}\n") - F.close() - return - - -def arg_parser(parser): - parser.add_argument( - "-json_file", - "--json_file", - dest="json_path", - default="", - type=str, - help="Dump json file path", - required=True, - ) - parser.add_argument( - "-out", - "--dump_path", - dest="out_path", - default="./paddle/", - type=str, - help=" The ut task result out path.", - required=False, - ) - parser.add_argument( - "-backend", - "--backend", - dest="backend", - default="gpu", - type=str, - help=" The running device DEVICE or BENCH.", - required=False, - ) - parser.add_argument( - "-dtype", - "--enforce-dtype", - dest="multi_dtype_ut", - default="", - type=str, - help="", - required=False, - ) - parser.add_argument( - "-real", - "--real_data", - dest="real_data", - default="", - type=str, - help="", - required=False, - ) - parser.add_argument( - "-op", - "--op_name", - dest="test_case_name", - default="", - type=str, - help="debug_op name", - required=False, - ) - parser.add_argument( - "-mode", - "--mode", - dest="test_mode", - default="all", - type=str, - help="debug_op name", - required=False, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - arg_parser(parser) - cfg = parser.parse_args() - print(cfg) - dist.init_parallel_env() - local_rank = dist.get_rank() - # json_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step5/forward_rank" + str(local_rank) + "_all.json" - json_path_list = cfg.json_path.split(' ') - data_path_list = cfg.real_data.split(' ') - - print("json_path_list", json_path_list) - print("data_path_list", data_path_list) - - cfg.json_path = json_path_list[local_rank] - cfg.real_data = data_path_list[local_rank] - cfg.backend = cfg.backend + ":" + str(local_rank) - - print(cfg) - # data_path = "/workspace/APEX/PaddleNLP/dump_info/rank" + str(local_rank) + "_step0/" - # cfg.real_data = None - - forward_content = api_json_read(cfg.json_path) - out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" - if os.path.exists(out_path): - print_warn_log("The output path already exists and the file with the same name will be overwritten.") - out_path = out_path + "/rank_" + str(local_rank) + "/" - if not os.path.exists(out_path): - os.makedirs(out_path, exist_ok=True) - cfg.out_path = out_path - ut_case_parsing(forward_content, cfg) - print_info_log("UT save completed") - warning_log_pth = os.path.join(out_path, "./warning_log.txt") - File = open(warning_log_pth, "w") - for item in Warning_list: - File.write(item + "\n") - File.close() diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 52f638d..2fc695a 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -71,7 +71,9 @@ "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 } -PROFILE_RUN_TIMES = 1 + +PROFILE_WARM_TIMES = 10 +PROFILE_RUN_TIMES = 10 def recursive_delete_arg(arg_in): if isinstance(arg_in, (list, tuple)): @@ -452,59 +454,91 @@ def run_profile_case( out_path = os.path.realpath(out_path) if out_path else "./" save_pth = os.path.join(out_path, "input_data", api_call_name) paddle.save(x, save_pth) - # device warmming up - try: - device_out = run_forward(api_call_name, device_args, device_kwargs) - if api_info_dict["dout_list"][0] != "Failed": - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - paddle.autograd.backward([device_out], dout) - else: - need_backward = False - except Exception as err: - msg = "Failed in device warming up: %s" % str(err) - print_warn_log(msg) - return + + if api_info_dict["dout_list"][0] == "Failed": + need_backward = False input_shape1 = get_shape(device_args) input_shape2 = get_shape(device_kwargs) input_shape_lst = merge_two_lists(input_shape1, input_shape2) - output_shape_lst = get_shape(device_out) + output_shape_lst = [] + model = None def profile_inner_loop_(): try: + if api_call_stack in target_class: + if real_data_path == None: + msg = (f"Running {api_call_name} acc Failed! Don't support run class without real_data_path!") + print_warn_log(msg) + Warning_list.append(msg) + return -1, -1, output_shape_lst + else: + try: + model = create_model(api_call_name, real_data_path) paddle.device.synchronize() - fwd_start_time = time.time() - for _ in range(PROFILE_RUN_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) - paddle.device.synchronize() - fwd_end_time = time.time() + fwd_start_time = 0 + fwd_end_time = 0 + if model is not None: + for _ in range(PROFILE_WARM_TIMES): + device_out = model(*device_args, **device_kwargs) + output_shape_lst = get_shape(device_out) + paddle.device.synchronize() + fwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + device_out = model(*device_args, **device_kwargs) + paddle.device.synchronize() + fwd_end_time = time.time() + else: + for _ in range(PROFILE_WARM_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + output_shape_lst = get_shape(device_out) + paddle.device.synchronize() + fwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + device_out = run_forward(api_call_name, device_args, device_kwargs) + paddle.device.synchronize() + fwd_end_time = time.time() fwd_time = fwd_end_time - fwd_start_time fwd_time = fwd_time * 1000000 / float(PROFILE_RUN_TIMES) # fwd_time is in us except Exception as err: msg = "Run_forward Error: %s" % str(err) print_warn_log(msg) - return -1, -1 + return -1, -1, output_shape_lst try: if not need_backward: - return fwd_time, -1 + return fwd_time, -1, output_shape_lst + bwd_start_time = 0 + bwd_end_time = 0 + dout = create_dout(api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path) + device_out_list = [] paddle.device.synchronize() - bwd_start_time = time.time() - for _ in range(PROFILE_RUN_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) - paddle.autograd.backward([device_out], dout) - paddle.device.synchronize() - bwd_end_time = time.time() + if model is not None: + for _ in range(PROFILE_RUN_TIMES): + device_out_list.append(model(*device_args, **device_kwargs)) + paddle.device.synchronize() + bwd_start_time = time.time() + for i in range(PROFILE_RUN_TIMES): + paddle.autograd.backward([device_out_list[i]], dout) + paddle.device.synchronize() + bwd_end_time = time.time() + else: + for _ in range(PROFILE_RUN_TIMES): + device_out_list.append(run_forward(api_call_name, device_args, device_kwargs)) + paddle.device.synchronize() + bwd_start_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + paddle.autograd.backward([device_out_list[i]], dout) + paddle.device.synchronize() + bwd_end_time = time.time() bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us bwd_time = bwd_time - fwd_time except Exception as err: msg = "Run_backward Error: %s" % str(err) print_warn_log(msg) - return fwd_time, -1 - return fwd_time, bwd_time + return fwd_time, -1, output_shape_lst + return fwd_time, bwd_time, output_shape_lst try: - fwd_time, bwd_time = profile_inner_loop_() + fwd_time, bwd_time, output_shape_lst = profile_inner_loop_() except Exception as err: msg = f"Run {api_call_name} profile Error: %s" % str(err) print_warn_log(msg) @@ -736,17 +770,18 @@ def check_json(json_list): if os.path.exists(out_path): print_warn_log("The output path already exists and the file with the same name will be overwritten.") - if cfg.test_class: + if cfg.distributed_op: dist.init_parallel_env() local_rank = dist.get_rank() - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": cfg.dp_degree, - "mp_degree": cfg.mp_degree, - "pp_degree": cfg.pp_degree, - "sharding_degree": cfg.sharding_degree} - fleet.init(is_collective=True, strategy=strategy) - paddle.set_default_dtype(cfg.class_type) + if cfg.test_class: + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": cfg.dp_degree, + "mp_degree": cfg.mp_degree, + "pp_degree": cfg.pp_degree, + "sharding_degree": cfg.sharding_degree} + fleet.init(is_collective=True, strategy=strategy) + paddle.set_default_dtype(cfg.class_type) json_path_list = cfg.json_path.split(' ') data_path_list = cfg.real_data.split(' ') diff --git a/paddleapex/apex/run_without_distributed.py b/paddleapex/apex/run_without_distributed.py deleted file mode 100644 index 04866bb..0000000 --- a/paddleapex/apex/run_without_distributed.py +++ /dev/null @@ -1,636 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddlenlp # if you wanna test nlp fusion operations -import argparse -import os -import shutil -import time -import copy -from tqdm import tqdm -import paddle -import paddle.distributed as dist -from paddle import framework -from paddle.base import core -from utils import ( - print_info_log, - gen_api_params, - api_json_read, - check_grad_list, - rand_like, - gen_args, - print_warn_log, -) - -type_map = { - "FP16": paddle.float16, - "FP32": paddle.float32, - "BF16": paddle.bfloat16, -} -Warning_list = [] - -distributed_op = ["paddle.distributed.broadcast_object_list", - "paddle.distributed.barrier", - "paddle.distributed.communication.stream.alltoall_single", - "paddle.distributed.communication.stream.broadcast", - "paddle.distributed.communication.stream.gather", - "paddle.distributed.communication.stream.recv", - "paddle.distributed.communication.stream.reduce", - "paddle.distributed.communication.stream.reduce_scatter", - "paddle.distributed.communication.stream.scatter", - "paddle.distributed.communication.stream.send", - "paddle.distributed.all_gather", - "paddle.distributed.all_gather_object", - "paddle.distributed.all_reduce", - "paddle.distributed.alltoall", - "paddle.distributed.alltoall_single", - "paddle.distributed.broadcast", - "paddle.distributed.communication.stream.all_gather", - "paddle.distributed.communication.stream.all_reduce", - "paddle.distributed.communication.stream.alltoall"] - -current_time = time.strftime("%Y%m%d%H%M%S") - -tqdm_params = { - "smoothing": 0, # 平滑进度条的预计剩余时间,取值范围0到1 - "desc": "Processing", # 进度条前的描述文字 - "leave": True, # 迭代完成后保留进度条的显示 - "ncols": 75, # 进度条的固定宽度 - "mininterval": 0.1, # 更新进度条的最小间隔秒数 - "maxinterval": 1.0, # 更新进度条的最大间隔秒数 - "miniters": 1, # 更新进度条之间的最小迭代次数 - "ascii": None, # 根据环境自动使用ASCII或Unicode字符 - "unit": "it", # 迭代单位 - "unit_scale": True, # 自动根据单位缩放 - "dynamic_ncols": True, # 动态调整进度条宽度以适应控制台 - "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 -} - - -PROFILE_RUN_TIMES = 100 -PROFILE_WARM_TIMES = 100 - - -def recursive_delete_arg(arg_in): - if isinstance(arg_in, (list, tuple)): - for item in arg_in: - recursive_delete_arg(item) - return - elif isinstance(arg_in, paddle.Tensor): - del arg_in - return -def get_shape(arg_in): - if isinstance(arg_in, (list, tuple)): - res = [] - for item in arg_in: - ret_value = get_shape(item) - res.append(ret_value) - return res - elif isinstance(arg_in, paddle.Tensor): - shape = arg_in.shape - return shape - -def merge_two_lists(lst1, lst2): - merged_list = [] - if lst1 is None and lst2 is not None: - merged_list = lst2 - elif lst1 is not None and lst2 is None: - merged_list = lst1 - elif lst1 is None and lst2 is None: - merged_list = [] - else: - for item in lst1: - if item is None: - continue - else: - merged_list.append(item) - for item in lst2: - if item is None: - continue - else: - merged_list.append(item) - return merged_list - -def convert_out2fp32(arg_in): - flag = False - if isinstance(arg_in, (list, tuple)): - res = [] - for item in arg_in: - ret_flag, ret_value = convert_out2fp32(item) - res.append(ret_value) - flag = flag or ret_flag - return flag, res - elif isinstance(arg_in, paddle.Tensor): - if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": - try: - arg_in = arg_in.cast("float32") - flag = True - except Exception as err: - print(arg_in) - return False, arg_in - return flag, arg_in - - -def recursive_arg_to_cpu(arg_in): - if isinstance(arg_in, (list, tuple)): - res = [] - for item in arg_in: - res.append(recursive_arg_to_cpu(item)) - return res - elif isinstance(arg_in, paddle.Tensor): - arg_in = arg_in.to( - "cpu" - ) # avoid using .cpu(), which will cause the gradient to be lost - return arg_in - - -def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): - if isinstance(arg_in, (list, tuple)): - return type(arg_in)( - recursive_arg_to_device(arg, backend, enforce_dtype) for arg in arg_in - ) - elif isinstance(arg_in, paddle.Tensor): - grad_status = arg_in.stop_gradient - with paddle.no_grad(): - if "gpu" in backend: - arg_in = arg_in.cuda() - if "cpu" in backend: - arg_in = arg_in.cpu() - if arg_in.dtype.name == "BF16" or arg_in.dtype.name == "BFLOAT16": - arg_in = arg_in.cast("float32") - else: - arg_in = arg_in.to(backend) - if enforce_dtype and arg_in.dtype.name in ["BF16", "BFLOAT16", "FP16", "FP32"]: - arg_in = arg_in.cast(enforce_dtype) - arg_in.stop_gradient = grad_status - return arg_in - else: - return arg_in - - -def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): - if dtype_name == "": - bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) - fwd_output_dir = os.path.abspath(os.path.join(out_path, "output")) - else: - bwd_output_dir = os.path.abspath( - os.path.join(out_path, dtype_name, "output_backward") - ) - fwd_output_dir = os.path.abspath(os.path.join(out_path, dtype_name, "output")) - fwd_output_path = os.path.join(fwd_output_dir, api_call_name) - bwd_output_path = os.path.join(bwd_output_dir, api_call_name) - os.makedirs(fwd_output_dir, exist_ok=True) - os.makedirs(bwd_output_dir, exist_ok=True) - if isinstance(forward_res, (type(None), list, tuple, paddle.Tensor)): - try: - fwd_BF16_flag, forward_res = convert_out2fp32(forward_res) - paddle.save([fwd_BF16_flag, forward_res], fwd_output_path) - except Exception as err: - msg = "save_forward Error: %s" % str(err) - print_warn_log(msg) - return - else: - print(forward_res) - print_warn_log("forward_res not supported!") - if isinstance(backward_res, (type(None), list, tuple, paddle.Tensor)): - try: - bwd_BF16_flag, backward_res = convert_out2fp32(backward_res) - paddle.save([bwd_BF16_flag, backward_res], bwd_output_path) - except Exception as err: - msg = "save_bacward Error: %s" % str(err) - print_warn_log(msg) - return - else: - print(backward_res) - print_warn_log("bacward_res not supported!") - - -def evoke_related_test_func(test_mode): - func_method = [] - if "acc" in test_mode: - func_method.append(run_acc_case) - if "mem" in test_mode: - func_method.append(run_mem_case) - if "pro" in test_mode: - func_method.append(run_profile_case) - if test_mode == "all": - return [run_acc_case, run_mem_case, run_profile_case] - if len(func_method) == 0: - raise ValueError("test mode is not supported!") - return func_method - - -def ut_case_parsing(forward_content, cfg): - run_case_funcs = evoke_related_test_func(cfg.test_mode) - backend = cfg.backend - out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" - os.mkdir(out_path) if not os.path.exists(out_path) else None - multi_dtype_ut = cfg.multi_dtype_ut.split(",") if cfg.multi_dtype_ut else [] - debug_case = cfg.test_case_name.split(",") if cfg.test_case_name else [] - print("debug_case", debug_case) - debug_mode = False - paddle.set_device(cfg.backend) - if len(debug_case) > 0: - debug_mode = True - enforce_types = [type_map[item] for item in multi_dtype_ut] - for i, (api_call_name, api_info_dict) in enumerate( - tqdm(forward_content.items(), **tqdm_params) - ): - if not i % dist.get_world_size() == dist.get_rank(): - continue - print(api_call_name) - if debug_mode and api_call_name not in debug_case: - continue - if len(multi_dtype_ut) > 0: - for enforce_dtype in enforce_types: - print(api_call_name + "*" + enforce_dtype.name) - args = api_call_name, api_info_dict, backend, out_path - kwargs = {"enforce_dtype": enforce_dtype, "debug_case": debug_case, "real_data_path": cfg.real_data} - for run_case in run_case_funcs: - run_case(*args, **kwargs) - print("*" * 100) - else: - print(api_call_name) - args = api_call_name, api_info_dict, backend, out_path - kwargs = {"enforce_dtype": None, "debug_case": debug_case, "real_data_path": cfg.real_data} - if isinstance(run_case_funcs, list): - for run_case in run_case_funcs: - run_case(*args, **kwargs) - else: - run_case_funcs(*args, **kwargs) - print("*" * 100) - - -def create_input_args(api_info, backend, enforce_dtype=None, real_data_path=None): - print(real_data_path) - args, kwargs, need_backward = gen_api_params(api_info, real_data_path) - device_args = recursive_arg_to_device(args, backend, enforce_dtype) - device_kwargs = { - key: recursive_arg_to_device(value, backend, enforce_dtype) - for key, value in kwargs.items() - } - return device_args, device_kwargs, need_backward - - -def create_dout(dout_info_dict, device_out, backend, enforce_dtype=None, real_data_path=None): - if dout_info_dict[0] != "Failed": - dout, _ = gen_args(dout_info_dict, real_data_path) - else: - print("dout dump json is None!") - dout = rand_like(device_out) - dout = recursive_arg_to_device(dout, backend, enforce_dtype) - return dout - - -def run_forward(api_call_name, device_args, device_kwargs): - api_call_stack = api_call_name.rsplit("*")[0] - try: - # paddle.distributed.barrier() - device_out = eval(api_call_stack)(*device_args, **device_kwargs) - paddle.device.synchronize() - return device_out - - except Exception as err: - msg = f"Run API {api_call_name} Forward Error: %s" % str(err) - print_warn_log(msg) - Warning_list.append(msg) - return None - - -def get_grad_tensor(args, kwargs): - device_grad_out = [] - for arg in args: - if isinstance(arg, paddle.Tensor): - device_grad_out.append(arg.grad) - if isinstance(arg, list): # op: concat/stack - for x in arg: - if isinstance(x, paddle.Tensor): - device_grad_out.append(x.grad) - for k, v in kwargs.items(): - if isinstance(v, paddle.Tensor): - device_grad_out.append(v.grad) - if isinstance(v, list): # op: concat/stack - for x in v: - if isinstance(x, paddle.Tensor): - device_grad_out.append(x.grad) - return device_grad_out - - -def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=None): - if need_backward: - try: - paddle.autograd.backward([device_out], dout) - device_grad_out = get_grad_tensor(args, kwargs) - device_grad_out = check_grad_list(device_grad_out) - if device_grad_out is None: - msg = f"{api_call_name} grad_list is None" - Warning_list.append(msg) - return device_grad_out - except Exception as err: - msg = f"Run API {api_call_name} backward Error: %s" % str(err) - print_warn_log(msg) - Warning_list.append(msg) - return None - else: - msg = f"{api_call_name} has no tensor required grad, SKIP Backward" - print_warn_log(msg) - Warning_list.append(msg) - return None - - -def run_acc_case( - api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None -): - api_info_dict_copy = copy.deepcopy(api_info_dict) - # if not dist.get_rank() == 0 and "distributed" not in api_call_name: - # real_data_path = None - device_args, device_kwargs, need_backward = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) - print(f"Running {api_call_name} acc test!") - # if api_call_name in debug_case: - # x = [device_args, device_kwargs] - # out_path = os.path.realpath(out_path) if out_path else "./" - # save_pth = os.path.join(out_path, "input_data", api_call_name) - # paddle.save(x, save_pth) - try: - # if "distributed" in api_call_name: - # paddle.distributed.barrier() - device_out = run_forward(api_call_name, device_args, device_kwargs) - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return - - try: - device_grad_out = [] - if api_info_dict["dout_list"][0] != "Failed": - dout = create_dout( - api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path - ) - device_grad_out = run_backward( - api_call_name, device_out, dout, device_args, device_kwargs, need_backward - ) - else: - if api_call_name.rsplit("*")[0] in distributed_op: - print('this is distributed op: ', api_call_name) - device_out = device_args - device_grad_out = None - except Exception as err: - msg = "Run_backward Error: %s" % str(err) - print_warn_log(msg) - if enforce_dtype: - save_tensor( - device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name - ) - else: - save_tensor(device_out, device_grad_out, out_path, api_call_name) - return - if enforce_dtype: - save_tensor( - device_out, device_grad_out, out_path, api_call_name, enforce_dtype.name - ) - else: - save_tensor(device_out, device_grad_out, out_path, api_call_name) - - # paddle.distributed.barrier() - return - - -def run_profile_case( - api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None -): - print(f"Running {api_call_name} profile test!") - api_info_dict_copy = copy.deepcopy(api_info_dict) - device_args, device_kwargs, need_backward = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) - # if api_call_name in debug_case: - # x = [device_args, device_kwargs] - # out_path = os.path.realpath(out_path) if out_path else "./" - # save_pth = os.path.join(out_path, "input_data", api_call_name) - # paddle.save(x, save_pth) - # device warmming up - if api_info_dict["dout_list"][0] == "Failed": - need_backward = False - input_shape1 = get_shape(device_args) - input_shape2 = get_shape(device_kwargs) - input_shape_lst = merge_two_lists(input_shape1, input_shape2) - output_shape_lst = [] - def profile_inner_loop_(): - try: - paddle.device.synchronize() - for _ in range(PROFILE_WARM_TIMES): - device_out = run_forward(api_call_name, device_args, device_kwargs) - output_shape_lst = get_shape(device_out) - paddle.device.synchronize() - fwd_start_time = time.time() - for _ in range(PROFILE_RUN_TIMES): - run_forward(api_call_name, device_args, device_kwargs) - paddle.device.synchronize() - fwd_end_time = time.time() - fwd_time = fwd_end_time - fwd_start_time - fwd_time = fwd_time * 1000000 / float(PROFILE_RUN_TIMES) # fwd_time is in us - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return -1, -1, output_shape_lst - try: - if not need_backward: - return fwd_time, -1, output_shape_lst - dout = create_dout(api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path) - print("create_dout------") - paddle.device.synchronize() - device_out_list = [] - for _ in range(PROFILE_RUN_TIMES): - device_out_list.append(run_forward(api_call_name, device_args, device_kwargs)) - paddle.device.synchronize() - print("Run_backward------") - bwd_start_time = time.time() - for i in range(PROFILE_RUN_TIMES): - paddle.autograd.backward([device_out_list[i]], dout) - paddle.device.synchronize() - bwd_end_time = time.time() - bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second - bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us - # bwd_time = bwd_time - fwd_time - except Exception as err: - msg = "Run_backward Error: %s" % str(err) - print_warn_log(msg) - return fwd_time, -1, output_shape_lst - return fwd_time, bwd_time, output_shape_lst - - try: - fwd_time, bwd_time, output_shape_lst = profile_inner_loop_() - except Exception as err: - msg = f"Run {api_call_name} profile Error: %s" % str(err) - print_warn_log(msg) - Warning_list.append(msg) - return - - if not enforce_dtype: - log_path = os.path.join(out_path, "profile_analyze" + str(dist.get_rank()) +".log") - else: - log_path = os.path.join(out_path, enforce_dtype.name, "profile_analyze" + str(dist.get_rank()) +".log") - - F = open(log_path, "a") - dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" - op_fwd = api_call_name + dtype + ".forward" - op_bwd = api_call_name + dtype + ".backward" - print_info_log(f"{op_fwd}:\t{fwd_time}") - print_info_log(f"{op_bwd}:\t{bwd_time}") - dtype = "\t" if not enforce_dtype else f"\t{enforce_dtype.name}" - msg_fwd = f"{api_call_name}.forward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tforward\t{fwd_time}" - msg_bwd = f"{api_call_name}.backward\tdtype{dtype}\tinput shape\t{input_shape_lst}\toutput shape\t{output_shape_lst}\tbackward\t{bwd_time}" - - F.write(msg_fwd + "\n") - F.write(msg_bwd + "\n") - F.close() - return - - -def run_mem_case( - api_call_name, - api_info_dict, - backend, - out_path, - enforce_dtype=None, - debug_case=[], # noqa - real_data_path=None -): - print(f"Running {api_call_name} mem test!") - - activation_cost = None - place = framework._current_expected_place_() - device_id = place.get_device_id() - before_run_mem = core.device_memory_stat_current_value("Allocated", device_id) - api_info_dict_copy = copy.deepcopy(api_info_dict) - device_args, device_kwargs, _ = create_input_args( - api_info_dict_copy, backend, enforce_dtype, real_data_path - ) - try: - device_out = run_forward(api_call_name, device_args, device_kwargs) - recursive_delete_arg(device_args) - for _, value in device_kwargs.items(): - recursive_delete_arg(value) - _ = recursive_arg_to_cpu(device_out) - after_run_mem = core.device_memory_stat_current_value("Allocated", device_id) - activation_cost = after_run_mem - before_run_mem - - except Exception as err: - msg = "Run_forward Error: %s" % str(err) - print_warn_log(msg) - return - if not enforce_dtype: - log_path = os.path.join(out_path, "memory_analyze.log") - else: - log_path = os.path.join(out_path, enforce_dtype.name, "memory_analyze.log") - - os.mkdir(out_path) if not os.path.exists(out_path) else None - F = open(log_path, "a") - dtype = "" if not enforce_dtype else f"*{enforce_dtype.name}" - op_name = api_call_name + dtype + ".forward" - F.write(f"{op_name}:\t{str(activation_cost)}\n") - F.close() - return - - -def arg_parser(parser): - parser.add_argument( - "-json_file", - "--json_file", - dest="json_path", - default="", - type=str, - help="Dump json file path", - required=True, - ) - parser.add_argument( - "-out", - "--dump_path", - dest="out_path", - default="./paddle/", - type=str, - help=" The ut task result out path.", - required=False, - ) - parser.add_argument( - "-backend", - "--backend", - dest="backend", - default="gpu", - type=str, - help=" The running device DEVICE or BENCH.", - required=False, - ) - parser.add_argument( - "-dtype", - "--enforce-dtype", - dest="multi_dtype_ut", - default="", - type=str, - help="", - required=False, - ) - parser.add_argument( - "-real", - "--real_data", - dest="real_data", - default="", - type=str, - help="", - required=False, - ) - parser.add_argument( - "-op", - "--op_name", - dest="test_case_name", - default="", - type=str, - help="debug_op name", - required=False, - ) - parser.add_argument( - "-mode", - "--mode", - dest="test_mode", - default="all", - type=str, - help="debug_op name", - required=False, - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - arg_parser(parser) - cfg = parser.parse_args() - print(cfg) - dist.init_parallel_env() - local_rank = dist.get_rank() - cfg.backend = cfg.backend + ":" + str(local_rank) - - forward_content = api_json_read(cfg.json_path) - out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" - if os.path.exists(out_path): - print_warn_log("The output path already exists and the file with the same name will be overwritten.") - cfg.out_path = out_path - ut_case_parsing(forward_content, cfg) - print_info_log("UT save completed") - # warning_log_pth = os.path.join(out_path, "./warning_log.txt") - # File = open(warning_log_pth, "w") - # for item in Warning_list: - # File.write(item + "\n") - # File.close() - paddle.device.synchronize() From 7604ad46f63135b0c2b2b60d84ef045025e8d88d Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Tue, 17 Dec 2024 15:44:28 +0800 Subject: [PATCH 10/22] remove some useless comments --- paddleapex/api_tracer/Dump.py | 1 - paddleapex/api_tracer/api_info.py | 62 ++++++++++---------- paddleapex/api_tracer/wrap_op/OPTemplate.py | 7 --- paddleapex/api_tracer/wrap_op/hijack_tool.py | 1 - 4 files changed, 30 insertions(+), 41 deletions(-) diff --git a/paddleapex/api_tracer/Dump.py b/paddleapex/api_tracer/Dump.py index bd7c45b..d95d716 100644 --- a/paddleapex/api_tracer/Dump.py +++ b/paddleapex/api_tracer/Dump.py @@ -101,7 +101,6 @@ def dump_real_data(self, api_args, tensor, rank): create_directory(remote_repo) self.pool.safe_parellel_save(tensor, file_path, remote_repo) else: - # print("sss----save----") save_tensor(tensor, file_path) return f"{api_args}.pt" diff --git a/paddleapex/api_tracer/api_info.py b/paddleapex/api_tracer/api_info.py index 808884e..4fe59e4 100644 --- a/paddleapex/api_tracer/api_info.py +++ b/paddleapex/api_tracer/api_info.py @@ -191,36 +191,35 @@ def effi_analyze_tensor(self, arg): arg.name = "APEX_" + self.op_name + "_" + str(self.arg_index) single_arg.update({"name": arg.name}) self.arg_index = self.arg_index + 1 - try: - with paddle.no_grad(): - max_ = paddle.max(arg).item() - min_ = paddle.min(arg).item() - except: - max_ = 1 - min_ = 0 - if cfg.dump_unique and arg.dtype.name != "BOOL": - ori_max_ = max_ - ori_min_ = min_ - if math.isinf(ori_max_) or math.isnan(ori_max_): - msg = f"warning, for max_result, where is a inf or nan, need to notice" - print(msg) - if math.isinf(ori_min_) or math.isnan(ori_min_): - msg = f"warning, for min_result, where is a inf or nan, need to notice" - print(msg) - max_ = get_rounded_num(ori_max_, True) - min_ = get_rounded_num(ori_min_, False) if ori_min_ != ori_max_ else max_ - single_arg.update({"Max": max_}) - single_arg.update({"Max_origin": max_}) - single_arg.update({"Min": min_}) - single_arg.update({"Min_origin": min_}) single_arg.update({"stop_gradient": arg.stop_gradient}) - # if self.mode == "real_data" and (dist.get_rank() == 0 or self.is_distributed): if self.mode == "real_data": api_args = self.op_name + "." + str(self.args_num) - # if not exit_tensor: pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank) self.args_num += 1 single_arg.update({"real_data_path": pt_path}) + else: + try: + with paddle.no_grad(): + max_ = paddle.max(arg).item() + min_ = paddle.min(arg).item() + except: + max_ = 1 + min_ = 0 + if cfg.dump_unique and arg.dtype.name != "BOOL": + ori_max_ = max_ + ori_min_ = min_ + if math.isinf(ori_max_) or math.isnan(ori_max_): + msg = f"warning, for max_result, where is a inf or nan, need to notice" + print(msg) + if math.isinf(ori_min_) or math.isnan(ori_min_): + msg = f"warning, for min_result, where is a inf or nan, need to notice" + print(msg) + max_ = get_rounded_num(ori_max_, True) + min_ = get_rounded_num(ori_min_, False) if ori_min_ != ori_max_ else max_ + single_arg.update({"Max": max_}) + single_arg.update({"Max_origin": max_}) + single_arg.update({"Min": min_}) + single_arg.update({"Min_origin": min_}) return single_arg def _analyze_tensor(self, arg): @@ -228,6 +227,13 @@ def _analyze_tensor(self, arg): single_arg.update({"type": "paddle.Tensor"}) single_arg.update({"dtype": str(arg.dtype.name)}) single_arg.update({"shape": arg.shape}) + single_arg.update({"stop_gradient": arg.stop_gradient}) + if self.mode == "real_data": + api_args = self.op_name + "." + str(self.args_num) + pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank) + self.args_num += 1 + single_arg.update({"real_data_path": pt_path}) + return single_arg if arg.dtype.name == "BF16": arg = paddle.cast(arg, "float32") max_handle, max_origin, min_handle, min_origin = get_tensor_extremum(arg) @@ -239,14 +245,6 @@ def _analyze_tensor(self, arg): single_arg.update( {"Min_origin": transfer_types(min_origin, str(arg.dtype.name))} ) - single_arg.update({"stop_gradient": arg.stop_gradient}) - - # if self.mode == "real_data" and (dist.get_rank() == 0 or self.is_distributed): - if self.mode == "real_data": - api_args = self.op_name + "." + str(self.args_num) - pt_path = dump_util.dump_real_data(api_args, arg.detach().cpu(), self.rank) - self.args_num += 1 - single_arg.update({"real_data_path": pt_path}) return single_arg def _analyze_builtin(self, arg): diff --git a/paddleapex/api_tracer/wrap_op/OPTemplate.py b/paddleapex/api_tracer/wrap_op/OPTemplate.py index 82fb171..1616025 100644 --- a/paddleapex/api_tracer/wrap_op/OPTemplate.py +++ b/paddleapex/api_tracer/wrap_op/OPTemplate.py @@ -55,15 +55,12 @@ def save_init_params_and_weight(init_params, state_dict, name, rank): file_path = os.path.join(directory, f"{name}.init_params") with open(file_path, 'wb') as f: pickle.dump(init_params, f) - # paddle.save(init_params, file_path) paddle.save(state_dict, os.path.join(directory, f"{name}.state_dict")) def hijack_call(self, *args, **kwargs): cls = self.__class__ init_params = get_init_params(self) - # print("init_params", init_params) - # print("hijack_call", self.__class__.__name__) cfg.prefix_op_name_ = self.prefix_op_name_ + "*" if self.__class__.__name__ not in cfg.Op_count: cfg.Op_count[self.__class__.__name__] = 1 @@ -78,9 +75,6 @@ def hijack_call(self, *args, **kwargs): api_recorder.update_real_data(args, kwargs) save_init_params_and_weight(init_params, self.state_dict(), cfg.prefix_op_name_, rank) output = self.forward(*args, **kwargs) - # api_recorder.update_output(output) - # print("api_info_struct !!!!!!", api_recorder.api_info_struct) - # print(output) try: if isinstance(output, paddle.Tensor): if not output.stop_gradient: @@ -124,7 +118,6 @@ def forward(self, *args, **kwargs): api_recorder.update_APIInfo(cfg.prefix_op_name_, rank) api_recorder.update_real_data(args, kwargs) output = getattr(HookOp, "wrap_" + str(self.op_name_))(*args, **kwargs) - # api_recorder.update_output(output) try: if isinstance(output, paddle.Tensor): if not output.stop_gradient: diff --git a/paddleapex/api_tracer/wrap_op/hijack_tool.py b/paddleapex/api_tracer/wrap_op/hijack_tool.py index d70fe8e..4360f25 100644 --- a/paddleapex/api_tracer/wrap_op/hijack_tool.py +++ b/paddleapex/api_tracer/wrap_op/hijack_tool.py @@ -43,7 +43,6 @@ def hijack_api(): print(op_name, str(err)) for class_in in target_class: - # print("begin class --------------------------------", class_in) parent_package, class_n = class_in.rsplit(".", maxsplit=1) try: class_name, model = try_import(parent_package) From bb37e2c56bdcf95ef5c59acd1713659537574d30 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Wed, 18 Dec 2024 16:08:22 +0800 Subject: [PATCH 11/22] fix some bugs --- ...c_direct_cmp_zxq.py => acc_direct_paddle.py} | 1 - paddleapex/apex/run_llama20b_xpu.sh | 4 +++- paddleapex/apex/run_paddle.py | 17 +++++++++-------- paddleapex/api_tracer/configs/op_target.yaml | 4 +--- paddleapex/api_tracer/wrap_op/OPTemplate.py | 3 --- paddleapex/api_tracer/wrap_op/get_target_op.py | 4 ++-- 6 files changed, 15 insertions(+), 18 deletions(-) rename paddleapex/apex/{acc_direct_cmp_zxq.py => acc_direct_paddle.py} (99%) diff --git a/paddleapex/apex/acc_direct_cmp_zxq.py b/paddleapex/apex/acc_direct_paddle.py similarity index 99% rename from paddleapex/apex/acc_direct_cmp_zxq.py rename to paddleapex/apex/acc_direct_paddle.py index 42c8765..4e546c6 100644 --- a/paddleapex/apex/acc_direct_cmp_zxq.py +++ b/paddleapex/apex/acc_direct_paddle.py @@ -116,7 +116,6 @@ def compare_device_bench( api_pt_files_all = list(set(api_pt_files_bench + api_pt_files_device)) api_pt_files_all = sorted(api_pt_files_all) - # f = open(out_path + "compare_result.txt", 'a', encoding='utf-8') errors = [] errors_forward_info = [] errors_bacward_info = [] diff --git a/paddleapex/apex/run_llama20b_xpu.sh b/paddleapex/apex/run_llama20b_xpu.sh index 4b1c82a..034b7e4 100644 --- a/paddleapex/apex/run_llama20b_xpu.sh +++ b/paddleapex/apex/run_llama20b_xpu.sh @@ -46,13 +46,15 @@ export LOGITS_PRINT_INTERVAL=1 #python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" acc_direct_cmp_zxq.py --bench /ssd3/zhouxiangquan/llama20b/GPU/ --device /ssd3/zhouxiangquan/llama20b/result/rank_0/ -o /ssd3/zhouxiangquan/llama20b/ #python lot_t.py +#python run_paddle.py -json /ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/test.json -backend xpu -real /ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc + python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_paddle.py \ -json \ "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/forward_rank0_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/forward_rank1_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/forward_rank2_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/forward_rank3_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/forward_rank4_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/forward_rank5_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/forward_rank6_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/forward_rank7_all.json" \ -backend xpu \ -real \ "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/" \ - -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc + -out /ssd3/zhouxiangquan/llama20b/result/ -mode pro -class 1 -dist 1 # #python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 2fc695a..c41734f 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -444,6 +444,7 @@ def run_acc_case( def run_profile_case( api_call_name, api_info_dict, backend, out_path, enforce_dtype=None, debug_case=[], real_data_path=None ): + api_call_stack = api_call_name.rsplit("*")[0] print(f"Running {api_call_name} profile test!") api_info_dict_copy = copy.deepcopy(api_info_dict) device_args, device_kwargs, need_backward = create_input_args( @@ -461,8 +462,8 @@ def run_profile_case( input_shape2 = get_shape(device_kwargs) input_shape_lst = merge_two_lists(input_shape1, input_shape2) output_shape_lst = [] - model = None def profile_inner_loop_(): + is_model = False try: if api_call_stack in target_class: if real_data_path == None: @@ -471,12 +472,12 @@ def profile_inner_loop_(): Warning_list.append(msg) return -1, -1, output_shape_lst else: - try: - model = create_model(api_call_name, real_data_path) + model = create_model(api_call_name, real_data_path) + is_model = True paddle.device.synchronize() fwd_start_time = 0 fwd_end_time = 0 - if model is not None: + if is_model: for _ in range(PROFILE_WARM_TIMES): device_out = model(*device_args, **device_kwargs) output_shape_lst = get_shape(device_out) @@ -510,7 +511,7 @@ def profile_inner_loop_(): dout = create_dout(api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path) device_out_list = [] paddle.device.synchronize() - if model is not None: + if is_model: for _ in range(PROFILE_RUN_TIMES): device_out_list.append(model(*device_args, **device_kwargs)) paddle.device.synchronize() @@ -524,13 +525,13 @@ def profile_inner_loop_(): device_out_list.append(run_forward(api_call_name, device_args, device_kwargs)) paddle.device.synchronize() bwd_start_time = time.time() - for _ in range(PROFILE_RUN_TIMES): + for i in range(PROFILE_RUN_TIMES): paddle.autograd.backward([device_out_list[i]], dout) paddle.device.synchronize() bwd_end_time = time.time() bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us - bwd_time = bwd_time - fwd_time + # bwd_time = bwd_time - fwd_time except Exception as err: msg = "Run_backward Error: %s" % str(err) print_warn_log(msg) @@ -781,7 +782,7 @@ def check_json(json_list): "pp_degree": cfg.pp_degree, "sharding_degree": cfg.sharding_degree} fleet.init(is_collective=True, strategy=strategy) - paddle.set_default_dtype(cfg.class_type) + paddle.set_default_dtype(cfg.class_default_type) json_path_list = cfg.json_path.split(' ') data_path_list = cfg.real_data.split(' ') diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index 6fdf250..ea3bcaf 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -1,5 +1,4 @@ -ignored_op: -#target_class: +target_class: - paddlenlp.transformers.llama.modeling.LlamaMLP - paddlenlp.transformers.llama.modeling.LlamaLMHead - paddlenlp.transformers.llama.modeling.LlamaRMSNorm @@ -20,7 +19,6 @@ ignored_op: - paddlenlp.transformers.llama.modeling.LlamaPretrainingCriterion - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss - paddlenlp.transformers.llama.modeling.LlamaForCausalLM -target_class: - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler ignored_op: - paddle._C_ops.max diff --git a/paddleapex/api_tracer/wrap_op/OPTemplate.py b/paddleapex/api_tracer/wrap_op/OPTemplate.py index 1616025..0ef670e 100644 --- a/paddleapex/api_tracer/wrap_op/OPTemplate.py +++ b/paddleapex/api_tracer/wrap_op/OPTemplate.py @@ -33,14 +33,11 @@ def hijack_init(self, *args, **kwargs): self.__init__(*args, **kwargs) -# 获取初始化参数的方法 def get_init_params(instance): sig = signature(instance.__init__) - # 获取参数名称及默认值 bound_args = sig.bind_partial() bound_args.apply_defaults() - # 提取参数值 init_params = {} for param in sig.parameters.values(): if param.name != 'self': diff --git a/paddleapex/api_tracer/wrap_op/get_target_op.py b/paddleapex/api_tracer/wrap_op/get_target_op.py index 2788e9d..42daf73 100644 --- a/paddleapex/api_tracer/wrap_op/get_target_op.py +++ b/paddleapex/api_tracer/wrap_op/get_target_op.py @@ -26,10 +26,11 @@ def __init__(self, yaml_path): self.target_op = Ops.get("target_op") self.ignored_op = Ops.get("ignored_op") self.target_class = Ops.get("target_class") + self.distributed_op = Ops.get("distributed_op") f.close() if self.ignored_op is None: self.ignored_op = [] - self.api_to_catch = set(self.target_op) - set(self.ignored_op) + self.api_to_catch = set(self.target_op).union(set(self.distributed_op)) - set(self.ignored_op) def check_api_stack(self): for api in self.api_to_catch: @@ -44,7 +45,6 @@ def check_api_stack(self): print(f"For api: {api} ", str(err)) def get_target_ops(self): - self.api_to_catch = set(self.target_op) - set(self.ignored_op) if cfg.profile_mode: self.api_to_catch -= set(["paddle.max", "paddle.min"]) self.check_api_stack() From b19c752bf3442a9c16eadf5499a6b93a3c69d499 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Wed, 18 Dec 2024 16:23:53 +0800 Subject: [PATCH 12/22] remove some debug files --- paddleapex/apex/run_llama10b_xpu.sh | 64 ----------------- paddleapex/apex/run_llama10b_xpu_32k.sh | 95 ------------------------- paddleapex/apex/run_llama10b_xpu_new.sh | 73 ------------------- paddleapex/apex/run_llama20b_xpu_pro.sh | 75 ------------------- paddleapex/apex/split_distributed.py | 76 -------------------- 5 files changed, 383 deletions(-) delete mode 100644 paddleapex/apex/run_llama10b_xpu.sh delete mode 100644 paddleapex/apex/run_llama10b_xpu_32k.sh delete mode 100644 paddleapex/apex/run_llama10b_xpu_new.sh delete mode 100644 paddleapex/apex/run_llama20b_xpu_pro.sh delete mode 100644 paddleapex/apex/split_distributed.py diff --git a/paddleapex/apex/run_llama10b_xpu.sh b/paddleapex/apex/run_llama10b_xpu.sh deleted file mode 100644 index 7db7f5e..0000000 --- a/paddleapex/apex/run_llama10b_xpu.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash -task_name_or_path="llama-10b" -#export XPUAPI_DEBUG=0x1 -#export XPURT_DISPATCH_MODE=PROFILING -export XPU_FORCE_USERMODE_LAUNCH=1 -export PYTHONPATH=$PYTHONPATH:/workspace/APEX/PaddleAPEX:/workspace/APEX/PaddleNLP - -runtime_location=/workspace/so-runtime -bkcl_location=/workspace/so-bkcl -export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:$LD_LIBRARY_PATH - -export XBLAS_FC_HBM_VERSION=40 - -# PaddlePaddle -export FLAGS_use_stride_kernel="0" -# export XPU_CDNN_CLUSTER_PARALLEL=1 -# export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 -export XPU_PADDLE_L3_SIZE0=1024 -export XPU_PADDLE_L3_SIZE1=1024 - -export XPUAPI_DEBUG=0x1 - -# BKCL -# export BKCL_DEBUG=1 -# Multi-computer RDMA -export BKCL_ENABLE_XDR=1 -export BKCL_RDMA_FORCE_TREE=1 -export BKCL_TREE_THRESHOLD=0 -#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 -#export BKCL_SOCKET_IFNAME=eth0 -export BKCL_FORCE_L3_RDMA=0 -export BKCL_USE_AR=1 -export BKCL_RING_OPT=1 -export BKCL_RING_HOSTID_USE_RANK=1 - -echo "bkcl version:" -strings ${bkcl_location}/libbkcl.so | grep COM -master_ip=$POD_0_IP -nnodes=$PADDLE_TRAINERS_NUM -echo "master ip:" -echo $master_ip - -export CUDA_DEVICE_MAX_CONNECTIONS=8 - -timestamp=$(date +%Y%m%d%H%M%S) -echo $timestamp - -PaddleNLP_DIR=$(pwd) -echo "PaddleNLP_DIR: "$PaddleNLP_DIR - -export USING_LAYERNORM=1 -export USING_GQA_NEOX=1 -export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 - -export BKCL_USE_AR=1 -export BKCL_RING_OPT=1 -export BKCL_RING_HOSTID_USE_RANK=1 - -export USING_LOGITS_PRINT=1 -export LOGITS_PRINT_INTERVAL=1 -export XPU_PADDLE_FC_LOCAL_INT16=1 - - -python run_paddle.py -json /workspace/APEX/llama10b/dump_info/rank0_step0/forward_rank0_all.json -backend xpu -out /workspace/APEX/llama10b/ -mode acc diff --git a/paddleapex/apex/run_llama10b_xpu_32k.sh b/paddleapex/apex/run_llama10b_xpu_32k.sh deleted file mode 100644 index 84cf9c1..0000000 --- a/paddleapex/apex/run_llama10b_xpu_32k.sh +++ /dev/null @@ -1,95 +0,0 @@ -#!/bin/bash -task_name_or_path="llama-10b" -#export XPUAPI_DEBUG=0x1 -#export XPURT_DISPATCH_MODE=PROFILING -export XPU_FORCE_USERMODE_LAUNCH=1 -export PYTHONPATH=$PYTHONPATH:/zhouxiangquan/PaddleAPEX:/zhouxiangquan/PaddleNLP - -runtime_location=/workspace/so-runtime -bkcl_location=/workspace/so-bkcl -export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:$LD_LIBRARY_PATH - -export XBLAS_FC_HBM_VERSION=40 - -# PaddlePaddle -export FLAGS_use_stride_kernel="0" -export XPU_CDNN_CLUSTER_PARALLEL=1 -export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 -export XPU_PADDLE_L3_SIZE0=1024 -export XPU_PADDLE_L3_SIZE1=1024 - -# BKCL -# export BKCL_DEBUG=1 -# Multi-computer RDMA -export BKCL_ENABLE_XDR=1 -export BKCL_RDMA_FORCE_TREE=0 -export BKCL_TREE_THRESHOLD=0 -#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 -#export BKCL_SOCKET_IFNAME=eth0 -export BKCL_FORCE_L3_RDMA=0 -export BKCL_USE_AR=1 -export BKCL_RING_OPT=1 -export BKCL_RING_HOSTID_USE_RANK=1 - -echo "bkcl version:" -strings ${bkcl_location}/libbkcl.so | grep COM -master_ip=$POD_0_IP -nnodes=$PADDLE_TRAINERS_NUM -echo "master ip:" -echo $master_ip - -export CUDA_DEVICE_MAX_CONNECTIONS=8 - -timestamp=$(date +%Y%m%d%H%M%S) -echo $timestamp - -PaddleNLP_DIR=$(pwd) -echo "PaddleNLP_DIR: "$PaddleNLP_DIR - -export USING_LAYERNORM=1 -export USING_GQA_NEOX=1 -export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 - -export BKCL_USE_AR=1 -export BKCL_RING_OPT=1 -export BKCL_RING_HOSTID_USE_RANK=1 - -export USING_LOGITS_PRINT=1 -export LOGITS_PRINT_INTERVAL=1 -export XPU_PADDLE_FC_LOCAL_INT16=1 -export CUDA_DEVICE_ORDER=OAM_ID -export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 -export XPU_AUTO_BF16_TF32_RADIO=1 -export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 -export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 - -python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" acc_direct_cmp_zxq.py --bench /ssd3/zhouxiangquan/llama10b/GPU/ --device /ssd3/zhouxiangquan/llama10b/result/rank_0/ -o /ssd3/zhouxiangquan/llama10b/ -python lot_t.py - -#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_class.py \ -# -json \ -# "/zhouxiangquan/llama10b/dump_info/rank0_step1/class.json /zhouxiangquan/llama10b/dump_info/rank1_step1/class.json /zhouxiangquan/llama10b/dump_info/rank2_step1/class.json /zhouxiangquan/llama10b/dump_info/rank3_step1/class.json /zhouxiangquan/llama10b/dump_info/rank4_step1/class.json /zhouxiangquan/llama10b/dump_info/rank5_step1/class.json /zhouxiangquan/llama10b/dump_info/rank6_step1/class.json /zhouxiangquan/llama10b/dump_info/rank7_step1/class.json" \ -# -backend xpu \ -# -real \ -# "/zhouxiangquan/llama10b/dump_info/rank0_step0/ /zhouxiangquan/llama10b/dump_info/rank1_step0/ /zhouxiangquan/llama10b/dump_info/rank2_step0/ /zhouxiangquan/llama10b/dump_info/rank3_step0/ /zhouxiangquan/llama10b/dump_info/rank4_step0/ /zhouxiangquan/llama10b/dump_info/rank5_step0/ /zhouxiangquan/llama10b/dump_info/rank6_step0/ /zhouxiangquan/llama10b/dump_info/rank7_step0/" \ -# -out /zhouxiangquan/llama10b/result/ -mode acc -# -# -#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ -# -json \ -# "/zhouxiangquan/llama10b/dump_info/rank0_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank1_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank2_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank3_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank4_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank5_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank6_step1/distributed.json /zhouxiangquan/llama10b/dump_info/rank7_step1/distributed.json" \ -# -backend xpu \ -# -real \ -# "/zhouxiangquan/llama10b/dump_info/rank0_step0/ /zhouxiangquan/llama10b/dump_info/rank1_step0/ /zhouxiangquan/llama10b/dump_info/rank2_step0/ /zhouxiangquan/llama10b/dump_info/rank3_step0/ /zhouxiangquan/llama10b/dump_info/rank4_step0/ /zhouxiangquan/llama10b/dump_info/rank5_step0/ /zhouxiangquan/llama10b/dump_info/rank6_step0/ /zhouxiangquan/llama10b/dump_info/rank7_step0/" \ -# -out /zhouxiangquan/llama10b/result/ -mode acc -# - -#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_without_distributed.py \ -# -json \ -# "/zhouxiangquan/llama10b/dump_info/rank0_step1/common.json" \ -# -backend xpu \ -# -real \ -# "/zhouxiangquan/llama10b/dump_info/rank0_step0/" \ -# -out /zhouxiangquan/llama10b/result/rank_0/ -mode acc - - diff --git a/paddleapex/apex/run_llama10b_xpu_new.sh b/paddleapex/apex/run_llama10b_xpu_new.sh deleted file mode 100644 index 9bdec5a..0000000 --- a/paddleapex/apex/run_llama10b_xpu_new.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/bin/bash -task_name_or_path="llama-10b" -export XPUAPI_DEBUG=0x1 -#export XPURT_DISPATCH_MODE=PROFILING -export XPU_FORCE_USERMODE_LAUNCH=1 - -runtime_location=/workspace/so-runtime -bkcl_location=/workspace/so-bkcl -export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:$LD_LIBRARY_PATH - -export XBLAS_FC_HBM_VERSION=40 - -# PaddlePaddle -export FLAGS_use_stride_kernel="0" -# export XPU_CDNN_CLUSTER_PARALLEL=1 -# export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 -export XPU_PADDLE_L3_SIZE0=1024 -export XPU_PADDLE_L3_SIZE1=1024 - -# BKCL -# export BKCL_DEBUG=1 -# Multi-computer RDMA -export BKCL_ENABLE_XDR=1 -export BKCL_RDMA_FORCE_TREE=1 -export BKCL_TREE_THRESHOLD=0 -#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 -#export BKCL_SOCKET_IFNAME=eth0 -export BKCL_FORCE_L3_RDMA=0 -export BKCL_USE_AR=1 -export BKCL_RING_OPT=1 -export BKCL_RING_HOSTID_USE_RANK=1 - -echo "bkcl version:" -strings ${bkcl_location}/libbkcl.so | grep COM -master_ip=$POD_0_IP -nnodes=$PADDLE_TRAINERS_NUM -echo "master ip:" -echo $master_ip - -export CUDA_DEVICE_MAX_CONNECTIONS=8 - -timestamp=$(date +%Y%m%d%H%M%S) -echo $timestamp - -PaddleNLP_DIR=$(pwd) -echo "PaddleNLP_DIR: "$PaddleNLP_DIR - -export USING_LAYERNORM=1 -export USING_GQA_NEOX=1 -export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 - -export BKCL_USE_AR=1 -export BKCL_RING_OPT=1 -export BKCL_RING_HOSTID_USE_RANK=1 - -export USING_LOGITS_PRINT=1 -export LOGITS_PRINT_INTERVAL=1 -export XPU_PADDLE_FC_LOCAL_INT16=1 -export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 - -export XPU_AUTO_BF16_TF32_RADIO=1 # 设置比例 0.001, XPU_AUTO_BF16_TF32_RADIO/1000 -export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 -export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 - -export PYTHONPATH=$PYTHONPATH:/workspace/APEX/PaddleNLP:/workspace/AA/PaddleAPEX - -python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_class.py \ - -json \ - "/workspace/APEX/PaddleNLP/dump_info/rank0_step1/forward_rank0_all.json /workspace/APEX/PaddleNLP/dump_info/rank1_step1/forward_rank1_all.json /workspace/APEX/PaddleNLP/dump_info/rank2_step1/forward_rank2_all.json /workspace/APEX/PaddleNLP/dump_info/rank3_step1/forward_rank3_all.json /workspace/APEX/PaddleNLP/dump_info/rank4_step1/forward_rank4_all.json /workspace/APEX/PaddleNLP/dump_info/rank5_step1/forward_rank5_all.json /workspace/APEX/PaddleNLP/dump_info/rank6_step1/forward_rank6_all.json /workspace/APEX/PaddleNLP/dump_info/rank7_step1/forward_rank7_all.json" \ - -backend xpu \ - -real \ - "/workspace/APEX/PaddleNLP/dump_info/rank0_step0/ /workspace/APEX/PaddleNLP/dump_info/rank1_step0/ /workspace/APEX/PaddleNLP/dump_info/rank2_step0/ /workspace/APEX/PaddleNLP/dump_info/rank3_step0/ /workspace/APEX/PaddleNLP/dump_info/rank4_step0/ /workspace/APEX/PaddleNLP/dump_info/rank5_step0/ /workspace/APEX/PaddleNLP/dump_info/rank6_step0/ /workspace/APEX/PaddleNLP/dump_info/rank7_step0/" \ - -out /workspace/APEX/llama10b/result/ -mode acc diff --git a/paddleapex/apex/run_llama20b_xpu_pro.sh b/paddleapex/apex/run_llama20b_xpu_pro.sh deleted file mode 100644 index a1a75e4..0000000 --- a/paddleapex/apex/run_llama20b_xpu_pro.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/bin/bash -task_name_or_path="llama-20b" -#export XPUAPI_DEBUG=0x1 -#export XPURT_DISPATCH_MODE=PROFILING -export XPU_FORCE_USERMODE_LAUNCH=1 -export PYTHONPATH=$PYTHONPATH:/ssd3/zhouxiangquan/PaddleAPEX:/ssd3/zhouxiangquan/PaddleNLP - -export XBLAS_FC_HBM_VERSION=40 - -# PaddlePaddle -export FLAGS_use_stride_kernel="0" -export XPU_CDNN_CLUSTER_PARALLEL=1 -export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 -export XPU_PADDLE_L3_SIZE0=1024 -export XPU_PADDLE_L3_SIZE1=1024 -export XPU_PADDLE_FC_LOCAL_INT16=1 - -# BKCL -# export BKCL_DEBUG=1 -# Multi-computer RDMA -#export BKCL_ENABLE_XDR=1 -export BKCL_RDMA_FORCE_TREE=0 -export BKCL_TREE_THRESHOLD=0 -#export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 -#export BKCL_SOCKET_IFNAME=eth0 -export BKCL_FORCE_L3_RDMA=0 - -export CUDA_DEVICE_MAX_CONNECTIONS=8 -export CUDA_DEVICE_ORDER=OAM_ID -export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 -export XPU_AUTO_BF16_TF32_RADIO=1 -export XPU_AUTO_BF16_TF32=1 # 开启TF32/BF16自动切换 -export XPU_AUTO_BF16_TF32_DEBUG=1 # 开启日志打印 - -timestamp=$(date +%Y%m%d%H%M%S) -echo $timestamp - -PaddleNLP_DIR=$(pwd) - -export USING_LAYERNORM=1 -export USING_GQA_NEOX=1 -export USING_LOGITS_PRINT=1 -export LOGITS_PRINT_INTERVAL=1 - - -#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" acc_direct_cmp_zxq.py --bench /ssd3/zhouxiangquan/llama20b/GPU/ --device /ssd3/zhouxiangquan/llama20b/result/rank_0/ -o /ssd3/zhouxiangquan/llama20b/ -#python lot_t.py - -python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_class.py \ - -json \ - "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/class.json /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/class.json" \ - -backend xpu \ - -real \ - "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/" \ - -out /ssd3/zhouxiangquan/llama20b/result/ -mode pro -# -# -#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ -# -json \ -# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/distributed.json /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/distributed.json" \ -# -backend xpu \ -# -real \ -# "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/" \ -# -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc -# -# -python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_without_distributed.py \ - -json \ - "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/common.json" \ - -backend xpu \ - -real \ - "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/" \ - -out /ssd3/zhouxiangquan/llama20b/result/rank_0/ -mode pro - - diff --git a/paddleapex/apex/split_distributed.py b/paddleapex/apex/split_distributed.py deleted file mode 100644 index 1c0841c..0000000 --- a/paddleapex/apex/split_distributed.py +++ /dev/null @@ -1,76 +0,0 @@ -import json - -def split_json_by_keyword(input_file, outfiles, keywords): - # 读取 JSON 文件 - with open(input_file, 'r', encoding='utf-8') as f: - data = json.load(f) - - # 分别存储包含和不包含关键字的项 - out_list = [] - for i in range(len(keywords)): - out_data = {} - out_list.append(out_data) - - without_keyword = {} - - # 遍历每个项并分类 - for key, value in data.items(): - have_key = False - for keyword, out_data in zip(keywords, out_list): - if keyword in key: - out_data[key] = value - have_key = True - if not have_key: - without_keyword[key]= value - - for i in range(len(keywords)): - output_file_with = outfiles[i] - with_keyword = out_list[i] - with open(output_file_with, 'w', encoding='utf-8') as f_with: - json.dump(with_keyword, f_with, ensure_ascii=False, indent=4) - - output_file_without = outfiles[-1] - with open(output_file_without, 'w', encoding='utf-8') as f_without: - json.dump(without_keyword, f_without, ensure_ascii=False, indent=4) - - print("well done") - -input_json_files = ["/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/forward_rank0_all.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/forward_rank1_all.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/forward_rank2_all.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/forward_rank3_all.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/forward_rank4_all.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/forward_rank5_all.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/forward_rank6_all.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/forward_rank7_all.json"] -distributed_keyword = ["/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/distributed.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/distributed.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/distributed.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/distributed.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/distributed.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/distributed.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/distributed.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/distributed.json"] -model_keyword = ["/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/class.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/class.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/class.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/class.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/class.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/class.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/class.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/class.json"] -common_keyword = ["/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/common.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/common.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/common.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/common.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/common.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/common.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/common.json", - "/ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/common.json"] - - - -keyword = ['distributed', 'model'] -for i in range(len(input_json_files)): - split_json_by_keyword(input_json_files[i], [distributed_keyword[i], model_keyword[i], common_keyword[i]],keyword) - From d00842f2d0f2127492fcb71e3d2727ff1ee5ad08 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Wed, 18 Dec 2024 16:38:12 +0800 Subject: [PATCH 13/22] remove some debug comments --- paddleapex/apex/run_paddle.py | 2 -- paddleapex/api_tracer/Dump.py | 11 ---------- paddleapex/api_tracer/configs/op_target.yaml | 21 ++++++------------- .../api_tracer/configs/tool_config.yaml | 2 -- 4 files changed, 6 insertions(+), 30 deletions(-) diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index c41734f..6ed02f8 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -360,7 +360,6 @@ def create_model(api_call_name, real_data_path): def run_model_forward(model, device_args, device_kwargs): try: - # paddle.distributed.barrier() device_out = model(*device_args, **device_kwargs) paddle.device.synchronize() return device_out @@ -531,7 +530,6 @@ def profile_inner_loop_(): bwd_end_time = time.time() bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us - # bwd_time = bwd_time - fwd_time except Exception as err: msg = "Run_backward Error: %s" % str(err) print_warn_log(msg) diff --git a/paddleapex/api_tracer/Dump.py b/paddleapex/api_tracer/Dump.py index d95d716..826c31a 100644 --- a/paddleapex/api_tracer/Dump.py +++ b/paddleapex/api_tracer/Dump.py @@ -104,17 +104,6 @@ def dump_real_data(self, api_args, tensor, rank): save_tensor(tensor, file_path) return f"{api_args}.pt" - # def dump_model(self, model, model_name): - # directory = os.path.join(self.data_route, f"rank{rank}_step{cfg.global_step}") - # file_path = os.path.join(directory, f"{model_name}") - # create_directory(directory) - # if os.path.exists(file_path): - # os.remove(file_path) - # print( - # f"File {file_path} already exists, tool has overwritten it automatically." - # ) - # paddle.jit.save(layer=model, file_path) - """ Get Api_info dict, update self.dump_api_dict diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index ea3bcaf..0bcbbb6 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -20,6 +20,12 @@ target_class: - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss - paddlenlp.transformers.llama.modeling.LlamaForCausalLM - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler +#fusion_ops: + # - paddlenlp.transformers.llama.fusion_ops.fused_rotary_position_embedding + # - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention + # - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm + # - paddlenlp.transformers.llama.fusion_ops.fusion_rope + # - paddlenlp.transformers.llama.fusion_ops.swiglu ignored_op: - paddle._C_ops.max - paddle._C_ops.min @@ -43,7 +49,6 @@ ignored_op: - paddle.zeros - paddle.zeros_like distributed_op: - # distributed - paddle.distributed.barrier - paddle.distributed.broadcast_object_list - paddle.distributed.communication.stream.alltoall_single @@ -69,10 +74,8 @@ distributed_op: - paddle.distributed.fleet.layers.mpu.mp_ops. target_op: - paddle._C_ops.min -#ignored_op: - paddle._C_ops.min - paddle._C_ops.max -#target_op: - paddle.empty - paddle.empty_like - paddle.reshape @@ -92,7 +95,6 @@ target_op: - paddle.stack - paddle.zeros - paddle.zeros_like -# self-confidence, arrogance - paddle.Tensor.__add__ - paddle.multiply - paddle.multiply_ @@ -102,7 +104,6 @@ target_op: - paddle._C_ops.adamw - paddle._C_ops.adamw_ - paddle._C_ops.layer_norm -#target_op: - paddle.nn.functional.scaled_dot_product_attention - paddle._C_ops.layer_norm #noqa - paddle.nn.functional.adaptive_avg_pool1d @@ -923,10 +924,6 @@ target_op: - paddle.Tensor.vsplit - paddle.Tensor.where - paddle.Tensor.where_ - #### experiment op: - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy - paddle._C_ops.fused_gemm_epilogue - paddle.optimizer.Adam - paddle.optimizer.AdamW @@ -954,10 +951,4 @@ target_op: - paddle.uniform - paddle._C_ops.gaussian - paddle._legacy_C_ops.c_identity - # - paddle.distributed.fleet.layers.mpu.mp_ops. - # - paddlenlp.transformers.llama.fusion_ops.fused_rotary_position_embedding - # - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention - # - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm - # - paddlenlp.transformers.llama.fusion_ops.fusion_rope - # - paddlenlp.transformers.llama.fusion_ops.swiglu diff --git a/paddleapex/api_tracer/configs/tool_config.yaml b/paddleapex/api_tracer/configs/tool_config.yaml index 95df766..e83cedb 100644 --- a/paddleapex/api_tracer/configs/tool_config.yaml +++ b/paddleapex/api_tracer/configs/tool_config.yaml @@ -11,7 +11,6 @@ Async_dump: False # mode must be chosen from ["real_data", "random"] dump_mode: "real_data" -# dump_mode: "random" # acclerate dump process by getting extremum value on device side. # In profile_mode, the speed of dump is 75% of vanilla speed. @@ -21,7 +20,6 @@ profile_mode: True target_step: [0] # Remove duplicate apis from dump_info and keep only one api in the same value range. -# dump_unique: True dump_unique: False # Split dump_info into half-precision operators and other operators when saving json files From ecf0c370a6625988bc60f4d06c87000a1d501d88 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Thu, 19 Dec 2024 15:32:42 +0800 Subject: [PATCH 14/22] support class use CLUSTER_PARALLEL --- paddleapex/apex/run_paddle.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 6ed02f8..2e9bcfc 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -75,6 +75,12 @@ PROFILE_WARM_TIMES = 10 PROFILE_RUN_TIMES = 10 +#strategy = fleet.DistributedStrategy() +#strategy.hybrid_configs = { +# "dp_degree": 1, "mp_degree": 8, "pp_degree": 1,"sharding_degree": 1,} +#fleet.init(is_collective=True, strategy=strategy) +#paddle.set_default_dtype("bfloat16") + def recursive_delete_arg(arg_in): if isinstance(arg_in, (list, tuple)): for item in arg_in: @@ -770,8 +776,6 @@ def check_json(json_list): print_warn_log("The output path already exists and the file with the same name will be overwritten.") if cfg.distributed_op: - dist.init_parallel_env() - local_rank = dist.get_rank() if cfg.test_class: strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { @@ -781,6 +785,13 @@ def check_json(json_list): "sharding_degree": cfg.sharding_degree} fleet.init(is_collective=True, strategy=strategy) paddle.set_default_dtype(cfg.class_default_type) + + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + paddle.distributed.barrier(model_parallel_group) + + dist.init_parallel_env() + local_rank = dist.get_rank() json_path_list = cfg.json_path.split(' ') data_path_list = cfg.real_data.split(' ') From ce310f134e1b00fff68e4a460e3230da05a842ac Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Fri, 20 Dec 2024 15:54:21 +0800 Subject: [PATCH 15/22] support class output is list or tuple --- paddleapex/apex/run_paddle.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 2e9bcfc..0840947 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -319,10 +319,22 @@ def get_grad_tensor(args, kwargs): return device_grad_out +def get_need_grad_out(args): + device_grad_out = [] + if isinstance(args, paddle.Tensor): + device_grad_out.append(args) + if isinstance(args, (list, tuple)): + for x in args: + if isinstance(x, paddle.Tensor) and x.stop_gradient == False: + device_grad_out.append(x) + return device_grad_out + + def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=None): if need_backward: try: - paddle.autograd.backward([device_out], dout) + device_out = get_need_grad_out(device_out) + paddle.autograd.backward(device_out, dout) device_grad_out = get_grad_tensor(args, kwargs) device_grad_out = check_grad_list(device_grad_out) if device_grad_out is None: From 97255d6763e27acb3d4c9c752a0c7705791a4452 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Tue, 24 Dec 2024 14:45:18 +0800 Subject: [PATCH 16/22] support fusion op, and reorder grad --- paddleapex/apex/acc_direct_paddle.py | 22 +- paddleapex/apex/run_llama20b_xpu.sh | 4 +- paddleapex/apex/run_moe_xpu.sh | 46 + paddleapex/apex/run_paddle.py | 131 +- paddleapex/apex/utils/__init__.py | 2 +- paddleapex/apex/utils/data_generate.py | 29 + paddleapex/api_tracer/api_info.py | 65 +- paddleapex/api_tracer/configs/op_target.yaml | 1813 ++++++++--------- paddleapex/api_tracer/wrap_op/OPTemplate.py | 60 +- .../api_tracer/wrap_op/get_target_op.py | 6 + 10 files changed, 1176 insertions(+), 1002 deletions(-) create mode 100644 paddleapex/apex/run_moe_xpu.sh diff --git a/paddleapex/apex/acc_direct_paddle.py b/paddleapex/apex/acc_direct_paddle.py index 4e546c6..f9ebbf8 100644 --- a/paddleapex/apex/acc_direct_paddle.py +++ b/paddleapex/apex/acc_direct_paddle.py @@ -81,10 +81,10 @@ def compare_command(args): details_csv_path = os.path.join(out_path, DETAILS_FILE_NAME) print_info_log(f"Compare task result will be saved in {result_csv_path}") print_info_log(f"Compare task details will be saved in {details_csv_path}") - bench_dir = os.path.join(args.bench_dir, "./output") - device_dir = os.path.join(args.device_dir, "./output") - bench_back_dir = os.path.join(args.bench_dir, "./output_backward") - device_back_dir = os.path.join(args.device_dir, "./output_backward") + bench_dir = os.path.join(args.bench_dir, "./rank_" + str(rank) + "/output") + device_dir = os.path.join(args.device_dir, "./rank_" + str(rank) + "/output") + bench_back_dir = os.path.join(args.bench_dir, "./rank_" + str(rank) + "/output_backward") + device_back_dir = os.path.join(args.device_dir, "./rank_" + str(rank) + "/output_backward") compare_device_bench( result_csv_path, @@ -120,7 +120,7 @@ def compare_device_bench( errors_forward_info = [] errors_bacward_info = [] for i, api_file in enumerate(tqdm.tqdm(api_pt_files_all, **tqdm_params)): - if not i % dist.get_world_size() == dist.get_rank(): + if i < 2700: continue bench_out_tensor, device_out_tensor = None, None bench_grad_tensor_list, device_grad_tensor_list = None, None @@ -142,7 +142,8 @@ def compare_device_bench( Warning_list.append(msg) print(msg) continue - + print(bench_grad_dir) + print(device_grad_dir) if bench_grad_dir and device_grad_dir: bench_grad_path = os.path.join(bench_grad_dir, api_file) device_grad_path = os.path.join(device_grad_dir, api_file) @@ -178,9 +179,9 @@ def compare_device_bench( print(err) errors_bacward_info.sort(key=lambda x: x[1]) errors_forward_info.sort(key=lambda x: x[1]) - df = pd.DataFrame(errors_bacward_info, columns=["operator_name", "error<0.001", "bench_data", "device_data", "diff_value", "diff_index"]) + df = pd.DataFrame(errors_bacward_info, columns=["operator_name", "error<0.001", "bench_data", "device_data", "diff_index"]) df.to_csv("log/rank" + str(dist.get_rank()) + "_backward_output.csv", index=False) - df = pd.DataFrame(errors_forward_info, columns=["operator_name", "error<0.001", "bench_data", "device_data", "diff_value", "diff_index"]) + df = pd.DataFrame(errors_forward_info, columns=["operator_name", "error<0.001", "bench_data", "device_data", "diff_index"]) df.to_csv("log/rank" + str(dist.get_rank()) + "_forward_output.csv", index=False) warning_log_pth = os.path.join(out_path, "./compare_warning.txt") @@ -223,8 +224,9 @@ def compare_result(bench_output, device_output, errors, name): bench_n = paddle.cast(bench_output_o[diff_index], "float").numpy().tolist() device_n = paddle.cast(device_output_o[diff_index], "float").numpy().tolist() diff_index_n = diff_index.numpy().tolist() - diff_value_n = diff_value.numpy().tolist() - errors.append((name, error_info, str(bench_n), str(device_n), str(diff_value_n), str(diff_index_n))) + # diff_value_n = diff_value.numpy().tolist() + # errors.append((name, error_info, str(bench_n), str(device_n), str(diff_value_n), str(diff_index_n))) + errors.append((name, error_info, str(bench_n), str(device_n), str(diff_index_n))) print("diff is too large---------------------------- erorr Erorr ERORR----------------------------") print("bench_output----------") print(bench_output_o[diff_index]) diff --git a/paddleapex/apex/run_llama20b_xpu.sh b/paddleapex/apex/run_llama20b_xpu.sh index 034b7e4..cd24622 100644 --- a/paddleapex/apex/run_llama20b_xpu.sh +++ b/paddleapex/apex/run_llama20b_xpu.sh @@ -48,13 +48,13 @@ export LOGITS_PRINT_INTERVAL=1 #python run_paddle.py -json /ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/test.json -backend xpu -real /ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc -python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_paddle.py \ +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_class.py \ -json \ "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/forward_rank0_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/forward_rank1_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/forward_rank2_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/forward_rank3_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/forward_rank4_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/forward_rank5_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/forward_rank6_all.json /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/forward_rank7_all.json" \ -backend xpu \ -real \ "/ssd3/zhouxiangquan/llama20b/dump_info/rank0_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank1_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank2_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank3_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank4_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank5_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank6_step0/ /ssd3/zhouxiangquan/llama20b/dump_info/rank7_step0/" \ - -out /ssd3/zhouxiangquan/llama20b/result/ -mode pro -class 1 -dist 1 + -out /ssd3/zhouxiangquan/llama20b/result/ -mode acc # -class 1 -dist 1 # #python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_distributed.py \ diff --git a/paddleapex/apex/run_moe_xpu.sh b/paddleapex/apex/run_moe_xpu.sh new file mode 100644 index 0000000..d62ae40 --- /dev/null +++ b/paddleapex/apex/run_moe_xpu.sh @@ -0,0 +1,46 @@ +ask_name_or_path="llama-moe" +export XPU_FORCE_USERMODE_LAUNCH=1 +export PYTHONPATH=$PYTHONPATH:/ssd3/zhouxiangquan/PaddleAPEX:/ssd3/zhouxiangquan/PaddleNLP + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 + +# BKCL +export BKCL_TREE_THRESHOLD=0 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" + +export CUDA_DEVICE_MAX_CONNECTIONS=8 +#export XPUAPI_DEBUG=0x1 +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) + +export XPU_CHECKPOINT_ALLGATHER_OFFLOAD=1 + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 +export XPU_PADDLE_FC_LOCAL_INT16=1 +# --resume_from_checkpoint "/workspace/mnt/moe_workspace/llama-moe-gpu-checkpoint-2" \ + + +#python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" acc_direct_paddle.py --bench /ssd3/zhouxiangquan/moe/GPU/ --device /ssd3/zhouxiangquan/moe/result/ -o /ssd3/zhouxiangquan/moe/ + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_paddle.py \ + -json \ + "/ssd3/zhouxiangquan/moe/dump_info/rank0_step0/forward_rank0_all.json /ssd3/zhouxiangquan/moe/dump_info/rank1_step0/forward_rank1_all.json /ssd3/zhouxiangquan/moe/dump_info/rank2_step0/forward_rank2_all.json /ssd3/zhouxiangquan/moe/dump_info/rank3_step0/forward_rank3_all.json /ssd3/zhouxiangquan/moe/dump_info/rank4_step0/forward_rank4_all.json /ssd3/zhouxiangquan/moe/dump_info/rank5_step0/forward_rank5_all.json /ssd3/zhouxiangquan/moe/dump_info/rank6_step0/forward_rank6_all.json /ssd3/zhouxiangquan/moe/dump_info/rank7_step0/forward_rank7_all.json" \ + -backend xpu \ + -real \ + "/ssd3/zhouxiangquan/moe/dump_info/rank0_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank1_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank2_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank3_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank4_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank5_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank6_step0/ /ssd3/zhouxiangquan/moe/dump_info/rank7_step0/" \ + -out /ssd3/zhouxiangquan/moe/result/ -mode pro -class 1 -dist 1 + diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 2e9bcfc..c37e956 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -21,6 +21,7 @@ import copy import json import yaml +import re from tqdm import tqdm import pickle import paddle @@ -31,6 +32,7 @@ from utils import ( print_info_log, gen_api_params, + create_model, api_json_read, check_grad_list, rand_like, @@ -51,6 +53,14 @@ ignored_op = Ops.get("ignored_op") target_class = Ops.get("target_class") distributed_op = Ops.get("distributed_op") +if target_op is None: + target_op = [] +if ignored_op is None: + ignored_op = [] +if target_class is None: + target_class = [] +if distributed_op is None: + distributed_op = [] f.close() Warning_list = [] @@ -319,10 +329,59 @@ def get_grad_tensor(args, kwargs): return device_grad_out -def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=None): +def get_need_grad_out(args): + device_grad_out = [] + if isinstance(args, paddle.Tensor): + device_grad_out.append(args) + if isinstance(args, (list, tuple)): + for x in args: + if isinstance(x, paddle.Tensor) and x.stop_gradient == False: + device_grad_out.append(x) + return device_grad_out + + +def print_tensor_name(args): + if isinstance(args, paddle.Tensor): + print(args.name) + if isinstance(args, (list, tuple)): + for x in args: + print_tensor_name(x) + + +def get_dout_sequence(dout_info_dict, order): + if isinstance(dout_info_dict, dict): + rel_data_path = dout_info_dict.get("real_data_path") + match = re.search(r'grad_(\d+)\.pt$', rel_data_path) + if match: + order.append(int(match.group(1))) + else: + print("match faile, check it!!!!!!") + elif isinstance(dout_info_dict, (list, tuple)): + for info in dout_info_dict: + get_dout_sequence(info, order) + else: + print("match faile, check it!!!!!!") + + +def reorder_dout(dout_info_dict, dout): + if dout_info_dict[0] == "Failed": + return dout + order = [] + get_dout_sequence(dout_info_dict, order) + ordered_out = [None] * len(dout) + for i in range(len(order)): + ordered_out[order[i]] = dout[i] + return ordered_out + + +def run_backward(dout_info_dict, api_call_name, device_out, dout, args, kwargs, need_backward=None): if need_backward: try: - paddle.autograd.backward([device_out], dout) + device_out = get_need_grad_out(device_out) + #print_tensor_name(device_out) + #print_tensor_name(dout) + dout = reorder_dout(dout_info_dict, dout) + paddle.autograd.backward(device_out, dout) device_grad_out = get_grad_tensor(args, kwargs) device_grad_out = check_grad_list(device_grad_out) if device_grad_out is None: @@ -341,29 +400,6 @@ def run_backward(api_call_name, device_out, dout, args, kwargs, need_backward=No return None -def load_params(filename): - with open(filename, 'rb') as f: - return pickle.load(f) - - -def create_model(api_call_name, real_data_path): - api_call_stack = api_call_name.rsplit("*")[0] - init_path = real_data_path + api_call_name + ".init_params" - state_path = real_data_path + api_call_name + ".state_dict" - init_para = load_params(init_path) - parent_package, class_n = api_call_stack.rsplit(".", maxsplit=1) - try: - MODULE = import_module(parent_package) - class_model = getattr(MODULE, class_n) - model = class_model(**init_para) - model.set_state_dict(paddle.load(state_path)) - return model - except Exception as err: - msg = "Create Model Error: %s" % str(err) - print_warn_log(msg) - return None - - def run_model_forward(model, device_args, device_kwargs): try: device_out = model(*device_args, **device_kwargs) @@ -399,10 +435,10 @@ def run_acc_case( return else: try: - model = create_model(api_call_name, real_data_path) + model = create_model(api_call_name.rsplit("*")[0], real_data_path + api_call_name) device_out = run_model_forward(model, device_args, device_kwargs) except Exception as err: - msg = "Run_forward Error: %s" % str(err) + msg = "Run_class_forward Error: %s" % str(err) print_warn_log(msg) return else: @@ -412,7 +448,7 @@ def run_acc_case( print('this is distributed op: ', api_call_name) device_out = device_args except Exception as err: - msg = "Run_forward Error: %s" % str(err) + msg = "Run_op_forward Error: %s" % str(err) print_warn_log(msg) return @@ -423,7 +459,7 @@ def run_acc_case( api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path ) device_grad_out = run_backward( - api_call_name, device_out, dout, device_args, device_kwargs, need_backward + api_info_dict["dout_list"], api_call_name, device_out, dout, device_args, device_kwargs, need_backward ) else: device_grad_out = None @@ -477,7 +513,7 @@ def profile_inner_loop_(): Warning_list.append(msg) return -1, -1, output_shape_lst else: - model = create_model(api_call_name, real_data_path) + model = create_model(api_call_name.rsplit("*")[0], real_data_path + api_call_name) is_model = True paddle.device.synchronize() fwd_start_time = 0 @@ -514,26 +550,25 @@ def profile_inner_loop_(): bwd_start_time = 0 bwd_end_time = 0 dout = create_dout(api_info_dict["dout_list"], device_out, backend, enforce_dtype, real_data_path) + dout = reorder_dout(api_info_dict["dout_list"], dout) device_out_list = [] paddle.device.synchronize() - if is_model: - for _ in range(PROFILE_RUN_TIMES): - device_out_list.append(model(*device_args, **device_kwargs)) - paddle.device.synchronize() - bwd_start_time = time.time() - for i in range(PROFILE_RUN_TIMES): - paddle.autograd.backward([device_out_list[i]], dout) - paddle.device.synchronize() - bwd_end_time = time.time() - else: - for _ in range(PROFILE_RUN_TIMES): - device_out_list.append(run_forward(api_call_name, device_args, device_kwargs)) - paddle.device.synchronize() - bwd_start_time = time.time() - for i in range(PROFILE_RUN_TIMES): - paddle.autograd.backward([device_out_list[i]], dout) - paddle.device.synchronize() - bwd_end_time = time.time() + for _ in range(PROFILE_RUN_TIMES): + if is_model: + output = model(*device_args, **device_kwargs) + else: + output = run_forward(api_call_name, device_args, device_kwargs) + output = get_need_grad_out(output) + if len(output) == 0: + return fwd_time, -1, output_shape_lst + device_out_list.append(output) + + paddle.device.synchronize() + bwd_start_time = time.time() + for i in range(PROFILE_RUN_TIMES): + paddle.autograd.backward(device_out_list[i], dout) + paddle.device.synchronize() + bwd_end_time = time.time() bwd_time = bwd_end_time - bwd_start_time # bwd_time is in second bwd_time = bwd_time * 1000000 / float(PROFILE_RUN_TIMES) # bwd_time is in us except Exception as err: diff --git a/paddleapex/apex/utils/__init__.py b/paddleapex/apex/utils/__init__.py index ed900ef..fb3d28a 100644 --- a/paddleapex/apex/utils/__init__.py +++ b/paddleapex/apex/utils/__init__.py @@ -29,7 +29,7 @@ seed_all, api_json_read, ) -from .data_generate import gen_api_params, rand_like, gen_args +from .data_generate import gen_api_params, create_model, rand_like, gen_args from .file_check_util import ( FileCheckException, FileChecker, diff --git a/paddleapex/apex/utils/data_generate.py b/paddleapex/apex/utils/data_generate.py index 5c43b85..f66e89f 100644 --- a/paddleapex/apex/utils/data_generate.py +++ b/paddleapex/apex/utils/data_generate.py @@ -19,6 +19,8 @@ import math import random import numpy as np +import pickle +from importlib import import_module from .utils import ( check_object_type, CompareException, @@ -86,6 +88,29 @@ ] +def load_params(filename): + with open(filename, 'rb') as f: + return pickle.load(f) + + +def create_model(api_call_stack, real_data_path): + # api_call_stack = api_call_name.rsplit("*")[0] + init_path = real_data_path + ".init_params" + state_path = real_data_path + ".state_dict" + init_para = load_params(init_path) + parent_package, class_n = api_call_stack.rsplit(".", maxsplit=1) + try: + MODULE = import_module(parent_package) + class_model = getattr(MODULE, class_n) + model = class_model(**init_para) + model.set_state_dict(paddle.load(state_path)) + return model + except Exception as err: + msg = "Create Model Error: %s" % str(err) + print_warn_log(msg) + return None + + def gen_data(info, real_data_path=None): check_object_type(info, dict) data_type = info.get("type") @@ -110,6 +135,10 @@ def gen_data(info, real_data_path=None): data = eval(data_type)(data) except Exception as err: print_error_log("Failed to convert the type to numpy: %s" % str(err)) + elif data_type == 'class': + api_call_stack = info.get("api_call_stack") + data_pth = os.path.join(real_data_path, rel_data_path) + data = create_model(api_call_stack, data_pth) else: data = info.get("value") if info.get("type") == "slice": diff --git a/paddleapex/api_tracer/api_info.py b/paddleapex/api_tracer/api_info.py index 4fe59e4..19dd7ee 100644 --- a/paddleapex/api_tracer/api_info.py +++ b/paddleapex/api_tracer/api_info.py @@ -18,6 +18,9 @@ from paddleapex.api_tracer.Dump import dump_util from paddleapex.api_tracer.config import cfg import paddle.distributed as dist +import pickle +import os +from inspect import signature Paddle_Type_Map = { "FP64": "paddle.float64", @@ -99,12 +102,35 @@ def get_tensor_extremum(data): return max_result, max_result, min_result, min_result +def get_init_params(instance): + sig = signature(instance.__init__) + bound_args = sig.bind_partial() + bound_args.apply_defaults() + + init_params = {} + for param in sig.parameters.values(): + if param.name != 'self': + init_params[param.name] = getattr(instance, param.name, param.default) + + return init_params + + +def save_init_params_and_weight(init_params, state_dict, name, rank): + data_route = cfg.dump_root_path + directory = os.path.join(data_route, f"rank{rank}_step{cfg.global_step}") + file_path = os.path.join(directory, f"{name}.init_params") + with open(file_path, 'wb') as f: + pickle.dump(init_params, f) + paddle.save(state_dict, os.path.join(directory, f"{name}.state_dict")) + + class API: def __init__(self, mode): self.op_name = "" self.rank = "" self.mode = mode self.args_num = 0 + self.hook_num = 0 self.embedding_num = 0 self.output_num = 0 self.dout_list = [] @@ -133,9 +159,11 @@ def update_real_data(self, inputs, kwargs): } dump_util.update_api_dict(self.api_info_struct, self.rank, self.is_half_precision, self.is_distributed) - def update_output(self, outputs): - self.out_list = self.analyze_element(outputs) - self.api_info_struct[self.op_name].update({"out_list": self.dout_list}) + def update_output(self, output): + if isinstance(output, paddle.Tensor): + setattr(tensor, 'description', self.op_name) + # self.out_list = self.analyze_element(outputs) + # self.api_info_struct[self.op_name].update({"out_list": self.dout_list}) def record_dout(self, grad_value): if grad_value is not None: @@ -175,11 +203,34 @@ def analyze_element(self, element): if element is None or isinstance(element, (bool, int, float, str, slice)): return self._analyze_builtin(element) + try: + from paddlenlp.transformers.llama.modeling import LlamaRotaryEmbedding + if type(element) is LlamaRotaryEmbedding: + return self.analyze_class(element, "paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding") + except Exception as e: + print(e) + print("check you environment, and ensure the path of paddlenlp is valid") + print(type(element)) print(element) msg = f"In op:{self.op_name}, its args type {type(element)} is unsupported at analyze_element" print(msg) + + def analyze_class(self, arg, call_stack): + single_arg = {} + single_arg.update({"type": "class"}) + single_arg.update({"dtype": str(type(arg))}) + single_arg.update({"api_call_stack": call_stack}) + if self.mode == "real_data": + api_args = self.op_name + "." + str(self.args_num) + self.args_num += 1 + init_params = get_init_params(arg) + save_init_params_and_weight(init_params, arg.state_dict(), api_args, self.rank) + single_arg.update({"real_data_path": api_args}) + return single_arg + + def effi_analyze_tensor(self, arg): single_arg = {} single_arg.update({"type": "paddle.Tensor"}) @@ -187,10 +238,10 @@ def effi_analyze_tensor(self, arg): single_arg.update({"shape": arg.shape}) arg_name = arg.name exit_tensor = arg_name.startswith("APEX_") - if not exit_tensor: - arg.name = "APEX_" + self.op_name + "_" + str(self.arg_index) - single_arg.update({"name": arg.name}) - self.arg_index = self.arg_index + 1 + # if not exit_tensor: + # arg.name = "APEX_" + self.op_name + "_" + str(self.arg_index) + # single_arg.update({"name": arg.name}) + # self.arg_index = self.arg_index + 1 single_arg.update({"stop_gradient": arg.stop_gradient}) if self.mode == "real_data": api_args = self.op_name + "." + str(self.args_num) diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index 0bcbbb6..eec3189 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -1,31 +1,25 @@ target_class: - paddlenlp.transformers.llama.modeling.LlamaMLP - - paddlenlp.transformers.llama.modeling.LlamaLMHead - - paddlenlp.transformers.llama.modeling.LlamaRMSNorm - - paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding - - paddlenlp.transformers.llama.modeling.LlamaLinearScalingRotaryEmbedding - - paddlenlp.transformers.llama.modeling.LlamaNTKScalingRotaryEmbedding - - paddlenlp.transformers.llama.modeling.LlamaDynamicNTKScalingRotaryEmbedding - - paddlenlp.transformers.llama.modeling.Llama3RotaryEmbedding - - paddlenlp.transformers.llama.modeling.MoEAllToAll - - paddlenlp.transformers.llama.modeling.MoEGateCombine - - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler - - paddlenlp.transformers.llama.modeling.LlamaMoEGate - - paddlenlp.transformers.llama.modeling.LlamaMoEMLP - - paddlenlp.transformers.llama.modeling.LlamaAttention - - paddlenlp.transformers.llama.modeling.LlamaDecoderLayer - - paddlenlp.transformers.llama.modeling.LlamaPretrainedModel - - paddlenlp.transformers.llama.modeling.LlamaModel - - paddlenlp.transformers.llama.modeling.LlamaPretrainingCriterion - - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss - - paddlenlp.transformers.llama.modeling.LlamaForCausalLM - - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler -#fusion_ops: - # - paddlenlp.transformers.llama.fusion_ops.fused_rotary_position_embedding - # - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention - # - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm - # - paddlenlp.transformers.llama.fusion_ops.fusion_rope - # - paddlenlp.transformers.llama.fusion_ops.swiglu + # - paddlenlp.transformers.llama.modeling.LlamaLMHead + # - paddlenlp.transformers.llama.modeling.LlamaRMSNorm + # - paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaLinearScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaNTKScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaDynamicNTKScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.Llama3RotaryEmbedding + # - paddlenlp.transformers.llama.modeling.MoEAllToAll + # - paddlenlp.transformers.llama.modeling.MoEGateCombine + # - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler + # - paddlenlp.transformers.llama.modeling.LlamaMoEGate + # - paddlenlp.transformers.llama.modeling.LlamaMoEMLP + # - paddlenlp.transformers.llama.modeling.LlamaAttention + # - paddlenlp.transformers.llama.modeling.LlamaDecoderLayer + # - paddlenlp.transformers.llama.modeling.LlamaPretrainedModel + # - paddlenlp.transformers.llama.modeling.LlamaPretrainingCriterion + # - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss + # - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler + # - paddlenlp.transformers.llama.modeling.LlamaForCausalLM + # - paddlenlp.transformers.llama.modeling.LlamaModel ignored_op: - paddle._C_ops.max - paddle._C_ops.min @@ -35,7 +29,6 @@ ignored_op: - paddle.reshape_ - paddle.unsqueeze - paddle.unsqueeze_ - - paddle.square_ - paddle.Tensor.squeeze - paddle.Tensor.squeeze_ - paddle.Tensor.unsqueeze @@ -48,30 +41,31 @@ ignored_op: - paddle.stack - paddle.zeros - paddle.zeros_like -distributed_op: - - paddle.distributed.barrier - - paddle.distributed.broadcast_object_list - paddle.distributed.communication.stream.alltoall_single - - paddle.distributed.communication.stream.broadcast - - paddle.distributed.communication.stream.gather - - paddle.distributed.communication.stream.recv - - paddle.distributed.communication.stream.reduce - - paddle.distributed.communication.stream.reduce_scatter - - paddle.distributed.communication.stream.scatter - - paddle.distributed.communication.stream.send - - paddle.distributed.all_gather - - paddle.distributed.all_gather_object - - paddle.distributed.all_reduce - - paddle.distributed.alltoall - - paddle.distributed.alltoall_single - - paddle.distributed.broadcast - - paddle.distributed.communication.stream.all_gather - - paddle.distributed.communication.stream.all_reduce - - paddle.distributed.communication.stream.alltoall - - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity - - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table - - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy - - paddle.distributed.fleet.layers.mpu.mp_ops. +distributed_op: + # - paddle.distributed.communication.stream.alltoall_single + # - paddle.distributed.barrier + # - paddle.distributed.broadcast_object_list + # - paddle.distributed.communication.stream.broadcast + # - paddle.distributed.communication.stream.gather + # - paddle.distributed.communication.stream.recv + # - paddle.distributed.communication.stream.reduce + # - paddle.distributed.communication.stream.reduce_scatter + # - paddle.distributed.communication.stream.scatter + # - paddle.distributed.communication.stream.send + # - paddle.distributed.all_gather + # - paddle.distributed.all_gather_object + # - paddle.distributed.all_reduce + # - paddle.distributed.alltoall + # - paddle.distributed.alltoall_single + # - paddle.distributed.broadcast + # - paddle.distributed.communication.stream.all_gather + # - paddle.distributed.communication.stream.all_reduce + # - paddle.distributed.communication.stream.alltoall + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table + # - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy + # - paddle.distributed.fleet.layers.mpu.mp_ops. target_op: - paddle._C_ops.min - paddle._C_ops.min @@ -82,7 +76,6 @@ target_op: - paddle.reshape_ - paddle.unsqueeze - paddle.unsqueeze_ - - paddle.square_ - paddle.Tensor.squeeze - paddle.Tensor.squeeze_ - paddle.Tensor.unsqueeze @@ -95,860 +88,864 @@ target_op: - paddle.stack - paddle.zeros - paddle.zeros_like - - paddle.Tensor.__add__ - - paddle.multiply - - paddle.multiply_ - - paddle.Tensor.__mul__ - - paddle.Tensor.__neg__ - - paddle.Tensor.add_ - - paddle._C_ops.adamw - - paddle._C_ops.adamw_ - - paddle._C_ops.layer_norm - - paddle.nn.functional.scaled_dot_product_attention - - paddle._C_ops.layer_norm #noqa - - paddle.nn.functional.adaptive_avg_pool1d - - paddle.nn.functional.adaptive_avg_pool2d - - paddle.nn.functional.adaptive_avg_pool3d - - paddle.nn.functional.adaptive_max_pool1d - - paddle.nn.functional.adaptive_max_pool2d - - paddle.nn.functional.adaptive_max_pool3d - - paddle.nn.functional.affine_grid - - paddle.nn.functional.alpha_dropout - - paddle.nn.functional.avg_pool1d - - paddle.nn.functional.avg_pool2d - - paddle.nn.functional.avg_pool3d - - paddle.nn.functional.batch_norm - - paddle.nn.functional.bilinear - - paddle.nn.functional.binary_cross_entropy - - paddle.nn.functional.binary_cross_entropy_with_logits - - paddle.nn.functional.celu - - paddle.nn.functional.channel_shuffle - - paddle.nn.functional.class_center_sample - - paddle.nn.functional.common - - paddle.nn.functional.conv1d - - paddle.nn.functional.conv1d_transpose - - paddle.nn.functional.conv2d - - paddle.nn.functional.conv2d_transpose - - paddle.nn.functional.conv3d - - paddle.nn.functional.conv3d_transpose - - paddle.nn.functional.cosine_embedding_loss - - paddle.nn.functional.cosine_similarity - - paddle.nn.functional.cross_entropy - - paddle.nn.functional.ctc_loss - - paddle.nn.functional.diag_embed - - paddle.nn.functional.dice_loss - - paddle.nn.functional.distance - - paddle.nn.functional.dropout - - paddle.nn.functional.dropout2d - - paddle.nn.functional.dropout3d - - paddle.nn.functional.elu - - paddle.nn.functional.elu_ - - paddle.nn.functional.embedding - - paddle.nn.functional.extension - - paddle.nn.functional.flash_attention - - paddle.nn.functional.flash_attention_with_sparse_mask - - paddle.nn.functional.fractional_max_pool2d - - paddle.nn.functional.fractional_max_pool3d - - paddle.nn.functional.fold - - paddle.nn.functional.gather_tree - - paddle.nn.functional.gaussian_nll_loss - - paddle.nn.functional.gelu - - paddle.nn.functional.glu - - paddle.nn.functional.grid_sample - - paddle.nn.functional.gumbel_softmax - - paddle.nn.functional.hardshrink - - paddle.nn.functional.hardsigmoid - - paddle.nn.functional.hardswish - - paddle.nn.functional.hardtanh - - paddle.nn.functional.hardtanh_ - - paddle.nn.functional.hinge_embedding_loss - - paddle.nn.functional.hsigmoid_loss - - paddle.nn.functional.instance_norm - - paddle.nn.functional.interpolate - - paddle.nn.functional.kl_div - - paddle.nn.functional.l1_loss - - paddle.nn.functional.label_smooth - - paddle.nn.functional.layer_norm - - paddle.nn.functional.leaky_relu - - paddle.nn.functional.leaky_relu_ - - paddle.nn.functional.linear - - paddle.nn.functional.local_response_norm - - paddle.nn.functional.log_loss - - paddle.nn.functional.log_sigmoid - - paddle.nn.functional.log_softmax - - paddle.nn.functional.margin_cross_entropy - - paddle.nn.functional.margin_ranking_loss - - paddle.nn.functional.max_pool1d - - paddle.nn.functional.max_pool2d - - paddle.nn.functional.max_pool3d - - paddle.nn.functional.max_unpool1d - - paddle.nn.functional.max_unpool2d - - paddle.nn.functional.max_unpool3d - - paddle.nn.functional.maxout - - paddle.nn.functional.mish - - paddle.nn.functional.mse_loss - - paddle.nn.functional.multi_label_soft_margin_loss - - paddle.nn.functional.multi_margin_loss - - paddle.nn.functional.nll_loss - - paddle.nn.functional.norm - - paddle.nn.functional.normalize - - paddle.nn.functional.npair_loss - - paddle.nn.functional.one_hot - - paddle.nn.functional.pad - - paddle.nn.functional.pairwise_distance - - paddle.nn.functional.pdist - - paddle.nn.functional.pixel_shuffle - - paddle.nn.functional.pixel_unshuffle - - paddle.nn.functional.poisson_nll_loss - - paddle.nn.functional.pooling - - paddle.nn.functional.prelu - - paddle.nn.functional.relu - - paddle.nn.functional.relu6 - - paddle.nn.functional.relu_ - - paddle.nn.functional.rnnt_loss - - paddle.nn.functional.rrelu - - paddle.nn.functional.sdp_kernel - - paddle.nn.functional.selu - - paddle.nn.functional.sequence_mask - - paddle.nn.functional.sigmoid - - paddle.nn.functional.sigmoid_focal_loss - - paddle.nn.functional.silu - - paddle.nn.functional.smooth_l1_loss - - paddle.nn.functional.soft_margin_loss - - paddle.nn.functional.softmax - - paddle.nn.functional.softmax_ - - paddle.nn.functional.softmax_with_cross_entropy - - paddle.nn.functional.softplus - - paddle.nn.functional.softshrink - - paddle.nn.functional.softsign - - paddle.nn.functional.sparse_attention - - paddle.nn.functional.square_error_cost - - paddle.nn.functional.swish - - paddle.nn.functional.tanh - - paddle.nn.functional.tanh_ - - paddle.nn.functional.tanhshrink - - paddle.nn.functional.temporal_shift - - paddle.nn.functional.thresholded_relu - - paddle.nn.functional.thresholded_relu_ - - paddle.nn.functional.triplet_margin_loss - - paddle.nn.functional.triplet_margin_with_distance_loss - - paddle.nn.functional.unfold - - paddle.nn.functional.upsample - - paddle.nn.functional.zeropad2d - - paddle.abs - - paddle.abs_ - - paddle.acos - - paddle.acos_ - - paddle.acosh - - paddle.acosh_ - - paddle.add - - paddle.add_n - - paddle.addmm - - paddle.addmm_ - - paddle.all - - paddle.allclose - - paddle.amax - - paddle.amin - - paddle.angle - - paddle.any - - paddle.arange - - paddle.argmax - - paddle.argmin - - paddle.argsort - - paddle.as_complex - - paddle.as_real - - paddle.as_strided - - paddle.asin - - paddle.asin_ - - paddle.asinh - - paddle.asinh_ - - paddle.assign - - paddle.atan - - paddle.atan2 - - paddle.atan_ - - paddle.atanh - - paddle.atanh_ - - paddle.atleast_1d - - paddle.atleast_2d - - paddle.atleast_3d - - paddle.bernoulli - - paddle.bincount - - paddle.binomial - - paddle.bitwise_and - - paddle.bitwise_and_ - - paddle.bitwise_not - - paddle.bitwise_not_ - - paddle.bitwise_or - - paddle.bitwise_or_ - - paddle.bitwise_xor - - paddle.bitwise_xor_ - - paddle.bmm - - paddle.broadcast_shape - - paddle.broadcast_tensors - - paddle.broadcast_to - - paddle.cauchy_ - - paddle.cast - - paddle.cdist - - paddle.ceil - - paddle.cholesky - - paddle.chunk - - paddle.clip - - paddle.column_stack - - paddle.combinations - - paddle.concat - - paddle.conj - - paddle.copysign - - paddle.copysign_ - - paddle.cos - - paddle.cos_ - - paddle.cosh - - paddle.cosh_ - - paddle.count_nonzero - - paddle.crop - - paddle.cross - - paddle.cummax - - paddle.cummin - - paddle.cumprod - - paddle.cumprod_ - - paddle.cumsum - - paddle.cumsum_ - - paddle.cumulative_trapezoid - - paddle.decomposition - - paddle.deg2rad - - paddle.diag - - paddle.diag_embed - - paddle.diagflat - - paddle.diagonal - - paddle.diagonal_scatter - - paddle.diff - - paddle.digamma - - paddle.digamma_ - - paddle.divide - - paddle.divide_ - - paddle.dot - - paddle.dsplit - - paddle.dstack - - paddle.eigvalsh - - paddle.einsum - - paddle.equal - - paddle.equal_all - - paddle.erf - - paddle.erf_ - - paddle.erfinv - - paddle.exp - - paddle.expand - - paddle.expand_as - - paddle.expm1 - - paddle.expm1_ - - paddle.eye - - paddle.fft - - paddle.flatten - - paddle.flatten_ - - paddle.flip - - paddle.floor - - paddle.floor_divide - - paddle.floor_divide_ - - paddle.floor_mod - - paddle.floor_mod_ - - paddle.fmax - - paddle.fmin - - paddle.frac - - paddle.frac_ - - paddle.frexp - - paddle.full - - paddle.full_like - - paddle.gather - - paddle.gather_nd - - paddle.gcd - - paddle.gcd_ - - paddle.greater_equal - - paddle.greater_equal_ - - paddle.greater_than - - paddle.greater_than_ - - paddle.heaviside - - paddle.histogram - - paddle.histogramdd - - paddle.hsplit - - paddle.hstack - - paddle.hypot - - paddle.hypot_ - - paddle.i0 - - paddle.i0_ - - paddle.i0e - - paddle.i1 - - paddle.i1e - - paddle.imag - - paddle.increment - - paddle.index_add - - paddle.index_add_ - - paddle.index_fill - - paddle.index_fill_ - - paddle.index_put - - paddle.index_put_ - - paddle.index_sample - - paddle.index_select - - paddle.inner - - paddle.kron - - paddle.kthvalue - - paddle.lcm - - paddle.lcm_ - - paddle.ldexp - - paddle.ldexp_ - - paddle.lerp - - paddle.less_equal - - paddle.less_equal_ - - paddle.less_than - - paddle.less_than_ - - paddle.lgamma - - paddle.lgamma_ - - paddle.linalg - - paddle.linspace - - paddle.log - - paddle.log10 - - paddle.log10_ - - paddle.log1p - - paddle.log1p_ - - paddle.log2 - - paddle.log2_ - - paddle.log_ - - paddle.logaddexp - - paddle.logcumsumexp - - paddle.logical_and - - paddle.logical_and_ - - paddle.logical_not - - paddle.logical_not_ - - paddle.logical_or - - paddle.logical_or_ - - paddle.logical_xor - - paddle.logical_xor_ - - paddle.logit - - paddle.logit_ - - paddle.logspace - - paddle.logsumexp - - paddle.masked_fill - - paddle.masked_fill_ - - paddle.masked_scatter - - paddle.masked_scatter_ - - paddle.masked_select - - paddle.matmul - - paddle.max - - paddle.maximum - - paddle.mean - - paddle.median - - paddle.meshgrid - - paddle.min - - paddle.minimum - - paddle.mm - - paddle.mod - - paddle.mod_ - - paddle.mode - - paddle.moveaxis - - paddle.multigammaln - - paddle.multigammaln_ - - paddle.multinomial - - paddle.multiplex - - paddle.multiply - - paddle.multiply_ - - paddle.mv - - paddle.nan_to_num - - paddle.nan_to_num_ - - paddle.nanmean - - paddle.nanmedian - - paddle.nanquantile - - paddle.nansum - - paddle.neg - - paddle.neg_ - - paddle.nextafter - - paddle.nonzero - - paddle.normal - - paddle.normal_ - - paddle.not_equal - - paddle.not_equal_ - - paddle.numel - - paddle.outer - - paddle.pdist - - paddle.poisson - - paddle.polar - - paddle.polygamma - - paddle.polygamma_ - - paddle.pow - - paddle.pow_ - - paddle.prod - - paddle.put_along_axis - - paddle.quantile - - paddle.rad2deg - - paddle.rand - - paddle.randint - - paddle.randint_like - - paddle.randn - - paddle.randperm - - paddle.reader - - paddle.real - - paddle.reciprocal - - paddle.regularizer - - paddle.remainder - - paddle.remainder_ - - paddle.renorm - - paddle.renorm_ - - paddle.repeat_interleave - - paddle.roll - - paddle.rot90 - - paddle.round - - paddle.row_stack - - paddle.rsqrt - - paddle.scale - - paddle.scatter - - paddle.scatter_ - # - paddle.scatter_nd # cause CUDA_ERROR ignored. - # - paddle.scatter_nd_add - - paddle.searchsorted - - paddle.select_scatter - - paddle.sgn - - paddle.shard_index - - paddle.sign - - paddle.signal - - paddle.signbit - - paddle.sin - - paddle.sin_ - - paddle.sinh - - paddle.sinh_ - - paddle.slice - # - paddle.slice_scatter - - paddle.sort - - paddle.sqrt - - paddle.square - - paddle.standard_gamma - - paddle.standard_normal - - paddle.stanh - - paddle.strided_slice - - paddle.subtract - - paddle.sum - - paddle.t - - paddle.t_ - - paddle.take - - paddle.take_along_axis - - paddle.tan - - paddle.tan_ - - paddle.tanh - - paddle.tanh_ - - paddle.tensordot - - paddle.tile - - paddle.topk - - paddle.trace - - paddle.transpose - - paddle.transpose_ - - paddle.trapezoid - - paddle.tril - - paddle.tril_ - - paddle.tril_indices - - paddle.triu - - paddle.triu_ - - paddle.triu_indices - - paddle.trunc - - paddle.trunc_ - - paddle.unbind - - paddle.unflatten - - paddle.unfold - - paddle.uniform - - paddle.unique - - paddle.unique_consecutive - - paddle.unstack - - paddle.vander - - paddle.var - - paddle.view - - paddle.view_as - - paddle.vsplit - - paddle.where - - paddle.where_ - - paddle.zeros - - paddle.zeros_like - - paddle.Tensor.T - - paddle.Tensor.__add__ - - paddle.Tensor.__and__ - - paddle.Tensor.__radd__ - - paddle.Tensor.__div__ - - paddle.Tensor.__eq__ - - paddle.Tensor.__floordiv__ - - paddle.Tensor.__ge__ - - paddle.Tensor.__gt__ - - paddle.Tensor.__le__ - - paddle.Tensor.__lt__ - - paddle.Tensor.__matmul__ - - paddle.Tensor.__mod__ - - paddle.Tensor.__mul__ - - paddle.Tensor.__ne__ - - paddle.Tensor.__neg__ - - paddle.Tensor.__nonzero__ - - paddle.Tensor.__or__ - - paddle.Tensor.__pow__ - - paddle.Tensor.__radd__ - - paddle.Tensor.__rdiv__ - - paddle.Tensor.__rmul__ - - paddle.Tensor.__rpow__ - - paddle.Tensor.__rsub__ - - paddle.Tensor.__rtruediv__ - - paddle.Tensor.__sub__ - - paddle.Tensor.__truediv__ - - paddle.Tensor.__xor__ - - paddle.Tensor.abs - - paddle.Tensor.abs_ - - paddle.Tensor.acos - - paddle.Tensor.acos_ - - paddle.Tensor.acosh - - paddle.Tensor.acosh_ - - paddle.Tensor.add - - paddle.Tensor.add_ - - paddle.Tensor.add_n - - paddle.Tensor.addmm - - paddle.Tensor.addmm_ - - paddle.Tensor.all - - paddle.Tensor.allclose - - paddle.Tensor.amax - - paddle.Tensor.amin - - paddle.Tensor.angle - - paddle.Tensor.any - - paddle.Tensor.argmax - - paddle.Tensor.argmin - - paddle.Tensor.argsort - - paddle.Tensor.as_complex - - paddle.Tensor.as_real - - paddle.Tensor.as_strided - - paddle.Tensor.asin - - paddle.Tensor.asin_ - - paddle.Tensor.asinh - - paddle.Tensor.asinh_ - - paddle.Tensor.atan - - paddle.Tensor.atan2 - - paddle.Tensor.atan_ - - paddle.Tensor.atanh - - paddle.Tensor.atanh_ - - paddle.Tensor.atleast_1d - - paddle.Tensor.atleast_2d - - paddle.Tensor.atleast_3d - - paddle.Tensor.bincount - - paddle.Tensor.bitwise_and - - paddle.Tensor.bitwise_and_ - - paddle.Tensor.bitwise_not - - paddle.Tensor.bitwise_not_ - - paddle.Tensor.bitwise_or - - paddle.Tensor.bitwise_or_ - - paddle.Tensor.bitwise_xor - - paddle.Tensor.bitwise_xor_ - - paddle.Tensor.bmm - - paddle.Tensor.broadcast_shape - - paddle.Tensor.broadcast_tensors - - paddle.Tensor.broadcast_to - - paddle.Tensor.cauchy_ - - paddle.Tensor.cdist - - paddle.Tensor.ceil - - paddle.Tensor.ceil_ - - paddle.Tensor.cholesky - - paddle.Tensor.cholesky_solve - - paddle.Tensor.clip - - paddle.Tensor.clip_ - - paddle.Tensor.coalesce - - paddle.Tensor.cols - - paddle.Tensor.combinations - - paddle.Tensor.concat - - paddle.Tensor.cond - - paddle.Tensor.conj - - paddle.Tensor.contiguous - - paddle.Tensor.corrcoef - - paddle.Tensor.cos - - paddle.Tensor.cos_ - - paddle.Tensor.cosh - - paddle.Tensor.cosh_ - - paddle.Tensor.count_nonzero - - paddle.Tensor.cov - - paddle.Tensor.cross - - paddle.Tensor.crows - - paddle.Tensor.cummax - - paddle.Tensor.cummin - - paddle.Tensor.cumprod - - paddle.Tensor.cumprod_ - - paddle.Tensor.cumsum - - paddle.Tensor.cumsum_ - - paddle.Tensor.cumulative_trapezoid - - paddle.Tensor.deg2rad - - paddle.Tensor.diag - - paddle.Tensor.diag_embed - - paddle.Tensor.diagflat - - paddle.Tensor.diagonal - - paddle.Tensor.diagonal_scatter - - paddle.Tensor.diff - - paddle.Tensor.digamma - - paddle.Tensor.digamma_ - - paddle.Tensor.divide - - paddle.Tensor.divide_ - - paddle.Tensor.dot - - paddle.Tensor.eig - - paddle.Tensor.eigvals - - paddle.Tensor.eigvalsh - - paddle.Tensor.equal - - paddle.Tensor.equal_all - - paddle.Tensor.erf - - paddle.Tensor.erfinv - - paddle.Tensor.erfinv_ - - paddle.Tensor.exp - - paddle.Tensor.exp_ - - paddle.Tensor.expand - - paddle.Tensor.expand_as - - paddle.Tensor.expm1 - - paddle.Tensor.exponential_ - - paddle.Tensor.fill_ - - paddle.Tensor.fill_diagonal_ - - paddle.Tensor.fill_diagonal_tensor - - paddle.Tensor.fill_diagonal_tensor_ - - paddle.Tensor.flatten - - paddle.Tensor.flatten_ - - paddle.Tensor.flip - - paddle.Tensor.floor - - paddle.Tensor.floor_ - - paddle.Tensor.floor_divide - - paddle.Tensor.floor_divide_ - - paddle.Tensor.floor_mod - - paddle.Tensor.floor_mod_ - - paddle.Tensor.fmax - - paddle.Tensor.fmin - - paddle.Tensor.frac - - paddle.Tensor.frac_ - - paddle.Tensor.frexp - - paddle.Tensor.gather - - paddle.Tensor.gather_nd - - paddle.Tensor.gcd - - paddle.Tensor.gcd_ - - paddle.Tensor.get_selected_rows - - paddle.Tensor.get_strides - - paddle.Tensor.greater_equal - - paddle.Tensor.greater_equal_ - - paddle.Tensor.greater_than - - paddle.Tensor.greater_than_ - - paddle.Tensor.heaviside - - paddle.Tensor.histogram - - paddle.Tensor.histogramdd - - paddle.Tensor.hsplit - - paddle.Tensor.hypot - - paddle.Tensor.hypot_ - - paddle.Tensor.i0 - - paddle.Tensor.i0_ - - paddle.Tensor.i0e - - paddle.Tensor.i1 - - paddle.Tensor.i1e - - paddle.Tensor.imag - - paddle.Tensor.increment - - paddle.Tensor.index_add - - paddle.Tensor.index_add_ - - paddle.Tensor.index_fill - - paddle.Tensor.index_fill_ - - paddle.Tensor.index_put - - paddle.Tensor.index_put_ - - paddle.Tensor.index_sample - - paddle.Tensor.index_select - - paddle.Tensor.inner - - paddle.Tensor.kron - - paddle.Tensor.kthvalue - - paddle.Tensor.layout - - paddle.Tensor.lcm - - paddle.Tensor.lcm_ - - paddle.Tensor.ldexp - - paddle.Tensor.ldexp_ - - paddle.Tensor.lerp - - paddle.Tensor.lerp_ - - paddle.Tensor.less_equal - - paddle.Tensor.less_equal_ - - paddle.Tensor.less_than - - paddle.Tensor.less_than_ - - paddle.Tensor.lgamma - - paddle.Tensor.lgamma_ - - paddle.Tensor.log - - paddle.Tensor.log10 - - paddle.Tensor.log10_ - - paddle.Tensor.log1p - - paddle.Tensor.log1p_ - - paddle.Tensor.log2 - - paddle.Tensor.log2_ - - paddle.Tensor.log_ - - paddle.Tensor.logaddexp - - paddle.Tensor.logcumsumexp - - paddle.Tensor.logical_and - - paddle.Tensor.logical_and_ - - paddle.Tensor.logical_not - - paddle.Tensor.logical_not_ - - paddle.Tensor.logical_or - - paddle.Tensor.logical_or_ - - paddle.Tensor.logical_xor - - paddle.Tensor.logical_xor_ - - paddle.Tensor.logit - - paddle.Tensor.logit_ - - paddle.Tensor.logsumexp - - paddle.Tensor.lstsq - - paddle.Tensor.lu - - paddle.Tensor.lu_unpack - - paddle.Tensor.masked_fill - - paddle.Tensor.masked_fill_ - - paddle.Tensor.masked_select - - paddle.Tensor.masked_scatter - - paddle.Tensor.masked_scatter_ - - paddle.Tensor.matmul - - paddle.Tensor.matrix_power - - paddle.Tensor.max - - paddle.Tensor.maximum - - paddle.Tensor.mean - - paddle.Tensor.median - - paddle.Tensor.min - - paddle.Tensor.minimum - - paddle.Tensor.mm - - paddle.Tensor.mod - - paddle.Tensor.mod_ - - paddle.Tensor.mode - - paddle.Tensor.moveaxis - - paddle.Tensor.multi_dot - - paddle.Tensor.multigammaln - - paddle.Tensor.multigammaln_ - - paddle.Tensor.multinomial - - paddle.Tensor.multiplex - - paddle.Tensor.multiply - - paddle.Tensor.multiply_ - - paddle.Tensor.mv - - paddle.Tensor.nan_to_num - - paddle.Tensor.nan_to_num_ - - paddle.Tensor.nanmean - - paddle.Tensor.nanmedian - - paddle.Tensor.nanquantile - - paddle.Tensor.nansum - - paddle.Tensor.ndimension - - paddle.Tensor.neg - - paddle.Tensor.neg_ - - paddle.Tensor.nnz - - paddle.Tensor.nonzero - - paddle.Tensor.norm - - paddle.Tensor.normal_ - - paddle.Tensor.not_equal - - paddle.Tensor.not_equal_ - - paddle.Tensor.numel - - paddle.Tensor.offset - - paddle.Tensor.outer - - paddle.Tensor.pca_lowrank - - paddle.Tensor.pinv - - paddle.Tensor.polar - - paddle.Tensor.polygamma - - paddle.Tensor.polygamma_ - - paddle.Tensor.pow - - paddle.Tensor.pow_ - - paddle.Tensor.process_mesh - - paddle.Tensor.prod - - paddle.Tensor.put_along_axis - - paddle.Tensor.put_along_axis_ - - paddle.Tensor.qr - - paddle.Tensor.quantile - - paddle.Tensor.rad2deg - - paddle.Tensor.remainder - - paddle.Tensor.remainder_ - - paddle.Tensor.renorm - - paddle.Tensor.renorm_ - - paddle.Tensor.repeat_interleave - - paddle.Tensor.reverse - - paddle.Tensor.roll - - paddle.Tensor.rot90 - - paddle.Tensor.round - - paddle.Tensor.round_ - - paddle.Tensor.rows - - paddle.Tensor.rsqrt - - paddle.Tensor.rsqrt_ - - paddle.Tensor.scale - - paddle.Tensor.scale_ - - paddle.Tensor.scatter - - paddle.Tensor.scatter_ - - paddle.Tensor.scatter_nd - - paddle.Tensor.scatter_nd_add - - paddle.Tensor.select_scatter - - paddle.Tensor.sgn - - paddle.Tensor.shard_index - - paddle.Tensor.sigmoid - - paddle.Tensor.sigmoid_ - - paddle.Tensor.sign - - paddle.Tensor.sin - - paddle.Tensor.sin_ - - paddle.Tensor.sinh - - paddle.Tensor.sinh_ - - paddle.Tensor.size - - paddle.Tensor.slice - - paddle.Tensor.solve - - paddle.Tensor.sort - - paddle.Tensor.split - - paddle.Tensor.sqrt - - paddle.Tensor.sqrt_ - - paddle.Tensor.square - - paddle.Tensor.stack - - paddle.Tensor.stanh - - paddle.Tensor.std - - paddle.Tensor.stft - - paddle.Tensor.strided_slice - - paddle.Tensor.strides - - paddle.Tensor.subtract - - paddle.Tensor.subtract_ - - paddle.Tensor.sum - - paddle.Tensor.t - - paddle.Tensor.t_ - - paddle.Tensor.take - - paddle.Tensor.take_along_axis - - paddle.Tensor.tan - - paddle.Tensor.tan_ - - paddle.Tensor.tanh - - paddle.Tensor.tanh_ - - paddle.Tensor.tensordot - - paddle.Tensor.tile - - paddle.Tensor.top_p_sampling - - paddle.Tensor.topk - - paddle.Tensor.trace - - paddle.Tensor.transpose - - paddle.Tensor.transpose_ - - paddle.Tensor.trapezoid - - paddle.Tensor.tril - - paddle.Tensor.tril_ - - paddle.Tensor.triu - - paddle.Tensor.triu_ - - paddle.Tensor.trunc - - paddle.Tensor.trunc_ - - paddle.Tensor.unbind - - paddle.Tensor.unflatten - - paddle.Tensor.unfold - - paddle.Tensor.uniform_ - - paddle.Tensor.unique - - paddle.Tensor.unique_consecutive - - paddle.Tensor.unstack - - paddle.Tensor.vander - - paddle.Tensor.var - - paddle.Tensor.view - - paddle.Tensor.view_as - - paddle.Tensor.vsplit - - paddle.Tensor.where - - paddle.Tensor.where_ - - paddle._C_ops.fused_gemm_epilogue - - paddle.optimizer.Adam - - paddle.optimizer.AdamW - - paddle._C_ops.adamw - - paddle._C_ops.adamw_ - - paddle._legacy_C_ops.fused_gemm_epilogue - - paddle.incubate.nn.functional.fused_multi_head_attention - - paddle.incubate.nn.functional.fused_feedforward - - paddle.incubate.nn.functional.fused_multi_transformer - - paddle.incubate.nn.functional.fused_linear - - paddle.incubate.nn.functional.fused_linear_activation - - paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm - - paddle.incubate.nn.functional.fused_ec_moe - - paddle.incubate.nn.functional.fused_dropout_add - - paddle.incubate.nn.functional.fused_rotary_position_embedding - - paddle.incubate.nn.functional.variable_length_memory_efficient_attention - - paddle.incubate.nn.functional.fused_rms_norm - - paddle.incubate.nn.functional.fused_layer_norm - - paddle.incubate.nn.functional.masked_multihead_attention - - paddle.incubate.nn.functional.block_multihead_attention - - paddle.incubate.nn.functional.swiglu - - paddle.incubate.nn.functional.fused_matmul_bias - - paddle.tensor.fill_constant - - paddle.nn.clip._squared_l2_norm - - paddle.uniform - - paddle._C_ops.gaussian - - paddle._legacy_C_ops.c_identity - + # - paddle.Tensor.__add__ + # - paddle._C_ops.layer_norm + # - paddle.multiply + # - paddle.multiply_ + # - paddle.Tensor.__mul__ + # - paddle.Tensor.__neg__ + # - paddle.Tensor.add_ + # - paddle._C_ops.adamw + # - paddle._C_ops.adamw_ + # - paddle.square_ + # - paddle.nn.functional.scaled_dot_product_attention + # - paddle._C_ops.layer_norm #noqa + # - paddle.nn.functional.adaptive_avg_pool1d + # - paddle.nn.functional.adaptive_avg_pool2d + # - paddle.nn.functional.adaptive_avg_pool3d + # - paddle.nn.functional.adaptive_max_pool1d + # - paddle.nn.functional.adaptive_max_pool2d + # - paddle.nn.functional.adaptive_max_pool3d + # - paddle.nn.functional.affine_grid + # - paddle.nn.functional.alpha_dropout + # - paddle.nn.functional.avg_pool1d + # - paddle.nn.functional.avg_pool2d + # - paddle.nn.functional.avg_pool3d + # - paddle.nn.functional.batch_norm + # - paddle.nn.functional.bilinear + # - paddle.nn.functional.binary_cross_entropy + # - paddle.nn.functional.binary_cross_entropy_with_logits + # - paddle.nn.functional.celu + # - paddle.nn.functional.channel_shuffle + # - paddle.nn.functional.class_center_sample + # - paddle.nn.functional.common + # - paddle.nn.functional.conv1d + # - paddle.nn.functional.conv1d_transpose + # - paddle.nn.functional.conv2d + # - paddle.nn.functional.conv2d_transpose + # - paddle.nn.functional.conv3d + # - paddle.nn.functional.conv3d_transpose + # - paddle.nn.functional.cosine_embedding_loss + # - paddle.nn.functional.cosine_similarity + # - paddle.nn.functional.cross_entropy + # - paddle.nn.functional.ctc_loss + # - paddle.nn.functional.diag_embed + # - paddle.nn.functional.dice_loss + # - paddle.nn.functional.distance + # - paddle.nn.functional.dropout + # - paddle.nn.functional.dropout2d + # - paddle.nn.functional.dropout3d + # - paddle.nn.functional.elu + # - paddle.nn.functional.elu_ + # - paddle.nn.functional.embedding + # - paddle.nn.functional.extension + # - paddle.nn.functional.flash_attention + # - paddle.nn.functional.flash_attention_with_sparse_mask + # - paddle.nn.functional.fractional_max_pool2d + # - paddle.nn.functional.fractional_max_pool3d + # - paddle.nn.functional.fold + # - paddle.nn.functional.gather_tree + # - paddle.nn.functional.gaussian_nll_loss + # - paddle.nn.functional.gelu + # - paddle.nn.functional.glu + # - paddle.nn.functional.grid_sample + # - paddle.nn.functional.gumbel_softmax + # - paddle.nn.functional.hardshrink + # - paddle.nn.functional.hardsigmoid + # - paddle.nn.functional.hardswish + # - paddle.nn.functional.hardtanh + # - paddle.nn.functional.hardtanh_ + # - paddle.nn.functional.hinge_embedding_loss + # - paddle.nn.functional.hsigmoid_loss + # - paddle.nn.functional.instance_norm + # - paddle.nn.functional.interpolate + # - paddle.nn.functional.kl_div + # - paddle.nn.functional.l1_loss + # - paddle.nn.functional.label_smooth + # - paddle.nn.functional.layer_norm + # - paddle.nn.functional.leaky_relu + # - paddle.nn.functional.leaky_relu_ + # - paddle.nn.functional.linear + # - paddle.nn.functional.local_response_norm + # - paddle.nn.functional.log_loss + # - paddle.nn.functional.log_sigmoid + # - paddle.nn.functional.log_softmax + # - paddle.nn.functional.margin_cross_entropy + # - paddle.nn.functional.margin_ranking_loss + # - paddle.nn.functional.max_pool1d + # - paddle.nn.functional.max_pool2d + # - paddle.nn.functional.max_pool3d + # - paddle.nn.functional.max_unpool1d + # - paddle.nn.functional.max_unpool2d + # - paddle.nn.functional.max_unpool3d + # - paddle.nn.functional.maxout + # - paddle.nn.functional.mish + # - paddle.nn.functional.mse_loss + # - paddle.nn.functional.multi_label_soft_margin_loss + # - paddle.nn.functional.multi_margin_loss + # - paddle.nn.functional.nll_loss + # - paddle.nn.functional.norm + # - paddle.nn.functional.normalize + # - paddle.nn.functional.npair_loss + # - paddle.nn.functional.one_hot + # - paddle.nn.functional.pad + # - paddle.nn.functional.pairwise_distance + # - paddle.nn.functional.pdist + # - paddle.nn.functional.pixel_shuffle + # - paddle.nn.functional.pixel_unshuffle + # - paddle.nn.functional.poisson_nll_loss + # - paddle.nn.functional.pooling + # - paddle.nn.functional.prelu + # - paddle.nn.functional.relu + # - paddle.nn.functional.relu6 + # - paddle.nn.functional.relu_ + # - paddle.nn.functional.rnnt_loss + # - paddle.nn.functional.rrelu + # - paddle.nn.functional.sdp_kernel + # - paddle.nn.functional.selu + # - paddle.nn.functional.sequence_mask + # - paddle.nn.functional.sigmoid + # - paddle.nn.functional.sigmoid_focal_loss + # - paddle.nn.functional.silu + # - paddle.nn.functional.smooth_l1_loss + # - paddle.nn.functional.soft_margin_loss + # - paddle.nn.functional.softmax + # - paddle.nn.functional.softmax_ + # - paddle.nn.functional.softmax_with_cross_entropy + # - paddle.nn.functional.softplus + # - paddle.nn.functional.softshrink + # - paddle.nn.functional.softsign + # - paddle.nn.functional.sparse_attention + # - paddle.nn.functional.square_error_cost + # - paddle.nn.functional.swish + # - paddle.nn.functional.tanh + # - paddle.nn.functional.tanh_ + # - paddle.nn.functional.tanhshrink + # - paddle.nn.functional.temporal_shift + # - paddle.nn.functional.thresholded_relu + # - paddle.nn.functional.thresholded_relu_ + # - paddle.nn.functional.triplet_margin_loss + # - paddle.nn.functional.triplet_margin_with_distance_loss + # - paddle.nn.functional.unfold + # - paddle.nn.functional.upsample + # - paddle.nn.functional.zeropad2d + # - paddle.abs + # - paddle.abs_ + # - paddle.acos + # - paddle.acos_ + # - paddle.acosh + # - paddle.acosh_ + # - paddle.add + # - paddle.add_n + # - paddle.addmm + # - paddle.addmm_ + # - paddle.all + # - paddle.allclose + # - paddle.amax + # - paddle.amin + # - paddle.angle + # - paddle.any + # - paddle.arange + # - paddle.argmax + # - paddle.argmin + # - paddle.argsort + # - paddle.as_complex + # - paddle.as_real + # - paddle.as_strided + # - paddle.asin + # - paddle.asin_ + # - paddle.asinh + # - paddle.asinh_ + # - paddle.assign + # - paddle.atan + # - paddle.atan2 + # - paddle.atan_ + # - paddle.atanh + # - paddle.atanh_ + # - paddle.atleast_1d + # - paddle.atleast_2d + # - paddle.atleast_3d + # - paddle.bernoulli + # - paddle.bincount + # - paddle.binomial + # - paddle.bitwise_and + # - paddle.bitwise_and_ + # - paddle.bitwise_not + # - paddle.bitwise_not_ + # - paddle.bitwise_or + # - paddle.bitwise_or_ + # - paddle.bitwise_xor + # - paddle.bitwise_xor_ + # - paddle.bmm + # - paddle.broadcast_shape + # - paddle.broadcast_tensors + # - paddle.broadcast_to + # - paddle.cauchy_ + # - paddle.cast + # - paddle.cdist + # - paddle.ceil + # - paddle.cholesky + # - paddle.chunk + # - paddle.clip + # - paddle.column_stack + # - paddle.combinations + # - paddle.concat + # - paddle.conj + # - paddle.copysign + # - paddle.copysign_ + # - paddle.cos + # - paddle.cos_ + # - paddle.cosh + # - paddle.cosh_ + # - paddle.count_nonzero + # - paddle.crop + # - paddle.cross + # - paddle.cummax + # - paddle.cummin + # - paddle.cumprod + # - paddle.cumprod_ + # - paddle.cumsum + # - paddle.cumsum_ + # - paddle.cumulative_trapezoid + # - paddle.decomposition + # - paddle.deg2rad + # - paddle.diag + # - paddle.diag_embed + # - paddle.diagflat + # - paddle.diagonal + # - paddle.diagonal_scatter + # - paddle.diff + # - paddle.digamma + # - paddle.digamma_ + # - paddle.divide + # - paddle.divide_ + # - paddle.dot + # - paddle.dsplit + # - paddle.dstack + # - paddle.eigvalsh + # - paddle.einsum + # - paddle.equal + # - paddle.equal_all + # - paddle.erf + # - paddle.erf_ + # - paddle.erfinv + # - paddle.exp + # - paddle.expand + # - paddle.expand_as + # - paddle.expm1 + # - paddle.expm1_ + # - paddle.eye + # - paddle.fft + # - paddle.flatten + # - paddle.flatten_ + # - paddle.flip + # - paddle.floor + # - paddle.floor_divide + # - paddle.floor_divide_ + # - paddle.floor_mod + # - paddle.floor_mod_ + # - paddle.fmax + # - paddle.fmin + # - paddle.frac + # - paddle.frac_ + # - paddle.frexp + # - paddle.full + # - paddle.full_like + # - paddle.gather + # - paddle.gather_nd + # - paddle.gcd + # - paddle.gcd_ + # - paddle.greater_equal + # - paddle.greater_equal_ + # - paddle.greater_than + # - paddle.greater_than_ + # - paddle.heaviside + # - paddle.histogram + # - paddle.histogramdd + # - paddle.hsplit + # - paddle.hstack + # - paddle.hypot + # - paddle.hypot_ + # - paddle.i0 + # - paddle.i0_ + # - paddle.i0e + # - paddle.i1 + # - paddle.i1e + # - paddle.imag + # - paddle.increment + # - paddle.index_add + # - paddle.index_add_ + # - paddle.index_fill + # - paddle.index_fill_ + # - paddle.index_put + # - paddle.index_put_ + # - paddle.index_sample + # - paddle.index_select + # - paddle.inner + # - paddle.kron + # - paddle.kthvalue + # - paddle.lcm + # - paddle.lcm_ + # - paddle.ldexp + # - paddle.ldexp_ + # - paddle.lerp + # - paddle.less_equal + # - paddle.less_equal_ + # - paddle.less_than + # - paddle.less_than_ + # - paddle.lgamma + # - paddle.lgamma_ + # - paddle.linalg + # - paddle.linspace + # - paddle.log + # - paddle.log10 + # - paddle.log10_ + # - paddle.log1p + # - paddle.log1p_ + # - paddle.log2 + # - paddle.log2_ + # - paddle.log_ + # - paddle.logaddexp + # - paddle.logcumsumexp + # - paddle.logical_and + # - paddle.logical_and_ + # - paddle.logical_not + # - paddle.logical_not_ + # - paddle.logical_or + # - paddle.logical_or_ + # - paddle.logical_xor + # - paddle.logical_xor_ + # - paddle.logit + # - paddle.logit_ + # - paddle.logspace + # - paddle.logsumexp + # - paddle.masked_fill + # - paddle.masked_fill_ + # - paddle.masked_scatter + # - paddle.masked_scatter_ + # - paddle.masked_select + # - paddle.matmul + # - paddle.max + # - paddle.maximum + # - paddle.mean + # - paddle.median + # - paddle.meshgrid + # - paddle.min + # - paddle.minimum + # - paddle.mm + # - paddle.mod + # - paddle.mod_ + # - paddle.mode + # - paddle.moveaxis + # - paddle.multigammaln + # - paddle.multigammaln_ + # - paddle.multinomial + # - paddle.multiplex + # - paddle.multiply + # - paddle.multiply_ + # - paddle.mv + # - paddle.nan_to_num + # - paddle.nan_to_num_ + # - paddle.nanmean + # - paddle.nanmedian + # - paddle.nanquantile + # - paddle.nansum + # - paddle.neg + # - paddle.neg_ + # - paddle.nextafter + # - paddle.nonzero + # - paddle.normal + # - paddle.normal_ + # - paddle.not_equal + # - paddle.not_equal_ + # - paddle.numel + # - paddle.outer + # - paddle.pdist + # - paddle.poisson + # - paddle.polar + # - paddle.polygamma + # - paddle.polygamma_ + # - paddle.pow + # - paddle.pow_ + # - paddle.prod + # - paddle.put_along_axis + # - paddle.quantile + # - paddle.rad2deg + # - paddle.rand + # - paddle.randint + # - paddle.randint_like + # - paddle.randn + # - paddle.randperm + # - paddle.reader + # - paddle.real + # - paddle.reciprocal + # - paddle.regularizer + # - paddle.remainder + # - paddle.remainder_ + # - paddle.renorm + # - paddle.renorm_ + # - paddle.repeat_interleave + # - paddle.roll + # - paddle.rot90 + # - paddle.round + # - paddle.row_stack + # - paddle.rsqrt + # - paddle.scale + # - paddle.scatter + # - paddle.scatter_ + # # - paddle.scatter_nd # cause CUDA_ERROR ignored. + # # - paddle.scatter_nd_add + # - paddle.searchsorted + # - paddle.select_scatter + # - paddle.sgn + # - paddle.shard_index + # - paddle.sign + # - paddle.signal + # - paddle.signbit + # - paddle.sin + # - paddle.sin_ + # - paddle.sinh + # - paddle.sinh_ + # - paddle.slice + # # - paddle.slice_scatter + # - paddle.sort + # - paddle.sqrt + # - paddle.square + # - paddle.standard_gamma + # - paddle.standard_normal + # - paddle.stanh + # - paddle.strided_slice + # - paddle.subtract + # - paddle.sum + # - paddle.t + # - paddle.t_ + # - paddle.take + # - paddle.take_along_axis + # - paddle.tan + # - paddle.tan_ + # - paddle.tanh + # - paddle.tanh_ + # - paddle.tensordot + # - paddle.tile + # - paddle.topk + # - paddle.trace + # - paddle.transpose + # - paddle.transpose_ + # - paddle.trapezoid + # - paddle.tril + # - paddle.tril_ + # - paddle.tril_indices + # - paddle.triu + # - paddle.triu_ + # - paddle.triu_indices + # - paddle.trunc + # - paddle.trunc_ + # - paddle.unbind + # - paddle.unflatten + # - paddle.unfold + # - paddle.uniform + # - paddle.unique + # - paddle.unique_consecutive + # - paddle.unstack + # - paddle.vander + # - paddle.var + # - paddle.view + # - paddle.view_as + # - paddle.vsplit + # - paddle.where + # - paddle.where_ + # - paddle.zeros + # - paddle.zeros_like + # - paddle.Tensor.T + # - paddle.Tensor.__add__ + # - paddle.Tensor.__and__ + # - paddle.Tensor.__radd__ + # - paddle.Tensor.__div__ + # - paddle.Tensor.__eq__ + # - paddle.Tensor.__floordiv__ + # - paddle.Tensor.__ge__ + # - paddle.Tensor.__gt__ + # - paddle.Tensor.__le__ + # - paddle.Tensor.__lt__ + # - paddle.Tensor.__matmul__ + # - paddle.Tensor.__mod__ + # - paddle.Tensor.__mul__ + # - paddle.Tensor.__ne__ + # - paddle.Tensor.__neg__ + # - paddle.Tensor.__nonzero__ + # - paddle.Tensor.__or__ + # - paddle.Tensor.__pow__ + # - paddle.Tensor.__radd__ + # - paddle.Tensor.__rdiv__ + # - paddle.Tensor.__rmul__ + # - paddle.Tensor.__rpow__ + # - paddle.Tensor.__rsub__ + # - paddle.Tensor.__rtruediv__ + # - paddle.Tensor.__sub__ + # - paddle.Tensor.__truediv__ + # - paddle.Tensor.__xor__ + # - paddle.Tensor.abs + # - paddle.Tensor.abs_ + # - paddle.Tensor.acos + # - paddle.Tensor.acos_ + # - paddle.Tensor.acosh + # - paddle.Tensor.acosh_ + # - paddle.Tensor.add + # - paddle.Tensor.add_ + # - paddle.Tensor.add_n + # - paddle.Tensor.addmm + # - paddle.Tensor.addmm_ + # - paddle.Tensor.all + # - paddle.Tensor.allclose + # - paddle.Tensor.amax + # - paddle.Tensor.amin + # - paddle.Tensor.angle + # - paddle.Tensor.any + # - paddle.Tensor.argmax + # - paddle.Tensor.argmin + # - paddle.Tensor.argsort + # - paddle.Tensor.as_complex + # - paddle.Tensor.as_real + # - paddle.Tensor.as_strided + # - paddle.Tensor.asin + # - paddle.Tensor.asin_ + # - paddle.Tensor.asinh + # - paddle.Tensor.asinh_ + # - paddle.Tensor.atan + # - paddle.Tensor.atan2 + # - paddle.Tensor.atan_ + # - paddle.Tensor.atanh + # - paddle.Tensor.atanh_ + # - paddle.Tensor.atleast_1d + # - paddle.Tensor.atleast_2d + # - paddle.Tensor.atleast_3d + # - paddle.Tensor.bincount + # - paddle.Tensor.bitwise_and + # - paddle.Tensor.bitwise_and_ + # - paddle.Tensor.bitwise_not + # - paddle.Tensor.bitwise_not_ + # - paddle.Tensor.bitwise_or + # - paddle.Tensor.bitwise_or_ + # - paddle.Tensor.bitwise_xor + # - paddle.Tensor.bitwise_xor_ + # - paddle.Tensor.bmm + # - paddle.Tensor.broadcast_shape + # - paddle.Tensor.broadcast_tensors + # - paddle.Tensor.broadcast_to + # - paddle.Tensor.cauchy_ + # - paddle.Tensor.cdist + # - paddle.Tensor.ceil + # - paddle.Tensor.ceil_ + # - paddle.Tensor.cholesky + # - paddle.Tensor.cholesky_solve + # - paddle.Tensor.clip + # - paddle.Tensor.clip_ + # - paddle.Tensor.coalesce + # - paddle.Tensor.cols + # - paddle.Tensor.combinations + # - paddle.Tensor.concat + # - paddle.Tensor.cond + # - paddle.Tensor.conj + # - paddle.Tensor.contiguous + # - paddle.Tensor.corrcoef + # - paddle.Tensor.cos + # - paddle.Tensor.cos_ + # - paddle.Tensor.cosh + # - paddle.Tensor.cosh_ + # - paddle.Tensor.count_nonzero + # - paddle.Tensor.cov + # - paddle.Tensor.cross + # - paddle.Tensor.crows + # - paddle.Tensor.cummax + # - paddle.Tensor.cummin + # - paddle.Tensor.cumprod + # - paddle.Tensor.cumprod_ + # - paddle.Tensor.cumsum + # - paddle.Tensor.cumsum_ + # - paddle.Tensor.cumulative_trapezoid + # - paddle.Tensor.deg2rad + # - paddle.Tensor.diag + # - paddle.Tensor.diag_embed + # - paddle.Tensor.diagflat + # - paddle.Tensor.diagonal + # - paddle.Tensor.diagonal_scatter + # - paddle.Tensor.diff + # - paddle.Tensor.digamma + # - paddle.Tensor.digamma_ + # - paddle.Tensor.divide + # - paddle.Tensor.divide_ + # - paddle.Tensor.dot + # - paddle.Tensor.eig + # - paddle.Tensor.eigvals + # - paddle.Tensor.eigvalsh + # - paddle.Tensor.equal + # - paddle.Tensor.equal_all + # - paddle.Tensor.erf + # - paddle.Tensor.erfinv + # - paddle.Tensor.erfinv_ + # - paddle.Tensor.exp + # - paddle.Tensor.exp_ + # - paddle.Tensor.expand + # - paddle.Tensor.expand_as + # - paddle.Tensor.expm1 + # - paddle.Tensor.exponential_ + # - paddle.Tensor.fill_ + # - paddle.Tensor.fill_diagonal_ + # - paddle.Tensor.fill_diagonal_tensor + # - paddle.Tensor.fill_diagonal_tensor_ + # - paddle.Tensor.flatten + # - paddle.Tensor.flatten_ + # - paddle.Tensor.flip + # - paddle.Tensor.floor + # - paddle.Tensor.floor_ + # - paddle.Tensor.floor_divide + # - paddle.Tensor.floor_divide_ + # - paddle.Tensor.floor_mod + # - paddle.Tensor.floor_mod_ + # - paddle.Tensor.fmax + # - paddle.Tensor.fmin + # - paddle.Tensor.frac + # - paddle.Tensor.frac_ + # - paddle.Tensor.frexp + # - paddle.Tensor.gather + # - paddle.Tensor.gather_nd + # - paddle.Tensor.gcd + # - paddle.Tensor.gcd_ + # - paddle.Tensor.get_selected_rows + # - paddle.Tensor.get_strides + # - paddle.Tensor.greater_equal + # - paddle.Tensor.greater_equal_ + # - paddle.Tensor.greater_than + # - paddle.Tensor.greater_than_ + # - paddle.Tensor.heaviside + # - paddle.Tensor.histogram + # - paddle.Tensor.histogramdd + # - paddle.Tensor.hsplit + # - paddle.Tensor.hypot + # - paddle.Tensor.hypot_ + # - paddle.Tensor.i0 + # - paddle.Tensor.i0_ + # - paddle.Tensor.i0e + # - paddle.Tensor.i1 + # - paddle.Tensor.i1e + # - paddle.Tensor.imag + # - paddle.Tensor.increment + # - paddle.Tensor.index_add + # - paddle.Tensor.index_add_ + # - paddle.Tensor.index_fill + # - paddle.Tensor.index_fill_ + # - paddle.Tensor.index_put + # - paddle.Tensor.index_put_ + # - paddle.Tensor.index_sample + # - paddle.Tensor.index_select + # - paddle.Tensor.inner + # - paddle.Tensor.kron + # - paddle.Tensor.kthvalue + # - paddle.Tensor.layout + # - paddle.Tensor.lcm + # - paddle.Tensor.lcm_ + # - paddle.Tensor.ldexp + # - paddle.Tensor.ldexp_ + # - paddle.Tensor.lerp + # - paddle.Tensor.lerp_ + # - paddle.Tensor.less_equal + # - paddle.Tensor.less_equal_ + # - paddle.Tensor.less_than + # - paddle.Tensor.less_than_ + # - paddle.Tensor.lgamma + # - paddle.Tensor.lgamma_ + # - paddle.Tensor.log + # - paddle.Tensor.log10 + # - paddle.Tensor.log10_ + # - paddle.Tensor.log1p + # - paddle.Tensor.log1p_ + # - paddle.Tensor.log2 + # - paddle.Tensor.log2_ + # - paddle.Tensor.log_ + # - paddle.Tensor.logaddexp + # - paddle.Tensor.logcumsumexp + # - paddle.Tensor.logical_and + # - paddle.Tensor.logical_and_ + # - paddle.Tensor.logical_not + # - paddle.Tensor.logical_not_ + # - paddle.Tensor.logical_or + # - paddle.Tensor.logical_or_ + # - paddle.Tensor.logical_xor + # - paddle.Tensor.logical_xor_ + # - paddle.Tensor.logit + # - paddle.Tensor.logit_ + # - paddle.Tensor.logsumexp + # - paddle.Tensor.lstsq + # - paddle.Tensor.lu + # - paddle.Tensor.lu_unpack + # - paddle.Tensor.masked_fill + # - paddle.Tensor.masked_fill_ + # - paddle.Tensor.masked_select + # - paddle.Tensor.masked_scatter + # - paddle.Tensor.masked_scatter_ + # - paddle.Tensor.matmul + # - paddle.Tensor.matrix_power + # - paddle.Tensor.max + # - paddle.Tensor.maximum + # - paddle.Tensor.mean + # - paddle.Tensor.median + # - paddle.Tensor.min + # - paddle.Tensor.minimum + # - paddle.Tensor.mm + # - paddle.Tensor.mod + # - paddle.Tensor.mod_ + # - paddle.Tensor.mode + # - paddle.Tensor.moveaxis + # - paddle.Tensor.multi_dot + # - paddle.Tensor.multigammaln + # - paddle.Tensor.multigammaln_ + # - paddle.Tensor.multinomial + # - paddle.Tensor.multiplex + # - paddle.Tensor.multiply + # - paddle.Tensor.multiply_ + # - paddle.Tensor.mv + # - paddle.Tensor.nan_to_num + # - paddle.Tensor.nan_to_num_ + # - paddle.Tensor.nanmean + # - paddle.Tensor.nanmedian + # - paddle.Tensor.nanquantile + # - paddle.Tensor.nansum + # - paddle.Tensor.ndimension + # - paddle.Tensor.neg + # - paddle.Tensor.neg_ + # - paddle.Tensor.nnz + # - paddle.Tensor.nonzero + # - paddle.Tensor.norm + # - paddle.Tensor.normal_ + # - paddle.Tensor.not_equal + # - paddle.Tensor.not_equal_ + # - paddle.Tensor.numel + # - paddle.Tensor.offset + # - paddle.Tensor.outer + # - paddle.Tensor.pca_lowrank + # - paddle.Tensor.pinv + # - paddle.Tensor.polar + # - paddle.Tensor.polygamma + # - paddle.Tensor.polygamma_ + # - paddle.Tensor.pow + # - paddle.Tensor.pow_ + # - paddle.Tensor.process_mesh + # - paddle.Tensor.prod + # - paddle.Tensor.put_along_axis + # - paddle.Tensor.put_along_axis_ + # - paddle.Tensor.qr + # - paddle.Tensor.quantile + # - paddle.Tensor.rad2deg + # - paddle.Tensor.remainder + # - paddle.Tensor.remainder_ + # - paddle.Tensor.renorm + # - paddle.Tensor.renorm_ + # - paddle.Tensor.repeat_interleave + # - paddle.Tensor.reverse + # - paddle.Tensor.roll + # - paddle.Tensor.rot90 + # - paddle.Tensor.round + # - paddle.Tensor.round_ + # - paddle.Tensor.rows + # - paddle.Tensor.rsqrt + # - paddle.Tensor.rsqrt_ + # - paddle.Tensor.scale + # - paddle.Tensor.scale_ + # - paddle.Tensor.scatter + # - paddle.Tensor.scatter_ + # - paddle.Tensor.scatter_nd + # - paddle.Tensor.scatter_nd_add + # - paddle.Tensor.select_scatter + # - paddle.Tensor.sgn + # - paddle.Tensor.shard_index + # - paddle.Tensor.sigmoid + # - paddle.Tensor.sigmoid_ + # - paddle.Tensor.sign + # - paddle.Tensor.sin + # - paddle.Tensor.sin_ + # - paddle.Tensor.sinh + # - paddle.Tensor.sinh_ + # - paddle.Tensor.size + # - paddle.Tensor.slice + # - paddle.Tensor.solve + # - paddle.Tensor.sort + # - paddle.Tensor.split + # - paddle.Tensor.sqrt + # - paddle.Tensor.sqrt_ + # - paddle.Tensor.square + # - paddle.Tensor.stack + # - paddle.Tensor.stanh + # - paddle.Tensor.std + # - paddle.Tensor.stft + # - paddle.Tensor.strided_slice + # - paddle.Tensor.strides + # - paddle.Tensor.subtract + # - paddle.Tensor.subtract_ + # - paddle.Tensor.sum + # - paddle.Tensor.t + # - paddle.Tensor.t_ + # - paddle.Tensor.take + # - paddle.Tensor.take_along_axis + # - paddle.Tensor.tan + # - paddle.Tensor.tan_ + # - paddle.Tensor.tanh + # - paddle.Tensor.tanh_ + # - paddle.Tensor.tensordot + # - paddle.Tensor.tile + # - paddle.Tensor.top_p_sampling + # - paddle.Tensor.topk + # - paddle.Tensor.trace + # - paddle.Tensor.transpose + # - paddle.Tensor.transpose_ + # - paddle.Tensor.trapezoid + # - paddle.Tensor.tril + # - paddle.Tensor.tril_ + # - paddle.Tensor.triu + # - paddle.Tensor.triu_ + # - paddle.Tensor.trunc + # - paddle.Tensor.trunc_ + # - paddle.Tensor.unbind + # - paddle.Tensor.unflatten + # - paddle.Tensor.unfold + # - paddle.Tensor.uniform_ + # - paddle.Tensor.unique + # - paddle.Tensor.unique_consecutive + # - paddle.Tensor.unstack + # - paddle.Tensor.vander + # - paddle.Tensor.var + # - paddle.Tensor.view + # - paddle.Tensor.view_as + # - paddle.Tensor.vsplit + # - paddle.Tensor.where + # - paddle.Tensor.where_ + # - paddle._C_ops.fused_gemm_epilogue + # - paddle.optimizer.Adam + # - paddle.optimizer.AdamW + # - paddle._C_ops.adamw + # - paddle._C_ops.adamw_ + # - paddle._legacy_C_ops.fused_gemm_epilogue + # - paddle.incubate.nn.functional.fused_multi_head_attention + # - paddle.incubate.nn.functional.fused_feedforward + # - paddle.incubate.nn.functional.fused_multi_transformer + # - paddle.incubate.nn.functional.fused_linear + # - paddle.incubate.nn.functional.fused_linear_activation + # - paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm + # - paddle.incubate.nn.functional.fused_ec_moe + # - paddle.incubate.nn.functional.fused_dropout_add + # - paddle.incubate.nn.functional.fused_rotary_position_embedding + # - paddle.incubate.nn.functional.variable_length_memory_efficient_attention + # - paddle.incubate.nn.functional.fused_rms_norm + # - paddle.incubate.nn.functional.fused_layer_norm + # - paddle.incubate.nn.functional.masked_multihead_attention + # - paddle.incubate.nn.functional.block_multihead_attention + # - paddle.incubate.nn.functional.swiglu + # - paddle.incubate.nn.functional.fused_matmul_bias + # - paddle.tensor.fill_constant + # - paddle.nn.clip._squared_l2_norm + # - paddle.uniform + # - paddle._C_ops.gaussian + # - paddle._legacy_C_ops.c_identity +#fusion_ops: + - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention + - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm + - paddlenlp.transformers.llama.fusion_ops.fusion_rope diff --git a/paddleapex/api_tracer/wrap_op/OPTemplate.py b/paddleapex/api_tracer/wrap_op/OPTemplate.py index 0ef670e..9a4258a 100644 --- a/paddleapex/api_tracer/wrap_op/OPTemplate.py +++ b/paddleapex/api_tracer/wrap_op/OPTemplate.py @@ -14,10 +14,9 @@ import paddle.distributed as dist import paddle from .. import config -from ..api_info import API -from inspect import signature +from ..api_info import API, get_init_params, save_init_params_and_weight import os -import pickle +from paddleapex.api_tracer.Dump import dump_util class HookOp: @@ -33,26 +32,31 @@ def hijack_init(self, *args, **kwargs): self.__init__(*args, **kwargs) -def get_init_params(instance): - sig = signature(instance.__init__) - bound_args = sig.bind_partial() - bound_args.apply_defaults() - - init_params = {} - for param in sig.parameters.values(): - if param.name != 'self': - init_params[param.name] = getattr(instance, param.name, param.default) - - return init_params +def create_hook_with_info(tensor, attr_index, api): + def grad_hook(grad): + if grad is not None: + single_arg = {} + single_arg.update({"type": "paddle.Tensor"}) + single_arg.update({"dtype": str(grad.dtype.name)}) + single_arg.update({"shape": grad.shape}) + single_arg.update({"stop_gradient": grad.stop_gradient}) + api_args = api.op_name + ".grad_" + str(attr_index) + pt_path = dump_util.dump_real_data(api_args, grad.detach().cpu(), api.rank) + single_arg.update({"real_data_path": pt_path}) + + api.dout_list.append(single_arg) + api.output_num -= 1 + if api.output_num == 0: + api.api_info_struct[api.op_name].update({"dout_list": api.dout_list}) + if api.mode == "real_data": + return grad_hook + else: + return api.record_dout -def save_init_params_and_weight(init_params, state_dict, name, rank): - data_route = cfg.dump_root_path - directory = os.path.join(data_route, f"rank{rank}_step{cfg.global_step}") - file_path = os.path.join(directory, f"{name}.init_params") - with open(file_path, 'wb') as f: - pickle.dump(init_params, f) - paddle.save(state_dict, os.path.join(directory, f"{name}.state_dict")) +def create_output_attr(tensor, num): + setattr(tensor, 'id_apex', num) + return 'id_apex', num def hijack_call(self, *args, **kwargs): @@ -73,9 +77,11 @@ def hijack_call(self, *args, **kwargs): save_init_params_and_weight(init_params, self.state_dict(), cfg.prefix_op_name_, rank) output = self.forward(*args, **kwargs) try: + out_num = 0 if isinstance(output, paddle.Tensor): if not output.stop_gradient: - output.register_hook(api_recorder.record_dout) + output.register_hook(create_hook_with_info(output, api_recorder.output_num, api_recorder)) + #output.register_hook(api_recorder.record_dout) api_recorder.output_num = 1 else: api_recorder.record_dout(None) @@ -83,9 +89,9 @@ def hijack_call(self, *args, **kwargs): need_record = False for item in output: if isinstance(item, paddle.Tensor) and not item.stop_gradient: + item.register_hook(create_hook_with_info(item, api_recorder.output_num, api_recorder)) api_recorder.output_num += 1 need_record = True - item.register_hook(api_recorder.record_dout) if not need_record: api_recorder.record_dout(None) except Exception as e: @@ -118,7 +124,8 @@ def forward(self, *args, **kwargs): try: if isinstance(output, paddle.Tensor): if not output.stop_gradient: - output.register_hook(api_recorder.record_dout) + #output.register_hook(api_recorder.record_dout) + output.register_hook(create_hook_with_info(output, api_recorder.output_num, api_recorder)) api_recorder.output_num = 1 else: api_recorder.record_dout(None) @@ -126,9 +133,10 @@ def forward(self, *args, **kwargs): need_record = False for item in output: if isinstance(item, paddle.Tensor) and not item.stop_gradient: - api_recorder.output_num += 1 need_record = True - item.register_hook(api_recorder.record_dout) + #item.register_hook(api_recorder.record_dout) + item.register_hook(create_hook_with_info(item, api_recorder.output_num, api_recorder)) + api_recorder.output_num += 1 if not need_record: api_recorder.record_dout(None) except Exception as e: diff --git a/paddleapex/api_tracer/wrap_op/get_target_op.py b/paddleapex/api_tracer/wrap_op/get_target_op.py index 42daf73..04c7625 100644 --- a/paddleapex/api_tracer/wrap_op/get_target_op.py +++ b/paddleapex/api_tracer/wrap_op/get_target_op.py @@ -30,6 +30,12 @@ def __init__(self, yaml_path): f.close() if self.ignored_op is None: self.ignored_op = [] + if self.target_op is None: + self.target_op = [] + if self.target_class is None: + self.target_class = [] + if self.distributed_op is None: + self.distributed_op = [] self.api_to_catch = set(self.target_op).union(set(self.distributed_op)) - set(self.ignored_op) def check_api_stack(self): From 09a6cf6cfde46091fb186ba18c0e364694c55b19 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Fri, 27 Dec 2024 14:05:38 +0800 Subject: [PATCH 17/22] remove some debug code --- paddleapex/apex/acc_direct_paddle.py | 2 -- paddleapex/apex/run_paddle.py | 6 ------ 2 files changed, 8 deletions(-) diff --git a/paddleapex/apex/acc_direct_paddle.py b/paddleapex/apex/acc_direct_paddle.py index f9ebbf8..1c4a7e0 100644 --- a/paddleapex/apex/acc_direct_paddle.py +++ b/paddleapex/apex/acc_direct_paddle.py @@ -120,8 +120,6 @@ def compare_device_bench( errors_forward_info = [] errors_bacward_info = [] for i, api_file in enumerate(tqdm.tqdm(api_pt_files_all, **tqdm_params)): - if i < 2700: - continue bench_out_tensor, device_out_tensor = None, None bench_grad_tensor_list, device_grad_tensor_list = None, None try: diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index be90871..62a760c 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -85,11 +85,6 @@ PROFILE_WARM_TIMES = 10 PROFILE_RUN_TIMES = 10 -#strategy = fleet.DistributedStrategy() -#strategy.hybrid_configs = { -# "dp_degree": 1, "mp_degree": 8, "pp_degree": 1,"sharding_degree": 1,} -#fleet.init(is_collective=True, strategy=strategy) -#paddle.set_default_dtype("bfloat16") def recursive_delete_arg(arg_in): if isinstance(arg_in, (list, tuple)): @@ -340,7 +335,6 @@ def get_need_grad_out(args): return device_grad_out -<<<<<<< HEAD def print_tensor_name(args): if isinstance(args, paddle.Tensor): print(args.name) From 551a018e1ededecdc188f8b394da5358a5383a5b Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Wed, 15 Jan 2025 11:18:33 +0800 Subject: [PATCH 18/22] fix some bug in class --- paddleapex/apex/run_paddle.py | 16 +- paddleapex/apex/utils/data_generate.py | 10 + paddleapex/api_tracer/Tracer.py | 5 +- paddleapex/api_tracer/api_info.py | 26 +- paddleapex/api_tracer/config.py | 2 +- paddleapex/api_tracer/configs/op_target.yaml | 1795 ++++++++--------- .../api_tracer/configs/tool_config.yaml | 2 +- paddleapex/api_tracer/wrap_op/OPTemplate.py | 1 - 8 files changed, 950 insertions(+), 907 deletions(-) diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 62a760c..812a2b1 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -40,6 +40,8 @@ print_warn_log, ) +os.environ["USE_CASUAL_MASK"] = "True" + type_map = { "FP16": paddle.float16, "FP32": paddle.float32, @@ -184,6 +186,7 @@ def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): + return if dtype_name == "": bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) fwd_output_dir = os.path.abspath(os.path.join(out_path, "output")) @@ -437,9 +440,11 @@ def run_acc_case( else: try: device_out = run_forward(api_call_name, device_args, device_kwargs) - if api_call_stack in distributed_op and device_out is None: - print('this is distributed op: ', api_call_name) - device_out = device_args + if api_call_stack in distributed_op: + from paddle.base.libpaddle import task + if type(device_out) is task: + print('this is distributed op: ', api_call_name) + device_out = device_args except Exception as err: msg = "Run_op_forward Error: %s" % str(err) print_warn_log(msg) @@ -802,7 +807,10 @@ def check_json(json_list): out_path = os.path.realpath(cfg.out_path) if cfg.out_path else "./" if os.path.exists(out_path): print_warn_log("The output path already exists and the file with the same name will be overwritten.") - + + from paddlenlp.trainer import set_seed + set_seed(1026) + if cfg.distributed_op: if cfg.test_class: strategy = fleet.DistributedStrategy() diff --git a/paddleapex/apex/utils/data_generate.py b/paddleapex/apex/utils/data_generate.py index f66e89f..3b1b3b1 100644 --- a/paddleapex/apex/utils/data_generate.py +++ b/paddleapex/apex/utils/data_generate.py @@ -111,6 +111,12 @@ def create_model(api_call_stack, real_data_path): return None +def create_config(api_call_stack, real_data_path): + config_path = real_data_path + '.config' + config = load_params(config_path) + return config + + def gen_data(info, real_data_path=None): check_object_type(info, dict) data_type = info.get("type") @@ -139,6 +145,10 @@ def gen_data(info, real_data_path=None): api_call_stack = info.get("api_call_stack") data_pth = os.path.join(real_data_path, rel_data_path) data = create_model(api_call_stack, data_pth) + elif data_type == 'config': + api_call_stack = info.get("api_call_stack") + data_pth = os.path.join(real_data_path, rel_data_path) + data = create_config(api_call_stack, data_pth) else: data = info.get("value") if info.get("type") == "slice": diff --git a/paddleapex/api_tracer/Tracer.py b/paddleapex/api_tracer/Tracer.py index 7742d8f..be468b5 100644 --- a/paddleapex/api_tracer/Tracer.py +++ b/paddleapex/api_tracer/Tracer.py @@ -19,7 +19,10 @@ from paddleapex.apex.utils import print_info_log class Tracer: - def __init__(self): + # def __init__(self): + # hijack_api() + + def register_op(self): hijack_api() def start(self): diff --git a/paddleapex/api_tracer/api_info.py b/paddleapex/api_tracer/api_info.py index 19dd7ee..7ccdd5d 100644 --- a/paddleapex/api_tracer/api_info.py +++ b/paddleapex/api_tracer/api_info.py @@ -115,9 +115,14 @@ def get_init_params(instance): return init_params -def save_init_params_and_weight(init_params, state_dict, name, rank): +def get_file_path(rank): data_route = cfg.dump_root_path directory = os.path.join(data_route, f"rank{rank}_step{cfg.global_step}") + return directory + + +def save_init_params_and_weight(init_params, state_dict, name, rank): + directory = get_file_path(rank) file_path = os.path.join(directory, f"{name}.init_params") with open(file_path, 'wb') as f: pickle.dump(init_params, f) @@ -207,6 +212,9 @@ def analyze_element(self, element): from paddlenlp.transformers.llama.modeling import LlamaRotaryEmbedding if type(element) is LlamaRotaryEmbedding: return self.analyze_class(element, "paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding") + from paddlenlp.transformers.llama.configuration import LlamaConfig + if type(element) is LlamaConfig: + return self.analyze_config(element, "paddlenlp.transformers.llama.configuration.LlamaConfig") except Exception as e: print(e) print("check you environment, and ensure the path of paddlenlp is valid") @@ -217,6 +225,22 @@ def analyze_element(self, element): print(msg) + def analyze_config(self, arg, call_stack): + single_arg = {} + single_arg.update({"type": "config"}) + single_arg.update({"dtype": str(type(arg))}) + single_arg.update({"api_call_stack": call_stack}) + if self.mode == "real_data": + api_args = self.op_name + "." + str(self.args_num) + self.args_num += 1 + directory = get_file_path(self.rank) + file_path = os.path.join(directory, f"{api_args}.config") + with open(file_path, 'wb') as f: + pickle.dump(arg, f) + single_arg.update({"real_data_path": api_args}) + return single_arg + + def analyze_class(self, arg, call_stack): single_arg = {} single_arg.update({"type": "class"}) diff --git a/paddleapex/api_tracer/config.py b/paddleapex/api_tracer/config.py index 62ab99c..d8fb34b 100644 --- a/paddleapex/api_tracer/config.py +++ b/paddleapex/api_tracer/config.py @@ -56,6 +56,7 @@ def new_step(self): def new_step_in_training(self, global_step): if global_step in self.target_step: + self.global_step = global_step self.Op_count = {} self.dump_state = True return True @@ -63,7 +64,6 @@ def new_step_in_training(self, global_step): def reset_step_in_training(self, global_step): if global_step in self.target_step: - self.global_step = global_step self.dump_state = False return True return False diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index eec3189..23c10ab 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -1,24 +1,24 @@ target_class: - - paddlenlp.transformers.llama.modeling.LlamaMLP - # - paddlenlp.transformers.llama.modeling.LlamaLMHead - # - paddlenlp.transformers.llama.modeling.LlamaRMSNorm - # - paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding - # - paddlenlp.transformers.llama.modeling.LlamaLinearScalingRotaryEmbedding - # - paddlenlp.transformers.llama.modeling.LlamaNTKScalingRotaryEmbedding - # - paddlenlp.transformers.llama.modeling.LlamaDynamicNTKScalingRotaryEmbedding - # - paddlenlp.transformers.llama.modeling.Llama3RotaryEmbedding - # - paddlenlp.transformers.llama.modeling.MoEAllToAll - # - paddlenlp.transformers.llama.modeling.MoEGateCombine - # - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler - # - paddlenlp.transformers.llama.modeling.LlamaMoEGate - # - paddlenlp.transformers.llama.modeling.LlamaMoEMLP - # - paddlenlp.transformers.llama.modeling.LlamaAttention - # - paddlenlp.transformers.llama.modeling.LlamaDecoderLayer + - paddlenlp.transformers.llama.modeling.LlamaLMHead + - paddlenlp.transformers.llama.modeling.LlamaRMSNorm + - paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding + - paddlenlp.transformers.llama.modeling.LlamaLinearScalingRotaryEmbedding + - paddlenlp.transformers.llama.modeling.LlamaNTKScalingRotaryEmbedding + - paddlenlp.transformers.llama.modeling.LlamaDynamicNTKScalingRotaryEmbedding + - paddlenlp.transformers.llama.modeling.Llama3RotaryEmbedding + - paddlenlp.transformers.llama.modeling.MoEAllToAll + - paddlenlp.transformers.llama.modeling.MoEGateCombine + - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler + - paddlenlp.transformers.llama.modeling.LlamaMoEGate + - paddlenlp.transformers.llama.modeling.LlamaMoEMLP + - paddlenlp.transformers.llama.modeling.LlamaAttention + - paddlenlp.transformers.llama.modeling.LlamaDecoderLayer + - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss + - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler # - paddlenlp.transformers.llama.modeling.LlamaPretrainedModel # - paddlenlp.transformers.llama.modeling.LlamaPretrainingCriterion - # - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss - # - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler # - paddlenlp.transformers.llama.modeling.LlamaForCausalLM + # - paddlenlp.transformers.llama.modeling.LlamaMLP # - paddlenlp.transformers.llama.modeling.LlamaModel ignored_op: - paddle._C_ops.max @@ -41,31 +41,30 @@ ignored_op: - paddle.stack - paddle.zeros - paddle.zeros_like - - paddle.distributed.communication.stream.alltoall_single distributed_op: - # - paddle.distributed.communication.stream.alltoall_single - # - paddle.distributed.barrier - # - paddle.distributed.broadcast_object_list - # - paddle.distributed.communication.stream.broadcast - # - paddle.distributed.communication.stream.gather - # - paddle.distributed.communication.stream.recv - # - paddle.distributed.communication.stream.reduce - # - paddle.distributed.communication.stream.reduce_scatter - # - paddle.distributed.communication.stream.scatter - # - paddle.distributed.communication.stream.send - # - paddle.distributed.all_gather - # - paddle.distributed.all_gather_object - # - paddle.distributed.all_reduce - # - paddle.distributed.alltoall - # - paddle.distributed.alltoall_single - # - paddle.distributed.broadcast - # - paddle.distributed.communication.stream.all_gather - # - paddle.distributed.communication.stream.all_reduce - # - paddle.distributed.communication.stream.alltoall - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table - # - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy - # - paddle.distributed.fleet.layers.mpu.mp_ops. + - paddle.distributed.communication.stream.alltoall_single + - paddle.distributed.barrier + - paddle.distributed.broadcast_object_list + - paddle.distributed.communication.stream.broadcast + - paddle.distributed.communication.stream.gather + - paddle.distributed.communication.stream.recv + - paddle.distributed.communication.stream.reduce + - paddle.distributed.communication.stream.reduce_scatter + - paddle.distributed.communication.stream.scatter + - paddle.distributed.communication.stream.send + - paddle.distributed.all_gather + - paddle.distributed.all_gather_object + - paddle.distributed.all_reduce + - paddle.distributed.alltoall + - paddle.distributed.alltoall_single + - paddle.distributed.broadcast + - paddle.distributed.communication.stream.all_gather + - paddle.distributed.communication.stream.all_reduce + - paddle.distributed.communication.stream.alltoall + - paddle.distributed.fleet.layers.mpu.mp_ops._c_identity + - paddle.distributed.fleet.layers.mpu.mp_ops._c_lookup_table + - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy + - paddle.distributed.fleet.layers.mpu.mp_ops. target_op: - paddle._C_ops.min - paddle._C_ops.min @@ -88,863 +87,863 @@ target_op: - paddle.stack - paddle.zeros - paddle.zeros_like - # - paddle.Tensor.__add__ - # - paddle._C_ops.layer_norm - # - paddle.multiply - # - paddle.multiply_ - # - paddle.Tensor.__mul__ - # - paddle.Tensor.__neg__ - # - paddle.Tensor.add_ - # - paddle._C_ops.adamw - # - paddle._C_ops.adamw_ - # - paddle.square_ - # - paddle.nn.functional.scaled_dot_product_attention - # - paddle._C_ops.layer_norm #noqa - # - paddle.nn.functional.adaptive_avg_pool1d - # - paddle.nn.functional.adaptive_avg_pool2d - # - paddle.nn.functional.adaptive_avg_pool3d - # - paddle.nn.functional.adaptive_max_pool1d - # - paddle.nn.functional.adaptive_max_pool2d - # - paddle.nn.functional.adaptive_max_pool3d - # - paddle.nn.functional.affine_grid - # - paddle.nn.functional.alpha_dropout - # - paddle.nn.functional.avg_pool1d - # - paddle.nn.functional.avg_pool2d - # - paddle.nn.functional.avg_pool3d - # - paddle.nn.functional.batch_norm - # - paddle.nn.functional.bilinear - # - paddle.nn.functional.binary_cross_entropy - # - paddle.nn.functional.binary_cross_entropy_with_logits - # - paddle.nn.functional.celu - # - paddle.nn.functional.channel_shuffle - # - paddle.nn.functional.class_center_sample - # - paddle.nn.functional.common - # - paddle.nn.functional.conv1d - # - paddle.nn.functional.conv1d_transpose - # - paddle.nn.functional.conv2d - # - paddle.nn.functional.conv2d_transpose - # - paddle.nn.functional.conv3d - # - paddle.nn.functional.conv3d_transpose - # - paddle.nn.functional.cosine_embedding_loss - # - paddle.nn.functional.cosine_similarity - # - paddle.nn.functional.cross_entropy - # - paddle.nn.functional.ctc_loss - # - paddle.nn.functional.diag_embed - # - paddle.nn.functional.dice_loss - # - paddle.nn.functional.distance - # - paddle.nn.functional.dropout - # - paddle.nn.functional.dropout2d - # - paddle.nn.functional.dropout3d - # - paddle.nn.functional.elu - # - paddle.nn.functional.elu_ - # - paddle.nn.functional.embedding - # - paddle.nn.functional.extension - # - paddle.nn.functional.flash_attention - # - paddle.nn.functional.flash_attention_with_sparse_mask - # - paddle.nn.functional.fractional_max_pool2d - # - paddle.nn.functional.fractional_max_pool3d - # - paddle.nn.functional.fold - # - paddle.nn.functional.gather_tree - # - paddle.nn.functional.gaussian_nll_loss - # - paddle.nn.functional.gelu - # - paddle.nn.functional.glu - # - paddle.nn.functional.grid_sample - # - paddle.nn.functional.gumbel_softmax - # - paddle.nn.functional.hardshrink - # - paddle.nn.functional.hardsigmoid - # - paddle.nn.functional.hardswish - # - paddle.nn.functional.hardtanh - # - paddle.nn.functional.hardtanh_ - # - paddle.nn.functional.hinge_embedding_loss - # - paddle.nn.functional.hsigmoid_loss - # - paddle.nn.functional.instance_norm - # - paddle.nn.functional.interpolate - # - paddle.nn.functional.kl_div - # - paddle.nn.functional.l1_loss - # - paddle.nn.functional.label_smooth - # - paddle.nn.functional.layer_norm - # - paddle.nn.functional.leaky_relu - # - paddle.nn.functional.leaky_relu_ - # - paddle.nn.functional.linear - # - paddle.nn.functional.local_response_norm - # - paddle.nn.functional.log_loss - # - paddle.nn.functional.log_sigmoid - # - paddle.nn.functional.log_softmax - # - paddle.nn.functional.margin_cross_entropy - # - paddle.nn.functional.margin_ranking_loss - # - paddle.nn.functional.max_pool1d - # - paddle.nn.functional.max_pool2d - # - paddle.nn.functional.max_pool3d - # - paddle.nn.functional.max_unpool1d - # - paddle.nn.functional.max_unpool2d - # - paddle.nn.functional.max_unpool3d - # - paddle.nn.functional.maxout - # - paddle.nn.functional.mish - # - paddle.nn.functional.mse_loss - # - paddle.nn.functional.multi_label_soft_margin_loss - # - paddle.nn.functional.multi_margin_loss - # - paddle.nn.functional.nll_loss - # - paddle.nn.functional.norm - # - paddle.nn.functional.normalize - # - paddle.nn.functional.npair_loss - # - paddle.nn.functional.one_hot - # - paddle.nn.functional.pad - # - paddle.nn.functional.pairwise_distance - # - paddle.nn.functional.pdist - # - paddle.nn.functional.pixel_shuffle - # - paddle.nn.functional.pixel_unshuffle - # - paddle.nn.functional.poisson_nll_loss - # - paddle.nn.functional.pooling - # - paddle.nn.functional.prelu - # - paddle.nn.functional.relu - # - paddle.nn.functional.relu6 - # - paddle.nn.functional.relu_ - # - paddle.nn.functional.rnnt_loss - # - paddle.nn.functional.rrelu - # - paddle.nn.functional.sdp_kernel - # - paddle.nn.functional.selu - # - paddle.nn.functional.sequence_mask - # - paddle.nn.functional.sigmoid - # - paddle.nn.functional.sigmoid_focal_loss - # - paddle.nn.functional.silu - # - paddle.nn.functional.smooth_l1_loss - # - paddle.nn.functional.soft_margin_loss - # - paddle.nn.functional.softmax - # - paddle.nn.functional.softmax_ - # - paddle.nn.functional.softmax_with_cross_entropy - # - paddle.nn.functional.softplus - # - paddle.nn.functional.softshrink - # - paddle.nn.functional.softsign - # - paddle.nn.functional.sparse_attention - # - paddle.nn.functional.square_error_cost - # - paddle.nn.functional.swish - # - paddle.nn.functional.tanh - # - paddle.nn.functional.tanh_ - # - paddle.nn.functional.tanhshrink - # - paddle.nn.functional.temporal_shift - # - paddle.nn.functional.thresholded_relu - # - paddle.nn.functional.thresholded_relu_ - # - paddle.nn.functional.triplet_margin_loss - # - paddle.nn.functional.triplet_margin_with_distance_loss - # - paddle.nn.functional.unfold - # - paddle.nn.functional.upsample - # - paddle.nn.functional.zeropad2d - # - paddle.abs - # - paddle.abs_ - # - paddle.acos - # - paddle.acos_ - # - paddle.acosh - # - paddle.acosh_ - # - paddle.add - # - paddle.add_n - # - paddle.addmm - # - paddle.addmm_ - # - paddle.all - # - paddle.allclose - # - paddle.amax - # - paddle.amin - # - paddle.angle - # - paddle.any - # - paddle.arange - # - paddle.argmax - # - paddle.argmin - # - paddle.argsort - # - paddle.as_complex - # - paddle.as_real - # - paddle.as_strided - # - paddle.asin - # - paddle.asin_ - # - paddle.asinh - # - paddle.asinh_ - # - paddle.assign - # - paddle.atan - # - paddle.atan2 - # - paddle.atan_ - # - paddle.atanh - # - paddle.atanh_ - # - paddle.atleast_1d - # - paddle.atleast_2d - # - paddle.atleast_3d - # - paddle.bernoulli - # - paddle.bincount - # - paddle.binomial - # - paddle.bitwise_and - # - paddle.bitwise_and_ - # - paddle.bitwise_not - # - paddle.bitwise_not_ - # - paddle.bitwise_or - # - paddle.bitwise_or_ - # - paddle.bitwise_xor - # - paddle.bitwise_xor_ - # - paddle.bmm - # - paddle.broadcast_shape - # - paddle.broadcast_tensors - # - paddle.broadcast_to - # - paddle.cauchy_ - # - paddle.cast - # - paddle.cdist - # - paddle.ceil - # - paddle.cholesky - # - paddle.chunk - # - paddle.clip - # - paddle.column_stack - # - paddle.combinations - # - paddle.concat - # - paddle.conj - # - paddle.copysign - # - paddle.copysign_ - # - paddle.cos - # - paddle.cos_ - # - paddle.cosh - # - paddle.cosh_ - # - paddle.count_nonzero - # - paddle.crop - # - paddle.cross - # - paddle.cummax - # - paddle.cummin - # - paddle.cumprod - # - paddle.cumprod_ - # - paddle.cumsum - # - paddle.cumsum_ - # - paddle.cumulative_trapezoid - # - paddle.decomposition - # - paddle.deg2rad - # - paddle.diag - # - paddle.diag_embed - # - paddle.diagflat - # - paddle.diagonal - # - paddle.diagonal_scatter - # - paddle.diff - # - paddle.digamma - # - paddle.digamma_ - # - paddle.divide - # - paddle.divide_ - # - paddle.dot - # - paddle.dsplit - # - paddle.dstack - # - paddle.eigvalsh - # - paddle.einsum - # - paddle.equal - # - paddle.equal_all - # - paddle.erf - # - paddle.erf_ - # - paddle.erfinv - # - paddle.exp - # - paddle.expand - # - paddle.expand_as - # - paddle.expm1 - # - paddle.expm1_ - # - paddle.eye - # - paddle.fft - # - paddle.flatten - # - paddle.flatten_ - # - paddle.flip - # - paddle.floor - # - paddle.floor_divide - # - paddle.floor_divide_ - # - paddle.floor_mod - # - paddle.floor_mod_ - # - paddle.fmax - # - paddle.fmin - # - paddle.frac - # - paddle.frac_ - # - paddle.frexp - # - paddle.full - # - paddle.full_like - # - paddle.gather - # - paddle.gather_nd - # - paddle.gcd - # - paddle.gcd_ - # - paddle.greater_equal - # - paddle.greater_equal_ - # - paddle.greater_than - # - paddle.greater_than_ - # - paddle.heaviside - # - paddle.histogram - # - paddle.histogramdd - # - paddle.hsplit - # - paddle.hstack - # - paddle.hypot - # - paddle.hypot_ - # - paddle.i0 - # - paddle.i0_ - # - paddle.i0e - # - paddle.i1 - # - paddle.i1e - # - paddle.imag - # - paddle.increment - # - paddle.index_add - # - paddle.index_add_ - # - paddle.index_fill - # - paddle.index_fill_ - # - paddle.index_put - # - paddle.index_put_ - # - paddle.index_sample - # - paddle.index_select - # - paddle.inner - # - paddle.kron - # - paddle.kthvalue - # - paddle.lcm - # - paddle.lcm_ - # - paddle.ldexp - # - paddle.ldexp_ - # - paddle.lerp - # - paddle.less_equal - # - paddle.less_equal_ - # - paddle.less_than - # - paddle.less_than_ - # - paddle.lgamma - # - paddle.lgamma_ - # - paddle.linalg - # - paddle.linspace - # - paddle.log - # - paddle.log10 - # - paddle.log10_ - # - paddle.log1p - # - paddle.log1p_ - # - paddle.log2 - # - paddle.log2_ - # - paddle.log_ - # - paddle.logaddexp - # - paddle.logcumsumexp - # - paddle.logical_and - # - paddle.logical_and_ - # - paddle.logical_not - # - paddle.logical_not_ - # - paddle.logical_or - # - paddle.logical_or_ - # - paddle.logical_xor - # - paddle.logical_xor_ - # - paddle.logit - # - paddle.logit_ - # - paddle.logspace - # - paddle.logsumexp - # - paddle.masked_fill - # - paddle.masked_fill_ - # - paddle.masked_scatter - # - paddle.masked_scatter_ - # - paddle.masked_select - # - paddle.matmul - # - paddle.max - # - paddle.maximum - # - paddle.mean - # - paddle.median - # - paddle.meshgrid - # - paddle.min - # - paddle.minimum - # - paddle.mm - # - paddle.mod - # - paddle.mod_ - # - paddle.mode - # - paddle.moveaxis - # - paddle.multigammaln - # - paddle.multigammaln_ - # - paddle.multinomial - # - paddle.multiplex - # - paddle.multiply - # - paddle.multiply_ - # - paddle.mv - # - paddle.nan_to_num - # - paddle.nan_to_num_ - # - paddle.nanmean - # - paddle.nanmedian - # - paddle.nanquantile - # - paddle.nansum - # - paddle.neg - # - paddle.neg_ - # - paddle.nextafter - # - paddle.nonzero - # - paddle.normal - # - paddle.normal_ - # - paddle.not_equal - # - paddle.not_equal_ - # - paddle.numel - # - paddle.outer - # - paddle.pdist - # - paddle.poisson - # - paddle.polar - # - paddle.polygamma - # - paddle.polygamma_ - # - paddle.pow - # - paddle.pow_ - # - paddle.prod - # - paddle.put_along_axis - # - paddle.quantile - # - paddle.rad2deg - # - paddle.rand - # - paddle.randint - # - paddle.randint_like - # - paddle.randn - # - paddle.randperm - # - paddle.reader - # - paddle.real - # - paddle.reciprocal - # - paddle.regularizer - # - paddle.remainder - # - paddle.remainder_ - # - paddle.renorm - # - paddle.renorm_ - # - paddle.repeat_interleave - # - paddle.roll - # - paddle.rot90 - # - paddle.round - # - paddle.row_stack - # - paddle.rsqrt - # - paddle.scale - # - paddle.scatter - # - paddle.scatter_ - # # - paddle.scatter_nd # cause CUDA_ERROR ignored. - # # - paddle.scatter_nd_add - # - paddle.searchsorted - # - paddle.select_scatter - # - paddle.sgn - # - paddle.shard_index - # - paddle.sign - # - paddle.signal - # - paddle.signbit - # - paddle.sin - # - paddle.sin_ - # - paddle.sinh - # - paddle.sinh_ - # - paddle.slice - # # - paddle.slice_scatter - # - paddle.sort - # - paddle.sqrt - # - paddle.square - # - paddle.standard_gamma - # - paddle.standard_normal - # - paddle.stanh - # - paddle.strided_slice - # - paddle.subtract - # - paddle.sum - # - paddle.t - # - paddle.t_ - # - paddle.take - # - paddle.take_along_axis - # - paddle.tan - # - paddle.tan_ - # - paddle.tanh - # - paddle.tanh_ - # - paddle.tensordot - # - paddle.tile - # - paddle.topk - # - paddle.trace - # - paddle.transpose - # - paddle.transpose_ - # - paddle.trapezoid - # - paddle.tril - # - paddle.tril_ - # - paddle.tril_indices - # - paddle.triu - # - paddle.triu_ - # - paddle.triu_indices - # - paddle.trunc - # - paddle.trunc_ - # - paddle.unbind - # - paddle.unflatten - # - paddle.unfold - # - paddle.uniform - # - paddle.unique - # - paddle.unique_consecutive - # - paddle.unstack - # - paddle.vander - # - paddle.var - # - paddle.view - # - paddle.view_as - # - paddle.vsplit - # - paddle.where - # - paddle.where_ - # - paddle.zeros - # - paddle.zeros_like - # - paddle.Tensor.T - # - paddle.Tensor.__add__ - # - paddle.Tensor.__and__ - # - paddle.Tensor.__radd__ - # - paddle.Tensor.__div__ - # - paddle.Tensor.__eq__ - # - paddle.Tensor.__floordiv__ - # - paddle.Tensor.__ge__ - # - paddle.Tensor.__gt__ - # - paddle.Tensor.__le__ - # - paddle.Tensor.__lt__ - # - paddle.Tensor.__matmul__ - # - paddle.Tensor.__mod__ - # - paddle.Tensor.__mul__ - # - paddle.Tensor.__ne__ - # - paddle.Tensor.__neg__ - # - paddle.Tensor.__nonzero__ - # - paddle.Tensor.__or__ - # - paddle.Tensor.__pow__ - # - paddle.Tensor.__radd__ - # - paddle.Tensor.__rdiv__ - # - paddle.Tensor.__rmul__ - # - paddle.Tensor.__rpow__ - # - paddle.Tensor.__rsub__ - # - paddle.Tensor.__rtruediv__ - # - paddle.Tensor.__sub__ - # - paddle.Tensor.__truediv__ - # - paddle.Tensor.__xor__ - # - paddle.Tensor.abs - # - paddle.Tensor.abs_ - # - paddle.Tensor.acos - # - paddle.Tensor.acos_ - # - paddle.Tensor.acosh - # - paddle.Tensor.acosh_ - # - paddle.Tensor.add - # - paddle.Tensor.add_ - # - paddle.Tensor.add_n - # - paddle.Tensor.addmm - # - paddle.Tensor.addmm_ - # - paddle.Tensor.all - # - paddle.Tensor.allclose - # - paddle.Tensor.amax - # - paddle.Tensor.amin - # - paddle.Tensor.angle - # - paddle.Tensor.any - # - paddle.Tensor.argmax - # - paddle.Tensor.argmin - # - paddle.Tensor.argsort - # - paddle.Tensor.as_complex - # - paddle.Tensor.as_real - # - paddle.Tensor.as_strided - # - paddle.Tensor.asin - # - paddle.Tensor.asin_ - # - paddle.Tensor.asinh - # - paddle.Tensor.asinh_ - # - paddle.Tensor.atan - # - paddle.Tensor.atan2 - # - paddle.Tensor.atan_ - # - paddle.Tensor.atanh - # - paddle.Tensor.atanh_ - # - paddle.Tensor.atleast_1d - # - paddle.Tensor.atleast_2d - # - paddle.Tensor.atleast_3d - # - paddle.Tensor.bincount - # - paddle.Tensor.bitwise_and - # - paddle.Tensor.bitwise_and_ - # - paddle.Tensor.bitwise_not - # - paddle.Tensor.bitwise_not_ - # - paddle.Tensor.bitwise_or - # - paddle.Tensor.bitwise_or_ - # - paddle.Tensor.bitwise_xor - # - paddle.Tensor.bitwise_xor_ - # - paddle.Tensor.bmm - # - paddle.Tensor.broadcast_shape - # - paddle.Tensor.broadcast_tensors - # - paddle.Tensor.broadcast_to - # - paddle.Tensor.cauchy_ - # - paddle.Tensor.cdist - # - paddle.Tensor.ceil - # - paddle.Tensor.ceil_ - # - paddle.Tensor.cholesky - # - paddle.Tensor.cholesky_solve - # - paddle.Tensor.clip - # - paddle.Tensor.clip_ - # - paddle.Tensor.coalesce - # - paddle.Tensor.cols - # - paddle.Tensor.combinations - # - paddle.Tensor.concat - # - paddle.Tensor.cond - # - paddle.Tensor.conj - # - paddle.Tensor.contiguous - # - paddle.Tensor.corrcoef - # - paddle.Tensor.cos - # - paddle.Tensor.cos_ - # - paddle.Tensor.cosh - # - paddle.Tensor.cosh_ - # - paddle.Tensor.count_nonzero - # - paddle.Tensor.cov - # - paddle.Tensor.cross - # - paddle.Tensor.crows - # - paddle.Tensor.cummax - # - paddle.Tensor.cummin - # - paddle.Tensor.cumprod - # - paddle.Tensor.cumprod_ - # - paddle.Tensor.cumsum - # - paddle.Tensor.cumsum_ - # - paddle.Tensor.cumulative_trapezoid - # - paddle.Tensor.deg2rad - # - paddle.Tensor.diag - # - paddle.Tensor.diag_embed - # - paddle.Tensor.diagflat - # - paddle.Tensor.diagonal - # - paddle.Tensor.diagonal_scatter - # - paddle.Tensor.diff - # - paddle.Tensor.digamma - # - paddle.Tensor.digamma_ - # - paddle.Tensor.divide - # - paddle.Tensor.divide_ - # - paddle.Tensor.dot - # - paddle.Tensor.eig - # - paddle.Tensor.eigvals - # - paddle.Tensor.eigvalsh - # - paddle.Tensor.equal - # - paddle.Tensor.equal_all - # - paddle.Tensor.erf - # - paddle.Tensor.erfinv - # - paddle.Tensor.erfinv_ - # - paddle.Tensor.exp - # - paddle.Tensor.exp_ - # - paddle.Tensor.expand - # - paddle.Tensor.expand_as - # - paddle.Tensor.expm1 - # - paddle.Tensor.exponential_ - # - paddle.Tensor.fill_ - # - paddle.Tensor.fill_diagonal_ - # - paddle.Tensor.fill_diagonal_tensor - # - paddle.Tensor.fill_diagonal_tensor_ - # - paddle.Tensor.flatten - # - paddle.Tensor.flatten_ - # - paddle.Tensor.flip - # - paddle.Tensor.floor - # - paddle.Tensor.floor_ - # - paddle.Tensor.floor_divide - # - paddle.Tensor.floor_divide_ - # - paddle.Tensor.floor_mod - # - paddle.Tensor.floor_mod_ - # - paddle.Tensor.fmax - # - paddle.Tensor.fmin - # - paddle.Tensor.frac - # - paddle.Tensor.frac_ - # - paddle.Tensor.frexp - # - paddle.Tensor.gather - # - paddle.Tensor.gather_nd - # - paddle.Tensor.gcd - # - paddle.Tensor.gcd_ - # - paddle.Tensor.get_selected_rows - # - paddle.Tensor.get_strides - # - paddle.Tensor.greater_equal - # - paddle.Tensor.greater_equal_ - # - paddle.Tensor.greater_than - # - paddle.Tensor.greater_than_ - # - paddle.Tensor.heaviside - # - paddle.Tensor.histogram - # - paddle.Tensor.histogramdd - # - paddle.Tensor.hsplit - # - paddle.Tensor.hypot - # - paddle.Tensor.hypot_ - # - paddle.Tensor.i0 - # - paddle.Tensor.i0_ - # - paddle.Tensor.i0e - # - paddle.Tensor.i1 - # - paddle.Tensor.i1e - # - paddle.Tensor.imag - # - paddle.Tensor.increment - # - paddle.Tensor.index_add - # - paddle.Tensor.index_add_ - # - paddle.Tensor.index_fill - # - paddle.Tensor.index_fill_ - # - paddle.Tensor.index_put - # - paddle.Tensor.index_put_ - # - paddle.Tensor.index_sample - # - paddle.Tensor.index_select - # - paddle.Tensor.inner - # - paddle.Tensor.kron - # - paddle.Tensor.kthvalue - # - paddle.Tensor.layout - # - paddle.Tensor.lcm - # - paddle.Tensor.lcm_ - # - paddle.Tensor.ldexp - # - paddle.Tensor.ldexp_ - # - paddle.Tensor.lerp - # - paddle.Tensor.lerp_ - # - paddle.Tensor.less_equal - # - paddle.Tensor.less_equal_ - # - paddle.Tensor.less_than - # - paddle.Tensor.less_than_ - # - paddle.Tensor.lgamma - # - paddle.Tensor.lgamma_ - # - paddle.Tensor.log - # - paddle.Tensor.log10 - # - paddle.Tensor.log10_ - # - paddle.Tensor.log1p - # - paddle.Tensor.log1p_ - # - paddle.Tensor.log2 - # - paddle.Tensor.log2_ - # - paddle.Tensor.log_ - # - paddle.Tensor.logaddexp - # - paddle.Tensor.logcumsumexp - # - paddle.Tensor.logical_and - # - paddle.Tensor.logical_and_ - # - paddle.Tensor.logical_not - # - paddle.Tensor.logical_not_ - # - paddle.Tensor.logical_or - # - paddle.Tensor.logical_or_ - # - paddle.Tensor.logical_xor - # - paddle.Tensor.logical_xor_ - # - paddle.Tensor.logit - # - paddle.Tensor.logit_ - # - paddle.Tensor.logsumexp - # - paddle.Tensor.lstsq - # - paddle.Tensor.lu - # - paddle.Tensor.lu_unpack - # - paddle.Tensor.masked_fill - # - paddle.Tensor.masked_fill_ - # - paddle.Tensor.masked_select - # - paddle.Tensor.masked_scatter - # - paddle.Tensor.masked_scatter_ - # - paddle.Tensor.matmul - # - paddle.Tensor.matrix_power - # - paddle.Tensor.max - # - paddle.Tensor.maximum - # - paddle.Tensor.mean - # - paddle.Tensor.median - # - paddle.Tensor.min - # - paddle.Tensor.minimum - # - paddle.Tensor.mm - # - paddle.Tensor.mod - # - paddle.Tensor.mod_ - # - paddle.Tensor.mode - # - paddle.Tensor.moveaxis - # - paddle.Tensor.multi_dot - # - paddle.Tensor.multigammaln - # - paddle.Tensor.multigammaln_ - # - paddle.Tensor.multinomial - # - paddle.Tensor.multiplex - # - paddle.Tensor.multiply - # - paddle.Tensor.multiply_ - # - paddle.Tensor.mv - # - paddle.Tensor.nan_to_num - # - paddle.Tensor.nan_to_num_ - # - paddle.Tensor.nanmean - # - paddle.Tensor.nanmedian - # - paddle.Tensor.nanquantile - # - paddle.Tensor.nansum - # - paddle.Tensor.ndimension - # - paddle.Tensor.neg - # - paddle.Tensor.neg_ - # - paddle.Tensor.nnz - # - paddle.Tensor.nonzero - # - paddle.Tensor.norm - # - paddle.Tensor.normal_ - # - paddle.Tensor.not_equal - # - paddle.Tensor.not_equal_ - # - paddle.Tensor.numel - # - paddle.Tensor.offset - # - paddle.Tensor.outer - # - paddle.Tensor.pca_lowrank - # - paddle.Tensor.pinv - # - paddle.Tensor.polar - # - paddle.Tensor.polygamma - # - paddle.Tensor.polygamma_ - # - paddle.Tensor.pow - # - paddle.Tensor.pow_ - # - paddle.Tensor.process_mesh - # - paddle.Tensor.prod - # - paddle.Tensor.put_along_axis - # - paddle.Tensor.put_along_axis_ - # - paddle.Tensor.qr - # - paddle.Tensor.quantile - # - paddle.Tensor.rad2deg - # - paddle.Tensor.remainder - # - paddle.Tensor.remainder_ - # - paddle.Tensor.renorm - # - paddle.Tensor.renorm_ - # - paddle.Tensor.repeat_interleave - # - paddle.Tensor.reverse - # - paddle.Tensor.roll - # - paddle.Tensor.rot90 - # - paddle.Tensor.round - # - paddle.Tensor.round_ - # - paddle.Tensor.rows - # - paddle.Tensor.rsqrt - # - paddle.Tensor.rsqrt_ - # - paddle.Tensor.scale - # - paddle.Tensor.scale_ - # - paddle.Tensor.scatter - # - paddle.Tensor.scatter_ - # - paddle.Tensor.scatter_nd - # - paddle.Tensor.scatter_nd_add - # - paddle.Tensor.select_scatter - # - paddle.Tensor.sgn - # - paddle.Tensor.shard_index - # - paddle.Tensor.sigmoid - # - paddle.Tensor.sigmoid_ - # - paddle.Tensor.sign - # - paddle.Tensor.sin - # - paddle.Tensor.sin_ - # - paddle.Tensor.sinh - # - paddle.Tensor.sinh_ - # - paddle.Tensor.size - # - paddle.Tensor.slice - # - paddle.Tensor.solve - # - paddle.Tensor.sort - # - paddle.Tensor.split - # - paddle.Tensor.sqrt - # - paddle.Tensor.sqrt_ - # - paddle.Tensor.square - # - paddle.Tensor.stack - # - paddle.Tensor.stanh - # - paddle.Tensor.std - # - paddle.Tensor.stft - # - paddle.Tensor.strided_slice - # - paddle.Tensor.strides - # - paddle.Tensor.subtract - # - paddle.Tensor.subtract_ - # - paddle.Tensor.sum - # - paddle.Tensor.t - # - paddle.Tensor.t_ - # - paddle.Tensor.take - # - paddle.Tensor.take_along_axis - # - paddle.Tensor.tan - # - paddle.Tensor.tan_ - # - paddle.Tensor.tanh - # - paddle.Tensor.tanh_ - # - paddle.Tensor.tensordot - # - paddle.Tensor.tile - # - paddle.Tensor.top_p_sampling - # - paddle.Tensor.topk - # - paddle.Tensor.trace - # - paddle.Tensor.transpose - # - paddle.Tensor.transpose_ - # - paddle.Tensor.trapezoid - # - paddle.Tensor.tril - # - paddle.Tensor.tril_ - # - paddle.Tensor.triu - # - paddle.Tensor.triu_ - # - paddle.Tensor.trunc - # - paddle.Tensor.trunc_ - # - paddle.Tensor.unbind - # - paddle.Tensor.unflatten - # - paddle.Tensor.unfold - # - paddle.Tensor.uniform_ - # - paddle.Tensor.unique - # - paddle.Tensor.unique_consecutive - # - paddle.Tensor.unstack - # - paddle.Tensor.vander - # - paddle.Tensor.var - # - paddle.Tensor.view - # - paddle.Tensor.view_as - # - paddle.Tensor.vsplit - # - paddle.Tensor.where - # - paddle.Tensor.where_ - # - paddle._C_ops.fused_gemm_epilogue - # - paddle.optimizer.Adam - # - paddle.optimizer.AdamW - # - paddle._C_ops.adamw - # - paddle._C_ops.adamw_ - # - paddle._legacy_C_ops.fused_gemm_epilogue - # - paddle.incubate.nn.functional.fused_multi_head_attention - # - paddle.incubate.nn.functional.fused_feedforward - # - paddle.incubate.nn.functional.fused_multi_transformer - # - paddle.incubate.nn.functional.fused_linear - # - paddle.incubate.nn.functional.fused_linear_activation - # - paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm - # - paddle.incubate.nn.functional.fused_ec_moe - # - paddle.incubate.nn.functional.fused_dropout_add - # - paddle.incubate.nn.functional.fused_rotary_position_embedding - # - paddle.incubate.nn.functional.variable_length_memory_efficient_attention - # - paddle.incubate.nn.functional.fused_rms_norm - # - paddle.incubate.nn.functional.fused_layer_norm - # - paddle.incubate.nn.functional.masked_multihead_attention - # - paddle.incubate.nn.functional.block_multihead_attention - # - paddle.incubate.nn.functional.swiglu - # - paddle.incubate.nn.functional.fused_matmul_bias - # - paddle.tensor.fill_constant - # - paddle.nn.clip._squared_l2_norm - # - paddle.uniform - # - paddle._C_ops.gaussian - # - paddle._legacy_C_ops.c_identity + - paddle.Tensor.__add__ + - paddle._C_ops.layer_norm + - paddle.multiply + - paddle.multiply_ + - paddle.Tensor.__mul__ + - paddle.Tensor.__neg__ + - paddle.Tensor.add_ + - paddle._C_ops.adamw + - paddle._C_ops.adamw_ + - paddle.square_ + - paddle.nn.functional.scaled_dot_product_attention + - paddle._C_ops.layer_norm #noqa + - paddle.nn.functional.adaptive_avg_pool1d + - paddle.nn.functional.adaptive_avg_pool2d + - paddle.nn.functional.adaptive_avg_pool3d + - paddle.nn.functional.adaptive_max_pool1d + - paddle.nn.functional.adaptive_max_pool2d + - paddle.nn.functional.adaptive_max_pool3d + - paddle.nn.functional.affine_grid + - paddle.nn.functional.alpha_dropout + - paddle.nn.functional.avg_pool1d + - paddle.nn.functional.avg_pool2d + - paddle.nn.functional.avg_pool3d + - paddle.nn.functional.batch_norm + - paddle.nn.functional.bilinear + - paddle.nn.functional.binary_cross_entropy + - paddle.nn.functional.binary_cross_entropy_with_logits + - paddle.nn.functional.celu + - paddle.nn.functional.channel_shuffle + - paddle.nn.functional.class_center_sample + - paddle.nn.functional.common + - paddle.nn.functional.conv1d + - paddle.nn.functional.conv1d_transpose + - paddle.nn.functional.conv2d + - paddle.nn.functional.conv2d_transpose + - paddle.nn.functional.conv3d + - paddle.nn.functional.conv3d_transpose + - paddle.nn.functional.cosine_embedding_loss + - paddle.nn.functional.cosine_similarity + - paddle.nn.functional.cross_entropy + - paddle.nn.functional.ctc_loss + - paddle.nn.functional.diag_embed + - paddle.nn.functional.dice_loss + - paddle.nn.functional.distance + - paddle.nn.functional.dropout + - paddle.nn.functional.dropout2d + - paddle.nn.functional.dropout3d + - paddle.nn.functional.elu + - paddle.nn.functional.elu_ + - paddle.nn.functional.embedding + - paddle.nn.functional.extension + - paddle.nn.functional.flash_attention + - paddle.nn.functional.flash_attention_with_sparse_mask + - paddle.nn.functional.fractional_max_pool2d + - paddle.nn.functional.fractional_max_pool3d + - paddle.nn.functional.fold + - paddle.nn.functional.gather_tree + - paddle.nn.functional.gaussian_nll_loss + - paddle.nn.functional.gelu + - paddle.nn.functional.glu + - paddle.nn.functional.grid_sample + - paddle.nn.functional.gumbel_softmax + - paddle.nn.functional.hardshrink + - paddle.nn.functional.hardsigmoid + - paddle.nn.functional.hardswish + - paddle.nn.functional.hardtanh + - paddle.nn.functional.hardtanh_ + - paddle.nn.functional.hinge_embedding_loss + - paddle.nn.functional.hsigmoid_loss + - paddle.nn.functional.instance_norm + - paddle.nn.functional.interpolate + - paddle.nn.functional.kl_div + - paddle.nn.functional.l1_loss + - paddle.nn.functional.label_smooth + - paddle.nn.functional.layer_norm + - paddle.nn.functional.leaky_relu + - paddle.nn.functional.leaky_relu_ + - paddle.nn.functional.linear + - paddle.nn.functional.local_response_norm + - paddle.nn.functional.log_loss + - paddle.nn.functional.log_sigmoid + - paddle.nn.functional.log_softmax + - paddle.nn.functional.margin_cross_entropy + - paddle.nn.functional.margin_ranking_loss + - paddle.nn.functional.max_pool1d + - paddle.nn.functional.max_pool2d + - paddle.nn.functional.max_pool3d + - paddle.nn.functional.max_unpool1d + - paddle.nn.functional.max_unpool2d + - paddle.nn.functional.max_unpool3d + - paddle.nn.functional.maxout + - paddle.nn.functional.mish + - paddle.nn.functional.mse_loss + - paddle.nn.functional.multi_label_soft_margin_loss + - paddle.nn.functional.multi_margin_loss + - paddle.nn.functional.nll_loss + - paddle.nn.functional.norm + - paddle.nn.functional.normalize + - paddle.nn.functional.npair_loss + - paddle.nn.functional.one_hot + - paddle.nn.functional.pad + - paddle.nn.functional.pairwise_distance + - paddle.nn.functional.pdist + - paddle.nn.functional.pixel_shuffle + - paddle.nn.functional.pixel_unshuffle + - paddle.nn.functional.poisson_nll_loss + - paddle.nn.functional.pooling + - paddle.nn.functional.prelu + - paddle.nn.functional.relu + - paddle.nn.functional.relu6 + - paddle.nn.functional.relu_ + - paddle.nn.functional.rnnt_loss + - paddle.nn.functional.rrelu + - paddle.nn.functional.sdp_kernel + - paddle.nn.functional.selu + - paddle.nn.functional.sequence_mask + - paddle.nn.functional.sigmoid + - paddle.nn.functional.sigmoid_focal_loss + - paddle.nn.functional.silu + - paddle.nn.functional.smooth_l1_loss + - paddle.nn.functional.soft_margin_loss + - paddle.nn.functional.softmax + - paddle.nn.functional.softmax_ + - paddle.nn.functional.softmax_with_cross_entropy + - paddle.nn.functional.softplus + - paddle.nn.functional.softshrink + - paddle.nn.functional.softsign + - paddle.nn.functional.sparse_attention + - paddle.nn.functional.square_error_cost + - paddle.nn.functional.swish + - paddle.nn.functional.tanh + - paddle.nn.functional.tanh_ + - paddle.nn.functional.tanhshrink + - paddle.nn.functional.temporal_shift + - paddle.nn.functional.thresholded_relu + - paddle.nn.functional.thresholded_relu_ + - paddle.nn.functional.triplet_margin_loss + - paddle.nn.functional.triplet_margin_with_distance_loss + - paddle.nn.functional.unfold + - paddle.nn.functional.upsample + - paddle.nn.functional.zeropad2d + - paddle.abs + - paddle.abs_ + - paddle.acos + - paddle.acos_ + - paddle.acosh + - paddle.acosh_ + - paddle.add + - paddle.add_n + - paddle.addmm + - paddle.addmm_ + - paddle.all + - paddle.allclose + - paddle.amax + - paddle.amin + - paddle.angle + - paddle.any + - paddle.arange + - paddle.argmax + - paddle.argmin + - paddle.argsort + - paddle.as_complex + - paddle.as_real + - paddle.as_strided + - paddle.asin + - paddle.asin_ + - paddle.asinh + - paddle.asinh_ + - paddle.assign + - paddle.atan + - paddle.atan2 + - paddle.atan_ + - paddle.atanh + - paddle.atanh_ + - paddle.atleast_1d + - paddle.atleast_2d + - paddle.atleast_3d + - paddle.bernoulli + - paddle.bincount + - paddle.binomial + - paddle.bitwise_and + - paddle.bitwise_and_ + - paddle.bitwise_not + - paddle.bitwise_not_ + - paddle.bitwise_or + - paddle.bitwise_or_ + - paddle.bitwise_xor + - paddle.bitwise_xor_ + - paddle.bmm + - paddle.broadcast_shape + - paddle.broadcast_tensors + - paddle.broadcast_to + - paddle.cauchy_ + - paddle.cast + - paddle.cdist + - paddle.ceil + - paddle.cholesky + - paddle.chunk + - paddle.clip + - paddle.column_stack + - paddle.combinations + - paddle.concat + - paddle.conj + - paddle.copysign + - paddle.copysign_ + - paddle.cos + - paddle.cos_ + - paddle.cosh + - paddle.cosh_ + - paddle.count_nonzero + - paddle.crop + - paddle.cross + - paddle.cummax + - paddle.cummin + - paddle.cumprod + - paddle.cumprod_ + - paddle.cumsum + - paddle.cumsum_ + - paddle.cumulative_trapezoid + - paddle.decomposition + - paddle.deg2rad + - paddle.diag + - paddle.diag_embed + - paddle.diagflat + - paddle.diagonal + - paddle.diagonal_scatter + - paddle.diff + - paddle.digamma + - paddle.digamma_ + - paddle.divide + - paddle.divide_ + - paddle.dot + - paddle.dsplit + - paddle.dstack + - paddle.eigvalsh + - paddle.einsum + - paddle.equal + - paddle.equal_all + - paddle.erf + - paddle.erf_ + - paddle.erfinv + - paddle.exp + - paddle.expand + - paddle.expand_as + - paddle.expm1 + - paddle.expm1_ + - paddle.eye + - paddle.fft + - paddle.flatten + - paddle.flatten_ + - paddle.flip + - paddle.floor + - paddle.floor_divide + - paddle.floor_divide_ + - paddle.floor_mod + - paddle.floor_mod_ + - paddle.fmax + - paddle.fmin + - paddle.frac + - paddle.frac_ + - paddle.frexp + - paddle.full + - paddle.full_like + - paddle.gather + - paddle.gather_nd + - paddle.gcd + - paddle.gcd_ + - paddle.greater_equal + - paddle.greater_equal_ + - paddle.greater_than + - paddle.greater_than_ + - paddle.heaviside + - paddle.histogram + - paddle.histogramdd + - paddle.hsplit + - paddle.hstack + - paddle.hypot + - paddle.hypot_ + - paddle.i0 + - paddle.i0_ + - paddle.i0e + - paddle.i1 + - paddle.i1e + - paddle.imag + - paddle.increment + - paddle.index_add + - paddle.index_add_ + - paddle.index_fill + - paddle.index_fill_ + - paddle.index_put + - paddle.index_put_ + - paddle.index_sample + - paddle.index_select + - paddle.inner + - paddle.kron + - paddle.kthvalue + - paddle.lcm + - paddle.lcm_ + - paddle.ldexp + - paddle.ldexp_ + - paddle.lerp + - paddle.less_equal + - paddle.less_equal_ + - paddle.less_than + - paddle.less_than_ + - paddle.lgamma + - paddle.lgamma_ + - paddle.linalg + - paddle.linspace + - paddle.log + - paddle.log10 + - paddle.log10_ + - paddle.log1p + - paddle.log1p_ + - paddle.log2 + - paddle.log2_ + - paddle.log_ + - paddle.logaddexp + - paddle.logcumsumexp + - paddle.logical_and + - paddle.logical_and_ + - paddle.logical_not + - paddle.logical_not_ + - paddle.logical_or + - paddle.logical_or_ + - paddle.logical_xor + - paddle.logical_xor_ + - paddle.logit + - paddle.logit_ + - paddle.logspace + - paddle.logsumexp + - paddle.masked_fill + - paddle.masked_fill_ + - paddle.masked_scatter + - paddle.masked_scatter_ + - paddle.masked_select + - paddle.matmul + - paddle.max + - paddle.maximum + - paddle.mean + - paddle.median + - paddle.meshgrid + - paddle.min + - paddle.minimum + - paddle.mm + - paddle.mod + - paddle.mod_ + - paddle.mode + - paddle.moveaxis + - paddle.multigammaln + - paddle.multigammaln_ + - paddle.multinomial + - paddle.multiplex + - paddle.multiply + - paddle.multiply_ + - paddle.mv + - paddle.nan_to_num + - paddle.nan_to_num_ + - paddle.nanmean + - paddle.nanmedian + - paddle.nanquantile + - paddle.nansum + - paddle.neg + - paddle.neg_ + - paddle.nextafter + - paddle.nonzero + - paddle.normal + - paddle.normal_ + - paddle.not_equal + - paddle.not_equal_ + - paddle.numel + - paddle.outer + - paddle.pdist + - paddle.poisson + - paddle.polar + - paddle.polygamma + - paddle.polygamma_ + - paddle.pow + - paddle.pow_ + - paddle.prod + - paddle.put_along_axis + - paddle.quantile + - paddle.rad2deg + - paddle.rand + - paddle.randint + - paddle.randint_like + - paddle.randn + - paddle.randperm + - paddle.reader + - paddle.real + - paddle.reciprocal + - paddle.regularizer + - paddle.remainder + - paddle.remainder_ + - paddle.renorm + - paddle.renorm_ + - paddle.repeat_interleave + - paddle.roll + - paddle.rot90 + - paddle.round + - paddle.row_stack + - paddle.rsqrt + - paddle.scale + - paddle.scatter + - paddle.scatter_ + # - paddle.scatter_nd # cause CUDA_ERROR ignored. + # - paddle.scatter_nd_add + - paddle.searchsorted + - paddle.select_scatter + - paddle.sgn + - paddle.shard_index + - paddle.sign + - paddle.signal + - paddle.signbit + - paddle.sin + - paddle.sin_ + - paddle.sinh + - paddle.sinh_ + - paddle.slice + # - paddle.slice_scatter + - paddle.sort + - paddle.sqrt + - paddle.square + - paddle.standard_gamma + - paddle.standard_normal + - paddle.stanh + - paddle.strided_slice + - paddle.subtract + - paddle.sum + - paddle.t + - paddle.t_ + - paddle.take + - paddle.take_along_axis + - paddle.tan + - paddle.tan_ + - paddle.tanh + - paddle.tanh_ + - paddle.tensordot + - paddle.tile + - paddle.topk + - paddle.trace + - paddle.transpose + - paddle.transpose_ + - paddle.trapezoid + - paddle.tril + - paddle.tril_ + - paddle.tril_indices + - paddle.triu + - paddle.triu_ + - paddle.triu_indices + - paddle.trunc + - paddle.trunc_ + - paddle.unbind + - paddle.unflatten + - paddle.unfold + - paddle.uniform + - paddle.unique + - paddle.unique_consecutive + - paddle.unstack + - paddle.vander + - paddle.var + - paddle.view + - paddle.view_as + - paddle.vsplit + - paddle.where + - paddle.where_ + - paddle.zeros + - paddle.zeros_like + - paddle.Tensor.T + - paddle.Tensor.__add__ + - paddle.Tensor.__and__ + - paddle.Tensor.__radd__ + - paddle.Tensor.__div__ + - paddle.Tensor.__eq__ + - paddle.Tensor.__floordiv__ + - paddle.Tensor.__ge__ + - paddle.Tensor.__gt__ + - paddle.Tensor.__le__ + - paddle.Tensor.__lt__ + - paddle.Tensor.__matmul__ + - paddle.Tensor.__mod__ + - paddle.Tensor.__mul__ + - paddle.Tensor.__ne__ + - paddle.Tensor.__neg__ + - paddle.Tensor.__nonzero__ + - paddle.Tensor.__or__ + - paddle.Tensor.__pow__ + - paddle.Tensor.__radd__ + - paddle.Tensor.__rdiv__ + - paddle.Tensor.__rmul__ + - paddle.Tensor.__rpow__ + - paddle.Tensor.__rsub__ + - paddle.Tensor.__rtruediv__ + - paddle.Tensor.__sub__ + - paddle.Tensor.__truediv__ + - paddle.Tensor.__xor__ + - paddle.Tensor.abs + - paddle.Tensor.abs_ + - paddle.Tensor.acos + - paddle.Tensor.acos_ + - paddle.Tensor.acosh + - paddle.Tensor.acosh_ + - paddle.Tensor.add + - paddle.Tensor.add_ + - paddle.Tensor.add_n + - paddle.Tensor.addmm + - paddle.Tensor.addmm_ + - paddle.Tensor.all + - paddle.Tensor.allclose + - paddle.Tensor.amax + - paddle.Tensor.amin + - paddle.Tensor.angle + - paddle.Tensor.any + - paddle.Tensor.argmax + - paddle.Tensor.argmin + - paddle.Tensor.argsort + - paddle.Tensor.as_complex + - paddle.Tensor.as_real + - paddle.Tensor.as_strided + - paddle.Tensor.asin + - paddle.Tensor.asin_ + - paddle.Tensor.asinh + - paddle.Tensor.asinh_ + - paddle.Tensor.atan + - paddle.Tensor.atan2 + - paddle.Tensor.atan_ + - paddle.Tensor.atanh + - paddle.Tensor.atanh_ + - paddle.Tensor.atleast_1d + - paddle.Tensor.atleast_2d + - paddle.Tensor.atleast_3d + - paddle.Tensor.bincount + - paddle.Tensor.bitwise_and + - paddle.Tensor.bitwise_and_ + - paddle.Tensor.bitwise_not + - paddle.Tensor.bitwise_not_ + - paddle.Tensor.bitwise_or + - paddle.Tensor.bitwise_or_ + - paddle.Tensor.bitwise_xor + - paddle.Tensor.bitwise_xor_ + - paddle.Tensor.bmm + - paddle.Tensor.broadcast_shape + - paddle.Tensor.broadcast_tensors + - paddle.Tensor.broadcast_to + - paddle.Tensor.cauchy_ + - paddle.Tensor.cdist + - paddle.Tensor.ceil + - paddle.Tensor.ceil_ + - paddle.Tensor.cholesky + - paddle.Tensor.cholesky_solve + - paddle.Tensor.clip + - paddle.Tensor.clip_ + - paddle.Tensor.coalesce + - paddle.Tensor.cols + - paddle.Tensor.combinations + - paddle.Tensor.concat + - paddle.Tensor.cond + - paddle.Tensor.conj + - paddle.Tensor.contiguous + - paddle.Tensor.corrcoef + - paddle.Tensor.cos + - paddle.Tensor.cos_ + - paddle.Tensor.cosh + - paddle.Tensor.cosh_ + - paddle.Tensor.count_nonzero + - paddle.Tensor.cov + - paddle.Tensor.cross + - paddle.Tensor.crows + - paddle.Tensor.cummax + - paddle.Tensor.cummin + - paddle.Tensor.cumprod + - paddle.Tensor.cumprod_ + - paddle.Tensor.cumsum + - paddle.Tensor.cumsum_ + - paddle.Tensor.cumulative_trapezoid + - paddle.Tensor.deg2rad + - paddle.Tensor.diag + - paddle.Tensor.diag_embed + - paddle.Tensor.diagflat + - paddle.Tensor.diagonal + - paddle.Tensor.diagonal_scatter + - paddle.Tensor.diff + - paddle.Tensor.digamma + - paddle.Tensor.digamma_ + - paddle.Tensor.divide + - paddle.Tensor.divide_ + - paddle.Tensor.dot + - paddle.Tensor.eig + - paddle.Tensor.eigvals + - paddle.Tensor.eigvalsh + - paddle.Tensor.equal + - paddle.Tensor.equal_all + - paddle.Tensor.erf + - paddle.Tensor.erfinv + - paddle.Tensor.erfinv_ + - paddle.Tensor.exp + - paddle.Tensor.exp_ + - paddle.Tensor.expand + - paddle.Tensor.expand_as + - paddle.Tensor.expm1 + - paddle.Tensor.exponential_ + - paddle.Tensor.fill_ + - paddle.Tensor.fill_diagonal_ + - paddle.Tensor.fill_diagonal_tensor + - paddle.Tensor.fill_diagonal_tensor_ + - paddle.Tensor.flatten + - paddle.Tensor.flatten_ + - paddle.Tensor.flip + - paddle.Tensor.floor + - paddle.Tensor.floor_ + - paddle.Tensor.floor_divide + - paddle.Tensor.floor_divide_ + - paddle.Tensor.floor_mod + - paddle.Tensor.floor_mod_ + - paddle.Tensor.fmax + - paddle.Tensor.fmin + - paddle.Tensor.frac + - paddle.Tensor.frac_ + - paddle.Tensor.frexp + - paddle.Tensor.gather + - paddle.Tensor.gather_nd + - paddle.Tensor.gcd + - paddle.Tensor.gcd_ + - paddle.Tensor.get_selected_rows + - paddle.Tensor.get_strides + - paddle.Tensor.greater_equal + - paddle.Tensor.greater_equal_ + - paddle.Tensor.greater_than + - paddle.Tensor.greater_than_ + - paddle.Tensor.heaviside + - paddle.Tensor.histogram + - paddle.Tensor.histogramdd + - paddle.Tensor.hsplit + - paddle.Tensor.hypot + - paddle.Tensor.hypot_ + - paddle.Tensor.i0 + - paddle.Tensor.i0_ + - paddle.Tensor.i0e + - paddle.Tensor.i1 + - paddle.Tensor.i1e + - paddle.Tensor.imag + - paddle.Tensor.increment + - paddle.Tensor.index_add + - paddle.Tensor.index_add_ + - paddle.Tensor.index_fill + - paddle.Tensor.index_fill_ + - paddle.Tensor.index_put + - paddle.Tensor.index_put_ + - paddle.Tensor.index_sample + - paddle.Tensor.index_select + - paddle.Tensor.inner + - paddle.Tensor.kron + - paddle.Tensor.kthvalue + - paddle.Tensor.layout + - paddle.Tensor.lcm + - paddle.Tensor.lcm_ + - paddle.Tensor.ldexp + - paddle.Tensor.ldexp_ + - paddle.Tensor.lerp + - paddle.Tensor.lerp_ + - paddle.Tensor.less_equal + - paddle.Tensor.less_equal_ + - paddle.Tensor.less_than + - paddle.Tensor.less_than_ + - paddle.Tensor.lgamma + - paddle.Tensor.lgamma_ + - paddle.Tensor.log + - paddle.Tensor.log10 + - paddle.Tensor.log10_ + - paddle.Tensor.log1p + - paddle.Tensor.log1p_ + - paddle.Tensor.log2 + - paddle.Tensor.log2_ + - paddle.Tensor.log_ + - paddle.Tensor.logaddexp + - paddle.Tensor.logcumsumexp + - paddle.Tensor.logical_and + - paddle.Tensor.logical_and_ + - paddle.Tensor.logical_not + - paddle.Tensor.logical_not_ + - paddle.Tensor.logical_or + - paddle.Tensor.logical_or_ + - paddle.Tensor.logical_xor + - paddle.Tensor.logical_xor_ + - paddle.Tensor.logit + - paddle.Tensor.logit_ + - paddle.Tensor.logsumexp + - paddle.Tensor.lstsq + - paddle.Tensor.lu + - paddle.Tensor.lu_unpack + - paddle.Tensor.masked_fill + - paddle.Tensor.masked_fill_ + - paddle.Tensor.masked_select + - paddle.Tensor.masked_scatter + - paddle.Tensor.masked_scatter_ + - paddle.Tensor.matmul + - paddle.Tensor.matrix_power + - paddle.Tensor.max + - paddle.Tensor.maximum + - paddle.Tensor.mean + - paddle.Tensor.median + - paddle.Tensor.min + - paddle.Tensor.minimum + - paddle.Tensor.mm + - paddle.Tensor.mod + - paddle.Tensor.mod_ + - paddle.Tensor.mode + - paddle.Tensor.moveaxis + - paddle.Tensor.multi_dot + - paddle.Tensor.multigammaln + - paddle.Tensor.multigammaln_ + - paddle.Tensor.multinomial + - paddle.Tensor.multiplex + - paddle.Tensor.multiply + - paddle.Tensor.multiply_ + - paddle.Tensor.mv + - paddle.Tensor.nan_to_num + - paddle.Tensor.nan_to_num_ + - paddle.Tensor.nanmean + - paddle.Tensor.nanmedian + - paddle.Tensor.nanquantile + - paddle.Tensor.nansum + - paddle.Tensor.ndimension + - paddle.Tensor.neg + - paddle.Tensor.neg_ + - paddle.Tensor.nnz + - paddle.Tensor.nonzero + - paddle.Tensor.norm + - paddle.Tensor.normal_ + - paddle.Tensor.not_equal + - paddle.Tensor.not_equal_ + - paddle.Tensor.numel + - paddle.Tensor.offset + - paddle.Tensor.outer + - paddle.Tensor.pca_lowrank + - paddle.Tensor.pinv + - paddle.Tensor.polar + - paddle.Tensor.polygamma + - paddle.Tensor.polygamma_ + - paddle.Tensor.pow + - paddle.Tensor.pow_ + - paddle.Tensor.process_mesh + - paddle.Tensor.prod + - paddle.Tensor.put_along_axis + - paddle.Tensor.put_along_axis_ + - paddle.Tensor.qr + - paddle.Tensor.quantile + - paddle.Tensor.rad2deg + - paddle.Tensor.remainder + - paddle.Tensor.remainder_ + - paddle.Tensor.renorm + - paddle.Tensor.renorm_ + - paddle.Tensor.repeat_interleave + - paddle.Tensor.reverse + - paddle.Tensor.roll + - paddle.Tensor.rot90 + - paddle.Tensor.round + - paddle.Tensor.round_ + - paddle.Tensor.rows + - paddle.Tensor.rsqrt + - paddle.Tensor.rsqrt_ + - paddle.Tensor.scale + - paddle.Tensor.scale_ + - paddle.Tensor.scatter + - paddle.Tensor.scatter_ + - paddle.Tensor.scatter_nd + - paddle.Tensor.scatter_nd_add + - paddle.Tensor.select_scatter + - paddle.Tensor.sgn + - paddle.Tensor.shard_index + - paddle.Tensor.sigmoid + - paddle.Tensor.sigmoid_ + - paddle.Tensor.sign + - paddle.Tensor.sin + - paddle.Tensor.sin_ + - paddle.Tensor.sinh + - paddle.Tensor.sinh_ + - paddle.Tensor.size + - paddle.Tensor.slice + - paddle.Tensor.solve + - paddle.Tensor.sort + - paddle.Tensor.split + - paddle.Tensor.sqrt + - paddle.Tensor.sqrt_ + - paddle.Tensor.square + - paddle.Tensor.stack + - paddle.Tensor.stanh + - paddle.Tensor.std + - paddle.Tensor.stft + - paddle.Tensor.strided_slice + - paddle.Tensor.strides + - paddle.Tensor.subtract + - paddle.Tensor.subtract_ + - paddle.Tensor.sum + - paddle.Tensor.t + - paddle.Tensor.t_ + - paddle.Tensor.take + - paddle.Tensor.take_along_axis + - paddle.Tensor.tan + - paddle.Tensor.tan_ + - paddle.Tensor.tanh + - paddle.Tensor.tanh_ + - paddle.Tensor.tensordot + - paddle.Tensor.tile + - paddle.Tensor.top_p_sampling + - paddle.Tensor.topk + - paddle.Tensor.trace + - paddle.Tensor.transpose + - paddle.Tensor.transpose_ + - paddle.Tensor.trapezoid + - paddle.Tensor.tril + - paddle.Tensor.tril_ + - paddle.Tensor.triu + - paddle.Tensor.triu_ + - paddle.Tensor.trunc + - paddle.Tensor.trunc_ + - paddle.Tensor.unbind + - paddle.Tensor.unflatten + - paddle.Tensor.unfold + - paddle.Tensor.uniform_ + - paddle.Tensor.unique + - paddle.Tensor.unique_consecutive + - paddle.Tensor.unstack + - paddle.Tensor.vander + - paddle.Tensor.var + - paddle.Tensor.view + - paddle.Tensor.view_as + - paddle.Tensor.vsplit + - paddle.Tensor.where + - paddle.Tensor.where_ + - paddle._C_ops.fused_gemm_epilogue + - paddle.optimizer.Adam + - paddle.optimizer.AdamW + - paddle._C_ops.adamw + - paddle._C_ops.adamw_ + - paddle._legacy_C_ops.fused_gemm_epilogue + - paddle.incubate.nn.functional.fused_multi_head_attention + - paddle.incubate.nn.functional.fused_feedforward + - paddle.incubate.nn.functional.fused_multi_transformer + - paddle.incubate.nn.functional.fused_linear + - paddle.incubate.nn.functional.fused_linear_activation + - paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm + - paddle.incubate.nn.functional.fused_ec_moe + - paddle.incubate.nn.functional.fused_dropout_add + - paddle.incubate.nn.functional.fused_rotary_position_embedding + - paddle.incubate.nn.functional.variable_length_memory_efficient_attention + - paddle.incubate.nn.functional.fused_rms_norm + - paddle.incubate.nn.functional.fused_layer_norm + - paddle.incubate.nn.functional.masked_multihead_attention + - paddle.incubate.nn.functional.block_multihead_attention + - paddle.incubate.nn.functional.swiglu + - paddle.incubate.nn.functional.fused_matmul_bias + - paddle.tensor.fill_constant + - paddle.nn.clip._squared_l2_norm + - paddle.uniform + - paddle._C_ops.gaussian + - paddle._legacy_C_ops.c_identity #fusion_ops: - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm diff --git a/paddleapex/api_tracer/configs/tool_config.yaml b/paddleapex/api_tracer/configs/tool_config.yaml index e83cedb..07e8f37 100644 --- a/paddleapex/api_tracer/configs/tool_config.yaml +++ b/paddleapex/api_tracer/configs/tool_config.yaml @@ -17,7 +17,7 @@ dump_mode: "real_data" profile_mode: True # target_step is a list, dump api function will turn on at the specific step -target_step: [0] +target_step: [5] # Remove duplicate apis from dump_info and keep only one api in the same value range. dump_unique: False diff --git a/paddleapex/api_tracer/wrap_op/OPTemplate.py b/paddleapex/api_tracer/wrap_op/OPTemplate.py index 9a4258a..97049a5 100644 --- a/paddleapex/api_tracer/wrap_op/OPTemplate.py +++ b/paddleapex/api_tracer/wrap_op/OPTemplate.py @@ -102,7 +102,6 @@ def hijack_call(self, *args, **kwargs): return output - class OPTemplate: def __init__(self, op_name): self.op_name_ = op_name From d9f6a17455ff620cd9f0d4b4ed819b5197c6353f Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Tue, 21 Jan 2025 15:22:35 +0800 Subject: [PATCH 19/22] fix some bugs, and support mmdit class --- paddleapex/apex/run_llama10b_xpu.sh | 61 + paddleapex/apex/run_paddle.py | 14 +- paddleapex/apex/train_mmdit_xpu.sh | 117 ++ paddleapex/apex/utils/data_generate.py | 12 +- paddleapex/api_tracer/api_info.py | 12 + paddleapex/api_tracer/configs/op_target.yaml | 1804 +++++++++--------- paddleapex/api_tracer/wrap_op/OPTemplate.py | 8 +- paddleapex/api_tracer/wrap_op/hijack_tool.py | 9 + 8 files changed, 1133 insertions(+), 904 deletions(-) create mode 100644 paddleapex/apex/run_llama10b_xpu.sh create mode 100644 paddleapex/apex/train_mmdit_xpu.sh diff --git a/paddleapex/apex/run_llama10b_xpu.sh b/paddleapex/apex/run_llama10b_xpu.sh new file mode 100644 index 0000000..875b1a4 --- /dev/null +++ b/paddleapex/apex/run_llama10b_xpu.sh @@ -0,0 +1,61 @@ +task_name_or_path="llama-10b" +export XPU_FORCE_USERMODE_LAUNCH=1 +export PYTHONPATH=$PYTHONPATH:/work/APEX/PaddleAPEX:/work/APEX/PaddleNLP +export XPUAPI_DEBUG=0x1 +runtime_location=/workspace/so-runtime +bkcl_location=/workspace/so-bkcl +export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:$LD_LIBRARY_PATH + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 + +export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_FORCE_TREE=1 +export BKCL_TREE_THRESHOLD=0 +export BKCL_FORCE_L3_RDMA=0 +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +echo "bkcl version:" +strings ${bkcl_location}/libbkcl.so | grep COM +master_ip=$POD_0_IP +nnodes=$PADDLE_TRAINERS_NUM +echo "master ip:" +echo $master_ip + +export CUDA_DEVICE_MAX_CONNECTIONS=8 + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +PaddleNLP_DIR=$(pwd) +echo "PaddleNLP_DIR: "$PaddleNLP_DIR + +export USING_LAYERNORM=1 +export USING_GQA_NEOX=1 +export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 + +export BKCL_USE_AR=1 +export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +export USING_LOGITS_PRINT=1 +export LOGITS_PRINT_INTERVAL=1 +export XPU_PADDLE_FC_LOCAL_INT16=1 + + +python -u -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_paddle.py \ + -json \ + "/work/APEX/PaddleNLP/dump_info/rank0_step5/forward_rank0_all.json /work/APEX/PaddleNLP/dump_info/rank1_step5/forward_rank1_all.json /work/APEX/PaddleNLP/dump_info/rank2_step5/forward_rank2_all.json /work/APEX/PaddleNLP/dump_info/rank3_step5/forward_rank3_all.json /work/APEX/PaddleNLP/dump_info/rank4_step5/forward_rank4_all.json /work/APEX/PaddleNLP/dump_info/rank5_step5/forward_rank5_all.json /work/APEX/PaddleNLP/dump_info/rank6_step5/forward_rank6_all.json /work/APEX/PaddleNLP/dump_info/rank7_step5/forward_rank7_all.json" \ + -backend xpu \ + -real \ + "/work/APEX/PaddleNLP/dump_info/rank0_step5/ /work/APEX/PaddleNLP/dump_info/rank1_step5/ /work/APEX/PaddleNLP/dump_info/rank2_step5/ /work/APEX/PaddleNLP/dump_info/rank3_step5/ /work/APEX/PaddleNLP/dump_info/rank4_step5/ /work/APEX/PaddleNLP/dump_info/rank5_step5/ /work/APEX/PaddleNLP/dump_info/rank6_step5/ /work/APEX/PaddleNLP/dump_info/rank7_step5/" \ + -out /work/APEX/PaddleNLP/result/ -mode acc -class 1 -dist 1 + diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index 812a2b1..f2b34f2 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -67,6 +67,8 @@ Warning_list = [] +IGNORED_LIST = ["paddle._C_ops.gaussian"] + current_time = time.strftime("%Y%m%d%H%M%S") tqdm_params = { @@ -84,8 +86,8 @@ "bar_format": "{l_bar}{bar}| {n}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", # 自定义进度条输出 } -PROFILE_WARM_TIMES = 10 -PROFILE_RUN_TIMES = 10 +PROFILE_WARM_TIMES = 5 +PROFILE_RUN_TIMES = 5 def recursive_delete_arg(arg_in): @@ -186,7 +188,8 @@ def recursive_arg_to_device(arg_in, backend, enforce_dtype=None): def save_tensor(forward_res, backward_res, out_path, api_call_name, dtype_name=""): - return + if not dist.get_rank() == 0: + return if dtype_name == "": bwd_output_dir = os.path.abspath(os.path.join(out_path, "output_backward")) fwd_output_dir = os.path.abspath(os.path.join(out_path, "output")) @@ -253,6 +256,9 @@ def ut_case_parsing(forward_content, cfg): for i, (api_call_name, api_info_dict) in enumerate( tqdm(forward_content.items(), **tqdm_params) ): + api_call_stack = api_call_name.rsplit("*")[0] + if api_call_stack in IGNORED_LIST: + continue if debug_mode and api_call_name not in debug_case: continue if len(multi_dtype_ut) > 0: @@ -832,7 +838,7 @@ def check_json(json_list): json_path_list = cfg.json_path.split(' ') data_path_list = cfg.real_data.split(' ') - if not check_json(json_path_list): + if False and not check_json(json_path_list): raise Exception("Check json faile!!!") else: cfg.json_path = json_path_list[local_rank] diff --git a/paddleapex/apex/train_mmdit_xpu.sh b/paddleapex/apex/train_mmdit_xpu.sh new file mode 100644 index 0000000..fffa3cd --- /dev/null +++ b/paddleapex/apex/train_mmdit_xpu.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +mpi_rank=${OMPI_COMM_WORLD_RANK:-0} +node_rank=$((mpi_rank+offset)) +mpi_node=${OMPI_COMM_WORLD_SIZE:-1} +echo "MPI status:${mpi_rank}/${mpi_node}" +nnode_train=${nnode_set:-${mpi_node}} +master_train=${master:-localhost} + +echo "Distributed Training ${node_rank}/${nnode_train} master=${master_train}" +set -x + +nnodes=$PADDLE_TRAINERS_NUM +rank=$PADDLE_TRAINER_ID + +#source ./script/utils.sh +for name in `env | grep -E 'PADDLE|ENDPOINT' | awk -F'=' '{print $1}'`; do + unset ${name} +done + +unset PADDLE_ELASTIC_JOB_ID +unset PADDLE_TRAINER_ENDPOINTS +unset DISTRIBUTED_TRAINER_ENDPOINTS +unset FLAGS_START_PORT +unset PADDLE_ELASTIC_TIMEOUT +unset PADDLE_TRAINERS_NUM +unset PADDLE_TRAINER_ID +unset PADDLE_WORKERS_IP_PORT_LIST +unset PADDLE_TRAINERS +unset PADDLE_NUM_GRADIENT_SERVERS + +export XPU_FORCE_USERMODE_LAUNCH=1 + +runtime_location=/workspace/so-runtime +bkcl_location=/workspace/so-bkcl +fast_paddle_location=/workspace/so-fast_paddle +export LD_LIBRARY_PATH=${bkcl_location}/:${runtime_location}/:${fast_paddle_location}/:$LD_LIBRARY_PATH + +export XBLAS_FC_HBM_VERSION=40 + +# PaddlePaddle +export FLAGS_use_stride_kernel="0" +export XPU_CDNN_CLUSTER_PARALLEL=1 +export XPU_CDNN_CLUSTER_PARALLEL_STREAM_NUMBER=2 +export XPU_PADDLE_L3_SIZE0=1024 +export XPU_PADDLE_L3_SIZE1=1024 +#export XPUAPI_DEFAULT_SIZE0=1502653248 +#export XPUAPI_DEFAULT_SIZE1=380265324 +export XPU_PADDLE_FUSE_SHARDING_BUFFER=1 + +# BKCL +# Multi-computer RDMA +export BKCL_ENABLE_XDR=1 +export BKCL_RDMA_FORCE_TREE=1 +export BKCL_TREE_THRESHOLD=0 +export BKCL_FORCE_L3_RDMA=0 +export BKCL_RDMA_NICS=eth1,eth1,eth2,eth2,eth3,eth3,eth4,eth4 +export BKCL_SOCKET_IFNAME=eth0 +echo "bkcl version:" +strings ${bkcl_location}/libbkcl.so | grep COM + +export CUDA_DEVICE_MAX_CONNECTIONS=8 +export BKCL_FLAT_RING=1 + +master=`cat /root/paddlejob/workspace/hostfile | head -n 1 | awk '{print $1}'` +port=36677 + +export PYTHONPATH=/workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/PaddleAPEX:/workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/PaddleMIX/ppdiffusers:/workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/PaddleMIX/PaddleNLP:$PYTHONPATH + +tp2pp4=${tp2pp4:-"False"} +if [ ${tp2pp4} == "True" ];then + unset BKCL_RDMA_NICS + unset CUDA_DEVICE_ORDER + unset XPULINK_VISIBLE_DEVICES + + export CUDA_DEVICE_ORDER=OAM_ID + export XPULINK_VISIBLE_DEVICES=2,3,0,1,4,5,6,7 + export BKCL_RDMA_NICS=eth2,eth2,eth1,eth1,eth3,eth3,eth4,eth4 +fi + +export BKCL_USE_AR=1 +# export BKCL_RING_OPT=1 +export BKCL_RING_HOSTID_USE_RANK=1 + +# accuracy improve: matmul with fp32 input will use fp32 to calc instead of using int16 +# export XPU_PADDLE_FC_LOCAL_INT16=1 +export XPU_AUTO_BF16_TF32=1 +export XPU_PADDLE_FC_TF32=1 + +# memory improve +# export XPU_INPLACE_SHARING_BF16_FP16_CACHE=1 + +export CUDA_DISABLE_PRINTF=1 +export BCCL_TRACE_HANG_ENABLE=1 +export BCCL_HANG_DETECT_INTERVAL=5 +export BCCL_UNIX_SOCKET_PATH=/var/run +export BCCL_ERROR_FILE=/root/paddlejob/workspace/log/err.%h.%p.log + +if [[ $rank -ge $nnodes ]]; then + exit 0 +fi + +timestamp=$(date +%Y%m%d%H%M%S) +echo $timestamp + +# open it when debug +export XPUAPI_DEBUG=0x1 +#export XPURT_DISPATCH_MODE="PROFILING" +#export GLOG_v=10 + +python -m paddle.distributed.launch --xpus "0,1,2,3,4,5,6,7" run_paddle.py -json \ + "/workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank0_step5/forward_rank0_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank1_step5/forward_rank1_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank2_step5/forward_rank2_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank3_step5/forward_rank3_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank4_step5/forward_rank4_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank5_step5/forward_rank5_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank6_step5/forward_rank6_all.json /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank7_step5/forward_rank7_all.json" \ + -backend xpu \ + -real \ + "/workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank0_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank1_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank2_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank3_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank4_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank5_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank6_step5/ /workspace/ZHOU/baidu/personal-code/dit_t2iv_to_kunlun/dump_info/rank7_step5/" \ + -out result_xpu/ -mode acc -class 1 -class_type float16 -dist 1 + diff --git a/paddleapex/apex/utils/data_generate.py b/paddleapex/apex/utils/data_generate.py index 3b1b3b1..7fa7375 100644 --- a/paddleapex/apex/utils/data_generate.py +++ b/paddleapex/apex/utils/data_generate.py @@ -97,15 +97,21 @@ def create_model(api_call_stack, real_data_path): # api_call_stack = api_call_name.rsplit("*")[0] init_path = real_data_path + ".init_params" state_path = real_data_path + ".state_dict" - init_para = load_params(init_path) + [args, kwargs] = load_params(init_path) + state_para = paddle.load(state_path) parent_package, class_n = api_call_stack.rsplit(".", maxsplit=1) try: MODULE = import_module(parent_package) class_model = getattr(MODULE, class_n) - model = class_model(**init_para) - model.set_state_dict(paddle.load(state_path)) + model = class_model(*args, **kwargs) + model.set_state_dict(state_para) return model except Exception as err: + print(init_path) + print(args) + print(kwargs) + print(state_path) + print(state_para) msg = "Create Model Error: %s" % str(err) print_warn_log(msg) return None diff --git a/paddleapex/api_tracer/api_info.py b/paddleapex/api_tracer/api_info.py index 7ccdd5d..65c121e 100644 --- a/paddleapex/api_tracer/api_info.py +++ b/paddleapex/api_tracer/api_info.py @@ -121,6 +121,18 @@ def get_file_path(rank): return directory +def save_init_params(init_params, name, rank): + directory = get_file_path(rank) + file_path = os.path.join(directory, f"{name}.init_params") + with open(file_path, 'wb') as f: + pickle.dump(init_params, f) + + +def save_weight(state_dict, name, rank): + directory = get_file_path(rank) + paddle.save(state_dict, os.path.join(directory, f"{name}.state_dict")) + + def save_init_params_and_weight(init_params, state_dict, name, rank): directory = get_file_path(rank) file_path = os.path.join(directory, f"{name}.init_params") diff --git a/paddleapex/api_tracer/configs/op_target.yaml b/paddleapex/api_tracer/configs/op_target.yaml index 23c10ab..38bb393 100644 --- a/paddleapex/api_tracer/configs/op_target.yaml +++ b/paddleapex/api_tracer/configs/op_target.yaml @@ -1,20 +1,30 @@ target_class: - - paddlenlp.transformers.llama.modeling.LlamaLMHead - - paddlenlp.transformers.llama.modeling.LlamaRMSNorm - - paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding - - paddlenlp.transformers.llama.modeling.LlamaLinearScalingRotaryEmbedding - - paddlenlp.transformers.llama.modeling.LlamaNTKScalingRotaryEmbedding - - paddlenlp.transformers.llama.modeling.LlamaDynamicNTKScalingRotaryEmbedding - - paddlenlp.transformers.llama.modeling.Llama3RotaryEmbedding - - paddlenlp.transformers.llama.modeling.MoEAllToAll - - paddlenlp.transformers.llama.modeling.MoEGateCombine - - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler - - paddlenlp.transformers.llama.modeling.LlamaMoEGate - - paddlenlp.transformers.llama.modeling.LlamaMoEMLP - - paddlenlp.transformers.llama.modeling.LlamaAttention - - paddlenlp.transformers.llama.modeling.LlamaDecoderLayer - - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss - - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXSafeConv3d + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXCausalConv3d + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXSpatialNorm3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXResnetBlock3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXDownBlock3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXMidBlock3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXUpBlock3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXEncoder3D + - vaes.cogx_vae_ppdiffusers_ver_new.CogVideoXDecoder3D + - vaes.cogx_vae_ppdiffusers_ver_new.AutoencoderKLCogVideoX + # - paddlenlp.transformers.llama.modeling.LlamaLMHead + # - paddlenlp.transformers.llama.modeling.LlamaRMSNorm + # - paddlenlp.transformers.llama.modeling.LlamaRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaLinearScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaNTKScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.LlamaDynamicNTKScalingRotaryEmbedding + # - paddlenlp.transformers.llama.modeling.Llama3RotaryEmbedding + # - paddlenlp.transformers.llama.modeling.MoEAllToAll + # - paddlenlp.transformers.llama.modeling.MoEGateCombine + # - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler + # - paddlenlp.transformers.llama.modeling.LlamaMoEGate + # - paddlenlp.transformers.llama.modeling.LlamaMoEMLP + # - paddlenlp.transformers.llama.modeling.LlamaAttention + # - paddlenlp.transformers.llama.modeling.LlamaDecoderLayer + # - paddlenlp.transformers.llama.modeling.ConcatMaskedLoss + # - paddlenlp.transformers.llama.modeling.MoEAuxLossAutoScaler # - paddlenlp.transformers.llama.modeling.LlamaPretrainedModel # - paddlenlp.transformers.llama.modeling.LlamaPretrainingCriterion # - paddlenlp.transformers.llama.modeling.LlamaForCausalLM @@ -66,884 +76,890 @@ distributed_op: - paddle.distributed.fleet.layers.mpu.mp_ops._c_softmax_with_cross_entropy - paddle.distributed.fleet.layers.mpu.mp_ops. target_op: - - paddle._C_ops.min - - paddle._C_ops.min - - paddle._C_ops.max - - paddle.empty - - paddle.empty_like - - paddle.reshape - - paddle.reshape_ - - paddle.unsqueeze - - paddle.unsqueeze_ - - paddle.Tensor.squeeze - - paddle.Tensor.squeeze_ - - paddle.Tensor.unsqueeze - - paddle.Tensor.unsqueeze_ - - paddle.squeeze_ - - paddle.ones - - paddle.ones_like - - paddle.split - - paddle.Tensor.zero_ - - paddle.stack - - paddle.zeros - - paddle.zeros_like - - paddle.Tensor.__add__ - - paddle._C_ops.layer_norm - - paddle.multiply - - paddle.multiply_ - - paddle.Tensor.__mul__ - - paddle.Tensor.__neg__ - - paddle.Tensor.add_ - - paddle._C_ops.adamw - - paddle._C_ops.adamw_ - - paddle.square_ - - paddle.nn.functional.scaled_dot_product_attention - - paddle._C_ops.layer_norm #noqa - - paddle.nn.functional.adaptive_avg_pool1d - - paddle.nn.functional.adaptive_avg_pool2d - - paddle.nn.functional.adaptive_avg_pool3d - - paddle.nn.functional.adaptive_max_pool1d - - paddle.nn.functional.adaptive_max_pool2d - - paddle.nn.functional.adaptive_max_pool3d - - paddle.nn.functional.affine_grid - - paddle.nn.functional.alpha_dropout - - paddle.nn.functional.avg_pool1d - - paddle.nn.functional.avg_pool2d - - paddle.nn.functional.avg_pool3d - - paddle.nn.functional.batch_norm - - paddle.nn.functional.bilinear - - paddle.nn.functional.binary_cross_entropy - - paddle.nn.functional.binary_cross_entropy_with_logits - - paddle.nn.functional.celu - - paddle.nn.functional.channel_shuffle - - paddle.nn.functional.class_center_sample - - paddle.nn.functional.common - - paddle.nn.functional.conv1d - - paddle.nn.functional.conv1d_transpose - - paddle.nn.functional.conv2d - - paddle.nn.functional.conv2d_transpose - - paddle.nn.functional.conv3d - - paddle.nn.functional.conv3d_transpose - - paddle.nn.functional.cosine_embedding_loss - - paddle.nn.functional.cosine_similarity - - paddle.nn.functional.cross_entropy - - paddle.nn.functional.ctc_loss - - paddle.nn.functional.diag_embed - - paddle.nn.functional.dice_loss - - paddle.nn.functional.distance - - paddle.nn.functional.dropout - - paddle.nn.functional.dropout2d - - paddle.nn.functional.dropout3d - - paddle.nn.functional.elu - - paddle.nn.functional.elu_ - - paddle.nn.functional.embedding - - paddle.nn.functional.extension - - paddle.nn.functional.flash_attention - - paddle.nn.functional.flash_attention_with_sparse_mask - - paddle.nn.functional.fractional_max_pool2d - - paddle.nn.functional.fractional_max_pool3d - - paddle.nn.functional.fold - - paddle.nn.functional.gather_tree - - paddle.nn.functional.gaussian_nll_loss - - paddle.nn.functional.gelu - - paddle.nn.functional.glu - - paddle.nn.functional.grid_sample - - paddle.nn.functional.gumbel_softmax - - paddle.nn.functional.hardshrink - - paddle.nn.functional.hardsigmoid - - paddle.nn.functional.hardswish - - paddle.nn.functional.hardtanh - - paddle.nn.functional.hardtanh_ - - paddle.nn.functional.hinge_embedding_loss - - paddle.nn.functional.hsigmoid_loss - - paddle.nn.functional.instance_norm - - paddle.nn.functional.interpolate - - paddle.nn.functional.kl_div - - paddle.nn.functional.l1_loss - - paddle.nn.functional.label_smooth - - paddle.nn.functional.layer_norm - - paddle.nn.functional.leaky_relu - - paddle.nn.functional.leaky_relu_ - - paddle.nn.functional.linear - - paddle.nn.functional.local_response_norm - - paddle.nn.functional.log_loss - - paddle.nn.functional.log_sigmoid - - paddle.nn.functional.log_softmax - - paddle.nn.functional.margin_cross_entropy - - paddle.nn.functional.margin_ranking_loss - - paddle.nn.functional.max_pool1d - - paddle.nn.functional.max_pool2d - - paddle.nn.functional.max_pool3d - - paddle.nn.functional.max_unpool1d - - paddle.nn.functional.max_unpool2d - - paddle.nn.functional.max_unpool3d - - paddle.nn.functional.maxout - - paddle.nn.functional.mish - - paddle.nn.functional.mse_loss - - paddle.nn.functional.multi_label_soft_margin_loss - - paddle.nn.functional.multi_margin_loss - - paddle.nn.functional.nll_loss - - paddle.nn.functional.norm - - paddle.nn.functional.normalize - - paddle.nn.functional.npair_loss - - paddle.nn.functional.one_hot - - paddle.nn.functional.pad - - paddle.nn.functional.pairwise_distance - - paddle.nn.functional.pdist - - paddle.nn.functional.pixel_shuffle - - paddle.nn.functional.pixel_unshuffle - - paddle.nn.functional.poisson_nll_loss - - paddle.nn.functional.pooling - - paddle.nn.functional.prelu - - paddle.nn.functional.relu - - paddle.nn.functional.relu6 - - paddle.nn.functional.relu_ - - paddle.nn.functional.rnnt_loss - - paddle.nn.functional.rrelu - - paddle.nn.functional.sdp_kernel - - paddle.nn.functional.selu - - paddle.nn.functional.sequence_mask - - paddle.nn.functional.sigmoid - - paddle.nn.functional.sigmoid_focal_loss - - paddle.nn.functional.silu - - paddle.nn.functional.smooth_l1_loss - - paddle.nn.functional.soft_margin_loss - - paddle.nn.functional.softmax - - paddle.nn.functional.softmax_ - - paddle.nn.functional.softmax_with_cross_entropy - - paddle.nn.functional.softplus - - paddle.nn.functional.softshrink - - paddle.nn.functional.softsign - - paddle.nn.functional.sparse_attention - - paddle.nn.functional.square_error_cost - - paddle.nn.functional.swish - - paddle.nn.functional.tanh - - paddle.nn.functional.tanh_ - - paddle.nn.functional.tanhshrink - - paddle.nn.functional.temporal_shift - - paddle.nn.functional.thresholded_relu - - paddle.nn.functional.thresholded_relu_ - - paddle.nn.functional.triplet_margin_loss - - paddle.nn.functional.triplet_margin_with_distance_loss - - paddle.nn.functional.unfold - - paddle.nn.functional.upsample - - paddle.nn.functional.zeropad2d - - paddle.abs - - paddle.abs_ - - paddle.acos - - paddle.acos_ - - paddle.acosh - - paddle.acosh_ - - paddle.add - - paddle.add_n - - paddle.addmm - - paddle.addmm_ - - paddle.all - - paddle.allclose - - paddle.amax - - paddle.amin - - paddle.angle - - paddle.any - - paddle.arange - - paddle.argmax - - paddle.argmin - - paddle.argsort - - paddle.as_complex - - paddle.as_real - - paddle.as_strided - - paddle.asin - - paddle.asin_ - - paddle.asinh - - paddle.asinh_ - - paddle.assign - - paddle.atan - - paddle.atan2 - - paddle.atan_ - - paddle.atanh - - paddle.atanh_ - - paddle.atleast_1d - - paddle.atleast_2d - - paddle.atleast_3d - - paddle.bernoulli - - paddle.bincount - - paddle.binomial - - paddle.bitwise_and - - paddle.bitwise_and_ - - paddle.bitwise_not - - paddle.bitwise_not_ - - paddle.bitwise_or - - paddle.bitwise_or_ - - paddle.bitwise_xor - - paddle.bitwise_xor_ - - paddle.bmm - - paddle.broadcast_shape - - paddle.broadcast_tensors - - paddle.broadcast_to - - paddle.cauchy_ - - paddle.cast - - paddle.cdist - - paddle.ceil - - paddle.cholesky - - paddle.chunk - - paddle.clip - - paddle.column_stack - - paddle.combinations - - paddle.concat - - paddle.conj - - paddle.copysign - - paddle.copysign_ - - paddle.cos - - paddle.cos_ - - paddle.cosh - - paddle.cosh_ - - paddle.count_nonzero - - paddle.crop - - paddle.cross - - paddle.cummax - - paddle.cummin - - paddle.cumprod - - paddle.cumprod_ - - paddle.cumsum - - paddle.cumsum_ - - paddle.cumulative_trapezoid - - paddle.decomposition - - paddle.deg2rad - - paddle.diag - - paddle.diag_embed - - paddle.diagflat - - paddle.diagonal - - paddle.diagonal_scatter - - paddle.diff - - paddle.digamma - - paddle.digamma_ - - paddle.divide - - paddle.divide_ - - paddle.dot - - paddle.dsplit - - paddle.dstack - - paddle.eigvalsh - - paddle.einsum - - paddle.equal - - paddle.equal_all - - paddle.erf - - paddle.erf_ - - paddle.erfinv - - paddle.exp - - paddle.expand - - paddle.expand_as - - paddle.expm1 - - paddle.expm1_ - - paddle.eye - - paddle.fft - - paddle.flatten - - paddle.flatten_ - - paddle.flip - - paddle.floor - - paddle.floor_divide - - paddle.floor_divide_ - - paddle.floor_mod - - paddle.floor_mod_ - - paddle.fmax - - paddle.fmin - - paddle.frac - - paddle.frac_ - - paddle.frexp - - paddle.full - - paddle.full_like - - paddle.gather - - paddle.gather_nd - - paddle.gcd - - paddle.gcd_ - - paddle.greater_equal - - paddle.greater_equal_ - - paddle.greater_than - - paddle.greater_than_ - - paddle.heaviside - - paddle.histogram - - paddle.histogramdd - - paddle.hsplit - - paddle.hstack - - paddle.hypot - - paddle.hypot_ - - paddle.i0 - - paddle.i0_ - - paddle.i0e - - paddle.i1 - - paddle.i1e - - paddle.imag - - paddle.increment - - paddle.index_add - - paddle.index_add_ - - paddle.index_fill - - paddle.index_fill_ - - paddle.index_put - - paddle.index_put_ - - paddle.index_sample - - paddle.index_select - - paddle.inner - - paddle.kron - - paddle.kthvalue - - paddle.lcm - - paddle.lcm_ - - paddle.ldexp - - paddle.ldexp_ - - paddle.lerp - - paddle.less_equal - - paddle.less_equal_ - - paddle.less_than - - paddle.less_than_ - - paddle.lgamma - - paddle.lgamma_ - - paddle.linalg - - paddle.linspace - - paddle.log - - paddle.log10 - - paddle.log10_ - - paddle.log1p - - paddle.log1p_ - - paddle.log2 - - paddle.log2_ - - paddle.log_ - - paddle.logaddexp - - paddle.logcumsumexp - - paddle.logical_and - - paddle.logical_and_ - - paddle.logical_not - - paddle.logical_not_ - - paddle.logical_or - - paddle.logical_or_ - - paddle.logical_xor - - paddle.logical_xor_ - - paddle.logit - - paddle.logit_ - - paddle.logspace - - paddle.logsumexp - - paddle.masked_fill - - paddle.masked_fill_ - - paddle.masked_scatter - - paddle.masked_scatter_ - - paddle.masked_select - - paddle.matmul - - paddle.max - - paddle.maximum - - paddle.mean - - paddle.median - - paddle.meshgrid - - paddle.min - - paddle.minimum - - paddle.mm - - paddle.mod - - paddle.mod_ - - paddle.mode - - paddle.moveaxis - - paddle.multigammaln - - paddle.multigammaln_ - - paddle.multinomial - - paddle.multiplex - - paddle.multiply - - paddle.multiply_ - - paddle.mv - - paddle.nan_to_num - - paddle.nan_to_num_ - - paddle.nanmean - - paddle.nanmedian - - paddle.nanquantile - - paddle.nansum - - paddle.neg - - paddle.neg_ - - paddle.nextafter - - paddle.nonzero - - paddle.normal - - paddle.normal_ - - paddle.not_equal - - paddle.not_equal_ - - paddle.numel - - paddle.outer - - paddle.pdist - - paddle.poisson - - paddle.polar - - paddle.polygamma - - paddle.polygamma_ - - paddle.pow - - paddle.pow_ - - paddle.prod - - paddle.put_along_axis - - paddle.quantile - - paddle.rad2deg - - paddle.rand - - paddle.randint - - paddle.randint_like - - paddle.randn - - paddle.randperm - - paddle.reader - - paddle.real - - paddle.reciprocal - - paddle.regularizer - - paddle.remainder - - paddle.remainder_ - - paddle.renorm - - paddle.renorm_ - - paddle.repeat_interleave - - paddle.roll - - paddle.rot90 - - paddle.round - - paddle.row_stack - - paddle.rsqrt - - paddle.scale - - paddle.scatter - - paddle.scatter_ - # - paddle.scatter_nd # cause CUDA_ERROR ignored. - # - paddle.scatter_nd_add - - paddle.searchsorted - - paddle.select_scatter - - paddle.sgn - - paddle.shard_index - - paddle.sign - - paddle.signal - - paddle.signbit - - paddle.sin - - paddle.sin_ - - paddle.sinh - - paddle.sinh_ - - paddle.slice - # - paddle.slice_scatter - - paddle.sort - - paddle.sqrt - - paddle.square - - paddle.standard_gamma - - paddle.standard_normal - - paddle.stanh - - paddle.strided_slice - - paddle.subtract - - paddle.sum - - paddle.t - - paddle.t_ - - paddle.take - - paddle.take_along_axis - - paddle.tan - - paddle.tan_ - - paddle.tanh - - paddle.tanh_ - - paddle.tensordot - - paddle.tile - - paddle.topk - - paddle.trace - - paddle.transpose - - paddle.transpose_ - - paddle.trapezoid - - paddle.tril - - paddle.tril_ - - paddle.tril_indices - - paddle.triu - - paddle.triu_ - - paddle.triu_indices - - paddle.trunc - - paddle.trunc_ - - paddle.unbind - - paddle.unflatten - - paddle.unfold - - paddle.uniform - - paddle.unique - - paddle.unique_consecutive - - paddle.unstack - - paddle.vander - - paddle.var - - paddle.view - - paddle.view_as - - paddle.vsplit - - paddle.where - - paddle.where_ - - paddle.zeros - - paddle.zeros_like - - paddle.Tensor.T - - paddle.Tensor.__add__ - - paddle.Tensor.__and__ - - paddle.Tensor.__radd__ - - paddle.Tensor.__div__ - - paddle.Tensor.__eq__ - - paddle.Tensor.__floordiv__ - - paddle.Tensor.__ge__ - - paddle.Tensor.__gt__ - - paddle.Tensor.__le__ - - paddle.Tensor.__lt__ - - paddle.Tensor.__matmul__ - - paddle.Tensor.__mod__ - - paddle.Tensor.__mul__ - - paddle.Tensor.__ne__ - - paddle.Tensor.__neg__ - - paddle.Tensor.__nonzero__ - - paddle.Tensor.__or__ - - paddle.Tensor.__pow__ - - paddle.Tensor.__radd__ - - paddle.Tensor.__rdiv__ - - paddle.Tensor.__rmul__ - - paddle.Tensor.__rpow__ - - paddle.Tensor.__rsub__ - - paddle.Tensor.__rtruediv__ - - paddle.Tensor.__sub__ - - paddle.Tensor.__truediv__ - - paddle.Tensor.__xor__ - - paddle.Tensor.abs - - paddle.Tensor.abs_ - - paddle.Tensor.acos - - paddle.Tensor.acos_ - - paddle.Tensor.acosh - - paddle.Tensor.acosh_ - - paddle.Tensor.add - - paddle.Tensor.add_ - - paddle.Tensor.add_n - - paddle.Tensor.addmm - - paddle.Tensor.addmm_ - - paddle.Tensor.all - - paddle.Tensor.allclose - - paddle.Tensor.amax - - paddle.Tensor.amin - - paddle.Tensor.angle - - paddle.Tensor.any - - paddle.Tensor.argmax - - paddle.Tensor.argmin - - paddle.Tensor.argsort - - paddle.Tensor.as_complex - - paddle.Tensor.as_real - - paddle.Tensor.as_strided - - paddle.Tensor.asin - - paddle.Tensor.asin_ - - paddle.Tensor.asinh - - paddle.Tensor.asinh_ - - paddle.Tensor.atan - - paddle.Tensor.atan2 - - paddle.Tensor.atan_ - - paddle.Tensor.atanh - - paddle.Tensor.atanh_ - - paddle.Tensor.atleast_1d - - paddle.Tensor.atleast_2d - - paddle.Tensor.atleast_3d - - paddle.Tensor.bincount - - paddle.Tensor.bitwise_and - - paddle.Tensor.bitwise_and_ - - paddle.Tensor.bitwise_not - - paddle.Tensor.bitwise_not_ - - paddle.Tensor.bitwise_or - - paddle.Tensor.bitwise_or_ - - paddle.Tensor.bitwise_xor - - paddle.Tensor.bitwise_xor_ - - paddle.Tensor.bmm - - paddle.Tensor.broadcast_shape - - paddle.Tensor.broadcast_tensors - - paddle.Tensor.broadcast_to - - paddle.Tensor.cauchy_ - - paddle.Tensor.cdist - - paddle.Tensor.ceil - - paddle.Tensor.ceil_ - - paddle.Tensor.cholesky - - paddle.Tensor.cholesky_solve - - paddle.Tensor.clip - - paddle.Tensor.clip_ - - paddle.Tensor.coalesce - - paddle.Tensor.cols - - paddle.Tensor.combinations - - paddle.Tensor.concat - - paddle.Tensor.cond - - paddle.Tensor.conj - - paddle.Tensor.contiguous - - paddle.Tensor.corrcoef - - paddle.Tensor.cos - - paddle.Tensor.cos_ - - paddle.Tensor.cosh - - paddle.Tensor.cosh_ - - paddle.Tensor.count_nonzero - - paddle.Tensor.cov - - paddle.Tensor.cross - - paddle.Tensor.crows - - paddle.Tensor.cummax - - paddle.Tensor.cummin - - paddle.Tensor.cumprod - - paddle.Tensor.cumprod_ - - paddle.Tensor.cumsum - - paddle.Tensor.cumsum_ - - paddle.Tensor.cumulative_trapezoid - - paddle.Tensor.deg2rad - - paddle.Tensor.diag - - paddle.Tensor.diag_embed - - paddle.Tensor.diagflat - - paddle.Tensor.diagonal - - paddle.Tensor.diagonal_scatter - - paddle.Tensor.diff - - paddle.Tensor.digamma - - paddle.Tensor.digamma_ - - paddle.Tensor.divide - - paddle.Tensor.divide_ - - paddle.Tensor.dot - - paddle.Tensor.eig - - paddle.Tensor.eigvals - - paddle.Tensor.eigvalsh - - paddle.Tensor.equal - - paddle.Tensor.equal_all - - paddle.Tensor.erf - - paddle.Tensor.erfinv - - paddle.Tensor.erfinv_ - - paddle.Tensor.exp - - paddle.Tensor.exp_ - - paddle.Tensor.expand - - paddle.Tensor.expand_as - - paddle.Tensor.expm1 - - paddle.Tensor.exponential_ - - paddle.Tensor.fill_ - - paddle.Tensor.fill_diagonal_ - - paddle.Tensor.fill_diagonal_tensor - - paddle.Tensor.fill_diagonal_tensor_ - - paddle.Tensor.flatten - - paddle.Tensor.flatten_ - - paddle.Tensor.flip - - paddle.Tensor.floor - - paddle.Tensor.floor_ - - paddle.Tensor.floor_divide - - paddle.Tensor.floor_divide_ - - paddle.Tensor.floor_mod - - paddle.Tensor.floor_mod_ - - paddle.Tensor.fmax - - paddle.Tensor.fmin - - paddle.Tensor.frac - - paddle.Tensor.frac_ - - paddle.Tensor.frexp - - paddle.Tensor.gather - - paddle.Tensor.gather_nd - - paddle.Tensor.gcd - - paddle.Tensor.gcd_ - - paddle.Tensor.get_selected_rows - - paddle.Tensor.get_strides - - paddle.Tensor.greater_equal - - paddle.Tensor.greater_equal_ - - paddle.Tensor.greater_than - - paddle.Tensor.greater_than_ - - paddle.Tensor.heaviside - - paddle.Tensor.histogram - - paddle.Tensor.histogramdd - - paddle.Tensor.hsplit - - paddle.Tensor.hypot - - paddle.Tensor.hypot_ - - paddle.Tensor.i0 - - paddle.Tensor.i0_ - - paddle.Tensor.i0e - - paddle.Tensor.i1 - - paddle.Tensor.i1e - - paddle.Tensor.imag - - paddle.Tensor.increment - - paddle.Tensor.index_add - - paddle.Tensor.index_add_ - - paddle.Tensor.index_fill - - paddle.Tensor.index_fill_ - - paddle.Tensor.index_put - - paddle.Tensor.index_put_ - - paddle.Tensor.index_sample - - paddle.Tensor.index_select - - paddle.Tensor.inner - - paddle.Tensor.kron - - paddle.Tensor.kthvalue - - paddle.Tensor.layout - - paddle.Tensor.lcm - - paddle.Tensor.lcm_ - - paddle.Tensor.ldexp - - paddle.Tensor.ldexp_ - - paddle.Tensor.lerp - - paddle.Tensor.lerp_ - - paddle.Tensor.less_equal - - paddle.Tensor.less_equal_ - - paddle.Tensor.less_than - - paddle.Tensor.less_than_ - - paddle.Tensor.lgamma - - paddle.Tensor.lgamma_ - - paddle.Tensor.log - - paddle.Tensor.log10 - - paddle.Tensor.log10_ - - paddle.Tensor.log1p - - paddle.Tensor.log1p_ - - paddle.Tensor.log2 - - paddle.Tensor.log2_ - - paddle.Tensor.log_ - - paddle.Tensor.logaddexp - - paddle.Tensor.logcumsumexp - - paddle.Tensor.logical_and - - paddle.Tensor.logical_and_ - - paddle.Tensor.logical_not - - paddle.Tensor.logical_not_ - - paddle.Tensor.logical_or - - paddle.Tensor.logical_or_ - - paddle.Tensor.logical_xor - - paddle.Tensor.logical_xor_ - - paddle.Tensor.logit - - paddle.Tensor.logit_ - - paddle.Tensor.logsumexp - - paddle.Tensor.lstsq - - paddle.Tensor.lu - - paddle.Tensor.lu_unpack - - paddle.Tensor.masked_fill - - paddle.Tensor.masked_fill_ - - paddle.Tensor.masked_select - - paddle.Tensor.masked_scatter - - paddle.Tensor.masked_scatter_ - - paddle.Tensor.matmul - - paddle.Tensor.matrix_power - - paddle.Tensor.max - - paddle.Tensor.maximum - - paddle.Tensor.mean - - paddle.Tensor.median - - paddle.Tensor.min - - paddle.Tensor.minimum - - paddle.Tensor.mm - - paddle.Tensor.mod - - paddle.Tensor.mod_ - - paddle.Tensor.mode - - paddle.Tensor.moveaxis - - paddle.Tensor.multi_dot - - paddle.Tensor.multigammaln - - paddle.Tensor.multigammaln_ - - paddle.Tensor.multinomial - - paddle.Tensor.multiplex - - paddle.Tensor.multiply - - paddle.Tensor.multiply_ - - paddle.Tensor.mv - - paddle.Tensor.nan_to_num - - paddle.Tensor.nan_to_num_ - - paddle.Tensor.nanmean - - paddle.Tensor.nanmedian - - paddle.Tensor.nanquantile - - paddle.Tensor.nansum - - paddle.Tensor.ndimension - - paddle.Tensor.neg - - paddle.Tensor.neg_ - - paddle.Tensor.nnz - - paddle.Tensor.nonzero - - paddle.Tensor.norm - - paddle.Tensor.normal_ - - paddle.Tensor.not_equal - - paddle.Tensor.not_equal_ - - paddle.Tensor.numel - - paddle.Tensor.offset - - paddle.Tensor.outer - - paddle.Tensor.pca_lowrank - - paddle.Tensor.pinv - - paddle.Tensor.polar - - paddle.Tensor.polygamma - - paddle.Tensor.polygamma_ - - paddle.Tensor.pow - - paddle.Tensor.pow_ - - paddle.Tensor.process_mesh - - paddle.Tensor.prod - - paddle.Tensor.put_along_axis - - paddle.Tensor.put_along_axis_ - - paddle.Tensor.qr - - paddle.Tensor.quantile - - paddle.Tensor.rad2deg - - paddle.Tensor.remainder - - paddle.Tensor.remainder_ - - paddle.Tensor.renorm - - paddle.Tensor.renorm_ - - paddle.Tensor.repeat_interleave - - paddle.Tensor.reverse - - paddle.Tensor.roll - - paddle.Tensor.rot90 - - paddle.Tensor.round - - paddle.Tensor.round_ - - paddle.Tensor.rows - - paddle.Tensor.rsqrt - - paddle.Tensor.rsqrt_ - - paddle.Tensor.scale - - paddle.Tensor.scale_ - - paddle.Tensor.scatter - - paddle.Tensor.scatter_ - - paddle.Tensor.scatter_nd - - paddle.Tensor.scatter_nd_add - - paddle.Tensor.select_scatter - - paddle.Tensor.sgn - - paddle.Tensor.shard_index - - paddle.Tensor.sigmoid - - paddle.Tensor.sigmoid_ - - paddle.Tensor.sign - - paddle.Tensor.sin - - paddle.Tensor.sin_ - - paddle.Tensor.sinh - - paddle.Tensor.sinh_ - - paddle.Tensor.size - - paddle.Tensor.slice - - paddle.Tensor.solve - - paddle.Tensor.sort - - paddle.Tensor.split - - paddle.Tensor.sqrt - - paddle.Tensor.sqrt_ - - paddle.Tensor.square - - paddle.Tensor.stack - - paddle.Tensor.stanh - - paddle.Tensor.std - - paddle.Tensor.stft - - paddle.Tensor.strided_slice - - paddle.Tensor.strides - - paddle.Tensor.subtract - - paddle.Tensor.subtract_ - - paddle.Tensor.sum - - paddle.Tensor.t - - paddle.Tensor.t_ - - paddle.Tensor.take - - paddle.Tensor.take_along_axis - - paddle.Tensor.tan - - paddle.Tensor.tan_ - - paddle.Tensor.tanh - - paddle.Tensor.tanh_ - - paddle.Tensor.tensordot - - paddle.Tensor.tile - - paddle.Tensor.top_p_sampling - - paddle.Tensor.topk - - paddle.Tensor.trace - - paddle.Tensor.transpose - - paddle.Tensor.transpose_ - - paddle.Tensor.trapezoid - - paddle.Tensor.tril - - paddle.Tensor.tril_ - - paddle.Tensor.triu - - paddle.Tensor.triu_ - - paddle.Tensor.trunc - - paddle.Tensor.trunc_ - - paddle.Tensor.unbind - - paddle.Tensor.unflatten - - paddle.Tensor.unfold - - paddle.Tensor.uniform_ - - paddle.Tensor.unique - - paddle.Tensor.unique_consecutive - - paddle.Tensor.unstack - - paddle.Tensor.vander - - paddle.Tensor.var - - paddle.Tensor.view - - paddle.Tensor.view_as - - paddle.Tensor.vsplit - - paddle.Tensor.where - - paddle.Tensor.where_ - - paddle._C_ops.fused_gemm_epilogue - - paddle.optimizer.Adam - - paddle.optimizer.AdamW - - paddle._C_ops.adamw - - paddle._C_ops.adamw_ - - paddle._legacy_C_ops.fused_gemm_epilogue - - paddle.incubate.nn.functional.fused_multi_head_attention - - paddle.incubate.nn.functional.fused_feedforward - - paddle.incubate.nn.functional.fused_multi_transformer - - paddle.incubate.nn.functional.fused_linear - - paddle.incubate.nn.functional.fused_linear_activation - - paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm - - paddle.incubate.nn.functional.fused_ec_moe - - paddle.incubate.nn.functional.fused_dropout_add - - paddle.incubate.nn.functional.fused_rotary_position_embedding - - paddle.incubate.nn.functional.variable_length_memory_efficient_attention - - paddle.incubate.nn.functional.fused_rms_norm - - paddle.incubate.nn.functional.fused_layer_norm - - paddle.incubate.nn.functional.masked_multihead_attention - - paddle.incubate.nn.functional.block_multihead_attention - - paddle.incubate.nn.functional.swiglu - - paddle.incubate.nn.functional.fused_matmul_bias - - paddle.tensor.fill_constant - - paddle.nn.clip._squared_l2_norm - - paddle.uniform - - paddle._C_ops.gaussian - - paddle._legacy_C_ops.c_identity + - paddle.nn.functional.conv.conv1d + - paddle.nn.functional.conv.conv1d_transpose + - paddle.nn.functional.conv.conv2d + - paddle.nn.functional.conv.conv2d_transpose + - paddle.nn.functional.conv.conv3d + - paddle.nn.functional.conv.conv3d_transpose + # - paddle._C_ops.min + # - paddle._C_ops.min + # - paddle._C_ops.max + # - paddle.empty + # - paddle.empty_like + # - paddle.reshape + # - paddle.reshape_ + # - paddle.unsqueeze + # - paddle.unsqueeze_ + # - paddle.Tensor.squeeze + # - paddle.Tensor.squeeze_ + # - paddle.Tensor.unsqueeze + # - paddle.Tensor.unsqueeze_ + # - paddle.squeeze_ + # - paddle.ones + # - paddle.ones_like + # - paddle.split + # - paddle.Tensor.zero_ + # - paddle.stack + # - paddle.zeros + # - paddle.zeros_like + # - paddle.Tensor.__add__ + # - paddle._C_ops.layer_norm + # - paddle.multiply + # - paddle.multiply_ + # - paddle.Tensor.__mul__ + # - paddle.Tensor.__neg__ + # - paddle.Tensor.add_ + # - paddle._C_ops.adamw + # - paddle._C_ops.adamw_ + # - paddle.square_ + # - paddle.nn.functional.scaled_dot_product_attention + # - paddle._C_ops.layer_norm #noqa + # - paddle.nn.functional.adaptive_avg_pool1d + # - paddle.nn.functional.adaptive_avg_pool2d + # - paddle.nn.functional.adaptive_avg_pool3d + # - paddle.nn.functional.adaptive_max_pool1d + # - paddle.nn.functional.adaptive_max_pool2d + # - paddle.nn.functional.adaptive_max_pool3d + # - paddle.nn.functional.affine_grid + # - paddle.nn.functional.alpha_dropout + # - paddle.nn.functional.avg_pool1d + # - paddle.nn.functional.avg_pool2d + # - paddle.nn.functional.avg_pool3d + # - paddle.nn.functional.batch_norm + # - paddle.nn.functional.bilinear + # - paddle.nn.functional.binary_cross_entropy + # - paddle.nn.functional.binary_cross_entropy_with_logits + # - paddle.nn.functional.celu + # - paddle.nn.functional.channel_shuffle + # - paddle.nn.functional.class_center_sample + # - paddle.nn.functional.common + # - paddle.nn.functional.conv1d + # - paddle.nn.functional.conv1d_transpose + # - paddle.nn.functional.conv2d + # - paddle.nn.functional.conv2d_transpose + # - paddle.nn.functional.conv3d + # - paddle.nn.functional.conv3d_transpose + # - paddle.nn.functional.cosine_embedding_loss + # - paddle.nn.functional.cosine_similarity + # - paddle.nn.functional.cross_entropy + # - paddle.nn.functional.ctc_loss + # - paddle.nn.functional.diag_embed + # - paddle.nn.functional.dice_loss + # - paddle.nn.functional.distance + # - paddle.nn.functional.dropout + # - paddle.nn.functional.dropout2d + # - paddle.nn.functional.dropout3d + # - paddle.nn.functional.elu + # - paddle.nn.functional.elu_ + # - paddle.nn.functional.embedding + # - paddle.nn.functional.extension + # - paddle.nn.functional.flash_attention + # - paddle.nn.functional.flash_attention_with_sparse_mask + # - paddle.nn.functional.fractional_max_pool2d + # - paddle.nn.functional.fractional_max_pool3d + # - paddle.nn.functional.fold + # - paddle.nn.functional.gather_tree + # - paddle.nn.functional.gaussian_nll_loss + # - paddle.nn.functional.gelu + # - paddle.nn.functional.glu + # - paddle.nn.functional.grid_sample + # - paddle.nn.functional.gumbel_softmax + # - paddle.nn.functional.hardshrink + # - paddle.nn.functional.hardsigmoid + # - paddle.nn.functional.hardswish + # - paddle.nn.functional.hardtanh + # - paddle.nn.functional.hardtanh_ + # - paddle.nn.functional.hinge_embedding_loss + # - paddle.nn.functional.hsigmoid_loss + # - paddle.nn.functional.instance_norm + # - paddle.nn.functional.interpolate + # - paddle.nn.functional.kl_div + # - paddle.nn.functional.l1_loss + # - paddle.nn.functional.label_smooth + # - paddle.nn.functional.layer_norm + # - paddle.nn.functional.leaky_relu + # - paddle.nn.functional.leaky_relu_ + # - paddle.nn.functional.linear + # - paddle.nn.functional.local_response_norm + # - paddle.nn.functional.log_loss + # - paddle.nn.functional.log_sigmoid + # - paddle.nn.functional.log_softmax + # - paddle.nn.functional.margin_cross_entropy + # - paddle.nn.functional.margin_ranking_loss + # - paddle.nn.functional.max_pool1d + # - paddle.nn.functional.max_pool2d + # - paddle.nn.functional.max_pool3d + # - paddle.nn.functional.max_unpool1d + # - paddle.nn.functional.max_unpool2d + # - paddle.nn.functional.max_unpool3d + # - paddle.nn.functional.maxout + # - paddle.nn.functional.mish + # - paddle.nn.functional.mse_loss + # - paddle.nn.functional.multi_label_soft_margin_loss + # - paddle.nn.functional.multi_margin_loss + # - paddle.nn.functional.nll_loss + # - paddle.nn.functional.norm + # - paddle.nn.functional.normalize + # - paddle.nn.functional.npair_loss + # - paddle.nn.functional.one_hot + # - paddle.nn.functional.pad + # - paddle.nn.functional.pairwise_distance + # - paddle.nn.functional.pdist + # - paddle.nn.functional.pixel_shuffle + # - paddle.nn.functional.pixel_unshuffle + # - paddle.nn.functional.poisson_nll_loss + # - paddle.nn.functional.pooling + # - paddle.nn.functional.prelu + # - paddle.nn.functional.relu + # - paddle.nn.functional.relu6 + # - paddle.nn.functional.relu_ + # - paddle.nn.functional.rnnt_loss + # - paddle.nn.functional.rrelu + # - paddle.nn.functional.sdp_kernel + # - paddle.nn.functional.selu + # - paddle.nn.functional.sequence_mask + # - paddle.nn.functional.sigmoid + # - paddle.nn.functional.sigmoid_focal_loss + # - paddle.nn.functional.silu + # - paddle.nn.functional.smooth_l1_loss + # - paddle.nn.functional.soft_margin_loss + # - paddle.nn.functional.softmax + # - paddle.nn.functional.softmax_ + # - paddle.nn.functional.softmax_with_cross_entropy + # - paddle.nn.functional.softplus + # - paddle.nn.functional.softshrink + # - paddle.nn.functional.softsign + # - paddle.nn.functional.sparse_attention + # - paddle.nn.functional.square_error_cost + # - paddle.nn.functional.swish + # - paddle.nn.functional.tanh + # - paddle.nn.functional.tanh_ + # - paddle.nn.functional.tanhshrink + # - paddle.nn.functional.temporal_shift + # - paddle.nn.functional.thresholded_relu + # - paddle.nn.functional.thresholded_relu_ + # - paddle.nn.functional.triplet_margin_loss + # - paddle.nn.functional.triplet_margin_with_distance_loss + # - paddle.nn.functional.unfold + # - paddle.nn.functional.upsample + # - paddle.nn.functional.zeropad2d + # - paddle.abs + # - paddle.abs_ + # - paddle.acos + # - paddle.acos_ + # - paddle.acosh + # - paddle.acosh_ + # - paddle.add + # - paddle.add_n + # - paddle.addmm + # - paddle.addmm_ + # - paddle.all + # - paddle.allclose + # - paddle.amax + # - paddle.amin + # - paddle.angle + # - paddle.any + # - paddle.arange + # - paddle.argmax + # - paddle.argmin + # - paddle.argsort + # - paddle.as_complex + # - paddle.as_real + # - paddle.as_strided + # - paddle.asin + # - paddle.asin_ + # - paddle.asinh + # - paddle.asinh_ + # - paddle.assign + # - paddle.atan + # - paddle.atan2 + # - paddle.atan_ + # - paddle.atanh + # - paddle.atanh_ + # - paddle.atleast_1d + # - paddle.atleast_2d + # - paddle.atleast_3d + # - paddle.bernoulli + # - paddle.bincount + # - paddle.binomial + # - paddle.bitwise_and + # - paddle.bitwise_and_ + # - paddle.bitwise_not + # - paddle.bitwise_not_ + # - paddle.bitwise_or + # - paddle.bitwise_or_ + # - paddle.bitwise_xor + # - paddle.bitwise_xor_ + # - paddle.bmm + # - paddle.broadcast_shape + # - paddle.broadcast_tensors + # - paddle.broadcast_to + # - paddle.cauchy_ + # - paddle.cast + # - paddle.cdist + # - paddle.ceil + # - paddle.cholesky + # - paddle.chunk + # - paddle.clip + # - paddle.column_stack + # - paddle.combinations + # - paddle.concat + # - paddle.conj + # - paddle.copysign + # - paddle.copysign_ + # - paddle.cos + # - paddle.cos_ + # - paddle.cosh + # - paddle.cosh_ + # - paddle.count_nonzero + # - paddle.crop + # - paddle.cross + # - paddle.cummax + # - paddle.cummin + # - paddle.cumprod + # - paddle.cumprod_ + # - paddle.cumsum + # - paddle.cumsum_ + # - paddle.cumulative_trapezoid + # - paddle.decomposition + # - paddle.deg2rad + # - paddle.diag + # - paddle.diag_embed + # - paddle.diagflat + # - paddle.diagonal + # - paddle.diagonal_scatter + # - paddle.diff + # - paddle.digamma + # - paddle.digamma_ + # - paddle.divide + # - paddle.divide_ + # - paddle.dot + # - paddle.dsplit + # - paddle.dstack + # - paddle.eigvalsh + # - paddle.einsum + # - paddle.equal + # - paddle.equal_all + # - paddle.erf + # - paddle.erf_ + # - paddle.erfinv + # - paddle.exp + # - paddle.expand + # - paddle.expand_as + # - paddle.expm1 + # - paddle.expm1_ + # - paddle.eye + # - paddle.fft + # - paddle.flatten + # - paddle.flatten_ + # - paddle.flip + # - paddle.floor + # - paddle.floor_divide + # - paddle.floor_divide_ + # - paddle.floor_mod + # - paddle.floor_mod_ + # - paddle.fmax + # - paddle.fmin + # - paddle.frac + # - paddle.frac_ + # - paddle.frexp + # - paddle.full + # - paddle.full_like + # - paddle.gather + # - paddle.gather_nd + # - paddle.gcd + # - paddle.gcd_ + # - paddle.greater_equal + # - paddle.greater_equal_ + # - paddle.greater_than + # - paddle.greater_than_ + # - paddle.heaviside + # - paddle.histogram + # - paddle.histogramdd + # - paddle.hsplit + # - paddle.hstack + # - paddle.hypot + # - paddle.hypot_ + # - paddle.i0 + # - paddle.i0_ + # - paddle.i0e + # - paddle.i1 + # - paddle.i1e + # - paddle.imag + # - paddle.increment + # - paddle.index_add + # - paddle.index_add_ + # - paddle.index_fill + # - paddle.index_fill_ + # - paddle.index_put + # - paddle.index_put_ + # - paddle.index_sample + # - paddle.index_select + # - paddle.inner + # - paddle.kron + # - paddle.kthvalue + # - paddle.lcm + # - paddle.lcm_ + # - paddle.ldexp + # - paddle.ldexp_ + # - paddle.lerp + # - paddle.less_equal + # - paddle.less_equal_ + # - paddle.less_than + # - paddle.less_than_ + # - paddle.lgamma + # - paddle.lgamma_ + # - paddle.linalg + # - paddle.linspace + # - paddle.log + # - paddle.log10 + # - paddle.log10_ + # - paddle.log1p + # - paddle.log1p_ + # - paddle.log2 + # - paddle.log2_ + # - paddle.log_ + # - paddle.logaddexp + # - paddle.logcumsumexp + # - paddle.logical_and + # - paddle.logical_and_ + # - paddle.logical_not + # - paddle.logical_not_ + # - paddle.logical_or + # - paddle.logical_or_ + # - paddle.logical_xor + # - paddle.logical_xor_ + # - paddle.logit + # - paddle.logit_ + # - paddle.logspace + # - paddle.logsumexp + # - paddle.masked_fill + # - paddle.masked_fill_ + # - paddle.masked_scatter + # - paddle.masked_scatter_ + # - paddle.masked_select + # - paddle.matmul + # - paddle.max + # - paddle.maximum + # - paddle.mean + # - paddle.median + # - paddle.meshgrid + # - paddle.min + # - paddle.minimum + # - paddle.mm + # - paddle.mod + # - paddle.mod_ + # - paddle.mode + # - paddle.moveaxis + # - paddle.multigammaln + # - paddle.multigammaln_ + # - paddle.multinomial + # - paddle.multiplex + # - paddle.multiply + # - paddle.multiply_ + # - paddle.mv + # - paddle.nan_to_num + # - paddle.nan_to_num_ + # - paddle.nanmean + # - paddle.nanmedian + # - paddle.nanquantile + # - paddle.nansum + # - paddle.neg + # - paddle.neg_ + # - paddle.nextafter + # - paddle.nonzero + # - paddle.normal + # - paddle.normal_ + # - paddle.not_equal + # - paddle.not_equal_ + # - paddle.numel + # - paddle.outer + # - paddle.pdist + # - paddle.poisson + # - paddle.polar + # - paddle.polygamma + # - paddle.polygamma_ + # - paddle.pow + # - paddle.pow_ + # - paddle.prod + # - paddle.put_along_axis + # - paddle.quantile + # - paddle.rad2deg + # - paddle.rand + # - paddle.randint + # - paddle.randint_like + # - paddle.randn + # - paddle.randperm + # - paddle.reader + # - paddle.real + # - paddle.reciprocal + # - paddle.regularizer + # - paddle.remainder + # - paddle.remainder_ + # - paddle.renorm + # - paddle.renorm_ + # - paddle.repeat_interleave + # - paddle.roll + # - paddle.rot90 + # - paddle.round + # - paddle.row_stack + # - paddle.rsqrt + # - paddle.scale + # - paddle.scatter + # - paddle.scatter_ + # # - paddle.scatter_nd # cause CUDA_ERROR ignored. + # # - paddle.scatter_nd_add + # - paddle.searchsorted + # - paddle.select_scatter + # - paddle.sgn + # - paddle.shard_index + # - paddle.sign + # - paddle.signal + # - paddle.signbit + # - paddle.sin + # - paddle.sin_ + # - paddle.sinh + # - paddle.sinh_ + # - paddle.slice + # # - paddle.slice_scatter + # - paddle.sort + # - paddle.sqrt + # - paddle.square + # - paddle.standard_gamma + # - paddle.standard_normal + # - paddle.stanh + # - paddle.strided_slice + # - paddle.subtract + # - paddle.sum + # - paddle.t + # - paddle.t_ + # - paddle.take + # - paddle.take_along_axis + # - paddle.tan + # - paddle.tan_ + # - paddle.tanh + # - paddle.tanh_ + # - paddle.tensordot + # - paddle.tile + # - paddle.topk + # - paddle.trace + # - paddle.transpose + # - paddle.transpose_ + # - paddle.trapezoid + # - paddle.tril + # - paddle.tril_ + # - paddle.tril_indices + # - paddle.triu + # - paddle.triu_ + # - paddle.triu_indices + # - paddle.trunc + # - paddle.trunc_ + # - paddle.unbind + # - paddle.unflatten + # - paddle.unfold + # - paddle.uniform + # - paddle.unique + # - paddle.unique_consecutive + # - paddle.unstack + # - paddle.vander + # - paddle.var + # - paddle.view + # - paddle.view_as + # - paddle.vsplit + # - paddle.where + # - paddle.where_ + # - paddle.zeros + # - paddle.zeros_like + # - paddle.Tensor.T + # - paddle.Tensor.__add__ + # - paddle.Tensor.__and__ + # - paddle.Tensor.__radd__ + # - paddle.Tensor.__div__ + # - paddle.Tensor.__eq__ + # - paddle.Tensor.__floordiv__ + # - paddle.Tensor.__ge__ + # - paddle.Tensor.__gt__ + # - paddle.Tensor.__le__ + # - paddle.Tensor.__lt__ + # - paddle.Tensor.__matmul__ + # - paddle.Tensor.__mod__ + # - paddle.Tensor.__mul__ + # - paddle.Tensor.__ne__ + # - paddle.Tensor.__neg__ + # - paddle.Tensor.__nonzero__ + # - paddle.Tensor.__or__ + # - paddle.Tensor.__pow__ + # - paddle.Tensor.__radd__ + # - paddle.Tensor.__rdiv__ + # - paddle.Tensor.__rmul__ + # - paddle.Tensor.__rpow__ + # - paddle.Tensor.__rsub__ + # - paddle.Tensor.__rtruediv__ + # - paddle.Tensor.__sub__ + # - paddle.Tensor.__truediv__ + # - paddle.Tensor.__xor__ + # - paddle.Tensor.abs + # - paddle.Tensor.abs_ + # - paddle.Tensor.acos + # - paddle.Tensor.acos_ + # - paddle.Tensor.acosh + # - paddle.Tensor.acosh_ + # - paddle.Tensor.add + # - paddle.Tensor.add_ + # - paddle.Tensor.add_n + # - paddle.Tensor.addmm + # - paddle.Tensor.addmm_ + # - paddle.Tensor.all + # - paddle.Tensor.allclose + # - paddle.Tensor.amax + # - paddle.Tensor.amin + # - paddle.Tensor.angle + # - paddle.Tensor.any + # - paddle.Tensor.argmax + # - paddle.Tensor.argmin + # - paddle.Tensor.argsort + # - paddle.Tensor.as_complex + # - paddle.Tensor.as_real + # - paddle.Tensor.as_strided + # - paddle.Tensor.asin + # - paddle.Tensor.asin_ + # - paddle.Tensor.asinh + # - paddle.Tensor.asinh_ + # - paddle.Tensor.atan + # - paddle.Tensor.atan2 + # - paddle.Tensor.atan_ + # - paddle.Tensor.atanh + # - paddle.Tensor.atanh_ + # - paddle.Tensor.atleast_1d + # - paddle.Tensor.atleast_2d + # - paddle.Tensor.atleast_3d + # - paddle.Tensor.bincount + # - paddle.Tensor.bitwise_and + # - paddle.Tensor.bitwise_and_ + # - paddle.Tensor.bitwise_not + # - paddle.Tensor.bitwise_not_ + # - paddle.Tensor.bitwise_or + # - paddle.Tensor.bitwise_or_ + # - paddle.Tensor.bitwise_xor + # - paddle.Tensor.bitwise_xor_ + # - paddle.Tensor.bmm + # - paddle.Tensor.broadcast_shape + # - paddle.Tensor.broadcast_tensors + # - paddle.Tensor.broadcast_to + # - paddle.Tensor.cauchy_ + # - paddle.Tensor.cdist + # - paddle.Tensor.ceil + # - paddle.Tensor.ceil_ + # - paddle.Tensor.cholesky + # - paddle.Tensor.cholesky_solve + # - paddle.Tensor.clip + # - paddle.Tensor.clip_ + # - paddle.Tensor.coalesce + # - paddle.Tensor.cols + # - paddle.Tensor.combinations + # - paddle.Tensor.concat + # - paddle.Tensor.cond + # - paddle.Tensor.conj + # - paddle.Tensor.contiguous + # - paddle.Tensor.corrcoef + # - paddle.Tensor.cos + # - paddle.Tensor.cos_ + # - paddle.Tensor.cosh + # - paddle.Tensor.cosh_ + # - paddle.Tensor.count_nonzero + # - paddle.Tensor.cov + # - paddle.Tensor.cross + # - paddle.Tensor.crows + # - paddle.Tensor.cummax + # - paddle.Tensor.cummin + # - paddle.Tensor.cumprod + # - paddle.Tensor.cumprod_ + # - paddle.Tensor.cumsum + # - paddle.Tensor.cumsum_ + # - paddle.Tensor.cumulative_trapezoid + # - paddle.Tensor.deg2rad + # - paddle.Tensor.diag + # - paddle.Tensor.diag_embed + # - paddle.Tensor.diagflat + # - paddle.Tensor.diagonal + # - paddle.Tensor.diagonal_scatter + # - paddle.Tensor.diff + # - paddle.Tensor.digamma + # - paddle.Tensor.digamma_ + # - paddle.Tensor.divide + # - paddle.Tensor.divide_ + # - paddle.Tensor.dot + # - paddle.Tensor.eig + # - paddle.Tensor.eigvals + # - paddle.Tensor.eigvalsh + # - paddle.Tensor.equal + # - paddle.Tensor.equal_all + # - paddle.Tensor.erf + # - paddle.Tensor.erfinv + # - paddle.Tensor.erfinv_ + # - paddle.Tensor.exp + # - paddle.Tensor.exp_ + # - paddle.Tensor.expand + # - paddle.Tensor.expand_as + # - paddle.Tensor.expm1 + # - paddle.Tensor.exponential_ + # - paddle.Tensor.fill_ + # - paddle.Tensor.fill_diagonal_ + # - paddle.Tensor.fill_diagonal_tensor + # - paddle.Tensor.fill_diagonal_tensor_ + # - paddle.Tensor.flatten + # - paddle.Tensor.flatten_ + # - paddle.Tensor.flip + # - paddle.Tensor.floor + # - paddle.Tensor.floor_ + # - paddle.Tensor.floor_divide + # - paddle.Tensor.floor_divide_ + # - paddle.Tensor.floor_mod + # - paddle.Tensor.floor_mod_ + # - paddle.Tensor.fmax + # - paddle.Tensor.fmin + # - paddle.Tensor.frac + # - paddle.Tensor.frac_ + # - paddle.Tensor.frexp + # - paddle.Tensor.gather + # - paddle.Tensor.gather_nd + # - paddle.Tensor.gcd + # - paddle.Tensor.gcd_ + # - paddle.Tensor.get_selected_rows + # - paddle.Tensor.get_strides + # - paddle.Tensor.greater_equal + # - paddle.Tensor.greater_equal_ + # - paddle.Tensor.greater_than + # - paddle.Tensor.greater_than_ + # - paddle.Tensor.heaviside + # - paddle.Tensor.histogram + # - paddle.Tensor.histogramdd + # - paddle.Tensor.hsplit + # - paddle.Tensor.hypot + # - paddle.Tensor.hypot_ + # - paddle.Tensor.i0 + # - paddle.Tensor.i0_ + # - paddle.Tensor.i0e + # - paddle.Tensor.i1 + # - paddle.Tensor.i1e + # - paddle.Tensor.imag + # - paddle.Tensor.increment + # - paddle.Tensor.index_add + # - paddle.Tensor.index_add_ + # - paddle.Tensor.index_fill + # - paddle.Tensor.index_fill_ + # - paddle.Tensor.index_put + # - paddle.Tensor.index_put_ + # - paddle.Tensor.index_sample + # - paddle.Tensor.index_select + # - paddle.Tensor.inner + # - paddle.Tensor.kron + # - paddle.Tensor.kthvalue + # - paddle.Tensor.layout + # - paddle.Tensor.lcm + # - paddle.Tensor.lcm_ + # - paddle.Tensor.ldexp + # - paddle.Tensor.ldexp_ + # - paddle.Tensor.lerp + # - paddle.Tensor.lerp_ + # - paddle.Tensor.less_equal + # - paddle.Tensor.less_equal_ + # - paddle.Tensor.less_than + # - paddle.Tensor.less_than_ + # - paddle.Tensor.lgamma + # - paddle.Tensor.lgamma_ + # - paddle.Tensor.log + # - paddle.Tensor.log10 + # - paddle.Tensor.log10_ + # - paddle.Tensor.log1p + # - paddle.Tensor.log1p_ + # - paddle.Tensor.log2 + # - paddle.Tensor.log2_ + # - paddle.Tensor.log_ + # - paddle.Tensor.logaddexp + # - paddle.Tensor.logcumsumexp + # - paddle.Tensor.logical_and + # - paddle.Tensor.logical_and_ + # - paddle.Tensor.logical_not + # - paddle.Tensor.logical_not_ + # - paddle.Tensor.logical_or + # - paddle.Tensor.logical_or_ + # - paddle.Tensor.logical_xor + # - paddle.Tensor.logical_xor_ + # - paddle.Tensor.logit + # - paddle.Tensor.logit_ + # - paddle.Tensor.logsumexp + # - paddle.Tensor.lstsq + # - paddle.Tensor.lu + # - paddle.Tensor.lu_unpack + # - paddle.Tensor.masked_fill + # - paddle.Tensor.masked_fill_ + # - paddle.Tensor.masked_select + # - paddle.Tensor.masked_scatter + # - paddle.Tensor.masked_scatter_ + # - paddle.Tensor.matmul + # - paddle.Tensor.matrix_power + # - paddle.Tensor.max + # - paddle.Tensor.maximum + # - paddle.Tensor.mean + # - paddle.Tensor.median + # - paddle.Tensor.min + # - paddle.Tensor.minimum + # - paddle.Tensor.mm + # - paddle.Tensor.mod + # - paddle.Tensor.mod_ + # - paddle.Tensor.mode + # - paddle.Tensor.moveaxis + # - paddle.Tensor.multi_dot + # - paddle.Tensor.multigammaln + # - paddle.Tensor.multigammaln_ + # - paddle.Tensor.multinomial + # - paddle.Tensor.multiplex + # - paddle.Tensor.multiply + # - paddle.Tensor.multiply_ + # - paddle.Tensor.mv + # - paddle.Tensor.nan_to_num + # - paddle.Tensor.nan_to_num_ + # - paddle.Tensor.nanmean + # - paddle.Tensor.nanmedian + # - paddle.Tensor.nanquantile + # - paddle.Tensor.nansum + # - paddle.Tensor.ndimension + # - paddle.Tensor.neg + # - paddle.Tensor.neg_ + # - paddle.Tensor.nnz + # - paddle.Tensor.nonzero + # - paddle.Tensor.norm + # - paddle.Tensor.normal_ + # - paddle.Tensor.not_equal + # - paddle.Tensor.not_equal_ + # - paddle.Tensor.numel + # - paddle.Tensor.offset + # - paddle.Tensor.outer + # - paddle.Tensor.pca_lowrank + # - paddle.Tensor.pinv + # - paddle.Tensor.polar + # - paddle.Tensor.polygamma + # - paddle.Tensor.polygamma_ + # - paddle.Tensor.pow + # - paddle.Tensor.pow_ + # - paddle.Tensor.process_mesh + # - paddle.Tensor.prod + # - paddle.Tensor.put_along_axis + # - paddle.Tensor.put_along_axis_ + # - paddle.Tensor.qr + # - paddle.Tensor.quantile + # - paddle.Tensor.rad2deg + # - paddle.Tensor.remainder + # - paddle.Tensor.remainder_ + # - paddle.Tensor.renorm + # - paddle.Tensor.renorm_ + # - paddle.Tensor.repeat_interleave + # - paddle.Tensor.reverse + # - paddle.Tensor.roll + # - paddle.Tensor.rot90 + # - paddle.Tensor.round + # - paddle.Tensor.round_ + # - paddle.Tensor.rows + # - paddle.Tensor.rsqrt + # - paddle.Tensor.rsqrt_ + # - paddle.Tensor.scale + # - paddle.Tensor.scale_ + # - paddle.Tensor.scatter + # - paddle.Tensor.scatter_ + # - paddle.Tensor.scatter_nd + # - paddle.Tensor.scatter_nd_add + # - paddle.Tensor.select_scatter + # - paddle.Tensor.sgn + # - paddle.Tensor.shard_index + # - paddle.Tensor.sigmoid + # - paddle.Tensor.sigmoid_ + # - paddle.Tensor.sign + # - paddle.Tensor.sin + # - paddle.Tensor.sin_ + # - paddle.Tensor.sinh + # - paddle.Tensor.sinh_ + # - paddle.Tensor.size + # - paddle.Tensor.slice + # - paddle.Tensor.solve + # - paddle.Tensor.sort + # - paddle.Tensor.split + # - paddle.Tensor.sqrt + # - paddle.Tensor.sqrt_ + # - paddle.Tensor.square + # - paddle.Tensor.stack + # - paddle.Tensor.stanh + # - paddle.Tensor.std + # - paddle.Tensor.stft + # - paddle.Tensor.strided_slice + # - paddle.Tensor.strides + # - paddle.Tensor.subtract + # - paddle.Tensor.subtract_ + # - paddle.Tensor.sum + # - paddle.Tensor.t + # - paddle.Tensor.t_ + # - paddle.Tensor.take + # - paddle.Tensor.take_along_axis + # - paddle.Tensor.tan + # - paddle.Tensor.tan_ + # - paddle.Tensor.tanh + # - paddle.Tensor.tanh_ + # - paddle.Tensor.tensordot + # - paddle.Tensor.tile + # - paddle.Tensor.top_p_sampling + # - paddle.Tensor.topk + # - paddle.Tensor.trace + # - paddle.Tensor.transpose + # - paddle.Tensor.transpose_ + # - paddle.Tensor.trapezoid + # - paddle.Tensor.tril + # - paddle.Tensor.tril_ + # - paddle.Tensor.triu + # - paddle.Tensor.triu_ + # - paddle.Tensor.trunc + # - paddle.Tensor.trunc_ + # - paddle.Tensor.unbind + # - paddle.Tensor.unflatten + # - paddle.Tensor.unfold + # - paddle.Tensor.uniform_ + # - paddle.Tensor.unique + # - paddle.Tensor.unique_consecutive + # - paddle.Tensor.unstack + # - paddle.Tensor.vander + # - paddle.Tensor.var + # - paddle.Tensor.view + # - paddle.Tensor.view_as + # - paddle.Tensor.vsplit + # - paddle.Tensor.where + # - paddle.Tensor.where_ + # - paddle._C_ops.fused_gemm_epilogue + # - paddle.optimizer.Adam + # - paddle.optimizer.AdamW + # - paddle._C_ops.adamw + # - paddle._C_ops.adamw_ + # - paddle._legacy_C_ops.fused_gemm_epilogue + # - paddle.incubate.nn.functional.fused_multi_head_attention + # - paddle.incubate.nn.functional.fused_feedforward + # - paddle.incubate.nn.functional.fused_multi_transformer + # - paddle.incubate.nn.functional.fused_linear + # - paddle.incubate.nn.functional.fused_linear_activation + # - paddle.incubate.nn.functional.fused_bias_dropout_residual_layer_norm + # - paddle.incubate.nn.functional.fused_ec_moe + # - paddle.incubate.nn.functional.fused_dropout_add + # - paddle.incubate.nn.functional.fused_rotary_position_embedding + # - paddle.incubate.nn.functional.variable_length_memory_efficient_attention + # - paddle.incubate.nn.functional.fused_rms_norm + # - paddle.incubate.nn.functional.fused_layer_norm + # - paddle.incubate.nn.functional.masked_multihead_attention + # - paddle.incubate.nn.functional.block_multihead_attention + # - paddle.incubate.nn.functional.swiglu + # - paddle.incubate.nn.functional.fused_matmul_bias + # - paddle.tensor.fill_constant + # - paddle.nn.clip._squared_l2_norm + # - paddle.uniform + # - paddle._C_ops.gaussian + # - paddle._legacy_C_ops.c_identity #fusion_ops: - paddlenlp.transformers.llama.fusion_ops.fusion_flash_attention - paddlenlp.transformers.llama.fusion_ops.fusion_rms_norm diff --git a/paddleapex/api_tracer/wrap_op/OPTemplate.py b/paddleapex/api_tracer/wrap_op/OPTemplate.py index 97049a5..17123ee 100644 --- a/paddleapex/api_tracer/wrap_op/OPTemplate.py +++ b/paddleapex/api_tracer/wrap_op/OPTemplate.py @@ -14,7 +14,7 @@ import paddle.distributed as dist import paddle from .. import config -from ..api_info import API, get_init_params, save_init_params_and_weight +from ..api_info import API, get_init_params, save_init_params_and_weight, save_init_params, save_weight import os from paddleapex.api_tracer.Dump import dump_util @@ -61,7 +61,7 @@ def create_output_attr(tensor, num): def hijack_call(self, *args, **kwargs): cls = self.__class__ - init_params = get_init_params(self) + # init_params = get_init_params(self) cfg.prefix_op_name_ = self.prefix_op_name_ + "*" if self.__class__.__name__ not in cfg.Op_count: cfg.Op_count[self.__class__.__name__] = 1 @@ -74,7 +74,9 @@ def hijack_call(self, *args, **kwargs): rank = dist.get_rank() api_recorder.update_APIInfo(cfg.prefix_op_name_, rank) api_recorder.update_real_data(args, kwargs) - save_init_params_and_weight(init_params, self.state_dict(), cfg.prefix_op_name_, rank) + # save_weight(self.state_dict(), cfg.prefix_op_name_, rank) + # save_init_params_and_weight(init_params, self.state_dict(), cfg.prefix_op_name_, rank) + save_init_params_and_weight(self.apex_init_params, self.state_dict(), cfg.prefix_op_name_, rank) output = self.forward(*args, **kwargs) try: out_num = 0 diff --git a/paddleapex/api_tracer/wrap_op/hijack_tool.py b/paddleapex/api_tracer/wrap_op/hijack_tool.py index 4360f25..d99a4f6 100644 --- a/paddleapex/api_tracer/wrap_op/hijack_tool.py +++ b/paddleapex/api_tracer/wrap_op/hijack_tool.py @@ -16,6 +16,8 @@ from ...utils import try_import from .get_target_op import GetTargetOP from .OPTemplate import OPTemplate, HookOp, hijack_call +import paddle.distributed as dist +from ..api_info import save_init_params cfg = config.cfg @@ -44,11 +46,18 @@ def hijack_api(): for class_in in target_class: parent_package, class_n = class_in.rsplit(".", maxsplit=1) + try: class_name, model = try_import(parent_package) model = getattr(model, class_n) model.prefix_op_name_ = class_in model.__call__ = hijack_call + ori__init__ = model.__init__ + def hijack_init(self, *args, **kwargs): + self.apex_init_params = [args, kwargs] + ori__init__(self, *args, **kwargs) + model.__init__ = hijack_init + except Exception as err: print(class_in, str(err)) From 24a92d0789866ea25f1e4ede826500efba6bbe20 Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Thu, 24 Apr 2025 14:07:07 +0800 Subject: [PATCH 20/22] add test for fc and fa --- clean.sh | 11 + paddleapex/apex/acc_direct_paddle.py | 13 +- paddleapex/apex/run_paddle.py | 6 +- paddleapex/api_tracer/Tracer.py | 6 +- paddleapex/api_tracer/config.py | 2 +- paddleapex/api_tracer/wrap_op/OPTemplate.py | 2 + paddleapex/api_tracer/wrap_op/hijack_tool.py | 13 +- run.sh | 8 + test.py | 63 +++++ test_fa.py | 112 ++++++++ test_fc.py | 257 +++++++++++++++++++ test_fc_gpu.py | 34 +++ 12 files changed, 511 insertions(+), 16 deletions(-) create mode 100644 clean.sh create mode 100644 run.sh create mode 100644 test.py create mode 100644 test_fa.py create mode 100644 test_fc.py create mode 100644 test_fc_gpu.py diff --git a/clean.sh b/clean.sh new file mode 100644 index 0000000..0040eed --- /dev/null +++ b/clean.sh @@ -0,0 +1,11 @@ +cd FC_INPUT/ +rm -rf * +cd .. +rm -rf dump_info/ +cd XPU/output +rm -rf * +cd ../output_backward +rm -rf * +cd ../../paddleapex/apex/ +rm -rf XPU +rm -rf GPU diff --git a/paddleapex/apex/acc_direct_paddle.py b/paddleapex/apex/acc_direct_paddle.py index 1c4a7e0..8bf3b62 100644 --- a/paddleapex/apex/acc_direct_paddle.py +++ b/paddleapex/apex/acc_direct_paddle.py @@ -81,10 +81,15 @@ def compare_command(args): details_csv_path = os.path.join(out_path, DETAILS_FILE_NAME) print_info_log(f"Compare task result will be saved in {result_csv_path}") print_info_log(f"Compare task details will be saved in {details_csv_path}") - bench_dir = os.path.join(args.bench_dir, "./rank_" + str(rank) + "/output") - device_dir = os.path.join(args.device_dir, "./rank_" + str(rank) + "/output") - bench_back_dir = os.path.join(args.bench_dir, "./rank_" + str(rank) + "/output_backward") - device_back_dir = os.path.join(args.device_dir, "./rank_" + str(rank) + "/output_backward") + #bench_dir = os.path.join(args.bench_dir, "./rank_" + str(rank) + "/output") + #device_dir = os.path.join(args.device_dir, "./rank_" + str(rank) + "/output") + #bench_back_dir = os.path.join(args.bench_dir, "./rank_" + str(rank) + "/output_backward") + #device_back_dir = os.path.join(args.device_dir, "./rank_" + str(rank) + "/output_backward") + bench_dir = os.path.join(args.bench_dir, "./output") + device_dir = os.path.join(args.device_dir, "./output") + bench_back_dir = os.path.join(args.bench_dir, "./output_backward") + device_back_dir = os.path.join(args.device_dir, "./output_backward") + compare_device_bench( result_csv_path, diff --git a/paddleapex/apex/run_paddle.py b/paddleapex/apex/run_paddle.py index f2b34f2..037f73d 100644 --- a/paddleapex/apex/run_paddle.py +++ b/paddleapex/apex/run_paddle.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddlenlp # if you wanna test nlp fusion operations +#import paddlenlp # if you wanna test nlp fusion operations import argparse import os from importlib import import_module @@ -814,8 +814,8 @@ def check_json(json_list): if os.path.exists(out_path): print_warn_log("The output path already exists and the file with the same name will be overwritten.") - from paddlenlp.trainer import set_seed - set_seed(1026) + #from paddlenlp.trainer import set_seed + #set_seed(1026) if cfg.distributed_op: if cfg.test_class: diff --git a/paddleapex/api_tracer/Tracer.py b/paddleapex/api_tracer/Tracer.py index be468b5..55abc98 100644 --- a/paddleapex/api_tracer/Tracer.py +++ b/paddleapex/api_tracer/Tracer.py @@ -27,14 +27,12 @@ def register_op(self): def start(self): # Evoke stop implicity. - if cfg.dump_state: - dump_util.dump() + # dump_util.dump() # global step counting. cfg.new_step() def stop(self): - if cfg.dump_state: - dump_util.dump() + dump_util.dump() def start_in_training(self, cur_step, acc): self.acc = acc diff --git a/paddleapex/api_tracer/config.py b/paddleapex/api_tracer/config.py index d8fb34b..ebacaca 100644 --- a/paddleapex/api_tracer/config.py +++ b/paddleapex/api_tracer/config.py @@ -41,7 +41,7 @@ def __init__(self) -> None: print("*" * 100) time.sleep(1) self.global_step = 0 - self.dump_state = False + self.dump_state = True self.Op_count = {} self.prefix_op_name_ = None diff --git a/paddleapex/api_tracer/wrap_op/OPTemplate.py b/paddleapex/api_tracer/wrap_op/OPTemplate.py index 17123ee..1652a99 100644 --- a/paddleapex/api_tracer/wrap_op/OPTemplate.py +++ b/paddleapex/api_tracer/wrap_op/OPTemplate.py @@ -110,6 +110,7 @@ def __init__(self, op_name): cfg.prefix_op_name_ = self.op_name_ + "*" def forward(self, *args, **kwargs): + print("OPTemplate", self.op_name_) if self.op_name_ not in cfg.Op_count: cfg.Op_count[self.op_name_] = 1 cfg.prefix_op_name_ += "0" @@ -119,6 +120,7 @@ def forward(self, *args, **kwargs): if cfg.dump_state: api_recorder = API(cfg.dump_mode) rank = dist.get_rank() + print("rank", rank) api_recorder.update_APIInfo(cfg.prefix_op_name_, rank) api_recorder.update_real_data(args, kwargs) output = getattr(HookOp, "wrap_" + str(self.op_name_))(*args, **kwargs) diff --git a/paddleapex/api_tracer/wrap_op/hijack_tool.py b/paddleapex/api_tracer/wrap_op/hijack_tool.py index d99a4f6..a5b61a3 100644 --- a/paddleapex/api_tracer/wrap_op/hijack_tool.py +++ b/paddleapex/api_tracer/wrap_op/hijack_tool.py @@ -32,6 +32,7 @@ def hijack_api(): op = GetTargetOP(cfg.op_target_pth) target_op = op.get_target_ops() target_class = op.get_target_class() + print("hijack api") for op_name in target_op: parent_package, method_name = op_name.rsplit(".", maxsplit=1) try: @@ -44,6 +45,13 @@ def hijack_api(): except Exception as err: print(op_name, str(err)) + print("hijack api") + for attr_name in dir(HookOp): + if attr_name.startswith("wrap_"): + parent_package, method_name = attr_name[5:].rsplit(".", maxsplit=1) + setattr(eval(parent_package), method_name, wrapped_op(attr_name[5:])) + + print("hijack class") for class_in in target_class: parent_package, class_n = class_in.rsplit(".", maxsplit=1) @@ -61,7 +69,4 @@ def hijack_init(self, *args, **kwargs): except Exception as err: print(class_in, str(err)) - for attr_name in dir(HookOp): - if attr_name.startswith("wrap_"): - parent_package, method_name = attr_name[5:].rsplit(".", maxsplit=1) - setattr(eval(parent_package), method_name, wrapped_op(attr_name[5:])) + diff --git a/run.sh b/run.sh new file mode 100644 index 0000000..83bb84f --- /dev/null +++ b/run.sh @@ -0,0 +1,8 @@ +export XPUAPI_DEBUG=0x1 +export CUDA_VISIBLE_DEVICES=7 +XPU_AUTO_BF16_TF32=1 XPU_PADDLE_FC_TF32=1 python test_fc.py +python test_fa.py +cd paddleapex/apex/ +python run_paddle.py -json ../../dump_info/rank0_step0/forward_rank0_all.json -real ../../dump_info/rank0_step0/ -out XPU -backend XPU -mode acc +cd ../../../linux-bcecmd-0.3.0 +./bcecmd --conf-path ./ bos sync ../PaddleAPEX bos:/baidu-kunlun-customer/NEW_FA_0312/ diff --git a/test.py b/test.py new file mode 100644 index 0000000..1f372a6 --- /dev/null +++ b/test.py @@ -0,0 +1,63 @@ +import paddle +import paddle.nn as nn + +import os + +def generate_true_random_number(num_bytes): + """ 生成真随机数,从 /dev/urandom 读取指定数量的字节 """ + # 从 /dev/urandom 读取 num_bytes 个字节 + random_bytes = os.urandom(num_bytes) + # 将字节转换为整数 + return int.from_bytes(random_bytes, 'big') + +# 生成一个真随机数 +#random_number = generate_true_random_number(4) # 读取 4 bytes +#print(random_number) + +from paddleapex import Tracer +checker = Tracer() +checker.register_op() + +checker.start() + +paddle.seed(42) +q_len = [131328, 147728, 106128, 128128, 120128] +#q_len = [10880, 12224, 1280, 1424, 15104, 1520, 1664, 16976, 18224, 1856, 1952, 20096, 22592, 23840, 24320, 2432, 27344, 29360, 30080, 8192, 9200, 9872] +#for i in q_len: +for i in range(20): + paddle.seed(int(generate_true_random_number(4))) + q = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) + paddle.seed(int(generate_true_random_number(6))) + k = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) + paddle.seed(int(generate_true_random_number(5))) + v = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) + q.stop_gradient = False + k.stop_gradient = False + v.stop_gradient = False + + output = paddle.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False, True) + output.stop_gradient = False + output.backward() + + +#q = paddle.rand((80, 1408, 32, 64), dtype=paddle.bfloat16) +#k = paddle.rand((80, 1408, 32, 64), dtype=paddle.bfloat16) +#v = paddle.rand((80, 1408, 32, 64), dtype=paddle.bfloat16) +#q.stop_gradient = False +#k.stop_gradient = False +#v.stop_gradient = False +# +#output = paddle.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False, True) +# +#dout = paddle.zeros_like(output) +#output.backward(dout) + +#q_len = [131328, 147728, 106128, 128128, 120128] +#for i in q_len: +# q = paddle.rand((1, i, 30, 64), dtype=paddle.bfloat16) +# k = paddle.rand((1, i, 30, 64), dtype=paddle.bfloat16) +# v = paddle.rand((1, i, 30, 64), dtype=paddle.bfloat16) +# output = paddle.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False, True) +# +checker.stop() +print(output) diff --git a/test_fa.py b/test_fa.py new file mode 100644 index 0000000..20b03d0 --- /dev/null +++ b/test_fa.py @@ -0,0 +1,112 @@ +import paddle +import paddle.nn as nn +import json + +import os + +def generate_true_random_number(num_bytes): + """ 生成真随机数,从 /dev/urandom 读取指定数量的字节 """ + # 从 /dev/urandom 读取 num_bytes 个字节 + random_bytes = os.urandom(num_bytes) + # 将字节转换为整数 + #print(int.from_bytes(random_bytes, 'big')) + return int.from_bytes(random_bytes, 'big') + + +# 路径到你的日志文件 +log_file_path = 'fa.log' + +# 初始化存储列表 +lod_seqlens_q_list = [] +head_num_list = [] +head_dim_list = [] +max_seqlen_q_list = [] + +def convert_to_list(s): + # 去除字符串中的花括号 + s = s.strip('{}') + # 分割字符串 + elements = s.split(',') + # 将字符串元素转换为整数,并返回列表 + return [int(element.strip()) for element in elements] + +# 读取和解析日志文件 +try: + with open(log_file_path, 'r') as file: + for line in file: + try: + # 将每行转换为JSON对象 + data = json.loads(line) + # 检查是否是目标操作 + if data.get('op') == 'mha_varlen_bwd': + params = data.get('params', {}) + seqlens_q = params.get('lod_seqlens_q', []) + #lod_seqlens_q_list.append(params.get('lod_seqlens_q', [])) + lod_seqlens_q_list.append(convert_to_list(seqlens_q)) + head_num_list.append(params.get('head_num', None)) + head_dim_list.append(params.get('head_dim', None)) + max_seqlen_q_list.append(params.get('max_seqlen_q', None)) + except json.JSONDecodeError: + print("Warning: Failed to decode JSON from line:", line) +except FileNotFoundError: + print(f"Error: The file {log_file_path} does not exist.") +except IOError as e: + print(f"Error: An I/O error occurred while reading {log_file_path}: {str(e)}") + + + +from paddleapex import Tracer +checker = Tracer() +checker.register_op() + +paddle.seed(46) +B = [] +for b in lod_seqlens_q_list: + if len(b) == 2: + B.append(1) + if len(b) > 2: + B.append(int(b[-1] / b[1])) +L = max_seqlen_q_list +H = head_num_list +D = head_dim_list + +# 输出结果 +print("LOD Sequence Lengths Q:", B) +print("Head Numbers:", H) +print("Head Dimensions:", D) +print("Max Sequence Length Q:", L) + + +for i in range(len(B)): + paddle.seed(int(generate_true_random_number(4))) + q = paddle.rand((B[i], L[i], H[i], D[i]), dtype=paddle.bfloat16) + paddle.seed(int(generate_true_random_number(4))) + k = paddle.rand((B[i], L[i], H[i], D[i]), dtype=paddle.bfloat16) + paddle.seed(int(generate_true_random_number(4))) + v = paddle.rand((B[i], L[i], H[i], D[i]), dtype=paddle.bfloat16) + q.stop_gradient = False + k.stop_gradient = False + v.stop_gradient = False + + output = paddle.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False, True) + output.stop_gradient = False + output.backward() + +#for i in range(10): +# paddle.seed(int(generate_true_random_number(4))) +# q = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) +# paddle.seed(int(generate_true_random_number(6))) +# k = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) +# paddle.seed(int(generate_true_random_number(5))) +# v = paddle.rand((1, 147728, 30, 64), dtype=paddle.bfloat16) +# q.stop_gradient = False +# k.stop_gradient = False +# v.stop_gradient = False +# +# output = paddle.nn.functional.scaled_dot_product_attention(q, k, v, None, 0.0, False, True) +# output.stop_gradient = False +# output.backward() + +checker.stop() + + diff --git a/test_fc.py b/test_fc.py new file mode 100644 index 0000000..25878ff --- /dev/null +++ b/test_fc.py @@ -0,0 +1,257 @@ +import paddle +import paddle.nn as nn +import json +import numpy as np +import os +import re +import math + +try: + from paddle_xpu.layers.nn import Linear +except ImportError: + from paddle.nn import Linear +#from paddle.nn import Linear + +import json + +log_file_path = 'workerlog.0.fc_fa_mean_max' + + +pattern = re.compile(r".*?cases(.*)") +mean_var = r"\[mean\](-?\d+\.\d+), \[max\](\d+\.\d+)" + + +# 初始化列表 +list_a = [] +list_b = [] +list_c = [] +list_d = [] +alpha_beta = [] +a_mean = [] +b_mean = [] +d_mean = [] +a_max = [] +b_max = [] +d_max = [] +n = 0 +try: + with open(log_file_path, 'r') as file: + log_lines = file.readlines() + + for i in range(len(log_lines)): + print("i", i) + line = log_lines[i] + if "kXPU3" not in log_lines[i]: + continue + if "float16" not in log_lines[i]: + continue + if "fc_fusion" in log_lines[i]: + run_mode = log_lines[i + 3] + if "desc.run_mode" in run_mode: + mean_match = re.search(mean_var, run_mode) + if mean_match: + if float(mean_match.group(1)) == 1: + continue + else: + print('------------------error----------------') + else: + continue + + a_line = log_lines[i + 1] + match = re.search(mean_var, a_line) + if match: + a_mean.append(float(match.group(1))) + a_max.append(float(match.group(2))) + b_line = log_lines[i + 2] + match = re.search(mean_var, b_line) + if match: + b_mean.append(float(match.group(1))) + b_max.append(float(match.group(2))) + #d_line = log_lines[i + 6] + #match = re.search(mean_var, d_line) + #if match: + # d_mean.append(float(match.group(1))) + # d_max.append(float(match.group(2))) + + if "fc_fusion" in log_lines[i]: + print(i) + #match = pattern.search(line) + #if 'cases' not in line: + # # 将字符串转换为字典 + # entry = json.loads(line) + #else: + # if match: + # json_part = match.group(1) + # entry = json.loads(json_part) + entry = json.loads(line) + # 检查是否为"fc_fusion"操作 + if entry.get("op") == "fc_fusion": + # 提取参数 + params = entry.get("params", {}) + desc = entry.get("desc", {}) + + # 分别提取a, b, c, d的rows和cols,并存储在各自的列表中 + if 'a' in params: + if params['a']['trans']: + list_a.append([params['a']["cols"], params['a']["rows"]]) + else: + list_a.append([params['a']["rows"], params['a']["cols"]]) + if 'b' in params: + if params['b']['trans']: + list_b.append([params['b']["cols"], params['b']["rows"]]) + else: + list_b.append([params['b']["rows"], params['b']["cols"]]) + if 'c' in params: + if params['c']['trans']: + list_c.append([params['c']["cols"], params['c']["rows"]]) + else: + list_c.append([params['c']["rows"], params['c']["cols"]]) + if 'd' in params: + if params['d']['trans']: + list_d.append([params['d']["cols"], params['d']["rows"]]) + else: + list_d.append([params['d']["rows"], params['d']["cols"]]) + + # 提取alpha和beta,转换为float,并存储 + alpha = float(desc.get("alpha", 0)) + beta = float(desc.get("beta", 0)) + alpha_beta.append([alpha, beta]) +except FileNotFoundError: + print(f"Error: The file {log_file_path} does not exist.") +except IOError as e: + print(f"Error: An I/O error occurred while reading {log_file_path}: {str(e)}") + +def generate_true_random_number(num_bytes): + """ 生成真随机数,从 /dev/urandom 读取指定数量的字节 """ + # 从 /dev/urandom 读取 num_bytes 个字节 + random_bytes = os.urandom(num_bytes) + # 将字节转换为整数 + #print(int.from_bytes(random_bytes, 'big')) + return int.from_bytes(random_bytes, 'big') + +# inf, nan +def get_rounded_num(x, round_up=True): + if math.isinf(x) or math.isnan(x): + msg = f"warning, x is inf or nan" + print(msg, x) + return x + if abs(x) <= 1e-10: + return 0 + + abs_x = abs(x) + log_x = math.log10(abs_x) + round_log_x = math.floor(log_x) if round_up ^ (x > 0) else math.ceil(log_x) + + result = 10**round_log_x + return result if x >= 0 else -result + +unique_string = [] +u_list_a = [] +u_list_b = [] +u_a_m = [] +u_a_v = [] +u_b_m = [] +u_b_v = [] + +for i in range(len(a_mean)): + a_m = get_rounded_num(a_mean[i]) + b_m = get_rounded_num(b_mean[i]) + a_v = get_rounded_num(a_max[i]) + b_v = get_rounded_num(b_max[i]) + new_string = str(list_a[i]) + str(list_b[i]) + str(a_m) + str(b_m) + str(a_v) + str(b_v) + if new_string not in unique_string: + unique_string.append(new_string) + u_list_a.append(list_a[i]) + u_list_b.append(list_b[i]) + u_a_m.append(a_m) + u_b_m.append(b_m) + u_a_v.append(a_v) + u_b_v.append(b_v) + +print("-------------------------------------------", len(u_list_a)) + +list_a = u_list_a +list_b = u_list_b +a_mean = u_a_m +b_mean = u_b_m +a_max = u_a_v +b_max = u_b_v + + +### 输出各个列表 +#print("List A:", list_a) +#print("List B:", list_b) +#print("List C:", list_c) +#print("List D:", list_d) +#print("Alpha and Beta:", alpha_beta) +#print("a_mean", a_mean) +#print("b_mean", b_mean) +#print("d_mean", d_mean) +#print("a_max", a_max) +#print("b_max", b_max) +#print("d_max", d_max) +# +#print("-------------------------------------------", len(a_mean)) +# +def generate_random_array(mean, max_value, shape, seed=None): + if seed is not None: + np.random.seed(seed) + # 首先生成标准正态分布的随机数组 + random_array = np.random.randn(*shape).astype(np.float32) + # 计算当前随机数组的最大值 + current_max = random_array.max() + # 计算缩放因子,使得新的最大值为给定的max_value + scale_factor = max_value / current_max + # 对数组进行缩放 + random_array *= scale_factor + # 计算当前数组的均值 + current_mean = random_array.mean() + # 计算偏移量,使得新的均值为给定的mean + shift_value = mean - current_mean + # 对数组进行偏移 + random_array += shift_value + return random_array + +#CREATE_DATA = True +#DEVICE = 'XPU' +##DEVICE = 'GPU' +# +## +#dtype = paddle.bfloat16 +#paddle.set_default_dtype(dtype) +# +#if DEVICE == 'XPU' and CREATE_DATA: +# for i in range(len(list_a)): +# print(i) +# #for i in range(1): +# paddle.seed(int(generate_true_random_number(4))) +# #x = np.random.uniform(-1, 1, list_a[i]).astype("float32") +# x = generate_random_array(a_mean[i], a_max[i], list_a[i], int(generate_true_random_number(4))) +# paddle.seed(int(generate_true_random_number(5))) +# #w = np.random.uniform(-1, 1, list_b[i]).astype("float32") +# w = generate_random_array(b_mean[i], b_max[i], list_b[i], int(generate_true_random_number(3))) +# paddle.seed(int(generate_true_random_number(6))) +# bias = np.random.uniform(-1, 1, [list_b[i][1]]).astype("float32") +# paddle.seed(int(generate_true_random_number(5))) +# out_grad = np.random.uniform(-1, 1, [list_a[i][0], list_b[i][1]]).astype("float32") +# #out_grad = generate_random_array(d_mean[i], d_max[i], [list_a[i][0], list_b[i][1]], int(generate_true_random_number(3))) +# x = paddle.to_tensor(x, stop_gradient=False).cast(dtype) +# w = paddle.to_tensor(w, stop_gradient=False).cast(dtype) +# bias = paddle.to_tensor(bias, stop_gradient=False).cast(dtype) +# out_grad = paddle.to_tensor(out_grad, stop_gradient=True) +# paddle.save([x, w, bias, out_grad], 'FC_INPUT/linear_' + str(i)) +# +#for i in range(len(list_a)): +##for i in range(1): +# print(i) +# x, w, bias, out_grad = paddle.load('FC_INPUT/linear_' + str(i)) +# linear = Linear(w.shape[0], w.shape[1], bias_attr=True) +# linear.weight.set_value(w) +# linear.bias.set_value(bias) +# linear.train() +# +# out = linear(x) +# #paddle.save([True, [out]], 'XPU/output/linear_' + str(i)) +# out = paddle.cast(out, "float32") +# paddle.autograd.backward(tensors=[out], grad_tensors=[out_grad]) +# paddle.save([True, [linear.weight.grad, linear.bias.grad]], 'XPU/output_backward/linear_' + str(i)) diff --git a/test_fc_gpu.py b/test_fc_gpu.py new file mode 100644 index 0000000..a245dd7 --- /dev/null +++ b/test_fc_gpu.py @@ -0,0 +1,34 @@ +import paddle +import paddle.nn as nn +import json +import numpy as np +import os +import re + +try: + from paddle_xpu.layers.nn import Linear +except ImportError: + from paddle.nn import Linear + +import json +CREATE_DATA = False +#DEVICE = 'XPU' +DEVICE = 'GPU' + +dtype = paddle.bfloat16 +paddle.set_default_dtype(dtype) + +for i in range(len(os.listdir('FC_INPUT/'))): +#for i in range(1): + print(i) + x, w, bias, out_grad = paddle.load('FC_INPUT/linear_' + str(i)) + linear = Linear(w.shape[0], w.shape[1], bias_attr=True) + linear.weight.set_value(w) + linear.bias.set_value(bias) + linear.train() + + out = linear(x) + paddle.save([True, [out]], 'GPU/output/linear_' + str(i)) + out = paddle.cast(out, "float32") + paddle.autograd.backward(tensors=[out], grad_tensors=[out_grad]) + paddle.save([True, [linear.weight.grad, linear.bias.grad]], 'GPU/output_backward/linear_' + str(i)) From b8737271dd3d597e6c0632687b05a3a750720c5b Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Tue, 29 Apr 2025 10:36:33 +0800 Subject: [PATCH 21/22] add tests for conv --- test_conv.py | 163 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 test_conv.py diff --git a/test_conv.py b/test_conv.py new file mode 100644 index 0000000..26db349 --- /dev/null +++ b/test_conv.py @@ -0,0 +1,163 @@ +import paddle +import paddle.nn as nn +import json +import numpy as np +import os +import re + + +dtype = paddle.bfloat16 +paddle.set_default_dtype(dtype) + +log_file_path = 'conv.log' +pattern = re.compile(r".*?cases(.*)") + +# 初始化列表 +list_n = [] +list_c = [] +list_xh = [] +list_xw = [] +list_f = [] +list_ksize = [] +list_stride = [] +list_pad = [] +list_dilation = [] +list_group = [] +list_nchw = [] + +n = 0 +try: + with open(log_file_path, 'r') as file: + for line in file: + n = n + 1 + print(n) + entry = json.loads(line) + # 检查是否为"fc_fusion"操作 + if entry.get("op") == "cudnn_conv2d_grad": + # 提取参数 + params = entry.get("params", {}) + types = entry.get("desc", {}) + + if 'n' in params: + list_n.append(params['n']) + if 'c' in params: + list_c.append(params['c']) + if 'xh' in params: + list_xh.append(params['xh']) + elif 'h' in params: + list_xh.append(params['h']) + if 'xw' in params: + list_xw.append(params['xw']) + elif 'w' in params: + list_xw.append(params['w']) + if 'f' in params: + list_f.append(params['f']) + if '_ksize' in params: + list_ksize.append(params['_ksize']) + if '_stride' in params: + list_stride.append(params['_stride']) + if '_pad' in params: + list_pad.append(params['_pad']) + if '_dilation' in params: + list_dilation.append(params['_dilation']) + if 'group' in params: + list_group.append(params['group']) + if 'is_nchw' in params: + list_nchw.append(params['is_nchw']) +except FileNotFoundError: + print(f"Error: The file {log_file_path} does not exist.") +except IOError as e: + print(f"Error: An I/O error occurred while reading {log_file_path}: {str(e)}") + +def generate_true_random_number(num_bytes): + """ 生成真随机数,从 /dev/urandom 读取指定数量的字节 """ + # 从 /dev/urandom 读取 num_bytes 个字节 + random_bytes = os.urandom(num_bytes) + # 将字节转换为整数 + #print(int.from_bytes(random_bytes, 'big')) + return int.from_bytes(random_bytes, 'big') + +CREATE_DATA = False +DEVICE = 'XPU' +#DEVICE = 'GPU' + +# 输出各个列表 +print("List n:", list_n) +print("List c:", list_c) +print("List xh:", list_xh) +print("List xw:", list_xw) +print("List f:", list_f) +print("List ksize:", list_ksize) +print("List stride:", list_stride) +print("List pad:", list_pad) +print("List dilation:", list_dilation) +print("List group:", list_group) +print("List nchw:", list_nchw) + +x_shape = [] +f_shape = [] +y_shape = [] +for i in range(len(list_n)): + n = list_n[i] + c = list_c[i] + xh = list_xh[i] + xw = list_xw[i] + f = list_f[i] + ksize = list_ksize[i] + pad = list_pad[i] + dilation = list_dilation[i] + stride = list_stride[i] + + x_shape.append([n, c, xh, xw]) + f_shape.append([ksize[0], ksize[1], c, f]) + h_out = (xh + 2 * pad[0] - (dilation[0] * (ksize[0] - 1) + 1)) / stride[0] + 1 + w_out = (xw + 2 * pad[1] - (dilation[1] * (ksize[1] - 1) + 1)) / stride[1] + 1 + y_shape.append([n, f, int(h_out), int(w_out)]) + +print(x_shape) +print(f_shape) +print(y_shape) + +dtype = paddle.bfloat16 +paddle.set_default_dtype(dtype) +# +if DEVICE == 'XPU' and CREATE_DATA: + for i in range(len(list_n)): + #for i in range(1): + paddle.seed(int(generate_true_random_number(4))) + x = np.random.uniform(-1, 1, x_shape[i]).astype("float32") + paddle.seed(int(generate_true_random_number(5))) + f = np.random.uniform(-1, 1, f_shape[i]).astype("float32") + paddle.seed(int(generate_true_random_number(6))) + out_grad = np.random.uniform(-1, 1, y_shape[i]).astype("float32") + x = paddle.to_tensor(x, stop_gradient=False).cast(dtype) + f = paddle.to_tensor(f, stop_gradient=False).cast(dtype) + out_grad = paddle.to_tensor(out_grad, stop_gradient=True).cast(dtype) + paddle.save([x, f, out_grad], 'CONV_INPUT/conv_' + str(i)) + +for i in range(len(list_n)): +#for i in range(1): + x, f, out_grad = paddle.load('CONV_INPUT/conv_' + str(i)) + conv = nn.Conv2D(list_c[i], list_f[i], kernel_size=list_ksize[i], + stride=list_stride[i], padding=list_pad[i], dilation=list_dilation[i], + groups=list_group[i]) + if list_group[i] == 1: + conv.weight.set_value(f) + conv.train() + + out = conv(x) + #paddle.save([True, [out]], 'XPU/output/conv_' + str(i)) + #out = paddle.cast(out, "float32") + paddle.autograd.backward(tensors=[out], grad_tensors=[out_grad]) + #paddle.save([True, [linear.weight.grad, linear.bias.grad]], 'XPU/output_backward/linear_' + str(i)) +# +# +#x = paddle.uniform((1, 1, 1, 51200), dtype=dtype, min=-1., max=1.) +#x.stop_gradient = False +#conv = nn.Conv2D(1, 512, kernel_size=[1, 10], stride=[1, 5], padding=[0,0,0,0], dilation=[1,1], groups=1) +#y = conv(x) +#print(y.shape) +#y.stop_gradient = False +#out_grad = paddle.uniform(y.shape, dtype=dtype, min=-1., max=1.) +#paddle.autograd.backward(tensors=[y], grad_tensors=[out_grad]) +#print(x.grad) From 03788f8ca27f563f6ef03777bab9e30be3bb573a Mon Sep 17 00:00:00 2001 From: zhouquan32 <458178300@qq.com> Date: Wed, 7 May 2025 16:59:30 +0800 Subject: [PATCH 22/22] add baseline for test_fc --- test_fc.py | 228 ++++++++++++++++++++++--------------------------- test_fc_gpu.py | 35 ++++++-- tools.py | 61 +++++++++++++ 3 files changed, 193 insertions(+), 131 deletions(-) create mode 100644 tools.py diff --git a/test_fc.py b/test_fc.py index 25878ff..d3a420f 100644 --- a/test_fc.py +++ b/test_fc.py @@ -6,21 +6,54 @@ import re import math +from tools import * + try: from paddle_xpu.layers.nn import Linear except ImportError: from paddle.nn import Linear #from paddle.nn import Linear -import json - -log_file_path = 'workerlog.0.fc_fa_mean_max' - +#log_file_path = 'workerlog.0.fc_fa_mean_max' +log_file_path = 'fc.log' pattern = re.compile(r".*?cases(.*)") mean_var = r"\[mean\](-?\d+\.\d+), \[max\](\d+\.\d+)" +input_file = 'FC_INPUT' +xpu_file = 'XPU' +cal_out_file = xpu_file + '/output' +cal_out_back_file = xpu_file + '/output_backward' +base_out_file = 'BASE/output' +base_out_back_file = 'BASE/output_backward' + +createDir(input_file) +createDir(xpu_file) + +CREATE_DATA = False +DEVICE = 'XPU' +#DEVICE = 'GPU' +base_type = paddle.float32 +dtype = paddle.bfloat16 +#calculate_type = paddle.bfloat16 +calculate_type = base_type + + +if calculate_type == base_type: + createDir(base_out_file) + out_file = base_out_file + createDir(base_out_back_file) + out_back_file = base_out_back_file +else: + createDir(cal_out_file) + out_file = cal_out_file + createDir(cal_out_back_file) + out_back_file = cal_out_back_file + +paddle.set_default_dtype(calculate_type) + + # 初始化列表 list_a = [] list_b = [] @@ -45,33 +78,33 @@ continue if "float16" not in log_lines[i]: continue - if "fc_fusion" in log_lines[i]: - run_mode = log_lines[i + 3] - if "desc.run_mode" in run_mode: - mean_match = re.search(mean_var, run_mode) - if mean_match: - if float(mean_match.group(1)) == 1: - continue - else: - print('------------------error----------------') - else: - continue - - a_line = log_lines[i + 1] - match = re.search(mean_var, a_line) - if match: - a_mean.append(float(match.group(1))) - a_max.append(float(match.group(2))) - b_line = log_lines[i + 2] - match = re.search(mean_var, b_line) - if match: - b_mean.append(float(match.group(1))) - b_max.append(float(match.group(2))) - #d_line = log_lines[i + 6] - #match = re.search(mean_var, d_line) + #if "fc_fusion" in log_lines[i]: + # run_mode = log_lines[i + 3] + # if "desc.run_mode" in run_mode: + # mean_match = re.search(mean_var, run_mode) + # if mean_match: + # if float(mean_match.group(1)) == 1: + # continue + # else: + # print('------------------error----------------') + #else: + # continue + + #a_line = log_lines[i + 1] + #match = re.search(mean_var, a_line) + #if match: + # a_mean.append(float(match.group(1))) + # a_max.append(float(match.group(2))) + #b_line = log_lines[i + 2] + #match = re.search(mean_var, b_line) #if match: - # d_mean.append(float(match.group(1))) - # d_max.append(float(match.group(2))) + # b_mean.append(float(match.group(1))) + # b_max.append(float(match.group(2))) + ##d_line = log_lines[i + 6] + ##match = re.search(mean_var, d_line) + ##if match: + ## d_mean.append(float(match.group(1))) + ## d_max.append(float(match.group(2))) if "fc_fusion" in log_lines[i]: print(i) @@ -121,29 +154,6 @@ except IOError as e: print(f"Error: An I/O error occurred while reading {log_file_path}: {str(e)}") -def generate_true_random_number(num_bytes): - """ 生成真随机数,从 /dev/urandom 读取指定数量的字节 """ - # 从 /dev/urandom 读取 num_bytes 个字节 - random_bytes = os.urandom(num_bytes) - # 将字节转换为整数 - #print(int.from_bytes(random_bytes, 'big')) - return int.from_bytes(random_bytes, 'big') - -# inf, nan -def get_rounded_num(x, round_up=True): - if math.isinf(x) or math.isnan(x): - msg = f"warning, x is inf or nan" - print(msg, x) - return x - if abs(x) <= 1e-10: - return 0 - - abs_x = abs(x) - log_x = math.log10(abs_x) - round_log_x = math.floor(log_x) if round_up ^ (x > 0) else math.ceil(log_x) - - result = 10**round_log_x - return result if x >= 0 else -result unique_string = [] u_list_a = [] @@ -170,20 +180,20 @@ def get_rounded_num(x, round_up=True): print("-------------------------------------------", len(u_list_a)) -list_a = u_list_a -list_b = u_list_b -a_mean = u_a_m -b_mean = u_b_m -a_max = u_a_v -b_max = u_b_v +#list_a = u_list_a +#list_b = u_list_b +#a_mean = u_a_m +#b_mean = u_b_m +#a_max = u_a_v +#b_max = u_b_v ### 输出各个列表 -#print("List A:", list_a) -#print("List B:", list_b) -#print("List C:", list_c) -#print("List D:", list_d) -#print("Alpha and Beta:", alpha_beta) +print("List A:", list_a) +print("List B:", list_b) +print("List C:", list_c) +print("List D:", list_d) +print("Alpha and Beta:", alpha_beta) #print("a_mean", a_mean) #print("b_mean", b_mean) #print("d_mean", d_mean) @@ -193,65 +203,31 @@ def get_rounded_num(x, round_up=True): # #print("-------------------------------------------", len(a_mean)) # -def generate_random_array(mean, max_value, shape, seed=None): - if seed is not None: - np.random.seed(seed) - # 首先生成标准正态分布的随机数组 - random_array = np.random.randn(*shape).astype(np.float32) - # 计算当前随机数组的最大值 - current_max = random_array.max() - # 计算缩放因子,使得新的最大值为给定的max_value - scale_factor = max_value / current_max - # 对数组进行缩放 - random_array *= scale_factor - # 计算当前数组的均值 - current_mean = random_array.mean() - # 计算偏移量,使得新的均值为给定的mean - shift_value = mean - current_mean - # 对数组进行偏移 - random_array += shift_value - return random_array - -#CREATE_DATA = True -#DEVICE = 'XPU' -##DEVICE = 'GPU' -# -## -#dtype = paddle.bfloat16 -#paddle.set_default_dtype(dtype) -# -#if DEVICE == 'XPU' and CREATE_DATA: -# for i in range(len(list_a)): -# print(i) -# #for i in range(1): -# paddle.seed(int(generate_true_random_number(4))) -# #x = np.random.uniform(-1, 1, list_a[i]).astype("float32") -# x = generate_random_array(a_mean[i], a_max[i], list_a[i], int(generate_true_random_number(4))) -# paddle.seed(int(generate_true_random_number(5))) -# #w = np.random.uniform(-1, 1, list_b[i]).astype("float32") -# w = generate_random_array(b_mean[i], b_max[i], list_b[i], int(generate_true_random_number(3))) -# paddle.seed(int(generate_true_random_number(6))) -# bias = np.random.uniform(-1, 1, [list_b[i][1]]).astype("float32") -# paddle.seed(int(generate_true_random_number(5))) -# out_grad = np.random.uniform(-1, 1, [list_a[i][0], list_b[i][1]]).astype("float32") -# #out_grad = generate_random_array(d_mean[i], d_max[i], [list_a[i][0], list_b[i][1]], int(generate_true_random_number(3))) -# x = paddle.to_tensor(x, stop_gradient=False).cast(dtype) -# w = paddle.to_tensor(w, stop_gradient=False).cast(dtype) -# bias = paddle.to_tensor(bias, stop_gradient=False).cast(dtype) -# out_grad = paddle.to_tensor(out_grad, stop_gradient=True) -# paddle.save([x, w, bias, out_grad], 'FC_INPUT/linear_' + str(i)) -# -#for i in range(len(list_a)): -##for i in range(1): -# print(i) -# x, w, bias, out_grad = paddle.load('FC_INPUT/linear_' + str(i)) -# linear = Linear(w.shape[0], w.shape[1], bias_attr=True) -# linear.weight.set_value(w) -# linear.bias.set_value(bias) -# linear.train() -# -# out = linear(x) -# #paddle.save([True, [out]], 'XPU/output/linear_' + str(i)) -# out = paddle.cast(out, "float32") -# paddle.autograd.backward(tensors=[out], grad_tensors=[out_grad]) -# paddle.save([True, [linear.weight.grad, linear.bias.grad]], 'XPU/output_backward/linear_' + str(i)) +if DEVICE == 'XPU' and CREATE_DATA: + for i in range(len(list_a)): + print(i) + #for i in range(1): + x = create_random_tensor(list_a[i], dtype, int(generate_true_random_number(4))) + w = create_random_tensor(list_b[i], dtype, int(generate_true_random_number(4))) + bias = create_random_tensor([list_b[i][1]], dtype, int(generate_true_random_number(4))) + out_grad = create_random_tensor([list_a[i][0], list_b[i][1]], dtype, int(generate_true_random_number(4))) + paddle.save([x, w, bias, out_grad], input_file + '/linear_' + str(i)) + + +for i in range(len(list_a)): +#for i in range(1): + print(i) + x, w, bias, out_grad = paddle.load('FC_INPUT/linear_' + str(i)) + w = w.cast(calculate_type) + bias = bias.cast(calculate_type) + x = x.cast(calculate_type) + linear = Linear(w.shape[0], w.shape[1], bias_attr=True) + linear.weight.set_value(w) + linear.bias.set_value(bias) + linear.train() + + out = linear(x) + paddle.save([True, [out]], out_file + '/linear_' + str(i)) + out = paddle.cast(out, "float32") + paddle.autograd.backward(tensors=[out], grad_tensors=[out_grad]) + paddle.save([True, [linear.weight.grad, linear.bias.grad]], out_back_file + '/linear_' + str(i)) diff --git a/test_fc_gpu.py b/test_fc_gpu.py index a245dd7..b681608 100644 --- a/test_fc_gpu.py +++ b/test_fc_gpu.py @@ -15,20 +15,45 @@ #DEVICE = 'XPU' DEVICE = 'GPU' +input_file = 'FC_INPUT' +xpu_file = 'XPU' +cal_out_file = xpu_file + '/output' +cal_out_back_file = xpu_file + '/output_backward' +base_out_file = 'BASE/output' +base_out_back_file = 'BASE/output_backward' + +base_type = paddle.float32 dtype = paddle.bfloat16 -paddle.set_default_dtype(dtype) +calculate_type = base_type + +if calculate_type == base_type: + createDir(base_out_file) + out_file = base_out_file + createDir(base_out_back_file) + out_back_file = base_out_back_file +else: + createDir(cal_out_file) + out_file = cal_out_file + createDir(cal_out_back_file) + out_back_file = cal_out_back_file -for i in range(len(os.listdir('FC_INPUT/'))): +paddle.set_default_dtype(calculate_type) + +for i in range(len(os.listdir(input_file))): #for i in range(1): print(i) - x, w, bias, out_grad = paddle.load('FC_INPUT/linear_' + str(i)) + x, w, bias, out_grad = paddle.load(input_file + '/linear_' + str(i)) + w = w.cast(calculate_type) + bias = bias.cast(calculate_type) + x = x.cast(calculate_type) + linear = Linear(w.shape[0], w.shape[1], bias_attr=True) linear.weight.set_value(w) linear.bias.set_value(bias) linear.train() out = linear(x) - paddle.save([True, [out]], 'GPU/output/linear_' + str(i)) + paddle.save([True, [out]], out_file + '/linear_' + str(i)) out = paddle.cast(out, "float32") paddle.autograd.backward(tensors=[out], grad_tensors=[out_grad]) - paddle.save([True, [linear.weight.grad, linear.bias.grad]], 'GPU/output_backward/linear_' + str(i)) + paddle.save([True, [linear.weight.grad, linear.bias.grad]], out_back_file + '/linear_' + str(i)) diff --git a/tools.py b/tools.py new file mode 100644 index 0000000..1d7410b --- /dev/null +++ b/tools.py @@ -0,0 +1,61 @@ +import os + +def createDir(file_name): + if not os.path.exists(file_name): + os.makedirs(file_name) + + +def generate_true_random_number(num_bytes): + """ 生成真随机数,从 /dev/urandom 读取指定数量的字节 """ + # 从 /dev/urandom 读取 num_bytes 个字节 + random_bytes = os.urandom(num_bytes) + # 将字节转换为整数 + #print(int.from_bytes(random_bytes, 'big')) + return int.from_bytes(random_bytes, 'big') + + +# inf, nan +def get_rounded_num(x, round_up=True): + if math.isinf(x) or math.isnan(x): + msg = f"warning, x is inf or nan" + print(msg, x) + return x + if abs(x) <= 1e-10: + return 0 + + abs_x = abs(x) + log_x = math.log10(abs_x) + round_log_x = math.floor(log_x) if round_up ^ (x > 0) else math.ceil(log_x) + + result = 10**round_log_x + return result if x >= 0 else -result + + +def generate_random_array(mean, max_value, shape, seed=None): + if seed is not None: + np.random.seed(seed) + # 首先生成标准正态分布的随机数组 + random_array = np.random.randn(*shape).astype(np.float32) + # 计算当前随机数组的最大值 + current_max = random_array.max() + # 计算缩放因子,使得新的最大值为给定的max_value + scale_factor = max_value / current_max + # 对数组进行缩放 + random_array *= scale_factor + # 计算当前数组的均值 + current_mean = random_array.mean() + # 计算偏移量,使得新的均值为给定的mean + shift_value = mean - current_mean + # 对数组进行偏移 + random_array += shift_value + return random_array + + +def create_random_tensor(shape, dtype, seed, min_v=-1, mean_v=None, max_v=1): + if mean_v == None: + paddle.seed(seed) + data = np.random.uniform(min_v, max_v, shape).astype("float32") + else: + data = generate_random_array(mean_v, max_v, shape, seed) + data = paddle.to_tensor(data, stop_gradient=False).cast(dtype) + return data