aliyun · yanzhenghao · Jun 15, 2026
diff --git a/aicb b/aicb
diff --git a/aicb/.gitignore b/aicb/.gitignore
@@ -0,0 +1,4 @@
+*__pycache__*
+*local*
+.ipynb_checkpoints
+.pytest_cache
diff --git a/aicb/.omc/state/last-tool-error.json b/aicb/.omc/state/last-tool-error.json
@@ -0,0 +1,7 @@
+{
+  "tool_name": "Bash",
+  "tool_input_preview": "{\"command\":\"head -22 aicb/results/workload/A5-GPT13B-world_size64-tp8-pp1-ep1-gbs1024-mbs1-seq8192-MOE-False-GEMM-False-flash_attn-True.txt | sed '2s/8334/20/' > /tmp/test_20l.txt && python3 -c \\\"\\nim...",
+  "error": "Exit code 1\nhead: aicb/results/workload/A5-GPT13B-world_size64-tp8-pp1-ep1-gbs1024-mbs1-seq8192-MOE-False-GEMM-False-flash_attn-True.txt: No such file or directory\nTraceback (most recent call last):\n  File \"<string>\", line 4, in <module>\n    with open('lld.json') as f: lld = json.load(f)\n         ~~~~^^^^^^^^^^^^\nFileNotFoundError: [Errno 2] No such file or directory: 'lld.json'",
+  "timestamp": "2026-06-05T02:34:43.517Z",
+  "retry_count": 2
+}
diff --git a/aicb/Dockerfile b/aicb/Dockerfile
@@ -0,0 +1,30 @@
+# Base image: Official NVIDIA PyTorch image with Python 3 and GPU support.
+FROM nvcr.io/nvidia/pytorch:25.05-py3
+
+# Install git for version control operations and clean up apt cache.
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+# Set the application's working directory.
+WORKDIR /workspace/AICB
+
+# [Optional] Configure pip and uv to use Aliyun mirror for faster package downloads.
+# Both use HTTPS, so pip needs no trusted-host override (it would accept the host without valid HTTPS).
+RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple
+ENV UV_DEFAULT_INDEX="https://mirrors.aliyun.com/pypi/simple"
+
+RUN pip install --no-cache-dir uv
+
+# Copy only the requirements file first to leverage Docker's layer cache.
+# This layer is rebuilt only when requirements.txt changes.
+COPY requirements.txt .
+
+# Install Python dependencies using uv.
+RUN UV_TORCH_BACKEND=auto uv pip install -v --system --no-cache-dir --no-build-isolation --break-system-packages -r requirements.txt
+
+# Copy the rest of the application source code into the image.
+COPY . .
+
+RUN mv ./workload_generator /usr/local/lib/python3.12/dist-packages &&\
+    mv ./utils /usr/local/lib/python3.12/dist-packages &&\
+    mv ./log_analyzer /usr/local/lib/python3.12/dist-packages
+ENV PATH="/workspace/AICB/.venv/bin:$PATH"
diff --git a/aicb/License b/aicb/License
@@ -0,0 +1,14 @@
+/* 
+*Copyright (c) 2021, Alibaba Group;
+*Licensed under the Apache License, Version 2.0 (the "License");
+*you may not use this file except in compliance with the License.
+*You may obtain a copy of the License at
+
+*   http://www.apache.org/licenses/LICENSE-2.0
+
+*Unless required by applicable law or agreed to in writing, software
+*distributed under the License is distributed on an "AS IS" BASIS,
+*WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*See the License for the specific language governing permissions and
+*limitations under the License.
+*/
diff --git a/aicb/README.md b/aicb/README.md
diff --git a/aicb/aicb.py b/aicb/aicb.py
@@ -0,0 +1,94 @@
+"""
+Copyright (c) 2021, Alibaba Group;
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import torch
+from utils.utils import get_args, get_comp_out, extract_averages, Comp_with_aiob
+from utils.benchmark_logger import bench_logger
+from workload_generator.mocked_model.training.MockedDeepspeed import DeepspeedForCausalLM
+from workload_generator.mocked_model.training.MockedDeepSeek import DeepSeekV3Model
+from workload_generator.mocked_model.training.MockedMegatron import MegatronModel
+from workload_generator.generate_deepspeed_stage1_2_workload import (
+    DeepSpeedStage1,
+    DeepSpeedStage2,
+)
+from workload_generator.generate_deepspeed_stage3_workload import DeepSpeedStage3
+from workload_generator.generate_megatron_workload import MegatronWorkload
+from workload_generator.generate_collective_test import Collective_Test
+from workload_applyer import WorkloadApplyer
+from utils.utils import *
+
+if __name__ == "__main__":
+    # Entry point: build the mocked model and workload generator for the
+    # selected framework, optionally merge AIOB compute timings into the
+    # workload, dump the workload on rank 0 and, unless args.workload_only is
+    # set, replay it and analyse the resulting communication log.
+    args = get_args()
+    if not hasattr(args, "backend"):
+        args.backend = "nccl"
+    torch.distributed.init_process_group(backend=args.backend)
+    args.world_size = torch.distributed.get_world_size()
+    args.rank = torch.distributed.get_rank()
+
+    # Select the model / workload-generator pair. Unsupported values are
+    # rejected here; without the explicit errors they would leave
+    # `workload_generator` unbound and surface below as a NameError.
+    if args.frame == "Megatron":
+        model = MegatronModel(args)
+        workload_generator = MegatronWorkload(args, model)
+    elif args.frame == "DeepSpeed":
+        model = DeepspeedForCausalLM(args)
+        if args.stage == 1:
+            workload_generator = DeepSpeedStage1(args, model)
+        elif args.stage == 2:
+            workload_generator = DeepSpeedStage2(args, model)
+        elif args.stage == 3:
+            workload_generator = DeepSpeedStage3(args, model)
+        else:
+            raise ValueError(
+                f"Unsupported DeepSpeed stage: {args.stage!r} (expected 1, 2 or 3)"
+            )
+    elif args.frame == "collective_test":
+        workload_generator = Collective_Test(args, None)
+    elif args.frame == "DeepSeek":
+        model = DeepSeekV3Model(args)
+        workload_generator = MegatronWorkload(args, model)
+    else:
+        raise ValueError(f"Unsupported frame: {args.frame!r}")
+    workload = workload_generator()
+
+    if args.aiob_enable and (args.frame == "Megatron" or args.frame == "DeepSeek"):
+        # Record the parameter count and the summed activation memory of every
+        # sub-module that reports one.
+        params = model.parameters()
+        args.model_param = sum(p.numel() for p in params)
+        args.activation_memory = 0
+        for sub_module in model.child_modules():
+            if hasattr(sub_module, "activation_memory"):
+                args.activation_memory += sub_module.activation_memory()
+        print("model_param:", args.model_param)
+        if args.comp_filepath is None:
+            # Local rank 0 calls get_comp_out; the other ranks only call
+            # get_aiob_path and wait at the barrier before reading the file.
+            # NOTE(review): presumably get_comp_out writes the file whose path
+            # get_aiob_path returns - confirm against utils.utils.
+            local_rank = torch.distributed.get_rank() % torch.cuda.device_count()
+            if local_rank == 0:
+                filepath = get_comp_out(args)
+            else:
+                filepath = get_aiob_path(args)
+            torch.distributed.barrier()
+            compute_cache = extract_averages(filepath,args)
+        else:
+            print("comp_filepath:", args.comp_filepath)
+            compute_cache = extract_averages(args.comp_filepath,args)
+        workload = Comp_with_aiob(workload, compute_cache)
+
+    # `filename` is only bound on rank 0; it is reused below, again on rank 0.
+    if torch.distributed.get_rank() == 0:
+        filename = f"{workload_generator.name}_{args.model_name}_sp_{args.enable_sequence_parallel}_iteration_{args.epoch_num}_computationEnable_{args.computation_enable}_{args.world_size}n.csv"
+        workload.dump(filename)
+    if not args.workload_only :
+        applyer = WorkloadApplyer(workload=workload, args=args)
+        cpu_time = applyer.apply_workload()
+        if torch.distributed.get_rank() == 0:
+            bench_logger.analyze_comm_log()
+            if args.frame != "collective_test":
+                bench_logger.analyze_comm_time()
+            csv_filename = bench_logger.dump_log(filename)
+            if args.enable_visual:
+                # Visualisation is optional: a missing dependency is reported
+                # and otherwise ignored.
+                try:
+                    from visualize.generate import visualize_output
+                    visualize_output(csv_filename,False)
+                except ImportError: 
+                    print("visualize_output is not available because required library is not found")
+
+            print(
+                f"total time for {args.frame} and {args.epoch_num} iterations is {cpu_time:.4f} s"
+            )
diff --git a/aicb/core/__init__.py b/aicb/core/__init__.py
diff --git a/aicb/core/grouped_gemm_util.py b/aicb/core/grouped_gemm_util.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+
+try:
+    import grouped_gemm
+except ImportError:
+    grouped_gemm = None
+
+
+def grouped_gemm_is_available():
+    """Report whether the optional ``grouped_gemm`` package was imported."""
+    module_loaded = grouped_gemm is not None
+    return module_loaded
+
+
+def assert_grouped_gemm_is_available():
+    """Assert that ``grouped_gemm`` is importable, with an install hint on failure."""
+    install_hint = (
+        "Grouped GEMM is not available. Please run "
+        "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.0`."
+    )
+    assert grouped_gemm_is_available(), install_hint
+
+
+# Expose grouped_gemm's `ops` attribute, or None when the package is missing.
+if grouped_gemm_is_available():
+    ops = grouped_gemm.ops
+else:
+    ops = None
diff --git a/aicb/download/AICB_v1.0.deb b/aicb/download/AICB_v1.0.deb
diff --git a/aicb/images/detail_log.png b/aicb/images/detail_log.png
diff --git a/aicb/images/readme_01.png b/aicb/images/readme_01.png
diff --git a/aicb/images/result_log.png b/aicb/images/result_log.png
diff --git a/aicb/images/simai_dingtalk.jpg b/aicb/images/simai_dingtalk.jpg
diff --git a/aicb/images/simai_wechat.jpeg b/aicb/images/simai_wechat.jpeg
diff --git a/aicb/images/time_log.png b/aicb/images/time_log.png
diff --git a/aicb/images/tutorial_1.png b/aicb/images/tutorial_1.png
diff --git a/aicb/images/tutorial_2.png b/aicb/images/tutorial_2.png
diff --git a/aicb/images/tutorial_3.png b/aicb/images/tutorial_3.png
diff --git a/aicb/images/tutorial_4.png b/aicb/images/tutorial_4.png
diff --git a/aicb/images/tutorial_5.png b/aicb/images/tutorial_5.png
diff --git a/aicb/images/tutorial_6.png b/aicb/images/tutorial_6.png
diff --git a/aicb/images/tutorial_7.png b/aicb/images/tutorial_7.png
diff --git a/aicb/log_analyzer/__init__.py b/aicb/log_analyzer/__init__.py
diff --git a/aicb/log_analyzer/analyze_res_csv.py b/aicb/log_analyzer/analyze_res_csv.py
@@ -0,0 +1,38 @@
+import pandas as pd
+from log_analyzer.utils import convert_msg_to_size, convert_size_to_msg
+import sys
+
+def analyze_csv(file_path):
+    """Summarise bus bandwidth per (comm_type, comm_group, msg_size) group.
+
+    Rows whose ``busbw`` is missing or not numeric are discarded. In every
+    group holding more than two samples, the two lowest ``busbw`` rows are
+    dropped before aggregating; groups with one or two samples are kept whole.
+
+    Args:
+        file_path: Path of the CSV file to read. It must provide the columns
+            ``comm_type``, ``comm_group``, ``msg_size`` and ``busbw``.
+
+    Returns:
+        A DataFrame with one row per group and the columns ``busbw_mean``,
+        ``busbw_max``, ``busbw_min``, ``busbw_std`` and ``occurrence_count``;
+        ``msg_size`` is passed through ``convert_size_to_msg``.
+    """
+    group_keys = ['comm_type', 'comm_group', 'msg_size']
+
+    df = pd.read_csv(file_path)
+    # Coerce first so that missing and unparsable values both become NaN and
+    # are removed in a single pass.
+    df['busbw'] = pd.to_numeric(df['busbw'], errors='coerce')
+    df = df.dropna(subset=['busbw'])
+
+    # Number the rows of each group in ascending busbw order (0 = lowest).
+    df = df.sort_values(by='busbw')
+    by_group = df.groupby(group_keys)
+    rank_in_group = by_group.cumcount()
+    group_size = by_group['busbw'].transform('count')
+
+    # Drop the two lowest samples only when something is left afterwards.
+    # The previous `len(group) > 1` guard combined with `iloc[2:]` erased
+    # groups of exactly two samples from the report.
+    df_excluded_min = df[(group_size <= 2) | (rank_in_group >= 2)]
+
+    grouped = df_excluded_min.groupby(group_keys).agg(
+        busbw_mean=('busbw', 'mean'),
+        busbw_max=('busbw', 'max'),
+        busbw_min=('busbw', 'min'),
+        busbw_std=('busbw', 'std'),
+        occurrence_count=('busbw', 'size')
+    ).reset_index()
+    grouped['msg_size'] = grouped['msg_size'].apply(convert_size_to_msg)
+    return grouped
+
+
+if __name__ == '__main__':
+    # Command-line use: the CSV path is the single required argument.
+    cli_args = sys.argv[1:]
+    if not cli_args:
+        print("Usage: python -m log_analyzer.analyze_res_csv <path_to_csv>")
+        sys.exit(1)
+    grouped = analyze_csv(cli_args[0])
+    print(grouped)
+
diff --git a/aicb/log_analyzer/ds_comm_log_analyzer.py b/aicb/log_analyzer/ds_comm_log_analyzer.py
@@ -0,0 +1,134 @@
+"""
+Copyright (c) 2021, Alibaba Group;
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+# /usr/bin/python3
+from utils.utils import CommType, CommGroup
+from log_analyzer.utils import convert_msg_to_size, convert_size_to_msg
+from utils.benchmark_logger import BenchLogger
+from log_analyzer.log import LogItem, Log
+
+# Field labels looked up in the "key: value" items of a log line.
+COMM_OP = "comm op"
+CALLER_FUNC = "Caller Func"
+TIME_MS = "time (ms)"
+MSG_SIZE = "msg size"
+# Marker searched for in the lower-cased line; only the text after it is parsed.
+LOG_STARTER = "[rank 0]"
+# Hard-coded group sizes used to classify a logged "group" list by its length.
+# NOTE(review): TP_SIZE and DP_SIZE are equal and parse_ds_log_item compares
+# against TP_SIZE first, so its dp_group branch cannot be selected.
+WORLD_SIZE = 16
+TP_SIZE = 4
+DP_SIZE = 4
+# Alternative marker; it contains upper-case letters, so it could never match
+# the lower-cased line that parse_ds_log_item searches.
+# LOG_STARTER = "[INFO] "
+
+
+def clean_s(s):
+    """Strip surrounding square brackets, newlines, tabs and spaces from ``s``."""
+    trim_chars = "[]\n\t "
+    return s.strip(trim_chars)
+
+
+def string2comm_type(s):
+    """Map a logged operation name to the matching ``CommType`` member.
+
+    The first candidate name contained in ``s`` wins. The order matters:
+    "reduce" is a substring of "reduce_scatter" and "all_reduce", so it is
+    tested last. When nothing matches, a warning is printed and
+    ``CommType.epoch_end`` is returned.
+    """
+    ordered_names = (
+        "all_gather",
+        "reduce_scatter",
+        "all_reduce",
+        "broadcast",
+        "barrier",
+        "reduce",
+    )
+    for op_name in ordered_names:
+        if op_name in s:
+            # Member names equal the searched substrings; resolve lazily.
+            return getattr(CommType, op_name)
+    print(f"WARNING cannot convert {s} to CommType")
+    return CommType.epoch_end
+
+
+def parse_ds_log_item(line):
+    """Parse one log line into a dict of recognised fields.
+
+    Only the text after ``LOG_STARTER`` is examined. It is split on ``|`` and
+    each piece is handled as follows:
+
+    * a piece containing "epoch" stores its first integer as ``epoch_num``;
+    * a piece containing "micro_step" stores its first number as ``iter_time``;
+    * any other piece containing ":" is treated as a ``key: value`` pair.
+
+    Known keys are stored as ``comm_type``, ``msg_size``, ``stage``,
+    ``elapsed_time``, ``group``, ``algbw`` and ``busbw``. The ``group`` /
+    ``algbw`` / ``busbw`` chain starts with a fresh ``if``, so keys handled by
+    the first chain also reach its final ``else`` and are stored again under
+    their raw key name.
+
+    Args:
+        line: One raw line of the log file.
+
+    Returns:
+        ``None`` when the line does not contain ``LOG_STARTER``, otherwise the
+        dict described above (possibly empty).
+
+    Raises:
+        ValueError: If a time or bandwidth value is not a valid float.
+    """
+    index = line.lower().find(LOG_STARTER)
+    if index == -1:
+        return None
+    item_list = line[index + len(LOG_STARTER) :].split("|")
+    item = {}
+    for raw_item in item_list:
+        if "epoch" in raw_item:
+            numbers = [word for word in raw_item.split() if word.isdigit()]
+            # An "epoch" piece without a number has nothing to record; it used
+            # to raise IndexError.
+            if numbers:
+                item["epoch_num"] = int(numbers[0])
+            continue
+        if "micro_step" in raw_item:
+            # Allow at most one decimal point so that every accepted word is a
+            # valid float literal.
+            numbers = [
+                word
+                for word in raw_item.split()
+                if word.replace(".", "", 1).isdigit()
+            ]
+            if numbers:
+                item["iter_time"] = float(numbers[0])
+            continue
+        if ":" not in raw_item:
+            continue
+        # Split on the first colon only: a value that itself contains ":" used
+        # to break the two-way unpacking with a ValueError.
+        key, _, value = raw_item.partition(":")
+        key, value = clean_s(key), clean_s(value)
+        if key == COMM_OP:
+            item["comm_type"] = string2comm_type(value)
+        elif key == MSG_SIZE or MSG_SIZE in key:
+            item["msg_size"] = convert_msg_to_size(value)
+        elif key == CALLER_FUNC:
+            item["stage"] = value
+        elif key == TIME_MS or TIME_MS in key:
+            item["elapsed_time"] = float(value)
+        if key == "group":
+            # SECURITY(review): eval() executes whatever text the log holds, so
+            # only feed trusted logs to this parser. If the field is always a
+            # plain list literal, ast.literal_eval is the safe replacement.
+            group = eval(value)
+            if len(group) == WORLD_SIZE:
+                item["group"] = CommGroup.all
+            elif len(group) == TP_SIZE:
+                item["group"] = CommGroup.tp_group
+            elif len(group) == DP_SIZE:
+                item["group"] = CommGroup.dp_group
+        elif "algbw" in key:
+            item["algbw"] = float(value)
+        elif "busbw" in key:
+            item["busbw"] = float(value)
+        else:
+            # Keep unknown fields, as a number when possible. `value` is always
+            # a str here, so float() can only fail with ValueError.
+            try:
+                item[key] = float(value)
+            except ValueError:
+                item[key] = value
+    return item
+
+
+def parse_ds_comm_log(filename):
+    """Build a ``Log`` from the log file at ``filename``.
+
+    A line containing "After initializing ZeRO optimizer" or "microstep" adds
+    an ``epoch_end`` item. Every other line goes through
+    ``parse_ds_log_item``; when the result carries a ``comm_type`` it is added
+    as a ``LogItem``, with ``-1`` standing in for a missing elapsed time,
+    algbw or busbw and ``CommGroup.dp_group`` for a missing group.
+    """
+    comm_log = Log()
+    with open(filename, "r") as f:
+        for line in f.read().split("\n"):
+            marks_epoch_end = (
+                "After initializing ZeRO optimizer" in line or "microstep" in line
+            )
+            if marks_epoch_end:
+                comm_log.add_comm_log(LogItem(comm_type=CommType.epoch_end))
+                continue
+
+            log = parse_ds_log_item(line)
+            if log is None or "comm_type" not in log:
+                continue
+
+            log_item = LogItem(
+                comm_type=log["comm_type"],
+                comm_group=log.get("group", CommGroup.dp_group),
+                msg_size=log["msg_size"],
+            )
+            log_item._elapsed_time = log.get("elapsed_time", -1)
+            log_item.algbw = log.get("algbw", -1)
+            log_item.busbw = log.get("busbw", -1)
+            comm_log.add_comm_log(log_item)
+    return comm_log
+
+
+if __name__ == "__main__":
+    import sys
+
+    # Exit with a usage hint instead of an IndexError when the path is missing.
+    if len(sys.argv) < 2:
+        print("Usage: python -m log_analyzer.ds_comm_log_analyzer <path_to_log>")
+        sys.exit(1)
+    filename = sys.argv[1]
+    comm_log = parse_ds_comm_log(filename)
+    comm_log.analyze()