Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion aicb
Submodule aicb deleted from 23eec3
4 changes: 4 additions & 0 deletions aicb/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*__pycache__*
*local*
.ipynb_checkpoints
.pytest_cache
7 changes: 7 additions & 0 deletions aicb/.omc/state/last-tool-error.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"tool_name": "Bash",
"tool_input_preview": "{\"command\":\"head -22 aicb/results/workload/A5-GPT13B-world_size64-tp8-pp1-ep1-gbs1024-mbs1-seq8192-MOE-False-GEMM-False-flash_attn-True.txt | sed '2s/8334/20/' > /tmp/test_20l.txt && python3 -c \\\"\\nim...",
"error": "Exit code 1\nhead: aicb/results/workload/A5-GPT13B-world_size64-tp8-pp1-ep1-gbs1024-mbs1-seq8192-MOE-False-GEMM-False-flash_attn-True.txt: No such file or directory\nTraceback (most recent call last):\n File \"<string>\", line 4, in <module>\n with open('lld.json') as f: lld = json.load(f)\n ~~~~^^^^^^^^^^^^\nFileNotFoundError: [Errno 2] No such file or directory: 'lld.json'",
"timestamp": "2026-06-05T02:34:43.517Z",
"retry_count": 2
}
30 changes: 30 additions & 0 deletions aicb/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Base image: Official NVIDIA PyTorch image with Python 3 and GPU support.
FROM nvcr.io/nvidia/pytorch:25.05-py3

# Install git for version control operations and clean up apt cache.
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

# Set the application's working directory.
WORKDIR /workspace/AICB

# [Optional] Configure pip and uv to use the Aliyun mirror for faster package downloads.
# Both tools point at the HTTPS endpoint so packages are not fetched over plain HTTP.
RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple
RUN pip config set install.trusted-host mirrors.aliyun.com
ENV UV_DEFAULT_INDEX="https://mirrors.aliyun.com/pypi/simple"

RUN pip install --no-cache-dir uv

# Copy only the requirements file first to leverage Docker's layer cache.
# This layer is rebuilt only when requirements.txt changes.
COPY requirements.txt .

# Install Python dependencies using uv (into the system interpreter, not a venv).
RUN UV_TORCH_BACKEND=auto uv pip install -v --system --no-cache-dir --no-build-isolation --break-system-packages -r requirements.txt

# Copy the rest of the application source code into the image.
COPY . .

# Move the project packages into the system dist-packages directory.
# NOTE(review): the python3.12 path is hard-coded and must match the base image's interpreter.
RUN mv ./workload_generator /usr/local/lib/python3.12/dist-packages &&\
    mv ./utils /usr/local/lib/python3.12/dist-packages &&\
    mv ./log_analyzer /usr/local/lib/python3.12/dist-packages
# NOTE(review): no step in this Dockerfile creates /workspace/AICB/.venv — confirm this PATH entry is needed.
ENV PATH="/workspace/AICB/.venv/bin:$PATH"
14 changes: 14 additions & 0 deletions aicb/License
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
/*
*Copyright (c) 2021, Alibaba Group;
*Licensed under the Apache License, Version 2.0 (the "License");
*you may not use this file except in compliance with the License.
*You may obtain a copy of the License at

* http://www.apache.org/licenses/LICENSE-2.0

*Unless required by applicable law or agreed to in writing, software
*distributed under the License is distributed on an "AS IS" BASIS,
*WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*See the License for the specific language governing permissions and
*limitations under the License.
*/
428 changes: 428 additions & 0 deletions aicb/README.md

Large diffs are not rendered by default.

94 changes: 94 additions & 0 deletions aicb/aicb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""
Copyright (c) 2021, Alibaba Group;
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import torch
from utils.utils import get_args, get_comp_out, extract_averages, Comp_with_aiob
from utils.benchmark_logger import bench_logger
from workload_generator.mocked_model.training.MockedDeepspeed import DeepspeedForCausalLM
from workload_generator.mocked_model.training.MockedDeepSeek import DeepSeekV3Model
from workload_generator.mocked_model.training.MockedMegatron import MegatronModel
from workload_generator.generate_deepspeed_stage1_2_workload import (
DeepSpeedStage1,
DeepSpeedStage2,
)
from workload_generator.generate_deepspeed_stage3_workload import DeepSpeedStage3
from workload_generator.generate_megatron_workload import MegatronWorkload
from workload_generator.generate_collective_test import Collective_Test
from workload_applyer import WorkloadApplyer
from utils.utils import *

if __name__ == "__main__":
    # Entry point: build a workload for the selected framework, optionally
    # attach compute timings, dump it on rank 0, and (unless --workload_only)
    # replay it and analyze the resulting logs.
    args = get_args()
    # Fall back to NCCL when the parsed arguments carry no backend attribute.
    if not hasattr(args, "backend"):
        args.backend = "nccl"
    torch.distributed.init_process_group(backend=args.backend)
    args.world_size = torch.distributed.get_world_size()
    args.rank = torch.distributed.get_rank()

    # Select the mocked model (where one exists) and its workload generator.
    if args.frame == "Megatron":
        model = MegatronModel(args)
        workload_generator = MegatronWorkload(args, model)
    elif args.frame == "DeepSpeed":
        model = DeepspeedForCausalLM(args)
        if args.stage == 1:
            workload_generator = DeepSpeedStage1(args, model)
        elif args.stage == 2:
            workload_generator = DeepSpeedStage2(args, model)
        elif args.stage == 3:
            workload_generator = DeepSpeedStage3(args, model)
        else:
            # Previously this fell through and failed later with a NameError.
            raise ValueError(f"Unsupported DeepSpeed stage: {args.stage!r}")
    elif args.frame == "collective_test":
        workload_generator = Collective_Test(args, None)
    elif args.frame == "DeepSeek":
        model = DeepSeekV3Model(args)
        workload_generator = MegatronWorkload(args, model)
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError(f"Unsupported frame: {args.frame!r}")
    workload = workload_generator()

    if args.aiob_enable and (args.frame == "Megatron" or args.frame == "DeepSeek"):
        # Record model size and summed activation memory on args.
        params = model.parameters()
        args.model_param = sum(p.numel() for p in params)
        args.activation_memory = 0
        for sub_module in model.child_modules():
            if hasattr(sub_module, "activation_memory"):
                args.activation_memory += sub_module.activation_memory()
        print("model_param:", args.model_param)
        if args.comp_filepath is None:
            # Only ranks whose index is a multiple of the local GPU count call
            # get_comp_out; the others only resolve the path.
            # NOTE(review): presumably get_comp_out writes the compute profile
            # that the barrier below makes visible to all ranks — confirm.
            local_rank = torch.distributed.get_rank() % torch.cuda.device_count()
            if local_rank == 0:
                filepath = get_comp_out(args)
            else:
                filepath = get_aiob_path(args)
            torch.distributed.barrier()
            compute_cache = extract_averages(filepath, args)
        else:
            print("comp_filepath:", args.comp_filepath)
            compute_cache = extract_averages(args.comp_filepath, args)
        workload = Comp_with_aiob(workload, compute_cache)

    # Only rank 0 writes the workload CSV; `filename` is reused below on rank 0.
    if torch.distributed.get_rank() == 0:
        filename = f"{workload_generator.name}_{args.model_name}_sp_{args.enable_sequence_parallel}_iteration_{args.epoch_num}_computationEnable_{args.computation_enable}_{args.world_size}n.csv"
        workload.dump(filename)

    if not args.workload_only:
        applyer = WorkloadApplyer(workload=workload, args=args)
        cpu_time = applyer.apply_workload()
        if torch.distributed.get_rank() == 0:
            bench_logger.analyze_comm_log()
            if args.frame != "collective_test":
                bench_logger.analyze_comm_time()
            csv_filename = bench_logger.dump_log(filename)
            if args.enable_visual:
                # Visualization is optional; skip it when its dependencies are missing.
                try:
                    from visualize.generate import visualize_output
                    visualize_output(csv_filename, False)
                except ImportError:
                    print("visualize_output is not available because required library is not found")

            print(
                f"total time for {args.frame} and {args.epoch_num} iterations is {cpu_time:.4f} s"
            )
Empty file added aicb/core/__init__.py
Empty file.
20 changes: 20 additions & 0 deletions aicb/core/grouped_gemm_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.

try:
    import grouped_gemm
except ImportError:
    # Optional dependency: keep a None sentinel so callers can probe for it.
    grouped_gemm = None


def grouped_gemm_is_available():
    """Return True when the optional ``grouped_gemm`` package was imported."""
    return grouped_gemm is not None


def assert_grouped_gemm_is_available():
    """Assert that ``grouped_gemm`` was imported, with an install hint on failure."""
    install_hint = (
        "Grouped GEMM is not available. Please run "
        "`pip install git+https://github.com/fanshiqing/grouped_gemm@v1.0`."
    )
    assert grouped_gemm_is_available(), install_hint


# The package's ``ops`` attribute, or None when the package is missing.
ops = None if grouped_gemm is None else grouped_gemm.ops
Binary file added aicb/download/AICB_v1.0.deb
Binary file not shown.
Binary file added aicb/images/detail_log.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/readme_01.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/result_log.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/simai_dingtalk.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/simai_wechat.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/time_log.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/tutorial_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/tutorial_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/tutorial_3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/tutorial_4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/tutorial_5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/tutorial_6.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added aicb/images/tutorial_7.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file added aicb/log_analyzer/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions aicb/log_analyzer/analyze_res_csv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pandas as pd
from log_analyzer.utils import convert_msg_to_size, convert_size_to_msg
import sys

def analyze_csv(file_path):
    """Summarize ``busbw`` per (comm_type, comm_group, msg_size) from a result CSV.

    Rows whose ``busbw`` is missing or non-numeric are discarded. Within each
    group that has more than one row, the two rows with the lowest ``busbw``
    are dropped before aggregating.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame with one row per group holding the mean, max, min, std and
        count of ``busbw``; ``msg_size`` is passed through
        ``convert_size_to_msg``.
    """
    group_keys = ['comm_type', 'comm_group', 'msg_size']

    frame = pd.read_csv(file_path)
    frame = frame.dropna(subset=['busbw'])
    # Non-numeric values become NaN here and are removed by the second dropna.
    frame['busbw'] = pd.to_numeric(frame['busbw'], errors='coerce')
    frame = frame.dropna(subset=['busbw'])

    def exclude_min(group):
        # Single-row groups are kept untouched.
        if len(group) <= 1:
            return group
        # NOTE(review): despite the name, this drops the TWO smallest rows.
        return group.sort_values(by='busbw').iloc[2:]

    trimmed = frame.groupby(group_keys).apply(exclude_min).reset_index(drop=True)
    summary = trimmed.groupby(group_keys).agg(
        busbw_mean=('busbw', 'mean'),
        busbw_max=('busbw', 'max'),
        busbw_min=('busbw', 'min'),
        busbw_std=('busbw', 'std'),
        occurrence_count=('busbw', 'size'),
    ).reset_index()
    summary['msg_size'] = summary['msg_size'].apply(convert_size_to_msg)
    return summary


if __name__ == '__main__':
    # A CSV path must be given as the first command-line argument.
    cli_args = sys.argv
    if len(cli_args) < 2:
        print("Usage: python -m log_analyzer.analyze_res_csv <path_to_csv>")
        sys.exit(1)
    print(analyze_csv(cli_args[1]))

134 changes: 134 additions & 0 deletions aicb/log_analyzer/ds_comm_log_analyzer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
"""
Copyright (c) 2021, Alibaba Group;
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

# /usr/bin/python3
from utils.utils import CommType, CommGroup
from log_analyzer.utils import convert_msg_to_size, convert_size_to_msg
from utils.benchmark_logger import BenchLogger
from log_analyzer.log import LogItem, Log

# Field names matched against the "key: value" segments of a log line.
COMM_OP = "comm op"
CALLER_FUNC = "Caller Func"
TIME_MS = "time (ms)"
MSG_SIZE = "msg size"
# Marker searched for in the lower-cased log line; parsing starts right after it.
LOG_STARTER = "[rank 0]"
# Hard-coded group sizes used by parse_ds_log_item to classify a "group" entry
# by its length.
# NOTE(review): TP_SIZE and DP_SIZE are equal, so a 4-member group always maps
# to tp_group and the dp_group branch is never reached — confirm this is intended.
WORLD_SIZE = 16
TP_SIZE = 4
DP_SIZE = 4
# LOG_STARTER = "[INFO] "


def clean_s(s):
    """Strip surrounding square brackets, newlines, tabs and spaces from ``s``."""
    trim_chars = "[]\n\t "
    return s.strip(trim_chars)


def string2comm_type(s):
    """Map a comm-op description string to a ``CommType`` member.

    The first operation name found as a substring of ``s`` wins. When none
    matches, a warning is printed and ``CommType.epoch_end`` is returned.
    """
    # Order matters: "reduce" is a substring of "reduce_scatter" and
    # "all_reduce", so it must be tested last.
    ordered_names = (
        "all_gather",
        "reduce_scatter",
        "all_reduce",
        "broadcast",
        "barrier",
        "reduce",
    )
    for op_name in ordered_names:
        if op_name in s:
            return getattr(CommType, op_name)
    print(f"WARNING cannot convert {s} to CommType")
    return CommType.epoch_end


def parse_ds_log_item(line):
    """Parse one DeepSpeed comm-log line into a dict of fields.

    The text following the ``LOG_STARTER`` marker is split on ``|`` and each
    segment is interpreted in turn.

    Args:
        line: A single raw log line.

    Returns:
        ``None`` when the line does not contain ``LOG_STARTER``; otherwise a
        dict that may hold ``epoch_num``, ``iter_time``, ``comm_type``,
        ``msg_size``, ``stage``, ``elapsed_time``, ``group``, ``algbw``,
        ``busbw`` and any other ``key: value`` pairs found on the line.
    """
    index = line.lower().find(LOG_STARTER)
    if index == -1:
        return None
    item_list = line[index + len(LOG_STARTER):].split("|")
    item = {}
    for raw_item in item_list:
        if "epoch" in raw_item:
            numbers = [word for word in raw_item.split() if word.isdigit()]
            # Skip the segment instead of raising IndexError when no integer is present.
            if numbers:
                item["epoch_num"] = int(numbers[0])
            continue
        if "micro_step" in raw_item:
            # Allow at most one decimal point so tokens like "1.2.3" are not
            # mistaken for numbers and passed to float().
            numbers = [
                word
                for word in raw_item.split()
                if word.replace(".", "", 1).isdigit()
            ]
            if numbers:
                item["iter_time"] = float(numbers[0])
            continue
        if ":" not in raw_item:
            continue
        # Split on the first colon only, so values that themselves contain
        # ":" no longer break the two-way unpacking.
        key, value = raw_item.split(":", 1)
        key, value = clean_s(key), clean_s(value)
        if key == COMM_OP:
            item["comm_type"] = string2comm_type(value)
        elif MSG_SIZE in key:
            item["msg_size"] = convert_msg_to_size(value)
        elif key == CALLER_FUNC:
            item["stage"] = value
        elif TIME_MS in key:
            item["elapsed_time"] = float(value)
        # This second chain is evaluated independently of the one above, so
        # keys handled above are also stored under their raw name by the
        # final ``else`` branch.
        if key == "group":
            # NOTE(review): eval() executes arbitrary text taken from the log
            # file; only run this on trusted logs, or switch to a literal
            # parser once the exact "group" format is confirmed.
            group = eval(value)
            if len(group) == WORLD_SIZE:
                item["group"] = CommGroup.all
            elif len(group) == TP_SIZE:
                item["group"] = CommGroup.tp_group
            elif len(group) == DP_SIZE:
                item["group"] = CommGroup.dp_group
        elif "algbw" in key:
            item["algbw"] = float(value)
        elif "busbw" in key:
            item["busbw"] = float(value)
        else:
            # Store numeric values as floats and anything else as the raw string.
            try:
                item[key] = float(value)
            except ValueError:
                item[key] = value
    return item


def parse_ds_comm_log(filename):
    """Read a DeepSpeed log file and collect its communication entries in a ``Log``.

    Lines containing an epoch marker add an ``epoch_end`` item. Every other
    line is parsed with ``parse_ds_log_item`` and, when it carries a
    ``comm_type``, added as a ``LogItem`` with its timing and bandwidth fields
    (``-1`` when absent).

    Args:
        filename: Path of the log file to parse.

    Returns:
        The populated ``Log`` instance.
    """
    epoch_markers = ("After initializing ZeRO optimizer", "microstep")
    result = Log()
    with open(filename, "r") as handle:
        content = handle.read()
    for text in content.split("\n"):
        if any(marker in text for marker in epoch_markers):
            result.add_comm_log(LogItem(comm_type=CommType.epoch_end))
            continue
        parsed = parse_ds_log_item(text)
        if parsed is None or "comm_type" not in parsed:
            continue
        entry = LogItem(
            comm_type=parsed["comm_type"],
            # Entries without a recognized group default to the dp group.
            comm_group=parsed.get("group", CommGroup.dp_group),
            msg_size=parsed["msg_size"],
        )
        entry._elapsed_time = parsed.get("elapsed_time", -1)
        entry.algbw = parsed.get("algbw", -1)
        entry.busbw = parsed.get("busbw", -1)
        result.add_comm_log(entry)
    return result


if __name__ == "__main__":
    import sys

    # Print a usage hint instead of failing with IndexError when no path is given.
    if len(sys.argv) < 2:
        print("Usage: python -m log_analyzer.ds_comm_log_analyzer <path_to_log>")
        sys.exit(1)
    filename = sys.argv[1]
    comm_log = parse_ds_comm_log(filename)
    comm_log.analyze()
Loading