Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,13 @@
astra-sim-alibabacloud/build/simai_analytical/build/
astra-sim-alibabacloud/build/astra_ns3/build/
astra-sim-alibabacloud/extern/
build-optimized/
bin/
results/
test/log/
*.log
*.bak
ncclFlowModel_*.csv
send.txt
.cur*
.DS_Store
65 changes: 65 additions & 0 deletions astra-sim-alibabacloud/inputs/config/SimAI_nvlink_only.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
ENABLE_QCN 0
USE_DYNAMIC_PFC_THRESHOLD 1

PACKET_PAYLOAD_SIZE 9000

FLOW_FILE /etc/astra-sim/simulation/flow1.txt
TRACE_FILE /etc/astra-sim/simulation/trace1.txt
TRACE_OUTPUT_FILE /etc/astra-sim/simulation/llama_hpn7_mix.tr
FCT_OUTPUT_FILE /etc/astra-sim/simulation/llama_hpn7_fct.txt
PFC_OUTPUT_FILE /etc/astra-sim/simulation/llama_hpn7_pfc.txt

SIMULATOR_STOP_TIME 40000000000000.00

CC_MODE 1
ALPHA_RESUME_INTERVAL 1
RATE_DECREASE_INTERVAL 4
CLAMP_TARGET_RATE 0
RP_TIMER 900
EWMA_GAIN 0.00390625
FAST_RECOVERY_TIMES 1
RATE_AI 50Mb/s
RATE_HAI 100Mb/s
MIN_RATE 100Mb/s
DCTCP_RATE_AI 1000Mb/s

ERROR_RATE_PER_LINK 0.0000
L2_CHUNK_SIZE 4000
L2_ACK_INTERVAL 1
L2_BACK_TO_ZERO 0

HAS_WIN 1
GLOBAL_T 0
VAR_WIN 1
FAST_REACT 1
U_TARGET 0.95
MI_THRESH 0
INT_MULTI 1
MULTI_RATE 0
SAMPLE_FEEDBACK 0
PINT_LOG_BASE 1.05
PINT_PROB 1.0

RATE_BOUND 1

ACK_HIGH_PRIO 0

LINK_DOWN 0 0 0

ENABLE_TRACE 1

KMAX_MAP 6 25000000000 400 50000000000 800 100000000000 1600 200000000000 1200 400000000000 3200 1600000000000 2400
KMIN_MAP 6 25000000000 100 50000000000 200 100000000000 400 200000000000 300 400000000000 800 1600000000000 600
PMAX_MAP 6 25000000000 0.2 50000000000 0.2 100000000000 0.2 200000000000 0.8 400000000000 0.2 1600000000000 0.2

BUFFER_SIZE 32

QLEN_MON_FILE /etc/astra-sim/simulation/llama_hpn7_qlen.txt
BW_MON_FILE /etc/astra-sim/simulation/llama_hpn7_bw.txt
RATE_MON_FILE /etc/astra-sim/simulation/llama_hpn7_rate.txt
CNP_MON_FILE /etc/astra-sim/simulation/llama_hpn7_cnp.txt
MON_START 0
MON_END 20000
QP_MON_INTERVAL 100
QLEN_MON_INTERVAL 10000
BW_MON_INTERVAL 10000
53 changes: 50 additions & 3 deletions astra-sim-alibabacloud/inputs/topo/gen_Topo_Template.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,42 @@
import argparse
import warnings

def NVLink_Only(parameters):
    """Write an NVLink-only topology file: GPUs wired to per-server NVSwitches.

    Node ids are laid out as GPUs first (0 .. gpu-1) followed by the
    NVSwitches (gpu .. nodes-1).  Every GPU gets one link to each NVSwitch
    belonging to its own server.

    File layout written by this function:
        line 1: "<nodes> <gpu_per_server> <nv_switch_num> 0 <links> <gpu_type>"
        line 2: space-separated NVSwitch node ids, followed by " " and newline
        rest:   one "<gpu_id> <nv_switch_id> <nvlink_bw> <nv_latency> <error_rate>"
                line per link

    Args:
        parameters: mapping that provides 'gpu', 'gpu_per_server',
            'nv_switch_per_server', 'gpu_type', 'nvlink_bw', 'nv_latency'
            and 'error_rate'.  An optional, non-empty 'output' entry is used
            as the destination path; otherwise the file is named
            "<gpu_type>_<gpu>g_nvlink_only.topo".

    Raises:
        ValueError: if 'gpu' is not a multiple of 'gpu_per_server'.
    """
    gpu_num = parameters['gpu']
    gpu_per_server = parameters['gpu_per_server']
    nv_switch_per_server = parameters['nv_switch_per_server']

    if gpu_num % gpu_per_server != 0:
        raise ValueError("NVLink-only topology requires gpu to be divisible by gpu_per_server")

    # Floor division keeps the arithmetic in exact integers instead of
    # round-tripping through float true division.
    servers = gpu_num // gpu_per_server
    nv_switch_num = servers * nv_switch_per_server
    nodes = gpu_num + nv_switch_num
    links = gpu_num * nv_switch_per_server

    file_name = parameters.get('output')
    if not file_name:
        file_name = str(parameters['gpu_type']) + "_" + str(gpu_num) + "g_nvlink_only.topo"

    header_fields = (
        nodes, gpu_per_server, nv_switch_num, 0, int(links), parameters['gpu_type'],
    )
    # Bandwidth, latency and error rate are identical for every NVLink, so
    # the tail of each link line is built once.
    link_suffix = " ".join(
        str(parameters[key]) for key in ('nvlink_bw', 'nv_latency', 'error_rate')
    )

    with open(file_name, 'w') as f:
        print(file_name)
        f.write(" ".join(str(field) for field in header_fields) + "\n")
        # The switch-id line deliberately ends with a space before the newline.
        f.write(" ".join(str(i) for i in range(gpu_num, nodes)) + " \n")

        for gpu_id in range(gpu_num):
            server_id = gpu_id // gpu_per_server
            first_nv_switch = gpu_num + server_id * nv_switch_per_server
            for local_nv_switch in range(nv_switch_per_server):
                nv_switch_id = first_nv_switch + local_nv_switch
                f.write(str(gpu_id) + " " + str(nv_switch_id) + " " + link_suffix + "\n")

def Rail_Opti_SingleToR(parameters):
nodes_per_asw = parameters['nics_per_aswitch']
asw_switch_num_per_segment = parameters['gpu_per_server']
Expand Down Expand Up @@ -491,11 +527,18 @@ def main():
parser.add_argument('-psn','--psw_switch_num',type=int,default=None,help='psw_switch_num, default 64')
parser.add_argument('-apbw','--ap_bandwidth',type=str,default=None,help='asw to psw bandwidth,default 400Gbps')
parser.add_argument('-app','--asw_per_psw',type=int,default=None,help='asw for psw')
parser.add_argument('--no-asw', action='store_true', help='generate an NVSwitch-only topology without ASW switches')
parser.add_argument('--no-psw', action='store_true', help='generate an NVSwitch-only topology without PSW switches')
parser.add_argument('-o', '--output', type=str, default=None, help='output topology file path')
args = parser.parse_args()

default_parameters = []
parameters = analysis_template(args, default_parameters)
if not parameters['rail_optimized']:
if parameters['no_asw'] and parameters['no_psw']:
NVLink_Only(parameters)
elif parameters['no_asw'] or parameters['no_psw']:
raise ValueError("--no-asw and --no-psw must be used together for NVLink-only topology")
elif not parameters['rail_optimized']:
if parameters['dual_plane']:
raise ValueError("Sorry, None Rail-Optimized Structure doesn't support Dual Plane")
if parameters['dual_ToR']:
Expand All @@ -520,12 +563,16 @@ def analysis_template(args, default_parameters):
'gpu_per_server': 8, 'gpu_type': 'H100', 'nv_switch_per_server': 1,
'nvlink_bw': '2880Gbps','nv_latency': '0.000025ms', 'latency': '0.0005ms',
'bandwidth': '400Gbps', 'asw_switch_num': 8, 'nics_per_aswitch': 64,
'psw_switch_num': 64, 'ap_bandwidth': "400Gbps", 'asw_per_psw' : 64}
'psw_switch_num': 64, 'ap_bandwidth': "400Gbps", 'asw_per_psw' : 64,
'no_asw': False, 'no_psw': False, 'output': None}
parameters = {}
parameters['topology'] = args.topology
parameters['rail_optimized'] = bool(args.ro)
parameters['dual_ToR'] = bool(args.dt)
parameters['dual_plane'] = bool(args.dp)
parameters['no_asw'] = bool(args.no_asw)
parameters['no_psw'] = bool(args.no_psw)
parameters['output'] = args.output


if parameters['topology'] == 'Spectrum-X':
Expand Down Expand Up @@ -598,4 +645,4 @@ def analysis_template(args, default_parameters):


# Run the topology generator only when this file is executed as a script.
if __name__ == '__main__':
    main()