diff --git a/.gitignore b/.gitignore index e4a9790f4..e2a34f7b5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,13 @@ astra-sim-alibabacloud/build/simai_analytical/build/ astra-sim-alibabacloud/build/astra_ns3/build/ astra-sim-alibabacloud/extern/ +build-optimized/ bin/ results/ test/log/ *.log +*.bak +ncclFlowModel_*.csv +send.txt .cur* .DS_Store diff --git a/astra-sim-alibabacloud/inputs/config/SimAI_nvlink_only.conf b/astra-sim-alibabacloud/inputs/config/SimAI_nvlink_only.conf new file mode 100644 index 000000000..a28a9e6cf --- /dev/null +++ b/astra-sim-alibabacloud/inputs/config/SimAI_nvlink_only.conf @@ -0,0 +1,65 @@ +ENABLE_QCN 0 +USE_DYNAMIC_PFC_THRESHOLD 1 + +PACKET_PAYLOAD_SIZE 9000 + +FLOW_FILE /etc/astra-sim/simulation/flow1.txt +TRACE_FILE /etc/astra-sim/simulation/trace1.txt +TRACE_OUTPUT_FILE /etc/astra-sim/simulation/llama_hpn7_mix.tr +FCT_OUTPUT_FILE /etc/astra-sim/simulation/llama_hpn7_fct.txt +PFC_OUTPUT_FILE /etc/astra-sim/simulation/llama_hpn7_pfc.txt + +SIMULATOR_STOP_TIME 40000000000000.00 + +CC_MODE 1 +ALPHA_RESUME_INTERVAL 1 +RATE_DECREASE_INTERVAL 4 +CLAMP_TARGET_RATE 0 +RP_TIMER 900 +EWMA_GAIN 0.00390625 +FAST_RECOVERY_TIMES 1 +RATE_AI 50Mb/s +RATE_HAI 100Mb/s +MIN_RATE 100Mb/s +DCTCP_RATE_AI 1000Mb/s + +ERROR_RATE_PER_LINK 0.0000 +L2_CHUNK_SIZE 4000 +L2_ACK_INTERVAL 1 +L2_BACK_TO_ZERO 0 + +HAS_WIN 1 +GLOBAL_T 0 +VAR_WIN 1 +FAST_REACT 1 +U_TARGET 0.95 +MI_THRESH 0 +INT_MULTI 1 +MULTI_RATE 0 +SAMPLE_FEEDBACK 0 +PINT_LOG_BASE 1.05 +PINT_PROB 1.0 + +RATE_BOUND 1 + +ACK_HIGH_PRIO 0 + +LINK_DOWN 0 0 0 + +ENABLE_TRACE 1 + +KMAX_MAP 6 25000000000 400 50000000000 800 100000000000 1600 200000000000 1200 400000000000 3200 1600000000000 2400 +KMIN_MAP 6 25000000000 100 50000000000 200 100000000000 400 200000000000 300 400000000000 800 1600000000000 600 +PMAX_MAP 6 25000000000 0.2 50000000000 0.2 100000000000 0.2 200000000000 0.8 400000000000 0.2 1600000000000 0.2 + +BUFFER_SIZE 32 + +QLEN_MON_FILE /etc/astra-sim/simulation/llama_hpn7_qlen.txt 
def NVLink_Only(parameters):
    """Write a topology file in which GPUs connect only to NVSwitches.

    The file layout produced here is:

    * line 1: ``<nodes> <gpu_per_server> <nv_switch_num> 0 <links> <gpu_type>``
    * line 2: the NVSwitch node ids separated by spaces (with a trailing space)
    * one line per link: ``<gpu_id> <nv_switch_id> <nvlink_bw> <nv_latency> <error_rate>``

    GPU ids are ``0 .. gpu-1``; NVSwitch ids follow directly after them, grouped
    per server, and every GPU is linked to each NVSwitch of its own server.

    Args:
        parameters: dict holding at least ``gpu``, ``gpu_per_server``,
            ``nv_switch_per_server``, ``gpu_type``, ``nvlink_bw``,
            ``nv_latency`` and ``error_rate``. An optional ``output`` entry
            selects the destination path; when it is missing or empty the name
            ``<gpu_type>_<gpu>g_nvlink_only.topo`` is used.

    Raises:
        ValueError: if ``gpu`` is not a multiple of ``gpu_per_server``.
    """
    gpu_num = parameters['gpu']
    gpu_per_server = parameters['gpu_per_server']
    nv_switch_per_server = parameters['nv_switch_per_server']

    if gpu_num % gpu_per_server != 0:
        raise ValueError("NVLink-only topology requires gpu to be divisible by gpu_per_server")

    # Floor division keeps the arithmetic in exact integers; int(a / b) would
    # round-trip through a float.
    servers = gpu_num // gpu_per_server
    nv_switch_num = servers * nv_switch_per_server
    nodes = gpu_num + nv_switch_num
    links = gpu_num * nv_switch_per_server

    file_name = parameters.get('output')
    if not file_name:
        file_name = str(parameters['gpu_type']) + "_" + str(gpu_num) + "g_nvlink_only.topo"

    with open(file_name, 'w') as f:
        print(file_name)
        header = " ".join(
            str(field)
            for field in (nodes, gpu_per_server, nv_switch_num, 0, links, parameters['gpu_type'])
        )
        f.write(header + '\n')
        # The trailing space before the newline is part of the emitted format.
        f.write(" ".join(str(node_id) for node_id in range(gpu_num, nodes)))
        f.write(" \n")

        link_lines = []
        for gpu_id in range(gpu_num):
            # NVSwitches of server k occupy ids gpu_num + k * nv_switch_per_server onward.
            first_nv_switch = gpu_num + (gpu_id // gpu_per_server) * nv_switch_per_server
            for nv_switch_id in range(first_nv_switch, first_nv_switch + nv_switch_per_server):
                link_lines.append(
                    str(gpu_id) + " " + str(nv_switch_id) + " "
                    + str(parameters['nvlink_bw']) + " " + str(parameters['nv_latency'])
                    + " " + str(parameters['error_rate']) + '\n'
                )
        f.writelines(link_lines)
bool(args.ro) parameters['dual_ToR'] = bool(args.dt) parameters['dual_plane'] = bool(args.dp) + parameters['no_asw'] = bool(args.no_asw) + parameters['no_psw'] = bool(args.no_psw) + parameters['output'] = args.output if parameters['topology'] == 'Spectrum-X': @@ -598,4 +645,4 @@ def analysis_template(args, default_parameters): if __name__ =='__main__': - main() \ No newline at end of file + main()