From 5c93a7c7274bef202b7e53bb7ae50f09f38d2282 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 30 Jun 2026 07:37:33 +0200 Subject: [PATCH 1/7] Add FIRST_CHECK support and DeepSeek V3 first_eval_samples compliance rule Add FIRST_CHECK to mlp_compliance.py that runs only on the first occurrence of a key. Use it in closed_deepseekv3_671b.yaml to require the first eval_accuracy samples_count equals GBS*floor(42+24576/GBS). --- mlperf_logging/compliance_checker/mlp_compliance.py | 4 ++++ .../training_6.0.0/closed_deepseekv3_671b.yaml | 1 + 2 files changed, 5 insertions(+) diff --git a/mlperf_logging/compliance_checker/mlp_compliance.py b/mlperf_logging/compliance_checker/mlp_compliance.py index e7ee391..3d2b30f 100644 --- a/mlperf_logging/compliance_checker/mlp_compliance.py +++ b/mlperf_logging/compliance_checker/mlp_compliance.py @@ -184,6 +184,7 @@ def configured_checks(self, loglines, config_file): at_least_one_checks = {} # executing the rules through log records has_been_exec = set([]) + first_check_seen = set([]) for line in loglines: key_record = None try: @@ -196,6 +197,9 @@ def configured_checks(self, loglines, config_file): if 'PRE' in key_record: self.run_check_exec(line, key_record['PRE'], state, 'PRE') if 'CHECK' in key_record: self.run_check_eval(line, key_record['CHECK'], state) if 'POST' in key_record: self.run_check_exec(line, key_record['POST'], state, 'POST') + if 'FIRST_CHECK' in key_record and line.key not in first_check_seen: + first_check_seen.add(line.key) + self.run_check_eval(line, key_record['FIRST_CHECK'], state) if 'ATLEAST_ONE_CHECK' in key_record: if line.key not in at_least_one_checks: at_least_one_checks[line.key] = [0, key_record['ATLEAST_ONE_CHECK']] diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml index f0edfbc..a81749c 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml 
+++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml @@ -86,4 +86,5 @@ REQ: AT_LEAST_ONE CHECK: - "'samples_count' in v['metadata']" + FIRST_CHECK: " int(v['metadata']['samples_count']) == s['global_batch_size'] * math.floor(42 + 24576 / s['global_batch_size']) " ATLEAST_ONE_CHECK: "(v['value'] <= 3.6) and v['value'] > 0.0" From 3f99c349387a807d206ca66567ee4beec83a09bc Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 30 Jun 2026 09:00:36 +0200 Subject: [PATCH 2/7] Add training_6.1.0 compliance configs and move first_eval_samples check there Copy training_6.0.0 to training_6.1.0. Remove FIRST_CHECK for first_eval_samples from 6.0.0 closed DeepSeek V3 config, keeping it only in 6.1.0. --- .../closed_deepseekv3_671b.yaml | 1 - .../training_6.1.0/closed_common.yaml | 11 + .../closed_deepseekv3_671b.yaml | 90 +++++++++ .../training_6.1.0/closed_dlrm_dcnv2.yaml | 59 ++++++ .../training_6.1.0/closed_flux1.yaml | 56 +++++ .../training_6.1.0/closed_gpt_oss_20b.yaml | 86 ++++++++ .../closed_llama2_70b_lora.yaml | 42 ++++ .../training_6.1.0/closed_llama31_405b.yaml | 88 ++++++++ .../training_6.1.0/closed_llama31_8b.yaml | 87 ++++++++ .../training_6.1.0/common.yaml | 191 ++++++++++++++++++ .../training_6.1.0/open_common.yaml | 6 + .../training_6.1.0/open_deepseekv3_671b.yaml | 78 +++++++ .../training_6.1.0/open_dlrm_dcnv2.yaml | 7 + .../training_6.1.0/open_flux1.yaml | 13 ++ .../training_6.1.0/open_gpt_oss_20b.yaml | 68 +++++++ .../training_6.1.0/open_llama2_70b_lora.yaml | 7 + .../training_6.1.0/open_llama31_405b.yaml | 78 +++++++ .../training_6.1.0/open_llama31_8b.yaml | 8 + 18 files changed, 975 insertions(+), 1 deletion(-) create mode 100755 mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/closed_deepseekv3_671b.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/closed_dlrm_dcnv2.yaml create mode 100644 
mlperf_logging/compliance_checker/training_6.1.0/closed_flux1.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/closed_gpt_oss_20b.yaml create mode 100755 mlperf_logging/compliance_checker/training_6.1.0/closed_llama2_70b_lora.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_405b.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_8b.yaml create mode 100755 mlperf_logging/compliance_checker/training_6.1.0/common.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/open_deepseekv3_671b.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/open_dlrm_dcnv2.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/open_flux1.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/open_gpt_oss_20b.yaml create mode 100755 mlperf_logging/compliance_checker/training_6.1.0/open_llama2_70b_lora.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/open_llama31_405b.yaml create mode 100644 mlperf_logging/compliance_checker/training_6.1.0/open_llama31_8b.yaml diff --git a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml index a81749c..f0edfbc 100644 --- a/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml +++ b/mlperf_logging/compliance_checker/training_6.0.0/closed_deepseekv3_671b.yaml @@ -86,5 +86,4 @@ REQ: AT_LEAST_ONE CHECK: - "'samples_count' in v['metadata']" - FIRST_CHECK: " int(v['metadata']['samples_count']) == s['global_batch_size'] * math.floor(42 + 24576 / s['global_batch_size']) " ATLEAST_ONE_CHECK: "(v['value'] <= 3.6) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml 
b/mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml new file mode 100755 index 0000000..5211757 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml @@ -0,0 +1,11 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'gpt_oss_20b', 'deepseekv3_671b'] " + POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_deepseekv3_671b.yaml new file mode 100644 index 0000000..a81749c --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_deepseekv3_671b.yaml @@ -0,0 +1,90 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 15360 " + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 4096 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " abs(v['value'] - 2.4e-05 * (s['global_batch_size'] / 16384) ** 0.5) < 1e-9 " + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 4 " + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 12000 - s['opt_learning_rate_warmup_steps'] " + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 
" + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-08 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: moe_aux_loss_coeff + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.01 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1024 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + FIRST_CHECK: " int(v['metadata']['samples_count']) == s['global_batch_size'] * math.floor(42 + 24576 / s['global_batch_size']) " + ATLEAST_ONE_CHECK: "(v['value'] <= 3.6) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_dlrm_dcnv2.yaml new file mode 100644 index 0000000..45344bd --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_dlrm_dcnv2.yaml @@ -0,0 +1,59 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adagrad' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adagrad_learning_rate_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_adagrad_initial_accumulator_value + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_adagrad_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-8 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_learning_rate_decay_start_step + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] 
== 0 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0" + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 89137319 " diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_flux1.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_flux1.yaml new file mode 100644 index 0000000..49f60bd --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_flux1.yaml @@ -0,0 +1,56 @@ +- KEY: + NAME: global_batch_size + REQ: AT_LEAST_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: evaluation_frequency + REQ: EXACTLY_ONE + CHECK: " v['value'] == 262144" + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-08 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_gpt_oss_20b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_gpt_oss_20b.yaml new file mode 100644 index 0000000..25ce760 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_gpt_oss_20b.yaml @@ -0,0 +1,86 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + 
s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1_200_000 - s['opt_learning_rate_warmup_steps'] " + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-05 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1024 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.34) and v['value'] > 0.0" + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1200000 " \ No newline at end of file diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama2_70b_lora.yaml new file mode 100755 index 0000000..46de03e --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama2_70b_lora.yaml @@ -0,0 +1,42 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: 
> + s['global_batch_size'] = v['value'] + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + + +- KEY: + NAME: opt_learning_rate_training_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: lora_alpha + REQ: EXACTLY_ONE + +- KEY: + NAME: lora_rank + REQ: EXACTLY_ONE + CHECK: " v['value'] == 16" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 0.925) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_405b.yaml new file mode 100644 index 0000000..90e2d45 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_405b.yaml @@ -0,0 +1,88 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] * 1152 == s['global_batch_size'] * 8e-5 " + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + POST: > + s['opt_learning_rate_warmup_steps'] = math.ceil(8000 * 1152 / s['global_batch_size'] ) + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == math.ceil(1_200_000 * 1152 / s['global_batch_size'] ) - s['opt_learning_rate_warmup_steps'] " + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: 
opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-05 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 5760 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0" + +- KEY: + NAME: init_checkpoint_step + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_8b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_8b.yaml new file mode 100644 index 0000000..d12bf9c --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_8b.yaml @@ -0,0 +1,87 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1_200_000 - s['opt_learning_rate_warmup_steps'] " + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: 
EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-05 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1024 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.3) and v['value'] > 0.0" + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1200000 " + diff --git a/mlperf_logging/compliance_checker/training_6.1.0/common.yaml b/mlperf_logging/compliance_checker/training_6.1.0/common.yaml new file mode 100755 index 0000000..7526c47 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/common.yaml @@ -0,0 +1,191 @@ +# This file lists all the KEYs to be checked. Every line that matches mlperf logging regex (::MLL...) will be checked against these rules. +# In the order of the appearance in the log, for each line will execute the code specified under CHECK for the KEY in that line. +# The code will be launched using local state 'v' which is the content of value field in log line, and global state 's'. +# Global state 's' exists to allow cross-line checks, like start/stop pairs etc. To initialize 's' use BEGIN record which CODE will +# be executed before any checks. +# In addition, occurrence of each key will be counted and at the end if a requirement regarding the number of occurrences is defined it will +# be confirmed. This could be implemented using global state, but since this is a common thing to do it is natively supported. 
+# +# KEY record: +# NAME +# REQ - optional - {EXACTLY_ONE, AT_LEAST_ONE} +# PRE - optional - code to be executed before CHECK +# CHECK - optional - expression to be evaluated to verify correctness +# POST - optional - code to be executed after CHECK + +- BEGIN: + CODE: > + s.update({ + 'init_started': False, + 'init_stopped' : False, + 'run_started' : False, + 'run_stopped' : False, + 'in_epoch' : False, + 'last_epoch' : 0, + 'in_block' : False, + 'block_first_epoch' : -1, + 'first_init_start': 9e99, + 'compile_time_mins': 0, + }) + +- KEY: + NAME: submission_org + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_platform + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_division + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['closed', 'open'] " + POST: " enqueue_config('training_6.0.0/{}_common.yaml'.format(v['value'])); s['compile_time_mins'] = 240 if v['value'] == 'open' else 30 " + +# at least one record should be found, but any found records must pass the test +- KEY: + NAME: cache_clear + REQ: AT_LEAST_ONE + CHECK: + - "'value' in v" + +# frequency not checked +- KEY: + NAME: init_start + REQ: AT_LEAST_ONE + CHECK: + - "not s['init_stopped']" + - "not s['run_started']" + POST: " s['init_started'] = True; s['first_init_start']=min(s['first_init_start'], ll.timestamp) " + +# confirm less than 20min since the very first init_start +- KEY: + NAME: init_stop + REQ: EXACTLY_ONE + CHECK: + - "s['init_started']" + - "not s['run_started']" + - "ll.timestamp - s['first_init_start'] < (s['compile_time_mins']*60*1e3)" + POST: " s['init_stopped'] = True" + +- KEY: + NAME: run_start + REQ: EXACTLY_ONE + CHECK: " ( s['init_stopped'] == True )" + POST: " s['run_started'] = True " + +# status can also be aborted, but not allowing it here for now +# if eval is inside epoch and we decide to terminate, we can lack epoch_stop, it is ok +- KEY: + NAME: run_stop + REQ: EXACTLY_ONE + CHECK: + - "s['run_started']" + - "'status' in 
v['metadata']" + POST: " s['run_stopped'] = True " + +# FIXME: check epoch_count value match +- KEY: + NAME: block_start + REQ: AT_LEAST_ONE_OR(epoch_start) + CHECK: + - "s['run_started']" + - "('epoch_count' in v['metadata']) | ('samples_count' in v['metadata'])" + - "'first_epoch_num' in v['metadata'] if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['epoch_count'] > 0 if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['samples_count'] >= 0 if 'samples_count' in v['metadata'] else True" + +- KEY: + NAME: block_stop + REQ: AT_LEAST_ONE_OR(epoch_stop) + CHECK: + - "('first_epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: epoch_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: epoch_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +# making sure previous eval did print it's accuracy result +- KEY: + NAME: eval_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: eval_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: train_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +# Optional keys +- KEY: + NAME: lowest_numerical_precision_in_linear + REQ: OPTIONAL + CHECK: " v['value'] in ['fp64', 'fp32', 'tf32', 'fp16', 'fp8', 'nvfp4', 'mxfp4', 'bfloat16', 'Graphcore FLOAT 16.16', 'int8', 'uint8', 'int4', 'uint4'] " + +- KEY: + NAME: lowest_numerical_precision_in_attn + REQ: OPTIONAL + CHECK: " v['value'] in ['fp64', 'fp32', 'tf32', 'fp16', 'fp8', 
'nvfp4', 'mxfp4', 'bfloat16', 'Graphcore FLOAT 16.16', 'int8', 'uint8', 'int4', 'uint4'] " + +- KEY: + NAME: lowest_numerical_precision_in_comm + REQ: OPTIONAL + CHECK: " v['value'] in ['fp64', 'fp32', 'tf32', 'fp16', 'fp8', 'nvfp4', 'mxfp4', 'bfloat16', 'Graphcore FLOAT 16.16', 'int8', 'uint8', 'int4', 'uint4'] " + +- KEY: + NAME: tensor_parallelism + REQ: OPTIONAL + CHECK: " is_integer(v['value']) " + +- KEY: + NAME: pipeline_parallelism + REQ: OPTIONAL + CHECK: " is_integer(v['value']) " + +- KEY: + NAME: context_parallelism + REQ: OPTIONAL + CHECK: " is_integer(v['value']) " + +- KEY: + NAME: expert_parallelism + REQ: OPTIONAL + CHECK: " is_integer(v['value']) " + +- KEY: + NAME: micro_batch_size + REQ: OPTIONAL + CHECK: " is_integer(v['value']) " + +- KEY: + NAME: config_filename + REQ: OPTIONAL + diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml new file mode 100644 index 0000000..bd0cde1 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml @@ -0,0 +1,6 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'gpt_oss_20b', 'deepseekv3_671b'] " + POST: " enqueue_config('training_6.0.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_deepseekv3_671b.yaml new file mode 100644 index 0000000..0ef940d --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_deepseekv3_671b.yaml @@ -0,0 +1,78 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 15360 " + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 4096 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' 
" + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: moe_aux_loss_coeff + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1024 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.6) and v['value'] > 0.0" + diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_dlrm_dcnv2.yaml new file mode 100644 index 0000000..7f70c0c --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_dlrm_dcnv2.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'epoch_num' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_flux1.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_flux1.yaml new file mode 100644 index 0000000..19ee8de --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_flux1.yaml @@ -0,0 +1,13 @@ +# Stable diffusion uses two metrics, FID and CLIP. +# These metrics can be calculated offline, using different scripts +# and logged seperatly. 
Therefore, we create a virtual key +# called aggregated_eval_accuracy, which aggregates +# both metrics into a single log line + +# TODO: Update with official metric name +- KEY: + NAME: averaged_validation_loss + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_gpt_oss_20b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_gpt_oss_20b.yaml new file mode 100644 index 0000000..676d277 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_gpt_oss_20b.yaml @@ -0,0 +1,68 @@ +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.34) and v['value'] > 0.0" + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE \ No newline at end of file diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_llama2_70b_lora.yaml new file mode 100755 index 0000000..784c008 --- 
/dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_llama2_70b_lora.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_405b.yaml new file mode 100644 index 0000000..63016f4 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_405b.yaml @@ -0,0 +1,78 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 5760 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0" + +- KEY: + NAME: init_checkpoint_step + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0 " + diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_8b.yaml 
b/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_8b.yaml new file mode 100644 index 0000000..5df29d5 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_8b.yaml @@ -0,0 +1,8 @@ + +# TODO: Update with official compliance requirements +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" From 33ef7d1083db1625027d8f13de387764f14f9844 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 30 Jun 2026 09:01:42 +0200 Subject: [PATCH 3/7] Add rcp_checker training_6.1.0 as copy of training_6.0.0 --- .../training_6.1.0/rcps_deepseekv3_671b.json | 58 +++++++ .../training_6.1.0/rcps_dlrm_dcnv2.json | 162 ++++++++++++++++++ .../training_6.1.0/rcps_flux1.json | 90 ++++++++++ .../training_6.1.0/rcps_gpt_oss_20b.json | 64 +++++++ .../training_6.1.0/rcps_llama2_70b_lora.json | 95 ++++++++++ .../training_6.1.0/rcps_llama31_405b.json | 60 +++++++ .../training_6.1.0/rcps_llama31_8b.json | 112 ++++++++++++ 7 files changed, 641 insertions(+) create mode 100644 mlperf_logging/rcp_checker/training_6.1.0/rcps_deepseekv3_671b.json create mode 100644 mlperf_logging/rcp_checker/training_6.1.0/rcps_dlrm_dcnv2.json create mode 100644 mlperf_logging/rcp_checker/training_6.1.0/rcps_flux1.json create mode 100644 mlperf_logging/rcp_checker/training_6.1.0/rcps_gpt_oss_20b.json create mode 100644 mlperf_logging/rcp_checker/training_6.1.0/rcps_llama2_70b_lora.json create mode 100644 mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_405b.json create mode 100644 mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_8b.json diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_deepseekv3_671b.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_deepseekv3_671b.json new file mode 100644 index 0000000..c69b9bb --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_deepseekv3_671b.json @@ -0,0 +1,58 @@ +{ + "deepseekv3_671b_ref_15360": + { + "Benchmark": 
"deepseekv3_671b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 6.0 submission", + "Platform": "64 NVIDIA GB300 nodes", + "Precision": "BF16", + "BS": 15360, + "Hyperparams": { + "opt_base_learning_rate": 0.000023238, + "opt_learning_rate_warmup_steps": 4, + "opt_learning_rate_decay_steps": 11996, + "gradient_accumulation_steps": 240 + }, + "Epochs to converge": [ + 721920, 721920, 721920, 737280, 691200, 737280 + ] + }, + + "deepseekv3_671b_ref_16384": + { + "Benchmark": "deepseekv3_671b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 6.0 submission", + "Platform": "64 NVIDIA GB300 nodes", + "Precision": "BF16", + "BS": 16384, + "Hyperparams": { + "opt_base_learning_rate": 0.000024, + "opt_learning_rate_warmup_steps": 4, + "opt_learning_rate_decay_steps": 11996, + "gradient_accumulation_steps": 256 + }, + "Epochs to converge": [ + 770048, 786432, 770048, 753664, 753664, 770048 + ] + }, + + "deepseekv3_671b_ref_18432": + { + "Benchmark": "deepseekv3_671b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 6.0 submission", + "Platform": "64 NVIDIA GB300 nodes", + "Precision": "BF16", + "BS": 18432, + "Hyperparams": { + "opt_base_learning_rate": 0.000025456, + "opt_learning_rate_warmup_steps": 4, + "opt_learning_rate_decay_steps": 11996, + "gradient_accumulation_steps": 288 + }, + "Epochs to converge": [ + 847872, 866304, 866304, 829440, 866304, 866304 + ] + } + } diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_dlrm_dcnv2.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_dlrm_dcnv2.json new file mode 100644 index 0000000..3a71eff --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_dlrm_dcnv2.json @@ -0,0 +1,162 @@ +{ + + "dlrm_dcnv2_ref_32768": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "Precision": "FP32", + "BS": 32768, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 
0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, + 0.75, 0.7, 0.7, 0.7, 0.75, 0.75, 0.75, 0.7, 0.7, 0.7, + 0.7, 0.7, 0.75, 0.7, 0.65, 0.7, 0.7, 0.7, 0.7, 0.7 + ] + }, + + "dlrm_dcnv2_ref_55296": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "At 3.0 submission", + "Platform": "DGX-A100", + "Precision": "FP32", + "BS": 55296, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.75, 0.75, 0.75, 0.7, 0.8, 0.75, 0.75, 0.75, 0.75, 0.75, + 0.9, 0.7, 0.75, 0.8, 0.7, 0.8, 0.7, 0.7, 0.75, 0.7, + 0.7, 0.9, 0.75, 0.7, 0.8, 0.75, 0.75, 0.8, 0.75, 0.8, + 0.9, 0.75, 0.8, 0.75, 0.8, 0.75, 0.75, 0.75, 0.7, 0.75, + 0.75, 0.8, 0.75, 0.8, 0.8, 0.9, 0.75, 0.75, 0.7, 0.75 + ] + }, + + "dlrm_dcnv2_ref_65536": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "Precision": "FP32", + "BS": 65536, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.75, 0.8, 0.75, 0.75, 0.8, 0.75, 0.8, 0.9, 0.95, 0.75, + 0.75, 0.75, 0.85, 0.85, 0.7, 0.75, 0.75, 0.9, 0.85, 0.8, + 0.7, 0.75, 0.75, 0.75, 0.8, 0.9, 0.75, 
0.8, 0.85, 0.8 + ] + }, + + "dlrm_dcnv2_ref_102400": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "Prior to 3.0 submission", + "Platform": "DGX-A100", + "Precision": "FP32", + "BS": 102400, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.85, 0.95, 0.95, 0.85, 0.9, 0.8, 0.85, 0.9, 0.9, 0.9, + 0.95, 0.9, 0.9, 0.9, 0.9, 0.9, 0.85, 0.85, 0.9, 0.9, + 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.85, 0.9, 0.9, + 0.9, 0.95, 0.85, 0.9, 0.9, 0.9, 0.85, 0.9, 0.95, 0.9, + 0.85, 0.95, 0.9, 0.9, 0.8, 0.9, 0.9, 0.9, 0.85, 0.9 + ] + }, + + "dlrm_dcnv2_ref_135168": { + "Benchmark": "dlrm_dcnv2", + "Creator": "NVIDIA", + "When": "At 3.0 submission", + "Platform": "DGX-A100", + "Precision": "FP32", + "BS": 135168, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.0034, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.95, 0.9, 0.9, 0.9, 0.9, 0.95, 0.9, 0.95, 0.95, 0.9, + 0.95, 0.95, 0.95, 1.0, 0.85, 0.9, 0.9, 0.95, 0.95, 0.95, + 0.95, 0.9, 0.9, 0.9, 0.95, 0.95, 1.0, 0.9, 0.95, 0.95, + 0.85, 0.95, 0.95, 0.95, 0.9, 0.95, 0.9, 0.9, 1.0, 0.9, + 0.95, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95, 0.9, 0.9, 0.9, + 0.9, 0.9, 0.9, 0.9, 0.95, 0.85, 0.95, 0.95, 0.9, 0.95, + 0.95, 0.95, 0.95, 1.0, 0.9, 0.95, 0.9, 1.0, 0.85, 0.9, + 0.9, 0.95, 0.95, 0.9, 0.95, 0.9, 0.95, 0.85, 0.95, 0.95, + 0.95, 0.9, 0.9, 0.95, 0.9, 0.95, 0.9, 1.0 + ] + }, + + "dlrm_dcnv2_ref_160000": { + "Benchmark": 
"dlrm_dcnv2", + "Creator": "Cisco", + "When": "At 5.1 submission", + "Platform": "DGX-H100", + "Precision": "FP32", + "BS": 160000, + "Hyperparams": { + "opt_name": "adagrad", + "opt_base_learning_rate": 0.004, + "opt_adagrad_learning_rate_decay": 0.0, + "opt_adagrad_initial_accumulator_value": 0.0, + "opt_adagrad_epsilon": 1e-08, + "opt_weight_decay": 0.0, + "opt_learning_rate_warmup_steps": 0, + "opt_learning_rate_decay_start_step": 0, + "opt_learning_rate_decay_steps": 0 + }, + "Epochs to converge": [ + 0.95, 0.95, 1, 1, 0.95, 1, 1, 0.95, 0.95, 0.90, + 0.90, 1, 0.90, 0.95, 0.90, 1, 0.95, 0.95, 0.95, 1 + ] + } + +} diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_flux1.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_flux1.json new file mode 100644 index 0000000..e071b7d --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_flux1.json @@ -0,0 +1,90 @@ +{ + "flux_ref_512": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 512, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.0e-4, + "opt_learning_rate_warmup_steps": 1600, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 7077888, 7340032, 7077888, 7077888, 7340032, 7340032, 7602176, 7340032, + 7077888, 7340032, 7077888, 7340032, 7340032, 7077888, 7077888, 7077888, + 7340032, 7340032, 7077888, 7340032 + ] + }, + "flux_ref_1024": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 1024, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.5e-4, + "opt_learning_rate_warmup_steps": 800, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 
8650752, 8650752, 8126464, 8650752, 8650752, 8912896, 8126464, 8388608, + 8650752, 8126464, 8126464, 8650752, 8388608, 8388608, 8650752, 8388608, + 8388608, 8388608, 8912896, 8650752 + ] + }, + "flux_ref_2048": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 2048, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.5e-4, + "opt_learning_rate_warmup_steps": 0, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 9437184, 10223616, 10485760, 11010048, 10747904, 12320768, 10485760, + 9961472, 10485760, 9437184, 9699328, 11534336, 9699328, 9699328, 10747904, + 9961472, 10485760, 10747904, 9961472, 9961472 + ] + }, + "flux_ref_4096": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 4096, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 4.0e-4, + "opt_learning_rate_warmup_steps": 100, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 15204352, 15990784, 15466496, 15728640, 15204352, 15466496, 15990784, + 15204352, 14942208, 15204352, 15466496, 16252928, 14680064, 14942208, + 13893632, 15466496, 15466496, 15728640, 15466496, 15204352 + ] + } +} diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_gpt_oss_20b.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_gpt_oss_20b.json new file mode 100644 index 0000000..823ceb8 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_gpt_oss_20b.json @@ -0,0 +1,64 @@ +{ + "gpt_oss_20b_ref_16": + { + "Benchmark": "gpt_oss_20b", + "Creator": "AMD", + "When": "Reference RCPs before 6.0 submission", + "Platform": "1xMI355X", + "Precision": "BF16", + "BS": 16, + 
"Hyperparams": { + "opt_base_learning_rate": 4e-04, + "opt_learning_rate_warmup_samples": 2048, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 184320, 208896, 184320, 184320, 196608, + 196608, 196608, 196608, 196608, 208896, + 196608, 184320, 196608, 184320, 196608, + 208896, 196608, 196608, 196608, 184320 + ] + }, + + "gpt_oss_20b_ref_32": + { + "Benchmark": "gpt_oss_20b", + "Creator": "AMD", + "When": "Reference RCPs before 6.0 submission", + "Platform": "1xMI355X", + "Precision": "BF16", + "BS": 32, + "Hyperparams": { + "opt_base_learning_rate": 8e-04, + "opt_learning_rate_warmup_samples": 4096, + "gradient_accumulation_steps": 2 + }, + "Epochs to converge": [ + 245760, 233472, 233472, 233472, 233472, + 233472, 233472, 233472, 233472, 233472, + 233472, 221184, 233472, 233472, 245760, + 245760, 221184, 245760, 245760, 221184 + ] + }, + "gpt_oss_20b_ref_64": + { + "Benchmark": "gpt_oss_20b", + "Creator": "AMD", + "When": "Reference RCPs before 6.0 submission", + "Platform": "1xMI355X", + "Precision": "BF16", + "BS": 64, + "Hyperparams": { + "opt_base_learning_rate": 1e-03, + "opt_learning_rate_warmup_samples": 12288, + "gradient_accumulation_steps": 4 + }, + "Epochs to converge": [ + 282624, 307200, 307200, 307200, 294912, + 331776, 307200, 294912, 294912, 282624, + 319488, 282624, 294912, 319488, 294912, + 294912, 331776, 282624, 294912, 307200 + ] + } + } + \ No newline at end of file diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama2_70b_lora.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama2_70b_lora.json new file mode 100644 index 0000000..86630bd --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama2_70b_lora.json @@ -0,0 +1,95 @@ +{ + "llama2_70b_lora_ref_8": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 8, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + 
"opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 3072,2688,3456,3072,3072,3072,3456,3456,3072,2688, + 3456,3072,3072,3072,3840,3456,2688,3072,3456,3456 + ] + }, + + "llama2_70b_lora_ref_16": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 16, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 3840,3840,4224,3840,3840,3840,4608,3840,4608,3840, + 4992,3840,3840,3840,4992,3840,3840,4224,3840,3456 + ] + }, + "llama2_70b_lora_ref_32": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 32, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 5760,6528,6144,6528,5376,6528,5760,6144,6144,6528, + 6144,6144,6144,5760,5760,5760,5760,5760,6144,5760 + ] + }, + "llama2_70b_lora_ref_128": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 128, + "Hyperparams": { + "opt_base_learning_rate": 1e-3, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 
11520,13056,10752,12672,12288,11136,10752,13056, 10752,9984, + 11136,11136,11136,10752,11520,11136,11136,10752,11136,9984 + ] + } +} diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_405b.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_405b.json new file mode 100644 index 0000000..d1a7620 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_405b.json @@ -0,0 +1,60 @@ +{ + "llama31_405b_ref_1152": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs after 5.0 submission", + "Platform": "288xDGX-H100", + "Precision": "BF16", + "BS": 1152, + "Hyperparams": { + "opt_base_learning_rate": 8e-05, + "opt_learning_rate_warmup_steps": 8000, + "gradient_accumulation_steps": 144 + }, + "Epochs to converge": [ + 313344,313344,313344, + 331776,313344,294912 + ] + }, + + "llama31_405b_ref_2304": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs after 5.0 submission", + "Platform": "288xDGX-H100", + "Precision": "BF16", + "BS": 2304, + "Hyperparams": { + "opt_base_learning_rate": 16e-05, + "opt_learning_rate_warmup_steps": 4000, + "gradient_accumulation_steps": 288 + }, + "Epochs to converge": [ + 368640,350208,387072, + 368640,368640,368640 + ] + }, + + "llama31_405b_ref_4608": + { + "Benchmark": "llama31_405b", + "Creator": "NVIDIA", + "When": "Reference RCPs after 5.0 submission", + "Platform": "288xDGX-H100", + "Precision": "BF16", + "BS": 4608, + "Hyperparams": { + "opt_base_learning_rate": 32e-05, + "opt_learning_rate_warmup_steps": 2000, + "gradient_accumulation_steps": 576 + }, + "Epochs to converge": [ + 497664,497664,460800, + 497664,479232,497664 + ] + } + } + + diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_8b.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_8b.json new file mode 100644 index 0000000..1b7143f --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_8b.json @@ -0,0 +1,112 @@ +{ + 
"llama31_8b_ref_16": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "2xDGX-B200", + "Precision": "BF16", + "BS": 16, + "Hyperparams": { + "opt_base_learning_rate": 4e-04, + "opt_learning_rate_warmup_samples": 256, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 159744, 159744, 159744, 159744, 159744, + 159744, 172032, 159744, 172032, 159744, + 172032, 159744, 159744, 159744, 159744, + 159744, 159744, 159744, 159744, 159744 + ] + }, + + "llama31_8b_ref_32": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "4xDGX-B200", + "Precision": "BF16", + "BS": 32, + "Hyperparams": { + "opt_base_learning_rate": 8e-04, + "opt_learning_rate_warmup_samples": 4096, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 196608, 172032, 184320, 184320, 172032, + 172032, 184320, 184320, 184320, 172032, + 172032, 172032, 184320, 184320, 184320, + 172032, 172032, 172032, 184320, 184320 + ] + }, + + "llama31_8b_ref_64": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "4xDGX-B200", + "Precision": "BF16", + "BS": 64, + "Hyperparams": { + "opt_base_learning_rate": 8e-04, + "opt_learning_rate_warmup_samples": 6144, + "gradient_accumulation_steps": 2 + }, + "Epochs to converge": [ + 233472, 208896, 208896, 233472, 233472, + 233472, 233472, 233472, 208896, 233472, + 233472, 233472, 245760, 221184, 208896, + 233472, 233472, 221184, 221184, 221184 + ] + }, + + "llama31_8b_ref_96": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "2xDGX-B200", + "Precision": "BF16", + "BS": 96, + "Hyperparams": { + "opt_base_learning_rate": 1e-03, + "opt_learning_rate_warmup_samples": 16348, + "gradient_accumulation_steps": 6 + }, + "Epochs to converge": [ + 297216, 284832, 272448, 272448, 272448, + 
272448, 297216, 272448, 297216, 272448, + 297216, 260064, 272448, 272448, 272448, + 284832, 260064, 284832, 284832, 272448 + ] + }, + + "llama31_8b_ref_128": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "4xDGX-B200", + "Precision": "BF16", + "BS": 128, + "Hyperparams": { + "opt_base_learning_rate": 2e-03, + "opt_learning_rate_warmup_samples": 32768, + "gradient_accumulation_steps": 4 + }, + "Epochs to converge": [ + 368640, 344064, 356352, 344064, 368640, + 368640, 405504, 344064, 331776, 307200, + 331776, 380928, 307200, 344064, 319488, + 356352, 331776, 319488, 356352, 331776 + ] + } +} + + + + + + From 6e737293c6eed09389a9e8683e9552e5ca47e700 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 30 Jun 2026 09:03:54 +0200 Subject: [PATCH 4/7] Register ruleset 6.1.0 across all checkers and tools Add 6.1.0 to mlp_parser dispatch, rcp_checker supported set, package_checker version sets, repo_checker choices, result_summarizer config, and add verify_for_v6.1_training.sh script. 
--- .../compliance_checker/mlp_parser/__init__.py | 2 ++ .../package_checker/package_checker.py | 6 +++--- mlperf_logging/rcp_checker/rcp_checker.py | 4 ++-- mlperf_logging/repo_checker/repo_checker.py | 4 ++-- mlperf_logging/result_summarizer/config.yaml | 9 +++++++++ scripts/verify_for_v6.1_training.sh | 18 ++++++++++++++++++ 6 files changed, 36 insertions(+), 7 deletions(-) create mode 100755 scripts/verify_for_v6.1_training.sh diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py b/mlperf_logging/compliance_checker/mlp_parser/__init__.py index 0f4d989..0d1f4ce 100644 --- a/mlperf_logging/compliance_checker/mlp_parser/__init__.py +++ b/mlperf_logging/compliance_checker/mlp_parser/__init__.py @@ -39,5 +39,7 @@ def parse_file(filename, ruleset='0.6.0'): return parse_file_510(filename) elif ruleset == '6.0.0': return parse_file_600(filename) + elif ruleset == '6.1.0': + return parse_file_600(filename) else: raise Exception(f'Ruleset "{ruleset}" is not supported') diff --git a/mlperf_logging/package_checker/package_checker.py b/mlperf_logging/package_checker/package_checker.py index 45eecb8..ceb7d21 100644 --- a/mlperf_logging/package_checker/package_checker.py +++ b/mlperf_logging/package_checker/package_checker.py @@ -191,14 +191,14 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror, logging.error(" %d files do not comply, directory cannot be accepted", len(error_list)) # Check if each run use unique seeds. 
- if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'} and division == 'closed': + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0', '6.1.0'} and division == 'closed': seed_checker_bypass = (global_seed_checker_bypass or system_seed_checker_bypass or result_seed_checker_bypass) if not seed_checker.check_seeds(result_files, seed_checker_bypass): too_many_errors = True logging.error('Seed checker failed') # Run RCP checker for >= 1.0.0 - if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'} and division == 'closed' and benchmark != 'minigo': + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0', '6.1.0'} and division == 'closed' and benchmark != 'minigo': # Now go again through result files to do RCP checks rcp_bypass = (global_rcp_bypass or system_rcp_bypass or result_rcp_bypass) rcp_pass, rcp_msg, _ = rcp_checker.check_directory( @@ -252,7 +252,7 @@ def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rc ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, etc. 
""" too_many_errors = False - if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'}: + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0', '6.1.0'}: logging.info(' Checking System Description Files') system_description_pass = check_systems(folder, usage, ruleset) too_many_errors = too_many_errors or not system_description_pass diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 038ca07..5568d20 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -193,8 +193,8 @@ def get_submission_epochs(result_files, ruleset, bert_train_samples): class RCP_Checker: def __init__(self, usage, ruleset, benchmark, verbose, rcp_file=None): - if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0", "6.0.0"}: - raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0 and 6.0.0') + if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0", "6.0.0", "6.1.0"}: + raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0, 6.0.0 and 6.1.0') self.usage = usage self.ruleset = ruleset self.benchmark = benchmark diff --git a/mlperf_logging/repo_checker/repo_checker.py b/mlperf_logging/repo_checker/repo_checker.py index 140bff9..0a1082f 100644 --- a/mlperf_logging/repo_checker/repo_checker.py +++ b/mlperf_logging/repo_checker/repo_checker.py @@ -127,8 +127,8 @@ def get_parser(): parser.add_argument( 'ruleset', type=str, - choices=['2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'], - help='the ruleset. 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0 and 6.0.0 are currently supported.' 
+ choices=['2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0', '6.1.0'], + help='the ruleset. 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0, 6.0.0 and 6.1.0 are currently supported.' ) parser.add_argument( '--log_output', diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 79586bc..3e7b93a 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -111,6 +111,15 @@ columns: gpt_oss_20b: ["Benchmark results (minutes)", "LLM", "C4", "GPT-OSS-20B"] deepseekv3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeekV3-671B"] default: [" ", " ", " "] + "6.1.0": + dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"] + flux1: ["Benchmark results (minutes)", "Text to image", "CC12M and Coco-2014 for eval", "Flux1"] + llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] + llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"] + llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] + gpt_oss_20b: ["Benchmark results (minutes)", "LLM", "C4", "GPT-OSS-20B"] + deepseekv3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeekV3-671B"] + default: [" ", " ", " "] hpc: "2.0.0": diff --git a/scripts/verify_for_v6.1_training.sh b/scripts/verify_for_v6.1_training.sh new file mode 100755 index 0000000..faa266b --- /dev/null +++ b/scripts/verify_for_v6.1_training.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e + +# rcp_bypass and rcp_bert_train_samples package checker params +# need to be retrieved at package_checker_params file at top-level submission dir. 
+PACKAGE_CHECKER_PARAMS="" +PACKAGE_CHECKER_PARAMS_FILE="$1/package_checker_params" +if test -f "$PACKAGE_CHECKER_PARAMS_FILE"; then + while IFS= read -r line + do + PACKAGE_CHECKER_PARAMS="$PACKAGE_CHECKER_PARAMS --$line" + done < "$PACKAGE_CHECKER_PARAMS_FILE" +fi + +python3 -m mlperf_logging.package_checker $1 training 6.1.0 $PACKAGE_CHECKER_PARAMS +python3 -m mlperf_logging.result_summarizer $1 training 6.1.0 +python3 -m mlperf_logging.repo_checker $1 training 6.1.0 From 9e92cab875ae2770abef77b78aecd1b446bc4535 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 30 Jun 2026 09:05:12 +0200 Subject: [PATCH 5/7] Add ruleset_610.py parser for ruleset 6.1.0 --- .../compliance_checker/mlp_parser/__init__.py | 3 +- .../mlp_parser/ruleset_610.py | 105 ++++++++++++++++++ 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 mlperf_logging/compliance_checker/mlp_parser/ruleset_610.py diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py b/mlperf_logging/compliance_checker/mlp_parser/__init__.py index 0d1f4ce..9e40815 100644 --- a/mlperf_logging/compliance_checker/mlp_parser/__init__.py +++ b/mlperf_logging/compliance_checker/mlp_parser/__init__.py @@ -11,6 +11,7 @@ from .ruleset_500 import parse_file as parse_file_500 from .ruleset_510 import parse_file as parse_file_510 from .ruleset_600 import parse_file as parse_file_600 +from .ruleset_610 import parse_file as parse_file_610 def parse_file(filename, ruleset='0.6.0'): if ruleset == '0.6.0': @@ -40,6 +41,6 @@ def parse_file(filename, ruleset='0.6.0'): elif ruleset == '6.0.0': return parse_file_600(filename) elif ruleset == '6.1.0': - return parse_file_600(filename) + return parse_file_610(filename) else: raise Exception(f'Ruleset "{ruleset}" is not supported') diff --git a/mlperf_logging/compliance_checker/mlp_parser/ruleset_610.py b/mlperf_logging/compliance_checker/mlp_parser/ruleset_610.py new file mode 100644 index 0000000..e30b08d --- /dev/null +++ 
b/mlperf_logging/compliance_checker/mlp_parser/ruleset_610.py @@ -0,0 +1,105 @@ +''' +Parses a text MLPerf log into a structured format. +''' + +from __future__ import print_function + +import collections +import json +import re +import sys +from dataclasses import dataclass + +from io import open + +@dataclass +class LogLine: + """A parsed MLLOG entry: raw string, timestamp, key, value/metadata payload and line number.""" + full_string: str + timestamp: float + key: str + value: str + lineno: int + +TOKEN = ':::MLLOG ' + + +def parse_line(line): + if not line.startswith(TOKEN): + return None + + return json.loads(line[len(TOKEN):]) + + +def string_to_logline(lineno, string): + ''' Returns a LogLine or raises a ValueError ''' + m = parse_line(string) + + if m is None: + raise ValueError('does not match regex') + + args = [] + args.append(string) # full string + + ts = float(m['time_ms']) # may raise error, e.g. "1.2.3" + # TODO check for weird values + args.append(ts) + + args.append(m['key']) # key + + j = { 'value': m['value'], 'metadata': m['metadata'] } + args.append(j) + + args.append(lineno) + return LogLine(*args) + + +def parse_file(filename): + ''' Reads a file by name and returns list of loglines and list of errors''' + with open(filename, encoding='latin-1') as f: + return parse_generator(f) + + +def strip_and_dedup(gen): + lines = [] + for l in gen: + if TOKEN not in l: + continue + lines.append(re.sub(".*"+TOKEN, TOKEN, l)) + return lines + + + +def parse_generator(gen): + ''' Reads a generator of lines and returns (loglines, errors) + The list of errors are any parsing issues as a tuple (str_line, error_msg) + ''' + loglines = [] + failed = [] + for lineno, line in enumerate(strip_and_dedup(gen)): + line = line.strip() + try: + ll = string_to_logline(lineno, line) + loglines.append(ll) + except ValueError as e: + failed.append((line, str(e))) + return loglines, failed + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print('usage: mlp_parser.py FILENAME') + print(' tests parsing on
the file.') + sys.exit(1) + + filename = sys.argv[1] + lines, errors = parse_file(filename) + + print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors))) + + if len(errors) > 0: + print('Lines which failed to parse:') + for line, error in errors: + print(' Following line failed: {}'.format(error)) + print(line) + From c093f4b032d27c0a9cb325cece0b6e573c0fb712 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 30 Jun 2026 09:06:49 +0200 Subject: [PATCH 6/7] Remove llama31_405b and dlrm_dcnv2 from 6.1.0 Drop config and RCP files for both benchmarks, remove them from the closed/open common benchmark allowlists, and fix the common.yaml POST actions to reference training_6.1.0 instead of training_6.0.0. --- .../training_6.1.0/closed_common.yaml | 4 +- .../training_6.1.0/closed_dlrm_dcnv2.yaml | 59 ------- .../training_6.1.0/closed_llama31_405b.yaml | 88 ---------- .../training_6.1.0/open_common.yaml | 4 +- .../training_6.1.0/open_dlrm_dcnv2.yaml | 7 - .../training_6.1.0/open_llama31_405b.yaml | 78 --------- .../training_6.1.0/rcps_dlrm_dcnv2.json | 162 ------------------ .../training_6.1.0/rcps_llama31_405b.json | 60 ------- mlperf_logging/result_summarizer/config.yaml | 2 - 9 files changed, 4 insertions(+), 460 deletions(-) delete mode 100644 mlperf_logging/compliance_checker/training_6.1.0/closed_dlrm_dcnv2.yaml delete mode 100644 mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_405b.yaml delete mode 100644 mlperf_logging/compliance_checker/training_6.1.0/open_dlrm_dcnv2.yaml delete mode 100644 mlperf_logging/compliance_checker/training_6.1.0/open_llama31_405b.yaml delete mode 100644 mlperf_logging/rcp_checker/training_6.1.0/rcps_dlrm_dcnv2.json delete mode 100644 mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_405b.json diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml index 5211757..04f00f3 100755 --- 
a/mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml @@ -2,8 +2,8 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'gpt_oss_20b', 'deepseekv3_671b'] " - POST: " enqueue_config('training_6.0.0/closed_{}.yaml'.format(v['value'])) " + CHECK: " v['value'] in ['flux1', 'llama31_8b', 'llama2_70b_lora', 'gpt_oss_20b', 'deepseekv3_671b'] " + POST: " enqueue_config('training_6.1.0/closed_{}.yaml'.format(v['value'])) " - KEY: NAME: gradient_accumulation_steps diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_dlrm_dcnv2.yaml deleted file mode 100644 index 45344bd..0000000 --- a/mlperf_logging/compliance_checker/training_6.1.0/closed_dlrm_dcnv2.yaml +++ /dev/null @@ -1,59 +0,0 @@ -- KEY: - NAME: global_batch_size - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_name - REQ: EXACTLY_ONE - CHECK: " v['value'] == 'adagrad' " - -- KEY: - NAME: opt_base_learning_rate - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_adagrad_learning_rate_decay - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0 " - -- KEY: - NAME: opt_weight_decay - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0 " - -- KEY: - NAME: opt_adagrad_initial_accumulator_value - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0 " - -- KEY: - NAME: opt_adagrad_epsilon - REQ: EXACTLY_ONE - CHECK: " v['value'] == 1e-8 " - -- KEY: - NAME: opt_learning_rate_warmup_steps - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0 " - -- KEY: - NAME: opt_learning_rate_decay_start_step - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0 " - -- KEY: - NAME: opt_learning_rate_decay_steps - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0 " - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "v['value'] >= 0.80275 and v['value'] <= 1.0" - -- 
KEY: - NAME: eval_samples - REQ: EXACTLY_ONE - CHECK: " v['value'] == 89137319 " diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_405b.yaml deleted file mode 100644 index 90e2d45..0000000 --- a/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_405b.yaml +++ /dev/null @@ -1,88 +0,0 @@ -- KEY: - NAME: global_batch_size - REQ: EXACTLY_ONE - POST: > - s['global_batch_size'] = v['value'] - -- KEY: - NAME: max_sequence_length - REQ: EXACTLY_ONE - CHECK: " v['value'] == 8192 " - -- KEY: - NAME: opt_name - REQ: EXACTLY_ONE - CHECK: " v['value'] == 'adamw' " - -- KEY: - NAME: opt_base_learning_rate - REQ: EXACTLY_ONE - CHECK: " v['value'] * 1152 == s['global_batch_size'] * 8e-5 " - -- KEY: - NAME: opt_end_learning_rate - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_warmup_steps - REQ: EXACTLY_ONE - POST: > - s['opt_learning_rate_warmup_steps'] = math.ceil(8000 * 1152 / s['global_batch_size'] ) - -- KEY: - NAME: opt_learning_rate_decay_steps - REQ: EXACTLY_ONE - CHECK: " v['value'] == math.ceil(1_200_000 * 1152 / s['global_batch_size'] ) - s['opt_learning_rate_warmup_steps'] " - -- KEY: - NAME: opt_learning_rate_decay_schedule - REQ: EXACTLY_ONE - CHECK: " v['value'] == 'cosine with linear warmup' " - -- KEY: - NAME: opt_adamw_beta_1 - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0.9 " - -- KEY: - NAME: opt_adamw_beta_2 - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0.95 " - -- KEY: - NAME: opt_adamw_epsilon - REQ: EXACTLY_ONE - CHECK: " v['value'] == 1e-05 " - -- KEY: - NAME: opt_adamw_weight_decay - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0.1 " - -- KEY: - NAME: opt_gradient_clip_norm - REQ: EXACTLY_ONE - CHECK: " v['value'] == 1.0 " - -- KEY: - NAME: gradient_accumulation_steps - REQ: EXACTLY_ONE - CHECK: " v['value'] > 0 " - -- KEY: - NAME: eval_samples - REQ: EXACTLY_ONE - CHECK: " v['value'] == 5760 " - -- KEY: - NAME: eval_accuracy - REQ: 
AT_LEAST_ONE - CHECK: - - "'samples_count' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0" - -- KEY: - NAME: init_checkpoint_step - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0 " - diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml index bd0cde1..4b7761e 100644 --- a/mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml @@ -2,5 +2,5 @@ - KEY: NAME: submission_benchmark REQ: EXACTLY_ONE - CHECK: " v['value'] in ['flux1', 'dlrm_dcnv2', 'llama31_8b', 'llama2_70b_lora', 'llama31_405b', 'gpt_oss_20b', 'deepseekv3_671b'] " - POST: " enqueue_config('training_6.0.0/open_{}.yaml'.format(v['value'])) " + CHECK: " v['value'] in ['flux1', 'llama31_8b', 'llama2_70b_lora', 'gpt_oss_20b', 'deepseekv3_671b'] " + POST: " enqueue_config('training_6.1.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_dlrm_dcnv2.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_dlrm_dcnv2.yaml deleted file mode 100644 index 7f70c0c..0000000 --- a/mlperf_logging/compliance_checker/training_6.1.0/open_dlrm_dcnv2.yaml +++ /dev/null @@ -1,7 +0,0 @@ - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'epoch_num' in v['metadata']" - ATLEAST_ONE_CHECK: "v['value'] <= 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_405b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_405b.yaml deleted file mode 100644 index 63016f4..0000000 --- a/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_405b.yaml +++ /dev/null @@ -1,78 +0,0 @@ -- KEY: - NAME: global_batch_size - REQ: EXACTLY_ONE - POST: > - s['global_batch_size'] = v['value'] - -- KEY: - NAME: max_sequence_length - REQ: EXACTLY_ONE - CHECK: " v['value'] == 8192 " - -- KEY: - NAME: opt_name - REQ: EXACTLY_ONE - 
CHECK: " v['value'] == 'adamw' " - -- KEY: - NAME: opt_base_learning_rate - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_end_learning_rate - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_decay_steps - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_warmup_steps - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_learning_rate_decay_schedule - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_adamw_beta_1 - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_adamw_beta_2 - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_adamw_epsilon - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_adamw_weight_decay - REQ: EXACTLY_ONE - -- KEY: - NAME: opt_gradient_clip_norm - REQ: EXACTLY_ONE - -- KEY: - NAME: gradient_accumulation_steps - REQ: EXACTLY_ONE - CHECK: " v['value'] > 0 " - -- KEY: - NAME: eval_samples - REQ: EXACTLY_ONE - CHECK: " v['value'] == 5760 " - -- KEY: - NAME: eval_accuracy - REQ: AT_LEAST_ONE - CHECK: - - "'samples_count' in v['metadata']" - ATLEAST_ONE_CHECK: "(v['value'] <= 5.6) and v['value'] > 0.0" - -- KEY: - NAME: init_checkpoint_step - REQ: EXACTLY_ONE - CHECK: " v['value'] == 0 " - diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_dlrm_dcnv2.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_dlrm_dcnv2.json deleted file mode 100644 index 3a71eff..0000000 --- a/mlperf_logging/rcp_checker/training_6.1.0/rcps_dlrm_dcnv2.json +++ /dev/null @@ -1,162 +0,0 @@ -{ - - "dlrm_dcnv2_ref_32768": { - "Benchmark": "dlrm_dcnv2", - "Creator": "NVIDIA", - "When": "Prior to 3.0 submission", - "Platform": "DGX-A100", - "Precision": "FP32", - "BS": 32768, - "Hyperparams": { - "opt_name": "adagrad", - "opt_base_learning_rate": 0.004, - "opt_adagrad_learning_rate_decay": 0.0, - "opt_adagrad_initial_accumulator_value": 0.0, - "opt_adagrad_epsilon": 1e-08, - "opt_weight_decay": 0.0, - "opt_learning_rate_warmup_steps": 0, - "opt_learning_rate_decay_start_step": 0, - "opt_learning_rate_decay_steps": 0 - }, - "Epochs to converge": [ - 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, - 0.75, 0.7, 0.7, 0.7, 
0.75, 0.75, 0.75, 0.7, 0.7, 0.7, - 0.7, 0.7, 0.75, 0.7, 0.65, 0.7, 0.7, 0.7, 0.7, 0.7 - ] - }, - - "dlrm_dcnv2_ref_55296": { - "Benchmark": "dlrm_dcnv2", - "Creator": "NVIDIA", - "When": "At 3.0 submission", - "Platform": "DGX-A100", - "Precision": "FP32", - "BS": 55296, - "Hyperparams": { - "opt_name": "adagrad", - "opt_base_learning_rate": 0.004, - "opt_adagrad_learning_rate_decay": 0.0, - "opt_adagrad_initial_accumulator_value": 0.0, - "opt_adagrad_epsilon": 1e-08, - "opt_weight_decay": 0.0, - "opt_learning_rate_warmup_steps": 0, - "opt_learning_rate_decay_start_step": 0, - "opt_learning_rate_decay_steps": 0 - }, - "Epochs to converge": [ - 0.75, 0.75, 0.75, 0.7, 0.8, 0.75, 0.75, 0.75, 0.75, 0.75, - 0.9, 0.7, 0.75, 0.8, 0.7, 0.8, 0.7, 0.7, 0.75, 0.7, - 0.7, 0.9, 0.75, 0.7, 0.8, 0.75, 0.75, 0.8, 0.75, 0.8, - 0.9, 0.75, 0.8, 0.75, 0.8, 0.75, 0.75, 0.75, 0.7, 0.75, - 0.75, 0.8, 0.75, 0.8, 0.8, 0.9, 0.75, 0.75, 0.7, 0.75 - ] - }, - - "dlrm_dcnv2_ref_65536": { - "Benchmark": "dlrm_dcnv2", - "Creator": "NVIDIA", - "When": "Prior to 3.0 submission", - "Platform": "DGX-A100", - "Precision": "FP32", - "BS": 65536, - "Hyperparams": { - "opt_name": "adagrad", - "opt_base_learning_rate": 0.004, - "opt_adagrad_learning_rate_decay": 0.0, - "opt_adagrad_initial_accumulator_value": 0.0, - "opt_adagrad_epsilon": 1e-08, - "opt_weight_decay": 0.0, - "opt_learning_rate_warmup_steps": 0, - "opt_learning_rate_decay_start_step": 0, - "opt_learning_rate_decay_steps": 0 - }, - "Epochs to converge": [ - 0.75, 0.8, 0.75, 0.75, 0.8, 0.75, 0.8, 0.9, 0.95, 0.75, - 0.75, 0.75, 0.85, 0.85, 0.7, 0.75, 0.75, 0.9, 0.85, 0.8, - 0.7, 0.75, 0.75, 0.75, 0.8, 0.9, 0.75, 0.8, 0.85, 0.8 - ] - }, - - "dlrm_dcnv2_ref_102400": { - "Benchmark": "dlrm_dcnv2", - "Creator": "NVIDIA", - "When": "Prior to 3.0 submission", - "Platform": "DGX-A100", - "Precision": "FP32", - "BS": 102400, - "Hyperparams": { - "opt_name": "adagrad", - "opt_base_learning_rate": 0.004, - "opt_adagrad_learning_rate_decay": 0.0, - 
"opt_adagrad_initial_accumulator_value": 0.0, - "opt_adagrad_epsilon": 1e-08, - "opt_weight_decay": 0.0, - "opt_learning_rate_warmup_steps": 0, - "opt_learning_rate_decay_start_step": 0, - "opt_learning_rate_decay_steps": 0 - }, - "Epochs to converge": [ - 0.85, 0.95, 0.95, 0.85, 0.9, 0.8, 0.85, 0.9, 0.9, 0.9, - 0.95, 0.9, 0.9, 0.9, 0.9, 0.9, 0.85, 0.85, 0.9, 0.9, - 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.85, 0.9, 0.9, - 0.9, 0.95, 0.85, 0.9, 0.9, 0.9, 0.85, 0.9, 0.95, 0.9, - 0.85, 0.95, 0.9, 0.9, 0.8, 0.9, 0.9, 0.9, 0.85, 0.9 - ] - }, - - "dlrm_dcnv2_ref_135168": { - "Benchmark": "dlrm_dcnv2", - "Creator": "NVIDIA", - "When": "At 3.0 submission", - "Platform": "DGX-A100", - "Precision": "FP32", - "BS": 135168, - "Hyperparams": { - "opt_name": "adagrad", - "opt_base_learning_rate": 0.0034, - "opt_adagrad_learning_rate_decay": 0.0, - "opt_adagrad_initial_accumulator_value": 0.0, - "opt_adagrad_epsilon": 1e-08, - "opt_weight_decay": 0.0, - "opt_learning_rate_warmup_steps": 0, - "opt_learning_rate_decay_start_step": 0, - "opt_learning_rate_decay_steps": 0 - }, - "Epochs to converge": [ - 0.95, 0.9, 0.9, 0.9, 0.9, 0.95, 0.9, 0.95, 0.95, 0.9, - 0.95, 0.95, 0.95, 1.0, 0.85, 0.9, 0.9, 0.95, 0.95, 0.95, - 0.95, 0.9, 0.9, 0.9, 0.95, 0.95, 1.0, 0.9, 0.95, 0.95, - 0.85, 0.95, 0.95, 0.95, 0.9, 0.95, 0.9, 0.9, 1.0, 0.9, - 0.95, 0.9, 0.95, 0.95, 0.95, 0.95, 0.95, 0.9, 0.9, 0.9, - 0.9, 0.9, 0.9, 0.9, 0.95, 0.85, 0.95, 0.95, 0.9, 0.95, - 0.95, 0.95, 0.95, 1.0, 0.9, 0.95, 0.9, 1.0, 0.85, 0.9, - 0.9, 0.95, 0.95, 0.9, 0.95, 0.9, 0.95, 0.85, 0.95, 0.95, - 0.95, 0.9, 0.9, 0.95, 0.9, 0.95, 0.9, 1.0 - ] - }, - - "dlrm_dcnv2_ref_160000": { - "Benchmark": "dlrm_dcnv2", - "Creator": "Cisco", - "When": "At 5.1 submission", - "Platform": "DGX-H100", - "Precision": "FP32", - "BS": 160000, - "Hyperparams": { - "opt_name": "adagrad", - "opt_base_learning_rate": 0.004, - "opt_adagrad_learning_rate_decay": 0.0, - "opt_adagrad_initial_accumulator_value": 0.0, - "opt_adagrad_epsilon": 1e-08, - 
"opt_weight_decay": 0.0, - "opt_learning_rate_warmup_steps": 0, - "opt_learning_rate_decay_start_step": 0, - "opt_learning_rate_decay_steps": 0 - }, - "Epochs to converge": [ - 0.95, 0.95, 1, 1, 0.95, 1, 1, 0.95, 0.95, 0.90, - 0.90, 1, 0.90, 0.95, 0.90, 1, 0.95, 0.95, 0.95, 1 - ] - } - -} diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_405b.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_405b.json deleted file mode 100644 index d1a7620..0000000 --- a/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_405b.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "llama31_405b_ref_1152": - { - "Benchmark": "llama31_405b", - "Creator": "NVIDIA", - "When": "Reference RCPs after 5.0 submission", - "Platform": "288xDGX-H100", - "Precision": "BF16", - "BS": 1152, - "Hyperparams": { - "opt_base_learning_rate": 8e-05, - "opt_learning_rate_warmup_steps": 8000, - "gradient_accumulation_steps": 144 - }, - "Epochs to converge": [ - 313344,313344,313344, - 331776,313344,294912 - ] - }, - - "llama31_405b_ref_2304": - { - "Benchmark": "llama31_405b", - "Creator": "NVIDIA", - "When": "Reference RCPs after 5.0 submission", - "Platform": "288xDGX-H100", - "Precision": "BF16", - "BS": 2304, - "Hyperparams": { - "opt_base_learning_rate": 16e-05, - "opt_learning_rate_warmup_steps": 4000, - "gradient_accumulation_steps": 288 - }, - "Epochs to converge": [ - 368640,350208,387072, - 368640,368640,368640 - ] - }, - - "llama31_405b_ref_4608": - { - "Benchmark": "llama31_405b", - "Creator": "NVIDIA", - "When": "Reference RCPs after 5.0 submission", - "Platform": "288xDGX-H100", - "Precision": "BF16", - "BS": 4608, - "Hyperparams": { - "opt_base_learning_rate": 32e-05, - "opt_learning_rate_warmup_steps": 2000, - "gradient_accumulation_steps": 576 - }, - "Epochs to converge": [ - 497664,497664,460800, - 497664,479232,497664 - ] - } - } - - diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 3e7b93a..6b8779f 100644 
--- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -112,11 +112,9 @@ columns: deepseekv3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeekV3-671B"] default: [" ", " ", " "] "6.1.0": - dlrm_dcnv2: ["Benchmark results (minutes)", "Recommendation", "1TB Multihot Clickthrough", "DLRM DCNv2"] flux1: ["Benchmark results (minutes)", "Text to image", "CC12M and Coco-2014 for eval", "Flux1"] llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"] - llama31_405b: ["Benchmark results (minutes)", "LLM", "C4", "Llama31-405B"] gpt_oss_20b: ["Benchmark results (minutes)", "LLM", "C4", "GPT-OSS-20B"] deepseekv3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeekV3-671B"] default: [" ", " ", " "] From 36304a3e52ee78b87e8e9e69864008aea1c86799 Mon Sep 17 00:00:00 2001 From: Denys Fridman Date: Tue, 30 Jun 2026 09:30:49 +0200 Subject: [PATCH 7/7] Fix common.yaml in 6.1.0 to reference training_6.1.0 configs --- mlperf_logging/compliance_checker/training_6.1.0/common.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlperf_logging/compliance_checker/training_6.1.0/common.yaml b/mlperf_logging/compliance_checker/training_6.1.0/common.yaml index 7526c47..5be1088 100755 --- a/mlperf_logging/compliance_checker/training_6.1.0/common.yaml +++ b/mlperf_logging/compliance_checker/training_6.1.0/common.yaml @@ -42,7 +42,7 @@ NAME: submission_division REQ: EXACTLY_ONE CHECK: " v['value'] in ['closed', 'open'] " - POST: " enqueue_config('training_6.0.0/{}_common.yaml'.format(v['value'])); s['compile_time_mins'] = 240 if v['value'] == 'open' else 30 " + POST: " enqueue_config('training_6.1.0/{}_common.yaml'.format(v['value'])); s['compile_time_mins'] = 240 if v['value'] == 'open' else 30 " # at least one record should be found, but any found records must pass the test - KEY: