diff --git a/mlperf_logging/compliance_checker/mlp_compliance.py b/mlperf_logging/compliance_checker/mlp_compliance.py index e7ee391c..3d2b30f7 100644 --- a/mlperf_logging/compliance_checker/mlp_compliance.py +++ b/mlperf_logging/compliance_checker/mlp_compliance.py @@ -184,6 +184,7 @@ def configured_checks(self, loglines, config_file): at_least_one_checks = {} # executing the rules through log records has_been_exec = set([]) + first_check_seen = set([]) for line in loglines: key_record = None try: @@ -196,6 +197,9 @@ def configured_checks(self, loglines, config_file): if 'PRE' in key_record: self.run_check_exec(line, key_record['PRE'], state, 'PRE') if 'CHECK' in key_record: self.run_check_eval(line, key_record['CHECK'], state) if 'POST' in key_record: self.run_check_exec(line, key_record['POST'], state, 'POST') + if 'FIRST_CHECK' in key_record and line.key not in first_check_seen: + first_check_seen.add(line.key) + self.run_check_eval(line, key_record['FIRST_CHECK'], state) if 'ATLEAST_ONE_CHECK' in key_record: if line.key not in at_least_one_checks: at_least_one_checks[line.key] = [0, key_record['ATLEAST_ONE_CHECK']] diff --git a/mlperf_logging/compliance_checker/mlp_parser/__init__.py b/mlperf_logging/compliance_checker/mlp_parser/__init__.py index 0f4d989a..9e408159 100644 --- a/mlperf_logging/compliance_checker/mlp_parser/__init__.py +++ b/mlperf_logging/compliance_checker/mlp_parser/__init__.py @@ -11,6 +11,7 @@ from .ruleset_500 import parse_file as parse_file_500 from .ruleset_510 import parse_file as parse_file_510 from .ruleset_600 import parse_file as parse_file_600 +from .ruleset_610 import parse_file as parse_file_610 def parse_file(filename, ruleset='0.6.0'): if ruleset == '0.6.0': @@ -39,5 +40,7 @@ def parse_file(filename, ruleset='0.6.0'): return parse_file_510(filename) elif ruleset == '6.0.0': return parse_file_600(filename) + elif ruleset == '6.1.0': + return parse_file_610(filename) else: raise Exception(f'Ruleset "{ruleset}" is not supported') 
diff --git a/mlperf_logging/compliance_checker/mlp_parser/ruleset_610.py b/mlperf_logging/compliance_checker/mlp_parser/ruleset_610.py new file mode 100644 index 00000000..e30b08d2 --- /dev/null +++ b/mlperf_logging/compliance_checker/mlp_parser/ruleset_610.py @@ -0,0 +1,105 @@ +''' +Parses a text MLPerf log into a structured format. +''' + +from __future__ import print_function + +import collections +import json +import re +import sys +from dataclasses import dataclass + +from io import open + +@dataclass +class LogLine: + """One parsed MLPerf log entry: raw text, timestamp, key, value/metadata payload and line number.""" + full_string: str + timestamp: float + key: str + value: str + lineno: int + +TOKEN = ':::MLLOG ' + + +def parse_line(line): + if not line.startswith(TOKEN): + return None + + return json.loads(line[len(TOKEN):]) + + +def string_to_logline(lineno, string): + ''' Returns a LogLine or raises a ValueError ''' + m = parse_line(string) + + if m is None: + raise ValueError('does not match regex') + + args = [] + args.append(string) # full string + + ts = float(m['time_ms']) # may raise error, e.g. 
"1.2.3" + # TODO check for weird values + args.append(ts) + + args.append(m['key']) # key + + j = { 'value': m['value'], 'metadata': m['metadata'] } + args.append(j) + + args.append(lineno) + return LogLine(*args) + + +def parse_file(filename): + ''' Reads a file by name and returns list of loglines and list of errors''' + with open(filename, encoding='latin-1') as f: + return parse_generator(f) + + +def strip_and_dedup(gen): + lines = [] + for l in gen: + if TOKEN not in l: + continue + lines.append(re.sub(".*"+TOKEN, TOKEN, l)) + return lines + + + +def parse_generator(gen): + ''' Reads a generator of lines and returns (loglines, errors) + The list of errors are any parsing issues as a tuple (str_line, error_msg) + ''' + loglines = [] + failed = [] + for lineno, line in enumerate(strip_and_dedup(gen)): + line = line.strip() + try: + ll = string_to_logline(lineno, line) + loglines.append(ll) + except ValueError as e: + failed.append((line, str(e))) + return loglines, failed + + +if __name__ == '__main__': + if len(sys.argv) != 2: + print('usage: mlp_parser.py FILENAME') + print(' tests parsing on the file.') + sys.exit(1) + + filename = sys.argv[1] + lines, errors = parse_file(filename) + + print('Parsed {} log lines with {} errors.'.format(len(lines), len(errors))) + + if len(errors) > 0: + print('Lines which failed to parse:') + for line, error in errors: + print(' Following line failed: {}'.format(error)) + print(line) + diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml new file mode 100755 index 00000000..04f00f3b --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_common.yaml @@ -0,0 +1,11 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['flux1', 'llama31_8b', 'llama2_70b_lora', 'gpt_oss_20b', 'deepseekv3_671b'] " + POST: " enqueue_config('training_6.1.0/closed_{}.yaml'.format(v['value'])) " + +- 
KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_deepseekv3_671b.yaml new file mode 100644 index 00000000..a81749c0 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_deepseekv3_671b.yaml @@ -0,0 +1,90 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 15360 " + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 4096 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " abs(v['value'] - 2.4e-05 * (s['global_batch_size'] / 16384) ** 0.5) < 1e-9 " + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 4 " + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 12000 - s['opt_learning_rate_warmup_steps'] " + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-08 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: moe_aux_loss_coeff + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.01 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: 
EXACTLY_ONE + CHECK: " v['value'] == 1024 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + FIRST_CHECK: " int(v['metadata']['samples_count']) == s['global_batch_size'] * math.floor(42 + 24576 / s['global_batch_size']) " + ATLEAST_ONE_CHECK: "(v['value'] <= 3.6) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_flux1.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_flux1.yaml new file mode 100644 index 00000000..49f60bdb --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_flux1.yaml @@ -0,0 +1,56 @@ +- KEY: + NAME: global_batch_size + REQ: AT_LEAST_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: evaluation_frequency + REQ: EXACTLY_ONE + CHECK: " v['value'] == 262144" + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-08 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0.0 " + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 0 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_gpt_oss_20b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_gpt_oss_20b.yaml new file mode 100644 index 00000000..25ce7606 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_gpt_oss_20b.yaml @@ -0,0 +1,86 
@@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1_200_000 - s['opt_learning_rate_warmup_steps'] " + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-05 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1024 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.34) and v['value'] > 0.0" + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1200000 " \ No newline at end of file diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama2_70b_lora.yaml new file mode 100755 index 00000000..46de03ef --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama2_70b_lora.yaml @@ -0,0 
+1,42 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + + +- KEY: + NAME: opt_learning_rate_training_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: lora_alpha + REQ: EXACTLY_ONE + +- KEY: + NAME: lora_rank + REQ: EXACTLY_ONE + CHECK: " v['value'] == 16" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 0.925) and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_8b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_8b.yaml new file mode 100644 index 00000000..d12bf9c8 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/closed_llama31_8b.yaml @@ -0,0 +1,87 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + POST: > + s['opt_learning_rate_warmup_steps'] = v['value'] + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1_200_000 - s['opt_learning_rate_warmup_steps'] " + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'cosine with linear warmup' " + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.9 " + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.95 " + +- KEY: + 
NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1e-05 " + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + CHECK: " v['value'] == 0.1 " + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1.0 " + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1024 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.3) and v['value'] > 0.0" + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1200000 " + diff --git a/mlperf_logging/compliance_checker/training_6.1.0/common.yaml b/mlperf_logging/compliance_checker/training_6.1.0/common.yaml new file mode 100755 index 00000000..5be10883 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/common.yaml @@ -0,0 +1,191 @@ +# This file lists all the KEYs to be checked. Every line that matches mlperf logging regex (::MLL...) will be checked against these rules. +# In the order of the appearance in the log, for each line will execute the code specified under CHECK for the KEY in that line. +# The code will be launched using local state 'v' which is the content of value field in log line, and global state 's'. +# Global state 's' exists to allow cross-line checks, like start/stop pairs etc. To initialize 's' use BEGIN record which CODE will +# be executed before any checks. +# In addition, occurrence of each key will be counted and at the end if a requirement regarding the number of occurrences is defined it will +# be confirmed. This could be implemented using global state, but since this is a common thing to do it is natively supported. 
+# +# KEY record: +# NAME +# REQ - optional - {EXACTLY_ONE, AT_LEAST_ONE, AT_LEAST_ONE_OR(other_key), OPTIONAL} +# PRE - optional - code to be executed before CHECK +# CHECK - optional - expression to be evaluated to verify correctness +# POST - optional - code to be executed after CHECK + +- BEGIN: + CODE: > + s.update({ + 'init_started': False, + 'init_stopped' : False, + 'run_started' : False, + 'run_stopped' : False, + 'in_epoch' : False, + 'last_epoch' : 0, + 'in_block' : False, + 'block_first_epoch' : -1, + 'first_init_start': 9e99, + 'compile_time_mins': 0, + }) + +- KEY: + NAME: submission_org + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_platform + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: submission_division + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['closed', 'open'] " + POST: " enqueue_config('training_6.1.0/{}_common.yaml'.format(v['value'])); s['compile_time_mins'] = 240 if v['value'] == 'open' else 30 " + +# at least one record should be found, but any found records must pass the test +- KEY: + NAME: cache_clear + REQ: AT_LEAST_ONE + CHECK: + - "'value' in v" + +# frequency not checked +- KEY: + NAME: init_start + REQ: AT_LEAST_ONE + CHECK: + - "not s['init_stopped']" + - "not s['run_started']" + POST: " s['init_started'] = True; s['first_init_start']=min(s['first_init_start'], ll.timestamp) " + +# confirm less than s['compile_time_mins'] minutes (30 for closed, 240 for open) since the very first init_start +- KEY: + NAME: init_stop + REQ: EXACTLY_ONE + CHECK: + - "s['init_started']" + - "not s['run_started']" + - "ll.timestamp - s['first_init_start'] < (s['compile_time_mins']*60*1e3)" + POST: " s['init_stopped'] = True" + +- KEY: + NAME: run_start + REQ: EXACTLY_ONE + CHECK: " ( s['init_stopped'] == True )" + POST: " s['run_started'] = True " + +# status can also be aborted, but not allowing it here for now +# if eval is inside epoch and we decide to terminate, we can lack epoch_stop, it is ok +- KEY: + NAME: run_stop + REQ: EXACTLY_ONE + CHECK: + - "s['run_started']" + - "'status' in 
v['metadata']" + POST: " s['run_stopped'] = True " + +# FIXME: check epoch_count value match +- KEY: + NAME: block_start + REQ: AT_LEAST_ONE_OR(epoch_start) + CHECK: + - "s['run_started']" + - "('epoch_count' in v['metadata']) | ('samples_count' in v['metadata'])" + - "'first_epoch_num' in v['metadata'] if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['epoch_count'] > 0 if 'epoch_count' in v['metadata'] else True" + - "v['metadata']['samples_count'] >= 0 if 'samples_count' in v['metadata'] else True" + +- KEY: + NAME: block_stop + REQ: AT_LEAST_ONE_OR(epoch_stop) + CHECK: + - "('first_epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: epoch_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: epoch_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +# NOTE: only metadata presence is checked here; this rule does not verify that the previous eval logged its accuracy +- KEY: + NAME: eval_start + REQ: AT_LEAST_ONE_OR(block_start) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: eval_stop + REQ: AT_LEAST_ONE_OR(block_stop) + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "('epoch_num' in v['metadata']) | ('samples_count' in v['metadata'])" + +- KEY: + NAME: train_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] != '' " + +# Optional keys +- KEY: + NAME: lowest_numerical_precision_in_linear + REQ: OPTIONAL + CHECK: " v['value'] in ['fp64', 'fp32', 'tf32', 'fp16', 'fp8', 'nvfp4', 'mxfp4', 'bfloat16', 'Graphcore FLOAT 16.16', 'int8', 'uint8', 'int4', 'uint4'] " + +- KEY: + NAME: lowest_numerical_precision_in_attn + REQ: OPTIONAL + CHECK: " v['value'] in ['fp64', 'fp32', 'tf32', 'fp16', 'fp8', 
'nvfp4', 'mxfp4', 'bfloat16', 'Graphcore FLOAT 16.16', 'int8', 'uint8', 'int4', 'uint4'] " + +- KEY: + NAME: lowest_numerical_precision_in_comm + REQ: OPTIONAL + CHECK: " v['value'] in ['fp64', 'fp32', 'tf32', 'fp16', 'fp8', 'nvfp4', 'mxfp4', 'bfloat16', 'Graphcore FLOAT 16.16', 'int8', 'uint8', 'int4', 'uint4'] " + +- KEY: + NAME: tensor_parallelism + REQ: OPTIONAL + CHECK: " is_integer(v['value']) " + +- KEY: + NAME: pipeline_parallelism + REQ: OPTIONAL + CHECK: " is_integer(v['value']) " + +- KEY: + NAME: context_parallelism + REQ: OPTIONAL + CHECK: " is_integer(v['value']) " + +- KEY: + NAME: expert_parallelism + REQ: OPTIONAL + CHECK: " is_integer(v['value']) " + +- KEY: + NAME: micro_batch_size + REQ: OPTIONAL + CHECK: " is_integer(v['value']) " + +- KEY: + NAME: config_filename + REQ: OPTIONAL + diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml new file mode 100644 index 00000000..4b7761e3 --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_common.yaml @@ -0,0 +1,6 @@ + +- KEY: + NAME: submission_benchmark + REQ: EXACTLY_ONE + CHECK: " v['value'] in ['flux1', 'llama31_8b', 'llama2_70b_lora', 'gpt_oss_20b', 'deepseekv3_671b'] " + POST: " enqueue_config('training_6.1.0/open_{}.yaml'.format(v['value'])) " diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_deepseekv3_671b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_deepseekv3_671b.yaml new file mode 100644 index 00000000..0ef940df --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_deepseekv3_671b.yaml @@ -0,0 +1,78 @@ +- KEY: + NAME: global_batch_size + REQ: EXACTLY_ONE + CHECK: " v['value'] >= 15360 " + POST: > + s['global_batch_size'] = v['value'] + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 4096 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: 
max_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: moe_aux_loss_coeff + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + CHECK: " v['value'] > 0 " + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + CHECK: " v['value'] == 1024 " + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.6) and v['value'] > 0.0" + diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_flux1.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_flux1.yaml new file mode 100644 index 00000000..19ee8dea --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_flux1.yaml @@ -0,0 +1,13 @@ +# Stable diffusion uses two metrics, FID and CLIP. +# These metrics can be calculated offline, using different scripts +# and logged separately. 
Therefore, we create a virtual key +# called aggregated_eval_accuracy, which aggregates +# both metrics into a single log line + +# TODO: Update with official metric name +- KEY: + NAME: averaged_validation_loss + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] <= 0.586 and v['value'] > 0.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_gpt_oss_20b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_gpt_oss_20b.yaml new file mode 100644 index 00000000..676d277e --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_gpt_oss_20b.yaml @@ -0,0 +1,68 @@ +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "(v['value'] <= 3.34) and v['value'] > 0.0" + +- KEY: + NAME: max_sequence_length + REQ: EXACTLY_ONE + CHECK: " v['value'] == 8192 " + +- KEY: + NAME: opt_name + REQ: EXACTLY_ONE + CHECK: " v['value'] == 'adamw' " + +- KEY: + NAME: opt_base_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_end_learning_rate + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_warmup_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_learning_rate_decay_schedule + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_1 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_beta_2 + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_epsilon + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_adamw_weight_decay + REQ: EXACTLY_ONE + +- KEY: + NAME: opt_gradient_clip_norm + REQ: EXACTLY_ONE + +- KEY: + NAME: gradient_accumulation_steps + REQ: EXACTLY_ONE + +- KEY: + NAME: eval_samples + REQ: EXACTLY_ONE + +- KEY: + NAME: max_steps + REQ: EXACTLY_ONE \ No newline at end of file diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_llama2_70b_lora.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_llama2_70b_lora.yaml new file mode 100755 index 00000000..784c008a --- 
/dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_llama2_70b_lora.yaml @@ -0,0 +1,7 @@ + +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_8b.yaml b/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_8b.yaml new file mode 100644 index 00000000..5df29d5c --- /dev/null +++ b/mlperf_logging/compliance_checker/training_6.1.0/open_llama31_8b.yaml @@ -0,0 +1,8 @@ + +# TODO: Update with official compliance requirements +- KEY: + NAME: eval_accuracy + REQ: AT_LEAST_ONE + CHECK: + - "'samples_count' in v['metadata']" + ATLEAST_ONE_CHECK: "v['value'] < 1.0" diff --git a/mlperf_logging/package_checker/package_checker.py b/mlperf_logging/package_checker/package_checker.py index 45eecb8a..ceb7d219 100644 --- a/mlperf_logging/package_checker/package_checker.py +++ b/mlperf_logging/package_checker/package_checker.py @@ -191,14 +191,14 @@ def check_training_result_files(folder, usage, ruleset, quiet, werror, logging.error(" %d files do not comply, directory cannot be accepted", len(error_list)) # Check if each run use unique seeds. 
- if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'} and division == 'closed': + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0', '6.1.0'} and division == 'closed': seed_checker_bypass = (global_seed_checker_bypass or system_seed_checker_bypass or result_seed_checker_bypass) if not seed_checker.check_seeds(result_files, seed_checker_bypass): too_many_errors = True logging.error('Seed checker failed') # Run RCP checker for >= 1.0.0 - if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'} and division == 'closed' and benchmark != 'minigo': + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0', '6.1.0'} and division == 'closed' and benchmark != 'minigo': # Now go again through result files to do RCP checks rcp_bypass = (global_rcp_bypass or system_rcp_bypass or result_rcp_bypass) rcp_pass, rcp_msg, _ = rcp_checker.check_directory( @@ -252,7 +252,7 @@ def check_training_package(folder, usage, ruleset, quiet, werror, rcp_bypass, rc ruleset: The ruleset such as 0.6.0, 0.7.0, 1.0.0, etc. 
""" too_many_errors = False - if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'}: + if ruleset in {'1.0.0', '1.1.0', '2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0', '6.1.0'}: logging.info(' Checking System Description Files') system_description_pass = check_systems(folder, usage, ruleset) too_many_errors = too_many_errors or not system_description_pass diff --git a/mlperf_logging/rcp_checker/rcp_checker.py b/mlperf_logging/rcp_checker/rcp_checker.py index 038ca07a..5568d20b 100644 --- a/mlperf_logging/rcp_checker/rcp_checker.py +++ b/mlperf_logging/rcp_checker/rcp_checker.py @@ -193,8 +193,8 @@ def get_submission_epochs(result_files, ruleset, bert_train_samples): class RCP_Checker: def __init__(self, usage, ruleset, benchmark, verbose, rcp_file=None): - if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0", "6.0.0"}: - raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0 and 6.0.0') + if ruleset not in {'1.0.0', "1.1.0", "2.0.0", "2.1.0", "3.0.0", "3.1.0", "4.0.0", "4.1.0", "5.0.0", "5.1.0", "6.0.0", "6.1.0"}: + raise Exception('RCP Checker only supported in 1.0.0, 1.1.0, 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0, 6.0.0 and 6.1.0') self.usage = usage self.ruleset = ruleset self.benchmark = benchmark diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_deepseekv3_671b.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_deepseekv3_671b.json new file mode 100644 index 00000000..c69b9bba --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_deepseekv3_671b.json @@ -0,0 +1,58 @@ +{ + "deepseekv3_671b_ref_15360": + { + "Benchmark": "deepseekv3_671b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 6.0 submission", + "Platform": "64 NVIDIA GB300 nodes", + "Precision": "BF16", + "BS": 15360, + "Hyperparams": { + 
"opt_base_learning_rate": 0.000023238, + "opt_learning_rate_warmup_steps": 4, + "opt_learning_rate_decay_steps": 11996, + "gradient_accumulation_steps": 240 + }, + "Epochs to converge": [ + 721920, 721920, 721920, 737280, 691200, 737280 + ] + }, + + "deepseekv3_671b_ref_16384": + { + "Benchmark": "deepseekv3_671b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 6.0 submission", + "Platform": "64 NVIDIA GB300 nodes", + "Precision": "BF16", + "BS": 16384, + "Hyperparams": { + "opt_base_learning_rate": 0.000024, + "opt_learning_rate_warmup_steps": 4, + "opt_learning_rate_decay_steps": 11996, + "gradient_accumulation_steps": 256 + }, + "Epochs to converge": [ + 770048, 786432, 770048, 753664, 753664, 770048 + ] + }, + + "deepseekv3_671b_ref_18432": + { + "Benchmark": "deepseekv3_671b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 6.0 submission", + "Platform": "64 NVIDIA GB300 nodes", + "Precision": "BF16", + "BS": 18432, + "Hyperparams": { + "opt_base_learning_rate": 0.000025456, + "opt_learning_rate_warmup_steps": 4, + "opt_learning_rate_decay_steps": 11996, + "gradient_accumulation_steps": 288 + }, + "Epochs to converge": [ + 847872, 866304, 866304, 829440, 866304, 866304 + ] + } + } diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_flux1.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_flux1.json new file mode 100644 index 00000000..e071b7db --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_flux1.json @@ -0,0 +1,90 @@ +{ + "flux_ref_512": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 512, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.0e-4, + "opt_learning_rate_warmup_steps": 1600, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 7077888, 7340032, 7077888, 7077888, 7340032, 
7340032, 7602176, 7340032, + 7077888, 7340032, 7077888, 7340032, 7340032, 7077888, 7077888, 7077888, + 7340032, 7340032, 7077888, 7340032 + ] + }, + "flux_ref_1024": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 1024, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.5e-4, + "opt_learning_rate_warmup_steps": 800, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 8650752, 8650752, 8126464, 8650752, 8650752, 8912896, 8126464, 8388608, + 8650752, 8126464, 8126464, 8650752, 8388608, 8388608, 8650752, 8388608, + 8388608, 8388608, 8912896, 8650752 + ] + }, + "flux_ref_2048": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 2048, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 2.5e-4, + "opt_learning_rate_warmup_steps": 0, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 9437184, 10223616, 10485760, 11010048, 10747904, 12320768, 10485760, + 9961472, 10485760, 9437184, 9699328, 11534336, 9699328, 9699328, 10747904, + 9961472, 10485760, 10747904, 9961472, 9961472 + ] + }, + "flux_ref_4096": { + "Benchmark": "flux1", + "Creator": "NVIDIA", + "When": "Reference RCPs before v6.0", + "Platform": "8xDGX-B200", + "Precision": "BF16", + "BS": 4096, + "Hyperparams": { + "opt_adamw_beta_1": 0.9, + "opt_adamw_beta_2": 0.95, + "opt_adamw_epsilon": 1e-8, + "opt_adamw_weight_decay": 0.1, + "opt_base_learning_rate": 4.0e-4, + "opt_learning_rate_warmup_steps": 100, + "opt_gradient_clip_norm": 1.0 + }, + "samples to converge": [ + 15204352, 15990784, 15466496, 15728640, 15204352, 15466496, 15990784, + 15204352, 14942208, 15204352, 
15466496, 16252928, 14680064, 14942208, + 13893632, 15466496, 15466496, 15728640, 15466496, 15204352 + ] + } +} diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_gpt_oss_20b.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_gpt_oss_20b.json new file mode 100644 index 00000000..823ceb85 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_gpt_oss_20b.json @@ -0,0 +1,64 @@ +{ + "gpt_oss_20b_ref_16": + { + "Benchmark": "gpt_oss_20b", + "Creator": "AMD", + "When": "Reference RCPs before 6.0 submission", + "Platform": "1xMI355X", + "Precision": "BF16", + "BS": 16, + "Hyperparams": { + "opt_base_learning_rate": 4e-04, + "opt_learning_rate_warmup_samples": 2048, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 184320, 208896, 184320, 184320, 196608, + 196608, 196608, 196608, 196608, 208896, + 196608, 184320, 196608, 184320, 196608, + 208896, 196608, 196608, 196608, 184320 + ] + }, + + "gpt_oss_20b_ref_32": + { + "Benchmark": "gpt_oss_20b", + "Creator": "AMD", + "When": "Reference RCPs before 6.0 submission", + "Platform": "1xMI355X", + "Precision": "BF16", + "BS": 32, + "Hyperparams": { + "opt_base_learning_rate": 8e-04, + "opt_learning_rate_warmup_samples": 4096, + "gradient_accumulation_steps": 2 + }, + "Epochs to converge": [ + 245760, 233472, 233472, 233472, 233472, + 233472, 233472, 233472, 233472, 233472, + 233472, 221184, 233472, 233472, 245760, + 245760, 221184, 245760, 245760, 221184 + ] + }, + "gpt_oss_20b_ref_64": + { + "Benchmark": "gpt_oss_20b", + "Creator": "AMD", + "When": "Reference RCPs before 6.0 submission", + "Platform": "1xMI355X", + "Precision": "BF16", + "BS": 64, + "Hyperparams": { + "opt_base_learning_rate": 1e-03, + "opt_learning_rate_warmup_samples": 12288, + "gradient_accumulation_steps": 4 + }, + "Epochs to converge": [ + 282624, 307200, 307200, 307200, 294912, + 331776, 307200, 294912, 294912, 282624, + 319488, 282624, 294912, 319488, 294912, + 294912, 331776, 282624, 294912, 307200 + ] + } + } + 
\ No newline at end of file diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama2_70b_lora.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama2_70b_lora.json new file mode 100644 index 00000000..86630bdf --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama2_70b_lora.json @@ -0,0 +1,95 @@ +{ + "llama2_70b_lora_ref_8": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 8, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 3072,2688,3456,3072,3072,3072,3456,3456,3072,2688, + 3456,3072,3072,3072,3840,3456,2688,3072,3456,3456 + ] + }, + + "llama2_70b_lora_ref_16": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 16, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 3840,3840,4224,3840,3840,3840,4608,3840,4608,3840, + 4992,3840,3840,3840,4992,3840,3840,4224,3840,3456 + ] + }, + "llama2_70b_lora_ref_32": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 32, + "Hyperparams": { + "opt_base_learning_rate": 4e-4, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 
5760,6528,6144,6528,5376,6528,5760,6144,6144,6528, + 6144,6144,6144,5760,5760,5760,5760,5760,6144,5760 + ] + }, + "llama2_70b_lora_ref_128": + { + "Benchmark": "llama2_70b_lora", + "Creator": "NVIDIA", + "When": "Prior to 4.0 submission", + "Platform": "TBD", + "Precision": "BF16", + "BS": 128, + "Hyperparams": { + "opt_base_learning_rate": 1e-3, + "opt_max_grad_norm": 0.3, + "opt_learning_rate_warmup_epochs": 0, + "opt_learning_rate_decay_boundary_epochs": [], + "gradient_accumulation_steps": 1, + "lora_r": 16, + "lora_alpha": 32, + "max_steps": 1024 + }, + "samples to converge": [ + 11520,13056,10752,12672,12288,11136,10752,13056, 10752,9984, + 11136,11136,11136,10752,11520,11136,11136,10752,11136,9984 + ] + } +} diff --git a/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_8b.json b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_8b.json new file mode 100644 index 00000000..1b7143f1 --- /dev/null +++ b/mlperf_logging/rcp_checker/training_6.1.0/rcps_llama31_8b.json @@ -0,0 +1,112 @@ +{ + "llama31_8b_ref_16": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "2xDGX-B200", + "Precision": "BF16", + "BS": 16, + "Hyperparams": { + "opt_base_learning_rate": 4e-04, + "opt_learning_rate_warmup_samples": 256, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 159744, 159744, 159744, 159744, 159744, + 159744, 172032, 159744, 172032, 159744, + 172032, 159744, 159744, 159744, 159744, + 159744, 159744, 159744, 159744, 159744 + ] + }, + + "llama31_8b_ref_32": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "4xDGX-B200", + "Precision": "BF16", + "BS": 32, + "Hyperparams": { + "opt_base_learning_rate": 8e-04, + "opt_learning_rate_warmup_samples": 4096, + "gradient_accumulation_steps": 1 + }, + "Epochs to converge": [ + 196608, 172032, 184320, 184320, 172032, + 172032, 184320, 184320, 184320, 172032, + 172032, 
172032, 184320, 184320, 184320, + 172032, 172032, 172032, 184320, 184320 + ] + }, + + "llama31_8b_ref_64": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "4xDGX-B200", + "Precision": "BF16", + "BS": 64, + "Hyperparams": { + "opt_base_learning_rate": 8e-04, + "opt_learning_rate_warmup_samples": 6144, + "gradient_accumulation_steps": 2 + }, + "Epochs to converge": [ + 233472, 208896, 208896, 233472, 233472, + 233472, 233472, 233472, 208896, 233472, + 233472, 233472, 245760, 221184, 208896, + 233472, 233472, 221184, 221184, 221184 + ] + }, + + "llama31_8b_ref_96": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "2xDGX-B200", + "Precision": "BF16", + "BS": 96, + "Hyperparams": { + "opt_base_learning_rate": 1e-03, + "opt_learning_rate_warmup_samples": 16348, + "gradient_accumulation_steps": 6 + }, + "Epochs to converge": [ + 297216, 284832, 272448, 272448, 272448, + 272448, 297216, 272448, 297216, 272448, + 297216, 260064, 272448, 272448, 272448, + 284832, 260064, 284832, 284832, 272448 + ] + }, + + "llama31_8b_ref_128": + { + "Benchmark": "llama31_8b", + "Creator": "NVIDIA", + "When": "Reference RCPs before 5.1 submission", + "Platform": "4xDGX-B200", + "Precision": "BF16", + "BS": 128, + "Hyperparams": { + "opt_base_learning_rate": 2e-03, + "opt_learning_rate_warmup_samples": 32768, + "gradient_accumulation_steps": 4 + }, + "Epochs to converge": [ + 368640, 344064, 356352, 344064, 368640, + 368640, 405504, 344064, 331776, 307200, + 331776, 380928, 307200, 344064, 319488, + 356352, 331776, 319488, 356352, 331776 + ] + } +} + + + + + + diff --git a/mlperf_logging/repo_checker/repo_checker.py b/mlperf_logging/repo_checker/repo_checker.py index 140bff92..0a1082f5 100644 --- a/mlperf_logging/repo_checker/repo_checker.py +++ b/mlperf_logging/repo_checker/repo_checker.py @@ -127,8 +127,8 @@ def get_parser(): parser.add_argument( 
'ruleset', type=str, - choices=['2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0'], - help='the ruleset. 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0 and 6.0.0 are currently supported.' + choices=['2.0.0', '2.1.0', '3.0.0', '3.1.0', '4.0.0', '4.1.0', '5.0.0', '5.1.0', '6.0.0', '6.1.0'], + help='the ruleset. 2.0.0, 2.1.0, 3.0.0, 3.1.0, 4.0.0, 4.1.0, 5.0.0, 5.1.0, 6.0.0 and 6.1.0 are currently supported.' ) parser.add_argument( '--log_output', diff --git a/mlperf_logging/result_summarizer/config.yaml b/mlperf_logging/result_summarizer/config.yaml index 79586bc5..6b8779f3 100644 --- a/mlperf_logging/result_summarizer/config.yaml +++ b/mlperf_logging/result_summarizer/config.yaml @@ -111,6 +111,13 @@ columns: gpt_oss_20b: ["Benchmark results (minutes)", "LLM", "C4", "GPT-OSS-20B"] deepseekv3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeekV3-671B"] default: [" ", " ", " "] + "6.1.0": + flux1: ["Benchmark results (minutes)", "Text to image", "CC12M and Coco-2014 for eval", "Flux1"] + llama2_70b_lora: ["Benchmark results (minutes)", "LLM-Finetune", "SCROLSS Gov Report", "LLama2-70B-LoRA"] + llama31_8b: ["Benchmark results (minutes)", "Small LLM", "C4", "Llama31-8b"] + gpt_oss_20b: ["Benchmark results (minutes)", "LLM", "C4", "GPT-OSS-20B"] + deepseekv3_671b: ["Benchmark results (minutes)", "LLM", "C4", "DeepSeekV3-671B"] + default: [" ", " ", " "] hpc: "2.0.0": diff --git a/scripts/verify_for_v6.1_training.sh b/scripts/verify_for_v6.1_training.sh new file mode 100755 index 00000000..faa266b4 --- /dev/null +++ b/scripts/verify_for_v6.1_training.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +set -e + +# rcp_bypass and rcp_bert_train_samples package checker params +# need to be retrieved at package_checker_params file at top-level submission dir. 
+PACKAGE_CHECKER_PARAMS=""  # accumulates one "--<param>" flag per non-empty line of the params file
+PACKAGE_CHECKER_PARAMS_FILE="$1/package_checker_params"
+if test -f "$PACKAGE_CHECKER_PARAMS_FILE"; then
+    while IFS= read -r line || [ -n "$line" ]  # "|| [ -n ... ]" keeps a final line that lacks a trailing newline
+    do
+        [ -z "$line" ] || PACKAGE_CHECKER_PARAMS="$PACKAGE_CHECKER_PARAMS --$line"  # skip blank lines: a bare "--" would end option parsing
+    done < "$PACKAGE_CHECKER_PARAMS_FILE"
+fi
+
+python3 -m mlperf_logging.package_checker "$1" training 6.1.0 $PACKAGE_CHECKER_PARAMS  # params left unquoted on purpose so they split into separate flags
+python3 -m mlperf_logging.result_summarizer "$1" training 6.1.0
+python3 -m mlperf_logging.repo_checker "$1" training 6.1.0