diff --git a/llm_moe_pretraining/nemo/Dockerfile b/llm_moe_pretraining/nemo/Dockerfile index 51eaf06e5..1cfb9340f 100644 --- a/llm_moe_pretraining/nemo/Dockerfile +++ b/llm_moe_pretraining/nemo/Dockerfile @@ -60,6 +60,7 @@ RUN pip uninstall -y megatron-core && \ # TODO(dfridman): knob for eval batch size. Remove once merged in the upstream git apply /workspace/code/eval_batch_size.patch && \ git apply /workspace/code/deepseek_mtp_module_conversion.patch && \ + git apply /workspace/code/start_eval_at_iter.patch && \ cd /workspace/Megatron-Bridge/3rdparty/Megatron-LM && \ echo MCORE_COMMIT_HASH=$(git rev-parse HEAD) && \ echo $(git rev-parse HEAD) > /MCORE_COMMIT_HASH.env && \ diff --git a/llm_moe_pretraining/nemo/config_GB300_64x4x240xtp1pp4cp1.sh b/llm_moe_pretraining/nemo/config_GB300_64x4x240xtp1pp4cp1.sh index 69f27a91d..9d74ab55c 100644 --- a/llm_moe_pretraining/nemo/config_GB300_64x4x240xtp1pp4cp1.sh +++ b/llm_moe_pretraining/nemo/config_GB300_64x4x240xtp1pp4cp1.sh @@ -50,7 +50,7 @@ export WARMUP_STEPS=4 export EVAL_CHECK_INTERVAL=1 # every $EVAL_CHECK_INTERVAL steps export EVAL_BATCHES=1 # evaluate on $EVAL_BATCHES * $GBS samples export EVAL_BATCH_SIZE=1024 - +export START_EVAL_AT_ITER=$(( 42 + 24576 / GBS )) export TENSOR_PARALLEL_SIZE=1 export PIPELINE_PARALLEL_SIZE=4 diff --git a/llm_moe_pretraining/nemo/config_GB300_64x4x256xtp1pp4cp1.sh b/llm_moe_pretraining/nemo/config_GB300_64x4x256xtp1pp4cp1.sh index e3897487c..526dbb226 100644 --- a/llm_moe_pretraining/nemo/config_GB300_64x4x256xtp1pp4cp1.sh +++ b/llm_moe_pretraining/nemo/config_GB300_64x4x256xtp1pp4cp1.sh @@ -50,7 +50,7 @@ export WARMUP_STEPS=4 export EVAL_CHECK_INTERVAL=1 # every $EVAL_CHECK_INTERVAL steps export EVAL_BATCHES=1 # evaluate on $EVAL_BATCHES * $GBS samples export EVAL_BATCH_SIZE=1024 - +export START_EVAL_AT_ITER=$(( 42 + 24576 / GBS )) export TENSOR_PARALLEL_SIZE=1 export PIPELINE_PARALLEL_SIZE=4 diff --git a/llm_moe_pretraining/nemo/config_GB300_64x4x288xtp1pp4cp1.sh 
b/llm_moe_pretraining/nemo/config_GB300_64x4x288xtp1pp4cp1.sh index ab7e41279..11069a704 100644 --- a/llm_moe_pretraining/nemo/config_GB300_64x4x288xtp1pp4cp1.sh +++ b/llm_moe_pretraining/nemo/config_GB300_64x4x288xtp1pp4cp1.sh @@ -50,7 +50,7 @@ export WARMUP_STEPS=4 export EVAL_CHECK_INTERVAL=1 # every $EVAL_CHECK_INTERVAL steps export EVAL_BATCHES=1 # evaluate on $EVAL_BATCHES * $GBS samples export EVAL_BATCH_SIZE=1024 - +export START_EVAL_AT_ITER=$(( 42 + 24576 / GBS )) export TENSOR_PARALLEL_SIZE=1 export PIPELINE_PARALLEL_SIZE=4 diff --git a/llm_moe_pretraining/nemo/pretrain_deepseek_v3_671b.py b/llm_moe_pretraining/nemo/pretrain_deepseek_v3_671b.py index 6b8ad635c..3d02e3ff2 100644 --- a/llm_moe_pretraining/nemo/pretrain_deepseek_v3_671b.py +++ b/llm_moe_pretraining/nemo/pretrain_deepseek_v3_671b.py @@ -167,6 +167,9 @@ def create_config(args): train_cfg.eval_iters = args.eval_batches if args.eval_batch_size is not None: train_cfg.eval_batch_size = args.eval_batch_size + start_eval_at_iter = os.getenv("START_EVAL_AT_ITER") + if start_eval_at_iter is not None: + train_cfg.start_eval_at_iter = int(start_eval_at_iter) # Optimizer configuration optimizer_cfg = config.optimizer diff --git a/llm_moe_pretraining/nemo/start_eval_at_iter.patch b/llm_moe_pretraining/nemo/start_eval_at_iter.patch new file mode 100644 index 000000000..924a742d3 --- /dev/null +++ b/llm_moe_pretraining/nemo/start_eval_at_iter.patch @@ -0,0 +1,42 @@ +From 21b78b0bdd731cdb10bbcb08e9c399f80fbd3fcb Mon Sep 17 00:00:00 2001 +From: Denys Fridman +Date: Sun, 29 Jun 2026 00:00:00 +0000 +Subject: [PATCH] feat(training): add start_eval_at_iter to skip first N evaluations + +--- + src/megatron/bridge/training/config.py | 5 +++++ + src/megatron/bridge/training/train.py | 4 ++++ + 2 files changed, 9 insertions(+) + +diff --git a/src/megatron/bridge/training/config.py b/src/megatron/bridge/training/config.py +--- a/src/megatron/bridge/training/config.py ++++ b/src/megatron/bridge/training/config.py +@@ 
-738,6 +738,11 @@ class TrainingConfig: + + skip_train: bool = False + """If set, bypass the training loop, optionally do evaluation for validation/test, and exit.""" + + start_eval_at_iter: Optional[int] = None + """If set, evaluation will only start at or after this iteration number. Useful for skipping + evaluation during early training iterations. If not set, evaluation starts from the + first eval_interval.""" + + def finalize(self) -> None: + """Validate training mode specification and calculate train_iters from train_samples if needed.""" + +diff --git a/src/megatron/bridge/training/train.py b/src/megatron/bridge/training/train.py +--- a/src/megatron/bridge/training/train.py ++++ b/src/megatron/bridge/training/train.py +@@ -501,6 +501,10 @@ def train( + if ( + global_state.train_state.do_valid + and train_config.eval_interval ++ and ( ++ train_config.start_eval_at_iter is None ++ or global_state.train_state.step >= train_config.start_eval_at_iter ++ ) + and global_state.train_state.step % train_config.eval_interval == 0 + ): + if energy_monitor is not None: +-- +2.34.1