Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions llm_moe_pretraining/nemo/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ RUN pip uninstall -y megatron-core && \
# TODO(dfridman): knob for eval batch size. Remove once merged in the upstream
git apply /workspace/code/eval_batch_size.patch && \
git apply /workspace/code/deepseek_mtp_module_conversion.patch && \
git apply /workspace/code/start_eval_at_iter.patch && \
cd /workspace/Megatron-Bridge/3rdparty/Megatron-LM && \
echo MCORE_COMMIT_HASH=$(git rev-parse HEAD) && \
echo $(git rev-parse HEAD) > /MCORE_COMMIT_HASH.env && \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ export WARMUP_STEPS=4
export EVAL_CHECK_INTERVAL=1 # every $EVAL_CHECK_INTERVAL steps
export EVAL_BATCHES=1 # evaluate on $EVAL_BATCHES * $GBS samples
export EVAL_BATCH_SIZE=1024

export START_EVAL_AT_ITER=$(( 42 + 24576 / GBS ))

export TENSOR_PARALLEL_SIZE=1
export PIPELINE_PARALLEL_SIZE=4
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ export WARMUP_STEPS=4
export EVAL_CHECK_INTERVAL=1 # every $EVAL_CHECK_INTERVAL steps
export EVAL_BATCHES=1 # evaluate on $EVAL_BATCHES * $GBS samples
export EVAL_BATCH_SIZE=1024

export START_EVAL_AT_ITER=$(( 42 + 24576 / GBS ))

export TENSOR_PARALLEL_SIZE=1
export PIPELINE_PARALLEL_SIZE=4
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ export WARMUP_STEPS=4
export EVAL_CHECK_INTERVAL=1 # every $EVAL_CHECK_INTERVAL steps
export EVAL_BATCHES=1 # evaluate on $EVAL_BATCHES * $GBS samples
export EVAL_BATCH_SIZE=1024

export START_EVAL_AT_ITER=$(( 42 + 24576 / GBS ))

export TENSOR_PARALLEL_SIZE=1
export PIPELINE_PARALLEL_SIZE=4
Expand Down
3 changes: 3 additions & 0 deletions llm_moe_pretraining/nemo/pretrain_deepseek_v3_671b.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ def create_config(args):
train_cfg.eval_iters = args.eval_batches
if args.eval_batch_size is not None:
train_cfg.eval_batch_size = args.eval_batch_size
start_eval_at_iter = os.getenv("START_EVAL_AT_ITER")
if start_eval_at_iter is not None:
train_cfg.start_eval_at_iter = int(start_eval_at_iter)

# Optimizer configuration
optimizer_cfg = config.optimizer
Expand Down
42 changes: 42 additions & 0 deletions llm_moe_pretraining/nemo/start_eval_at_iter.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
From 21b78b0bdd731cdb10bbcb08e9c399f80fbd3fcb Mon Sep 17 00:00:00 2001
From: Denys Fridman <dfridman@nvidia.com>
Date: Sun, 29 Jun 2026 00:00:00 +0000
Subject: [PATCH] feat(training): add start_eval_at_iter to skip evaluation before iteration N

---
src/megatron/bridge/training/config.py | 5 +++++
src/megatron/bridge/training/train.py | 4 ++++
2 files changed, 9 insertions(+)

diff --git a/src/megatron/bridge/training/config.py b/src/megatron/bridge/training/config.py
--- a/src/megatron/bridge/training/config.py
+++ b/src/megatron/bridge/training/config.py
@@ -738,6 +738,11 @@ class TrainingConfig:

skip_train: bool = False
"""If set, bypass the training loop, optionally do evaluation for validation/test, and exit."""

+ start_eval_at_iter: Optional[int] = None
+ """If set, evaluation is skipped until the training step reaches this iteration (inclusive).
+ Useful for skipping evaluation during early training iterations. If not set, evaluation
+ starts from the first eval_interval."""
+
def finalize(self) -> None:
"""Validate training mode specification and calculate train_iters from train_samples if needed."""

diff --git a/src/megatron/bridge/training/train.py b/src/megatron/bridge/training/train.py
--- a/src/megatron/bridge/training/train.py
+++ b/src/megatron/bridge/training/train.py
@@ -501,6 +501,10 @@ def train(
if (
global_state.train_state.do_valid
and train_config.eval_interval
+ and (
+ train_config.start_eval_at_iter is None
+ or global_state.train_state.step >= train_config.start_eval_at_iter
+ )
and global_state.train_state.step % train_config.eval_interval == 0
):
if energy_monitor is not None:
--
2.34.1
Loading