diff --git a/.claude/skills/migrate-to-resumable-dataloader/SKILL.md b/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
new file mode 100644
index 000000000000..0135f76ea036
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
@@ -0,0 +1,166 @@
+---
+name: migrate-to-resumable-dataloader
+description: This skill should be used when the user asks to "migrate to the resumable dataloader", "switch to indexed Lhotse", "adopt the indexed + resumable pipeline", "make my training resumable", "set up StatefulDataLoader for NeMo/Lhotse", "use AIStore GetBatch", or "convert this YAML to the resumable path". Walks a NeMo training YAML and optional launcher, data blend, and runtime context through the indexed + resumable Lhotse migration; lints interacting fields; auto-patches safe YAML changes; emits a migration report, pre-flight checklist, and index-build command. Static analysis only; never launches training.
+argument-hint: '<config.yaml> [launcher.py] [blend.yaml] [runtime-notes]'
+---
+
+# Migrate a NeMo training YAML to indexed + resumable Lhotse
+
+Use this skill to port a NeMo training config from streaming/replay-style Lhotse
+loading to indexed access plus `torchdata.StatefulDataLoader` checkpoint/restore.
+The migration is fragile because YAML flags, launcher seed policy, index paths,
+storage backend, and resume topology all interact.
+
+## Core concepts
+
+- Indexed sources need `.idx` sidecars for random access into JSONL, tar, and
+  supported Shar-style data. Build these once per blend/source set.
+- `use_stateful_dataloader: true` lets Lightning checkpoint the dataloader
+  iterator state, but only if seeds, worker counts, and distributed topology are
+  stable across chunks.
+- Training configs must use `force_map_dataset: false` so indexed sources
+  partition across data-parallel ranks and workers without map-style sampler
+  overhead. Treat `force_map_dataset: true` for training as not launch-ready
+  unless the user explicitly approves a temporary exception; every source in the
+  training iteration graph must be indexed and partition-compatible before
+  launch.
+- Remote audio on AIStore/S3 generally needs `USE_AIS_GET_BATCH=true` so audio
+  fetches are deferred to sample time instead of constructing eager tar readers
+  for every shard.
+
+## Inputs
+
+| input | required | source | purpose |
+|---|---|---|---|
+| Training YAML | yes | argument or `--config=` | Inspect `data.train_ds`, `data.validation_ds`, `trainer`, `exp_manager`, and any model fields that affect resume. |
+| Launcher script | no | argument or auto-detect from project conventions | Check per-chunk seed policy, resume topology invariance, Python path setup, AIStore env vars, and optional index staging. |
+| Data-blend YAML | no | resolved from `data.train_ds.input_cfg` when possible | Check indexability: compressed paths, non-seekable paths, unsupported `extra_fields`, `slice_length`, and mixed indexed/non-indexed chains. |
+| Runtime context | no | argument, config file, or user-provided notes | Detect storage backend, AIStore endpoint availability, container constraints, and index mirror destination. |
+
+## Outputs
+
+Every output lands in `migrate-resumable/<config-stem>/` in the current repo:
+
+| output | purpose |
+|---|---|
+| `migration-report.md` | Findings, rationale, patched fields, and unresolved blockers. |
+| `<config-stem>-resumable.yaml` | Patched training config when safe automatic edits are possible. |
+| `<blend-stem>-resumable.yaml` | Patched blend, only when a blend was inspected and safe changes are possible. |
+| `pre-flight-checklist.md` | User-run steps before submitting training. |
+| `build-indexes-cmd.sh` | One-shot index-build command using the project wrapper when available, otherwise the generic NeMo/Lhotse index builder. |
+
+## Workflow
+
+### 1. Discover and parse inputs
+
+1. Resolve the training YAML path and read it with OmegaConf or a
+   comment-preserving YAML parser.
+2. Resolve any referenced blend YAMLs from `data.*.input_cfg`. Prefer project
+   conventions when obvious, but fall back to paths relative to the config.
+3. If a launcher path is supplied, read it. Otherwise inspect likely project
+   launchers (`train.py`, `pretrain.py`, shell wrappers, or raw `torchrun` /
+   `python` commands) and pick the closest match.
+4. If runtime context is supplied, read it for container image, environment
+   variables, filesystem mounts, worker counts, and AIStore endpoint settings.
+5. Detect remote storage from source paths (`s3://`, `ais://`, `http(s)://`) and
+   local filesystem storage from ordinary absolute or relative paths.
+
+### 2. Run lint pipeline
+
+Run every relevant check in:
+
+- `references/option-reference.md`
+- `references/conflict-matrix.md`
+- `references/failure-modes.md`
+- `references/aistore-vs-non-aistore.md` when remote storage is present
+
+Each finding should include severity, field/path, current value, recommended
+value, and a short rationale.
+
+Severities:
+
+- **fatal**: automatic patching is not possible; user must preprocess data or
+  change the source layout.
+- **error**: automatic patching is usually safe; apply it when the fix is a YAML edit.
+- **warning**: context-dependent; emit a report item and optional YAML comment.
+- **note**: informational; no patch.
+
+### 3. Emit patched YAML and blend
+
+Apply safe `error`-severity patches. Preserve comments when possible with
+`ruamel.yaml`; otherwise serialize with OmegaConf/YAML and rely on the report for
+rationale. For blend edits, never silently drop data: leave an explicit report
+entry and comment for every excluded or rewritten source.
+
+### 4. Generate `migration-report.md`
+
+Use `templates/migration-report.md`. Include:
+
+1. Summary of storage workflow, counts by severity, and readiness.
+2. Inputs inspected.
+3. Findings table.
+4. Walkthrough for train data, validation data, trainer/exp manager, launcher,
+   and storage backend.
+5. Data-blend audit.
+6. Verification and pre-flight steps.
+
+### 5. Generate `pre-flight-checklist.md`
+
+Use `templates/pre-flight-checklist.md` when present. Required steps:
+
+- Build `.idx` sidecars for every training/validation/test blend involved.
+- Verify `indexes_root` points at the same stable mirror used by the runtime, or
+  that explicit node-local index staging populates it before training starts.
+- If AIStore is in play: verify `aistore` SDK availability, `AIS_ENDPOINT`, and
+  whether `USE_AIS_GET_BATCH` or `USE_AIS_INDIVIDUAL_GETS` is required.
+- Verify one invariant seed across resumable chunks.
+- Verify `num_workers`, `world_size`, and relevant distributed topology do not
+  change across resume boundaries.
+- Recommend a small smoke ladder: single-node single chunk, single-node resume,
+  then full topology.
+
+### 6. Generate `build-indexes-cmd.sh`
+
+Prefer a project-provided wrapper when one is clearly present. Otherwise emit a
+generic command using:
+
+```bash
+python <NeMo>/scripts/dataloading/build_indexes.py \
+    --indexes-root <shared-index-mirror> \
+    --workers <N> \
+    <blend>.yaml [<validation-blend>.yaml ...]
+```
+
+If running through a managed runtime or container wrapper, include comments for required
+container image, mounts, environment variables, worker count, and any CPU/GPU
+container-hook workaround the project requires.
+
+### 7. Print final summary to chat
+
+Keep the final chat response under 10 lines: output directory, finding counts,
+report path, and the next command the user should run.
+
+## Knowledge base
+
+- `references/option-reference.md`: field-by-field reference for YAML and
+  launcher settings.
+- `references/failure-modes.md`: known failure signatures, triggers, and fixes.
+- `references/conflict-matrix.md`: incompatible option pairs.
+- `references/best-practices.md`: priority-ordered checklist.
+- `references/aistore-vs-non-aistore.md`: storage workflow selection.
+- `templates/migration-report.md`: report template.
+- `templates/pre-flight-checklist.md`: checklist template, when present.
+- `scripts/analyze.py`: optional static-analysis helper, when present.
+
+## Constraints
+
+- Prefer static analysis. Do not launch training, build indexes, prefetch data, or
+  modify external runtime state unless the user explicitly asks.
+- Cross-check recommendations against the actual NeMo/Lhotse code in the user's
+  checkout when paths are available. Relevant areas are common Lhotse dataloader
+  config, indexed adapters, `lhotse.indexing`, AIStore batch loading, and NeMo
+  dataloader construction.
+- Treat project wrappers as optional conveniences, not as part of the generic
+  migration contract.
+- When evidence is missing, say so. Do not encode project-specific run history
+  or local experiment names as general guidance.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md b/.claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md
new file mode 100644
index 000000000000..54314e7364ee
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md
@@ -0,0 +1,79 @@
+# AIStore vs filesystem workflows
+
+Indexed + resumable Lhotse can read audio/tar sources from a local filesystem or
+from AIStore-compatible URLs. Manifests/cuts may be on disk in either workflow.
+Choose the workflow from source path schemes, not from where the process runs.
+
+## Detection
+
+| signal | workflow |
+|---|---|
+| `tarred_audio_filepaths: s3://...`, `ais://...`, or `http(s)://...` | AIStore/remote workflow |
+| `tarred_audio_filepaths: /path/...` or relative filesystem path | filesystem workflow |
+| mixed local and remote paths | remote workflow, because it has the stricter requirements |
+
+`AIS_ENDPOINT` in the environment is necessary for AIStore access, but it is not
+sufficient evidence that the blend uses AIStore.
+
+## Remote AIStore workflow
+
+Required setup:
+
+- `aistore` SDK installed in the build/training container.
+- `AIS_ENDPOINT` exported into the process that reads remote sources.
+- `USE_AIS_GET_BATCH=true` when remote tar/audio should be fetched lazily by
+  minibatch instead of opening every shard eagerly.
+
+Optional setup:
+
+- `USE_AIS_INDIVIDUAL_GETS=true` to bypass the batch endpoint and fetch each
+  object individually. This is slower but useful when the batch endpoint is
+  unavailable or returns empty content for some objects.
+
+Index building:
+
+- The index builder reads remote tar files through byte-range-capable AIStore
+  paths and writes `.idx` sidecars to the configured index mirror.
+- A successful index build proves byte-range access worked for the indexed
+  source paths. It does not prove the batch endpoint will later serve every
+  object successfully.
+
+Runtime data access:
+
+1. Keep manifests/cuts on a local/shared filesystem when random access would be
+   inefficient from remote storage.
+2. Point `data.*.indexes_root` at a persistent index mirror by default.
+3. Use node-local index staging only when direct mirror reads are too slow or
+   metadata-heavy; make the YAML path match the staged destination.
+4. Use manifest prefetch only as a fallback for remote manifest paths that
+   cannot be cached persistently.
+
+## Filesystem-only workflow
+
+Required setup:
+
+- All audio/tar paths resolve through the local filesystem visible in the
+  container/process.
+- AIStore env vars are unset or ignored when no remote paths are present.
+- `USE_AIS_GET_BATCH=false`; if any source is remote, follow the remote workflow instead.
+
+Index building:
+
+- The index builder reads local files directly.
+- Filesystem throughput and metadata behavior determine the best worker count.
+
+Runtime data access:
+
+1. Keep manifests/cuts on a local/shared filesystem.
+2. Point `data.*.indexes_root` at a persistent index mirror.
+3. Stage indexes to node-local SSD only when needed and only with matching YAML
+   paths.
+
+## Common gotchas
+
+- Do not infer workflow from runtime labels alone; inspect the source paths.
+- Verify filesystem mounts inside the runtime/container, not only in the host shell.
+- Reusing an index mirror requires identical source path strings and unchanged
+  source contents.
+- AIStore individual GETs and batch GETs can exercise different backend paths;
+  test the exact access mode used by training.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md b/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
new file mode 100644
index 000000000000..6c206390080b
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
@@ -0,0 +1,79 @@
+# Best practices - indexed + resumable Lhotse migration
+
+Prioritized checklist for migrating a NeMo config to indexed access and
+checkpointable dataloading.
+
+## Tier 1 - non-negotiable
+
+1. **Pin `seed` and `shard_seed` to fixed integers.** The sampler and model RNG
+   must resume from a stable state. Avoid `"randomized"` for resumable chains.
+
+2. **Use one seed across every chunk of a resumable chain.** Lightning reseeds
+   global RNGs at chunk startup. Rotating the seed breaks bit-exact resume even
+   when dataloader state restores correctly.
+
+3. **Keep `num_workers` and distributed topology invariant.** Changing worker
+   count, world size, or rank/worker assignment invalidates stateful dataloader
+   snapshots and iterable partition state.
+
+4. **Build `.idx` sidecars once per stable source path set.** Reuse a persistent
+   index mirror across experiments. Rebuild only when source contents or path
+   strings change.
+
+5. **Disable concurrent bucketing for resumable training.** Background producer
+   threads can advance iterators outside the checkpointed main-thread state.
+
+## Tier 2 - strongly recommended
+
+6. **Run a bit-exact dataloader resume check before sweeping.** Take a few
+   batches, save dataloader state, take a few more as ground truth, restore in a
+   fresh process, and compare the restored batches.
+
+7. **Enforce `force_map_dataset: false` for training.** Map-style training has
+   too much sampler/manifest overhead. Before launch, confirm every training
+   source is indexed, multiplexer seeds are fixed, and topology is stable; if a
+   source cannot be indexed, report it as a migration blocker instead of
+   silently keeping map-style training.
+
+8. **Use frequent checkpoint triggers.** External termination may not execute a
+   graceful preemption callback. Step- or time-based saves reduce lost progress.
+
+9. **Smoke test in stages.** Run single-node single-chunk, then single-node
+   multi-chunk resume, then the intended full topology.
+
+10. **Keep `.idx` files on a persistent filesystem by default.** Stage to
+    node-local SSD only when direct filesystem reads are proven problematic, and
+    ensure the YAML `indexes_root` matches the staged destination.
+
+11. **Use AIStore batch fetching deliberately.** For remote tar/audio sources,
+    `USE_AIS_GET_BATCH=true` avoids eager remote tar-reader construction. If the
+    batch endpoint fails for a dataset, use `USE_AIS_INDIVIDUAL_GETS=true` as a
+    slower fallback while investigating storage availability.
+
+## Tier 3 - operational hygiene
+
+12. **Tune index-build workers to memory and storage backend.** Many workers can
+    OOM on large manifests or remote tar headers. Reduce workers or split the
+    blend when needed.
+
+13. **Keep optional prefetch steps explicit.** Manifest prefetch, index staging,
+    and model-cache preambles should be visible in the launcher and documented in
+    the report.
+
+14. **Use CPU-safe container settings for CPU-only index builds.** Some container
+    runtimes expect GPU hooks by default; bypass or disable them when the index
+    build runs without GPU access.
+
+## What not to do
+
+- Do not trust `meta.pt` key presence alone as proof of bit-exact resume.
+- Do not combine incompatible Lightning checkpoint triggers (for example, `every_n_train_steps` with `train_time_interval` on one `ModelCheckpoint` callback).
+- Do not point `indexes_root` at a node-local path unless the launcher populates
+  it before every chunk.
+- Do not launch iterable training until every source in the chain has been
+  audited and made partition-compatible.
+- Do not use map-style training to bypass indexing blockers; mark the migration
+  not launch-ready unless the user explicitly approves a temporary exception
+  with the blocker and expected overhead.
+- Do not set `LHOTSE_USE_WORKER_PARTITION` manually; it is an internal signal set
+  by the dataloader worker initialization path.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md b/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
new file mode 100644
index 000000000000..f117201a0e16
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
@@ -0,0 +1,31 @@
+# Conflict matrix - indexed + resumable Lhotse
+
+Table format: `A | B | conflict | severity | resolution`.
+
+Severities:
+
+- **fatal**: automatic patching is impossible; data must be preprocessed or the
+  launcher/storage setup must change.
+- **error**: automatic patching is usually safe.
+- **warning**: context-dependent; report clearly.
+- **note**: informational.
+
+| A | B | conflict | severity | resolution |
+|---|---|---|---|---|
+| `data.train_ds.indexed: true` | `extra_fields:` on indexed NeMo entries | Indexed adapters cannot preserve arbitrary runtime field rewrites. | fatal | Preprocess the manifest to materialize fields, then drop `extra_fields`. |
+| `data.train_ds.indexed: true` | `slice_length:` on indexed entries | Slicing changes cut/audio access and has no stable sidecar unless preprocessed. | fatal | Re-shard or preprocess offline, then drop `slice_length`. |
+| `data.train_ds.indexed: true` | compressed JSONL/Shar cuts or compressed tar paths | Compressed streams do not provide stable seekable offsets for sidecars. | fatal | Re-export uncompressed or materialize seekable sources. |
+| `data.train_ds.indexed: true` | `pipe:` paths | Pipes are not seekable. | fatal | Materialize upstream data to files or a seekable backend. |
+| `data.train_ds.force_map_dataset: true` | resumable training launch | Map-style training keeps too much sampler/manifest work on the main process. | error | Set `data.train_ds.force_map_dataset: false` after making every training source indexed and partition-compatible. |
+| `force_map_dataset: true` | `force_iterable_dataset: true` | Dataset mode selection is contradictory. | error | Keep one mode. For training, use `force_map_dataset: false`; for validation/test, keep map-style unless intentionally testing iterable behavior. |
+| `use_stateful_dataloader: true` | per-chunk seed rotation | Model-level RNG diverges across resumed chunks. | error | Pin one seed for the whole chain in YAML and launcher. |
+| `use_stateful_dataloader: true` | `num_workers` changes between chunks | Saved dataloader state is incompatible. | error | Keep worker count invariant or restart without dataloader state. |
+| `use_stateful_dataloader: true` | `world_size` / rank topology changes | Saved iterator and sampler state are topology-sensitive. | error | Keep topology invariant or restart without dataloader state. |
+| `force_map_dataset: false` | any non-indexed source in the chain | Non-indexed sources do not partition and are duplicated across ranks/workers. | fatal | Convert all sources to indexed access or split/remove the non-indexed source. Do not switch to map-style training to bypass this unless the user explicitly approves a temporary exception. |
+| `force_map_dataset: false` | multiplexer seed is `"randomized"` | Shards may choose different sources at the same step. | error | Use a fixed integer seed. |
+| `force_finite: true` | training dataset | Can cap infinite training mixtures unexpectedly. | error | Use finite mode for validation/test only unless intentionally bounded. |
+| Checkpoint cadence absent | external preemption / walltime kill | Chunk progress can be lost without mid-chunk saves. | warning | Add frequent step- or time-based checkpoints. |
+| Node-local `indexes_root` | no prefetch/staging before startup | `.idx` files are missing at runtime. | error | Point to a persistent mirror or stage indexes before every chunk. |
+| AIStore batch mode | objects unavailable through batch endpoint | Batch loader may return empty content or fail collation. | warning | Verify object availability, replicate data, or set `USE_AIS_INDIVIDUAL_GETS=true`. |
+| Container lacks AIStore SDK | AIStore source paths | Remote reads may fall back to the wrong backend or fail. | error | Install a compatible `aistore` SDK in build/training containers. |
+| CPU-only index build | GPU container hook requires GPU runtime | Container startup can fail before index build begins. | warning | Use CPU-safe container settings or bypass GPU hooks. |
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md b/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
new file mode 100644
index 000000000000..8927a805cce8
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
@@ -0,0 +1,278 @@
+# Failure-mode catalog
+
+Failure signatures, triggers, and fixes for indexed + resumable Lhotse
+migrations. These are generic patterns; verify exact file names and line numbers
+against the user's checkout before citing them in a report.
+
+## §1 - Compressed JSONL, Shar cuts, or tar paths
+
+**Signature**: index build raises a `ValueError` saying the source requires
+uncompressed JSONL or tar data but received a compressed path such as
+`*.jsonl.gz` or `*.tar.gz`.
+
+**Trigger**: an indexed source points at compressed cuts, manifests, or tar
+files. Sidecar offsets require stable byte positions in seekable files.
+
+**Fix**: re-export or materialize the source in an uncompressed seekable format.
+For Shar-style data, export cuts as plain JSONL when sidecar indexing is needed.
+
+## §2 - `extra_fields` or `slice_length` on indexed NeMo entries
+
+**Signature**: an indexed NeMo iterator raises that `extra_fields` is not
+supported, or data order diverges after slicing.
+
+**Trigger**: the source applies runtime field injection or slicing while also
+requesting indexed access.
+
+**Fix**: preprocess the manifest offline so the indexed source already contains
+all required fields and shard/slice layout. Drop `extra_fields` and
+`slice_length` from the indexed YAML entry.
+
+## §3 - Remote object reader is not seekable
+
+**Signature**: `io.UnsupportedOperation: seek` or `tell` on first read of a
+remote URL source.
+
+**Trigger**: the code path uses a backend reader that does not implement the
+seek/tell operations required by indexing.
+
+**Fix**: ensure the remote-storage SDK is installed and that Lhotse routes the
+path through the intended seekable/range-capable backend. For AIStore, verify
+`aistore` is installed and `AIS_ENDPOINT` is set.
+
+## §4 - Stdlib filesystem operations on URLs
+
+**Signature**: `FileNotFoundError` from `open("s3://...")` or
+`os.path.getsize("s3://...")`.
+
+**Trigger**: a URL path reaches code that assumes local filesystem semantics.
+
+**Fix**: route URL paths through the storage-aware reader and load index metadata
+from the `.idx` file rather than local `os.path` calls.
+
+## §5 - Too many memory maps for large shard counts
+
+**Signature**: `OSError: [Errno 12] Cannot allocate memory` or system
+`vm.max_map_count` exhaustion during startup.
+
+**Trigger**: one memory map per `.idx` file across a very large number of shards.
+
+**Fix**: load sidecars into resident arrays or otherwise reduce mmap count. The
+sidecars are usually small enough that resident arrays are acceptable.
+
+## §6 - Line-delimited JSON with `.json` extension rejected
+
+**Signature**: index validation rejects a line-delimited JSON manifest with a
+`.json` suffix.
+
+**Trigger**: extension filtering assumes only `.jsonl` is valid, while some NeMo
+manifests use `.json` for one-record-per-line JSON.
+
+**Fix**: accept both `.jsonl` and line-delimited `.json` when the contents are
+newline-separated records.
+
+## §7 - Process pool OOM during index build
+
+**Signature**: `concurrent.futures.process.BrokenProcessPool` after partial
+index-build progress.
+
+**Trigger**: too many workers parse large manifests or tar headers concurrently,
+exceeding available process memory.
+
+**Fix**: reduce worker count, split the blend/source list across multiple index
+runs, or increase available memory.
+
+## §8 - GPU container hook runs during CPU-only index build
+
+**Signature**: container startup fails before Python runs, often around a GPU
+runtime hook such as `nvidia-container-cli`.
+
+**Trigger**: a CPU-only index build uses a container/runtime setup that assumes
+GPU devices are present.
+
+**Fix**: use CPU-safe container settings for index builds, or bypass/disable GPU
+hooks when the runtime has no GPU access.
+
+## §9 - AIStore SDK response shape changed
+
+**Signature**: an AttributeError on fields returned by the AIStore batch API,
+often in an error or empty-content path.
+
+**Trigger**: code assumes one SDK response schema while the installed SDK returns
+another.
+
+**Fix**: normalize SDK response attributes at the boundary and use that helper at
+all consumer sites. Avoid raw direct field access in error-handling code.
+
+## §10 - `shard_seed: "randomized"` with stateful dataloading
+
+**Signature**: usually silent. Resume is not bit-exact even though the dataloader
+snapshot appears to restore.
+
+**Trigger**: randomized shard/sampler seed is re-derived at chunk startup while
+stateful sampler data is loaded from checkpoint.
+
+**Fix**: pin `shard_seed` to a fixed integer, typically matching the top-level
+training seed.
+
+## §11 - Per-chunk seed rotation in launcher
+
+**Signature**: silent model-level divergence across chunk boundaries. Data-order
+state may restore, but dropout, augmentation, and other model/global RNG draws do
+not match a continuous run.
+
+**Trigger**: the launcher chooses a different seed for each resumable chunk.
+
+**Fix**: use one invariant seed for the entire resumable chain. If the launcher
+computes seeds from run index, override that behavior for indexed + stateful
+runs.
+
+## §12 - No mid-chunk checkpoint trigger
+
+**Signature**: only epoch-boundary checkpoints exist; progress after the last
+boundary is lost when a chunk is preempted or reaches walltime.
+
+**Trigger**: checkpoint config relies only on long epoch boundaries or sparse
+validation events.
+
+**Fix**: add an appropriate step-based or time-based checkpoint trigger and keep
+resume-required checkpoints from being pruned prematurely.
+
+## §13 - Internal time guard does not catch external termination
+
+**Signature**: the runtime sends SIGTERM/SIGKILL and no final checkpoint is
+written.
+
+**Trigger**: external cancellation, node failure, preemption, or walltime signal
+bypasses the framework's graceful preemption callback.
+
+**Fix**: leave a walltime buffer for graceful stops and rely on frequent
+mid-chunk checkpoints as the primary mitigation.
+
+## §14 - Worker or world-size mismatch on resume
+
+**Signature**: `StatefulDataLoader` or indexed iterator state raises a mismatch
+error during `load_state_dict`, or restored data order is invalid.
+
+**Trigger**: chunk restores with different `num_workers`, world size, or
+rank/worker topology than the chunk that saved the checkpoint.
+
+**Fix**: keep topology invariant for a resumable chain. To change topology,
+restart from model weights without restoring dataloader state.
+
+## §15 - AIStore batch endpoint returns empty content
+
+**Signature**: batch collation receives empty content for one or more requested
+objects, often followed by a downstream `NoneType` or collation error.
+
+**Trigger**: object is not available through the batch endpoint, credentials are
+wrong, or batch and individual-object paths exercise different backend state.
+
+**Fix**: verify object availability through the exact access mode used by
+training. As a workaround, set `USE_AIS_INDIVIDUAL_GETS=true` and investigate
+backend replication/permission issues separately.
+
+## §16 - `indexes_root` points at missing node-local storage
+
+**Signature**: `FileNotFoundError` or `.idx file not found` from an indexed
+reader at startup.
+
+**Trigger**: YAML points at a node-local path such as `/tmp/idx`, but the launcher
+does not stage sidecars there before every chunk; or the staging destination does
+not match YAML.
+
+**Fix**: use a persistent shared mirror by default. If staging to node-local SSD,
+ensure the preamble runs before training in every chunk and the YAML path matches
+that destination exactly.
+
+## §17 - Concurrent bucketing breaks bit-exact resume
+
+**Signature**: silent data-order divergence across resume boundaries.
+
+**Trigger**: a background bucketing producer advances the source iterator outside
+the checkpointed main-thread state.
+
+**Fix**: set `concurrent_bucketing: false` for resumable training so only the
+checkpointed path advances the iterator.
+
+## §18 - Iterable mode partitions when partition signal is missing or wrong
+
+**Signature**: silent under-sampling or over-partitioning when distributed
+rank/world-size environment variables are set.
+
+**Trigger**: indexed iterators read rank/world environment directly instead of
+using a dataloader-worker partition signal.
+
+**Fix**: ensure partitioning is activated only by the intended worker init path.
+Map-style mode should see the trivial `(0, 1)` partition.
+
+## §19 - Iterable mode with non-indexed source in the chain
+
+**Signature**: non-indexed sources appear on every rank/worker while indexed
+sources are partitioned.
+
+**Trigger**: `force_map_dataset: false` with a chain that mixes indexed and
+non-indexed iterators.
+
+**Fix**: convert every source in the iterable chain to indexed access, or split
+or remove the non-indexed sources before launching training. Do not switch to
+map-style training to bypass this unless the user explicitly approves a
+temporary exception with the expected overhead.
+
+## §20 - Iterable mode with randomized multiplexer seed
+
+**Signature**: loud `ValueError` from the multiplexer, or silent source-weight
+drift if no guard exists.
+
+**Trigger**: with a `"randomized"` multiplexer seed, each shard draws a different
+RNG state and chooses a different source at the same logical step.
+
+**Fix**: pin multiplexer seed, usually through the top-level `shard_seed`.
+
+## §21 - Iterable resume topology mismatch
+
+**Signature**: indexed range or chain state reports `shard_id` / `num_shards` /
+`world_size` mismatch on restore.
+
+**Trigger**: a checkpoint saved under one distributed-worker topology is restored
+under another.
+
+**Fix**: keep `(world_size, num_workers)` invariant. To scale differently,
+restart without dataloader state.
+
+## §22 - Training left in map-style mode
+
+**Signature**: long startup or step-time overhead from repeated sampler/manifest
+work, especially at larger world sizes.
+
+**Trigger**: migrated training YAML keeps `data.train_ds.force_map_dataset: true`
+instead of enforcing iterable partitioning.
+
+**Fix**: set `data.train_ds.force_map_dataset: false` and make every source in
+the training iteration graph indexed and partition-compatible. If a source cannot
+yet be indexed, mark the migration not launch-ready unless the user explicitly
+approves a temporary map-style exception with the specific blocker and expected
+overhead.
+
+## §23 - Build/prefetch tool imports stock Lhotse/NeMo
+
+**Signature**: `ModuleNotFoundError`, missing `lhotse.indexing`, or import errors
+for indexed/resumable symbols.
+
+**Trigger**: build-index or prefetch command does not place the modified NeMo and
+Lhotse checkouts before stock packages on `PYTHONPATH`.
+
+**Fix**: set `PYTHONPATH` or install the correct packages so helper scripts and
+training use the same indexed/resumable implementation.
+
+## §24 - Distributed backend errors hide an earlier Python exception
+
+**Signature**: NCCL/watchdog/collective timeout or launcher-level distributed
+failure appears after one rank already logged a Python traceback.
+
+**Trigger**: one rank fails during data loading or collation; other ranks block
+in distributed work until the backend times out.
+
+**Fix**: inspect logs before the distributed timeout and identify the first
+Python exception. Treat later backend chatter as a cascade unless it is the first
+error in time.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md b/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
new file mode 100644
index 000000000000..aee3903a2b02
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
@@ -0,0 +1,92 @@
+# Option reference - indexed + resumable Lhotse migration
+
+Field-by-field reference for YAML and launcher settings that interact with
+indexed access, `StatefulDataLoader`, distributed topology, and storage backend.
+Line numbers in local code may drift; verify against the checkout in front of
+you when producing a report.
+
+## `data.train_ds`
+
+| field | required value | purpose | see also |
+|---|---|---|---|
+| `indexed` | `true` | Routes supported sources through indexed adapters such as `IndexedJsonlReader` and indexed NeMo-tar readers. Without it, streaming/replay behavior remains active. | `nemo.collections.common.data.lhotse.dataloader`, `lhotse.indexing` |
+| `use_stateful_dataloader` | `true` | Uses `torchdata.StatefulDataLoader` so dataloader iterator state can be saved in Lightning checkpoints. | NeMo Lhotse dataloader config |
+| `force_map_dataset` | `false` for training | Enforces iterable partitioning across data-parallel ranks and workers. Map-style training has too much sampler/manifest overhead; if a source cannot yet be indexed, report the migration as not launch-ready unless the user explicitly approves a temporary exception. | failure-modes §§18-22, conflict-matrix |
+| `indexes_root` | stable filesystem mirror, or node-local path populated before startup | Tells indexed readers where to find `.idx` sidecars. Prefer a persistent shared mirror. Use `/tmp/idx` only when the launcher stages indexes there before training. | failure-modes §16 |
+| `seed` | fixed integer, invariant across chunks | Lightning reseeds Python/NumPy/Torch at chunk start. Rotating this across resumable chunks breaks model-level bit-exactness even when sampler state restores correctly. | failure-modes §11 |
+| `shard_seed` | fixed integer, not `"randomized"` | Controls sampler/multiplexer RNG. Randomized shard seeds can diverge across resume and are invalid for multi-shard iterable partitioning. | conflict-matrix |
+| `num_workers` | invariant between save and restore | `StatefulDataLoader` and iterable partition state depend on worker topology. | failure-modes §14, §21 |
+| `concurrent_bucketing` | `false` for resumable training | Background bucketing producers can advance source iterators outside the checkpointed main-thread state. | failure-modes §17 |
+| `force_iterable_dataset` | unset or compatible with `force_map_dataset: false` | Do not enable mutually exclusive dataset modes. The training target is iterable partitioning through `force_map_dataset: false`. | conflict-matrix |
+| `force_finite` | unset/false for training | Training usually needs infinite or epoch-controlled iteration; finite mode is normally for validation. | validation section |
+| `extra_fields` on indexed NeMo entries | unset | Indexed NeMo adapters cannot preserve arbitrary runtime field rewrites. Preprocess manifests instead. | failure-modes §2 |
+| `slice_length` on indexed entries | unset | Slicing rewrites cut/audio access and has no stable index unless preprocessed. | failure-modes §2 |
+| compressed `.jsonl.gz` / `.tar.gz` paths | reject for indexed sidecars | Indexing requires seekable uncompressed JSONL/tar inputs. Re-export or unpack first. | failure-modes §1 |
+| `pipe:` paths | reject | Pipe commands are not seekable. Materialize data first. | `lhotse.indexing` |
+
+## Training iterable partition (`force_map_dataset: false`)
+
+This is the required training mode for efficient indexed/resumable runs. Do not
+ship a migrated training config in map-style mode. If an indexing blocker
+prevents iterable partitioning, mark the migration not launch-ready unless the
+user explicitly approves a temporary exception.
+
+| concern | requirement | purpose |
+|---|---|---|
+| Worker partition signal | Set only by NeMo/Lhotse worker init path | Prevents map-style mode from accidentally partitioning under `torchrun` environment variables. |
+| All sources indexed | required | Non-indexed sources do not partition and will be duplicated across ranks/workers. |
+| Multiplexer seed | fixed integer | All shards must pick the same source at each multiplexing step to preserve global weighted distribution. |
+| Resume topology | invariant `(world_size, num_workers)` | Saved iterator state validates topology on restore. |
+
+## `data.validation_ds`
+
+| field | required value | purpose |
+|---|---|---|
+| `indexed` | `true` when validation sources need indexed access | Uses the same sidecar/index readers as training. |
+| `force_map_dataset` | `true` | Validation should be finite and deterministic; map-style access is simpler. |
+| `force_finite` | `true` | Prevents infinite validation loops when the training blend is infinite. |
+| `use_stateful_dataloader` | usually `false` | Validation is normally run to completion and not resumed mid-loop. |
+| `indexes_root` | same mirror as training unless intentionally separate | Validation readers need the same sidecars. |
+| `seed` / `shard_seed` | fixed integers | Keeps validation deterministic. |
+
+## Lightning / trainer settings
+
+| field | recommendation | purpose |
+|---|---|---|
+| `resume_if_exists` or equivalent | enabled for resumable chains | Ensures later chunks restore checkpointed model, optimizer, scheduler, and dataloader state. |
+| `resume_ignore_no_checkpoint` or equivalent | enabled for first chunk when supported | Allows chunk 1 to start without an existing checkpoint. |
+| Checkpoint cadence | frequent step- or time-based saves | External termination may bypass graceful preemption callbacks. Avoid losing an entire chunk. |
+| `save_top_k` / pruning policy | do not prune required resume checkpoints | Resume needs recent checkpoints and dataloader metadata. |
+| `max_time_per_run` / walltime guard | comfortably below runtime walltime | Internal graceful-stop callbacks need teardown time. |
+| `devices`, `num_nodes`, distributed topology | invariant across resume | Dataloader state is topology-sensitive. To scale differently, restart without dataloader state. |
+| `max_steps` | stable across chain | Later chunks continue global step accounting. |
+
+## Launcher contract
+
+| concern | requirement | purpose |
+|---|---|---|
+| Per-chunk seed | invariant for all chunks in a resumable chain | Prevents model-level RNG divergence across resumes. |
+| Index mirror availability | `.idx` sidecars exist before training starts | Indexed readers fail or fall back to slow behavior when sidecars are missing. |
+| Optional index staging | YAML `indexes_root` matches the staged destination | Node-local paths such as `/tmp/idx` must be populated in every chunk. |
+| `num_workers`, `world_size` | unchanged between save and restore | Required by stateful dataloading and iterable partitioning. |
+| Python path / package selection | loads the NeMo and Lhotse versions with indexed/resumable support | Avoids accidentally using stock packages without the required code. |
+| Container/runtime hooks | compatible with available CPU/GPU runtime | CPU-only index builds may need different container settings than GPU training. |
+
+## AIStore environment
+
+| env var | required when | purpose |
+|---|---|---|
+| `AIS_ENDPOINT` | any `s3://` / `ais://` source is read through AIStore | Points Lhotse/AIS clients at the proxy. |
+| `USE_AIS_GET_BATCH` | remote tar/audio sources should be fetched lazily by batch | Avoids eager tar-reader construction for every remote shard. |
+| `USE_AIS_INDIVIDUAL_GETS` | batch endpoint is unavailable or returns empty content | Falls back to per-object reads. Slower but useful for backend-specific failures. |
+| `aistore` SDK | AIStore backend in builder/training container | Required by Lhotse AIStore access paths. |
+
+## Index building
+
+| concern | recommendation | purpose |
+|---|---|---|
+| Source format | uncompressed, seekable JSONL/tar or supported Shar cuts | Sidecar offsets must map to stable byte positions. |
+| Workers | tune for memory and storage backend | Large manifests/tars plus many workers can OOM. Reduce workers or split blends. |
+| Mirror destination | persistent shared filesystem when available | Reuse sidecars across runs and avoid per-launch rebuilds. |
+| Remote sources | verify credentials/backend before building | Indexing remote data exercises storage credentials and byte-range access. |
+| Reusability | build once per source path set | Existing sidecars can be reused while source contents and paths are unchanged. |
diff --git a/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md b/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md
new file mode 100644
index 000000000000..e92d4e8d6d7a
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md
@@ -0,0 +1,141 @@
+# Migration report - `<config-stem>`
+
+- **Generated**: <YYYY-MM-DD HH:MM>
+- **Source YAML**: `<path/to/source-config.yaml>`
+- **Patched YAML**: `<path/to/source-config-resumable.yaml>`
+- **Source blend** (if inspected): `<path/to/blend.yaml>`
+- **Patched blend** (if emitted): `<path/to/blend-resumable.yaml>`
+- **Launcher** (if inspected): `<path/to/launcher>` (or "skipped - no launcher provided")
+- **Storage workflow**: <filesystem-only | AIStore/remote | mixed | unknown>
+
+## Summary
+
+<One paragraph: what changed, severity counts, whether the patched YAML is ready
+to launch, and what manual work remains.>
+
+## Findings
+
+### Fatal (must fix; auto-patching not possible)
+
+- _none_ - OR -
+- **`<field-or-path>`** (`<file>:<line>`): <explanation>
+  - **Current**: `<value>`
+  - **Recommended**: `<value>` (or "manual rewrite")
+  - **Why fatal**: <reason auto-patch is unsafe or impossible>
+  - **References**: <reference file/section>
+
+### Errors (auto-patched; review the diff)
+
+- **`data.train_ds.indexed`** (`<file>:<line>`): <description>
+  - **Was**: `false` -> **now**: `true`
+  - **Why**: <rationale>
+  - **References**: <reference file/section>
+
+### Warnings (review manually)
+
+- **`<field-or-path>`** (`<file>:<line>`): <description>
+  - **Current**: `<value>`
+  - **Recommended**: `<value>`
+  - **Why**: <rationale>
+  - **References**: <reference file/section>
+
+### Notes (informational)
+
+- **`<field-or-path>`** (`<file>:<line>`): <description>
+
+## Dedup Mode
+
+<One paragraph: confirm training uses `force_map_dataset: false`; if not, mark
+the migration not launch-ready and list the blocker or explicit user exception.>
+
+- **Training target**: `force_map_dataset: false`. This enforces iterable
+  partitioning and avoids map-style sampler/manifest overhead.
+- **Validation/test target**: `force_map_dataset: true` unless intentionally
+  testing iterable behavior; finite deterministic validation is simpler in
+  map-style mode.
+- **Blocker/exception**: if training still uses `force_map_dataset: true`, mark
+  the migration not launch-ready unless the user explicitly approved an
+  exception; list the unindexed source or runtime blocker, expected overhead, and
+  work needed to move back to iterable training.
+
+For training iterable mode, list:
+
+- Sources confirmed indexed: <list>
+- Multiplexer seeds confirmed integer: <list>
+- World-size / num-workers commitment: `<W>` x `<NW>` for the full chain
+
+## Data Blend Audit
+
+<List unindexable entries such as compressed manifests/tars, `pipe:` paths,
+unsupported `extra_fields`, `slice_length`, or mixed indexed/non-indexed chains.>
+
+| entry | reason | upstream fix |
+|---|---|---|
+| `<source>` | compressed cuts/manifests | re-export as uncompressed seekable files |
+| `<source>` | unsupported `extra_fields` | preprocess fields into the manifest |
+
+## Launcher Review
+
+<If launcher was inspected, list findings. Otherwise write "skipped".>
+
+- **Per-chunk seed rotation**: <not detected | detected at file:line; must pin one seed>
+- **Index access wired**: <persistent mirror | node-local staging | missing>
+- **AIStore batch audio fetch**: <needed and enabled | not needed | missing>
+- **Topology invariance**: <verified | not verifiable | violated>
+- **Python path/package selection**: <verified | not verifiable | missing>
+
+## Storage Workflow
+
+<One paragraph: filesystem-only vs AIStore/remote workflow, whether manifests and
+indexes are local/shared filesystem paths, and whether any prefetch/staging is
+required.>
+
+## Patched Output Diff
+
+### `<config>.yaml` -> `<config>-resumable.yaml`
+
+```diff
+-  data.train_ds:
+-    indexed: false
+-    use_stateful_dataloader: false
+-    shard_seed: "randomized"
++  data.train_ds:
++    indexed: true
++    use_stateful_dataloader: true
++    force_map_dataset: false
++    indexes_root: /shared/fs/.../indexes_mirror
++    shard_seed: 42
+```
+
+_(full diff inline)_
+
+### `<blend>.yaml` -> `<blend>-resumable.yaml`
+
+```diff
+-  - type: lhotse_shar
+-    shar_path:
+-      cuts: s3://bucket/path/cuts.0.jsonl.gz
++  # Source excluded: compressed Shar cuts cannot be indexed.
++  # Re-export with uncompressed cuts or convert to another seekable format.
+```
+
+_(full diff inline)_
+
+## Pre-flight Checklist
+
+1. Build indexes via the generated `build-indexes-cmd.sh`.
+2. Run a bit-exact dataloader resume check on the migrated config.
+3. Confirm storage SDKs and environment variables required by the selected
+   workflow.
+4. Confirm `indexes_root` exists, is populated, and is readable from every
+   node/container that will train.
+5. Run single-node single-chunk, single-node resume, then full-topology smoke.
+6. Submit the real run.
+
+## References
+
+- `references/option-reference.md`
+- `references/conflict-matrix.md`
+- `references/failure-modes.md`
+- `references/best-practices.md`
+- `references/aistore-vs-non-aistore.md`
diff --git a/CLAUDE.md b/CLAUDE.md
index 9a355c624155..097b681555f4 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -20,6 +20,8 @@ Dev quickstart: `uv sync --extra all --extra cu13` (Python 3.12+, PyTorch 2.7+;
 - Check: `isort --check <path> && black --check <path>` or `isort --check . && black --check .`
 - Fix: `isort <path> && black <path>` or `isort . && black .`
 - Jupyter Notebooks are excluded from automatic black reformatting (see `extend-exclude`), but can be still reformatted when passed directly. Do not reformat notebooks outside your changes.
+- **Helper placement**: keep public APIs and top-level classes/functions near the top of a file; place private
+  helpers and utilities at the bottom of the file unless a local module convention requires otherwise.
 
 ## Testing
 
diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst
index 09ff87ea180c..620194c53727 100644
--- a/docs/source/asr/datasets.rst
+++ b/docs/source/asr/datasets.rst
@@ -3,6 +3,12 @@ Datasets
 
 NeMo ASR models expect data as a set of audio files plus a manifest file describing each utterance.
 
+.. seealso::
+
+   For Lhotse-based dataloading (the recommended path for new ASR
+   recipes — dynamic bucketing, multi-source mixing, indexed/resumable
+   dataloading), see :doc:`/dataloaders`.
+
 .. _section-with-manifest-format-explanation:
 
 Manifest Format
diff --git a/docs/source/asr/speaker_diarization/intro.rst b/docs/source/asr/speaker_diarization/intro.rst
index b72177dbaa42..c65163ffff88 100644
--- a/docs/source/asr/speaker_diarization/intro.rst
+++ b/docs/source/asr/speaker_diarization/intro.rst
@@ -99,4 +99,3 @@ The full documentation tree is as follows:
    configs
    api
    resources
-
diff --git a/docs/source/audio/datasets.rst b/docs/source/audio/datasets.rst
index 4c023961a29e..781b0a9e99d8 100644
--- a/docs/source/audio/datasets.rst
+++ b/docs/source/audio/datasets.rst
@@ -3,6 +3,12 @@ Datasets
 
 The `audio` collection expect the training, validation and tests datasets in either NeMo format or Lhotse format.
 
+.. seealso::
+
+   For the Lhotse dataloader's full surface — supported ``input_cfg``
+   types, bucketing, indexed manifests + resumable dataloading, and the
+   ``LhotseDataLoadingConfig`` field reference — see :doc:`/dataloaders`.
+
 NeMo Format
 -----------
 
diff --git a/docs/source/dataloaders.rst b/docs/source/dataloaders.rst
index 20fd0f2f0b90..4eb5bfc59c1f 100644
--- a/docs/source/dataloaders.rst
+++ b/docs/source/dataloaders.rst
@@ -24,26 +24,6 @@ NeMo supports using `Lhotse`_, a speech data handling library, as a dataloading
     constant in time (i.e., stationary); in fact, each mini-batch will have roughly the same ratio of data coming from each source.
     Since the multiplexing is done dynamically, it is very easy to tune the sampling weights.
 
-Lhotse dataloading supports the following types of inputs:
-
-* NeMo manifests
-    Regular NeMo JSON manifests.
-* NeMo tarred data
-    Tarred NeMo JSON manifests + audio tar files; we also support combination of multiple NeMo
-    tarred data sources (e.g., multiple buckets of NeMo data or multiple datasets) via dynamic multiplexing.
-
-    We support using a subset of Tarred NeMo JSON manifests along with audio tar files without disrupting the alignment between the tarred files and their corresponding manifests.
-    This feature is essential because large datasets often consist of numerous tar files and multiple versions of Tarred NeMo JSON manifest subsets, which may contain only a portion of the audio files due to filtering for various reasons.
-    To skip specific entries in the manifests without repeatedly copying and retarring audio files, the entries must include a ``_skipme`` key. This key should be set to ``True``, ``1``, or a reason for skipping (e.g., ``low character-rate``).
-
-* Lhotse CutSet manifests
-    Regular Lhotse CutSet manifests (typically gzipped JSONL).
-    See `Lhotse Cuts documentation`_ to learn more about Lhotse data formats.
-* Lhotse Shar data
-    Lhotse Shar is a data format that also uses tar files for sequential data loading,
-    but is designed to be modular (i.e., easily extensible with new data sources and with new feature fields).
-    More details can be found here: |tutorial_shar|
-
 .. caution:: As of now, Lhotse is mainly supported in most ASR model configurations. We aim to gradually extend this support to other speech tasks.
 
 .. _Lhotse: https://github.com/lhotse-speech/lhotse
@@ -51,6 +31,269 @@ Lhotse dataloading supports the following types of inputs:
 .. |tutorial_shar| image:: https://colab.research.google.com/assets/colab-badge.svg
     :target: https://colab.research.google.com/github/lhotse-speech/lhotse/blob/master/examples/04-lhotse-shar.ipynb
 
+Architecture overview
+---------------------
+
+The Lhotse dataloader is a pipeline of small components. Each YAML option you
+set lands in exactly one of them, so it pays to know which is which::
+
+    input_cfg entry  ──►  parser_fn  ──►  Adapter (IteratorNode)
+                          (registered                 │
+                           via @data_type_parser)     ▼
+                                            CutSet (lazy iterator graph)
+                                                      │
+                              SamplingConstraint  ──► CutSampler
+                                                      │
+                                                      ▼
+                                          IterableDatasetWrapper
+                                                      │
+                                                      ▼
+                                            user-defined Dataset
+                                                      │
+                                                      ▼
+                                                 DataLoader
+                                                 (or StatefulDataLoader)
+
+Components, top to bottom:
+
+* **input_cfg entry** — one YAML dict identified by ``type:`` (e.g.
+  ``type: nemo_tarred``). Listed below in :ref:`lhotse-format-reference`.
+* **parser_fn** — registered with the ``@data_type_parser`` decorator in
+  ``nemo/collections/common/data/lhotse/cutset.py``. Reads the entry and
+  returns ``(CutSet, is_tarred)``. Users can add their own (see
+  :ref:`lhotse-extension-hooks`).
+* **Adapter** — a class that knows how to iterate one specific on-disk
+  format (e.g. ``LazyNeMoTarredIterator``, ``LazyParquetIterator``,
+  ``NeMoMultimodalConversationJsonlAdapter``). All recent adapters are
+  Lhotse :class:`~lhotse.lazy.IteratorNode` subclasses and support
+  ``indexed=True`` for O(1) random access — see
+  :ref:`indexed-resumable-dataloading`.
+* **CutSet** — Lhotse's lazy manifest wrapper. Composing multiple sources
+  produces a graph of iterator nodes (mux, mix, map, filter, …) underneath.
+* **SamplingConstraint** — defines what "length" means for batch packing:
+  :class:`~lhotse.dataset.sampling.base.TimeConstraint` (audio duration,
+  default), :class:`~lhotse.dataset.sampling.base.TokenConstraint` (token
+  count, multimodal), ``MultimodalSamplingConstraint`` /
+  ``FixedBucketBatchSizeConstraint2D`` (NeMo extensions; see
+  :ref:`lhotse-sampling-constraints`).
+* **CutSampler** — :class:`~lhotse.dataset.sampling.DynamicCutSampler` or
+  :class:`~lhotse.dataset.sampling.DynamicBucketingSampler`, picked
+  automatically based on ``use_bucketing``.
+* **IterableDatasetWrapper** — Lhotse helper that turns the sampler-produced
+  ``CutSet`` mini-batches into a stream the PyTorch ``DataLoader`` can
+  consume.
+* **user-defined Dataset** — supplied by the model code; converts a ``CutSet``
+  mini-batch into a ``dict[str, Tensor]``. The same dataset class can serve
+  multiple model architectures because all batching is upstream.
+
+.. _lhotse-format-reference:
+
+Supported input formats
+-----------------------
+
+Every entry in ``input_cfg`` is identified by ``type:``. The table below is
+the canonical list of every type the dataloader understands today, what it
+returns, and the on-disk shape it expects.
+
+.. list-table::
+   :header-rows: 1
+   :widths: 18 32 14 8 8 10 10
+
+   * - ``type:``
+     - Purpose
+     - Yields
+     - Audio
+     - Tarred
+     - Indexable
+     - Adapter / parser
+   * - ``nemo``
+     - NeMo non-tarred JSON manifest (per-file audio)
+     - ``Cut``
+     - yes
+     - no
+     - yes
+     - ``LazyNeMoIterator``
+   * - ``nemo_tarred``
+     - NeMo tarred manifest + audio tar shards
+     - ``Cut``
+     - yes
+     - yes
+     - yes
+     - ``LazyNeMoTarredIterator``
+   * - ``lhotse``
+     - Plain Lhotse cuts JSONL
+     - ``Cut``
+     - yes
+     - no
+     - yes
+     - lhotse ``LazyJsonlIterator`` / ``LazyIndexedManifestIterator``
+   * - ``lhotse_shar``
+     - Lhotse Shar (sharded archive directory)
+     - ``Cut``
+     - yes
+     - yes
+     - yes
+     - lhotse ``LazySharIterator``
+   * - ``parquet``
+     - Parquet file with audio bytes column
+     - ``Cut``
+     - yes
+     - no
+     - yes (row groups)
+     - ``LazyParquetIterator``
+   * - ``txt``
+     - One example per line, raw text
+     - ``TextExample``
+     - no
+     - n/a
+     - no
+     - ``LhotseTextAdapter``
+   * - ``txt_jsonl``
+     - One JSON object per line; configurable text field
+     - ``TextExample``
+     - no
+     - n/a
+     - yes
+     - ``LhotseTextJsonlAdapter``
+   * - ``txt_pair``
+     - Source + target text files for translation
+     - ``SourceTargetTextExample``
+     - no
+     - n/a
+     - no
+     - ``LhotseTextPairAdapter``
+   * - ``multimodal_conversation``
+     - Multi-turn chat with mixed text/audio turns (JSONL)
+     - ``NeMoMultimodalConversation``
+     - optional
+     - optional
+     - yes
+     - ``NeMoMultimodalConversationJsonlAdapter``
+   * - ``share_gpt``
+     - ShareGPT-format JSONL → conversation
+     - ``NeMoMultimodalConversation``
+     - optional
+     - optional
+     - yes
+     - ``NeMoMultimodalConversationShareGPTJsonlAdapter``
+   * - ``share_gpt_webdataset``
+     - ShareGPT in WebDataset tar shards
+     - ``NeMoMultimodalConversation``
+     - optional
+     - yes
+     - yes
+     - ``NeMoMultimodalConversationShareGPTWebdatasetAdapter``
+   * - ``lhotse_as_conversation``
+     - Read ASR data and emit it as an ASR conversation
+     - ``NeMoMultimodalConversation``
+     - yes
+     - inherits
+     - inherits
+     - transform on ``read_cutset_from_config``
+   * - ``sqa_as_conversation``
+     - Spoken-QA → 3-turn conversation (question / audio / answer)
+     - ``NeMoMultimodalConversation``
+     - yes
+     - inherits
+     - inherits
+     - transform
+   * - ``s2s_as_conversation``
+     - Duplex S2S → conversation
+     - ``NeMoMultimodalConversation``
+     - yes
+     - inherits
+     - inherits
+     - transform
+   * - ``s2s_duplex_overlap_as_s2s_duplex``
+     - Overlapping agent/user segments → unified S2S timeline
+     - ``Cut``
+     - yes
+     - inherits
+     - inherits
+     - transform
+   * - ``s2s_duplex_reverse_role``
+     - Swap user and agent in a duplex cut
+     - ``Cut``
+     - yes
+     - inherits
+     - inherits
+     - transform
+   * - ``lhotse_magpietts_data_as_continuation``
+     - MagpieTTS dataset → S2S duplex continuation
+     - ``Cut``
+     - yes
+     - inherits
+     - inherits
+     - transform
+   * - ``nemo_tarred_to_duplex``
+     - Single-supervision NeMo → duplex (user speech + agent silence)
+     - ``Cut``
+     - yes
+     - yes
+     - inherits
+     - transform
+   * - ``multi_speaker_simulator``
+     - Synthetic multi-speaker mixtures from a manifest
+     - ``Cut``
+     - yes
+     - n/a
+     - no
+     - ``MultiSpeakerMixtureGenerator``
+   * - ``group``
+     - Wrap a list of entries with a shared ``weight`` and ``tags``
+     - (nested)
+     - n/a
+     - n/a
+     - n/a
+     - n/a
+
+Notes:
+
+* "Inherits" means the type is a transform that wraps another underlying
+  source via ``read_cutset_from_config(config)``. Such entries accept the
+  underlying source's keys (e.g. ``cuts_path`` or ``manifest_filepath``)
+  *in addition to* their own.
+* Tarred NeMo manifests support a ``_skipme`` key to omit specific manifest
+  rows without repacking tars (set to ``True``, ``1``, or a reason string).
+* Lhotse Shar is documented in the upstream tutorial: |tutorial_shar|.
+
+Conversation / multimodal types — when to use which
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Six types yield ``NeMoMultimodalConversation`` from very different sources.
+Pick by the shape of your input data:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 35 25 40
+
+   * - Your data
+     - ``type:``
+     - Notes
+   * - JSONL of multi-turn chats with mixed text/audio turns
+     - ``multimodal_conversation``
+     - Native chat schema; audio turns reference paths or tar members
+   * - JSONL in ShareGPT chat schema
+     - ``share_gpt``
+     - Adds ShareGPT-specific role/value parsing
+   * - ShareGPT data packed in WebDataset tar shards
+     - ``share_gpt_webdataset``
+     - Same parsing as ``share_gpt``, reads tarred shards
+   * - ASR data in NeMo or Lhotse format
+     - ``lhotse_as_conversation``
+     - Builds a 2-turn (instruction+audio / transcript) conversation per cut
+   * - Spoken-QA data with ``question`` / ``answer`` fields
+     - ``sqa_as_conversation``
+     - Builds a 3-turn (question / audio / answer) conversation per cut
+   * - Duplex S2S data with user/agent supervisions
+     - ``s2s_as_conversation``
+     - Maps duplex roles onto chat turns
+
+The last three (``*_as_conversation``) are *transforms*: they delegate to
+``read_cutset_from_config(config)`` for the underlying audio source, so that
+source's keys (``manifest_filepath``, ``cuts_path``, or ``shar_path``) go
+directly on the transform entry itself.
+
 Enabling Lhotse via configuration
 ----------------------------------
 
@@ -128,6 +371,16 @@ Some other Lhotse related arguments we support:
     When ``batch_duration`` is not set, it acts as a static batch size.
 * ``seed`` sets a random seed for the shuffle buffer.
 
+* ``indexed`` (default ``False``) opts the dataloader into Lhotse's indexed-manifest
+  path, giving every adapter O(1) random access and graph-token-based exact restore.
+  Requires ``.idx`` sidecars next to every JSONL/tar file. See
+  :ref:`indexed-resumable-dataloading` below.
+
+* ``use_stateful_dataloader`` (default ``False``) swaps PyTorch's
+  ``DataLoader`` for ``torchdata.stateful_dataloader.StatefulDataLoader`` so
+  that per-worker iterator state is captured in checkpoints and restored
+  exactly on resume. Pair with ``indexed: true`` for full O(1) restore.
+
 The full and always up-to-date list of supported options can be found in ``LhotseDataLoadingConfig`` class.
 
 .. _asr-dataset-config-format:
@@ -147,6 +400,29 @@ The dataset class which converts these examples to tensors can partition the min
 different processing to each group.
 For example, you may want to construct different prompts for the model using metadata in ``tags``.
 
+How ``tags`` is applied
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Every key/value pair in ``tags`` becomes an attribute on every cut produced
+by that entry. The dataloader walks the cuts via ``cuts.map(...)`` and runs::
+
+    for key, val in tags.items():
+        setattr(cut, key, val)
+
+So in your dataset class you read them back as ordinary attributes::
+
+    def __getitem__(self, cuts):
+        for cut in cuts:
+            lang   = cut.lang
+            task   = cut.task
+            ctx    = cut.context
+            ...
+
+Tags set on a ``group`` apply to every nested entry; tags set on an inner
+entry override the outer ones for that source. A tag whose name matches a
+built-in cut field (``id``, ``duration``, ``supervisions``, …) silently
+overwrites that field, so pick tag names that don't collide.
+
 .. note:: When fine-tuning a model that was trained with ``input_cfg`` option, typically you'd only need
     to override the following options: ``input_cfg=null`` and ``manifest_filepath=path/to/manifest.json``.
 
@@ -384,6 +660,12 @@ Python dataloader instantiation example::
         tokenizer=my_tokenizer,
     )
 
+**Indexed mode for text/multimodal sources.** All of the parsers above
+(``txt_jsonl``, ``nemo_sft_jsonl``, ``multimodal_conversation``, ``share_gpt``,
+``share_gpt_webdataset``) accept ``indexed: true`` and integrate with
+``StatefulDataLoader``-based exact resume. ``txt`` and ``txt_pair`` are
+intentionally streaming-only. See :ref:`indexed-resumable-dataloading`.
+
 **Dataloading and bucketing of text and multimodal data.** When dataloading text or multimodal data, pay attention to the following config options (we provide example values for convenience):
 
 * ``use_multimodal_sampling: true`` tells Lhotse to switch from measuring audio duration to measuring token counts; required for text.
@@ -419,6 +701,25 @@ To enable bucketing, set ``batch_size: null`` and use the following options:
 **Joint dataloading of text/audio/multimodal data.** The key strength of this approach is that we can easily combine audio datasets and text datasets,
 and benefit from every other technique we described in this doc, such as: dynamic data mixing, data weighting, dynamic bucketing, and so on.
 
+Single-config vs. ``multi_config: true``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default the dataloader builds **one** ``CutSet`` and **one** sampler from
+the top-level config. Setting ``multi_config: true`` switches to a
+**multi-modality** layout where each named sub-block (typically ``audio:``
+and ``text:``) is parsed as its own dataloader config, with its own
+sampling/bucketing options, and the per-modality samplers are fused at the
+batch level.
+
+When ``multi_config: true`` is set:
+
+* Top-level keys (``num_workers``, ``shuffle``, ``seed``, ``sample_rate``,
+  …) apply globally and are inherited by every sub-block.
+* Per-modality overrides — including the ``input_cfg`` itself — go inside
+  the named sub-block (``audio: ...`` / ``text: ...``).
+* The per-modality samplers are combined into one stream by
+  ``sampler_fusion``.
+
 This approach is described in the `EMMeTT`_ paper. There's also a notebook tutorial called Multimodal Lhotse Dataloading. We construct a separate sampler (with its own batching settings) for each modality,
 and specify how the samplers should be fused together via the option ``sampler_fusion``:
 
@@ -481,6 +782,223 @@ Example. Combine an ASR (audio-text) dataset with an MT (text-only) dataset so t
 
 .. caution:: We strongly recommend to use multiple shards for text files as well so that different nodes and dataloading workers are able to randomize the order of text iteration. Otherwise, multi-GPU training has a high risk of duplication of text examples.
 
+.. _lhotse-sampling-constraints:
+
+Sampling constraints
+--------------------
+
+A :class:`~lhotse.dataset.sampling.base.SamplingConstraint` decides what
+"length" means when the sampler packs a mini-batch. NeMo uses four:
+
+* :class:`~lhotse.dataset.sampling.base.TimeConstraint` — default.
+  Length = audio duration in seconds. Enforces ``max_duration`` /
+  ``batch_duration`` / ``quadratic_duration``.
+* :class:`~lhotse.dataset.sampling.base.TokenConstraint` — activated by
+  ``use_multimodal_sampling: true`` for text-only flows. Length = token
+  count after applying the tokenizer (and optionally the prompt format).
+  Enforces ``max_tokens`` / ``batch_tokens`` / ``quadratic_factor``.
+* ``MultimodalSamplingConstraint`` — Lhotse-style mixed-modality
+  packing. Activated by setting both ``use_multimodal_sampling: true``
+  and a ``token_equivalent_duration`` so audio cuts are measured in
+  equivalent-token units alongside text. Enforces all of the above plus
+  ``min_tpt``/``max_tpt`` (token-per-token ratio filtering).
+* ``FixedBucketBatchSizeConstraint2D`` — activated automatically when
+  ``bucket_duration_bins`` is given as a list of ``[duration, tokens]``
+  pairs **and** ``bucket_batch_size`` is set. Each bucket gets its own
+  fixed batch size; this is the layout produced by
+  ``estimate_duration_bins_2d.py`` and the OOMptimizer.
+
+You usually don't pick a constraint by name — it's inferred from the
+combination of YAML options. The names matter when you read NeMo's source,
+extend the system with a custom constraint, or interpret error messages.
+
+.. _indexed-resumable-dataloading:
+
+Resumable / indexed dataloading
+-------------------------------
+
+Setting ``indexed: true`` (per-source or top-level) plus
+``use_stateful_dataloader: true`` (top-level) opts NeMo's Lhotse dataloader
+into Lhotse's indexed iterator graph and torchdata's
+``StatefulDataLoader``. The combination gives you:
+
+* O(1) checkpoint/restore of the *whole* dataloading pipeline — sampler RNG,
+  bucketer state, multiplexer choice RNG, per-source iterator cursors, and
+  per-worker prefetch queues — without any replay from the start of the epoch.
+* Random access (``__getitem__``) over every supported adapter.
+
+When set at the top level, ``indexed: true`` is propagated by
+``read_dataset_config`` through the ``propagate_attrs`` cascade, so a single
+top-level flag covers every nested ``input_cfg`` group. You can still override
+it per-source if needed.
+
+Per-adapter support
+^^^^^^^^^^^^^^^^^^^
+
+The following ``input_cfg`` types accept ``indexed: true`` today. Unless noted
+otherwise, each data file needs an ``.idx`` sidecar (next to it by default):
+
+* ``nemo`` / ``nemo_tarred`` — JSONL manifest gets ``manifest.json.idx``;
+  every audio tar in ``tarred_audio_filepaths`` gets ``shard.tar.idx``.
+* ``lhotse`` (plain) — ``cuts.jsonl`` gets ``cuts.jsonl.idx``.
+* ``lhotse_shar`` — every uncompressed ``cuts.<NNNNNN>.jsonl`` and field tar
+  inside the Shar dir.
+* ``parquet`` — no sidecar required, but the file must expose row-group
+  statistics (the default for files written by pyarrow / pandas).
+* ``txt_jsonl`` — every file in ``paths``.
+* ``multimodal_conversation`` and ``share_gpt`` — JSONL manifest plus optional
+  audio tars in ``tarred_audio_filepaths``.
+* ``share_gpt_webdataset`` — every ``shard-*.tar`` inside ``data_dir``.
+
+``txt`` and ``txt_pair`` remain streaming-only (no random-access support).
+
+Two caveats to be aware of:
+
+* ``indexed: true`` is incompatible with ``extra_fields`` and ``slice_length``
+  on ``nemo``/``nemo_tarred``: those features mutate or expand cuts in a way
+  that has no stable index. Pre-process the manifest offline if you need them
+  in an indexed pipeline.
+* Only **uncompressed** files can be indexed (no ``.jsonl.gz``,
+  ``.tar.gz``, etc.) and only files on a backend that supports indexed reads
+  (local FS, S3-compatible object stores, AIStore).
+
+Building ``.idx`` sidecars
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Two equivalent ways:
+
+1. Lhotse's CLI per file::
+
+       lhotse index jsonl path/to/cuts.jsonl
+       lhotse index tar  path/to/shard.tar
+       lhotse index shar path/to/shar_dir/
+
+2. NeMo's batch helper that takes a config and indexes everything it
+   references in one shot::
+
+       python scripts/dataloading/build_indexes.py path/to/input_cfg.yaml
+
+   The script walks ``input_cfg`` (including nested ``group`` entries and
+   per-entry YAML references), dispatches the right tar layout for each
+   adapter (NeMo one-member-per-sample vs. WebDataset/Shar pair format), and
+   skips files that already have an up-to-date ``.idx``. Use ``--force`` to
+   rebuild, ``--workers N`` for parallelism, ``--dry-run`` to preview.
+
+   Pass ``--indexes-root /path/to/mirror`` to write the sidecars to a
+   separate directory tree that mirrors the data files' layout instead of
+   placing them next to the data — see :ref:`lhotse-indexes-root` below.
+
+.. _lhotse-indexes-root:
+
+Storing ``.idx`` sidecars in a separate directory (``indexes_root``)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default, every ``.idx`` lives next to its data file
+(``cuts.jsonl`` ↔ ``cuts.jsonl.idx``). If your data sits on shared, slow,
+or read-only storage (NFS, S3, AIStore), you may want to keep the indexes
+on a fast local disk instead. Set ``indexes_root`` at the top of the
+dataloader config:
+
+.. code-block:: yaml
+
+    data:
+      train_ds:
+        indexed: true
+        use_stateful_dataloader: true
+        indexes_root: /scratch/idx     # mirror lives here
+        input_cfg:
+          - type: nemo_tarred
+            manifest_filepath: /shared/data/asr/manifest__OP_0..127_CL_.jsonl
+            tarred_audio_filepaths: ais://bucket/asr/audio__OP_0..127_CL_.tar
+
+Index lookups for each data file ``D`` resolve to
+``<indexes_root>/<D-with-scheme-stripped>.idx``. Examples::
+
+    /shared/data/asr/manifest_0.jsonl    -> /scratch/idx/shared/data/asr/manifest_0.jsonl.idx
+    ais://bucket/asr/audio_0.tar        -> /scratch/idx/bucket/asr/audio_0.tar.idx
+
+The setting cascades through ``read_dataset_config`` to every nested
+``input_cfg`` entry, so a single top-level value covers the whole pipeline.
+You can override it per-source on any entry that needs a different mirror.
+
+Two ways to populate the mirror:
+
+1. **Build the indexes there to begin with**::
+
+       python scripts/dataloading/build_indexes.py \
+           --indexes-root /scratch/idx path/to/input_cfg.yaml
+
+   The script reads each data file in place, computes the offsets, and
+   writes the ``.idx`` directly to the mirrored target.
+
+2. **Prefetch existing remote indexes** when sidecars already live next to
+   the data on shared/object storage and you just want a local copy::
+
+       python scripts/dataloading/prefetch_indexes.py \
+           --indexes-root /scratch/idx path/to/input_cfg.yaml
+
+   ``prefetch_indexes.py`` walks the same ``input_cfg``, locates every
+   sidecar at its natural location (via lhotse's ``open_best``, so
+   ``ais://`` / ``s3://`` / ``http://`` are all supported as sources),
+   and copies it into the local mirror. Use ``--source-indexes-root``
+   when the source sidecars themselves live under another mirror.
+
+Both scripts accept ``--force``, ``--workers N``, and ``--dry-run``.
+
+End-to-end YAML example
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: yaml
+
+    model:
+      train_ds:
+        # Top-level switches enable indexed restore for every source below.
+        indexed: true
+        use_stateful_dataloader: true
+        force_finite: true
+        force_map_dataset: false
+
+        sample_rate: 16000
+        num_workers: 4
+        seed: 42
+        shard_seed: randomized
+
+        # Bucketing and the rest of the dataloader knobs work exactly as before.
+        use_bucketing: true
+        num_buckets: 30
+        batch_duration: 1100
+        quadratic_duration: 30
+
+        input_cfg:
+          - type: nemo_tarred
+            manifest_filepath: /data/asr/manifest__OP_0..127_CL_.jsonl
+            tarred_audio_filepaths: /data/asr/audio__OP_0..127_CL_.tar
+            weight: 0.7
+          - type: lhotse
+            cuts_path: /data/extra/cuts.jsonl
+            weight: 0.3
+
+Resume contract
+^^^^^^^^^^^^^^^
+
+When ``use_stateful_dataloader: true`` is set, Lightning's checkpoint will
+contain the full lhotse iterator graph state under the dataloader key. On
+resume:
+
+* iterator positions advance to where they were at save time (no replay from
+  position 0);
+* ``set_epoch`` is a no-op while restored state is pending, so the resumed run
+  continues the same epoch instead of starting a new one;
+* ``num_workers`` and ``world_size`` must match between save and restore (a
+  hard requirement of ``StatefulDataLoader``).
+
+Non-indexed pipelines fall back to Lhotse's ``_fast_forward()`` replay, which is
+O(N) in the number of batches consumed before the checkpoint. Replay-based restore
+only needs ``num_workers`` to stay consistent, and it is not an exact restore.
+
+For the iterator graph contract itself, see Lhotse's
+`indexed manifests guide <https://lhotse.readthedocs.io/en/latest/indexed-manifests.html>`_.
+
 Pre-computing bucket duration bins
 ------------------------------------
 
@@ -594,7 +1112,7 @@ For Canary-1B, we'll also provide the special tokens tokenizer. Example:
         input_cfg.yaml
 
 Pushing GPU utilization to the limits with bucketing and OOMptimizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The default approach of specifying a ``batch_duration``, ``bucket_duration_bins`` and ``quadratic_duration``
 is quite flexible, but is not maximally efficient. We observed that in practice it often leads to under-utilization
@@ -743,3 +1261,302 @@ implements those methods.
 The wrapper is a no-op when ``device_mesh`` is ``None`` or every named
 axis present in the mesh has size 1, so the same call site works for
 single-GPU, DDP-only, and CP/TP runs without a separate code path.
+
+Train vs. validation / test configs
+-----------------------------------
+
+The training and validation/test sections of a NeMo recipe use the same
+underlying dataloader builder but have a different shape and a different
+default behavior.
+
+**Training (``train_ds``).** A single config that produces one infinite
+``CutSet``. The dataloader is wrapped to never run out of data, so
+``trainer.max_steps`` (and ``limit_train_batches`` for tarred sources)
+controls the run length:
+
+.. code-block:: yaml
+
+    model:
+      train_ds:
+        sample_rate: 16000
+        num_workers: 4
+        shuffle: true
+        use_bucketing: true
+        num_buckets: 30
+        batch_duration: 1100
+        input_cfg:
+          - type: nemo_tarred
+            manifest_filepath: /data/asr/manifest__OP_0..127_CL_.json
+            tarred_audio_filepaths: /data/asr/audio__OP_0..127_CL_.tar
+
+**Validation / test (``validation_ds`` / ``test_ds``).** A *named* dict of
+configs — one per evaluation set — that produces finite iteration:
+
+.. code-block:: yaml
+
+    model:
+      validation_ds:
+        sample_rate: 16000
+        batch_size: 16
+        # Per-set entries; keys become the metric prefixes in logging.
+        datasets:
+          dev_clean:
+            cuts_path: /data/dev-clean/cuts.jsonl
+          dev_other:
+            cuts_path: /data/dev-other/cuts.jsonl
+
+The most common eval-side overrides:
+
+* ``shuffle: false`` — deterministic order.
+* ``force_finite: true`` — break out of the infinite-mux that's safe for
+  training but would loop forever in eval.
+* ``use_bucketing: false`` — bucketing trades batch randomness for less
+  padding; on a small eval set the savings are negligible and a fixed batch
+  size makes results easier to interpret.
+* ``num_workers: 0`` (or a small number) — eval is short, the worker
+  startup cost matters more.
+
+When the model code expects a single eval set, use the plain ``cuts_path`` /
+``manifest_filepath`` form at the same level as ``train_ds`` instead of the
+``datasets:`` dict.
+
+Preparing your data
+-------------------
+
+Three minimal recipes covering the main on-disk formats.
+
+**NeMo manifest** — one JSON object per line, fields read by ``LazyNeMoIterator``::
+
+    {"audio_filepath": "/data/utt_0001.wav", "duration": 3.42, "text": "hello world", "lang": "en"}
+    {"audio_filepath": "/data/utt_0002.wav", "duration": 5.10, "text": "another example", "lang": "en"}
+
+For tarred NeMo manifests, see
+``scripts/speech_recognition/convert_to_tarred_audio_dataset.py`` in the NeMo
+repo.
+
+**Lhotse cuts JSONL** — build a ``CutSet`` from raw recordings + supervisions:
+
+.. code-block:: python
+
+    from lhotse import CutSet, Recording, SupervisionSegment
+
+    cuts = []
+    for path, transcript in pairs:
+        rec = Recording.from_file(path)
+        sup = SupervisionSegment(
+            id=rec.id, recording_id=rec.id,
+            start=0.0, duration=rec.duration,
+            text=transcript, language="en",
+        )
+        cut = rec.to_cut()
+        cut.supervisions = [sup]
+        cuts.append(cut)
+
+    CutSet.from_cuts(cuts).to_file("cuts.jsonl")  # uncompressed!
+
+For Lhotse Shar (sharded archive), see the upstream tutorial: |tutorial_shar|.
+
+**Parquet** — write a ``pyarrow`` table with the column names the
+``LazyParquetIterator`` reads (``audio``, ``text``, ``duration``,
+optional ``lang``):
+
+.. code-block:: python
+
+    import pyarrow as pa, pyarrow.parquet as pq
+
+    table = pa.table({
+        "audio":    [open(p, "rb").read() for p in paths],
+        "text":     transcripts,
+        "duration": durations,
+        "lang":     ["en"] * len(paths),
+    })
+    pq.write_table(table, "shard_000.parquet")  # row-group stats kept by default
+
+Once your manifests are written, build the indexed sidecars in one shot::
+
+    python scripts/dataloading/build_indexes.py path/to/input_cfg.yaml
+
+See :ref:`indexed-resumable-dataloading` for the resumable side.
+
+.. _lhotse-storage-backends:
+
+Storage backends: local, object store, AIStore
+----------------------------------------------
+
+Every input path the dataloader reads goes through Lhotse's ``open_best``,
+which routes file paths and URIs to the right backend automatically:
+
+* **Local files** — paths like ``/data/...`` work out of the box, no
+  configuration needed.
+* **Generic object stores via ``smart_open``** — ``s3://``, ``gs://``,
+  ``http://``, ``https://`` URIs work after ``pip install smart_open``.
+  Authentication uses the underlying SDK's defaults (e.g. AWS env vars).
+* **AIStore** — ``ais://bucket/key`` URIs work after ``pip install aistore``
+  and ``export AIS_ENDPOINT=http://...``. Optional tuning env vars
+  ``AIS_CONNECT_TIMEOUT`` and ``AIS_READ_TIMEOUT`` are honored by the SDK.
+
+The same routing applies to ``.idx`` sidecars: by default they are read and
+written next to the data file, so the backend must accept writes there. Otherwise
+pre-build and upload them, or redirect them with ``indexes_root`` (:ref:`lhotse-indexes-root`).
+
+AIStore GetBatch (separate optimization)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For tarred multimodal-conversation manifests, NeMo also supports AIStore's
+batched object-fetch API (``GetBatch``) via ``USE_AIS_GET_BATCH=true``,
+which issues one batched fetch per minibatch instead of per-cut tar reads.
+This is independent of using AIStore as a generic backend — see
+:doc:`speechlm2/datasets` for the speech-LM-specific details, including
+how it composes with ``indexed: true``.
+
+.. _lhotse-extension-hooks:
+
+Registering a custom format
+---------------------------
+
+Adding a new ``type:`` to the ``input_cfg`` registry is one decorator and
+one function:
+
+.. code-block:: python
+
+    from nemo.collections.common.data.lhotse.cutset import data_type_parser
+    from lhotse import CutSet
+
+    @data_type_parser("my_format")
+    def read_my_format(config) -> tuple[CutSet, bool]:
+        cuts = CutSet(MyAdapter(path=config.path, ...))
+        is_tarred = True  # True ⇒ IterableDataset path; False ⇒ map-style
+        return cuts, is_tarred
+
+The parser must accept arbitrary keys: ``read_dataset_config`` cascades
+options like ``indexed``, ``shard_seed``, ``metadata_only``,
+``force_finite``, ``audio_locator_tag`` from the top of the YAML down into
+every entry via ``propagate_attrs``. Missing keys should fall back to
+sensible defaults via ``config.get(...)``.
+
+To make ``MyAdapter`` participate in the indexed/resumable path
+(:ref:`indexed-resumable-dataloading`), implement Lhotse's
+:class:`~lhotse.lazy.IteratorNode` contract — see
+`indexed manifests guide <https://lhotse.readthedocs.io/en/latest/indexed-manifests.html>`_
+for the requirements.
+
+Common pitfalls
+---------------
+
+The most common foot-guns when standing up a NeMo Lhotse recipe:
+
+1. **Forgetting** ``trainer.use_distributed_sampler=false``. NeMo's Lhotse
+   integration handles distributed sampling itself; leaving Lightning's
+   default on causes silent batch duplication across DP ranks.
+
+2. **No** ``max_steps`` **with tarred / Shar data.** Tarred sources are
+   infinite by design, so without ``trainer.max_steps`` (and
+   ``limit_train_batches`` for the periodic validation cadence) training
+   never completes the first "epoch". Always set both.
+
+3. **Compressed inputs cannot be indexed.** ``.jsonl.gz`` and ``.tar.gz``
+   work for streaming, but ``indexed: true`` requires uncompressed,
+   seekable files. Re-extract or re-write before building ``.idx``.
+
+4. **Mismatched** ``num_workers`` / ``world_size`` **on resume.** Exact
+   per-worker resume with ``StatefulDataLoader`` requires both to match
+   between save and restore. Replay-based restore with the regular
+   ``DataLoader`` is more lenient.
+
+5. ``indexed: true`` **is incompatible with** ``extra_fields`` **and**
+   ``slice_length`` on ``nemo`` / ``nemo_tarred``. Both expand or rewrite
+   cuts in a way that has no stable index. Pre-process the manifest
+   offline if you need them in an indexed pipeline.
+
+6. ``shard_seed: "trng"`` **deadlocks under TP/PP.** Tensor- and pipeline-
+   parallel ranks must see the same shard order, but ``"trng"`` draws an
+   independent seed per worker. Use ``shard_seed: "randomized"`` whenever
+   you have model parallelism on top of DDP.
+
+7. **Missing** ``force_finite: true`` **on validation.** Validation configs
+   that reuse training infrastructure inherit the infinite-mux behavior;
+   without ``force_finite: true`` the validation loop never terminates.
+
+.. _lhotse-config-reference:
+
+``LhotseDataLoadingConfig`` field reference
+-------------------------------------------
+
+The complete option schema lives in ``LhotseDataLoadingConfig``
+(``nemo/collections/common/data/lhotse/dataloader.py``). It carries ~80
+fields; the categorization below mirrors the source order and groups
+options by what they control.
+
+**Inputs.** ``input_cfg``, ``manifest_filepath``,
+``tarred_audio_filepaths``, ``cuts_path``, ``shar_path``,
+``skip_missing_manifest_entries``.
+
+**Sampling — basic.** ``batch_size``, ``batch_duration``,
+``quadratic_duration``, ``min_duration``, ``max_duration``, ``min_tps``,
+``max_tps``.
+
+**Sampling — bucketing.** ``use_bucketing``, ``num_buckets``,
+``bucket_duration_bins``, ``bucket_batch_size``, ``bucket_buffer_size``,
+``num_cuts_for_bins_estimate``, ``concurrent_bucketing``.
+
+**Sampling — multimodal.** ``use_multimodal_sampling``, ``prompt_format``,
+``pretokenize``, ``audio_locator_tag``, ``token_equivalent_duration``,
+``batch_tokens``, ``quadratic_factor``, ``min_tokens``, ``max_tokens``,
+``min_tpt``, ``max_tpt``, ``measure_total_length``.
+
+**Sampling — fusion (multi-config).** ``multi_config``, ``sampler_fusion``,
+``sampler_weights``.
+
+**Indexed / resumable.** ``indexed``, ``use_stateful_dataloader``,
+``indexes_root``. See :ref:`indexed-resumable-dataloading` and
+:ref:`lhotse-indexes-root`.
+
+**Mixing & weighting.** ``reweight_temperature``, ``max_open_streams``.
+
+**I/O & distributed.** ``num_workers``, ``pin_memory``, ``shard_seed``,
+``seed``, ``shuffle``, ``shuffle_buffer_size``, ``drop_last``,
+``force_finite``, ``force_map_dataset``, ``force_iterable_dataset``,
+``metadata_only``, ``cuda_expandable_segments``.
+
+**On-the-fly augmentation.**
+
+* Speed/RIR — ``perturb_speed``, ``rir_enabled``, ``rir_path``, ``rir_prob``.
+* Noise — ``noise_path``, ``noise_snr``, ``noise_mix_prob``.
+* Lowpass — ``lowpass_enabled``, ``lowpass_frequencies_interval``,
+  ``lowpass_prob``.
+* Compression — ``compression_enabled``, ``compression_prob``,
+  ``compression_level_interval``, ``compression_codecs``,
+  ``compression_codec_weights``, ``compression_enable_for_custom_fields``.
+* Clipping — ``clipping_enabled``, ``clipping_gain_db``,
+  ``clipping_normalize``, ``clipping_oversampling``, ``clipping_prob``,
+  ``clipping_prob_hard``.
+* Concatenation — ``concatenate_samples``, ``concatenate_gap_seconds``,
+  ``concatenate_duration_factor``, ``concatenate_merge_supervisions``,
+  ``db_norm``.
+
+**Cut transforms.** ``truncate_duration``, ``truncate_offset_type``,
+``cut_into_windows_duration``, ``cut_into_windows_hop``,
+``pad_min_duration``, ``pad_direction``, ``cut_text_into_windows_tokens``,
+``keep_excessive_supervisions``.
+
+**Field-name overrides.** ``text_field``, ``lang_field``,
+``channel_selector``, ``sample_rate``.
+
+**Filtering.** ``max_cer``, ``min_context_speaker_similarity``, ``keep``.
+
+For exact types and defaults, see the dataclass definition in the source
+file — it is the single source of truth.
+
+See also
+--------
+
+* :doc:`speechlm2/datasets` — speech-LM-specific data classes, AIStore
+  GetBatch with indexed mode, and the SpeechLM ``DataModule`` resume
+  contract.
+* :doc:`asr/datasets` — ASR-specific data preparation conventions.
+* :doc:`audio/datasets` — audio (codec, enhancement) data flows.
+* `Lhotse PyTorch Datasets <https://lhotse.readthedocs.io/en/latest/datasets.html>`_
+  — upstream sampler API, ``StatefulDataLoader`` integration, custom RNG
+  state in batch transforms.
+* `Lhotse indexed manifests <https://lhotse.readthedocs.io/en/latest/indexed-manifests.html>`_
+  — the iterator-graph contract that makes O(1) restore work.
diff --git a/docs/source/speechlm2/datasets.rst b/docs/source/speechlm2/datasets.rst
index 2006fbc59cc6..505d0c7ce5b1 100644
--- a/docs/source/speechlm2/datasets.rst
+++ b/docs/source/speechlm2/datasets.rst
@@ -4,6 +4,16 @@ Datasets
 The speechlm2 collection supports datasets that contain both audio and text data for training models that can understand speech and generate appropriate responses.
 This section describes the dataset format, preparation, and usage with the speechlm2 models.
 
+.. seealso::
+
+   :doc:`/dataloaders` is the canonical reference for the underlying Lhotse
+   dataloader: ``input_cfg`` shape, supported formats, sampling/bucketing
+   options, indexed manifests + resumable dataloading, and
+   ``LhotseDataLoadingConfig`` field schema. The page below covers what's
+   speech-LM-specific on top of that — datamodule resume contract,
+   AIStore GetBatch, conversation type semantics in the SALM/duplex
+   recipes.
+
 Dataset Format
 --------------
 
@@ -228,6 +238,27 @@ When enabled:
 
 Leave the env var unset to keep the original tar-iterating loader.
 
+Combining with ``indexed: true``
+""""""""""""""""""""""""""""""""
+
+``USE_AIS_GET_BATCH=true`` coexists with ``indexed: true`` on
+``LazyNeMoTarredIterator`` (and on the multimodal-conversation adapters).
+Indexed mode keeps the JSONL-driven O(1) global indexing and graph-token
+checkpointing, while AIStore GetBatch handles the actual audio fetch:
+
+* The audio-tar ``.idx`` sidecar is **not** required when GetBatch is enabled
+  — the iterator skips opening tar files entirely and emits URL-backed cuts
+  whose ``AudioSource`` points at ``{tar_path}/{audio_filename}``
+  (``type="url"`` for ``ais://...`` paths, ``type="file"`` otherwise).
+* Manifest JSONLs still need their ``.idx`` sidecars; they drive the indexed
+  iterator graph and the ``state_dict`` / ``load_state_dict`` round-trip.
+* Audio bytes are fetched lazily by ``AudioSamples(use_batch_loader=True)`` at
+  collation time, which issues one batched GetBatch request per minibatch.
+
+Use this combination when shards live on AIStore and you want both the
+network efficiency of GetBatch and the exact-resume guarantees of the
+indexed/stateful pipeline.
+
 DuplexSTTDataset
 ****************
 
@@ -261,9 +292,43 @@ The DataModule class in the speechlm2 collection manages dataset loading, prepar
     )
 
 The DataModule takes care of:
+
 1. Setting up proper data parallel ranks for dataloaders
 2. Instantiating the dataloaders with configuration from YAML
 3. Managing multiple datasets for validation/testing
+4. Persisting the train dataloader's iterator state across checkpoints
+   (when ``use_stateful_dataloader: true``)
+
+Checkpointed / resumable training
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The DataModule caches the train dataloader on first ``train_dataloader()``
+call and exposes ``state_dict()`` / ``load_state_dict()`` that delegate to the
+cached dataloader when it supports them. Lightning's trainer wires those into
+every checkpoint automatically, so an experiment configured with::
+
+    data:
+      train_ds:
+        indexed: true
+        use_stateful_dataloader: true
+        ...
+
+resumes O(1) — sampler RNG, bucketer state, multiplexer choice RNG,
+per-source iterator cursors, and per-worker prefetch queues are all restored
+exactly without replay.
+
+With a regular ``DataLoader`` (``use_stateful_dataloader`` unset or
+``False``) ``state_dict``/``load_state_dict`` become no-ops and resume falls
+back to Lhotse's ``_fast_forward()`` replay path.
+
+Two constraints to keep in mind across save/restore:
+
+* ``num_workers`` and ``world_size`` must match between save and restore
+  (a hard requirement of ``StatefulDataLoader``).
+* Indexed data files must be **uncompressed** and accompanied by ``.idx``
+  sidecars (audio tars are exempt when AIStore GetBatch is enabled). Build them
+  with ``scripts/dataloading/build_indexes.py`` (see
+  :ref:`indexed-resumable-dataloading` in the main Lhotse dataloading guide).
 
 Bucketing for Efficient Training
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/examples/speechlm2/conf/salm_automodel_pee.yaml b/examples/speechlm2/conf/salm_automodel_pee.yaml
index faceb4a9ddb1..7de36512c08a 100644
--- a/examples/speechlm2/conf/salm_automodel_pee.yaml
+++ b/examples/speechlm2/conf/salm_automodel_pee.yaml
@@ -192,7 +192,7 @@ trainer:
     # ``activation_checkpointing_llm`` is a single switch that covers both the
     # non-EP FSDP2 path (via FSDP2Config.activation_checkpointing) and the
     # EP/MoE parallelizer path.
-    # ``activation_checkpointing_perception`` wraps each layer in ``perception.encoder.layers`` 
+    # ``activation_checkpointing_perception`` wraps each layer in ``perception.encoder.layers``
     # with ``checkpoint_wrapper`` before FSDP2 sharding.
     activation_checkpointing_llm: false
     activation_checkpointing_perception: false
diff --git a/examples/speechlm2/salm_train.py b/examples/speechlm2/salm_train.py
index 02a56e1f2f44..eb6da6286384 100644
--- a/examples/speechlm2/salm_train.py
+++ b/examples/speechlm2/salm_train.py
@@ -19,6 +19,7 @@
 
 from nemo.collections.speechlm2 import SALM, DataModule, SALMDataset
 from nemo.core.config import hydra_runner
+from nemo.utils.callbacks.training_stats import TrainingStatsCallback
 from nemo.utils.exp_manager import exp_manager
 from nemo.utils.trainer_utils import resolve_trainer_cfg
 
@@ -35,6 +36,11 @@ def train(cfg):
     torch.set_float32_matmul_precision("medium")
     trainer = Trainer(**resolve_trainer_cfg(cfg.trainer))
     log_dir = exp_manager(trainer, cfg.get("exp_manager", None))
+    # Insert at position 0 so our ``on_train_batch_end`` runs BEFORE the
+    # StatelessTimer's hook (which can trigger a checkpoint save mid-
+    # batch-end). Without this, the saved ``state_dict`` would lag the
+    # accumulators by one batch on every wall-time-induced save.
+    trainer.callbacks.insert(0, TrainingStatsCallback())
     OmegaConf.save(cfg, log_dir / "exp_config.yaml")
 
     model_cls = SALM
@@ -51,6 +57,9 @@ def train(cfg):
 
     trainer.fit(model, datamodule)
 
+    if torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
+
 
 if __name__ == "__main__":
     train()
diff --git a/nemo/collections/asr/data/audio_to_text_lhotse.py b/nemo/collections/asr/data/audio_to_text_lhotse.py
index 46c301be0822..ec9b92045bb5 100644
--- a/nemo/collections/asr/data/audio_to_text_lhotse.py
+++ b/nemo/collections/asr/data/audio_to_text_lhotse.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import os
 from typing import Dict, Optional, Tuple
 
@@ -53,20 +54,12 @@ def __init__(self, tokenizer: TokenizerSpec, return_cuts: bool = False):
         super().__init__()
         self.tokenizer = TokenizerWrapper(tokenizer)
         self.use_ais_get_batch = os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true"
+        self.ais_force_individual = os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true"
 
-        # Try to use use_batch_loader if available (Lhotse >= 1.32.0)
-        try:
-            self.load_audio = AudioSamples(fault_tolerant=True, use_batch_loader=self.use_ais_get_batch)
-        except TypeError:
-            # Lhotse < 1.32.0 doesn't support use_batch_loader
-            if self.use_ais_get_batch:
-                import logging
-
-                logging.warning(
-                    "AIS batch loading requested but not supported by this Lhotse version. "
-                    "Please upgrade to Lhotse >= 1.32.0"
-                )
-            self.load_audio = AudioSamples(fault_tolerant=True)
+        self.load_audio = _make_audio_samples(
+            use_batch_loader=self.use_ais_get_batch,
+            ais_force_individual=self.ais_force_individual,
+        )
 
         self.return_cuts = return_cuts
 
@@ -87,3 +80,56 @@ def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]:
         if self.return_cuts:
             return audio, audio_lens, tokens, token_lens, cuts.drop_in_memory_data()
         return audio, audio_lens, tokens, token_lens
+
+
+def _make_audio_samples(use_batch_loader: bool, ais_force_individual: bool) -> AudioSamples:
+    """Build a fault-tolerant ``AudioSamples`` across Lhotse versions.
+
+    The constructor is first called with both ``use_batch_loader`` and
+    ``ais_force_individual``. If it raises a ``TypeError`` whose message names
+    ``ais_force_individual``, that keyword is dropped and the call is retried.
+    If the remaining ``TypeError`` names ``use_batch_loader``, a plain
+    ``AudioSamples(fault_tolerant=True)`` is returned instead.
+
+    Args:
+        use_batch_loader: forwarded to ``AudioSamples(use_batch_loader=...)``.
+        ais_force_individual: forwarded to ``AudioSamples(ais_force_individual=...)``.
+
+    Returns:
+        The constructed ``AudioSamples`` instance.
+
+    Raises:
+        TypeError: if construction fails for a reason other than the two optional keywords.
+    """
+    kwargs = {
+        "fault_tolerant": True,
+        "use_batch_loader": use_batch_loader,
+        "ais_force_individual": ais_force_individual,
+    }
+    try:
+        return AudioSamples(**kwargs)
+    except TypeError as exc:
+        # Feature detection relies on the TypeError message naming the rejected keyword.
+        if "ais_force_individual" in str(exc):
+            kwargs.pop("ais_force_individual")
+            if ais_force_individual:
+                logging.warning(
+                    "AIS individual GETs requested but not supported by this Lhotse version; "
+                    "the setting is ignored."
+                )
+            try:
+                return AudioSamples(**kwargs)
+            except TypeError as retry_exc:
+                exc = retry_exc
+
+        if "use_batch_loader" not in str(exc):
+            # Raise the error that was just inspected; a bare ``raise`` would resurface the
+            # first (already handled) failure even when the retry failed differently.
+            raise exc
+
+        if use_batch_loader:
+            logging.warning(
+                "AIS batch loading requested but not supported by this Lhotse version. "
+                "Please upgrade to Lhotse >= 1.32.0"
+            )
+        return AudioSamples(fault_tolerant=True)
diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
index a3510a78836a..a4689fd177b4 100644
--- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
+++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
@@ -18,9 +18,9 @@
 import torch.utils.data
 from lhotse import CutSet
 from lhotse.cut import MixedCut
-from lhotse.dataset import AudioSamples
 from lhotse.dataset.collation import collate_vectors
 
+from nemo.collections.asr.data.audio_to_text_lhotse import _make_audio_samples
 from nemo.collections.common.data import apply_prompt_format_fn
 from nemo.collections.common.prompts import PromptFormatter
 from nemo.collections.common.tokenizers import TokenizerSpec
@@ -83,20 +83,12 @@ def __init__(
         super().__init__()
         self.tokenizer = tokenizer
         self.use_ais_get_batch = os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true"
+        self.ais_force_individual = os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true"
 
-        # Try to use use_batch_loader if available (Lhotse >= 1.32.0)
-        try:
-            self.load_audio = AudioSamples(fault_tolerant=True, use_batch_loader=self.use_ais_get_batch)
-        except TypeError:
-            # Lhotse < 1.32.0 doesn't support use_batch_loader
-            if self.use_ais_get_batch:
-                import logging
-
-                logging.warning(
-                    "AIS batch loading requested but not supported by this Lhotse version. "
-                    "Please upgrade to Lhotse >= 1.32.0"
-                )
-            self.load_audio = AudioSamples(fault_tolerant=True)
+        self.load_audio = _make_audio_samples(
+            use_batch_loader=self.use_ais_get_batch,
+            ais_force_individual=self.ais_force_individual,
+        )
 
         self.padding_value = self.tokenizer.pad_id
         self.prompt = prompt
diff --git a/nemo/collections/common/callbacks/ema.py b/nemo/collections/common/callbacks/ema.py
index dee125be54ef..7dbf08267ef5 100644
--- a/nemo/collections/common/callbacks/ema.py
+++ b/nemo/collections/common/callbacks/ema.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# pylint: disable=C0116
 import contextlib
 import copy
 import os
@@ -135,7 +136,7 @@ def on_load_checkpoint(
                 return
             ema_path = ckpt_path.replace(ext, f'-EMA{ext}')
             if os.path.exists(ema_path):
-                ema_state_dict = torch.load(ema_path, map_location=torch.device('cpu'))
+                ema_state_dict = torch.load(ema_path, map_location=torch.device('cpu'), weights_only=False)
 
                 checkpoint['optimizer_states'] = ema_state_dict['optimizer_states']
                 del ema_state_dict
diff --git a/nemo/collections/common/data/lhotse/_compat.py b/nemo/collections/common/data/lhotse/_compat.py
new file mode 100644
index 000000000000..12f5c3d78360
--- /dev/null
+++ b/nemo/collections/common/data/lhotse/_compat.py
@@ -0,0 +1,195 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=unused-import
+"""Compatibility shims for optional Lhotse indexed/resumable dataloading APIs.
+
+This module lets NeMo import with released Lhotse versions that do not expose
+those APIs yet, while delegating to the real implementations when a resumable
+Lhotse checkout is available.
+"""
+import os
+from collections.abc import Generator, Iterable
+from typing import Any
+
+import torch
+from torch import distributed as dist
+
+__all__ = [
+    "GraphOriginDict",
+    "IteratorNode",
+    "LazyIndexedManifestIterator",
+    "PartitionedIndexedIterator",
+    "attach_graph_origin",
+    "normalize_graph_token",
+]
+
+try:
+    from lhotse.dataset import dataloading as _lhotse_dataloading
+
+    PartitionedIndexedIterator = _lhotse_dataloading.PartitionedIndexedIterator
+except (ImportError, AttributeError):
+    LHOTSE_USE_WORKER_PARTITION = "LHOTSE_USE_WORKER_PARTITION"
+
+    def _get_world_size() -> int:
+        """Return the world size: ``WORLD_SIZE`` env var, else ``torch.distributed``, else 1."""
+        from_env = os.environ.get("WORLD_SIZE")
+        if from_env is not None:
+            return int(from_env)
+        distributed_ready = dist.is_available() and dist.is_initialized()
+        return dist.get_world_size() if distributed_ready else 1
+
+    def _get_rank() -> int:
+        """Return the process rank: ``RANK`` env var, else ``torch.distributed``, else 0."""
+        from_env = os.environ.get("RANK")
+        if from_env is not None:
+            return int(from_env)
+        distributed_ready = dist.is_available() and dist.is_initialized()
+        return dist.get_rank() if distributed_ready else 0
+
+    def _get_worker_partition() -> tuple[int, int]:
+        """Return ``(shard_id, num_shards)`` for the current process / dataloader worker.
+
+        Partitioning is opt-in: unless the ``LHOTSE_USE_WORKER_PARTITION`` env var equals
+        ``"1"``, the single-shard answer ``(0, 1)`` is returned. Otherwise each
+        (rank, worker) pair gets its own shard: ``shard_id = rank * num_workers + worker_id``
+        out of ``world_size * num_workers`` shards.
+        """
+        partitioning_enabled = os.environ.get(LHOTSE_USE_WORKER_PARTITION) == "1"
+        if not partitioning_enabled:
+            return 0, 1
+        rank, world_size = _get_rank(), _get_world_size()
+        info = torch.utils.data.get_worker_info()
+        # Outside a dataloader worker process there is no worker info: act as worker 0 of 1.
+        worker_id = 0 if info is None else info.id
+        num_workers = 1 if info is None else max(info.num_workers, 1)
+        shard_id = rank * num_workers + worker_id
+        return shard_id, world_size * num_workers
+
+    class PartitionedIndexedIterator:
+        """Fallback index iterator over one shard of a dataset of known length.
+
+        The shard is chosen by ``_get_worker_partition()`` each time ``iterate`` is
+        called. The iterator tracks how many indices it has yielded so that the
+        position can be saved with ``state_dict`` and resumed with ``load_state_dict``.
+        """
+
+        def __init__(self, shuffle: bool = False, seed: int = 0) -> None:
+            """Store the iteration options; no partitioning happens until ``iterate``.
+
+            Args:
+                shuffle: when true, indices are drawn through ``lhotse.indexing.LazyShuffledRange``.
+                seed: seed passed to ``LazyShuffledRange`` (only used when ``shuffle`` is true).
+            """
+            self._shuffle = shuffle
+            self._seed = seed
+            # Number of indices already yielded in the current pass over the shard.
+            self._position = 0
+            # Partition seen by the last ``iterate`` call (or restored from a checkpoint).
+            self._shard_id: int | None = None
+            self._num_shards: int | None = None
+            # Set by ``load_state_dict``; consumed (reset) by the next ``iterate`` call.
+            self._restored = False
+            # Live shuffled range (shuffle mode only) and its not-yet-applied saved state.
+            self._range = None
+            self._pending_range_state = None
+
+        @property
+        def position(self) -> int:
+            """Number of indices yielded so far in the current pass."""
+            return self._position
+
+        def iterate(self, total_len: int) -> Generator[int, None, None]:
+            """Yield this shard's indices into a collection of ``total_len`` items.
+
+            Without shuffling, the shard is the strided sequence
+            ``shard_id, shard_id + num_shards, ...`` below ``total_len``. With shuffling,
+            indices come from a ``LazyShuffledRange`` built for this shard.
+
+            If ``load_state_dict`` was called since the previous pass, iteration resumes
+            from the restored position; otherwise it starts from the beginning.
+
+            Raises:
+                ValueError: on resume, when the saved ``(shard_id, num_shards)`` differs from
+                    the partition computed for the current process.
+            """
+            shard_id, num_shards = _get_worker_partition()
+
+            if self._restored:
+                # A restore applies to exactly one pass; later passes start from zero.
+                self._restored = False
+                if self._num_shards is not None and (self._shard_id != shard_id or self._num_shards != num_shards):
+                    raise ValueError(
+                        f"PartitionedIndexedIterator topology mismatch on resume: "
+                        f"saved (shard_id={self._shard_id}, num_shards={self._num_shards}), "
+                        f"current (shard_id={shard_id}, num_shards={num_shards})."
+                    )
+                start = self._position
+            else:
+                start = 0
+                self._position = 0
+
+            self._shard_id = shard_id
+            self._num_shards = num_shards
+
+            if self._shuffle:
+                # Imported lazily: only needed (and only required to exist) in shuffle mode.
+                from lhotse.indexing import LazyShuffledRange
+
+                self._range = LazyShuffledRange(total_len, seed=self._seed, shard_id=shard_id, num_shards=num_shards)
+                if self._pending_range_state is not None:
+                    # Apply the range state saved by ``load_state_dict`` now that the range exists.
+                    self._range.load_state_dict(self._pending_range_state)
+                    self._pending_range_state = None
+                shard_len = len(self._range)
+            else:
+                self._range = None
+                # Count of indices ``shard_id + k * num_shards`` that are < total_len (ceil division).
+                shard_len = (total_len - shard_id + num_shards - 1) // num_shards if total_len > shard_id else 0
+
+            for i in range(start, shard_len):
+                # Advance the position before yielding so a state saved after receiving
+                # this item resumes at the following one.
+                self._position = i + 1
+                yield self._range[i] if self._range is not None else shard_id + i * num_shards
+
+        def state_dict(self) -> dict:
+            """Return the position, the partition, and (in shuffle mode) the range state.
+
+            The ``"range"`` key is present only when a live range exists or a restored
+            range state has not been applied yet.
+            """
+            sd = {
+                "position": self._position,
+                "shard_id": self._shard_id,
+                "num_shards": self._num_shards,
+            }
+            if self._range is not None:
+                sd["range"] = self._range.state_dict()
+            elif self._pending_range_state is not None:
+                # Restored but not iterated yet: pass the saved range state through unchanged.
+                sd["range"] = self._pending_range_state
+            return sd
+
+        def load_state_dict(self, sd: dict) -> None:
+            """Restore from ``sd``; the restored position is used by the next ``iterate`` call.
+
+            Missing keys fall back to position 0 and an unknown (``None``) partition,
+            in which case the topology check in ``iterate`` is skipped.
+            """
+            self._position = sd.get("position", 0)
+            self._shard_id = sd.get("shard_id")
+            self._num_shards = sd.get("num_shards")
+            if self._shuffle:
+                # The range is rebuilt inside ``iterate``; keep its state until then.
+                self._pending_range_state = sd.get("range")
+                self._range = None
+            self._restored = True
+
+
+try:
+    from lhotse import lazy as _lhotse_lazy
+
+    GraphOriginDict = _lhotse_lazy.GraphOriginDict
+    IteratorNode = _lhotse_lazy.IteratorNode
+    LazyIndexedManifestIterator = _lhotse_lazy.LazyIndexedManifestIterator
+    attach_graph_origin = _lhotse_lazy.attach_graph_origin
+    normalize_graph_token = _lhotse_lazy.normalize_graph_token
+except (ImportError, AttributeError):
+
+    class IteratorNode(Iterable):
+        """Fallback base class for iterator-graph nodes.
+
+        By default a node advertises no capabilities (not checkpointable, not indexed,
+        no constant-time access), and the state methods raise ``NotImplementedError``.
+        """
+
+        is_checkpointable = False
+        is_indexed = False
+        has_constant_time_access = False
+
+        def state_dict(self) -> dict:
+            """Not supported by default; always raises ``NotImplementedError``."""
+            raise NotImplementedError(f"{type(self).__name__} is not checkpointable.")
+
+        def load_state_dict(self, sd: dict) -> None:
+            """Not supported by default; always raises ``NotImplementedError``."""
+            raise NotImplementedError(f"{type(self).__name__} is not checkpointable.")
+
+        def iter_children(self):
+            """Yield the ``source`` attribute (if any), then each item of ``sources`` (if any)."""
+            absent = object()
+            single = getattr(self, "source", absent)
+            if single is not absent:
+                yield single
+            many = getattr(self, "sources", absent)
+            if many is not absent:
+                yield from many
+
+    class GraphOriginDict(dict):
+        """Fallback ``dict`` subclass that declares a single extra slot, ``_graph_origin``."""
+
+        # A plain ``dict`` cannot carry extra attributes; the slot gives instances room
+        # for one named ``_graph_origin`` (the attribute ``attach_graph_origin`` sets).
+        __slots__ = ("_graph_origin",)
+
+    def normalize_graph_token(token: Any) -> Any:
+        """Recursively turn lists and tuples inside ``token`` into plain tuples.
+
+        Any value that is neither a list nor a tuple is returned unchanged.
+        """
+        if not isinstance(token, (list, tuple)):
+            return token
+        return tuple(map(normalize_graph_token, token))
+
+    def attach_graph_origin(item: Any, token: Any) -> Any:
+        """Best-effort: store ``token`` on ``item`` as ``_graph_origin`` and return ``item``.
+
+        ``object.__setattr__`` is tried first, then the regular ``setattr``. If both
+        fail, ``item`` is returned without the attribute.
+        """
+        for assign in (object.__setattr__, setattr):
+            try:
+                assign(item, "_graph_origin", token)
+            except Exception:
+                continue
+            return item
+        # Immutable extension objects may not accept ad-hoc metadata.
+        return item
+
+    class LazyIndexedManifestIterator(IteratorNode):
+        """Placeholder used when Lhotse lacks indexed/resumable dataloading support.
+
+        Constructing it always raises ``ImportError`` with an explanatory message.
+        """
+
+        _UNAVAILABLE_MESSAGE = (
+            "LazyIndexedManifestIterator requires a Lhotse version with indexed/resumable dataloading support."
+        )
+
+        def __init__(self, *args, **kwargs) -> None:
+            raise ImportError(self._UNAVAILABLE_MESSAGE)
+
+        def __iter__(self):
+            # ``IteratorNode`` derives from ``collections.abc.Iterable``, whose ``__iter__`` is
+            # abstract. Without a concrete ``__iter__`` here, instantiation would fail with
+            # "TypeError: Can't instantiate abstract class" before ``__init__`` could raise
+            # the intended ImportError.
+            raise ImportError(self._UNAVAILABLE_MESSAGE)
diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py
index 8e613091b7e1..921dad55ea6e 100644
--- a/nemo/collections/common/data/lhotse/cutset.py
+++ b/nemo/collections/common/data/lhotse/cutset.py
@@ -51,6 +51,7 @@
     NeMoMultimodalConversationShareGPTJsonlAdapter,
     NeMoMultimodalConversationShareGPTWebdatasetAdapter,
     NeMoSFTJsonlAdapter,
+    NemotronTextConversationAdapter,
     TextTurn,
 )
 from nemo.collections.common.parts.preprocessing.manifest import get_full_path
@@ -285,6 +286,8 @@ def read_dataset_config(config) -> tuple[CutSet, bool]:
         "force_map_dataset": config.get("force_map_dataset", False),
         "force_iterable_dataset": config.get("force_iterable_dataset", False),
         "slice_length": config.get("slice_length", None),
+        "indexed": config.get("indexed", False),
+        "indexes_root": config.get("indexes_root", None),
         # Temperature for re-weighting datasets. 1 is a neutral value. Lower temperature over-samples smaller datasets, and vice versa.
         "reweight_temperature": config.get("reweight_temperature", None),
     }
@@ -348,6 +351,8 @@ def read_txt_jsonl_paths(config: DictConfig) -> tuple[CutSet, bool]:
             text_field=config.text_field,
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
+            indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
         )
     )
     if not config.get("force_finite", False):
@@ -384,6 +389,25 @@ def read_nemo_sft_jsonl(config: DictConfig) -> tuple[CutSet, bool]:
             language=config.get("language"),
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
+            indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
+        )
+    )
+    if not config.get("force_finite", False):
+        cuts = cuts.repeat(preserve_id=True)
+    return cuts, True
+
+
+@data_type_parser("nemotron_text_converation")
+def read_nemotron_text_converation(config: DictConfig) -> tuple[CutSet, bool]:
+    """Read Nemotron/Energon text-only conversation JSONL files or tar directories."""
+    cuts = CutSet(
+        NemotronTextConversationAdapter(
+            paths=config.paths,
+            shuffle_shards=config.shuffle,
+            shard_seed=config.shard_seed,
+            indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
         )
     )
     if not config.get("force_finite", False):
@@ -405,6 +429,8 @@ def read_multimodal_conversation_jsonl(config: DictConfig) -> tuple[CutSet, bool
             system_prompt=config.get("tags", {}).get("system_prompt"),
             context=config.get("tags", {}).get("context"),
             slice_length=config.get("slice_length"),
+            indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
         )
     )
     if not config.get("force_finite", False):
@@ -426,6 +452,9 @@ def read_share_gpt_as_conversation(config) -> tuple[CutSet, bool]:
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
             slice_length=config.get("slice_length"),
+            indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
+            skip_missing_manifest_entries=config.get("skip_missing_manifest_entries", False),
         )
     )
     if not config.get("force_finite", False):
@@ -444,6 +473,8 @@ def read_share_gpt_webdataset_as_conversation(config) -> tuple[CutSet, bool]:
             token_equivalent_duration=config.get("token_equivalent_duration"),
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
+            indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
         )
     )
     # When force_finite is False (default), repeat the dataset infinitely so that
@@ -640,6 +671,8 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
         shard_seed = config.get("shard_seed", "trng")
         metadata_only = config.get("metadata_only", False)
         force_finite = config.get("force_finite", False)
+        indexed = config.get("indexed", False)
+        indexes_root = config.get("indexes_root", None)
         if config.get("cuts_path") is not None:
             warnings.warn("Note: lhotse.cuts_path will be ignored because lhotse.shar_path was provided.")
         if isinstance(config.shar_path, (str, Path)):
@@ -648,6 +681,8 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
                 shuffle_shards=True,
                 seed=shard_seed,
                 slice_length=config.get("slice_length", None),
+                indexed=indexed,
+                indexes_root=indexes_root,
             )
             if not metadata_only and not force_finite:
                 cuts = cuts.repeat(preserve_id=True)
@@ -664,6 +699,8 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
                         shuffle_shards=True,
                         seed=shard_seed,
                         slice_length=config.get("slice_length", None),
+                        indexed=indexed,
+                        indexes_root=indexes_root,
                     )
                     weight = len(cs)
                 else:
@@ -679,6 +716,8 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
                         shuffle_shards=True,
                         seed=shard_seed,
                         slice_length=config.get("slice_length", None),
+                        indexed=indexed,
+                        indexes_root=indexes_root,
                     )
                 cutsets.append(cs)
                 weights.append(weight)
@@ -703,6 +742,8 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
                 shuffle_shards=True,
                 seed=shard_seed,
                 slice_length=config.get("slice_length", None),
+                indexed=indexed,
+                indexes_root=indexes_root,
             )
             if not metadata_only and not force_finite:
                 cuts = cuts.repeat(preserve_id=True)
@@ -715,7 +756,13 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
     else:
         # Regular Lhotse manifest points to individual audio files (like native NeMo manifest).
         path = config.cuts_path
-        cuts = CutSet.from_file(path).map(partial(resolve_relative_paths, manifest_path=path))
+        from lhotse.indexing import index_file_path
+
+        indexes_root = config.get("indexes_root", None)
+        from_file_kwargs = {"indexed": config.get("indexed", None)}
+        if indexes_root is not None:
+            from_file_kwargs["index_path"] = index_file_path(path, indexes_root)
+        cuts = CutSet.from_file(path, **from_file_kwargs).map(partial(resolve_relative_paths, manifest_path=path))
     return cuts, is_tarred
 
 
@@ -749,6 +796,7 @@ def read_parquet_manifest(config: DictConfig) -> tuple[CutSet, bool]:
     # Extract shuffling options (CRITICAL for distributed training)
     shuffle_shards = config.get("shuffle", False)
     shard_seed = config.get("shard_seed", "trng")
+    indexed = config.get("indexed", False)
 
     # 3. Create Iterators for each file
     iterators = []
@@ -761,6 +809,7 @@ def read_parquet_manifest(config: DictConfig) -> tuple[CutSet, bool]:
             duration_field=duration_field,
             lang_field=lang_field,
             sampling_rate=sampling_rate,
+            indexed=indexed,
         )
         iterators.append(adapter)
 
@@ -1459,6 +1508,10 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
                 common_kwargs["shuffle_shards"] = config[key]
             else:
                 common_kwargs[key] = config[key]
+    indexed = config.get("indexed", False)
+    indexes_root = config.get("indexes_root", None)
+    indexed_extra = {"indexes_root": indexes_root} if (indexed and indexes_root is not None) else {}
+    notar_kwargs_extra = {"indexed": indexed, **indexed_extra} if indexed else {}
     # The option below is to allow a special case of NeMo manifest iteration as Lhotse CutSet
     # without performing any I/O. NeMo manifests typically don't have sampling_rate information required by Lhotse,
     # so lhotse has to look up the headers of audio files to fill it on-the-fly.
@@ -1467,7 +1520,11 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
     # and other data statistics.
     metadata_only = config.get("metadata_only", False)
     force_finite = config.get("force_finite", False)
-    notar_kwargs = {"metadata_only": metadata_only}
+    notar_kwargs = {
+        "metadata_only": metadata_only,
+        "skip_missing_manifest_entries": config.get("skip_missing_manifest_entries", False),
+    }
+    tar_kwargs_extra = {"indexed": indexed, **indexed_extra} if indexed else {}
     is_tarred = config.get("tarred_audio_filepaths") is not None
     if isinstance(config.manifest_filepath, (str, Path)):
         if is_tarred and not metadata_only:
@@ -1477,13 +1534,16 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
                     tar_paths=config.tarred_audio_filepaths,
                     skip_missing_manifest_entries=config.get("skip_missing_manifest_entries", False),
                     slice_length=config.get("slice_length", None),
+                    **tar_kwargs_extra,
                     **common_kwargs,
                 )
             )
             if not force_finite:
                 cuts = cuts.repeat(preserve_id=True)
         else:
-            cuts = CutSet(LazyNeMoIterator(config.manifest_filepath, **notar_kwargs, **common_kwargs))
+            cuts = CutSet(
+                LazyNeMoIterator(config.manifest_filepath, **notar_kwargs, **notar_kwargs_extra, **common_kwargs)
+            )
     else:
         # Format option 1:
         #   Assume it's [[path1], [path2], ...] (same for tarred_audio_filepaths).
@@ -1517,10 +1577,11 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
                     tar_paths=tar_path,
                     skip_missing_manifest_entries=config.get("skip_missing_manifest_entries", False),
                     slice_length=config.get("slice_length", None),
+                    **tar_kwargs_extra,
                     **common_kwargs,
                 )
             else:
-                nemo_iter = LazyNeMoIterator(manifest_path, **notar_kwargs, **common_kwargs)
+                nemo_iter = LazyNeMoIterator(manifest_path, **notar_kwargs, **notar_kwargs_extra, **common_kwargs)
             # Then, determine the weight or use one provided
             if isinstance(manifest_info, str) or len(manifest_info) == 1:
                 weight = len(nemo_iter)
diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py
index 5af5f5d004d7..fc7b82439356 100644
--- a/nemo/collections/common/data/lhotse/dataloader.py
+++ b/nemo/collections/common/data/lhotse/dataloader.py
@@ -40,7 +40,7 @@
 from lhotse.dataset.dataloading import resolve_seed
 from lhotse.dataset.sampling.base import CutSampler, SamplingConstraint, TimeConstraint
 from lhotse.lazy import LazyFlattener
-from lhotse.utils import fastcopy, fix_random_seed
+from lhotse.utils import fix_random_seed
 from omegaconf import DictConfig, OmegaConf
 
 from nemo.collections.common.data.lhotse.cutset import (
@@ -254,6 +254,22 @@ class LhotseDataLoadingConfig:
     # The first K examples will actually be read and then discarded, incurring the IO cost, due to
     # our support of object stores and gzipped files that generally don't have indexes of byte offsets per line.
     slice_length: Optional[int] = None
+    # Forwarded to ``CutSet.from_file(path, indexed=...)`` for plain JSONL ``cuts_path`` inputs.
+    # ``None`` = lhotse auto-detect (uses .idx if present, falls back to streaming).
+    # ``True`` = require indexed reads (errors if .idx is missing).
+    # ``False`` = streaming reads only.
+    indexed: Optional[bool] = None
+    # When set, ``.idx`` sidecars are read from a mirror under this root that
+    # preserves the data files' directory structure (URL schemes are stripped,
+    # leading separators dropped). Use this to keep indexes on a fast local
+    # disk while the data lives on shared / object storage. Cascades through
+    # ``read_dataset_config`` to every nested ``input_cfg`` entry.
+    indexes_root: Optional[str] = None
+    # When True, build the dataloader with ``torchdata.stateful_dataloader.StatefulDataLoader``
+    # instead of ``torch.utils.data.DataLoader``. Combined with a checkpointable lhotse sampler
+    # (DynamicBucketingSampler / DynamicCutSampler), this enables exact resume from the next batch
+    # within the current epoch via the standard PyTorch state_dict / load_state_dict protocol.
+    use_stateful_dataloader: bool = False
 
 
 def determine_use_iterable_dataset(use_iterable_dataset: bool, config: DictConfig) -> bool:
@@ -265,12 +281,184 @@ def determine_use_iterable_dataset(use_iterable_dataset: bool, config: DictConfi
     return use_iterable_dataset
 
 
+def _build_dataloader(
+    use_stateful_dataloader: bool,
+    *,
+    dp_rank: Optional[int] = None,
+    dp_world_size: Optional[int] = None,
+    dp_group: Optional[Any] = None,
+    **kwargs,
+) -> torch.utils.data.DataLoader:
+    """
+    Build the dataloader, picking the implementation from the resume settings.
+
+    * ``use_stateful_dataloader`` false: a regular ``torch.utils.data.DataLoader``.
+    * Stateful, and ``dp_world_size`` unset or at most 1: a
+      ``torchdata.stateful_dataloader.StatefulDataLoader``.
+    * Stateful with ``dp_world_size > 1``: a :class:`_PerRankStatefulDataLoader`, which
+      saves one state entry per data-parallel rank and restores each rank from its own
+      entry, so that a single rank's iterator state is never applied to every rank.
+
+    ``dp_rank`` defaults to 0 when omitted. All remaining keyword arguments are
+    forwarded unchanged to the dataloader constructor.
+    """
+    if not use_stateful_dataloader:
+        return torch.utils.data.DataLoader(**kwargs)
+
+    from torchdata.stateful_dataloader import StatefulDataLoader
+
+    is_multi_rank = dp_world_size is not None and dp_world_size > 1
+    if not is_multi_rank:
+        return StatefulDataLoader(**kwargs)
+
+    return _PerRankStatefulDataLoader(
+        dp_rank=0 if dp_rank is None else dp_rank,
+        dp_world_size=dp_world_size,
+        dp_group=dp_group,
+        **kwargs,
+    )
+
+
+class _PerRankStatefulDataLoader:
+    """``StatefulDataLoader`` wrapper whose ``state_dict`` is a per-rank list.
+
+    Why this exists: Lightning's ``FitLoop`` saves dataloader state via
+    ``CombinedLoader._state_dicts()`` -> ``loader.state_dict()``. The call is
+    collective across ranks, but only rank 0's return value is persisted, and on
+    resume ``loader.load_state_dict(state)`` is called on EVERY rank with that
+    single state. With per-shard partitioning (``shard_id = dp_rank * num_workers
+    + worker_id`` inside lhotse's ``PartitionedIndexedIterator``) every non-zero
+    rank would then load rank 0's shard ids, and the iterator's topology check
+    raises ``ValueError`` on the first ``iterate()`` call.
+
+    The fix turns ``state_dict()`` into a per-rank gather and
+    ``load_state_dict(state)`` into a per-rank pick. The serialised payload is
+    ``{"train_dataloader_per_rank": [entry_0, ..., entry_{N-1}]}`` where each
+    entry is ``{"dp_rank", "dp_world_size", "state"}``; on load each rank applies
+    only ``per_rank[self._dp_rank]["state"]``.
+
+    The wrapper contains a ``StatefulDataLoader`` rather than subclassing it,
+    which keeps this constructor limited to the data-parallel arguments. All
+    other attribute lookups are forwarded to the inner loader via
+    ``__getattr__`` so code that introspects the loader keeps working.
+    """
+
+    def __init__(
+        self,
+        *,
+        dp_rank: int,
+        dp_world_size: int,
+        dp_group: Optional[Any] = None,
+        **kwargs,
+    ) -> None:
+        """Wrap a new ``StatefulDataLoader(**kwargs)``.
+
+        Args:
+            dp_rank: this process's data-parallel rank; selects the entry to restore.
+            dp_world_size: number of data-parallel ranks; the expected length of the saved list.
+            dp_group: process group passed to ``all_gather_object`` (``None`` = default group).
+            **kwargs: forwarded unchanged to ``StatefulDataLoader``.
+        """
+        from torchdata.stateful_dataloader import StatefulDataLoader
+
+        self._dp_rank = int(dp_rank)
+        self._dp_world_size = int(dp_world_size)
+        self._dp_group = dp_group
+        self._inner = StatefulDataLoader(**kwargs)
+
+    def state_dict(self) -> dict:
+        """Return ``{"train_dataloader_per_rank": [...]}`` holding one tagged entry per rank.
+
+        When ``torch.distributed`` is initialised and ``dp_world_size > 1`` this is a
+        collective call (``all_gather_object``) and must be made on every rank of
+        ``dp_group``. Otherwise the list holds only this rank's entry.
+        """
+        tagged = {
+            "dp_rank": self._dp_rank,
+            "dp_world_size": self._dp_world_size,
+            "state": self._inner.state_dict(),
+        }
+        distributed_ready = torch.distributed.is_available() and torch.distributed.is_initialized()
+        if self._dp_world_size <= 1 or not distributed_ready:
+            per_rank = [tagged]
+        else:
+            per_rank = [None] * self._dp_world_size
+            torch.distributed.all_gather_object(per_rank, tagged, group=self._dp_group)
+        return {"train_dataloader_per_rank": per_rank}
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        """Restore the inner loader from this rank's entry of a per-rank state.
+
+        An empty / ``None`` state is ignored.
+
+        Raises:
+            RuntimeError: if the state is not in the per-rank format, its length does not
+                match ``dp_world_size``, this rank's entry is malformed, or the entry's
+                tags do not match this rank.
+        """
+        if not state_dict:
+            return
+        # We exclusively support the per-rank wire format produced by our
+        # own ``state_dict()``. Anything else (a bare inner state, a
+        # rank-0-only StatefulDataLoader payload, an old DataModule key)
+        # must fail loudly so a checkpoint-format mismatch is caught at load
+        # time rather than producing wrong data several minutes into training.
+        if "train_dataloader_per_rank" not in state_dict:
+            raise RuntimeError(
+                "PerRankStatefulDataLoader.load_state_dict: state must use "
+                "the per-rank wire format (top-level key "
+                "'train_dataloader_per_rank'); got keys "
+                f"{sorted(state_dict.keys())}. This dataloader only supports "
+                "states produced by its own state_dict()."
+            )
+        per_rank = state_dict["train_dataloader_per_rank"]
+        if not isinstance(per_rank, list) or len(per_rank) != self._dp_world_size:
+            raise RuntimeError(
+                f"PerRankStatefulDataLoader: state has dp_world_size="
+                f"{len(per_rank) if isinstance(per_rank, list) else 'unknown'} "
+                f"but the current run has dp_world_size={self._dp_world_size}."
+            )
+        entry = per_rank[self._dp_rank]
+        if (
+            not isinstance(entry, dict)
+            or "state" not in entry
+            or "dp_rank" not in entry
+            or "dp_world_size" not in entry
+        ):
+            raise RuntimeError(
+                f"PerRankStatefulDataLoader: malformed per-rank entry at index "
+                f"{self._dp_rank}: expected keys {{'dp_rank', 'dp_world_size', "
+                f"'state'}}, got {list(entry.keys()) if isinstance(entry, dict) else type(entry).__name__}."
+            )
+        saved_rank, saved_world = entry["dp_rank"], entry["dp_world_size"]
+        if saved_rank != self._dp_rank or saved_world != self._dp_world_size:
+            raise RuntimeError(
+                f"PerRankStatefulDataLoader: state tagged (dp_rank={saved_rank}, "
+                f"dp_world_size={saved_world}) loaded on (dp_rank={self._dp_rank}, "
+                f"dp_world_size={self._dp_world_size})."
+            )
+        self._inner.load_state_dict(entry["state"])
+
+    # Forward everything else to the inner StatefulDataLoader so code that
+    # introspects the loader keeps working.
+    def __getattr__(self, name: str) -> Any:
+        # ``__getattr__`` only fires when normal attribute lookup fails. Read ``_inner``
+        # straight from the instance dict: if it is not set yet (e.g. the object was
+        # created by copy/pickle without running ``__init__``), ``self._inner`` would
+        # re-enter this method and recurse until RecursionError.
+        try:
+            inner = self.__dict__["_inner"]
+        except KeyError:
+            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}") from None
+        return getattr(inner, name)
+
+    def __iter__(self):
+        return iter(self._inner)
+
+    def __len__(self):
+        return len(self._inner)
+
+
+def _maybe_init_main_process_for_iterable(num_workers: int, global_rank: int, world_size: int, seed: int) -> None:
+    """Run lhotse's ``worker_init_fn`` in the main process when no worker processes exist.
+
+    With ``num_workers == 0`` the iterable-path sampler runs in the main training
+    process, where PyTorch's DataLoader never calls ``worker_init_fn``. Calling it
+    here (as worker 0, with the given rank, world size and seed) performs that
+    initialisation before any iterator is consumed. For any other ``num_workers``
+    this function does nothing.
+    """
+    if num_workers != 0:
+        return
+
+    from lhotse.dataset.dataloading import worker_init_fn as init_worker
+
+    init_worker(0, rank=global_rank, world_size=world_size, seed=seed)
+
+
 def get_lhotse_dataloader_from_config(
     config: Union[dict, DictConfig],
     global_rank: int,
     world_size: int,
     dataset: torch.utils.data.Dataset,
     tokenizer=None,
+    dp_group: Optional[Any] = None,
 ) -> torch.utils.data.DataLoader:
     """
     Set up a Lhotse training dataloader.
@@ -304,10 +492,16 @@ def get_lhotse_dataloader_from_config(
             world_size=world_size,
             dataset=dataset,
             tokenizer=tokenizer,
+            dp_group=dp_group,
         )
     else:
         return get_lhotse_dataloader_from_single_config(
-            config=config, global_rank=global_rank, world_size=world_size, dataset=dataset, tokenizer=tokenizer
+            config=config,
+            global_rank=global_rank,
+            world_size=world_size,
+            dataset=dataset,
+            tokenizer=tokenizer,
+            dp_group=dp_group,
         )
 
 
@@ -317,6 +511,7 @@ def get_lhotse_dataloader_from_single_config(
     world_size: int,
     dataset: torch.utils.data.Dataset,
     tokenizer=None,
+    dp_group: Optional[Any] = None,
 ) -> torch.utils.data.DataLoader:
     """
     Set up a Lhotse training dataloader.
@@ -359,6 +554,7 @@ def get_lhotse_dataloader_from_single_config(
         # We use lhotse's own worker_init_fn which leverages information such as rank, world_size,
         # worker_id, etc. to set a different random seed for each (node, worker) combination.
         # This together with infinite datasets removes the need to split data across nodes/workers.
+        _maybe_init_main_process_for_iterable(config.num_workers, global_rank, world_size, config.seed)
         dloader_kwargs = dict(
             dataset=IterableDatasetWrapper(dataset=dataset, sampler=sampler),
             worker_init_fn=make_worker_init_fn(rank=global_rank, world_size=world_size, seed=config.seed),
@@ -369,7 +565,11 @@ def get_lhotse_dataloader_from_single_config(
         # reads only light-weight JSON objects; it samples mini-batches and passes
         # the meta-data to Dataset, which performs the actual I/O inside its __getitem__ method.
         dloader_kwargs = dict(dataset=dataset, sampler=sampler)
-    dloader = torch.utils.data.DataLoader(
+    dloader = _build_dataloader(
+        use_stateful_dataloader=config.use_stateful_dataloader,
+        dp_rank=global_rank,
+        dp_world_size=world_size,
+        dp_group=dp_group,
         **dloader_kwargs,
         batch_size=None,
         num_workers=config.num_workers,
@@ -385,6 +585,7 @@ def get_lhotse_dataloader_from_multi_config(
     world_size: int,
     dataset: torch.utils.data.Dataset,
     tokenizer=None,
+    dp_group: Optional[Any] = None,
 ) -> torch.utils.data.DataLoader:
     """
     Set up a Lhotse training dataloder.
@@ -420,6 +621,13 @@ def gather_shared_opts():
             "multi_config",
             "metadata_only",
             "force_finite",
+            "use_stateful_dataloader",
+            # Indexed dataloading flags must propagate too — otherwise a
+            # top-level ``indexed: true`` / ``indexes_root: /tmp/idx`` on the
+            # train_ds namespace silently fails to reach sub-configs, and the
+            # underlying readers fall back to streaming.
+            "indexed",
+            "indexes_root",
         ]
         defaults = OmegaConf.structured(LhotseDataLoadingConfig)
         top_level_config["seed"] = resolve_seed(top_level_config["seed"])
@@ -483,6 +691,7 @@ def gather_shared_opts():
         # We use lhotse's own worker_init_fn which leverages information such as rank, world_size,
         # worker_id, etc. to set a different random seed for each (node, worker) combination.
         # This together with infinite datasets removes the need to split data across nodes/workers.
+        _maybe_init_main_process_for_iterable(shared_opts.num_workers, global_rank, world_size, shared_opts.seed)
         dloader_kwargs = dict(
             dataset=IterableDatasetWrapper(dataset=dataset, sampler=sampler),
             worker_init_fn=make_worker_init_fn(rank=global_rank, world_size=world_size, seed=shared_opts.seed),
@@ -493,7 +702,11 @@ def gather_shared_opts():
         # reads only light-weight JSON objects; it samples mini-batches and passes
         # the meta-data to Dataset, which performs the actual I/O inside its __getitem__ method.
         dloader_kwargs = dict(dataset=dataset, sampler=sampler)
-    dloader = torch.utils.data.DataLoader(
+    dloader = _build_dataloader(
+        use_stateful_dataloader=shared_opts.use_stateful_dataloader,
+        dp_rank=global_rank,
+        dp_world_size=world_size,
+        dp_group=dp_group,
         **dloader_kwargs,
         batch_size=None,
         num_workers=shared_opts.num_workers,
@@ -509,6 +722,38 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No
     cuts, use_iterable_dataset = read_cutset_from_config(config)
     use_iterable_dataset = determine_use_iterable_dataset(use_iterable_dataset, config)
 
+    # Map-style + StatefulDataLoader requires shard_seed to be a fixed integer:
+    #   * On the map path, cross-rank de-duplication is by ``rank/world_size``
+    #     index slicing (passed below to DynamicBucketingSampler/DynamicCutSampler),
+    #     NOT by per-rank seed differentiation. ``shard_seed="randomized"`` is
+    #     iterable-path machinery that injects worker-PID-derived seeding;
+    #     across resume boundaries the new process has a different PID, so the
+    #     freshly-initialised sampler RNG diverges from the saved snapshot.
+    #     ``StatefulDataLoader.load_state_dict`` overrides that init RNG state
+    #     in practice, but it's a footgun: any RNG draw before the first
+    #     ``__iter__`` (e.g. shuffle of shards in the parent process) is lost.
+    # If the user sets ``shard_seed="randomized"`` AND ``force_map_dataset=True``
+    # AND ``use_stateful_dataloader=True``, warn loudly and auto-overwrite with
+    # the fixed ``seed`` integer so resume semantics stay clean.
+    if (
+        getattr(config, "force_map_dataset", False)
+        and getattr(config, "use_stateful_dataloader", False)
+        and isinstance(config.get("shard_seed"), str)
+        and str(config.shard_seed).lower() == "randomized"
+    ):
+        fixed_seed = int(config.seed)
+        logging.warning(
+            "shard_seed=%r is incompatible with force_map_dataset=True + "
+            "use_stateful_dataloader=True (the map path doesn't need per-rank "
+            "seed differentiation; cross-rank de-dup is by index slicing). "
+            "Auto-overriding shard_seed -> %d (the value of `seed`) for "
+            "deterministic StatefulDataLoader resume. Pin shard_seed to an "
+            "integer in your YAML to silence this warning.",
+            config.shard_seed,
+            fixed_seed,
+        )
+        config.shard_seed = fixed_seed
+
     _auto_detect_bucketing_and_validate_batch_size(config)
 
     # Apply channel selector
@@ -519,9 +764,6 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No
     # Resample as a safeguard; it's a no-op when SR is already OK
     cuts = cuts.map(partial(resample, sampling_rate=config.sample_rate), apply_fn=None)
 
-    # Expands cuts if multiple translations are provided.
-    cuts = CutSet(LazyFlattener(cuts.map(_flatten_alt_text, apply_fn=None)))
-
     if config.use_multimodal_sampling:
         assert tokenizer is not None, (
             "You must pass a tokenizer to `get_lhotse_dataloader_from_config` in order to"
@@ -938,22 +1180,6 @@ def _merge_supervisions(cuts: CutSet) -> CutSet:
     return cuts.merge_supervisions()
 
 
-def _flatten_alt_text(cut) -> list:
-    ans = [cut]
-    if not isinstance(cut, Cut) or cut.custom is None or cut.custom.get("alt_text") is None:
-        return ans
-    cut = cut.move_to_memory(audio_format="wav")  # performs I/O once and holds audio in memory from now on
-    # Popping to ease eyesight on debug.
-    paired_text = cut.custom.pop("alt_text")
-    for data in paired_text.values():
-        # Copy to avoid lazy dataloading issues
-        data = data.copy()
-        text_instance = cut.map_supervisions(lambda s: fastcopy(s, text=data["text"], language=data["lang"]))
-        text_instance.custom = {"text": data.pop("text"), "lang": data.pop("lang"), **data}
-        ans.append(text_instance)
-    return ans
-
-
 def maybe_set_cuda_expandable_segments(enabled: bool):
     """
     Configures PyTorch memory allocator to expand existing allocated segments
diff --git a/nemo/collections/common/data/lhotse/indexed_adapters.py b/nemo/collections/common/data/lhotse/indexed_adapters.py
index 831edf0b1f54..989db03e0406 100644
--- a/nemo/collections/common/data/lhotse/indexed_adapters.py
+++ b/nemo/collections/common/data/lhotse/indexed_adapters.py
@@ -13,96 +13,90 @@
 # limitations under the License.
 import json
 import os
-import random
+import re
 import struct
 import tarfile
 from pathlib import Path
-from typing import NamedTuple
+from typing import NamedTuple, Optional
 
 import numpy as np
 
-# Knuth's multiplicative hash constant (golden-ratio derived, 32-bit).
-_KNUTH_HASH = 2654435761
 
+# Tar block size + the all-zeros block that marks end-of-archive in tar.
+_TAR_BLOCK_SIZE = 512
+_TAR_ZERO_BLOCK = b'\0' * _TAR_BLOCK_SIZE
 
-class LazyShuffledRange:
-    """
-    Generates a permutation of ``range(n)`` lazily using a Feistel cipher,
-    without materializing the full index list. Each element is computed on
-    the fly in O(1) time and the object itself uses O(1) memory regardless
-    of ``n``.
-
-    The technique is known as *cycle-walking* format-preserving encryption:
-    a Feistel network is a bijection on ``[0, 2^k)``, and repeatedly applying
-    it until the output falls within ``[0, n)`` restricts it to a bijection
-    on the desired domain.
-
-    Args:
-        n: Size of the range to permute.
-        rng: A ``random.Random`` instance used to derive round keys.
-        num_rounds: Number of Feistel rounds (more rounds = better uniformity,
-            6 is a good default for typical dataset sizes).
-    """
+# Recognized URL schemes whose authority ("host" component) is part of the
+# logical path (e.g. the bucket name). Stripping just the scheme keeps the
+# bucket+key in the relative path used to mirror under indexes_root.
+_URL_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://")
 
-    def __init__(self, n: int, rng: random.Random, num_rounds: int = 6):
-        self.n = n
-        if n <= 1:
-            return
-        bits = (n - 1).bit_length()
-        if bits < 2:
-            bits = 2
-        if bits % 2:
-            bits += 1
-        self._half = bits // 2
-        self._mask = (1 << self._half) - 1
-        self._num_rounds = num_rounds
-        self._keys = [rng.getrandbits(64) for _ in range(num_rounds)]
-
-    def _permute_one(self, x: int) -> int:
-        left = (x >> self._half) & self._mask
-        right = x & self._mask
-        for key in self._keys:
-            left, right = right, left ^ (((right * _KNUTH_HASH) ^ key) >> 32 & self._mask)
-        return (left << self._half) | right
 
-    def __len__(self) -> int:
-        return self.n
+def _is_remote_path(path) -> bool:
+    """Return whether *path* starts with a ``scheme://`` prefix (s3://, ais://, http(s)://, gs://, …)."""
+    return _URL_RE.match(str(path)) is not None
 
-    def __iter__(self):
-        n = self.n
-        if n <= 0:
-            return
-        if n == 1:
-            yield 0
-            return
-        for i in range(n):
-            x = i
-            while True:
-                x = self._permute_one(x)
-                if x < n:
-                    yield x
-                    break
+
+def _open_data_path(path: str):
+    """
+    Open *path* for binary reading and return a seekable file-like object
+    that the indexed tar readers can keep in their ``self._fh`` slot.
+
+    A local path is opened with the builtin ``open(path, "rb")``. Anything
+    that looks like a URL/URI is wrapped in :class:`lhotse.ais.AISRangeReader`,
+    imported from lhotse so that the seekable-AIS wrapper stays shared with
+    :func:`lhotse.indexing._open_for_indexed_read`. Non-AIS schemes
+    (``http://``, ``gs://``, …) take the same ``AISRangeReader`` route for
+    now, since the aistore SDK is the only seekable remote backend lhotse
+    exposes today; add the dispatch here once another backend becomes
+    seekable.
+    """
+    if not _is_remote_path(path):
+        return open(path, "rb")
+    from lhotse.ais import AISRangeReader
+
+    return AISRangeReader(str(path))
 
 
-def _load_index(data_path: str, idx_path: str | None = None):
+def _load_index(data_path: str, idx_path: Optional[str] = None):
     """
-    Load a memmap'd offset index for *data_path*.
+    Load an offset index for *data_path*, layering NeMo-specific validation
+    on top of :func:`lhotse.indexing.read_index`.
 
     Returns ``(offsets, num_samples)`` where ``offsets`` always has
     ``num_samples + 1`` entries — the last one being the data file size
-    (appended if absent in the on-disk index).
+    (appended if absent in the on-disk index, for legacy ``.idx`` files
+    written before the sentinel convention was added).
 
     Validates that all sample offsets fall within the data file.
+
+    For remote ``data_path`` URIs (``s3://`` / ``ais://`` / ``http(s)://`` /
+    ``gs://``) ``os.path.getsize`` is not callable; we trust the size
+    sentinel that ``create_tar_index`` / ``create_jsonl_index`` recorded as
+    the last offset in the on-disk index. The same indexes are emitted for
+    local and remote sources, so the on-disk format is identical — only the
+    file-size cross-check is skipped.
     """
+    from lhotse.indexing import read_index
+
     if idx_path is None:
         idx_path = data_path + '.idx'
-    offsets = np.memmap(idx_path, dtype=np.dtype('<u8'), mode='r')
-    data_size = os.path.getsize(data_path)
-    if offsets[-1] == data_size:
+    offsets = read_index(idx_path)
+    if _URL_RE.match(str(data_path)):
+        if offsets.shape[0] < 1:
+            raise ValueError(
+                f"Index for remote source {data_path} is empty; expected at "
+                f"least a size sentinel. Rebuild via build_indexes.py."
+            )
+        data_size = int(offsets[-1])
         num_samples = offsets.shape[0] - 1
     else:
-        num_samples = offsets.shape[0]
-        offsets = np.append(offsets, np.uint64(data_size))
+        data_size = os.path.getsize(data_path)
+        if offsets[-1] == data_size:
+            num_samples = offsets.shape[0] - 1
+        else:
+            num_samples = offsets.shape[0]
+            offsets = np.append(offsets, np.uint64(data_size))
     if num_samples > 0:
         max_offset = int(offsets[:num_samples].max())
         if max_offset >= data_size:
@@ -123,24 +117,6 @@ def _resolve_idx(idx: int, length: int) -> int:
     return idx
 
 
-class IndexedJSONLReader:
-    def __init__(self, jsonl_path: Path | str, idx_path: Path | str | None = None):
-        self.data_path = str(jsonl_path)
-        self.offsets, self._len = _load_index(self.data_path, str(idx_path) if idx_path else None)
-
-    def __len__(self):
-        return self._len
-
-    def __getitem__(self, idx):
-        idx = _resolve_idx(idx, self._len)
-        start = int(self.offsets[idx])
-        end = int(self.offsets[idx + 1])
-        with open(self.data_path, 'rb') as f:
-            f.seek(start)
-            data = f.read(end - start)
-        return json.loads(data.decode('utf-8'))
-
-
 class TarSample(NamedTuple):
     """A single sample extracted from a WebDataset tar archive."""
 
@@ -161,8 +137,9 @@ def _split_json_audio_pair(name_a, bytes_a, name_b, bytes_b) -> TarSample:
 class IndexedTarSampleReader:
     """
     Random access to WebDataset tar samples (``N.json`` + ``N.<audio>``) via an index file.
-    Index format is identical to ``IndexedJSONLReader``: little-endian uint64 offsets,
-    optionally followed by a sentinel equal to the tar file size.
+    Index format is the same little-endian ``uint64`` offsets as
+    :class:`lhotse.indexing.IndexedJsonlReader`, optionally followed by a
+    sentinel equal to the tar file size.
     """
 
     def __init__(self, tar_path: str | Path, idx_path: str | Path | None = None):
@@ -182,24 +159,24 @@ def _validate_index(self):
         # file size (which _load_index already handles).
         while self._len > 0:
             last = int(self.offsets[self._len - 1])
-            with open(self.data_path, 'rb') as f:
+            with _open_data_path(self.data_path) as f:
                 f.seek(last)
-                buf = f.read(512)
-            if len(buf) < 512 or buf == b'\0' * 512:
+                buf = f.read(_TAR_BLOCK_SIZE)
+            if len(buf) < _TAR_BLOCK_SIZE or buf == _TAR_ZERO_BLOCK:
                 self._len -= 1
             else:
                 break
 
     def _check_offset_is_tar_header(self, offset: int, label: str = ""):
-        with open(self.data_path, 'rb') as f:
+        with _open_data_path(self.data_path) as f:
             f.seek(offset)
-            buf = f.read(512)
-        if len(buf) < 512:
+            buf = f.read(_TAR_BLOCK_SIZE)
+        if len(buf) < _TAR_BLOCK_SIZE:
             raise ValueError(
                 f"Tar index for {self.data_path}: {label} offset {offset} "
                 f"is too close to EOF (file size {self._data_size})."
             )
-        if buf == b'\0' * 512:
+        if buf == _TAR_ZERO_BLOCK:
             raise ValueError(
                 f"Tar index for {self.data_path}: {label} offset {offset} "
                 f"points to a zero block (end-of-archive marker), not a tar header. "
@@ -223,7 +200,7 @@ def __len__(self):
     def __getitem__(self, idx):
         idx = _resolve_idx(idx, self._len)
         offset = int(self.offsets[idx])
-        with open(self.data_path, 'rb') as f:
+        with _open_data_path(self.data_path) as f:
             f.seek(offset)
             try:
                 name_a, bytes_a = _read_tar_member(f)
@@ -245,6 +222,121 @@ def __getitem__(self, idx):
         return _split_json_audio_pair(name_a, bytes_a, name_b, bytes_b)
 
 
+class IndexedTarMemberReader:
+    """
+    Random access to a NeMo-style tar archive that stores **one regular member
+    per sample** (e.g. ``<cut_id>.flac`` per line of an external NeMo manifest).
+
+    Uses the same ``.idx`` format as :class:`lhotse.indexing.IndexedJsonlReader`
+    and :class:`IndexedTarSampleReader`: little-endian uint64 byte offsets plus a
+    file-size sentinel. ``idx_path`` defaults to ``<tar_path>.idx`` and is built
+    by the constructor when missing, unless ``auto_create_index`` is false.
+
+    Two access patterns:
+
+    * Positional: ``reader[idx]`` returns ``(member_name, payload_bytes)``.
+    * Name-keyed: ``reader.get(name)`` returns just the payload bytes. The
+      name → position map is built lazily on first use by walking the tar
+      headers (no payload reads), then cached for subsequent calls.
+    """
+
+    def __init__(
+        self,
+        tar_path: str | Path,
+        idx_path: str | Path | None = None,
+        auto_create_index: bool = True,
+    ):
+        self.data_path = str(tar_path)
+        self._fh = None  # set first: __del__ -> close() reads it even if index creation/loading below raises
+        self._name_to_idx: dict[str, int] | None = None
+        resolved_idx = str(idx_path) if idx_path else self.data_path + ".idx"
+        if auto_create_index and not os.path.exists(resolved_idx):
+            create_tar_index(self.data_path, resolved_idx)
+        self.offsets, self._len = _load_index(self.data_path, resolved_idx)
+
+    def _ensure_open(self):
+        if self._fh is None:
+            self._fh = _open_data_path(self.data_path)
+
+    def close(self):
+        if self._fh is not None:
+            self._fh.close()
+            self._fh = None
+
+    def __del__(self):
+        self.close()
+
+    def __getstate__(self):
+        s = self.__dict__.copy()
+        s["_fh"] = None  # file handles are not picklable
+        return s
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+
+    def __len__(self) -> int:
+        return self._len
+
+    def __getitem__(self, idx: int) -> tuple[str, bytes]:
+        idx = _resolve_idx(idx, self._len)
+        offset = int(self.offsets[idx])
+        self._ensure_open()
+        self._fh.seek(offset)
+        try:
+            name, data = _read_tar_member(self._fh)
+        except (EOFError, tarfile.TarError) as e:
+            raise type(e)(f"{e} — reading sample {idx}/{self._len} at offset {offset} " f"in {self.data_path}") from e
+        return name, data
+
+    def _build_name_index(self) -> dict[str, int]:
+        """Walk the tar headers once to build a name → sample-index map.
+
+        Reads only the 512-byte tar headers (no payloads), so this is
+        relatively cheap even on remote storage. Done lazily on first
+        :meth:`get` call.
+
+        ``tar.add`` writes a PAX extended header (``@PaxHeader``) before any
+        member with a long path or extended attributes. We skip those and
+        record the *regular* file's name at each indexed offset.
+        """
+        name_to_idx: dict[str, int] = {}
+        self._ensure_open()
+        for i in range(self._len):
+            self._fh.seek(int(self.offsets[i]))
+            while True:
+                header = self._fh.read(_TAR_BLOCK_SIZE)
+                if len(header) < _TAR_BLOCK_SIZE or header == _TAR_ZERO_BLOCK:
+                    break
+                info = tarfile.TarInfo.frombuf(header, tarfile.ENCODING, "surrogateescape")
+                if info.type in (tarfile.REGTYPE, tarfile.AREGTYPE):
+                    name_to_idx[info.name] = i
+                    break
+                # Skip non-regular member (PAX/GNU long-name) data + padding.
+                size_blocks = -(-info.size // _TAR_BLOCK_SIZE) * _TAR_BLOCK_SIZE
+                self._fh.seek(size_blocks, 1)
+        return name_to_idx
+
+    def get(self, name: str) -> bytes:
+        """Return the payload bytes of the tar member named ``name``."""
+        if self._name_to_idx is None:
+            self._name_to_idx = self._build_name_index()
+        try:
+            idx = self._name_to_idx[name]
+        except KeyError as e:
+            raise KeyError(
+                f"Tar {self.data_path} has no member named '{name}'. "
+                f"The .idx may be stale or the manifest is referencing a "
+                f"different tar."
+            ) from e
+        _, data = self[idx]
+        return data
+
+    def __contains__(self, name: str) -> bool:
+        if self._name_to_idx is None:
+            self._name_to_idx = self._build_name_index()
+        return name in self._name_to_idx
+
+
 def _read_tar_member(f):
     """Read the next regular-file tar member, skipping non-regular entries
     (PAX headers, GNU long-name headers, directory entries, etc.).
@@ -256,64 +348,89 @@ def _read_tar_member(f):
     arbitrary byte offset and read just the members we need in O(1).
     """
     while True:
-        header_buf = f.read(512)
-        if len(header_buf) < 512 or header_buf == b'\0' * 512:
+        header_buf = f.read(_TAR_BLOCK_SIZE)
+        if len(header_buf) < _TAR_BLOCK_SIZE or header_buf == _TAR_ZERO_BLOCK:
             raise EOFError("End of tar archive or unexpected EOF")
         info = tarfile.TarInfo.frombuf(header_buf, tarfile.ENCODING, "surrogateescape")
         data = f.read(info.size)
         if len(data) < info.size:
             raise EOFError("Unexpected end of tar file while reading data")
-        remainder = info.size % 512
+        remainder = info.size % _TAR_BLOCK_SIZE
         if remainder:
-            f.seek(512 - remainder, 1)
+            f.seek(_TAR_BLOCK_SIZE - remainder, 1)
         if info.type not in (tarfile.REGTYPE, tarfile.AREGTYPE):
             continue
         return info.name, data
 
 
-def create_index(jsonl_path, idx_path):
+class _CountingReader:
     """
-    Creates a raw binary index file compatible with Megatron-Energon (CrudeJsonlDataset).
-
-    Format: sequence of little-endian uint64 values
-    ``[Offset_0, Offset_1, ..., Offset_N, File_Size]``
+    Thin read-only wrapper around another stream that forwards ``read`` and
+    keeps a running total of the bytes returned. :func:`create_tar_index`
+    relies on that total to learn a tar file's size without ``tell()``:
+    non-seekable remote streams (AIStore's ``ObjectFileReader``, smart_open's
+    S3 reader without seek support, …) raise ``io.UnsupportedOperation`` on
+    ``tell()`` even though plain sequential reads work fine. The wrapper
+    reports itself as readable and non-seekable.
     """
-    # Flush the write buffer every 8 MiB to limit memory usage on large files.
-    flush_threshold = 8 * 1024 * 1024
-    with open(jsonl_path, 'rb') as f_in, open(idx_path, 'wb') as f_out:
-        current_offset = 0
-        write_buffer = bytearray()
-        write_buffer.extend(struct.pack('<Q', current_offset))
-        for line in f_in:
-            current_offset += len(line)
-            write_buffer.extend(struct.pack('<Q', current_offset))
-            if len(write_buffer) > flush_threshold:
-                f_out.write(write_buffer)
-                write_buffer.clear()
-        if write_buffer:
-            f_out.write(write_buffer)
+
+    def __init__(self, fileobj):
+        self.bytes_read = 0
+        self._f = fileobj
+
+    def read(self, n=-1):
+        chunk = self._f.read(n)
+        self.bytes_read += len(chunk)
+        return chunk
+
+    def readable(self):
+        return True
+
+    def seekable(self):
+        # Report non-seekable on purpose: tarfile's ``r|`` (stream) mode then
+        # reads and discards instead of seeking, which is the behaviour wanted.
+        return False
 
 
 def create_tar_index(tar_path, idx_path):
     """
     Creates a raw binary index file for a WebDataset tar archive.
     Stores the byte offset of the first member of each sample (grouped by basename),
-    followed by a sentinel equal to the tar file size.
-    Format is identical to :func:`create_index`.
+    followed by a sentinel equal to the tar file size (same little-endian
+    uint64 offset format as :func:`lhotse.indexing.create_jsonl_index`).
+
+    ``tar_path`` is opened via ``lhotse.serialization.open_best`` in streaming
+    mode (``r|``), so local files and non-seekable ``s3://`` / ``ais://`` /
+    ``http(s)://`` streams both work. The sentinel is the byte count seen by a
+    ``_CountingReader`` after draining the stream to EOF; ``os.path.getsize``
+    and ``f.tell()`` are unavailable on such streams.
+
+    Written atomically: staged in a per-process temp file next to ``idx_path``
+    and ``os.replace()``-d into place, so a half-written ``.idx`` is never seen.
     """
+    from lhotse.serialization import open_best
+
     offsets = []
     prev_stem = None
-    with tarfile.open(tar_path, 'r:') as tar:
-        for member in tar:
-            if not member.isreg():
-                continue
-            stem = Path(member.name).stem
-            if stem != prev_stem:
-                offsets.append(member.offset)
-                prev_stem = stem
-    with open(idx_path, 'wb') as f:
+    with open_best(tar_path, "rb") as f:
+        counter = _CountingReader(f)
+        with tarfile.open(fileobj=counter, mode='r|') as tar:
+            for member in tar:
+                if not member.isreg():
+                    continue
+                stem = Path(member.name).stem
+                if stem != prev_stem:
+                    offsets.append(member.offset)
+                    prev_stem = stem
+        # tarfile stops at the first end-of-archive block; drain the rest so the sentinel is the true size.
+        while counter.read(1 << 20):
+            pass
+        file_size = counter.bytes_read
+    tmp_path = f"{idx_path}.tmp.{os.getpid()}"
+    with open(tmp_path, 'wb') as f_out:
         buf = bytearray()
         for off in offsets:
             buf.extend(struct.pack('<Q', off))
-        buf.extend(struct.pack('<Q', os.path.getsize(tar_path)))
-        f.write(buf)
+        buf.extend(struct.pack('<Q', file_size))
+        f_out.write(buf)
+    os.replace(tmp_path, idx_path)
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index 824a7a697d9b..a0d39c0017fd 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -13,14 +13,17 @@
 # limitations under the License.
 
 """Lhotse adapters for NeMo datasets including Parquet support."""
+import bisect
+import json
 import os
 import random
 import re
 import tarfile
 from collections.abc import Mapping, Sequence
+from contextlib import closing
 from io import BytesIO
 from pathlib import Path
-from typing import Generator, Iterable, List, Literal
+from typing import Generator, Iterable, List, Literal, Union
 
 try:
     import pyarrow.parquet as pq
@@ -38,12 +41,41 @@
 from lhotse.serialization import open_best
 from lhotse.utils import compute_num_samples, ifnone
 
+from nemo.collections.common.data.lhotse._compat import (
+    GraphOriginDict,
+    IteratorNode,
+    LazyIndexedManifestIterator,
+    PartitionedIndexedIterator,
+    attach_graph_origin,
+    normalize_graph_token,
+)
 from nemo.collections.common.parts.preprocessing.manifest import get_full_path
 from nemo.utils import logging
 from nemo.utils.data_utils import is_datastore_path
 
+# NeMo tarred manifests support per-recording offsets via "-subN" audio_filepath
+# suffixes. We use this pattern in both indexed and streaming code paths to
+# recover the actual tar member name (offsets share a single member).
+_OFFSET_PATTERN = re.compile(r'^(?P<stem>.+)(?P<sub>-sub\d+)(?P<ext>\.\w+)?$')
+ShardKey = Union[int, tuple[int, int]]
 
-class LazyNeMoIterator:
+
+_MALFORMED_INDEXED_MANIFEST_WARNING_KEYS: set[tuple[str, str]] = set()
+
+
+def _warn_malformed_indexed_manifest_record(ex: BaseException, idx: int, path: str | Path) -> None:
+    """Log a warning for a malformed indexed manifest record, at most once per ``(path, exception type)``."""
+    dedup_key = (str(path), type(ex).__name__)
+    if dedup_key not in _MALFORMED_INDEXED_MANIFEST_WARNING_KEYS:
+        _MALFORMED_INDEXED_MANIFEST_WARNING_KEYS.add(dedup_key)
+        logging.warning(
+            "Skipping malformed indexed NeMo manifest records; "
+            f"first occurrence path={path!r} idx={idx} error={type(ex).__name__}: {ex}. "
+            "Further records with the same path/error type are suppressed in this worker."
+        )
+
+
+class LazyNeMoIterator(IteratorNode):
     """
     ``LazyNeMoIterator`` reads a NeMo (non-tarred) JSON manifest and converts it on the fly to an ``Iterable[Cut]``.
     It's used to create a ``lhotse.CutSet``.
@@ -85,6 +117,24 @@ class LazyNeMoIterator:
         ...     "nemo_manifests/train.json",
         ...     extra_fields=[{"type": "text_sample", "name": "question", "path": "questions.txt"}],
         ... ))
+
+    Indexed mode (``indexed=True``)
+    -------------------------------
+
+    When the underlying manifest is uncompressed JSONL, set ``indexed=True`` to enable
+    O(1) random access and exact graph-token checkpointing through
+    :class:`lhotse.indexing.IndexedJsonlReader`. In indexed mode this iterator becomes
+    an indexed ``IteratorNode`` that can be combined with ``StatefulDataLoader`` for
+    bit-exact mid-epoch resume.
+
+    Indexed mode requires:
+
+    * the manifest path(s) to use ``.jsonl`` extension and be uncompressed;
+    * ``extra_fields`` to be unset (lookup-based fields are positional and cannot be
+      reproduced after a Feistel-permuted random access).
+
+    Sharded indexed inputs are composed via :class:`lhotse.lazy.LazyIteratorChain`,
+    which picks a Feistel cross-shard permutation for true item-level shuffling.
     """
 
     def __init__(
@@ -96,68 +146,142 @@ def __init__(
         shuffle_shards: bool = False,
         shard_seed: int | Literal["randomized", "trng"] = "trng",
         extra_fields: list[dict[str, str]] | None = None,
+        indexed: bool = False,
+        indexes_root: str | Path | None = None,
+        skip_missing_manifest_entries: bool = False,
     ) -> None:
         self.path = path
         self.shuffle_shards = shuffle_shards
         self.shard_seed = shard_seed
-        paths = expand_sharded_filepaths(path)
-
-        if len(paths) == 1:
-            self.source = LazyJsonlIterator(paths[0])
-        else:
-            self.source = LazyIteratorChain(
-                *(LazyJsonlIterator(p) for p in paths), shuffle_iters=self.shuffle_shards, seed=self.shard_seed
-            )
         self.text_field = text_field
         self.lang_field = lang_field
         self.metadata_only = metadata_only
         self.extra_fields = extra_fields
+        self.indexed = indexed
+        self.indexes_root = indexes_root
+        self.skip_missing_manifest_entries = skip_missing_manifest_entries
         validate_extra_fields(self.extra_fields)
+        paths = expand_sharded_filepaths(path)
+
+        if indexed:
+            if extra_fields:
+                raise ValueError(
+                    "LazyNeMoIterator(indexed=True) does not support 'extra_fields' because "
+                    "their values are positional/streaming and cannot be reconstructed under "
+                    "graph-token random access."
+                )
+            from lhotse.indexing import index_file_path
+
+            seed = resolve_seed(shard_seed) if shard_seed not in (None, "trng", "randomized") else 0
+            indexed_sources = [
+                LazyIndexedManifestIterator(
+                    p,
+                    index_path=index_file_path(p, indexes_root),
+                    decode=GraphOriginDict,
+                    skip_decode_errors=skip_missing_manifest_entries,
+                    decode_error_callback=_warn_malformed_indexed_manifest_record,
+                )
+                for p in paths
+            ]
+            if len(indexed_sources) == 1:
+                self.source = indexed_sources[0]
+            else:
+                self.source = LazyIteratorChain(*indexed_sources, shuffle_iters=shuffle_shards, seed=seed)
+        else:
+            if len(paths) == 1:
+                self.source = LazyJsonlIterator(paths[0])
+            else:
+                self.source = LazyIteratorChain(
+                    *(LazyJsonlIterator(p) for p in paths),
+                    shuffle_iters=self.shuffle_shards,
+                    seed=self.shard_seed,
+                )
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
 
     def __iter__(self) -> Generator[Cut, None, None]:
         seed = resolve_seed(self.shard_seed)
         # Propagate the random seed
         extra_fields = [ExtraField.from_dict({"seed": seed, **field_cfg}) for field_cfg in self.extra_fields or ()]
         for data in self.source:
+            graph_token = getattr(data, "_graph_origin", None) if self.indexed else None
             # filter out entries with valid "_skipme" values.
             if data.get("_skipme", False):
                 continue
-            audio_path = get_full_path(str(data.pop("audio_filepath")), str(self.path), force_cache=False)
-            duration = data.pop("duration")
-            offset = data.pop("offset", None)
-            sampling_rate = data.pop("sampling_rate", None)
-            if sampling_rate is None:
-                sampling_rate = data.pop("sample_rate", None)
-            cut = self._create_cut(
-                audio_path=audio_path,
-                offset=offset,
-                duration=duration,
-                sampling_rate=sampling_rate,
-            )
-            # Note that start=0 and not start=offset because supervision's start if relative to the
-            # start of the cut; and cut.start is already set to offset
-            cut.supervisions.append(
-                SupervisionSegment(
-                    id=cut.id,
-                    recording_id=cut.recording_id,
-                    start=0,
-                    duration=cut.duration,
-                    channel=cut.channel,
-                    text=data.get(self.text_field),
-                    language=data.get(self.lang_field),
-                )
-            )
-            cut.custom = data
+            cut = self._build_cut_from_dict(data)
             for extra_field in extra_fields:
                 extra_field.attach_to(cut)
+            if graph_token is not None:
+                attach_graph_origin(cut, graph_token)
             yield cut
 
+    def __getitem__(self, token):
+        token = normalize_graph_token(token)
+        if self.extra_fields:
+            raise NotImplementedError(
+                "LazyNeMoIterator does not support __getitem__ when extra_fields are configured."
+            )
+        data = self.source[token]
+        cut = self._build_cut_from_dict(data)
+        return attach_graph_origin(cut, token) if self.indexed else cut
+
     def __len__(self) -> int:
         return len(self.source)
 
     def __add__(self, other):
         return LazyIteratorChain(self, other)
 
+    def state_dict(self) -> dict:
+        if not self.indexed:
+            return {}
+        return {"source": self.source.state_dict()}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        if "source" in sd:
+            self.source.load_state_dict(sd["source"])
+
+    def _build_cut_from_dict(self, data: dict) -> Cut:
+        # Work on a shallow plain-``dict`` copy: the ``pop`` calls below must not mutate the
+        # record handed in by the caller, even if the reader already returns a fresh object.
+        data = dict(data)
+        audio_path = get_full_path(str(data.pop("audio_filepath")), str(self.path), force_cache=False)
+        duration = data.pop("duration")
+        offset = data.pop("offset", None)
+        sampling_rate = data.pop("sampling_rate", None)
+        if sampling_rate is None:
+            sampling_rate = data.pop("sample_rate", None)
+        cut = self._create_cut(
+            audio_path=audio_path,
+            offset=offset,
+            duration=duration,
+            sampling_rate=sampling_rate,
+        )
+        cut.supervisions.append(
+            SupervisionSegment(
+                id=cut.id,
+                recording_id=cut.recording_id,
+                start=0,
+                duration=cut.duration,
+                channel=cut.channel,
+                text=data.get(self.text_field),
+                language=data.get(self.lang_field),
+            )
+        )
+        cut.custom = data
+        return cut
+
     def _create_cut(
         self,
         audio_path: str,
@@ -216,7 +340,7 @@ def _create_recording(
             return Recording.from_file(audio_path)
 
 
-class LazyNeMoTarredIterator:
+class LazyNeMoTarredIterator(IteratorNode):
     r"""
     ``LazyNeMoTarredIterator`` reads a NeMo tarred JSON manifest and converts it on the fly to an ``Iterable[Cut]``.
     It's used to create a ``lhotse.CutSet``.
@@ -300,43 +424,44 @@ def __init__(
         skip_missing_manifest_entries: bool = False,
         extra_fields: list[dict[str, str]] | None = None,
         slice_length: int = None,
+        indexed: bool = False,
+        indexes_root: str | Path | None = None,
     ) -> None:
         self.skip_missing_manifest_entries = skip_missing_manifest_entries
-        self.shard_id_to_manifest: dict[int, Iterable[dict]]
+        self._malformed_manifest_warning_keys: set[tuple[str, ShardKey]] = set()
+        self.indexed = indexed
+        self.indexes_root = indexes_root
+        self.shard_id_to_manifest: dict[ShardKey, Iterable[dict]]
+        self._shard_key_to_manifest_path: dict[ShardKey, str] = {}
         self.paths = expand_sharded_filepaths(manifest_path)
         if len(self.paths) == 1:
-            logging.warning(
-                f"You are using Lhotse dataloading for tarred audio with a non-sharded manifest. "
-                f"This will incur significant memory overhead. To prevent this, please shard file "
-                f"'{self.paths[0]}' using 'scripts/speech_recognition/convert_to_tarred_audio_dataset.py' "
-                f"WITHOUT '--no_shard_manifest'"
-            )
+            if not indexed:
+                logging.warning(
+                    f"You are using Lhotse dataloading for tarred audio with a non-sharded manifest. "
+                    f"This will incur significant memory overhead. To prevent this, please shard file "
+                    f"'{self.paths[0]}' using 'scripts/speech_recognition/convert_to_tarred_audio_dataset.py' "
+                    f"WITHOUT '--no_shard_manifest'"
+                )
             self.source = LazyJsonlIterator(self.paths[0])
-            self.shard_id_to_manifest = groupby("shard_id", self.source)
+            if indexed:
+                # In indexed mode self.source is not consumed for grouping: ``_init_indexed`` builds
+                # IndexedJsonlReaders keyed by the tar shard keys, so the 0 key here is a stand-in.
+                self.shard_id_to_manifest = {0: self.source}
+            else:
+                self.shard_id_to_manifest = groupby("shard_id", self.source)
         else:
             json_pattern = re.compile(r"manifest[^/]*_(\d+)[^/]*\.json")
-            shard_ids = []
-            for p in self.paths:
-                m = json_pattern.search(p)
-                assert m is not None, (
-                    f"Cannot determine shard_id from manifest input specified: "
-                    f"we searched with regex '{json_pattern.pattern}' in input '{p}'"
-                )
-                shard_ids.append(int(m.group(1)))
-            self.shard_id_to_manifest = {sid: LazyJsonlIterator(p) for sid, p in zip(shard_ids, self.paths)}
+            shard_keys, _ = _extract_unique_shard_keys(self.paths, json_pattern, path_kind="manifest")
+            self._shard_key_to_manifest_path = {key: path for key, path in zip(shard_keys, self.paths)}
+            self.shard_id_to_manifest = {
+                key: LazyJsonlIterator(path) for key, path in self._shard_key_to_manifest_path.items()
+            }
             self.source = LazyIteratorChain(*self.shard_id_to_manifest.values())
 
         self.tar_paths = expand_sharded_filepaths(tar_paths)
         tar_pattern = re.compile(r"audio[^/]*_(\d+)[^/]*\.tar")
-        shard_ids = []
-        for p in self.tar_paths:
-            m = tar_pattern.search(p)
-            assert m is not None, (
-                f"Cannot determine shard_id from tar input specifier: "
-                f"we searched with regex '{tar_pattern.pattern}' in input '{p}'"
-            )
-            shard_ids.append(int(m.group(1)))
-        self.shard_id_to_tar_path = dict(zip(shard_ids, self.tar_paths))
+        shard_keys, _ = _extract_unique_shard_keys(self.tar_paths, tar_pattern, path_kind="tar")
+        self.shard_id_to_tar_path: dict[ShardKey, str] = {key: path for key, path in zip(shard_keys, self.tar_paths)}
 
         self.shuffle_shards = shuffle_shards
         self.shard_seed = shard_seed
@@ -348,8 +473,80 @@ def __init__(
         self._validate()
         self.use_ais_get_batch = os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true"
 
+        if indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        """Build per-shard IndexedJsonlReaders + audio-tar index for indexed/random access."""
+        from lhotse.indexing import IndexedJsonlReader, index_file_path
+
+        from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
+
+        if self.extra_fields:
+            raise ValueError(
+                "LazyNeMoTarredIterator(indexed=True) does not support 'extra_fields' "
+                "because their values are positional and cannot be reproduced under "
+                "graph-token random access."
+            )
+        if self.slice_length is not None:
+            raise ValueError("LazyNeMoTarredIterator(indexed=True) does not support 'slice_length'.")
+
+        # Order shards by stable shard key so global indices are reproducible.
+        # Multi-bucket NeMo specs may expand to paths such as
+        # bucket_1/audio_0.tar and bucket_2/audio_0.tar; the occurrence suffix in
+        # ShardKey prevents those duplicate numeric shard ids from overwriting.
+        self._sorted_shard_ids: list[ShardKey] = sorted(self.shard_id_to_tar_path.keys())
+        self._cuts_readers: dict[ShardKey, IndexedJsonlReader] = {}
+        # In USE_AIS_GET_BATCH mode we never open the tar files locally — audio is
+        # fetched lazily via URL/file AudioSource by AudioSamples (typically batched).
+        self._tar_readers: dict[ShardKey, IndexedTarMemberReader] = {}
+
+        # Map shard key → manifest path (single or multi-file).
+        if len(self.paths) == 1:
+            shard_id_to_manifest_path = {sid: self.paths[0] for sid in self._sorted_shard_ids}
+        else:
+            shard_id_to_manifest_path = self._shard_key_to_manifest_path
+
+        cum = 0
+        cum_lens = [0]
+        for sid in self._sorted_shard_ids:
+            jsonl_path = shard_id_to_manifest_path[sid]
+            tar_path = self.shard_id_to_tar_path[sid]
+            self._cuts_readers[sid] = IndexedJsonlReader(
+                jsonl_path, index_path=index_file_path(jsonl_path, self.indexes_root)
+            )
+            if not self.use_ais_get_batch:
+                self._tar_readers[sid] = IndexedTarMemberReader(
+                    tar_path, idx_path=index_file_path(tar_path, self.indexes_root)
+                )
+            cum += len(self._cuts_readers[sid])
+            cum_lens.append(cum)
+        self._cum_lens = cum_lens
+        self._total_len = cum
+        self._iter_state = PartitionedIndexedIterator()
+
     def to_shards(self) -> List["LazyNeMoTarredIterator"]:
-        """Convert this iterator to a list of separate iterators for each shard."""
+        """Convert this iterator to a list of separate iterators for each shard.
+
+        Forwards every constructor knob (notably ``indexed``/``indexes_root``,
+        ``extra_fields``, ``slice_length``, ``skip_missing_manifest_entries``)
+        so per-shard sub-iterators behave identically to the parent. Dropping
+        these silently re-enters streaming mode, which a downstream caller
+        like ``mux(..., max_open_streams=N)`` won't notice until the bucketer
+        fails to checkpoint.
+        """
         if len(self.paths) == 1:
             # Cannot do that if the JSON manifest is a single file for all shards;
             # just return self.
@@ -363,11 +560,23 @@ def to_shards(self) -> List["LazyNeMoTarredIterator"]:
                     shard_seed=self.shard_seed,
                     text_field=self.text_field,
                     lang_field=self.lang_field,
+                    skip_missing_manifest_entries=self.skip_missing_manifest_entries,
+                    extra_fields=self.extra_fields,
+                    slice_length=self.slice_length,
+                    indexed=self.indexed,
+                    indexes_root=self.indexes_root,
                 )
                 for path, tarpath in zip(self.paths, self.shard_id_to_tar_path.values())
             ]
 
     def _validate(self) -> None:
+        if self.indexed:
+            # Indexed mode pairs tar and manifest paths by stable shard key in
+            # ``_init_indexed``. The streaming-time shard_id consistency check below
+            # would otherwise reject single-file inputs when the jsonl groups by a
+            # different shard_id field.
+            validate_extra_fields(self.extra_fields)
+            return
         shard_ids_tars = set(self.shard_id_to_tar_path)
         shard_ids_manifest = set(self.shard_id_to_manifest)
         assert shard_ids_tars == shard_ids_manifest, (
@@ -383,7 +592,7 @@ def _get_seed(self) -> int:
         return resolve_seed(self.shard_seed) + self.epoch
 
     @property
-    def shard_ids(self) -> List[int]:
+    def shard_ids(self) -> List[ShardKey]:
         return sorted(self.shard_id_to_manifest.keys())
 
     def _iter_batch_for_ais_get_batch(
@@ -502,7 +711,174 @@ def _iter_sequential(
                             f"Cannot locate JSON entry for tar file '{tar_info.name}'"
                         ) from e
 
+    # ---------------------------------------------------------------------- indexed
+    def _resolve_global_idx(self, idx: int) -> tuple[ShardKey, int]:
+        if idx < 0:
+            idx += self._total_len
+        if idx < 0 or idx >= self._total_len:
+            raise IndexError(f"index {idx} out of range for LazyNeMoTarredIterator with {self._total_len} cuts")
+        shard_pos = bisect.bisect_right(self._cum_lens, idx) - 1
+        sid = self._sorted_shard_ids[shard_pos]
+        return sid, idx - self._cum_lens[shard_pos]
+
+    def _audio_member_name_from_entry(self, entry: dict) -> str:
+        af = entry["audio_filepath"]
+        m = _OFFSET_PATTERN.match(af)
+        if m is None:
+            return af
+        return m.group("stem") + ifnone(m.group("ext"), "")
+
+    def _attach_supervision_and_metadata(self, cut: Cut, data: dict, manifest_path: str, tar_path: str) -> Cut:
+        cut.supervisions.append(
+            SupervisionSegment(
+                id=cut.id,
+                recording_id=cut.recording_id,
+                start=0,
+                duration=cut.duration,
+                text=data.get(self.text_field),
+                language=data.get(self.lang_field),
+            )
+        )
+        cut.custom = _to_custom_attr_dict(data)
+        cut.manifest_origin = manifest_path
+        cut.tar_origin = tar_path
+        return cut
+
+    def _build_indexed_cut(self, data: dict, audio_bytes: bytes, manifest_path: str, tar_path: str) -> Cut | None:
+        """Decode a single (manifest_entry, audio_bytes) pair into a Cut, mirroring the streaming path."""
+        if data.get("_skipme", False):
+            return None
+        try:
+            meta = soundfile.info(BytesIO(audio_bytes))
+        except Exception:
+            logging.warning(
+                f"Skipped corrupted audio member referenced by '{data.get('audio_filepath')}' in {tar_path=}."
+            )
+            return None
+        recording = Recording(
+            id=str(data["audio_filepath"]),
+            sources=[AudioSource(type="memory", channels=list(range(meta.channels)), source=audio_bytes)],
+            sampling_rate=int(meta.samplerate),
+            num_samples=meta.frames,
+            duration=meta.duration,
+        )
+        cut = make_cut_with_subset_inmemory_recording(
+            recording, offset=data.get("offset", 0.0), duration=data.get("duration")
+        )
+        return self._attach_supervision_and_metadata(cut, data, manifest_path, tar_path)
+
+    def _build_indexed_url_cut(self, data: dict, manifest_path: str, tar_path: str) -> Cut | None:
+        """
+        AIS GetBatch counterpart of ``_build_indexed_cut``: produces a Cut backed
+        by a URL/file AudioSource (no audio bytes loaded), so that
+        ``AudioSamples(use_batch_loader=True)`` can fetch the entire minibatch in
+        a single AIS GetBatch request. Mirrors ``_iter_batch_for_ais_get_batch``.
+        """
+        if data.get("_skipme", False):
+            return None
+        duration = data.get("duration")
+        if duration is None:
+            logging.warning(f"Skipping '{data.get('audio_filepath')}' - missing duration in manifest")
+            return None
+        audio_filename = self._audio_member_name_from_entry(data)
+        audio_url = f"{tar_path.rstrip('/')}/{audio_filename.lstrip('/')}"
+        # ``open_best`` handles ais://, http(s)://, and local paths uniformly;
+        # the AIS GetBatch loader still keys off the URL scheme.
+        source_type = "url" if "://" in tar_path else "file"
+        offset = data.get("offset", 0.0)
+        sampling_rate = data.get("sampling_rate", 16000)
+        recording = Recording(
+            id=audio_filename,
+            sources=[AudioSource(type=source_type, channels=[0], source=audio_url)],
+            sampling_rate=sampling_rate,
+            num_samples=compute_num_samples(duration, sampling_rate),
+            duration=duration,
+        )
+        cut = recording.to_cut()
+        if offset > 0:
+            cut = cut.truncate(offset=offset, duration=duration, preserve_id=True)
+            cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}"
+        return self._attach_supervision_and_metadata(cut, data, manifest_path, tar_path)
+
+    def _decode_cut_at(self, idx: int) -> Cut | None:
+        """Build the Cut for a global index in indexed mode (AIS or local).
+
+        Returns ``None`` if the manifest entry/audio member is missing or
+        malformed and ``skip_missing_manifest_entries`` is set, or if the
+        entry has ``_skipme=True`` / undecodable audio.
+        """
+        sid, local_idx = self._resolve_global_idx(idx)
+        cuts_reader = self._cuts_readers[sid]
+        manifest_path = cuts_reader.path
+        try:
+            data = cuts_reader[local_idx]
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            if self.skip_missing_manifest_entries:
+                warning_key = (str(manifest_path), sid)
+                if warning_key not in self._malformed_manifest_warning_keys:
+                    self._malformed_manifest_warning_keys.add(warning_key)
+                    logging.warning(
+                        "Skipping malformed manifest entries in indexed Lhotse dataloader: "
+                        f"{manifest_path=} {sid=} first_local_idx={local_idx} first_global_idx={idx}. "
+                        "Further malformed entries for this manifest/shard will be skipped without additional "
+                        "warnings."
+                    )
+                return None
+            raise
+        tar_path = self.shard_id_to_tar_path[sid]
+        if self.use_ais_get_batch:
+            return self._build_indexed_url_cut(data, manifest_path, tar_path)
+        member_name = self._audio_member_name_from_entry(data)
+        try:
+            audio_bytes = self._tar_readers[sid].get(member_name)
+        except KeyError:
+            if self.skip_missing_manifest_entries:
+                return None
+            raise
+        return self._build_indexed_cut(data, audio_bytes, manifest_path, tar_path)
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "LazyNeMoTarredIterator only supports __getitem__ when constructed with indexed=True."
+            )
+        idx = int(normalize_graph_token(token))
+        cut = self._decode_cut_at(idx)
+        if cut is None:
+            raise IndexError(f"Cut at global index {idx} is not decodable; cannot satisfy random-access __getitem__.")
+        return attach_graph_origin(cut, idx)
+
+    def __len__(self) -> int:
+        if self.indexed:
+            return self._total_len
+        return len(self.source)
+
+    def state_dict(self) -> dict:
+        if not self.indexed:
+            return {}
+        return {**self._iter_state.state_dict(), "epoch": self.epoch}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._iter_state.load_state_dict(sd)
+        self.epoch = sd.get("epoch", 0)
+
+    def _iter_indexed(self) -> Generator[Cut, None, None]:
+        for global_idx in self._iter_state.iterate(self._total_len):
+            cut = self._decode_cut_at(global_idx)
+            if cut is None:
+                continue
+            attach_graph_origin(cut, global_idx)
+            yield cut
+        self.epoch += 1
+
+    # ---------------------------------------------------------------- streaming
     def __iter__(self) -> Generator[Cut, None, None]:
+        if self.indexed:
+            yield from self._iter_indexed()
+            return
+
         shard_ids = self.shard_ids
 
         seed = self._get_seed()
@@ -513,17 +889,15 @@ def __iter__(self) -> Generator[Cut, None, None]:
         # Propagate the random seed
         extra_fields = [ExtraField.from_dict({"seed": seed, **field_cfg}) for field_cfg in self.extra_fields or ()]
 
-        # Handle NeMo tarred manifests with offsets.
-        # They have multiple JSONL entries where audio paths end with '-sub1', '-sub2', etc. for each offset.
-        offset_pattern = re.compile(r'^(?P<stem>.+)(?P<sub>-sub\d+)(?P<ext>\.\w+)?$')
-
+        # NeMo tarred manifests can have multiple JSONL entries pointing at the
+        # same audio member with -subN audio_filepath suffixes (per-offset cuts).
         for sid in shard_ids:
-            manifest_path = self.paths[sid] if len(self.paths) > 1 else self.paths[0]
+            manifest_path = self._shard_key_to_manifest_path[sid] if len(self.paths) > 1 else self.paths[0]
 
             def basename(d: dict) -> str:
                 return (
                     m.group("stem") + ifnone(m.group("ext"), "")
-                    if (m := offset_pattern.match(k := d["audio_filepath"])) is not None
+                    if (m := _OFFSET_PATTERN.match(k := d["audio_filepath"])) is not None
                     else k
                 )
 
@@ -585,9 +959,6 @@ def basename(d: dict) -> str:
 
         self.epoch += 1
 
-    def __len__(self) -> int:
-        return len(self.source)
-
     def __add__(self, other):
         return LazyIteratorChain(self, other)
 
@@ -743,7 +1114,7 @@ def _to_custom_attr_dict(d: dict, _excluded_fields: set[str] = {"duration", "aud
     return {k: v for k, v in d.items() if k not in _excluded_fields}
 
 
-class LazyParquetIterator:
+class LazyParquetIterator(IteratorNode):
     """
     LazyParquetIterator reads a Parquet file (local or remote) and yields Lhotse Cut objects.
     It streams data using PyArrow's iter_batches to avoid loading the full file into memory.
@@ -755,6 +1126,13 @@ class LazyParquetIterator:
         duration_field (str): Name of the column containing duration (default: "duration").
         lang_field (str): Name of the column containing language (default: "lang").
         sampling_rate (int): Fallback sampling rate if not found in metadata (default: 16000).
+        indexed (bool): When True, enable O(1) random access via row-group lookup
+            and graph-token checkpointing. Relies on the per-row-group row counts in
+            the Parquet file metadata to map a global index to its row group.
+
+    Indexed mode reads one row group at a time on demand and caches the most
+    recently used row group, so unshuffled or locality-friendly access patterns
+    avoid repeated decompression.
     """
 
     def __init__(
@@ -765,6 +1143,7 @@ def __init__(
         duration_field: str = "duration",
         lang_field: str = "lang",
         sampling_rate: int = 16000,
+        indexed: bool = False,
     ) -> None:
         # SAFETY CHECK: Ensure pyarrow is actually installed
         if not HAVE_PYARROW:
@@ -778,8 +1157,139 @@ def __init__(
         self.duration_field = duration_field
         self.lang_field = lang_field
         self.sampling_rate = sampling_rate
+        self.indexed = indexed
+        self._row_group_offsets: list[int] | None = None
+        self._num_row_groups: int | None = None
+        self._total_rows: int | None = None
+        self._cached_row_group_idx: int | None = None
+        self._cached_row_group: list[dict] | None = None
+        self._iter_state = PartitionedIndexedIterator()
+        if indexed:
+            self._ensure_row_group_offsets()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _ensure_row_group_offsets(self) -> None:
+        if self._row_group_offsets is not None:
+            return
+        try:
+            with closing(pq.ParquetFile(self.path)) as parquet_file:
+                offsets = [0]
+                for i in range(parquet_file.num_row_groups):
+                    offsets.append(offsets[-1] + parquet_file.metadata.row_group(i).num_rows)
+                self._row_group_offsets = offsets
+                self._num_row_groups = parquet_file.num_row_groups
+                self._total_rows = offsets[-1]
+        except Exception as e:
+            raise RuntimeError(f"Failed to open Parquet file: {self.path}") from e
+
+    def _load_row_group(self, rg_idx: int) -> list[dict]:
+        if self._cached_row_group_idx == rg_idx and self._cached_row_group is not None:
+            return self._cached_row_group
+        with closing(pq.ParquetFile(self.path)) as parquet_file:
+            df = parquet_file.read_row_group(rg_idx).to_pandas()
+        rows = df.to_dict("records")
+        self._cached_row_group_idx = rg_idx
+        self._cached_row_group = rows
+        return rows
+
+    def _resolve_row_group(self, idx: int) -> tuple[int, int]:
+        # Locate the row group containing global ``idx`` and the row's position inside it.
+        offsets = self._row_group_offsets
+        # Linear scan is fine because num_row_groups is typically small.
+        for rg_idx in range(self._num_row_groups):
+            if idx < offsets[rg_idx + 1]:
+                return rg_idx, idx - offsets[rg_idx]
+        raise IndexError(f"index {idx} out of range for parquet file with {self._total_rows} rows")
+
+    def _build_cut_from_row(self, row: dict, fallback_idx: int) -> Cut | None:
+        audio_data = row.get(self.audio_field)
+        if isinstance(audio_data, dict) and 'bytes' in audio_data:
+            audio_bytes = audio_data['bytes']
+        elif isinstance(audio_data, bytes):
+            audio_bytes = audio_data
+        else:
+            logging.warning(f"Skipping row {fallback_idx}: Audio column '{self.audio_field}' format unrecognized.")
+            return None
+
+        text = row.get(self.text_field, "")
+        language = row.get(self.lang_field, None)
+        row_id = str(row.get('id', f"{Path(self.path).stem}_{fallback_idx}"))
+        try:
+            recording = Recording.from_bytes(data=audio_bytes, recording_id=row_id)
+        except (RuntimeError, ValueError, TypeError) as e:
+            logging.warning(f"Skipping row {row_id}: Failed to decode audio bytes. {e}")
+            return None
+        cut = recording.to_cut()
+        cut.supervisions.append(
+            SupervisionSegment(
+                id=row_id,
+                recording_id=row_id,
+                start=0.0,
+                duration=cut.duration,
+                channel=0,
+                text=text,
+                language=language,
+            )
+        )
+        cut.custom = {k: v for k, v in row.items() if k != self.audio_field}
+        return cut
+
+    def __getitem__(self, token):
+        self._ensure_row_group_offsets()
+        idx = int(normalize_graph_token(token))
+        if idx < 0:
+            idx += self._total_rows
+        if idx < 0 or idx >= self._total_rows:
+            raise IndexError(f"index {token} out of range for parquet file with {self._total_rows} rows")
+        rg_idx, local_idx = self._resolve_row_group(idx)
+        rows = self._load_row_group(rg_idx)
+        cut = self._build_cut_from_row(rows[local_idx], fallback_idx=idx)
+        if cut is None:
+            raise IndexError(f"Row {idx} in {self.path} is not decodable; cannot satisfy random-access __getitem__.")
+        return attach_graph_origin(cut, idx)
+
+    def __len__(self) -> int:
+        self._ensure_row_group_offsets()
+        return self._total_rows
+
+    def state_dict(self) -> dict:
+        if not self.indexed:
+            return {}
+        return self._iter_state.state_dict()
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._iter_state.load_state_dict(sd)
 
     def __iter__(self) -> Generator[Cut, None, None]:
+        if self.indexed:
+            yield from self._iter_indexed()
+        else:
+            yield from self._iter_streaming()
+
+    def _iter_indexed(self) -> Generator[Cut, None, None]:
+        for global_idx in self._iter_state.iterate(self._total_rows):
+            rg_idx, local_idx = self._resolve_row_group(global_idx)
+            rows = self._load_row_group(rg_idx)
+            cut = self._build_cut_from_row(rows[local_idx], fallback_idx=global_idx)
+            if cut is None:
+                continue
+            attach_graph_origin(cut, global_idx)
+            yield cut
+
+    def _iter_streaming(self) -> Generator[Cut, None, None]:
         # Open Parquet file in streaming mode inside __iter__
         # This ensures each DataLoader worker gets its own file handle.
         try:
@@ -792,53 +1302,47 @@ def __iter__(self) -> Generator[Cut, None, None]:
             df = batch.to_pandas()
 
             for idx, row in df.iterrows():
-                # 1. Extract Audio Bytes
-                # Handle HuggingFace format: {'bytes': b'...', 'path': '...'} or raw bytes
-                audio_data = row.get(self.audio_field)
-                if isinstance(audio_data, dict) and 'bytes' in audio_data:
-                    audio_bytes = audio_data['bytes']
-                elif isinstance(audio_data, bytes):
-                    audio_bytes = audio_data
-                else:
-                    logging.warning(f"Skipping row {idx}: Audio column '{self.audio_field}' format unrecognized.")
+                cut = self._build_cut_from_row(row, fallback_idx=idx)
+                if cut is None:
                     continue
+                yield cut
 
-                # 2. Extract Metadata
-                text = row.get(self.text_field, "")
-                language = row.get(self.lang_field, None)
 
-                # 3. Create Unique ID
-                # Use 'id' column if exists, else combine filename + index
-                row_id = str(row.get('id', f"{Path(self.path).stem}_{idx}"))
+# ---------------------------------------------------------------------------
+# Private helpers
+# ---------------------------------------------------------------------------
 
-                # 4. Create Lhotse Recording
-                try:
-                    recording = Recording.from_bytes(
-                        data=audio_bytes,
-                        recording_id=row_id,
-                    )
-                except (RuntimeError, ValueError, TypeError) as e:
-                    logging.warning(f"Skipping row {row_id}: Failed to decode audio bytes. {e}")
-                    continue
 
-                # 5. Create Cut
-                cut = recording.to_cut()
+def _extract_unique_shard_keys(
+    paths: list[str], pattern: re.Pattern, *, path_kind: str
+) -> tuple[list[ShardKey], list[int]]:
+    """Extract shard ids while preserving duplicate ids from expanded paths.
 
-                # Add Supervision (Transcript)
-                cut.supervisions.append(
-                    SupervisionSegment(
-                        id=row_id,
-                        recording_id=row_id,
-                        start=0.0,
-                        duration=cut.duration,
-                        channel=0,
-                        text=text,
-                        language=language,
-                    )
-                )
+    NeMo tarred dataset specs may contain multiple independent path dimensions,
+    e.g. ``bucket_OP_1..8_CL_/audio__OP_0..127_CL_.tar``. After expansion,
+    every bucket contains numeric tar shard ids ``0..127``. Keying readers only
+    by that numeric id silently overwrites all but the last bucket, shrinking the
+    effective dataset and causing extreme oversampling of the remaining shards.
 
-                # Attach any extra metadata from the row to cut.custom
-                # (Exclude the heavy audio bytes to save RAM)
-                cut.custom = {k: v for k, v in row.items() if k != self.audio_field}
-
-                yield cut
+    When numeric ids are unique, keep the historical ``int`` keys. When a
+    numeric id repeats, key each occurrence as ``(shard_id, occurrence)`` so
+    manifest and tar paths remain paired one-to-one across all expanded files.
+    The raw ids are returned for callers that need the original parsed values.
+    """
+    raw_ids = []  # numeric id parsed from each path, in input order
+    for path in paths:
+        match = pattern.search(path)
+        assert match is not None, (
+            f"Cannot determine shard_id from {path_kind} input specifier: "
+            f"we searched with regex '{pattern.pattern}' in input '{path}'"
+        )
+        raw_ids.append(int(match.group(1)))  # the first regex group must hold the numeric shard id
+    if len(set(raw_ids)) == len(raw_ids):  # no repeated id: plain int keys suffice
+        return raw_ids, raw_ids  # note: both tuple slots are the same list object
+    occurrences: dict[int, int] = {}
+    keys: list[ShardKey] = []
+    for shard_id in raw_ids:
+        occurrence = occurrences.get(shard_id, 0)  # how many times this id has been seen so far
+        occurrences[shard_id] = occurrence + 1
+        keys.append((shard_id, occurrence))
+    return keys, raw_ids
diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py
index 8022e9c9e61e..bcff2069922b 100644
--- a/nemo/collections/common/data/lhotse/text_adapters.py
+++ b/nemo/collections/common/data/lhotse/text_adapters.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
 import logging
 import math
 import os
@@ -25,6 +26,7 @@
 import numpy as np
 import torch
 from lhotse import AudioSource, CutSet, Recording
+from lhotse.audio import AudioLoadingError
 from lhotse.custom import CustomFieldMixin
 from lhotse.cut import Cut
 from lhotse.dataset import AudioSamples
@@ -33,10 +35,15 @@
 from lhotse.shar import AudioTarWriter, JsonlShardWriter
 from lhotse.utils import Pathlike, compute_num_samples, is_valid_url
 
+from nemo.collections.common.data.lhotse._compat import (
+    IteratorNode,
+    PartitionedIndexedIterator,
+    attach_graph_origin,
+    normalize_graph_token,
+)
 from nemo.collections.common.data.lhotse.indexed_adapters import (
-    IndexedJSONLReader,
+    IndexedTarMemberReader,
     IndexedTarSampleReader,
-    LazyShuffledRange,
     _split_json_audio_pair,
 )
 from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths
@@ -132,10 +139,13 @@ def __iter__(self) -> Iterator[TextExample]:
 
 
 @dataclass
-class LhotseTextJsonlAdapter:
+class LhotseTextJsonlAdapter(IteratorNode):
     """
     ``LhotseTextJsonlAdapter`` is used to read a JSONL file and wrap
     the text field of each line into a ``TextExample``.
+
+    Set ``indexed=True`` to enable O(1) random access plus graph-token
+    checkpointing (requires uncompressed ``.jsonl`` paths).
     """
 
     paths: Union[Pathlike, list[Pathlike]]
@@ -143,11 +153,93 @@ class LhotseTextJsonlAdapter:
     text_field: str = "text"
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
+    indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
 
     def __post_init__(self):
         self.paths = expand_sharded_filepaths(self.paths)
+        self._readers: list = []
+        self._cum_lens: list[int] = []
+        self._iter_state = PartitionedIndexedIterator()
+        if self.indexed:
+            from lhotse.indexing import IndexedJsonlReader, index_file_path
+
+            for p in self.paths:
+                self._readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
+            cum = 0
+            self._cum_lens.append(cum)
+            for r in self._readers:
+                cum += len(r)
+                self._cum_lens.append(cum)
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def __len__(self) -> int:
+        if not self.indexed:
+            raise TypeError("LhotseTextJsonlAdapter has unknown length unless constructed with indexed=True.")
+        return self._cum_lens[-1] if self._cum_lens else 0
+
+    def _resolve(self, idx: int) -> tuple[int, int]:
+        """Map a global index (negative counts from the end) to ``(shard_idx, local_idx)``."""
+        pos = idx + self._cum_lens[-1] if idx < 0 else idx
+        for s in range(len(self._readers)):
+            if 0 <= pos < self._cum_lens[s + 1]:  # "0 <=" stops a still-negative index aliasing into shard 0
+                return s, pos - self._cum_lens[s]
+        raise IndexError(idx)
+
+    def _data_to_example(self, data: dict) -> TextExample | None:
+        if self.text_field not in data:
+            return None
+        return TextExample(data[self.text_field], language=self.language)
+
+    def __getitem__(self, token):
+        """Random-access lookup of the example addressed by ``token``; only available when ``indexed=True``."""
+        if not self.indexed:
+            raise NotImplementedError("LhotseTextJsonlAdapter only supports __getitem__ when indexed=True.")
+        position = int(normalize_graph_token(token))
+        shard, offset = self._resolve(position)
+        example = self._data_to_example(self._readers[shard][offset])
+        if example is not None:
+            return attach_graph_origin(example, position)
+        # Unlike iteration, random access cannot skip a row lacking the text field, so it raises.
+        missing = f"Index {position} in {self.paths[shard]} has no '{self.text_field}' field; "
+        raise IndexError(missing + "cannot satisfy random-access __getitem__.")
+
+    def state_dict(self) -> dict:
+        return self._iter_state.state_dict() if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._iter_state.load_state_dict(sd)
 
     def __iter__(self) -> Iterator[TextExample]:
+        if self.indexed:
+            yield from self._iter_indexed()
+        else:
+            yield from self._iter_streaming()
+
+    def _iter_indexed(self) -> Iterator[TextExample]:
+        """Yield text examples in the order given by the iterator state, skipping rows without the text field."""
+        n_items = self._cum_lens[-1] if self._cum_lens else 0
+        for position in self._iter_state.iterate(n_items):
+            shard, offset = self._resolve(position)
+            example = self._data_to_example(self._readers[shard][offset])
+            if example is not None:
+                attach_graph_origin(example, position)  # return value unused; the same object is yielded
+                yield example
+
+    def _iter_streaming(self) -> Iterator[TextExample]:
         paths = self.paths
         if self.shuffle_shards:
             seed = resolve_seed(self.shard_seed)
@@ -296,7 +388,7 @@ def default_sft_prompt_format_fn(example: NeMoSFTExample, prompt):
 
 
 @dataclass
-class NeMoSFTJsonlAdapter:
+class NeMoSFTJsonlAdapter(IteratorNode):
     """
     ``NeMoSFTJsonlAdapter`` is used to read a NeMo LM SFT Chat JSONL file and yield objects of type
     ``NeMoSFTExample`` that can be sampled with Lhotse.
@@ -318,17 +410,90 @@ class NeMoSFTJsonlAdapter:
             "dataset": str,
             "category": str,
         }
+
+    Set ``indexed=True`` to enable O(1) random access plus graph-token
+    checkpointing (requires uncompressed ``.jsonl`` paths).
     """
 
     paths: Union[Pathlike, list[Pathlike]]
     language: str | None = None
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
+    indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
 
     def __post_init__(self):
         self.paths = expand_sharded_filepaths(self.paths)
+        self._readers: list = []
+        self._cum_lens: list[int] = []
+        self._iter_state = PartitionedIndexedIterator()
+        if self.indexed:
+            from lhotse.indexing import IndexedJsonlReader, index_file_path
+
+            for p in self.paths:
+                self._readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
+            cum = 0
+            self._cum_lens.append(cum)
+            for r in self._readers:
+                cum += len(r)
+                self._cum_lens.append(cum)
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def __len__(self) -> int:
+        if not self.indexed:
+            raise TypeError("NeMoSFTJsonlAdapter has unknown length unless constructed with indexed=True.")
+        return self._cum_lens[-1] if self._cum_lens else 0
+
+    def _resolve(self, idx: int) -> tuple[int, int]:
+        """Map a global index (negative counts from the end) to ``(shard_idx, local_idx)``."""
+        pos = idx + self._cum_lens[-1] if idx < 0 else idx
+        for s in range(len(self._readers)):
+            if 0 <= pos < self._cum_lens[s + 1]:  # "0 <=" stops a still-negative index aliasing into shard 0
+                return s, pos - self._cum_lens[s]
+        raise IndexError(idx)
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError("NeMoSFTJsonlAdapter only supports __getitem__ when indexed=True.")
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        ex = NeMoSFTExample(self._readers[shard_idx][local_idx], language=self.language)
+        return attach_graph_origin(ex, idx)
+
+    def state_dict(self) -> dict:
+        return self._iter_state.state_dict() if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._iter_state.load_state_dict(sd)
 
     def __iter__(self) -> Iterator[NeMoSFTExample]:
+        if self.indexed:
+            yield from self._iter_indexed()
+        else:
+            yield from self._iter_streaming()
+
+    def _iter_indexed(self) -> Iterator[NeMoSFTExample]:
+        """Yield one ``NeMoSFTExample`` per indexed row, in the order produced by the iterator state."""
+        for position in self._iter_state.iterate(self._cum_lens[-1] if self._cum_lens else 0):
+            shard, offset = self._resolve(position)
+            example = NeMoSFTExample(self._readers[shard][offset], language=self.language)
+            attach_graph_origin(example, position)  # return value unused; the same object is yielded
+            yield example
+
+    def _iter_streaming(self) -> Iterator[NeMoSFTExample]:
         paths = self.paths
         if self.shuffle_shards:
             seed = resolve_seed(self.shard_seed)
@@ -338,6 +503,221 @@ def __iter__(self) -> Iterator[NeMoSFTExample]:
                 yield NeMoSFTExample(data, language=self.language)
 
 
+def _normalize_nemotron_text_sender(sender: str, sample_id: str) -> str:
+    """Canonicalize a ``sender`` label (case-insensitive) to user/assistant/system/tool, else raise ValueError."""
+    label = str(sender).lower()
+    # Aliases accepted for each canonical role; "system" and "tool" have no aliases.
+    synonyms = {"user": ("user", "human"), "assistant": ("assistant", "gpt", "model", "bot")}
+    for canonical, accepted in synonyms.items():
+        if label in accepted:
+            return canonical
+    if label in ("system", "tool"):
+        return label
+    raise ValueError(f"Unsupported sender={sender!r} in Nemotron text conversation sample id={sample_id}")
+
+
+def _flatten_nemotron_text_fragments(fragments: list, sample_id: str) -> str:
+    """Concatenate text fragments (plain strings or ``{"t": "text", "value": ...}`` dicts) into one string."""
+    values = []
+    for fragment in fragments:
+        if isinstance(fragment, str):
+            values.append(fragment)
+            continue
+        if not isinstance(fragment, dict):
+            raise ValueError(
+                f"Unsupported fragment type={type(fragment).__name__} in Nemotron text conversation sample id={sample_id}"
+            )
+        if (fragment_type := fragment.get("t")) not in (None, "text"):
+            raise ValueError(
+                f"Unsupported fragment t={fragment_type!r} in Nemotron text conversation sample id={sample_id}"
+            )
+        values.append("" if (value := fragment.get("value")) is None else str(value))  # null -> "", not "None"
+    return "".join(values)
+
+
+def _transform_nemotron_text_conversation(data: dict, sample_id: str) -> "NeMoMultimodalConversation":
+    """Build a text-only ``NeMoMultimodalConversation`` from one parsed Nemotron/Energon JSON row.
+
+    Raises ``ValueError`` when ``conversation`` is not a list or a turn is not a dict.
+    """
+    raw_turns = data.get("conversation")
+    if not isinstance(raw_turns, list):
+        raise ValueError(f"Nemotron text conversation sample id={sample_id} has no list-valued 'conversation' field")
+
+    parsed = []
+    for entry in raw_turns:
+        if not isinstance(entry, dict):
+            raise ValueError(
+                f"Unsupported turn type={type(entry).__name__} in Nemotron text conversation sample id={sample_id}"
+            )
+        speaker = _normalize_nemotron_text_sender(entry.get("sender"), sample_id)
+        text = _flatten_nemotron_text_fragments(entry.get("fragments", []), sample_id)
+        parsed.append(TextTurn(value=text, role=speaker))
+    return NeMoMultimodalConversation(id=str(data.get("id") or sample_id), turns=parsed, custom=data.get("custom"))
+
+
+@dataclass
+class NemotronTextConversationAdapter(IteratorNode):
+    """
+    Read Nemotron/Energon text-only conversation data.
+
+    Supported inputs are JSONL files and materialized tar directories whose JSON
+    rows contain ``conversation`` turns with ``sender`` and ``fragments`` fields.
+    """
+
+    paths: Union[Pathlike, list[Pathlike]]
+    shuffle_shards: bool = False
+    shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
+    indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
+
+    def __post_init__(self):
+        paths = [self.paths] if isinstance(self.paths, (str, Path)) else list(self.paths)
+        self.paths = [str(p) for raw in paths for p in expand_sharded_filepaths(str(raw))]
+        self._readers: list = []  # one reader per indexed source (JSONL file or tar shard)
+        self._reader_kinds: list[str] = []  # "jsonl" or "tar", parallel to _readers
+        self._source_paths: list[str] = []  # path each reader was opened from, parallel to _readers
+        self._cum_lens: list[int] = []  # prefix sums of reader lengths; filled only in indexed mode
+        self._iter_state = PartitionedIndexedIterator()
+        if self.indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        from lhotse.indexing import IndexedJsonlReader, index_file_path
+
+        for p in self.paths:
+            path = Path(p)
+            if path.is_dir():
+                tar_paths = sorted(path.rglob("*.tar"))  # sorted, so reader order is deterministic
+                if not tar_paths:
+                    raise FileNotFoundError(f"No .tar files found under Nemotron text conversation directory: {path}")
+                for tar_path in tar_paths:
+                    self._add_indexed_tar_reader(str(tar_path), index_file_path(str(tar_path), self.indexes_root))
+            elif path.suffix == ".tar":
+                self._add_indexed_tar_reader(p, index_file_path(p, self.indexes_root))
+            else:  # anything that is neither a directory nor *.tar is read as JSONL
+                self._readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
+                self._reader_kinds.append("jsonl")
+                self._source_paths.append(p)
+        cum = 0
+        self._cum_lens.append(cum)  # leading 0, so _cum_lens[s] is the first global index of reader s
+        for reader in self._readers:
+            cum += len(reader)
+            self._cum_lens.append(cum)
+
+    def _add_indexed_tar_reader(self, tar_path: str, idx_path: Pathlike) -> None:
+        self._readers.append(IndexedTarMemberReader(tar_path, idx_path=idx_path))
+        self._reader_kinds.append("tar")
+        self._source_paths.append(tar_path)
+
+    def __len__(self) -> int:
+        if not self.indexed:
+            raise TypeError("NemotronTextConversationAdapter has unknown length unless constructed with indexed=True.")
+        return self._cum_lens[-1] if self._cum_lens else 0
+
+    def _resolve(self, idx: int) -> tuple[int, int]:
+        """Map a global index (negative counts from the end) to ``(shard_idx, local_idx)``."""
+        pos = idx + self._cum_lens[-1] if idx < 0 else idx
+        for shard_idx in range(len(self._readers)):
+            if 0 <= pos < self._cum_lens[shard_idx + 1]:  # "0 <=" stops a still-negative index aliasing
+                return shard_idx, pos - self._cum_lens[shard_idx]
+        raise IndexError(idx)
+
+    def _data_to_conversation(
+        self, data: dict, source_path: Union[str, Path], local_idx: int
+    ) -> "NeMoMultimodalConversation":
+        sample_id = f"{Path(source_path).stem}-{local_idx:012d}"  # same id scheme as _iter_jsonl
+        return _transform_nemotron_text_conversation(data, sample_id)
+
+    def _reader_item_to_conversation(self, shard_idx: int, local_idx: int) -> "NeMoMultimodalConversation":
+        item = self._readers[shard_idx][local_idx]
+        source_path = self._source_paths[shard_idx]
+        if self._reader_kinds[shard_idx] == "tar":
+            name, payload = item  # tar readers hand back (member name, payload)
+            if not name.endswith(".json"):
+                raise RuntimeError(
+                    f"Index {local_idx} in {source_path} points to non-JSON tar member {name!r}; "
+                    "Nemotron text conversation tar shards are expected to contain JSON samples."
+                )
+            return _transform_nemotron_text_conversation(json.loads(payload), Path(name).stem)
+        return self._data_to_conversation(item, source_path, local_idx)
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError("NemotronTextConversationAdapter only supports __getitem__ when indexed=True.")
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        conversation = self._reader_item_to_conversation(shard_idx, local_idx)
+        return attach_graph_origin(conversation, idx)
+
+    def state_dict(self) -> dict:
+        return self._iter_state.state_dict() if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._iter_state.load_state_dict(sd)
+
+    def __iter__(self) -> Iterator["NeMoMultimodalConversation"]:
+        if self.indexed:
+            yield from self._iter_indexed()
+            return
+        yield from self._iter_streaming()
+
+    def _iter_indexed(self) -> Iterator["NeMoMultimodalConversation"]:
+        total = self._cum_lens[-1] if self._cum_lens else 0
+        for global_idx in self._iter_state.iterate(total):
+            shard_idx, local_idx = self._resolve(global_idx)
+            conversation = self._reader_item_to_conversation(shard_idx, local_idx)
+            attach_graph_origin(conversation, global_idx)  # return value unused; the same object is yielded
+            yield conversation
+
+    def _iter_streaming(self) -> Iterator["NeMoMultimodalConversation"]:
+        paths = list(self.paths)
+        if self.shuffle_shards:
+            random.Random(resolve_seed(self.shard_seed)).shuffle(paths)  # shuffles the local copy only
+        for path in paths:
+            yield from self._iter_path(Path(path))
+
+    def _iter_path(self, path: Path) -> Iterator["NeMoMultimodalConversation"]:
+        if path.is_dir():
+            tar_paths = sorted(path.rglob("*.tar"))
+            if not tar_paths:
+                raise FileNotFoundError(f"No .tar files found under Nemotron text conversation directory: {path}")
+            for tar_path in tar_paths:
+                yield from self._iter_tar(tar_path)
+        elif path.suffix == ".tar":
+            yield from self._iter_tar(path)
+        else:
+            yield from self._iter_jsonl(path)
+
+    def _iter_jsonl(self, path: Path) -> Iterator["NeMoMultimodalConversation"]:
+        for idx, data in enumerate(load_jsonl(path)):
+            sample_id = f"{path.stem}-{idx:012d}"
+            yield _transform_nemotron_text_conversation(data, sample_id)
+
+    def _iter_tar(self, path: Path) -> Iterator["NeMoMultimodalConversation"]:
+        with tarfile.open(path, "r:*") as tar:
+            for info in tar:
+                if not info.isfile() or not info.name.endswith(".json"):  # only regular *.json members are samples
+                    continue
+                data = json.load(tar.extractfile(info))
+                sample_id = Path(info.name).stem
+                yield _transform_nemotron_text_conversation(data, sample_id)
+
+
 """
 NeMoMultimodalConversation: data types, file parser, default prompt formatting logic.
 """
@@ -596,7 +976,7 @@ def _make_url_cut(
 
 
 @dataclass
-class NeMoMultimodalConversationJsonlAdapter:
+class NeMoMultimodalConversationJsonlAdapter(IteratorNode):
     """
     ``NeMoMultimodalConversationJsonlAdapter`` is used to read a NeMo multimodal conversation JSONL
     and yield objects of type ``NeMoMultimodalConversation`` that can be sampled with Lhotse.
@@ -615,6 +995,11 @@ class NeMoMultimodalConversationJsonlAdapter:
                 ...
             ],
         }
+
+    Set ``indexed=True`` to enable O(1) random access plus graph-token
+    checkpointing. Indexed mode requires uncompressed JSONL manifests; for the
+    tarred path it additionally requires uncompressed tar shards (the canonical
+    ``.idx`` sidecars are built lazily on first construction).
     """
 
     manifest_filepath: str | list[str]
@@ -626,6 +1011,8 @@ class NeMoMultimodalConversationJsonlAdapter:
     system_prompt: str | None = None
     context: str | None = None
     slice_length: int | None = None
+    indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
 
     def __post_init__(self):
         self.manifest_filepath = expand_sharded_filepaths(self.manifest_filepath)
@@ -635,13 +1022,210 @@ def __post_init__(self):
                 self.tarred_audio_filepaths
             ), f"{len(self.manifest_filepath)} != {len(self.tarred_audio_filepaths)}"
         self.epoch = 0
+        self._cuts_readers: list = []
+        self._tar_readers: list = []
+        self._cum_lens: list[int] = []
+        self._total_len = 0
+        self._iter_state = PartitionedIndexedIterator()
+        if self.indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        from lhotse.indexing import IndexedJsonlReader, index_file_path
+
+        if self.slice_length is not None:  # indexed mode rejects slice_length outright
+            raise ValueError("NeMoMultimodalConversationJsonlAdapter(indexed=True) does not support slice_length.")
+        for p in self.manifest_filepath:  # one JSONL reader per manifest shard
+            self._cuts_readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
+        if self.tarred_audio_filepaths is not None:
+            from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
+
+            for p in self.tarred_audio_filepaths:  # one tar reader per tar shard, in list order
+                self._tar_readers.append(IndexedTarMemberReader(p, idx_path=index_file_path(p, self.indexes_root)))
+        cum = 0
+        self._cum_lens.append(cum)  # leading 0, so _cum_lens[s] is the first global index of shard s
+        for r in self._cuts_readers:
+            cum += len(r)
+            self._cum_lens.append(cum)
+        self._total_len = cum  # total number of manifest rows across all shards
+
+    def __len__(self) -> int:
+        if self.indexed:
+            return self._total_len
+        raise TypeError(
+            "NeMoMultimodalConversationJsonlAdapter has unknown length unless constructed with indexed=True."
+        )
+
+    def _resolve(self, idx: int) -> tuple[int, int]:
+        """Map a global index (negative counts from the end) to ``(shard_idx, local_idx)``."""
+        pos = idx + self._total_len if idx < 0 else idx
+        for s in range(len(self._cuts_readers)):
+            if 0 <= pos < self._cum_lens[s + 1]:  # "0 <=" stops a still-negative index aliasing into shard 0
+                return s, pos - self._cum_lens[s]
+        raise IndexError(idx)
+
+    def state_dict(self) -> dict:
+        return {**self._iter_state.state_dict(), "epoch": self.epoch} if self.indexed else {}  # iterator state + epoch
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:  # nothing is restored outside indexed mode
+            return
+        self._iter_state.load_state_dict(sd)  # NOTE(review): sd also holds "epoch"; confirm it is tolerated
+        self.epoch = sd.get("epoch", 0)  # a missing key restores epoch 0
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "NeMoMultimodalConversationJsonlAdapter only supports __getitem__ when indexed=True."
+            )
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        data = self._cuts_readers[shard_idx][local_idx]  # manifest row for this index
+        if self._tar_readers:  # tarred mode: audio comes from the tar reader at the same shard position
+            convo = self._build_conversation_tarred(
+                data,
+                tar_reader=self._tar_readers[shard_idx],
+                tar_path=self.tarred_audio_filepaths[shard_idx],
+            )
+        else:  # non-tarred mode: the manifest's own path is passed along with the row
+            convo = self._build_conversation_local(data, manifest_path=self._cuts_readers[shard_idx].path)
+        if convo is None:  # a builder returned None; random access cannot skip, so it raises
+            raise IndexError(
+                f"Conversation at index {idx} (shard {shard_idx}, local {local_idx}) "
+                f"could not be built; cannot satisfy random-access __getitem__."
+            )
+        return attach_graph_origin(convo, idx)
+
+    def _build_conversation_local(self, data: dict, manifest_path: str) -> NeMoMultimodalConversation | None:
+        if self._should_skip(data):  # rows rejected by _should_skip produce no conversation
+            return None
+        turns = [
+            (
+                TextTurn(
+                    value=turn["value"],
+                    role=turn["from"].lower(),
+                )
+                if turn["type"] == "text"  # every non-"text" turn is built as an AudioTurn
+                else AudioTurn(
+                    cut=(  # the walrus binds the truncated cut so the id and text expressions can reuse it
+                        cut := Recording.from_file(get_full_path(turn["value"], manifest_path))
+                        .to_cut()
+                        .truncate(offset=turn.get("offset", 0.0), duration=turn.get("duration"))
+                    ).with_id(self._make_cut_id(cut, turn)),
+                    text=cut.supervisions[0].text if cut.supervisions else None,
+                    role=turn["from"].lower(),
+                    audio_locator_tag=self.audio_locator_tag,
+                )
+            )
+            for turn in data["conversations"]
+        ]
+        if self.context is not None and turns[0].role == "user" and isinstance(turns[0], AudioTurn):
+            turns = [TextTurn(role="user", value=self.context)] + turns  # context precedes a leading user audio turn
+        if self.system_prompt is not None and turns[0].role != "system":
+            turns = [TextTurn(role="system", value=self.system_prompt)] + turns  # prepended last, so it ends up first
+        return NeMoMultimodalConversation(
+            id=data["id"],
+            turns=turns,
+            token_equivalent_duration=self.token_equivalent_duration,
+            custom=data.get("custom"),
+        )
+
+    def _build_conversation_tarred(self, data: dict, tar_reader, tar_path: str) -> NeMoMultimodalConversation | None:
+        import io as _io
+
+        import soundfile as _sf
+        from lhotse import AudioSource as _AudioSource
+        from lhotse import Recording as _Recording
+
+        if self._should_skip(data):  # rows rejected by _should_skip produce no conversation
+            return None
+        cuts: list = []  # one cut per "audio" turn, in conversation order
+        for turn in data["conversations"]:
+            if turn["type"] != "audio":
+                continue
+            audio_bytes = tar_reader.get(turn["value"])  # tar member looked up by the turn's "value"
+            try:
+                meta = _sf.info(_io.BytesIO(audio_bytes))  # header probe supplies rate, frames and duration
+            except Exception:  # any probe failure drops the whole conversation
+                logging.warning(f"Skipped corrupted audio member '{turn['value']}' in {tar_path=}.")
+                return None
+            recording = _Recording(
+                id=turn["value"],
+                sources=[_AudioSource(type="memory", channels=list(range(meta.channels)), source=audio_bytes)],
+                sampling_rate=int(meta.samplerate),
+                num_samples=meta.frames,
+                duration=meta.duration,
+            )
+            cut = recording.to_cut().truncate(offset=turn.get("offset", 0.0), duration=turn.get("duration"))
+            cut = cut.with_id(self._make_cut_id(cut, turn))
+            cuts.append(cut)
+        cuts = deque(cuts)  # consumed front-to-back while the turn list is rebuilt below
+        turns = [
+            (
+                TextTurn(
+                    value=turn["value"],
+                    role=turn["from"].lower(),
+                )
+                if turn["type"] == "text"  # NOTE(review): any non-"text" type pops a cut; the loop above kept "audio" only
+                else AudioTurn(
+                    cut=(c := cuts.popleft()),
+                    text=c.supervisions[0].text if c.supervisions else None,
+                    role=turn["from"].lower(),
+                    audio_locator_tag=self.audio_locator_tag,
+                )
+            )
+            for turn in data["conversations"]
+        ]
+        if self.context is not None and turns[0].role == "user" and isinstance(turns[0], AudioTurn):
+            turns = [TextTurn(role="user", value=self.context)] + turns  # context precedes a leading user audio turn
+        if self.system_prompt is not None and turns[0].role != "system":
+            turns = [TextTurn(role="system", value=self.system_prompt)] + turns  # prepended last, so it ends up first
+        return NeMoMultimodalConversation(
+            id=data["id"],
+            turns=turns,
+            token_equivalent_duration=self.token_equivalent_duration,
+            custom=data.get("custom"),
+        )
 
     def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
+        if self.indexed:
+            yield from self._iter_indexed()
+            return
         if self.tarred_audio_filepaths is not None:
             yield from self._iter_tar()
         else:
             yield from self._iter_jsonl()
 
+    def _iter_indexed(self) -> Iterator[NeMoMultimodalConversation]:
+        """Yield conversations in iterator-state order; rows whose builder returns ``None`` are skipped."""
+        for position in self._iter_state.iterate(self._total_len):
+            shard, offset = self._resolve(position)
+            manifest = self._cuts_readers[shard]
+            row = manifest[offset]
+            if not self._tar_readers:
+                built = self._build_conversation_local(row, manifest_path=manifest.path)
+            else:
+                tar_reader, tar_path = self._tar_readers[shard], self.tarred_audio_filepaths[shard]
+                built = self._build_conversation_tarred(row, tar_reader=tar_reader, tar_path=tar_path)
+            if built is not None:
+                attach_graph_origin(built, position)  # return value unused; the same object is yielded
+                yield built
+        # Runs only after the loop above finishes, i.e. once the iterator state
+        # has been exhausted for this pass.
+        self.epoch += 1
+
     def _should_skip(self, example: dict) -> bool:
         custom = example.get("custom")
         if custom is None:
@@ -787,65 +1371,8 @@ def _iter_jsonl(self):
         self.epoch += 1
 
 
-def _normalize_audio_placeholders(val: Union[str, list[str], None]) -> list[str]:
-    if val is None:
-        return ["<sound>", "<speech>"]
-    return [val] if isinstance(val, str) else list(val)
-
-
-def _transform_sharegpt(placeholders: list[str], data: dict, audio_path_fallback: str | None = None) -> list[dict]:
-    """Parse a ShareGPT dict into a flat list of ``{"type", "from", "value", ...}`` turn dicts."""
-    conversations = []
-    audio_path = data.get("sound") or data.get("ori_sound") or audio_path_fallback
-    for turn in data["conversations"]:
-        role = "user" if turn["from"].lower() in ("human", "user") else "assistant"
-        found = next((p for p in placeholders if p in turn["value"]), None)
-        if found:
-            parts = turn["value"].split(found)
-            if parts[0].strip():
-                conversations.append({"type": "text", "from": role.title(), "value": parts[0].strip()})
-            if not audio_path:
-                raise ValueError(
-                    f"Conversation turn contains audio placeholder '{found}' but no audio path "
-                    f"was found in 'sound', 'ori_sound' fields or fallback for sample id={data.get('id', '?')}"
-                )
-            conversations.append(
-                {
-                    "type": "audio",
-                    "from": role.title(),
-                    "value": audio_path,
-                    "duration": turn.get("duration", None),
-                    "offset": turn.get("offset", 0.0),
-                }
-            )
-            if len(parts) > 1 and parts[1].strip():
-                conversations.append({"type": "text", "from": role.title(), "value": parts[1].strip()})
-        else:
-            conversations.append({"type": "text", "from": role.title(), "value": turn["value"]})
-    return conversations
-
-
-def _create_sharegpt_turns(audio_locator_tag: str, conversations: list[dict], resolve_cut) -> list:
-    """Build ``TextTurn`` / ``AudioTurn`` objects.  *resolve_cut(turn_dict) -> Cut* supplies audio."""
-    turns = []
-    for t in conversations:
-        if t["type"] == "text":
-            turns.append(TextTurn(value=t["value"], role=t["from"].lower()))
-        else:
-            cut = resolve_cut(t)
-            turns.append(
-                AudioTurn(
-                    cut=cut,
-                    text=cut.supervisions[0].text if cut.supervisions else None,
-                    role=t["from"].lower(),
-                    audio_locator_tag=audio_locator_tag,
-                )
-            )
-    return turns
-
-
 @dataclass
-class NeMoMultimodalConversationShareGPTJsonlAdapter:
+class NeMoMultimodalConversationShareGPTJsonlAdapter(IteratorNode):
     """
     ``NeMoMultimodalConversationShareGPTJsonlAdapter`` is used to read a ShareGPT format multimodal
     conversation JSONL and yield objects of type ``NeMoMultimodalConversation`` that can be sampled with Lhotse.
@@ -878,6 +1405,9 @@ class NeMoMultimodalConversationShareGPTJsonlAdapter:
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
     slice_length: int | None = None
+    indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
+    skip_missing_manifest_entries: bool = False
 
     def __post_init__(self):
         self.manifest_filepath = expand_sharded_filepaths(self.manifest_filepath)
@@ -887,17 +1417,167 @@ def __post_init__(self):
                 self.tarred_audio_filepaths
             ), f"{len(self.manifest_filepath)} != {len(self.tarred_audio_filepaths)}"
         self.audio_placeholders = _normalize_audio_placeholders(self.audio_placeholders)
-        self._has_index = all(Path(p + ".idx").exists() for p in self.manifest_filepath)
         self.epoch = 0
+        self._cuts_readers: list = []
+        self._tar_readers: list = []
+        self._cum_lens: list[int] = []
+        self._total_len = 0
+        self._iter_state = PartitionedIndexedIterator()
+        if self.indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        from lhotse.indexing import IndexedJsonlReader, index_file_path
+
+        if self.slice_length is not None:
+            raise ValueError(
+                "NeMoMultimodalConversationShareGPTJsonlAdapter(indexed=True) does not support slice_length."
+            )
+        for p in self.manifest_filepath:
+            self._cuts_readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
+        if self.tarred_audio_filepaths is not None:
+            from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
+
+            for p in self.tarred_audio_filepaths:
+                self._tar_readers.append(IndexedTarMemberReader(p, idx_path=index_file_path(p, self.indexes_root)))
+        cum = 0
+        self._cum_lens.append(cum)
+        for r in self._cuts_readers:
+            cum += len(r)
+            self._cum_lens.append(cum)
+        self._total_len = cum
+
+    def __len__(self) -> int:
+        if self.indexed:
+            return self._total_len
+        raise TypeError(
+            "NeMoMultimodalConversationShareGPTJsonlAdapter has unknown length unless constructed with indexed=True."
+        )
+
+    def _resolve(self, idx: int) -> tuple[int, int]:
+        """Map ``idx`` (negative = from end) to ``(shard_idx, local_idx)``; ``IndexError`` if out of range."""
+        idx = idx + self._total_len if idx < 0 else idx
+        for s in range(len(self._cuts_readers)):
+            if 0 <= idx < self._cum_lens[s + 1]:  # ``0 <=`` rejects idx < -len instead of mis-resolving
+                return s, idx - self._cum_lens[s]
+        raise IndexError(idx)
+
+    def state_dict(self) -> dict:
+        return {**self._iter_state.state_dict(), "epoch": self.epoch} if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._iter_state.load_state_dict(sd)
+        self.epoch = sd.get("epoch", 0)
+
+    def _build_one(self, data: dict, shard_idx: int) -> NeMoMultimodalConversation | None:
+        try:
+            conversations = _ShareGPTConversationParser(self.audio_placeholders, data).transform()
+            if self._tar_readers:
+                tar_reader = self._tar_readers[shard_idx]
+                tar_path = self.tarred_audio_filepaths[shard_idx]
+                return NeMoMultimodalConversation(
+                    id=data.get("id", "missing-example-id"),
+                    turns=_ShareGPTConversationParser.create_turns(
+                        self.audio_locator_tag,
+                        conversations,
+                        lambda t: self._resolve_cut_from_indexed_tar(t, tar_reader, tar_path),
+                    ),
+                    token_equivalent_duration=self.token_equivalent_duration,
+                )
+            manifest_path = self._cuts_readers[shard_idx].path
+            return NeMoMultimodalConversation(
+                id=data.get("id", "missing-example-id"),
+                turns=_ShareGPTConversationParser.create_turns(
+                    self.audio_locator_tag,
+                    conversations,
+                    lambda t, _p=manifest_path: self._resolve_cut_from_path(t, _p),
+                ),
+                token_equivalent_duration=self.token_equivalent_duration,
+            )
+        except _SHAREGPT_AUDIO_LOADING_ERRORS as e:
+            if not self.skip_missing_manifest_entries:
+                raise
+            logging.warning(
+                "Skipping ShareGPT sample due to audio loading failure: "
+                f"sample_id={data.get('id', 'missing-example-id')!r} shard_idx={shard_idx} "
+                f"error={type(e).__name__}: {e}"
+            )
+            return None
+
+    def _resolve_cut_from_indexed_tar(self, turn, tar_reader, tar_path):
+        import io as _io
+
+        import soundfile as _sf
+        from lhotse import AudioSource as _AudioSource
+        from lhotse import Recording as _Recording
+
+        audio_path = os.fspath(
+            _ShareGPTConversationParser.expect_one_audio_path(
+                turn["value"], sample_id=turn.get("id", "?"), context="audio turn value"
+            )
+        )
+        turn_for_id = {**turn, "value": audio_path}
+        audio_bytes = tar_reader.get(audio_path)
+        meta = _sf.info(_io.BytesIO(audio_bytes))
+        recording = _Recording(
+            id=audio_path,
+            sources=[_AudioSource(type="memory", channels=list(range(meta.channels)), source=audio_bytes)],
+            sampling_rate=int(meta.samplerate),
+            num_samples=meta.frames,
+            duration=meta.duration,
+        )
+        cut = recording.to_cut().truncate(offset=turn.get("offset", 0.0), duration=turn.get("duration"))
+        return cut.with_id(self._make_cut_id(cut, turn_for_id))
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "NeMoMultimodalConversationShareGPTJsonlAdapter only supports __getitem__ when indexed=True."
+            )
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        data = self._cuts_readers[shard_idx][local_idx]
+        convo = self._build_one(data, shard_idx)
+        if convo is None:
+            raise IndexError(
+                f"ShareGPT sample at global index {idx} is not decodable; cannot satisfy random-access __getitem__."
+            )
+        return attach_graph_origin(convo, idx)
 
     def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
+        if self.indexed:
+            yield from self._iter_indexed_node()
+            return
         if self.tarred_audio_filepaths is not None:
             yield from self._iter_tar()
-        elif self.shuffle_shards and self._has_index:
-            yield from self._iter_jsonl_indexed()
         else:
             yield from self._iter_jsonl()
 
+    def _iter_indexed_node(self) -> Iterator[NeMoMultimodalConversation]:
+        for global_idx in self._iter_state.iterate(self._total_len):
+            shard_idx, local_idx = self._resolve(global_idx)
+            data = self._cuts_readers[shard_idx][local_idx]
+            convo = self._build_one(data, shard_idx)
+            if convo is None:
+                continue
+            attach_graph_origin(convo, global_idx)
+            yield convo
+        self.epoch += 1
+
     def _get_rng(self) -> random.Random:
         return random.Random(resolve_seed(self.shard_seed) + self.epoch)
 
@@ -909,14 +1589,22 @@ def _make_cut_id(self, cut, turn) -> str:
         return Path(turn['value']).stem
 
     def _resolve_cut_from_path(self, turn, manifest_path):
-        if is_valid_url(turn["value"]):
-            data = open_best(turn["value"], "rb").read()
-            cut = Recording.from_bytes(data, recording_id=turn["value"]).to_cut()
+        audio_path = os.fspath(
+            _ShareGPTConversationParser.expect_one_audio_path(
+                turn["value"], sample_id=turn.get("id", "?"), context="audio turn value"
+            )
+        )
+        turn_for_id = {**turn, "value": audio_path}
+        if is_valid_url(audio_path):
+            data = open_best(audio_path, "rb").read()
+            cut = Recording.from_bytes(data, recording_id=audio_path).to_cut()
         elif self.audio_root is not None:
-            cut = Recording.from_file(get_full_path(turn["value"], data_dir=self.audio_root)).to_cut()
+            cut = Recording.from_file(get_full_path(audio_path, data_dir=self.audio_root)).to_cut()
         else:
-            cut = Recording.from_file(get_full_path(turn["value"], manifest_path)).to_cut()
-        return cut.truncate(offset=turn["offset"], duration=turn["duration"]).with_id(self._make_cut_id(cut, turn))
+            cut = Recording.from_file(get_full_path(audio_path, manifest_path)).to_cut()
+        return cut.truncate(offset=turn["offset"], duration=turn["duration"]).with_id(
+            self._make_cut_id(cut, turn_for_id)
+        )
 
     def _iter_tar(self):
         # See NeMoMultimodalConversationJsonlAdapter._iter_tar for GetBatch-mode rationale.
@@ -937,7 +1625,7 @@ def _iter_tar(self):
             )
             cntr = 0
             for idx, data in enumerate(jsonl):
-                conversations = _transform_sharegpt(self.audio_placeholders, data)
+                conversations = _ShareGPTConversationParser(self.audio_placeholders, data).transform()
                 audio_turns = [t for t in conversations if t["type"] == "audio"]
                 cuts = []
                 for turn in audio_turns:
@@ -972,7 +1660,9 @@ def _iter_tar(self):
 
                 yield NeMoMultimodalConversation(
                     id=data.get("id", "missing-example-id"),
-                    turns=_create_sharegpt_turns(self.audio_locator_tag, conversations, lambda t: cuts.popleft()),
+                    turns=_ShareGPTConversationParser.create_turns(
+                        self.audio_locator_tag, conversations, lambda t: cuts.popleft()
+                    ),
                     token_equivalent_duration=self.token_equivalent_duration,
                 )
                 cntr += 1
@@ -990,41 +1680,30 @@ def _iter_jsonl(self):
                 jsonl_iter = list(jsonl_iter)
                 rng.shuffle(jsonl_iter)
             for data in jsonl_iter:
-                conversations = _transform_sharegpt(self.audio_placeholders, data)
-                yield NeMoMultimodalConversation(
-                    id=data.get("id", "missing-example-id"),
-                    turns=_create_sharegpt_turns(
-                        self.audio_locator_tag,
-                        conversations,
-                        lambda t, _p=path: self._resolve_cut_from_path(t, _p),
-                    ),
-                    token_equivalent_duration=self.token_equivalent_duration,
-                )
-        self.epoch += 1
-
-    def _iter_jsonl_indexed(self):
-        paths = list(self.manifest_filepath)
-        rng = self._get_rng()
-        rng.shuffle(paths)
-        for path in paths:
-            reader = IndexedJSONLReader(path)
-            for idx in LazyShuffledRange(len(reader), rng):
-                data = reader[idx]
-                conversations = _transform_sharegpt(self.audio_placeholders, data)
-                yield NeMoMultimodalConversation(
-                    id=data.get("id", "missing-example-id"),
-                    turns=_create_sharegpt_turns(
-                        self.audio_locator_tag,
-                        conversations,
-                        lambda t, _p=path: self._resolve_cut_from_path(t, _p),
-                    ),
-                    token_equivalent_duration=self.token_equivalent_duration,
-                )
+                try:
+                    conversations = _ShareGPTConversationParser(self.audio_placeholders, data).transform()
+                    yield NeMoMultimodalConversation(
+                        id=data.get("id", "missing-example-id"),
+                        turns=_ShareGPTConversationParser.create_turns(
+                            self.audio_locator_tag,
+                            conversations,
+                            lambda t, _p=path: self._resolve_cut_from_path(t, _p),
+                        ),
+                        token_equivalent_duration=self.token_equivalent_duration,
+                    )
+                except _SHAREGPT_AUDIO_LOADING_ERRORS as e:
+                    if not self.skip_missing_manifest_entries:
+                        raise
+                    logging.warning(
+                        "Skipping ShareGPT sample due to audio loading failure: "
+                        f"sample_id={data.get('id', 'missing-example-id')!r} manifest_path={path} "
+                        f"error={type(e).__name__}: {e}"
+                    )
         self.epoch += 1
 
 
 @dataclass
-class NeMoMultimodalConversationShareGPTWebdatasetAdapter:
+class NeMoMultimodalConversationShareGPTWebdatasetAdapter(IteratorNode):
     """
     ``NeMoMultimodalConversationShareGPTWebdatasetAdapter`` reads ShareGPT format multimodal
     conversations from WebDataset tar archives and yields ``NeMoMultimodalConversation`` objects.
@@ -1059,6 +1738,8 @@ class NeMoMultimodalConversationShareGPTWebdatasetAdapter:
     token_equivalent_duration: float = None
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
+    indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
 
     def __post_init__(self):
         import json as _json
@@ -1073,14 +1754,87 @@ def __post_init__(self):
             if not self._shard_paths:
                 raise FileNotFoundError(f"No wids-meta.json and no .tar files found under {self.data_dir}")
         self.audio_placeholders = _normalize_audio_placeholders(self.audio_placeholders)
-        self._has_index = all(Path(p + ".idx").exists() for p in self._shard_paths)
         self.epoch = 0
+        self._tar_readers: list = []
+        self._cum_lens: list[int] = []
+        self._total_len = 0
+        self._iter_state = PartitionedIndexedIterator()
+        if self.indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        from lhotse.indexing import index_file_path
+
+        for p in self._shard_paths:
+            self._tar_readers.append(IndexedTarSampleReader(p, idx_path=index_file_path(p, self.indexes_root)))
+        cum = 0
+        self._cum_lens.append(cum)
+        for r in self._tar_readers:
+            cum += len(r)
+            self._cum_lens.append(cum)
+        self._total_len = cum
+
+    def __len__(self) -> int:
+        if self.indexed:
+            return self._total_len
+        raise TypeError(
+            "NeMoMultimodalConversationShareGPTWebdatasetAdapter has unknown length unless constructed with indexed=True."
+        )
+
+    def _resolve(self, idx: int) -> tuple[int, int]:
+        """Map ``idx`` (negative = from end) to ``(shard_idx, local_idx)``; ``IndexError`` if out of range."""
+        idx = idx + self._total_len if idx < 0 else idx
+        for s in range(len(self._tar_readers)):
+            if 0 <= idx < self._cum_lens[s + 1]:  # ``0 <=`` rejects idx < -len instead of mis-resolving
+                return s, idx - self._cum_lens[s]
+        raise IndexError(idx)
+
+    def state_dict(self) -> dict:
+        return {**self._iter_state.state_dict(), "epoch": self.epoch} if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._iter_state.load_state_dict(sd)
+        self.epoch = sd.get("epoch", 0)
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "NeMoMultimodalConversationShareGPTWebdatasetAdapter only supports __getitem__ when indexed=True."
+            )
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        json_data, audio_bytes, audio_name = self._tar_readers[shard_idx][local_idx]
+        convo = self._yield_from_sample(json_data, audio_bytes, audio_name)
+        return attach_graph_origin(convo, idx)
 
     def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
-        if self.shuffle_shards and self._has_index:
-            yield from self._iter_indexed()
-        else:
-            yield from self._iter_sequential()
+        if self.indexed:
+            yield from self._iter_indexed_node()
+            return
+        yield from self._iter_sequential()
+
+    def _iter_indexed_node(self) -> Iterator[NeMoMultimodalConversation]:
+        for global_idx in self._iter_state.iterate(self._total_len):
+            shard_idx, local_idx = self._resolve(global_idx)
+            json_data, audio_bytes, audio_name = self._tar_readers[shard_idx][local_idx]
+            convo = self._yield_from_sample(json_data, audio_bytes, audio_name)
+            attach_graph_origin(convo, global_idx)
+            yield convo
+        self.epoch += 1
 
     def _get_rng(self) -> random.Random:
         return random.Random(resolve_seed(self.shard_seed) + self.epoch)
@@ -1088,11 +1842,11 @@ def _get_rng(self) -> random.Random:
     def _yield_from_sample(self, json_data, audio_bytes, audio_name):
         sample_id = Path(audio_name).stem
         recording = Recording.from_bytes(audio_bytes, recording_id=sample_id)
-        conversations = _transform_sharegpt(self.audio_placeholders, json_data, audio_name)
+        conversations = _ShareGPTConversationParser(self.audio_placeholders, json_data, audio_name).transform()
         base_cut = recording.to_cut()
         return NeMoMultimodalConversation(
             id=json_data.get("id", sample_id),
-            turns=_create_sharegpt_turns(
+            turns=_ShareGPTConversationParser.create_turns(
                 self.audio_locator_tag,
                 conversations,
                 lambda t: base_cut.truncate(offset=t.get("offset", 0.0), duration=t.get("duration")),
@@ -1118,17 +1872,6 @@ def _iter_sequential(self):
                     yield self._yield_from_sample(json_data, audio_bytes, audio_name)
         self.epoch += 1
 
-    def _iter_indexed(self):
-        shard_paths = list(self._shard_paths)
-        rng = self._get_rng()
-        rng.shuffle(shard_paths)
-        for tar_path in shard_paths:
-            reader = IndexedTarSampleReader(tar_path)
-            for idx in LazyShuffledRange(len(reader), rng):
-                json_data, audio_bytes, audio_name = reader[idx]
-                yield self._yield_from_sample(json_data, audio_bytes, audio_name)
-        self.epoch += 1
-
 
 class TarIterator:
     """
@@ -1240,3 +1983,193 @@ def _setup_writers(self):
             Path(self.output_dir).mkdir(exist_ok=True)
         self.manifest_writer = JsonlShardWriter(f"{self.output_dir}/manifest_{self.shard_idx}.jsonl", shard_size=None)
         self.tar_writer = AudioTarWriter(f"{self.output_dir}/audio_{self.shard_idx}.tar", shard_size=None)
+
+
+_SHAREGPT_AUDIO_LOADING_ERRORS = (AudioLoadingError, OSError)
+
+
+def _normalize_audio_placeholders(val: Union[str, list[str], None]) -> list[str]:
+    if val is None:
+        return ["<sound>", "<speech>"]
+    return [val] if isinstance(val, str) else list(val)
+
+
+class _ShareGPTConversationParser:
+    """Normalize ShareGPT multimodal records for the conversation adapters.
+
+    ShareGPT audio examples are intentionally loose: audio paths may be stored
+    in ``sound`` or ``ori_sound``, may be scalar or list-valued, and placement
+    in the text is expressed with placeholders such as ``<sound>``. This class
+    owns those conventions and emits the flat internal turn dictionaries shared
+    by the JSONL and WebDataset adapters.
+    """
+
+    def __init__(self, placeholders: list[str], data: dict, audio_path_fallback: str | None = None) -> None:
+        self.placeholders = placeholders
+        self.data = data
+        self.sample_id = data.get("id", "?")
+        audio_path_value = data.get("sound") or data.get("ori_sound") or audio_path_fallback
+        self.audio_paths = self.normalize_audio_paths(audio_path_value, sample_id=self.sample_id, field_name="sound")
+
+    def transform(self) -> list[dict]:
+        """Convert one raw ShareGPT sample into text/audio turn dictionaries.
+
+        User/human placeholders consume audio. Assistant turns are preserved as
+        text so literal tokens such as an HTML ``<audio>`` tag are not mistaken
+        for data references.
+        """
+        conversations = []
+        placeholder_count = self._placeholder_count()
+        if len(self.audio_paths) > 1 and placeholder_count > 1 and len(self.audio_paths) != placeholder_count:
+            raise ValueError(
+                f"ShareGPT sample id={self.sample_id} has {len(self.audio_paths)} audio paths but "
+                f"{placeholder_count} audio placeholders. Use one path for all placeholders, one path per "
+                f"placeholder, or a single placeholder for all paths."
+            )
+
+        audio_idx = 0
+        for turn in self.data["conversations"]:
+            role = self.role(turn)
+            remaining = turn["value"]
+            if not self.turn_can_consume_audio(turn):
+                conversations.append({"type": "text", "from": role.title(), "value": remaining.strip()})
+                continue
+
+            found_any = False
+            while True:
+                idx, found = self.find_next_audio_placeholder(remaining, self.placeholders)
+                if found is None:
+                    if remaining.strip() or not found_any:
+                        conversations.append({"type": "text", "from": role.title(), "value": remaining.strip()})
+                    break
+
+                found_any = True
+                prefix = remaining[:idx]
+                if prefix.strip():
+                    conversations.append({"type": "text", "from": role.title(), "value": prefix.strip()})
+                if not self.audio_paths:
+                    raise ValueError(
+                        f"Conversation turn contains audio placeholder '{found}' but no audio path was found in "
+                        f"'sound', 'ori_sound' fields or fallback for sample id={self.sample_id}"
+                    )
+
+                if len(self.audio_paths) > 1 and placeholder_count == 1:
+                    path_indexes = range(len(self.audio_paths))
+                elif len(self.audio_paths) > 1:
+                    path_indexes = [audio_idx]
+                    audio_idx += 1
+                else:
+                    path_indexes = [0]
+
+                for path_idx in path_indexes:
+                    audio_turn = {
+                        "type": "audio",
+                        "from": role.title(),
+                        "value": self.audio_paths[path_idx],
+                        "duration": self.audio_turn_field(turn, "duration", path_idx, self.sample_id),
+                        "offset": self.audio_turn_field(turn, "offset", path_idx, self.sample_id, default=0.0),
+                    }
+                    if "sampling_rate" in turn:
+                        audio_turn["sampling_rate"] = self.audio_turn_field(
+                            turn, "sampling_rate", path_idx, self.sample_id
+                        )
+                    conversations.append(audio_turn)
+                remaining = remaining[idx + len(found) :]
+        return conversations
+
+    def _placeholder_count(self) -> int:
+        return sum(
+            self.count_audio_placeholders(turn["value"], self.placeholders)
+            for turn in self.data["conversations"]
+            if self.turn_can_consume_audio(turn)
+        )
+
+    @staticmethod
+    def create_turns(audio_locator_tag: str, conversations: list[dict], resolve_cut) -> list:
+        """Build ``TextTurn`` / ``AudioTurn`` objects using ``resolve_cut(turn_dict)`` for audio."""
+        turns = []
+        for turn in conversations:
+            if turn["type"] == "text":
+                turns.append(TextTurn(value=turn["value"], role=turn["from"].lower()))
+            else:
+                cut = resolve_cut(turn)
+                turns.append(
+                    AudioTurn(
+                        cut=cut,
+                        text=cut.supervisions[0].text if cut.supervisions else None,
+                        role=turn["from"].lower(),
+                        audio_locator_tag=audio_locator_tag,
+                    )
+                )
+        return turns
+
+    @classmethod
+    def expect_one_audio_path(cls, value, sample_id: str, context: str) -> Pathlike:
+        paths = cls.normalize_audio_paths(value, sample_id=sample_id, field_name=context)
+        if len(paths) != 1:
+            raise ValueError(
+                f"ShareGPT sample id={sample_id} resolved one audio turn to {len(paths)} audio paths. "
+                f"Multiple paths must be expanded into separate audio turns before loading."
+            )
+        return paths[0]
+
+    @staticmethod
+    def normalize_audio_paths(value, sample_id: str, field_name: str) -> list[Pathlike]:
+        if value is None or value == "":
+            return []
+        if isinstance(value, (str, os.PathLike)):
+            return [value]
+        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
+            paths = list(value)
+            for idx, path in enumerate(paths):
+                if not isinstance(path, (str, os.PathLike)):
+                    raise ValueError(
+                        f"ShareGPT sample id={sample_id} has unsupported {field_name}[{idx}]={path!r}; "
+                        f"expected a string or os.PathLike audio path."
+                    )
+            return paths
+        raise ValueError(
+            f"ShareGPT sample id={sample_id} has unsupported {field_name}={value!r}; "
+            f"expected a string, os.PathLike, or a list of audio paths."
+        )
+
+    @staticmethod
+    def find_next_audio_placeholder(text: str, placeholders: list[str]) -> tuple[int, str] | tuple[None, None]:
+        """Return ``(index, placeholder)`` for the earliest placeholder found in ``text``, else ``(None, None)``."""
+        # Skip empty placeholders: ``text.find("")`` is always 0, a zero-width match callers would loop on forever.
+        matches = [(i, p) for p in placeholders if p and (i := text.find(p)) >= 0]
+        return min(matches, key=lambda item: item[0]) if matches else (None, None)
+
+    @classmethod
+    def count_audio_placeholders(cls, text: str, placeholders: list[str]) -> int:
+        count = 0
+        remaining = text
+        while True:
+            idx, placeholder = cls.find_next_audio_placeholder(remaining, placeholders)
+            if placeholder is None:
+                return count
+            count += 1
+            remaining = remaining[idx + len(placeholder) :]
+
+    @staticmethod
+    def role(turn: dict) -> str:
+        return "user" if turn["from"].lower() in ("human", "user") else "assistant"
+
+    @classmethod
+    def turn_can_consume_audio(cls, turn: dict) -> bool:
+        return cls.role(turn) == "user"
+
+    @staticmethod
+    def audio_turn_field(turn: dict, field_name: str, audio_idx: int, sample_id: str, default=None):
+        value = turn.get(field_name, default)
+        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
+            values = list(value)
+            if len(values) == 1:
+                return values[0]
+            if audio_idx < len(values):
+                return values[audio_idx]
+            raise ValueError(
+                f"ShareGPT sample id={sample_id} has {len(values)} values for turn field {field_name!r}, "
+                f"but audio path index {audio_idx} was requested."
+            )
+        return value
diff --git a/nemo/collections/common/prompts/nemotron_nano_v3.py b/nemo/collections/common/prompts/nemotron_nano_v3.py
index 4840702e92ff..1162ba3dee4a 100644
--- a/nemo/collections/common/prompts/nemotron_nano_v3.py
+++ b/nemo/collections/common/prompts/nemotron_nano_v3.py
@@ -40,6 +40,12 @@ class NemotronNanoV3PromptFormatter(PromptFormatter):
                 "message": Modality.Text,
             },
         },
+        "tool": {
+            "template": f"{NANO_BOT}tool\n|message|{NANO_EOT}\n",
+            "slots": {
+                "message": Modality.Text,
+            },
+        },
         OUTPUT_ROLE: {
             "template": f"{NANO_BOT}assistant\n|message|{NANO_EOT}\n",
             "slots": {
diff --git a/nemo/collections/speechlm2/data/datamodule.py b/nemo/collections/speechlm2/data/datamodule.py
index 4551a7880dbd..4c1b70ba47d9 100644
--- a/nemo/collections/speechlm2/data/datamodule.py
+++ b/nemo/collections/speechlm2/data/datamodule.py
@@ -69,22 +69,50 @@ def __init__(self, cfg, tokenizer: TokenizerSpec, dataset: torch.utils.data.Data
                     getattr(self.cfg, k).force_map_dataset = True
         self.tokenizer = tokenizer
         self.dataset = dataset
+        self._train_dl = None
 
     def train_dataloader(self):
         if "train_ds" not in self.cfg:
             return None
         mesh = self._get_device_mesh()
-        if is_dp_source_rank(mesh):
-            source = get_lhotse_dataloader_from_config(
-                config=self.cfg.train_ds,
-                global_rank=self._get_dp_rank(),
-                world_size=self._get_world_size(),
-                dataset=FallbackDataset(self.dataset),
-                tokenizer=self.tokenizer,
-            )
-        else:
-            source = None
-        return BroadcastingDataLoader(source=source, device_mesh=mesh)
+        if self._train_dl is None:
+            if is_dp_source_rank(mesh):
+                source = get_lhotse_dataloader_from_config(
+                    config=self.cfg.train_ds,
+                    global_rank=self._get_dp_rank(),
+                    world_size=self._get_world_size(),
+                    dataset=FallbackDataset(self.dataset),
+                    tokenizer=self.tokenizer,
+                    dp_group=self._get_dp_group(),
+                )
+            else:
+                source = None
+            self._train_dl = BroadcastingDataLoader(source=source, device_mesh=mesh)
+        return self._train_dl
+
+    # state_dict / load_state_dict are intentionally NOT overridden.
+    #
+    # Per-rank dataloader state is now produced and consumed by
+    # ``_PerRankStatefulDataLoader`` (in
+    # ``nemo.collections.common.data.lhotse.dataloader``). ``DataModule``
+    # passes a DP-only process group into that wrapper so ``state_dict``
+    # all-gathers only across DP ranks; ``load_state_dict`` picks the entry
+    # matching the current DP rank. Lightning's ``FitLoop``
+    # already round-trips ``CombinedLoader._state_dicts()`` through
+    # ``loader.state_dict()`` / ``loader.load_state_dict()`` on every rank,
+    # so the wrapper alone is sufficient to keep per-rank shard partitioning
+    # synchronised on resume.
+    #
+    # Historically this class also gathered+scattered the state at the
+    # DataModule level. That worked for the save, but on load, Lightning's
+    # automatic ``FitLoop._load_combined_loader_states`` fired AFTER
+    # ``restore_datamodule`` and overwrote our per-rank load with the
+    # rank-0-only state captured under ``loops.fit_loop.state_dict.combined_loader``
+    # — every non-zero rank's iterator ended up with ``shard_id=0`` (the
+    # rank-0 worker-0 value) and ``PartitionedIndexedIterator.iterate``
+    # raised ``topology mismatch on resume`` ~14 min into training. See
+    # ``agent-debug-workspace/0909-en-only-id2-4node-postfix/DIAGNOSIS_ORD_vs_IAD.md``
+    # for the full post-mortem.
 
     def val_dataloader(self):
         if "validation_ds" not in self.cfg:
@@ -131,6 +159,7 @@ def _build_test_dataloader(self, cfg: DictConfig) -> torch.utils.data.DataLoader
                     world_size=self._get_world_size(),
                     dataset=self.dataset,
                     tokenizer=self.tokenizer,
+                    dp_group=self._get_dp_group(),
                 )
             else:
                 source = None
@@ -211,3 +240,29 @@ def _get_world_size(self):
                 return torch.distributed.get_world_size()
         else:
             return 1  # 1 GPU
+
+    def _get_dp_group(self):
+        """Pick the torch.distributed process group spanning this rank's DP peers.
+
+        The result is handed to ``_PerRankStatefulDataLoader`` so that its
+        state gathering stays within DP ranks and skips the CP/TP/PP/EP
+        duplicates fed through ``BroadcastingDataLoader``. ``None`` is returned
+        for plain DDP and single-process runs, where the default world group
+        already is the DP group.
+        """
+        dist = torch.distributed
+        if not dist.is_available() or not dist.is_initialized():
+            return None
+        mesh = getattr(getattr(self.trainer, "model", None), "device_mesh", None)
+        if mesh is None:
+            return None  # default = global DDP group
+        names = mesh.mesh_dim_names
+        if "data_parallel" in names:  # Lightning's ModelParallelStrategy
+            return mesh["data_parallel"].get_group()
+        if "dp_shard" not in names or "dp_replicate" not in names:
+            return None  # default = global DDP group
+        try:
+            return mesh["dp"].get_group()
+        except (KeyError, RuntimeError, ValueError):
+            # Compatibility for older Automodel/PyTorch meshes without a flattened "dp" submesh.
+            return mesh["dp_replicate", "dp_shard"].get_group()
diff --git a/nemo/collections/speechlm2/data/salm_dataset.py b/nemo/collections/speechlm2/data/salm_dataset.py
index 3f8a8e6a7b0e..45591a547aba 100644
--- a/nemo/collections/speechlm2/data/salm_dataset.py
+++ b/nemo/collections/speechlm2/data/salm_dataset.py
@@ -98,9 +98,14 @@ def __init__(self, tokenizer: AutoTokenizer, multispeaker_cfg: dict | None = Non
         # Setting USE_AIS_GET_BATCH=true makes the loader issue a single AIStore GetBatch
         # call per minibatch, paired with URL-backed cuts produced by the multimodal
         # conversation adapters (NeMoMultimodalConversation{Jsonl,ShareGPTJsonl}Adapter).
+        # USE_AIS_INDIVIDUAL_GETS=true (only meaningful when USE_AIS_GET_BATCH=true) forces
+        # the underlying AISBatchLoader to skip MOSS GetBatch and issue one
+        # ``Object.get_reader().read_all()`` per object — useful when the deployment
+        # doesn't support GetBatch or its performance is degraded.
         self.load_audio = AudioSamples(
             fault_tolerant=True,
             use_batch_loader=os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true",
+            ais_force_individual=os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true",
             mono_downmix=True,
         )
         self.multispeaker_cfg = MultiSpeakerConfig.from_dict(multispeaker_cfg)
diff --git a/nemo/collections/speechlm2/models/salm.py b/nemo/collections/speechlm2/models/salm.py
index 08967ea32e6c..45517d3094b2 100644
--- a/nemo/collections/speechlm2/models/salm.py
+++ b/nemo/collections/speechlm2/models/salm.py
@@ -216,6 +216,11 @@ def training_step(self, batch: dict, batch_idx: int):
                 m.eval()
 
         inputs = self.prepare_inputs(batch)
+        # Counters consumed by TrainingStatsCallback. ``attention_mask`` is 1
+        # for every real LLM input position (text non-pad + audio frames
+        # post-perception) and 0 for padding.
+        self._last_batch_num_tokens = int(inputs["attention_mask"].long().sum().item())
+        self._last_batch_num_examples = int(inputs["input_embeds"].shape[0])
         forward_outputs = self(inputs["input_embeds"], attention_mask=inputs["attention_mask"])
         num_frames = (inputs["target_ids"] != -100).long().sum()
         with loss_parallel():
diff --git a/nemo/collections/speechlm2/models/salm_automodel.py b/nemo/collections/speechlm2/models/salm_automodel.py
index a75ebbc0d722..f1cc21b35daf 100644
--- a/nemo/collections/speechlm2/models/salm_automodel.py
+++ b/nemo/collections/speechlm2/models/salm_automodel.py
@@ -41,6 +41,7 @@
     update_perception_output_dim,
 )
 from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, MaskType, NeuralType
+from nemo.core.utils.lightning_utils import read_batch
 
 
 class SALMAutomodel(LightningModule, HFHubMixin):
@@ -237,19 +238,27 @@ def prepare_inputs(self, batch: dict):
         targets are injected into a ``ParallelExpertEncoder``. Otherwise, the
         encoder runs its embedded Sortformer to predict diarization.
         """
+        from nemo.collections.speechlm2.parts.cp_helpers import (
+            encode_audio_with_cp_distribution,
+            get_cp_mesh,
+            get_perception_fsdp_group,
+        )
+
+        device_mesh = getattr(self, "_device_mesh", None)
+        spk_targets = batch.get("spk_targets", None)
+        cp_mesh, cp_size, _ = get_cp_mesh(device_mesh)
+        fsdp_sync_group = get_perception_fsdp_group(device_mesh)
+
         # Source audio encoding.
         # Input audio: (B, T_samples)
         # Audio embeddings: (B, T, H)
-        from nemo.collections.speechlm2.parts.cp_helpers import encode_audio_with_cp_distribution, get_cp_mesh
-
-        spk_targets = batch.get("spk_targets", None)
-        cp_mesh, cp_size, _ = get_cp_mesh(getattr(self, "_device_mesh", None))
         # Encoder path by (PEE, spk_targets):
         # PEE=true  & spk_targets=None  : Inference mode, uses recursive encoding in PEE, NO chunking/CP.
         # PEE=true  & spk_targets!=None : Training mode, ``spk_targets`` injected into PEE with chunking/CP.
         # PEE=false & spk_targets=None  : Training/Inference mode, plain encoder with chunking/CP.
         # PEE=false & spk_targets!=None : Training/Inference mode, plain encoder with chunking/CP and
         #                                 the provided ``spk_targets`` is ignored (no-op).
+        dummy_audio_loss = None
         if self._uses_parallel_expert_encoder() and spk_targets is None:
             self._warn_parallel_expert_encoder_inference_compatibility(cp_size)
             audio_embs, audio_emb_lens = self.perception(
@@ -257,7 +266,7 @@ def prepare_inputs(self, batch: dict):
             )
             audio_embs = [emb[:emblen] for emb, emblen in zip(audio_embs, audio_emb_lens)]
         else:
-            audio_embs = encode_audio_with_cp_distribution(
+            audio_embs, dummy_audio_loss = encode_audio_with_cp_distribution(
                 self.perception,
                 batch["audios"],
                 batch["audio_lens"],
@@ -265,6 +274,8 @@ def prepare_inputs(self, batch: dict):
                 sampling_rate=self.sampling_rate,
                 cp_mesh=cp_mesh,
                 spk_targets=spk_targets,
+                fsdp_sync_group=fsdp_sync_group,
+                return_dummy_loss=True,
             )
         input_ids_to_embed = torch.where(batch["input_ids"] == self.audio_locator_tag_id, 0, batch["input_ids"])
         text_embs = self._embed_tokens(input_ids_to_embed)
@@ -275,15 +286,18 @@ def prepare_inputs(self, batch: dict):
         if self.cfg.get("packed_sequences", False):
             from nemo.collections.speechlm2.parts.packed_sequences import prepare_packed_llm_inputs
 
-            return prepare_packed_llm_inputs(
+            ans = prepare_packed_llm_inputs(
                 input_ids=batch["input_ids"],
                 text_embs=text_embs,
                 audio_embs=audio_embs,
                 target_ids=target_ids_full,
                 padding_id=self.text_pad_id,
                 placeholder_id=self.audio_locator_tag_id,
-                device_mesh=getattr(self, "_device_mesh", None),
+                device_mesh=device_mesh,
             )
+            if dummy_audio_loss is not None:
+                ans["dummy_audio_loss"] = dummy_audio_loss
+            return ans
 
         input_embs, target_ids, attention_mask = replace_placeholders_and_build_targets(
             input_ids=batch["input_ids"],
@@ -308,12 +322,15 @@ def prepare_inputs(self, batch: dict):
                 attention_mask = attention_mask[:, :-remainder]
                 target_ids = target_ids[:, :-remainder]
 
-        return {
+        ans = {
             "input_embeds": input_embs,
             "attention_mask": attention_mask,
             "target_ids": target_ids,
             "llm_kwargs": {},
         }
+        if dummy_audio_loss is not None:
+            ans["dummy_audio_loss"] = dummy_audio_loss
+        return ans
 
     def on_fit_start(self) -> None:
         """Configure the MoE aux-loss backward scaler to cancel FSDP's gradient
@@ -350,13 +367,21 @@ def _validate_parallelism_compatibility(self) -> None:
             device_capability=device_capability,
         )
 
-    def training_step(self, batch: dict, batch_idx: int):
+    def training_step(self, dataloader_iter):
+        # Accepting ``dataloader_iter`` (instead of ``batch, batch_idx``) makes
+        # Lightning choose ``_DataLoaderIterDataFetcher`` (no prefetch), a
+        # prerequisite for bit-identical checkpoint resumption; the ``read_batch``
+        # docstring has the details.
+        return self._training_step_batch(*read_batch(dataloader_iter, self))
+
+    def _training_step_batch(self, batch: dict, batch_idx: int):
         self._current_batch_idx = batch_idx
         for m in (self.perception.preprocessor, self.perception.encoder, self.llm):
             if is_frozen(m):
                 m.eval()
 
         inputs = self.prepare_inputs(batch)
+        self._record_training_stats(batch, inputs)
         forward_outputs = self(
             inputs["input_embeds"],
             attention_mask=inputs["attention_mask"],
@@ -388,6 +413,8 @@ def training_step(self, batch: dict, batch_idx: int):
                 ignore_index=-100,
             )
             loss = loss_sum * dp_size / num_frames_global
+        if (dummy_audio_loss := inputs.get("dummy_audio_loss")) is not None:
+            loss = loss + dummy_audio_loss
 
         # Latent speaker supervision loss (auxiliary, optional).
         if self.lss_loss is not None and num_frames > 0:
@@ -419,11 +446,30 @@ def training_step(self, batch: dict, batch_idx: int):
             "target_to_input_ratio": num_frames / (B * T),
             "padding_ratio": (batch["input_ids"] != self.text_pad_id).long().sum() / batch["input_ids"].numel(),
         }
-        self.log("loss", loss_display, on_step=True, prog_bar=True)
-        self.log_dict({k: v for k, v in ans.items() if k != "loss"}, on_step=True)
+        # batch_size kwarg is required by Lightning when training_step uses
+        # the ``dataloader_iter`` signature (it can't auto-infer otherwise).
+        self.log("loss", loss_display, on_step=True, prog_bar=True, batch_size=B)
+        self.log_dict({k: v for k, v in ans.items() if k != "loss"}, on_step=True, batch_size=B)
         self.maybe_log_moe_metrics(batch_idx)
         return ans
 
+    def _record_training_stats(self, batch: dict, inputs: dict) -> None:
+        """Stash token/example counters of the current batch for TrainingStatsCallback.
+
+        BSHD: the attention mask counts every real LLM input position.
+        THD: the mask is ``None`` and ``inputs["num_tokens"]`` is used; it must come from
+        pre-CP sequence lengths so CP/TP-local shapes do not miscount the global batch.
+        """
+        mask = inputs["attention_mask"]
+        token_count = inputs["num_tokens"] if mask is None else mask.long().sum()
+        example_count = inputs.get("num_examples", batch["input_ids"].shape[0])
+        counts = [token_count, example_count]
+        for pos, value in enumerate(counts):
+            if torch.is_tensor(value):
+                counts[pos] = value.detach().cpu().item()
+        self._last_batch_num_tokens = int(counts[0])
+        self._last_batch_num_examples = int(counts[1])
+
     def on_validation_epoch_start(self) -> None:
         self._partial_val_loss_sums = defaultdict(list)
         self._partial_val_corrects = defaultdict(list)
@@ -787,7 +833,14 @@ def maybe_log_moe_metrics(self, step: int):
         else:
             metrics = compute_brief_metrics(layer_loads, top_k=top_k)
 
-        self.log_dict(metrics, on_step=True)
+        # ``batch_size=1`` is required when training_step uses the
+        # ``dataloader_iter`` flavor: Lightning cannot infer the batch size
+        # from the closure, and these MoE metrics are model-internal
+        # aggregates (load fractions, top-k expert utilization), so the
+        # per-call batch_size is just a logging-aggregation hint, not a true
+        # sample count. Without it Lightning raises
+        # ``MisconfigurationException`` on the very first training step.
+        self.log_dict(metrics, on_step=True, batch_size=1)
 
     def _get_moe_dp_group(self):
         """Return the DP process group for MoE metrics all-reduce.
diff --git a/nemo/collections/speechlm2/parts/cp_helpers.py b/nemo/collections/speechlm2/parts/cp_helpers.py
index 6c432a6081c4..acdfa77fd0d9 100644
--- a/nemo/collections/speechlm2/parts/cp_helpers.py
+++ b/nemo/collections/speechlm2/parts/cp_helpers.py
@@ -34,7 +34,10 @@
 from torch import Tensor
 from torch.distributed.nn.functional import all_gather as differentiable_all_gather
 
-from nemo.collections.speechlm2.parts.encoder_chunking import encode_audio_with_optional_chunking
+from nemo.collections.speechlm2.parts.encoder_chunking import (
+    _get_min_chunk_size_samples,
+    encode_audio_with_optional_chunking,
+)
 
 
 def get_cp_mesh(device_mesh) -> tuple[Optional[object], int, int]:
@@ -49,6 +52,22 @@ def get_cp_mesh(device_mesh) -> tuple[Optional[object], int, int]:
     return cp_mesh, cp_mesh.size(), cp_rank
 
 
+def get_perception_fsdp_group(device_mesh):
+    """Return the perception module's FSDP process group; ``None`` without a mesh or for a size <= 1 submesh."""
+    if device_mesh is None:
+        return None
+    names = device_mesh.mesh_dim_names or ()
+    # Preference order: replicate + shard_cp pair, then shard_cp alone, finally the "dp" submesh.
+    if "dp_shard_cp" not in names:
+        key = "dp"
+    elif "dp_replicate" in names:
+        key = ("dp_replicate", "dp_shard_cp")
+    else:
+        key = "dp_shard_cp"
+    submesh = device_mesh[key]
+    return submesh.get_group() if submesh.size() > 1 else None
+
+
 def encode_audio_with_cp_distribution(
     perception,
     audios: Tensor,
@@ -58,11 +77,13 @@ def encode_audio_with_cp_distribution(
     sampling_rate: int,
     cp_mesh=None,
     spk_targets: Tensor | None = None,
-) -> list[Tensor]:
+    fsdp_sync_group=None,
+    return_dummy_loss: bool = False,
+) -> list[Tensor] | tuple[list[Tensor], Tensor | None]:
     """Distribute the audio encoder forward across CP ranks.
 
     Falls back to :func:`encode_audio_with_optional_chunking` when ``cp_mesh is
-    None`` or there are no audios in the batch.
+    None``.
 
     With CP active, each rank encodes a contiguous slice of the audio batch
     (rank ``r`` gets ``audios[r*per_rank : (r+1)*per_rank]`` where
@@ -78,10 +99,33 @@ def encode_audio_with_cp_distribution(
     zero-padded to a globally-consistent ``max_L`` and ``all_gather``ed across
     the CP group. The full ordered list is reconstructed and dummies are
     dropped, so the return value is identical on every CP rank.
+
+    When ``fsdp_sync_group`` is provided and this rank has a text-only batch
+    while another rank in the perception FSDP group has audio, this function
+    runs a single dummy audio row through ``perception`` and returns a zero-valued
+    loss term. Adding that term to the training loss preserves the autograd edge
+    so FSDP forward/backward hooks fire on the text-only rank without affecting
+    gradients numerically.
     """
     B_aud = int(audios.shape[0])
-    if cp_mesh is None or B_aud == 0:
-        return encode_audio_with_optional_chunking(
+    fsdp_group_has_audio = _fsdp_group_has_audio(B_aud, audios.device, fsdp_sync_group)
+    if B_aud == 0:
+        dummy_loss = (
+            _dummy_audio_loss_for_fsdp_sync(
+                perception,
+                audios,
+                audio_lens,
+                chunk_size_seconds=chunk_size_seconds,
+                sampling_rate=sampling_rate,
+            )
+            if fsdp_group_has_audio
+            else None
+        )
+        ans = []
+        return (ans, dummy_loss) if return_dummy_loss else ans
+
+    if cp_mesh is None:
+        ans = encode_audio_with_optional_chunking(
             perception,
             audios,
             audio_lens,
@@ -89,6 +133,7 @@ def encode_audio_with_cp_distribution(
             sampling_rate=sampling_rate,
             spk_targets=spk_targets,
         )
+        return (ans, None) if return_dummy_loss else ans
 
     cp_size = cp_mesh.size()
     cp_group = cp_mesh.get_group()
@@ -157,4 +202,36 @@ def encode_audio_with_cp_distribution(
             L = int(gathered_lens[r][i].item())
             full_embs.append(gathered_stack[r][i, :L])
 
-    return full_embs
+    return (full_embs, None) if return_dummy_loss else full_embs
+
+
+def _fsdp_group_has_audio(B_aud: int, device: torch.device, fsdp_sync_group=None) -> bool:
+    if fsdp_sync_group is not None and dist.is_available() and dist.is_initialized():
+        flag = torch.tensor(int(B_aud > 0), dtype=torch.int32, device=device)
+        dist.all_reduce(flag, op=dist.ReduceOp.MAX, group=fsdp_sync_group)  # 1 if any rank has audio
+        return bool(int(flag.item()))
+    return False  # no sync group or no initialized process group: nothing to reduce over
+
+
+def _dummy_audio_loss_for_fsdp_sync(
+    perception,
+    audios: Tensor,
+    audio_lens: Tensor,
+    *,
+    chunk_size_seconds: Optional[float],
+    sampling_rate: int,
+) -> Tensor | None:
+    """Encode one all-zero dummy row with ``perception`` and return its summed embeddings scaled by 0.0."""
+    # Use at least ``sampling_rate`` samples: the preprocessor minimum alone can be too short
+    # after Conformer subsampling, leaving BatchNorm with a single value per channel.
+    n_samples = max(_get_min_chunk_size_samples(perception), int(sampling_rate))
+    encoded = encode_audio_with_optional_chunking(
+        perception,
+        torch.zeros(1, n_samples, dtype=audios.dtype, device=audios.device),
+        torch.full((1,), n_samples, dtype=audio_lens.dtype, device=audio_lens.device),
+        chunk_size_seconds=chunk_size_seconds,
+        sampling_rate=sampling_rate,
+    )
+    total = sum(emb.float().sum() for emb in encoded)
+    # The product is zero-valued, yet it is still computed from the ``perception`` outputs.
+    return total * 0.0
diff --git a/nemo/collections/speechlm2/parts/encoder_chunking.py b/nemo/collections/speechlm2/parts/encoder_chunking.py
index 0851ddb149f5..32309ae3e148 100644
--- a/nemo/collections/speechlm2/parts/encoder_chunking.py
+++ b/nemo/collections/speechlm2/parts/encoder_chunking.py
@@ -52,6 +52,9 @@ def encode_audio_with_optional_chunking(
         embeddings are concatenated along the time axis to recover a single tensor per
         original audio row.
     """
+    if input_signal_length.numel() == 0:
+        return []
+
     chunk_size_samples = _get_chunk_size_samples(chunk_size_seconds, sampling_rate)
     perception_kwargs = {"input_signal": input_signal, "input_signal_length": input_signal_length}
     if spk_targets is not None:
diff --git a/nemo/collections/speechlm2/parts/packed_sequences.py b/nemo/collections/speechlm2/parts/packed_sequences.py
index c6ef01564697..3d89c814dca0 100644
--- a/nemo/collections/speechlm2/parts/packed_sequences.py
+++ b/nemo/collections/speechlm2/parts/packed_sequences.py
@@ -283,6 +283,8 @@ def prepare_packed_llm_inputs(
         cp_size=cp_size,
         tp_size=tp_size,
     )
+    num_tokens = packed["seq_lens"].sum()
+    num_examples = torch.tensor(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
 
     if cp_mesh is not None:
         packed = _shard_packed_for_cp(packed, cp_mesh)
@@ -291,6 +293,8 @@ def prepare_packed_llm_inputs(
         "input_embeds": packed["inputs_embeds"],
         "attention_mask": None,
         "target_ids": packed["labels"],
+        "num_tokens": num_tokens,
+        "num_examples": num_examples,
         "llm_kwargs": {
             "qkv_format": "thd",
             # Match Automodel's standard THD contract (``thd_utils.process_input_for_thd``
diff --git a/nemo/core/utils/lightning_utils.py b/nemo/core/utils/lightning_utils.py
new file mode 100644
index 000000000000..b55f03769998
--- /dev/null
+++ b/nemo/core/utils/lightning_utils.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Helpers for working with PyTorch Lightning's ``training_step``."""
+from typing import Any, Iterator, Tuple
+
+import lightning.pytorch as pl
+
+
+def read_batch(dataloader_iter: Iterator, model: pl.LightningModule) -> Tuple[Any, int]:
+    """Fetch and prepare the next batch inside a ``training_step(self, dataloader_iter)``.
+
+    The batch goes through the same precision conversion, batch-transfer hook
+    and device move that ``_PrefetchDataFetcher`` would have applied for the
+    default ``training_step(batch, batch_idx)`` signature.
+
+    Why the ``dataloader_iter`` flavor: it makes Lightning select
+    ``_DataLoaderIterDataFetcher`` (no prefetch). Bit-identical checkpoint
+    resumption with a stateful dataloader depends on that, because the default
+    ``_PrefetchDataFetcher`` re-primes one batch on every iter init (resume
+    included), which moves the stateful dataloader past the saved snapshot
+    point and leaves the resumed run one batch ahead of the continuous run.
+
+    Shutdown conditions are checked before the batch is pulled. Lightning
+    still calls timer and preemption callbacks in normal ``dataloader_iter``
+    runs; checking here additionally closes the deadline/preemption window
+    before user code advances a stateful iterator. When the time budget is
+    already exhausted or preemption was already signaled, ``last.ckpt`` is
+    saved and the run exits before another sample is consumed.
+
+    Args:
+        dataloader_iter: Iterator that Lightning passes into a
+            ``training_step(self, dataloader_iter)`` (a ``_DataFetcherWrapper``
+            instance); it yields ``(batch, batch_idx, dataloader_idx)``.
+        model: ``LightningModule`` whose ``trainer`` provides the precision
+            plugin and the strategy used to move the batch to device.
+
+    Returns:
+        ``(batch, batch_idx)``, where ``batch`` is already precision-converted
+        and placed on the model's device, i.e. ready for forward.
+    """
+    trainer = model.trainer
+    _check_shutdown_before_next_batch(trainer)
+    raw, step_idx, loader_idx = next(dataloader_iter)
+    converted = trainer.precision_plugin.convert_input(raw)
+    hooked = model._on_before_batch_transfer(converted, dataloader_idx=loader_idx)
+    return trainer.strategy.batch_to_device(hooked, dataloader_idx=loader_idx), step_idx
+
+
+def _check_shutdown_before_next_batch(trainer: pl.Trainer) -> None:
+    """Run the pre-fetch shutdown guards, in order, before a stateful ``dataloader_iter`` is advanced."""
+    _log_read_batch_shutdown_guards_once(trainer)  # one-time info log of the active guards
+    _force_fire_preemption_callback(trainer)  # preemption callback reports ``interrupted``
+    _save_and_exit_if_lightning_received_sigterm(trainer)  # ``trainer.received_sigterm`` is set
+    _force_fire_stateless_timer(trainer)  # ``StatelessTimer`` deadline check
+
+
+def _log_read_batch_shutdown_guards_once(trainer: pl.Trainer) -> None:
+    """Emit a one-time (per trainer) info log listing which ``read_batch`` shutdown guards are active."""
+    # The marker attribute is stored on the trainer itself, hence "once per trainer".
+    flag_name = "_nemo_read_batch_shutdown_guards_logged"
+    if getattr(trainer, flag_name, False):
+        return
+    setattr(trainer, flag_name, True)
+
+    try:
+        preemption = any(getattr(cb, "preemption_enabled", False) for cb in trainer.callbacks)
+        sigterm = hasattr(trainer, "received_sigterm")
+        timer = _has_stateless_timer(trainer)
+        # Imported inside the guarded region, so an import problem is swallowed as well.
+        from nemo.utils import logging
+
+        summary = f"stateless_timer={timer} preemption_callback={preemption} lightning_sigterm_state={sigterm}"
+        logging.info("read_batch shutdown guards active: " + summary)
+    except Exception:
+        # Purely informational: a failure here must never disturb the training path.
+        pass
+
+
+def _force_fire_preemption_callback(trainer: pl.Trainer) -> None:
+    """Checkpoint and exit when NeMo's preemption callback reports a pending signal.
+
+    The ordinary post-batch case remains the job of
+    ``PreemptionCallback.on_train_batch_end``. What this pre-fetch check adds
+    is coverage for the ``training_step(dataloader_iter)`` path: there, user
+    code advances the stateful iterator itself and could otherwise step into
+    ``next(dataloader_iter)`` after rank 0 already received the preemption
+    signal.
+
+    Callbacks are recognised by a truthy ``preemption_enabled`` attribute;
+    for those, ``interrupted`` decides whether to save and exit.
+    """
+    for callback in trainer.callbacks:
+        if getattr(callback, "preemption_enabled", False) and callback.interrupted:
+            from nemo.utils.exp_manager import _save_last_checkpoint_and_exit
+
+            reason = "read_batch observed a pending preemption signal before consuming the next batch"
+            _save_last_checkpoint_and_exit(trainer, reason)
+
+
+def _save_and_exit_if_lightning_received_sigterm(trainer: pl.Trainer) -> None:
+    """Checkpoint and exit if ``trainer.received_sigterm`` is set, before a stateful batch is consumed.
+
+    A trainer without that attribute is treated as "no SIGTERM received".
+    """
+    if getattr(trainer, "received_sigterm", False):
+        # Deferred import: only needed on the shutdown path.
+        from nemo.utils.exp_manager import _save_last_checkpoint_and_exit
+
+        message = "read_batch observed trainer.received_sigterm before consuming the next batch"
+        _save_last_checkpoint_and_exit(trainer, message)
+
+
+def _has_stateless_timer(trainer: pl.Trainer) -> bool:
+    """Tell whether a ``StatelessTimer`` instance is among ``trainer.callbacks``."""
+    from nemo.utils.exp_manager import StatelessTimer
+
+    return next((True for callback in trainer.callbacks if isinstance(callback, StatelessTimer)), False)
+
+
+def _force_fire_stateless_timer(trainer: pl.Trainer) -> None:
+    """Run the deadline check of the first ``StatelessTimer`` callback, if any.
+
+    This is a defensive check for Lightning's ``dataloader_iter`` step
+    flavor. The regular callback path looks at the timer after a batch; doing
+    it before the fetch keeps a resumed stateful iterator from being advanced
+    when the deadline already expired before the next batch is requested.
+
+    While time remains the call is cheap and idempotent (one
+    ``time_elapsed()`` check + one comparison). Once time is up,
+    ``StatelessTimer`` saves a ``-last.ckpt`` via
+    ``NeMoModelCheckpoint._save_last_checkpoint`` and raises
+    ``_TunerExitException``; that exception travels up through ``read_batch``
+    → ``training_step`` → Lightning's epoch loop, and Lightning treats it as
+    a clean stop.
+    """
+    # Imported here rather than at module level to avoid a circular import at module load time
+    # (exp_manager imports from various nemo submodules).
+    from nemo.utils.exp_manager import StatelessTimer
+
+    timer = next((cb for cb in trainer.callbacks if isinstance(cb, StatelessTimer)), None)
+    if timer is not None:
+        timer._check_time_remaining(trainer)
diff --git a/nemo/utils/callbacks/preemption.py b/nemo/utils/callbacks/preemption.py
index be4712a60241..4c4748821d63 100644
--- a/nemo/utils/callbacks/preemption.py
+++ b/nemo/utils/callbacks/preemption.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import signal
-import sys
 
 import torch
 from lightning.pytorch.callbacks import Callback
@@ -66,6 +65,7 @@ def on_train_start(self, trainer, pl_module):
 
             # Master handler on rank 0 only upon preemption signal to avoid deadlock conditions
             def master_handler(signum, frame):
+                logging.info("Received preemption signal on rank 0; checkpoint save will run after current batch")
                 self.release()
                 self._interrupted = True
 
@@ -76,6 +76,7 @@ def ignoring_handler(signum, frame):
             self.private_rank = torch.distributed.get_rank()
             if self.private_rank == 0:
                 signal.signal(self.sig, master_handler)
+                logging.info(f"PreemptionCallback enabled on rank 0 for signal {getattr(self.sig, 'name', self.sig)}")
             else:
                 signal.signal(self.sig, ignoring_handler)
 
@@ -96,17 +97,12 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int)
             # a regular local variable
             interrupted = self.interrupted
             if interrupted:
-                logging.info("Received SIGTERM, saving checkpoint and exiting")
-                # Same off-by-one as in StatelessTimer: on_train_batch_end fires before
-                # batch_progress.increment_completed(), but the batch's optim step has
-                # already advanced global_step. Flush the in-flight batch so resume
-                # doesn't replay it and double-count the optim step.
-                from nemo.utils.exp_manager import _flush_in_flight_batch_progress
-
-                _flush_in_flight_batch_progress(trainer)
-                monitor_candidates = self.checkpoint_callback._monitor_candidates(trainer)
-                self.checkpoint_callback._save_last_checkpoint(trainer, monitor_candidates)
-                sys.exit(0)
+                from nemo.utils.exp_manager import _save_last_checkpoint_and_exit
+
+                _save_last_checkpoint_and_exit(
+                    trainer,
+                    "PreemptionCallback observed SIGTERM at train batch end",
+                )
 
     def release(self):
         """Restore the original signal handler; returns False if already released, True otherwise."""
diff --git a/nemo/utils/callbacks/training_stats.py b/nemo/utils/callbacks/training_stats.py
new file mode 100644
index 000000000000..0ef07b59d891
--- /dev/null
+++ b/nemo/utils/callbacks/training_stats.py
@@ -0,0 +1,225 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=C0116
+"""
+Training-throughput metrics that are not specific to a single model.
+
+Three metrics are logged: the first at ``on_train_batch_start``, the two totals at ``on_train_batch_end``:
+
+* ``dataloader_wait_s`` — wall-clock seconds spent between the previous
+  batch's ``on_train_batch_end`` and the current batch's
+  ``on_train_batch_start``. With PTL's prefetcher this is normally near
+  zero; large values mean the dataloader couldn't keep up. Useful for
+  catching AIS / lustre stalls before they crater the run.
+* ``num_tokens_total`` — running sum across the whole training of every
+  non-padding token position fed into the LLM (text non-pad + audio
+  frames after perception subsampling). Includes loss-masked tokens.
+  Survives job restarts via callback ``state_dict``.
+* ``num_examples_total`` — running sum across the whole training of
+  per-batch example counts. Also restart-safe.
+
+The model is expected to populate two short-lived attributes inside its
+``training_step`` so the callback can pick them up without parsing the
+batch a second time::
+
+    pl_module._last_batch_num_tokens = int(...)
+    pl_module._last_batch_num_examples = int(...)
+
+If either attribute is missing the callback falls back to counting from
+``batch["input_ids"]`` (non-pad text tokens only) — useful for non-SALM
+models, but loses audio-frame contribution.
+"""
+
+import time
+from typing import Any, Dict, Optional
+
+import torch
+import torch.distributed as dist
+from lightning.pytorch import Callback, LightningModule, Trainer
+
+__all__ = ["TrainingStatsCallback"]
+
+
+class TrainingStatsCallback(Callback):
+    """Logs dataloader wait time and accumulates token/example counts.
+
+    Persists ``num_tokens_total`` and ``num_examples_total`` via the
+    Lightning checkpoint state-dict mechanism so the counters survive
+    job restarts. The per-step ``dataloader_wait_s`` gauge is
+    intentionally NOT persisted (it has no meaningful value across a
+    process boundary).
+
+    The first batch after a fresh process start has no meaningful
+    ``dataloader_wait_s`` (no preceding ``on_train_batch_end``); the
+    callback skips logging it for that step.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Persisted state — survives checkpoint resume.
+        self.num_tokens_total: int = 0
+        self.num_examples_total: int = 0
+        # Per-process state — not persisted.
+        self._prev_batch_end_monotonic: Optional[float] = None
+
+    # ------------------------------------------------------------------ state
+    def state_dict(self) -> Dict[str, Any]:
+        return {
+            "num_tokens_total": int(self.num_tokens_total),
+            "num_examples_total": int(self.num_examples_total),
+        }
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        self.num_tokens_total = int(state_dict.get("num_tokens_total", 0))
+        self.num_examples_total = int(state_dict.get("num_examples_total", 0))
+
+    # ----------------------------------------------------------------- hooks
+    def on_train_batch_start(
+        self,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        batch: Any,
+        batch_idx: int,
+    ) -> None:
+        if self._prev_batch_end_monotonic is None:
+            # First batch of the process — no previous end timestamp to
+            # diff against; skip emitting a misleading value.
+            return
+        wait_s = time.monotonic() - self._prev_batch_end_monotonic
+        # ``batch_size`` is required when the LightningModule uses
+        # ``def training_step(self, dataloader_iter)`` (SALMAutomodel does):
+        # Lightning can't auto-infer it from a ``dataloader_iter`` arg.
+        # The value is only used for epoch-level aggregation; we log
+        # ``on_step=True, on_epoch=False`` so the actual number is
+        # irrelevant — pass 1 as a sentinel.
+        pl_module.log(
+            "dataloader_wait_s",
+            wait_s,
+            on_step=True,
+            on_epoch=False,
+            prog_bar=False,
+            rank_zero_only=True,
+            batch_size=1,
+        )
+
+    def on_train_batch_end(
+        self,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        outputs: Any,
+        batch: Any,
+        batch_idx: int,
+    ) -> None:
+        # Pull per-batch counts the model exposed in training_step.
+        local_tokens = int(getattr(pl_module, "_last_batch_num_tokens", -1))
+        local_examples = int(getattr(pl_module, "_last_batch_num_examples", -1))
+        if local_tokens < 0 or local_examples < 0:
+            # Model didn't expose the attributes — fall back to a generic
+            # estimate from the batch. Counts non-pad text tokens only;
+            # audio frame contribution is lost. Better than zero.
+            local_tokens, local_examples = self._fallback_counts(batch, pl_module)
+
+        # All-reduce across DP ranks so every rank holds the same cumulative
+        # value (required for state_dict consistency across ranks on save).
+        # Under CP/TP, batch broadcasting gives model-parallel ranks duplicate
+        # data, so reducing over the full world would over-count.
+        if dist.is_available() and dist.is_initialized():
+            buf = torch.tensor(
+                [local_tokens, local_examples],
+                dtype=torch.long,
+                device=pl_module.device,
+            )
+            dist.all_reduce(buf, op=dist.ReduceOp.SUM, group=self._get_dp_group(pl_module))
+            global_tokens, global_examples = buf.tolist()
+        else:
+            global_tokens, global_examples = local_tokens, local_examples
+
+        self.num_tokens_total += global_tokens
+        self.num_examples_total += global_examples
+
+        pl_module.log_dict(
+            {
+                "num_tokens_total": float(self.num_tokens_total),
+                "num_examples_total": float(self.num_examples_total),
+            },
+            on_step=True,
+            on_epoch=False,
+            prog_bar=False,
+            rank_zero_only=True,
+            batch_size=max(local_examples, 1),
+        )
+
+        self._prev_batch_end_monotonic = time.monotonic()
+
+    # ------------------------------------------------------------ fallbacks
+    @staticmethod
+    def _fallback_counts(batch: Any, pl_module: LightningModule) -> tuple[int, int]:
+        """Best-effort token/example count from a generic ``batch`` dict.
+
+        Used only when the model didn't expose
+        ``_last_batch_num_tokens`` / ``_last_batch_num_examples``. Counts
+        non-pad text tokens via ``batch["input_ids"]`` and
+        ``pl_module.text_pad_id`` when both exist. Audio-frame
+        contribution is not visible from here.
+        """
+        try:
+            ids = batch["input_ids"]
+        except (KeyError, TypeError):
+            return 0, 0
+        if not torch.is_tensor(ids):
+            return 0, 0
+        pad_id = getattr(pl_module, "text_pad_id", None)
+        if pad_id is None:
+            n_tokens = int(ids.numel())
+        else:
+            n_tokens = int((ids != pad_id).long().sum().item())
+        n_examples = int(ids.shape[0])
+        return n_tokens, n_examples
+
+    @staticmethod
+    def _get_dp_group(pl_module: LightningModule):
+        """Return a DP-only process group when model parallelism is active.
+
+        ``None`` intentionally means the default world group, which is correct
+        for plain DDP and single-process runs.
+        """
+        device_mesh = getattr(pl_module, "_device_mesh", None)
+        if device_mesh is None:
+            trainer_model = getattr(getattr(pl_module, "trainer", None), "model", None)
+            device_mesh = getattr(trainer_model, "device_mesh", None)
+        if device_mesh is None:
+            return None
+
+        names = device_mesh.mesh_dim_names or ()
+        if "data_parallel" in names:
+            return device_mesh["data_parallel"].get_group()
+
+        try:
+            from nemo_automodel.components.distributed.mesh_utils import get_flat_mesh
+
+            return get_flat_mesh(device_mesh, "dp").get_group()
+        except (ImportError, KeyError, RuntimeError, ValueError):
+            pass
+
+        try:
+            return device_mesh["dp"].get_group()
+        except (KeyError, RuntimeError, ValueError):
+            pass
+
+        if "dp_shard" in names and "dp_replicate" in names:
+            # get_group() without mesh_dim raises on a 2-D mesh; flatten the DP sub-mesh to 1-D first.
+            return device_mesh["dp_replicate", "dp_shard"]._flatten().get_group()
+        if "dp_shard" in names:
+            return device_mesh["dp_shard"].get_group()
+        return None
diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py
index ad7757b72ba5..202d6cb8bd35 100644
--- a/nemo/utils/exp_manager.py
+++ b/nemo/utils/exp_manager.py
@@ -20,6 +20,7 @@
 import time
 import warnings
 from collections import defaultdict
+from copy import deepcopy
 from dataclasses import dataclass, field
 from datetime import timedelta
 from pathlib import Path
@@ -1452,6 +1453,13 @@ def _check_time_remaining(self, trainer: lightning.pytorch.Trainer) -> None:
         """_check_time_remaining"""
         super()._check_time_remaining(trainer)
         if trainer.should_stop:
+            before_flush = _describe_batch_progress(trainer)
+            logging.info(
+                "StatelessTimer deadline reached; saving last checkpoint "
+                f"global_step={getattr(trainer, 'global_step', None)} "
+                f"current_epoch={getattr(trainer, 'current_epoch', None)} "
+                f"batch_progress_before_flush={before_flush}"
+            )
             # PTL's TrainingEpochLoop.advance() calls the on_train_batch_end hooks (which is where
             # Timer._check_time_remaining fires) BEFORE batch_progress.increment_completed(). The
             # current batch's optim step has already advanced global_step, so saving here would
@@ -1461,16 +1469,46 @@ def _check_time_remaining(self, trainer: lightning.pytorch.Trainer) -> None:
             # global_step per wall-time resume. Flush the in-flight batch first to keep the
             # saved state self-consistent.
             _flush_in_flight_batch_progress(trainer)
+            after_flush = _describe_batch_progress(trainer)
             checkpoint_callback: Optional[NeMoModelCheckpoint] = trainer.checkpoint_callback
             if checkpoint_callback:
+                save_started = time.monotonic()
                 monitor_candidates = checkpoint_callback._monitor_candidates(trainer)
                 checkpoint_callback._save_last_checkpoint(trainer, monitor_candidates)
+                logging.info(
+                    "StatelessTimer last checkpoint save finished "
+                    f"global_step={getattr(trainer, 'global_step', None)} "
+                    f"current_epoch={getattr(trainer, 'current_epoch', None)} "
+                    f"batch_progress_after_flush={after_flush} "
+                    f"last_model_path={getattr(checkpoint_callback, 'last_model_path', None)} "
+                    f"save_duration_sec={time.monotonic() - save_started:.3f}"
+                )
+            else:
+                logging.warning("StatelessTimer deadline reached but trainer.checkpoint_callback is not configured")
             # Throw this exception to signal to Lightning to terminate gracefully.
             from lightning.pytorch.utilities.exceptions import _TunerExitException
 
             raise _TunerExitException()
 
 
+def _describe_batch_progress(trainer: lightning.pytorch.Trainer) -> Dict[str, Any]:
+    """Snapshot Lightning's train batch-progress counters as a plain dict for logging."""
+    try:
+        progress = trainer.fit_loop.epoch_loop.batch_progress
+    except AttributeError:
+        # Some link of the fit_loop/epoch_loop/batch_progress chain is absent: nothing to report.
+        return {}
+
+    # Keys come out as current_ready ... total_completed, followed by is_last_batch.
+    snapshot: Dict[str, Any] = {
+        f"{scope}_{counter}": getattr(getattr(progress, scope), counter, None)
+        for scope in ("current", "total")
+        for counter in ("ready", "processed", "completed")
+    }
+    snapshot["is_last_batch"] = getattr(progress, "is_last_batch", None)
+    return snapshot
+
+
 def _flush_in_flight_batch_progress(trainer: lightning.pytorch.Trainer) -> None:
     """Bring batch_progress.current.completed up to .ready if a batch is in flight.
 
@@ -1487,11 +1525,63 @@ def _flush_in_flight_batch_progress(trainer: lightning.pytorch.Trainer) -> None:
         batch_progress.increment_completed()
 
 
+def _save_last_checkpoint_and_exit(trainer: lightning.pytorch.Trainer, reason: str) -> None:
+    """Save the last checkpoint for graceful shutdown and exit Lightning.
+
+    ``reason`` should describe the caller-visible shutdown trigger. The
+    checkpoint policy itself is unchanged: this only asks the configured
+    ``NeMoModelCheckpoint`` to update its existing ``*-last.ckpt`` target.
+    """
+    before_flush = _describe_batch_progress(trainer)
+    logging.info(
+        f"{reason}; saving last checkpoint "
+        f"global_step={getattr(trainer, 'global_step', None)} "
+        f"current_epoch={getattr(trainer, 'current_epoch', None)} "
+        f"batch_progress_before_flush={before_flush}"
+    )
+    _flush_in_flight_batch_progress(trainer)
+    after_flush = _describe_batch_progress(trainer)
+
+    checkpoint_callback: Optional[NeMoModelCheckpoint] = getattr(trainer, "checkpoint_callback", None)
+    if checkpoint_callback:
+        save_started = time.monotonic()
+        monitor_candidates = checkpoint_callback._monitor_candidates(trainer)
+        checkpoint_callback._save_last_checkpoint(trainer, monitor_candidates)
+        logging.info(
+            "Graceful shutdown last checkpoint save finished "
+            f"global_step={getattr(trainer, 'global_step', None)} "
+            f"current_epoch={getattr(trainer, 'current_epoch', None)} "
+            f"batch_progress_after_flush={after_flush} "
+            f"last_model_path={getattr(checkpoint_callback, 'last_model_path', None)} "
+            f"save_duration_sec={time.monotonic() - save_started:.3f}"
+        )
+    else:
+        logging.warning(f"{reason}; trainer.checkpoint_callback is not configured")
+
+    from lightning.pytorch.utilities.exceptions import _TunerExitException
+
+    raise _TunerExitException()
+
+
 def configure_no_restart_validation_training_loop(trainer: lightning.pytorch.Trainer) -> None:
     """configure_no_restart_validation_training_loop"""
-    if type(trainer.fit_loop.epoch_loop) != _TrainingEpochLoop:
+    if type(trainer.fit_loop.epoch_loop) is not _TrainingEpochLoop:
         warnings.warn("Detected custom epoch loop. Skipping no validation on restart support.", UserWarning)
         return
+
+    fit_loop = trainer.fit_loop
+    if not getattr(fit_loop, "_nemo_restart_loader_state_cache_installed", False):
+        original_load_combined_loader_states = fit_loop._load_combined_loader_states
+
+        def _load_combined_loader_states_with_cache() -> None:
+            states = getattr(fit_loop, "_combined_loader_states_to_load", None)
+            if getattr(fit_loop, "restarting", False) and states:
+                fit_loop._nemo_restart_combined_loader_states = deepcopy(states)
+            original_load_combined_loader_states()
+
+        fit_loop._load_combined_loader_states = _load_combined_loader_states_with_cache
+        fit_loop._nemo_restart_loader_state_cache_installed = True
+
     # Pass trainer object to avoid trainer getting overwritten as None
     loop = SkipResumeTrainingValidationLoop(trainer, trainer.min_steps, trainer.max_steps)
     trainer.fit_loop.epoch_loop = loop
@@ -1504,8 +1594,43 @@ class SkipResumeTrainingValidationLoop(_TrainingEpochLoop):
     the training state before validation has run.
     """
 
+    def __init__(self, *args, **kwargs) -> None:
+        """Initialize skip-validation bookkeeping."""
+        super().__init__(*args, **kwargs)
+        self._skip_resume_validation_once = False
+
+    def advance(self, data_fetcher) -> None:
+        """Skip restart validation without replaying an already-completed train batch."""
+        if self.restarting and super()._should_check_val_fx(data_fetcher):
+            logging.info("Skipping restart validation without replaying a completed training batch")
+            self._reload_unconsumed_restart_dataloader_state()
+            self._skip_resume_validation_once = True
+            self.restarting = False
+            return
+        super().advance(data_fetcher)
+
+    def _reload_unconsumed_restart_dataloader_state(self) -> None:
+        """Reapply the checkpoint dataloader cursor after skipping restart validation."""
+        fit_loop = self.trainer.fit_loop
+        states = getattr(fit_loop, "_nemo_restart_combined_loader_states", None)
+        combined_loader = getattr(fit_loop, "_combined_loader", None)
+        if not states or combined_loader is None or not hasattr(combined_loader, "_load_state_dicts"):
+            return
+
+        combined_loader._load_state_dicts(deepcopy(states))
+        fit_loop._nemo_restart_combined_loader_states = None
+
+    def on_advance_end(self, data_fetcher) -> None:
+        """Clear the one-shot restart-validation skip after normal epoch-loop bookkeeping."""
+        try:
+            return super().on_advance_end(data_fetcher)
+        finally:
+            self._skip_resume_validation_once = False
+
     def _should_check_val_fx(self, data_fetcher) -> bool:
         """_should_check_val_fx"""
+        if self._skip_resume_validation_once:
+            return False
         if self.restarting:
             return False
         return super()._should_check_val_fx(data_fetcher)
diff --git a/pyproject.toml b/pyproject.toml
index ad7711a3d582..3e233a9e2816 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -115,7 +115,7 @@ asr-only = [
     "braceexpand",
     "einops",
     "kaldialign",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.1",
     "packaging",
     "sacrebleu",
@@ -138,7 +138,7 @@ tts = [
     "pyopenjtalk",
     "braceexpand",
     "kaldialign",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.1",
     "packaging",
     "sacrebleu",
@@ -161,7 +161,7 @@ tts = [
 
 audio = [
     "einops",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.0",
     "matplotlib",
     "pesq; (platform_machine != 'x86_64' or platform_system != 'Darwin')",
@@ -206,7 +206,7 @@ all = [
     "sentencepiece<1.0.0",
     "braceexpand",
     "kaldialign",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.1",
     "packaging",
     "sacrebleu",
@@ -288,7 +288,7 @@ asr = [
     "braceexpand",
     "einops",
     "kaldialign",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.1",
     "packaging",
     "sacrebleu",
@@ -317,7 +317,7 @@ speechlm2 = [
     "braceexpand",
     "einops",
     "kaldialign",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.1",
     "packaging",
     "sacrebleu",
diff --git a/scripts/dataloading/_validate_dataloader/__init__.py b/scripts/dataloading/_validate_dataloader/__init__.py
new file mode 100644
index 000000000000..845889837cb5
--- /dev/null
+++ b/scripts/dataloading/_validate_dataloader/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Internal helpers for ``scripts/dataloading/validate_dataloader.py``."""
diff --git a/scripts/dataloading/_validate_dataloader/config_inject.py b/scripts/dataloading/_validate_dataloader/config_inject.py
new file mode 100644
index 000000000000..d58c14f5105f
--- /dev/null
+++ b/scripts/dataloading/_validate_dataloader/config_inject.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Recursively inject validator-specific flags into a train_ds-shaped
+OmegaConf node and every nested ``input_cfg``."""
+
+import logging
+from typing import Any
+
+from omegaconf import DictConfig, ListConfig
+
+LOG = logging.getLogger(__name__)
+
+
+def inject_validator_flags(cfg: DictConfig, *, force_finite: bool, metadata_only: bool) -> DictConfig:
+    """Mutate-in-place: set each requested flag to ``True`` on ``cfg`` itself and on
+    every nested ``input_cfg`` entry (recursively) that does not already define it;
+    nested entries with their own value are left untouched. Injections are logged."""
+    if force_finite:
+        _inject_key(cfg, "force_finite", True, ctx="train_ds (top-level)")
+    if metadata_only:
+        _inject_key(cfg, "metadata_only", True, ctx="train_ds (top-level)")
+    _walk_input_cfg(cfg.get("input_cfg"), force_finite=force_finite, metadata_only=metadata_only)
+    return cfg
+
+
+def _walk_input_cfg(node: Any, *, force_finite: bool, metadata_only: bool, path: str = "input_cfg") -> None:
+    if node is None:
+        return
+    if isinstance(node, (list, ListConfig)):
+        for i, sub in enumerate(node):
+            _walk_input_cfg(sub, force_finite=force_finite, metadata_only=metadata_only, path=f"{path}[{i}]")
+        return
+    if isinstance(node, str):
+        return  # input_cfg reference to a YAML file path — resolved later by NeMo
+    if not isinstance(node, (dict, DictConfig)):
+        return
+    typ = node.get("type", "<no-type>")
+    if force_finite and "force_finite" not in node:
+        _inject_key(node, "force_finite", True, ctx=f"{path} (type={typ})")
+    if metadata_only and "metadata_only" not in node:
+        _inject_key(node, "metadata_only", True, ctx=f"{path} (type={typ})")
+    if "input_cfg" in node:
+        _walk_input_cfg(
+            node["input_cfg"],
+            force_finite=force_finite,
+            metadata_only=metadata_only,
+            path=f"{path}.input_cfg",
+        )
+
+
+def _inject_key(node: Any, key: str, value: Any, *, ctx: str) -> None:
+    # Store ``value`` under ``key`` unless it is already there; log the value it replaced.
+    previous = node.get(key) if isinstance(node, (dict, DictConfig)) else None
+    if previous != value:
+        node[key] = value
+        LOG.info("inject %s=%s into %s (was %r)", key, value, ctx, previous)
diff --git a/scripts/dataloading/_validate_dataloader/consolidate.py b/scripts/dataloading/_validate_dataloader/consolidate.py
new file mode 100644
index 000000000000..38a93077a05e
--- /dev/null
+++ b/scripts/dataloading/_validate_dataloader/consolidate.py
@@ -0,0 +1,420 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Consolidate per-rank validator JSONLs and emit PASS/FAIL on Q1..Q5.
+
+Layout the per-rank entry writes:
+
+    {output_dir}/
+        baseline/run0/rank_NNN.jsonl
+        baseline/run0/state_rank_NNN.pt
+        baseline/run0/throughput_rank_NNN.json
+        baseline/run1/rank_NNN.jsonl            # if --num-determinism-runs >= 2
+        resumed/run0/rank_NNN.jsonl             # phase=resumed
+        groundtruth/cuts.jsonl                  # phase=groundtruth (single-rank)
+        pre_validation.json                     # written by pre_validation.py
+
+This module is the post-iteration aggregator. Exit code: 0 if all checks
+pass, 1 if any check fails, 2 if there's a structural problem
+(no JSONLs, missing groundtruth, etc.).
+"""
+
+import json
+import logging
+import statistics
+import sys
+from collections import Counter, defaultdict
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+import click
+
+LOG = logging.getLogger(__name__)
+
+
+PASS = "PASS"
+FAIL = "FAIL"
+WARN = "WARN"
+SKIP = "SKIP"
+
+
+@dataclass
+class QResult:
+    q_id: str
+    status: str
+    tag: Optional[str] = None
+    detail: str = ""
+    extra: dict = field(default_factory=dict)
+
+
+@dataclass
+class ValidationReport:
+    questions: list[QResult]
+    throughput: dict
+
+    def to_dict(self):
+        return {
+            "questions": {
+                q.q_id: {"status": q.status, "tag": q.tag, "detail": q.detail, **q.extra} for q in self.questions
+            },
+            "throughput": self.throughput,
+        }
+
+    @property
+    def all_passed(self) -> bool:
+        return all(q.status != FAIL for q in self.questions)
+
+
+# --------------------------------------------------------------------------- #
+# Public API.
+# --------------------------------------------------------------------------- #
+
+
+def consolidate(output_dir: Path, *, checkpoint_at: int, num_determinism_runs: int) -> ValidationReport:
+    """Read every artifact under ``output_dir`` and produce a ValidationReport."""
+    baseline = _load_phase(output_dir / "baseline" / "run0")
+    questions: list[QResult] = []
+
+    questions.append(_q1_no_duplication(baseline))
+    questions.append(_q2_no_skipping(baseline, output_dir / "groundtruth" / "cuts.jsonl"))
+    questions.append(_q3_partition_correctness(baseline))
+    questions.append(
+        _q4_exact_resume(
+            baseline,
+            _load_phase(output_dir / "resumed" / "run0"),
+            checkpoint_at=checkpoint_at,
+        )
+    )
+    if num_determinism_runs >= 2:
+        run1 = _load_phase(output_dir / "baseline" / "run1")
+        questions.append(_q5_determinism(baseline, run1))
+    else:
+        questions.append(QResult("Q5", SKIP, detail="num_determinism_runs < 2"))
+
+    throughput = _collect_throughput(output_dir / "baseline" / "run0")
+    return ValidationReport(questions=questions, throughput=throughput)
+
+
+# --------------------------------------------------------------------------- #
+# Question implementations.
+# --------------------------------------------------------------------------- #
+
+
+def _q1_no_duplication(rows: list[dict]) -> QResult:
+    """Q1: no cut appears twice within phase 1. Tag ``partition-rank-leak``
+    if cross-rank, ``partition-worker-leak`` if repeated within one rank."""
+    if not rows:
+        return QResult("Q1", SKIP, detail="no baseline rows loaded")
+    # Map cut_id -> per-(rank, worker) yield counts, so a repeat by the same worker is caught too.
+    sightings: dict[str, Counter] = defaultdict(Counter)
+    for r in rows:
+        for cid in r["cut_ids"]:
+            sightings[cid][(r["rank"], r["worker_id"])] += 1
+    dup_cross_rank: list[str] = []
+    dup_within_rank: list[str] = []
+    for cid, seen in sightings.items():
+        if sum(seen.values()) <= 1:
+            continue
+        ranks = {rank for rank, _ in seen}
+        if len(ranks) > 1:
+            dup_cross_rank.append(cid)
+        else:
+            dup_within_rank.append(cid)
+    if dup_cross_rank:
+        return QResult(
+            "Q1",
+            FAIL,
+            tag="partition-rank-leak",
+            detail=f"{len(dup_cross_rank)} cut.id(s) appeared on multiple ranks",
+            extra={"examples": dup_cross_rank[:5]},
+        )
+    if dup_within_rank:
+        return QResult(
+            "Q1",
+            FAIL,
+            tag="partition-worker-leak",
+            detail=f"{len(dup_within_rank)} cut.id(s) yielded more than once within one rank",
+            extra={"examples": dup_within_rank[:5]},
+        )
+    return QResult("Q1", PASS, detail=f"{len(sightings)} distinct cuts, no duplicates")
+
+
+def _q2_no_skipping(rows: list[dict], groundtruth_path: Path) -> QResult:
+    """Q2: yielded ID set equals the ground-truth set (force_finite mode)."""
+    if not rows:
+        return QResult("Q2", SKIP, detail="no baseline rows loaded")
+    if not groundtruth_path.exists():
+        return QResult("Q2", SKIP, detail=f"groundtruth file missing: {groundtruth_path}")
+    expected: set[str] = set()
+    with open(groundtruth_path) as f:
+        for line in filter(str.strip, f):  # skip blank/trailing lines instead of crashing json.loads
+            obj = json.loads(line)
+            expected.update(obj.get("cut_ids", []))
+    yielded: set[str] = set()
+    for r in rows:
+        yielded.update(r["cut_ids"])
+    missing = expected - yielded
+    unexpected = yielded - expected
+    if missing:
+        return QResult(
+            "Q2",
+            FAIL,
+            tag="skip",
+            detail=f"{len(missing)} of {len(expected)} expected cut.id(s) never yielded",
+            extra={"missing_examples": sorted(missing)[:5], "unexpected_count": len(unexpected)},
+        )
+    if unexpected:
+        return QResult(
+            "Q2",
+            FAIL,
+            tag="id-collision",
+            detail=f"{len(unexpected)} cut.id(s) yielded but not in ground truth",
+            extra={"unexpected_examples": sorted(unexpected)[:5]},  # sorted => reproducible report
+        )
+    return QResult("Q2", PASS, detail=f"yielded ({len(yielded)}) == ground truth ({len(expected)})")
+
+
+def _q3_partition_correctness(rows: list[dict]) -> QResult:
+    """Q3: the cut sets seen by different ranks must not overlap.
+
+    The per-rank distinct counts are summed and compared with the size of the
+    union over all ranks; any difference means at least one cut.id showed up
+    on more than one rank. The failure detail distinguishes a near-complete
+    broadcast from a partial overlap using the ratio of the two numbers.
+    """
+    if not rows:
+        return QResult("Q3", SKIP, detail="no baseline rows loaded")
+    cuts_by_rank: dict[int, set[str]] = defaultdict(set)
+    for row in rows:
+        cuts_by_rank[row["rank"]].update(row["cut_ids"])
+    union_size = len(set().union(*cuts_by_rank.values()))
+    total_per_rank = sum(map(len, cuts_by_rank.values()))
+    if total_per_rank == union_size:
+        return QResult("Q3", PASS, detail=f"{len(cuts_by_rank)} ranks, |union|={union_size}")
+    duplicated = total_per_rank - union_size
+    rank_count = max(len(cuts_by_rank), 1)
+    # Average number of ranks on which each distinct cut.id appeared.
+    copies = total_per_rank / max(union_size, 1)
+    if copies >= rank_count - 0.5:
+        message = f"FULL BROADCAST: each cut.id appears on ~{copies:.1f}/{rank_count} ranks (overlap={duplicated})"
+    else:
+        message = (
+            f"PARTIAL OVERLAP: per-rank distinct sums to {total_per_rank} but |union|={union_size} "
+            f"(overlap={duplicated})"
+        )
+    return QResult("Q3", FAIL, tag="partition-rank-leak", detail=message)
+
+
+def _q4_exact_resume(baseline: list[dict], resumed: list[dict], *, checkpoint_at: int) -> QResult:
+    """Q4: every resumed (rank, step) cell matches the shifted baseline cell.
+
+    Resumed step ``s`` is compared with baseline step ``s + checkpoint_at + 1``
+    on the same rank, i.e. resumed step 0 lines up with the first baseline
+    step after the checkpoint.
+
+    Only cells present on both sides are compared. Cells that exist on just
+    one side are counted and reported in ``extra`` but never cause a FAIL by
+    themselves; a FAIL is raised for diverging cells, or when the two windows
+    share no cell at all.
+    """
+    if not resumed:
+        return QResult("Q4", SKIP, detail="no resumed rows loaded")
+    shift = checkpoint_at + 1
+    baseline_cells: dict = {}
+    for row in baseline:
+        baseline_cells[(row["rank"], row["step"])] = set(row["cut_ids"])
+    resumed_cells: dict = {}
+    for row in resumed:
+        resumed_cells[(row["rank"], row["step"])] = set(row["cut_ids"])
+
+    mismatches: list[dict] = []
+    compared = 0
+    resumed_only = 0
+    # Walk the resumed cells in (rank, step) order and look up their baseline twin.
+    for key in sorted(resumed_cells):
+        rank, resumed_step = key
+        got = resumed_cells[key]
+        baseline_step = resumed_step + shift
+        want = baseline_cells.get((rank, baseline_step))
+        if want is None:
+            resumed_only += 1
+            continue
+        compared += 1
+        if want == got:
+            continue
+        mismatches.append(
+            {
+                "rank": rank,
+                "step": resumed_step,
+                "baseline_step": baseline_step,
+                "only_in_baseline": list(want - got)[:3],
+                "only_in_resumed": list(got - want)[:3],
+            }
+        )
+
+    # Post-checkpoint baseline cells with no resumed counterpart.
+    baseline_only = sum(
+        1
+        for rank, step in baseline_cells
+        if step > checkpoint_at and (rank, step - shift) not in resumed_cells
+    )
+    counters = {
+        "overlap_cells": compared,
+        "extra_resumed_cells": resumed_only,
+        "extra_baseline_tail_cells": baseline_only,
+    }
+    if mismatches:
+        return QResult(
+            "Q4",
+            FAIL,
+            tag="resume-rng-divergence",
+            detail=f"{len(mismatches)}/{compared} overlapping cell(s) diverge after resume",
+            extra={**counters, "examples": mismatches[:5]},
+        )
+    if compared == 0:
+        return QResult(
+            "Q4",
+            FAIL,
+            tag="resume-length-mismatch",
+            detail="zero overlap between resumed and baseline-tail windows",
+            extra=counters,
+        )
+    return QResult(
+        "Q4", PASS, detail=f"{compared} overlapping cell(s) match baseline tail bit-for-bit", extra=counters
+    )
+
+
+def _q5_determinism(run0: list[dict], run1: list[dict]) -> QResult:
+    """Q5: two baseline runs must agree cell by cell.
+
+    Rows are keyed by ``(rank, step)`` and compared as sets of cut IDs. The
+    check is skipped when ``run1`` is empty, fails when the two runs cover
+    different cells, and fails when any shared cell holds different IDs.
+    """
+    if not run1:
+        return QResult("Q5", SKIP, detail="run1 missing")
+    first = {(row["rank"], row["step"]): set(row["cut_ids"]) for row in run0}
+    second = {(row["rank"], row["step"]): set(row["cut_ids"]) for row in run1}
+    if first.keys() != second.keys():
+        return QResult(
+            "Q5",
+            FAIL,
+            tag="non-determinism",
+            detail="run0/run1 step coverage differs",
+            extra={
+                "only_in_run0": list(first.keys() - second.keys())[:3],
+                "only_in_run1": list(second.keys() - first.keys())[:3],
+            },
+        )
+    differing: list[dict] = [
+        {
+            "rank": rank,
+            "step": step,
+            "only_run0": list(cuts - second[(rank, step)])[:3],
+            "only_run1": list(second[(rank, step)] - cuts)[:3],
+        }
+        for (rank, step), cuts in first.items()
+        if cuts != second[(rank, step)]
+    ]
+    if not differing:
+        return QResult("Q5", PASS, detail="run0 == run1 across all (rank, step) cells")
+    return QResult(
+        "Q5",
+        FAIL,
+        tag="non-determinism",
+        detail=f"{len(differing)} cell(s) differ between determinism runs",
+        extra={"examples": differing[:5]},
+    )
+
+
+# --------------------------------------------------------------------------- #
+# Throughput summary (v1 minimal: t_total only).
+# --------------------------------------------------------------------------- #
+
+
+def _collect_throughput(run_dir: Path) -> dict:
+    """Aggregate the ``throughput_rank_*.json`` files found in ``run_dir``.
+
+    Returns ``{"available": False}`` when no such file exists. Otherwise the
+    result holds the median of the per-rank ``p50_ms`` values, the largest
+    ``p95_ms``, batches/s/rank derived from that median, and the largest
+    ``t_first_batch_ms``. ``num_workers`` is taken from the first file in
+    sorted order. ``t_gpu_min_for_overlap_ms`` (median / num_workers) is only
+    added when both operands are truthy.
+    """
+    per_rank = []
+    for path in sorted(run_dir.glob("throughput_rank_*.json")):
+        per_rank.append(json.loads(path.read_text()))
+    if not per_rank:
+        return {"available": False}
+
+    medians = []
+    tails = []
+    for stats in per_rank:
+        if stats.get("p50_ms") is not None:
+            medians.append(stats["p50_ms"])
+        if stats.get("p95_ms") is not None:
+            tails.append(stats["p95_ms"])
+    # Missing/None first-batch timings count as 0; an all-zero result becomes None.
+    first_batch_times = [stats.get("t_first_batch_ms") or 0 for stats in per_rank]
+    worker_count = per_rank[0].get("num_workers")
+    median_p50 = statistics.median(medians) if medians else None
+
+    summary = {
+        "available": True,
+        "num_workers": worker_count,
+        "num_ranks": len(per_rank),
+        "p50_ms_median": median_p50,
+        "p95_ms_max": max(tails) if tails else None,
+        "batches_per_s_per_rank": None,
+        "t_first_batch_ms_max": max(first_batch_times) or None,
+    }
+    if median_p50:
+        summary["batches_per_s_per_rank"] = 1000.0 / median_p50
+        if worker_count:
+            summary["t_gpu_min_for_overlap_ms"] = median_p50 / worker_count
+    return summary
+
+
+# --------------------------------------------------------------------------- #
+# IO helpers.
+# --------------------------------------------------------------------------- #
+
+
+def _load_phase(phase_dir: Path) -> list[dict]:
+    """Read all ``rank_*.jsonl`` files in ``phase_dir`` into one flat row list.
+
+    Files are visited in sorted name order and blank lines are ignored. A
+    missing directory yields an empty list.
+    """
+    loaded: list[dict] = []
+    if phase_dir.exists():
+        for jsonl_path in sorted(phase_dir.glob("rank_*.jsonl")):
+            with open(jsonl_path) as handle:
+                stripped = (raw.strip() for raw in handle)
+                loaded.extend(json.loads(text) for text in stripped if text)
+    return loaded
+
+
+# --------------------------------------------------------------------------- #
+# CLI.
+# --------------------------------------------------------------------------- #
+
+
+@click.command(help=__doc__)
+@click.option(
+    "--output-dir", required=True, type=click.Path(exists=True), help="Directory written by validate_dataloader.py."
+)
+@click.option(
+    "--checkpoint-at",
+    type=int,
+    default=0,
+    show_default=True,
+    help="Step index at which the baseline saved state. Must match the baseline run.",
+)
+@click.option(
+    "--num-determinism-runs",
+    type=int,
+    default=1,
+    show_default=True,
+    help="If >= 2, compares baseline/run0 vs baseline/run1 for Q5.",
+)
+@click.option("-v", "--verbose", is_flag=True, default=False)
+def cli(output_dir: str, checkpoint_at: int, num_determinism_runs: int, verbose: bool) -> None:
+    # Entry point: consolidate the validator output in ``output_dir``, print one
+    # line per question plus a throughput summary, write validation_report.json
+    # next to the inputs, and exit 0 only when report.all_passed is true.
+    # (Kept as comments, not a docstring, so the command help text set via
+    # ``help=__doc__`` above cannot be affected.)
+    logging.basicConfig(
+        level=logging.DEBUG if verbose else logging.INFO,
+        format="[%(asctime)s %(levelname)s] %(message)s",
+        datefmt="%H:%M:%S",
+    )
+    out_dir = Path(output_dir)
+    report = consolidate(out_dir, checkpoint_at=checkpoint_at, num_determinism_runs=num_determinism_runs)
+
+    def _fmt(value, spec: str) -> str:
+        # _collect_throughput reports None for aggregates it could not compute
+        # (e.g. no rank file carried p50_ms); formatting None with a float spec
+        # would raise TypeError and abort before the report is written.
+        return "n/a" if value is None else format(value, spec)
+
+    print(f"\n=== validation report ({len(report.questions)} questions) ===")
+    for q in report.questions:
+        marker = {PASS: "  PASS", WARN: "  WARN", FAIL: "  FAIL", SKIP: "  skip"}[q.status]
+        tag = f" [{q.tag}]" if q.tag else ""
+        print(f"{marker}  {q.q_id}{tag}: {q.detail}")
+    if report.throughput.get("available"):
+        t = report.throughput
+        print(
+            f"\nthroughput: p50={_fmt(t.get('p50_ms_median'), '.1f')}ms "
+            f"p95={_fmt(t.get('p95_ms_max'), '.1f')}ms "
+            f"=> {_fmt(t.get('batches_per_s_per_rank'), '.2f')} batches/s/rank "
+            f"(num_workers={t.get('num_workers')}, "
+            f"T_gpu_min={_fmt(t.get('t_gpu_min_for_overlap_ms', 0), '.1f')}ms)"
+        )
+    else:
+        print("\nthroughput: <not collected>")
+    (out_dir / "validation_report.json").write_text(json.dumps(report.to_dict(), indent=2))
+    print(f"wrote {out_dir / 'validation_report.json'}")
+    sys.exit(0 if report.all_passed else 1)
+
+
+# Run the click command when this file is executed as a script.
+if __name__ == "__main__":
+    cli()
diff --git a/scripts/dataloading/_validate_dataloader/cut_id_dataset.py b/scripts/dataloading/_validate_dataloader/cut_id_dataset.py
new file mode 100644
index 000000000000..373b24c5bb1a
--- /dev/null
+++ b/scripts/dataloading/_validate_dataloader/cut_id_dataset.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""No-op dataset that materializes the per-batch ``cut.id`` list and the
+worker subprocess metadata. The sampler/dataloader machinery decides
+*which* cuts each call gets, which is exactly the question the
+validator answers."""
+
+import torch.utils.data
+
+
+class CutIdDataset(torch.utils.data.Dataset):
+    """Dataset whose items are just the IDs of the cuts it is handed.
+
+    ``__getitem__`` receives a collection of cuts and returns their ``id``
+    values as strings together with the current DataLoader worker's id and
+    the worker count; nothing else about the cuts is read here.
+    """
+
+    def __getitem__(self, cuts):
+        worker = torch.utils.data.get_worker_info()
+        # get_worker_info() is None outside a worker process: report a single worker 0.
+        if worker is None:
+            worker_id, num_workers = 0, 1
+        else:
+            worker_id, num_workers = int(worker.id), int(worker.num_workers)
+        ids = []
+        for cut in cuts:
+            ids.append(str(cut.id))
+        return {"cut_ids": ids, "worker_id": worker_id, "num_workers": num_workers}
diff --git a/scripts/dataloading/_validate_dataloader/pre_validation.py b/scripts/dataloading/_validate_dataloader/pre_validation.py
new file mode 100644
index 000000000000..66ae9155dcda
--- /dev/null
+++ b/scripts/dataloading/_validate_dataloader/pre_validation.py
@@ -0,0 +1,720 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Static pre-validation checks for a train_ds-shaped Lhotse dataloader config.
+
+Run as either a function (``run_pre_validation(cfg)``) or a CLI
+(``python pre_validation.py --config ... --output-dir ...``). All checks
+operate on the resolved OmegaConf node — no iteration, no GPUs, no
+SLURM. Intended runtime: < 5 s on a typical SALM ``train_ds`` config.
+
+The output is a structured report (``pre_validation.json``) listing each
+check's ``PASS``/``WARN``/``FAIL`` status. Exit code is ``0`` iff no
+``FAIL`` checks remain after applying ``--ignore-fail`` overrides.
+"""
+
+import json
+import logging
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Callable, Iterable, Optional
+
+import click
+from omegaconf import DictConfig, ListConfig, OmegaConf
+
+# Module-level logger.
+LOG = logging.getLogger(__name__)
+
+# Status labels returned by the checks and counted in the report summary.
+PASS = "PASS"
+WARN = "WARN"
+FAIL = "FAIL"
+SKIP = "SKIP"
+
+# Seed values for which _check_seed_int emits its dedicated "must be an integer" message.
+_NON_INT_SEED_VALUES = {"randomized", "trng", "trng_initial", None}
+
+
+@dataclass
+class CheckResult:
+    """Outcome of one pre-validation check.
+
+    ``extra`` holds additional check-specific fields; ``PreValidationReport.to_dict``
+    merges them into the same per-check dict as ``status``/``severity``/``detail``.
+    """
+
+    check_id: str
+    severity: str  # FAIL or WARN — the worst this check is permitted to emit
+    status: str  # PASS | WARN | FAIL | SKIP
+    detail: str = ""
+    extra: dict = field(default_factory=dict)
+
+
+@dataclass
+class PreValidationReport:
+    """All check results of one pre-validation run plus their status tally."""
+
+    checks: list[CheckResult]
+    summary: dict
+
+    def to_dict(self):
+        """Return a plain-dict form: per-check entries keyed by check id, plus the summary."""
+        per_check = {}
+        for result in self.checks:
+            entry = {"status": result.status, "severity": result.severity, "detail": result.detail}
+            # Check-specific extras are merged last into the same flat entry.
+            entry.update(result.extra)
+            per_check[result.check_id] = entry
+        return {"checks": per_check, "summary": self.summary}
+
+    @property
+    def all_passed(self) -> bool:
+        """True when no check ended with status FAIL."""
+        for result in self.checks:
+            if result.status == FAIL:
+                return False
+        return True
+
+
+# --------------------------------------------------------------------------- #
+# Public API.
+# --------------------------------------------------------------------------- #
+
+
+def run_pre_validation(cfg: DictConfig, *, ignore_fail: Iterable[str] = ()) -> PreValidationReport:
+    """Execute every check in ``_REGISTRY`` on ``cfg`` (a train_ds-shaped node).
+
+    Check IDs listed in ``ignore_fail`` have a ``FAIL`` outcome rewritten to
+    ``WARN``. A check that raises is recorded as ``FAIL`` instead of aborting
+    the run, so every registered check always produces a result.
+    """
+    suppressed = set(ignore_fail)
+    results: list[CheckResult] = []
+    for check_id, severity, fn in _REGISTRY:
+        try:
+            status, detail, extra = fn(cfg)
+        except Exception as exc:  # pragma: no cover — safety net
+            status = FAIL
+            detail = f"check raised {type(exc).__name__}: {exc}"
+            extra = {}
+        if status == FAIL and check_id in suppressed:
+            status, detail = WARN, f"(downgraded to WARN via --ignore-fail) {detail}"
+        results.append(CheckResult(check_id, severity, status, detail, extra))
+    statuses = [r.status for r in results]
+    summary = {"total": len(statuses)}
+    for key, label in (("pass", PASS), ("warn", WARN), ("fail", FAIL), ("skip", SKIP)):
+        summary[key] = statuses.count(label)
+    return PreValidationReport(checks=results, summary=summary)
+
+
+# --------------------------------------------------------------------------- #
+# Individual checks. Each returns (status, detail, extra_fields).
+# --------------------------------------------------------------------------- #
+
+
+def _check_seed_int(cfg: DictConfig):
+    """Require ``seed`` to be a real integer.
+
+    Returns ``(status, detail, extra)``: ``PASS`` for an ``int`` seed, ``FAIL``
+    for anything else. Values listed in ``_NON_INT_SEED_VALUES`` (including an
+    unset seed) get a dedicated explanation; every other value is reported
+    together with its type.
+    """
+    seed = cfg.get("seed", None)
+    # bool is a subclass of int, so ``seed: true`` would otherwise pass as an integer.
+    if isinstance(seed, int) and not isinstance(seed, bool):
+        return PASS, f"seed={seed}", {}
+    # Only None/str can be members of the set; guarding the membership test keeps
+    # unhashable values (e.g. a list) on the typed FAIL path below instead of raising.
+    if seed is None or (isinstance(seed, str) and seed in _NON_INT_SEED_VALUES):
+        return (
+            FAIL,
+            (
+                f"train_ds.seed is {seed!r}; must be an integer for reproducibility across "
+                "launches and determinism re-runs."
+            ),
+            {},
+        )
+    return FAIL, f"train_ds.seed={seed!r} (type={type(seed).__name__}); must be int", {}
+
+
+def _check_shard_seed_int(cfg: DictConfig):
+    """Require ``shard_seed`` to be a real integer.
+
+    Returns ``(status, detail, extra)``: ``PASS`` for an ``int`` value, ``FAIL``
+    for anything else (including an unset value).
+    """
+    shard_seed = cfg.get("shard_seed", None)
+    # bool is a subclass of int, so ``shard_seed: true`` must be rejected explicitly.
+    if isinstance(shard_seed, int) and not isinstance(shard_seed, bool):
+        return PASS, f"shard_seed={shard_seed}", {}
+    return (
+        FAIL,
+        (
+            f"train_ds.shard_seed={shard_seed!r}; must be an integer. "
+            "LazyIteratorMultiplexer raises under multi-shard + 'randomized'."
+        ),
+        {},
+    )
+
+
+def _check_stateful_on(cfg: DictConfig):
+    """PASS only when ``use_stateful_dataloader`` is exactly ``True``; FAIL otherwise."""
+    enabled = cfg.get("use_stateful_dataloader", False)
+    if enabled is not True:
+        message = "use_stateful_dataloader is not True; resumability validation requires the StatefulDataLoader path."
+        return FAIL, message, {}
+    return PASS, "", {}
+
+
+def _check_indexed_implies_root(cfg: DictConfig):
+    """When ``indexed`` is truthy, require a non-empty ``indexes_root``.
+
+    SKIP if ``indexed`` is falsy; FAIL if ``indexes_root`` is ``None``, the
+    empty string or the literal ``"null"``; PASS otherwise.
+    """
+    if not cfg.get("indexed", False):
+        return SKIP, "train_ds.indexed != True; check not applicable", {}
+    root = cfg.get("indexes_root", None)
+    if root not in (None, "", "null"):
+        return PASS, f"indexes_root={root}", {}
+    message = (
+        "train_ds.indexed=True but indexes_root is unset. Without indexes_root, "
+        "LazyIndexedSharIterator falls back to looking next to (typically remote) "
+        "data files."
+    )
+    return FAIL, message, {}
+
+
+def _check_indexes_root_exists(cfg: DictConfig):
+    """Report whether ``indexes_root`` exists on the local filesystem.
+
+    SKIP when it is unset, PASS when the path exists, WARN (never FAIL) when
+    it does not, since the path may only exist on the target cluster.
+    """
+    root = cfg.get("indexes_root", None)
+    if not root:
+        return SKIP, "indexes_root unset; check not applicable", {}
+    if not Path(root).exists():
+        # A missing path is only a warning: it may be cluster-specific.
+        message = f"indexes_root={root!r} does not exist on this host. Expected on cluster; downgraded to WARN locally."
+        return WARN, message, {}
+    return PASS, f"{root} exists", {}
+
+
+def _check_idx_files_present(cfg: DictConfig):
+    """Sample index files under ``indexes_root`` and verify they are usable.
+
+    SKIP when ``indexes_root`` is unset or absent locally; WARN when
+    ``lhotse.indexing`` cannot be imported or no leaf path is found. At most
+    two shards per leaf and 64 shards overall are inspected; any index file
+    that is missing, or that ``index_exists`` rejects, turns the result into
+    FAIL.
+    """
+    root = cfg.get("indexes_root", None)
+    if not root or not Path(root).exists():
+        return SKIP, "indexes_root not present locally; cluster-side check only", {}
+    try:
+        from lhotse.indexing import index_exists, index_file_path
+    except ImportError as e:
+        return WARN, f"lhotse.indexing import failed: {e}", {}
+    try:
+        from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths
+    except ImportError:
+        expand_sharded_filepaths = None
+
+    leaves = _collect_leaf_paths(cfg)
+    if not leaves:
+        return WARN, "no leaf data paths found under input_cfg", {}
+
+    def _shards_of(raw):
+        # Expand a shard pattern when the NeMo helper is importable; on any
+        # failure fall back to treating the raw path as a single shard.
+        if expand_sharded_filepaths is None:
+            return [raw]
+        try:
+            return expand_sharded_filepaths(raw)
+        except Exception:
+            return [raw]
+
+    candidates: list[str] = []
+    for raw in leaves:
+        candidates.extend(_shards_of(raw)[:2])
+    sampled = candidates[:64]  # global cap, just in case
+
+    missing: list[str] = []
+    truncated: list[str] = []
+    for shard_path in sampled:
+        idx_path = str(index_file_path(shard_path, indexes_root=root))
+        if not Path(idx_path).exists():
+            missing.append(idx_path)
+        elif not index_exists(shard_path, idx_path):
+            truncated.append(idx_path)
+    if not (missing or truncated):
+        return PASS, f"sampled {len(sampled)} .idx files across {len(leaves)} leaves; all valid", {}
+    detail = f"{len(missing)} missing, {len(truncated)} truncated of {len(sampled)} sampled"
+    return FAIL, detail, {"missing": missing[:5], "truncated": truncated[:5]}
+
+
+def _check_constant_time_leaves(cfg: DictConfig):
+    """Flag leaf sources that cannot be accessed in indexed mode.
+
+    A leaf is reported when its type is in ``_STREAMING_ONLY_TYPES``, or when
+    its effective ``indexed`` flag (per-leaf value, falling back to the
+    top-level one) is not exactly ``True``. Offenders yield FAIL when
+    ``use_stateful_dataloader`` is ``True`` and WARN otherwise; the value of
+    ``force_map_dataset`` is not consulted.
+    """
+    stateful = cfg.get("use_stateful_dataloader", False) is True
+    top_indexed = cfg.get("indexed", False) is True
+    non_indexable: list[dict] = []
+    streaming: list[dict] = []
+    for leaf in _iter_leaf_nodes(cfg):
+        leaf_type = leaf.get("type")
+        if leaf_type in _STREAMING_ONLY_TYPES:
+            bucket = non_indexable
+        elif leaf.get("indexed", top_indexed) is not True:
+            bucket = streaming
+        else:
+            continue
+        bucket.append({"type": leaf_type, "corpus": leaf.get("corpus")})
+    if not (non_indexable or streaming):
+        return PASS, "all leaf sources admit constant-time access", {}
+    outcome = FAIL if stateful else WARN
+    offender_count = len(non_indexable) + len(streaming)
+    detail = (
+        f"{offender_count} leaf source(s) lack constant-time access "
+        f"({len(non_indexable)} non-indexable type, {len(streaming)} streaming-mode). "
+        "Resume falls back to O(N) replay; with force_map_dataset=False they also leak "
+        "across ranks."
+    )
+    return outcome, detail, {"non_indexable": non_indexable[:5], "streaming": streaming[:5]}
+
+
+def _check_mux_weights_sum(cfg: DictConfig):
+    """Validate the ``weight`` of every entry in each multiplexer group.
+
+    Each weight must be a positive, finite number; offending entries are
+    reported individually. A group whose valid weights sum to zero or less is
+    reported as well.
+    """
+    problems: list[dict] = []
+    for path, entries in _iter_mux_groups(cfg):
+        weight_sum = 0.0
+        for idx, entry in enumerate(entries):
+            weight = entry.get("weight")
+            usable = isinstance(weight, (int, float)) and not weight <= 0 and _isfinite(weight)
+            if usable:
+                weight_sum += float(weight)
+            else:
+                problems.append({"path": f"{path}[{idx}]", "weight": weight, "type": entry.get("type")})
+        if weight_sum <= 0:
+            problems.append({"path": path, "weights_sum": weight_sum})
+    if not problems:
+        return PASS, "all mux weights sum to finite positive", {}
+    return FAIL, f"{len(problems)} bad weight(s) found", {"examples": problems[:5]}
+
+
+def _check_mux_seed_not_randomized(cfg: DictConfig):
+    """With ``force_map_dataset: false``, require an integer ``shard_seed``.
+
+    SKIP unless ``force_map_dataset`` is exactly ``False`` (a missing key is
+    treated as ``True``); PASS for an ``int`` ``shard_seed``; FAIL otherwise.
+    """
+    if cfg.get("force_map_dataset", True) is not False:
+        return SKIP, "force_map_dataset != False; check not applicable", {}
+    shard_seed = cfg.get("shard_seed")
+    # bool is a subclass of int, so ``shard_seed: true`` must be rejected explicitly.
+    if isinstance(shard_seed, int) and not isinstance(shard_seed, bool):
+        return PASS, f"shard_seed={shard_seed}", {}
+    return (
+        FAIL,
+        (
+            f"force_map_dataset=False but shard_seed={shard_seed!r}. "
+            "LazyIteratorMultiplexer raises ValueError under multi-shard with "
+            "shard_seed='randomized'."
+        ),
+        {},
+    )
+
+
+def _check_slice_length_vs_indexed(cfg: DictConfig):
+    """Reject ``slice_length`` on any source that runs in indexed mode.
+
+    A leaf counts as indexed when the top-level ``indexed`` flag is truthy or
+    when the leaf sets a truthy ``indexed`` itself (the per-leaf override that
+    ``_check_constant_time_leaves`` also honors). SKIP when nothing is indexed,
+    FAIL when an indexed leaf sets ``slice_length``, PASS otherwise.
+    """
+    top_indexed = bool(cfg.get("indexed", False))
+    any_indexed = top_indexed
+    offenders: list[dict] = []
+    for leaf in _iter_leaf_nodes(cfg):
+        # Top-level indexing keeps flagging every leaf as before; a leaf-level
+        # ``indexed: true`` additionally brings that leaf into scope.
+        if not (top_indexed or leaf.get("indexed", False)):
+            continue
+        any_indexed = True
+        if leaf.get("slice_length") is not None:
+            offenders.append({"type": leaf.get("type"), "corpus": leaf.get("corpus")})
+    if not any_indexed:
+        return SKIP, "train_ds.indexed != True; check not applicable", {}
+    if offenders:
+        return (
+            FAIL,
+            (
+                f"{len(offenders)} source(s) set slice_length with indexed=True. "
+                "Lhotse rejects: \"'slice_length' is not supported with indexed=True\"."
+            ),
+            {"examples": offenders[:5]},
+        )
+    return PASS, "", {}
+
+
+def _check_cut_map_fns_vs_indexed(cfg: DictConfig):
+    """Reject ``cut_map_fns`` on any source that runs in indexed mode.
+
+    A leaf counts as indexed when the top-level ``indexed`` flag is truthy or
+    when the leaf sets a truthy ``indexed`` itself (the per-leaf override that
+    ``_check_constant_time_leaves`` also honors). SKIP when nothing is indexed,
+    FAIL when an indexed leaf sets a non-empty ``cut_map_fns``, PASS otherwise.
+    """
+    top_indexed = bool(cfg.get("indexed", False))
+    any_indexed = top_indexed
+    offenders: list[dict] = []
+    for leaf in _iter_leaf_nodes(cfg):
+        # Top-level indexing keeps flagging every leaf as before; a leaf-level
+        # ``indexed: true`` additionally brings that leaf into scope.
+        if not (top_indexed or leaf.get("indexed", False)):
+            continue
+        any_indexed = True
+        if leaf.get("cut_map_fns"):
+            offenders.append({"type": leaf.get("type"), "corpus": leaf.get("corpus")})
+    if not any_indexed:
+        return SKIP, "train_ds.indexed != True; check not applicable", {}
+    if offenders:
+        return (
+            FAIL,
+            (
+                f"{len(offenders)} source(s) set cut_map_fns with indexed=True. "
+                "Lhotse rejects: \"'cut_map_fns' is not supported with indexed=True\"."
+            ),
+            {"examples": offenders[:5]},
+        )
+    return PASS, "", {}
+
+
+def _check_lambda_in_pipeline(cfg: DictConfig):
+    """Textual heuristic for lambda references in the config.
+
+    The config is dumped to YAML without resolving interpolations and every
+    line containing ``<lambda>``, ``lambda:`` or `` lambda `` (space-padded)
+    is collected. Any hit yields WARN; this check never emits FAIL.
+    """
+    needles = ("<lambda>", "lambda:", " lambda ")
+    yaml_text = OmegaConf.to_yaml(cfg, resolve=False)
+    hits: list[str] = [
+        text.strip() for text in yaml_text.splitlines() if any(needle in text for needle in needles)
+    ]
+    if not hits:
+        return PASS, "no lambda references found", {}
+    return WARN, f"{len(hits)} possible lambda reference(s) in config", {"examples": hits[:5]}
+
+
+def _check_bucketer_buffer(cfg: DictConfig):
+    """Warn when the bucketing buffer is small relative to the bucket count.
+
+    SKIP when ``use_bucketing`` is falsy. WARN when either ``num_buckets`` or
+    ``bucket_buffer_size`` is missing/zero, or when the buffer holds fewer
+    than 10 entries per bucket. PASS otherwise.
+    """
+    if not cfg.get("use_bucketing", False):
+        return SKIP, "use_bucketing != True; check not applicable", {}
+    buckets = cfg.get("num_buckets", 0)
+    buffer_len = cfg.get("bucket_buffer_size", 0)
+    if not (buckets and buffer_len):
+        return WARN, f"num_buckets={buckets}, bucket_buffer_size={buffer_len}", {}
+    per_bucket = buffer_len / max(buckets, 1)
+    if per_bucket < 10:
+        warning = (
+            f"bucket_buffer_size={buffer_len} is < 10×num_buckets ({buckets}). "
+            "Low buffers can cause BucketsDontHaveEnoughData mid-run."
+        )
+        return WARN, warning, {"ratio": per_bucket}
+    return PASS, f"bucket_buffer_size={buffer_len}, num_buckets={buckets}, ratio={per_bucket:.1f}", {}
+
+
+def _check_multi_config_flags(cfg: DictConfig):
+    """For ``multi_config`` setups, make sure indexed/indexes_root are supplied.
+
+    SKIP when ``multi_config`` is falsy. PASS when the top level sets both
+    ``indexed`` and ``indexes_root`` (any non-``None`` value counts). Otherwise
+    every dict-like entry of the ``input_cfg`` list must set both keys itself;
+    entries lacking either one are reported by position with FAIL. A
+    non-list ``input_cfg`` yields WARN.
+    """
+    if not cfg.get("multi_config", False):
+        return SKIP, "multi_config != True; check not applicable", {}
+    if cfg.get("indexed") is not None and cfg.get("indexes_root") is not None:
+        return PASS, "top-level indexed+indexes_root will propagate via overwriting_opts", {}
+    sub_cfgs = cfg.get("input_cfg") or []
+    if not isinstance(sub_cfgs, (list, ListConfig)):
+        return WARN, "multi_config=True but input_cfg is not a list", {}
+    incomplete = []
+    for position, sub in enumerate(sub_cfgs):
+        # Non-dict entries are not inspected.
+        if not isinstance(sub, (dict, DictConfig)):
+            continue
+        if sub.get("indexed") is None or sub.get("indexes_root") is None:
+            incomplete.append(position)
+    if not incomplete:
+        return PASS, "every sub-config sets indexed and indexes_root", {}
+    message = (
+        f"multi_config=True; {len(incomplete)} sub-config(s) missing indexed/indexes_root "
+        "and top-level doesn't supply both."
+    )
+    return FAIL, message, {"indices": incomplete[:5]}
+
+
+def _check_text_fields(cfg: DictConfig):
+    """Warn about ``text_field`` values outside a fixed set of known names.
+
+    The top-level ``text_field`` and the ``text_field`` of every
+    ``nemo_tarred`` leaf are compared against the known names; unset values
+    are ignored. No manifest is read. Unknown names yield WARN, never FAIL.
+    """
+    known = {"text", "answer", "transcript", "text_pnc", "text_normalized"}
+    flagged: list[dict] = []
+    top_value = cfg.get("text_field")
+    if not (top_value is None or top_value in known):
+        flagged.append({"path": "train_ds.text_field", "value": top_value})
+    for leaf in _iter_leaf_nodes(cfg):
+        if leaf.get("type") != "nemo_tarred":
+            continue
+        leaf_value = leaf.get("text_field")
+        if leaf_value is None or leaf_value in known:
+            continue
+        flagged.append({"corpus": leaf.get("corpus"), "value": leaf_value})
+    if not flagged:
+        return PASS, "text_field values match known-valid set", {}
+    return (
+        WARN,
+        f"{len(flagged)} unusual text_field value(s); verify against shard 0",
+        {"examples": flagged[:5], "known_valid": sorted(known)},
+    )
+
+
+def _check_world_size_divides_workers(cfg: DictConfig):
+    """Heuristic shard-count check; the runtime rank count is not known here.
+
+    Collects the shard count of every leaf for which ``_count_shards`` returns
+    a value. SKIP when none is available, WARN when the smallest count is
+    below 8, PASS otherwise. The first ten counts are attached either way.
+    """
+    per_source: list[dict] = []
+    for leaf in _iter_leaf_nodes(cfg):
+        shard_count = _count_shards(leaf)
+        if shard_count is None:
+            continue
+        per_source.append({"corpus": leaf.get("corpus") or leaf.get("type"), "shards": shard_count})
+    if not per_source:
+        return SKIP, "no leaf-shard counts derivable from config", {}
+    fewest = min(entry["shards"] for entry in per_source)
+    preview = {"counts": per_source[:10]}
+    if fewest < 8:  # arbitrary "small enough to worry about" heuristic
+        return (
+            WARN,
+            f"smallest source has only {fewest} shards; verify (num_ranks × num_workers) ≤ this",
+            preview,
+        )
+    return PASS, f"smallest source has {fewest} shards", preview
+
+
+# --------------------------------------------------------------------------- #
+# Check registry. Order = output order.
+# --------------------------------------------------------------------------- #
+
+
+# Each entry is (check_id, severity, check_fn). run_pre_validation iterates this
+# list in order, calls check_fn(cfg) and stores the registered severity on the
+# resulting CheckResult; the status itself comes from what check_fn returns.
+_REGISTRY: list[tuple[str, str, Callable[[DictConfig], tuple[str, str, dict]]]] = [
+    ("seed-int", FAIL, _check_seed_int),
+    ("shard-seed-int", FAIL, _check_shard_seed_int),
+    ("stateful-on", FAIL, _check_stateful_on),
+    ("indexed-implies-root", FAIL, _check_indexed_implies_root),
+    ("indexes-root-exists", FAIL, _check_indexes_root_exists),
+    ("idx-files-present", FAIL, _check_idx_files_present),
+    ("constant-time-leaves", FAIL, _check_constant_time_leaves),
+    ("mux-weights-sum", FAIL, _check_mux_weights_sum),
+    ("mux-seed-not-randomized", FAIL, _check_mux_seed_not_randomized),
+    ("slice-length-vs-indexed", FAIL, _check_slice_length_vs_indexed),
+    ("cut-map-fns-vs-indexed", FAIL, _check_cut_map_fns_vs_indexed),
+    ("lambda-in-pipeline", WARN, _check_lambda_in_pipeline),
+    ("bucketer-buffer", WARN, _check_bucketer_buffer),
+    ("multi-config-flags", FAIL, _check_multi_config_flags),
+    ("text-fields", WARN, _check_text_fields),
+    ("world-size-divides-workers", WARN, _check_world_size_divides_workers),
+]
+
+
+# --------------------------------------------------------------------------- #
+# Static topology helpers.
+# --------------------------------------------------------------------------- #
+
+
+# Source ``type`` values that read indexable underlying data. ``_walk`` yields
+# a node of one of these types as a leaf and does not descend into it.
+_LEAF_TYPES = frozenset(
+    {
+        "lhotse_shar",
+        "nemo",
+        "nemo_tarred",
+        "multimodal_conversation",
+        "share_gpt",
+    }
+)
+
+# Source ``type`` values that don't admit constant-time access at all.
+# ``_walk`` also yields these as leaves (it does not recurse into them).
+_STREAMING_ONLY_TYPES = frozenset(
+    {
+        "txt",
+        "txt_pair",
+        "parquet",
+        "multi_speaker_simulator",
+    }
+)
+
+# Transparent passthrough types — ``_walk`` recurses into their ``input_cfg``
+# instead of yielding the node itself. A node with no ``type`` key is walked
+# the same way.
+_TRANSFORM_TYPES = frozenset(
+    {
+        "lhotse_as_conversation",
+        "sqa_as_conversation",
+        "s2s_as_conversation",
+        "s2s_duplex_overlap_as_s2s_duplex",
+        "s2s_duplex_reverse_role",
+        "lhotse_magpietts_data_as_continuation",
+        "nemo_tarred_to_duplex",
+        "group",
+    }
+)
+
+
+def _iter_leaf_nodes(cfg: DictConfig) -> Iterable[DictConfig]:
+    """Yield each leaf-source dict reachable from ``cfg.input_cfg``."""
+    yield from _walk(cfg.get("input_cfg"))
+
+
+def _walk(node: Any) -> Iterable[DictConfig]:
+    if node is None:
+        return
+    if isinstance(node, (list, ListConfig)):
+        for sub in node:
+            yield from _walk(sub)
+        return
+    if isinstance(node, str):
+        # input_cfg reference to another YAML file — try to load it.
+        loaded = _try_load_yaml(node)
+        if loaded is not None:
+            yield from _walk(loaded)
+        return
+    if not isinstance(node, (dict, DictConfig)):
+        return
+    typ = node.get("type")
+    if typ in _LEAF_TYPES or typ in _STREAMING_ONLY_TYPES:
+        yield node
+        return
+    if typ in _TRANSFORM_TYPES or typ is None:
+        if "input_cfg" in node:
+            yield from _walk(node["input_cfg"])
+        return
+    # Unknown type — yield it so it's at least counted, but caller
+    # should be defensive about its keys.
+    yield node
+
+
+def _iter_mux_groups(cfg: DictConfig) -> Iterable[tuple[str, list]]:
+    """Yield ``(path, list-of-entries)`` for each input_cfg list whose entries
+    carry a ``weight`` field (= an implicit multiplexer)."""
+    yield from _walk_mux(cfg.get("input_cfg"), path="train_ds.input_cfg")
+
+
+def _walk_mux(node: Any, path: str) -> Iterable[tuple[str, list]]:
+    if node is None:
+        return
+    if isinstance(node, (list, ListConfig)):
+        entries = [e for e in node if isinstance(e, (dict, DictConfig))]
+        weighted = [e for e in entries if "weight" in e]
+        if weighted and len(weighted) > 1:
+            yield path, list(weighted)
+        for i, sub in enumerate(node):
+            yield from _walk_mux(sub, path=f"{path}[{i}]")
+        return
+    if isinstance(node, str):
+        loaded = _try_load_yaml(node)
+        if loaded is not None:
+            yield from _walk_mux(loaded, path=f"{path}<{Path(node).name}>")
+        return
+    if isinstance(node, (dict, DictConfig)) and "input_cfg" in node:
+        yield from _walk_mux(node["input_cfg"], path=f"{path}.input_cfg")
+
+
+def _collect_leaf_paths(cfg: DictConfig) -> list[str]:
+    """Flat list of every shard path referenced from leaf sources, in YAML order."""
+    out: list[str] = []
+    for leaf in _iter_leaf_nodes(cfg):
+        for path in _leaf_to_paths(leaf):
+            out.append(path)
+    return out
+
+
+def _leaf_to_paths(leaf: DictConfig) -> list[str]:
+    """Resolve the shar/manifest paths inside ``leaf`` into flat strings."""
+    paths: list[str] = []
+    if shar := leaf.get("shar_path"):
+        if isinstance(shar, (dict, DictConfig)):
+            for key in ("cuts", "recording"):
+                v = shar.get(key)
+                if isinstance(v, str):
+                    paths.append(v)
+        elif isinstance(shar, str):
+            paths.append(shar)
+    if mfp := leaf.get("manifest_filepath"):
+        paths.extend(_flatten_str(mfp))
+    if taf := leaf.get("tarred_audio_filepaths"):
+        paths.extend(_flatten_str(taf))
+    if cuts := leaf.get("cuts_path"):
+        paths.extend(_flatten_str(cuts))
+    return paths
+
+
+def _count_shards(leaf: DictConfig) -> Optional[int]:
+    """Best-effort shard count from a leaf's ``_OP_N..M_CL_`` patterns."""
+    import re
+
+    paths = _leaf_to_paths(leaf)
+    if not paths:
+        return None
+    rx = re.compile(r"_OP_(\d+)\.\.(\d+)_CL_")
+    total = 0
+    for p in paths:
+        m = rx.search(str(p))
+        if m:
+            total += int(m.group(2)) - int(m.group(1)) + 1
+    return total or None
+
+
+def _flatten_str(v: Any) -> list[str]:
+    if v is None:
+        return []
+    if isinstance(v, str):
+        return [v]
+    if isinstance(v, (list, ListConfig)):
+        out: list[str] = []
+        for item in v:
+            out.extend(_flatten_str(item))
+        return out
+    return []
+
+
+def _try_load_yaml(path: str) -> Optional[Any]:
+    if not path or not isinstance(path, str):
+        return None
+    p = Path(path)
+    if not p.exists():
+        return None
+    try:
+        return OmegaConf.load(str(p))
+    except Exception as e:
+        LOG.debug("failed to load %s: %s", path, e)
+        return None
+
+
+def _isfinite(x: float) -> bool:
+    import math
+
+    return math.isfinite(x)
+
+
+# --------------------------------------------------------------------------- #
+# CLI.
+# --------------------------------------------------------------------------- #
+
+
+@click.command(help=__doc__)
+@click.option(
+    "--config",
+    "config_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Training YAML containing data.train_ds.",
+)
+@click.option(
+    "--data-blend-dir", default=None, help="Substituted into ${data_blend_dir} in the config (optional locally)."
+)
+@click.option("--section", default="train_ds", show_default=True, help="Which data.* section to validate.")
+@click.option("--output-dir", default=None, type=click.Path(), help="Write pre_validation.json under this directory.")
+@click.option(
+    "--ignore-fail",
+    multiple=True,
+    default=(),
+    help="Repeatable: check IDs whose FAIL outcome should be downgraded to WARN.",
+)
+@click.option("-v", "--verbose", is_flag=True, default=False, help="Verbose logs.")
+def cli(
+    config_path: str,
+    data_blend_dir: Optional[str],
+    section: str,
+    output_dir: Optional[str],
+    ignore_fail: tuple,
+    verbose: bool,
+) -> None:
+    logging.basicConfig(
+        level=logging.DEBUG if verbose else logging.INFO,
+        format="[%(asctime)s %(levelname)s] %(message)s",
+        datefmt="%H:%M:%S",
+    )
+    cfg = OmegaConf.load(config_path)
+    if data_blend_dir is not None:
+        cfg.data_blend_dir = data_blend_dir
+    OmegaConf.resolve(cfg)
+    section_cfg = cfg.data[section]
+    report = run_pre_validation(section_cfg, ignore_fail=ignore_fail)
+
+    # Pretty-print to stdout.
+    print(f"\n=== pre-validation ({len(report.checks)} checks) ===")
+    for c in report.checks:
+        marker = {PASS: "  PASS", WARN: "  WARN", FAIL: "  FAIL", SKIP: "  skip"}[c.status]
+        print(f"{marker}  [{c.check_id}] {c.detail}")
+    print(f"\nsummary: {report.summary}")
+    if output_dir is not None:
+        out = Path(output_dir)
+        out.mkdir(parents=True, exist_ok=True)
+        (out / "pre_validation.json").write_text(json.dumps(report.to_dict(), indent=2))
+        print(f"wrote {out / 'pre_validation.json'}")
+    sys.exit(0 if report.all_passed else 1)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/scripts/dataloading/analyze_resumable_checkpoint.py b/scripts/dataloading/analyze_resumable_checkpoint.py
new file mode 100644
index 000000000000..fa566b5b6494
--- /dev/null
+++ b/scripts/dataloading/analyze_resumable_checkpoint.py
@@ -0,0 +1,1056 @@
+#!/usr/bin/env python
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Analyze resumable Lhotse dataloader progress stored in a checkpoint.
+
+This tool answers two operational questions for indexed/resumable
+training runs:
+
+* how far each leaf dataset in the blend has advanced, expressed as total
+  utilization (for example ``70%`` or ``1389%``), completed epochs, and current
+  in-progress epoch percentage;
+* how the observed consumed-item share compares with the desired blend weight,
+  which surfaces datasets that were over- or under-sampled by the checkpoint.
+
+Expected inputs
+---------------
+Use ``--checkpoint`` for a checkpoint file, a checkpoint directory, or an
+``eval-step-N`` directory. For FSDP/DCP checkpoints, the script first looks for
+metadata-only ``meta.pt`` files and expects NeMo's per-rank
+``train_dataloader_per_rank`` payload. Use ``--allow-full-ckpt-load`` only when
+metadata is unavailable and loading a non-meta checkpoint is acceptable.
+
+Pass ``--config`` when available so the script can resolve the training blend,
+recover desired blend weights, dataset names, and count indexed examples from
+``.idx`` sidecars. Use ``--indexes-root`` when the sidecars live in a mirrored
+index tree instead of next to the manifests/tars. ``--state-json`` is a debugging
+escape hatch for analyzing an already-extracted payload without importing torch.
+
+Outputs
+-------
+By default the script prints a Markdown table. ``--output-dir`` writes
+``summary.json``, ``summary.md``, and ``summary.csv``; the JSON also includes the
+raw leaf progress states and resolved dataset specs for follow-up debugging.
+
+When to use it
+--------------
+Run it after a resumable training checkpoint is produced, before continuing a
+suspicious chain, or during a dataloader postmortem when blend utilization looks
+wrong. It is read-only: it never modifies checkpoints, indexes, or configs.
+"""
+from __future__ import annotations
+
+import argparse
+import csv
+import datetime as dt
+import json
+import math
+import os
+import re
+import sys
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Iterable
+
+
+try:
+    import yaml
+except ImportError as exc:  # pragma: no cover - startup guard
+    raise SystemExit("PyYAML is required to parse training/blend configs.") from exc
+
+
+# Matches a brace range ``{A..B}`` or ``{A..B..C}`` where each part is an
+# optionally negative integer; groups 1-3 capture A, B and the optional C.
+BRACE_RANGE_PATTERN = re.compile(r"\{(-?\d+)\.\.(-?\d+)(?:\.\.(-?\d+))?\}")
+# Matches a trailing ``eval-step-<N>`` and captures N.
+EVAL_STEP_PATTERN = re.compile(r"eval-step-(\d+)$")
+# Checkpoint key whose per-rank payload ``extract_progress`` looks for first.
+STATEFUL_KEY = "train_dataloader_per_rank"
+# NOTE(review): presumably the keys that identify a leaf sampler-progress dict;
+# the code using this set is outside this view — confirm.
+POSITION_KEYS = {"position", "shard_id", "num_shards"}
+# NOTE(review): module-level cache keyed by a string with a per-directory
+# ``{str: int}`` mapping (or ``None``) as value, judging by the annotation;
+# populated elsewhere in the file — confirm exact key/value meaning.
+INDEX_DIR_CACHE: dict[str, dict[str, int] | None] = {}
+
+
+# ---------------------------------------------------------------------------
+# Public data model
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class DatasetSpec:
+    """One leaf dataset of the training blend, as resolved from the config."""
+
+    # Position of the leaf in config traversal order; matched against
+    # ``LeafProgress.source_index`` when summarizing.
+    source_index: int
+    # Display name produced by ``_dataset_name``.
+    name: str
+    # Product of the normalized weights along the path from the blend root.
+    desired_weight: float | None = None
+    # The leaf's own ``weight`` value as written in the config, if numeric.
+    raw_weight: float | None = None
+    # The leaf's ``hours`` value, if numeric.
+    hours: float | None = None
+    # String form of the leaf's ``type`` key, when present.
+    kind: str | None = None
+    # First path of the path group used for counting, when any path exists.
+    source_path: str | None = None
+    # Item count obtained from index sidecars; None when it could not be counted.
+    total_items: int | None = None
+    # Index paths reported missing while counting (capped at 20 entries).
+    missing_index_paths: list[str] = field(default_factory=list)
+
+
+@dataclass
+class LeafProgress:
+    """Progress record for one leaf source found in a checkpoint payload."""
+
+    # Used to group records and to match them to ``DatasetSpec.source_index``.
+    source_index: int
+    # ``dp_rank`` of the per-rank entry (or its list position); None when the
+    # record came from a raw ``sampler_state`` scan.
+    rank: int | None
+    # Worker label derived from the sampler-state path by ``_worker_from_path``.
+    worker: str | None
+    # NOTE(review): presumably the kind of iterator state this record was read
+    # from; set by ``_collect_leaves_from_sampler`` (outside this view).
+    state_type: str
+    # NOTE(review): presumably the epoch counter and the offset within it;
+    # ``summarize`` only reports their min/max directly.
+    epoch: int
+    position: int
+    # NOTE(review): presumably the partition id / partition count of this
+    # rank-worker slice — confirm against ``_collect_leaves_from_sampler``.
+    shard_id: int | None
+    num_shards: int | None
+    # Dataset length recorded in the checkpoint, if any; ``summarize`` prefers
+    # it over the index-derived ``DatasetSpec.total_items``.
+    total_len: int | None
+    # NOTE(review): presumably the location of this state inside the payload.
+    state_path: str
+
+
+@dataclass
+class SummaryRow:
+    """One report row per source index, as produced by ``summarize``."""
+
+    # Source index the progress records were grouped by.
+    source_index: int
+    # ``DatasetSpec.name`` or ``source-<index>`` when no spec matches.
+    dataset: str
+    # Distinct ``LeafProgress.state_type`` values, sorted and joined with "/".
+    state_type: str
+    # Desired blend weight from the matching spec, if any.
+    desired_weight: float | None
+    # This source's consumed items divided by the consumed items of all sources
+    # whose consumption is known.
+    observed_weight: float | None
+    # ``observed_weight - desired_weight``.
+    drift_abs: float | None
+    # ``observed_weight / desired_weight``; None when desired is missing or 0.
+    drift_ratio: float | None
+    # ``100 * consumed_items / total_items``.
+    utilization_pct: float | None
+    # ``floor(utilization_pct / 100)``.
+    completed_epochs: int | None
+    # ``utilization_pct`` minus the completed epochs, i.e. the remainder in percent.
+    current_epoch_pct: float | None
+    # Sum over all partitions; None if any partition's count is unknown.
+    consumed_items: int | None
+    # Checkpoint ``total_len`` when present, else the spec's indexed total.
+    total_items: int | None
+    # Number of ``LeafProgress`` records aggregated into this row.
+    partitions_seen: int
+    # Extremes of ``epoch`` / ``position`` across the aggregated records.
+    min_epoch: int | None
+    max_epoch: int | None
+    min_position: int | None
+    max_position: int | None
+    # True when no total item count is available for this source.
+    missing_total: bool
+    # Free-form remarks joined with "; ".
+    notes: str = ""
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def collect_dataset_specs(
+    config: dict[str, Any] | None,
+    *,
+    config_path: Path | None,
+    indexes_root: str | None,
+    data_blend_dir: str | None,
+) -> list[DatasetSpec]:
+    """Resolve training blend leaves into ordered dataset specs.
+
+    The checkpoint records source progress by leaf order, not by dataset name.
+    This function walks ``data.train_ds.input_cfg`` with the same nested blend
+    references and temperature-normalized weights used by the recipe, then
+    counts examples from indexed sidecars when available.
+
+    Args:
+        config: Parsed training config. An empty/``None`` config, or one whose
+            ``data.train_ds`` is not a dict, yields an empty list.
+        config_path: Path the config was loaded from; its parent directory is
+            passed to ``_load_ref_if_yaml`` as the starting ``current_dir``.
+        indexes_root: Forwarded to ``_count_indexed_items``.
+        data_blend_dir: Forwarded to ``_load_ref_if_yaml``. When ``None``, the
+            config's top-level ``data_blend_dir`` string is used if present.
+
+    Returns:
+        One ``DatasetSpec`` per leaf, with ``source_index`` equal to the leaf's
+        position in traversal order.
+    """
+    if not config:
+        return []
+    train_ds = _get_path(config, "data.train_ds")
+    if not isinstance(train_ds, dict):
+        return []
+    if data_blend_dir is None:
+        # Fall back to the value stored in the config itself.
+        raw_dir = config.get("data_blend_dir")
+        if isinstance(raw_dir, str):
+            data_blend_dir = raw_dir
+    current_dir = config_path.parent if config_path is not None else None
+    temps = _temperature_list(train_ds)
+    # Leaves are appended in traversal order; that order defines source_index.
+    leaves: list[dict[str, Any]] = []
+
+    def recurse(
+        node: Any,
+        cumulative_weight: float,
+        level: int,
+        cur_dir: Path | None,
+        inherited: dict[str, Any],
+    ) -> None:
+        # ``level`` indexes into ``temps`` and grows by one per list nesting;
+        # ``inherited`` carries keys collected from enclosing mappings.
+        node, next_dir = _load_ref_if_yaml(node, data_blend_dir=data_blend_dir, current_dir=cur_dir)
+        if isinstance(node, dict):
+            merged = dict(inherited)
+            for key, value in node.items():
+                if key not in ("input_cfg", "weight"):
+                    # setdefault: a key already inherited from an ancestor is
+                    # NOT replaced by this intermediate node's value.
+                    merged.setdefault(key, value)
+            if "input_cfg" in node:
+                child = node["input_cfg"]
+                recurse(child, cumulative_weight, level, next_dir, merged)
+            else:
+                # Leaf: start from the propagated keys, then let the leaf's own
+                # keys (including ``weight``) override them.
+                leaf = dict(merged)
+                leaf.update(node)
+                leaf["_desired_weight"] = cumulative_weight
+                leaf["_raw_weight"] = _safe_float(node.get("weight"))
+                leaves.append(leaf)
+            return
+        if isinstance(node, list):
+            weights = [_safe_float(item.get("weight")) if isinstance(item, dict) else None for item in node]
+            if all(w is not None for w in weights):
+                # Temperature for this nesting level; 1.0 when none is configured.
+                temperature = temps[level] if temps and level < len(temps) else 1.0
+                local_weights = _normalize_weight_vector([float(w) for w in weights], temperature=temperature)
+            else:
+                # At least one entry has no numeric weight: split uniformly.
+                local_weights = [1.0 / len(node)] * len(node)
+            for item, local_weight in zip(node, local_weights):
+                item_weight = cumulative_weight * local_weight
+                recurse(item, item_weight, level + 1, next_dir, inherited)
+            return
+        # NOTE(review): reached only when the ``_load_ref_if_yaml`` call at the
+        # top returned something that is neither a dict nor a list. Whether
+        # reloading the same reference here can recurse without terminating
+        # depends on ``_load_ref_if_yaml`` (outside this view) — confirm.
+        if _looks_like_yaml_ref(node):
+            loaded, loaded_dir = _load_ref_if_yaml(node, data_blend_dir=data_blend_dir, current_dir=cur_dir)
+            recurse(loaded, cumulative_weight, level, loaded_dir, inherited)
+
+    recurse(train_ds.get("input_cfg"), 1.0, 0, current_dir, {})
+
+    specs: list[DatasetSpec] = []
+    for idx, leaf in enumerate(leaves):
+        path_groups = _source_path_groups_for_item(leaf)
+        paths = path_groups[0] if path_groups else []
+        total_items = None
+        missing: list[str] = []
+        # Try each candidate path group; the first one that yields a total wins
+        # and its missing-index list replaces anything gathered so far.
+        for group in path_groups:
+            group_total, group_missing = _count_indexed_items(group, indexes_root)
+            if group_total is not None:
+                total_items = group_total
+                paths = group
+                missing = group_missing
+                break
+            missing.extend(group_missing[:20])
+        source_path = paths[0] if paths else None
+        specs.append(
+            DatasetSpec(
+                source_index=idx,
+                name=_dataset_name(leaf, source_path, idx),
+                desired_weight=_safe_float(leaf.get("_desired_weight")),
+                raw_weight=_safe_float(leaf.get("_raw_weight")),
+                hours=_safe_float(leaf.get("hours")),
+                kind=str(leaf.get("type")) if leaf.get("type") is not None else None,
+                source_path=source_path,
+                total_items=total_items,
+                # Keep the stored list short; only the first 20 are retained.
+                missing_index_paths=missing[:20],
+            )
+        )
+    return specs
+
+
+def extract_progress(payload: Any) -> tuple[list[LeafProgress], list[str]]:
+    """Extract per-leaf dataloader progress from a loaded checkpoint payload.
+
+    The preferred layout is NeMo's ``train_dataloader_per_rank`` list, but the
+    scanner also handles raw nested ``sampler_state`` payloads for debugging and
+    compatibility with partially extracted state dumps.
+    """
+    notes: list[str] = []
+    progress: list[LeafProgress] = []
+    stateful_payloads = _find_stateful_payloads(payload)
+    if stateful_payloads:
+        notes.append(f"found {len(stateful_payloads)} {STATEFUL_KEY!r} payload(s)")
+        for state_path, per_rank in stateful_payloads:
+            for idx, entry in enumerate(per_rank):
+                if not isinstance(entry, dict):
+                    continue
+                rank = entry.get("dp_rank", idx)
+                rank = rank if isinstance(rank, int) else idx
+                inner_state = entry.get("state", entry)
+                for sampler_path, sampler_state in _find_sampler_states(inner_state, f"{state_path}[{idx}].state"):
+                    worker = _worker_from_path(sampler_path)
+                    progress.extend(
+                        _collect_leaves_from_sampler(sampler_state, rank=rank, worker=worker, path=sampler_path)
+                    )
+    else:
+        notes.append(f"no {STATEFUL_KEY!r} payload found; scanning for raw sampler_state entries")
+        for sampler_path, sampler_state in _find_sampler_states(payload):
+            worker = _worker_from_path(sampler_path)
+            progress.extend(_collect_leaves_from_sampler(sampler_state, rank=None, worker=worker, path=sampler_path))
+    progress, removed = _deduplicate_progress(progress)
+    if removed:
+        notes.append(f"deduplicated {removed} duplicate leaf progress state(s)")
+    return progress, notes
+
+
+def summarize(progress: list[LeafProgress], specs: list[DatasetSpec]) -> list[SummaryRow]:
+    """Combine checkpoint progress and dataset specs into report rows.
+
+    Consumed examples are aggregated across ranks/workers, converted to dataset
+    utilization percentages when totals are known, and normalized into observed
+    blend weights for drift reporting.
+    """
+    spec_by_index = {spec.source_index: spec for spec in specs}
+    grouped: dict[int, list[LeafProgress]] = {}
+    for leaf in progress:
+        grouped.setdefault(leaf.source_index, []).append(leaf)
+    consumed_by_index: dict[int, int | None] = {}
+    total_observed_consumed = 0
+    for source_index, leaves in grouped.items():
+        spec = spec_by_index.get(source_index)
+        total_len = next((leaf.total_len for leaf in leaves if leaf.total_len is not None), None)
+        if total_len is None and spec is not None:
+            total_len = spec.total_items
+        values = [_consumed_items(leaf, total_len) for leaf in leaves]
+        consumed = sum(v for v in values if v is not None) if all(v is not None for v in values) else None
+        consumed_by_index[source_index] = consumed
+        if consumed is not None:
+            total_observed_consumed += consumed
+
+    rows: list[SummaryRow] = []
+    for source_index in sorted(grouped):
+        leaves = grouped[source_index]
+        spec = spec_by_index.get(source_index)
+        total_len = next((leaf.total_len for leaf in leaves if leaf.total_len is not None), None)
+        if total_len is None and spec is not None:
+            total_len = spec.total_items
+        consumed = consumed_by_index[source_index]
+        utilization = (100.0 * consumed / total_len) if consumed is not None and total_len else None
+        observed = (consumed / total_observed_consumed) if consumed is not None and total_observed_consumed else None
+        desired = spec.desired_weight if spec is not None else None
+        drift_abs = observed - desired if observed is not None and desired is not None else None
+        drift_ratio = observed / desired if observed is not None and desired not in (None, 0) else None
+        completed_epochs = math.floor(utilization / 100.0) if utilization is not None else None
+        current_epoch_pct = (
+            utilization - completed_epochs * 100.0
+            if utilization is not None and completed_epochs is not None
+            else None
+        )
+        notes = []
+        if spec is None:
+            notes.append("no matching config source")
+        elif spec.total_items is None and total_len is None:
+            notes.append("missing total; provide --indexes-root or a config with indexed sidecars")
+        elif spec.missing_index_paths:
+            notes.append(f"{len(spec.missing_index_paths)} missing index path(s)")
+        rows.append(
+            SummaryRow(
+                source_index=source_index,
+                dataset=spec.name if spec is not None else f"source-{source_index}",
+                state_type="/".join(sorted({leaf.state_type for leaf in leaves})),
+                desired_weight=desired,
+                observed_weight=observed,
+                drift_abs=drift_abs,
+                drift_ratio=drift_ratio,
+                utilization_pct=utilization,
+                completed_epochs=completed_epochs,
+                current_epoch_pct=current_epoch_pct,
+                consumed_items=consumed,
+                total_items=total_len,
+                partitions_seen=len(leaves),
+                min_epoch=min(leaf.epoch for leaf in leaves) if leaves else None,
+                max_epoch=max(leaf.epoch for leaf in leaves) if leaves else None,
+                min_position=min(leaf.position for leaf in leaves) if leaves else None,
+                max_position=max(leaf.position for leaf in leaves) if leaves else None,
+                missing_total=total_len is None,
+                notes="; ".join(notes),
+            )
+        )
+    return rows
+
+
+def load_checkpoint_payload(path: Path, *, allow_full_load: bool, max_full_load_mb: int) -> tuple[Any, Path]:
+    """Load the smallest checkpoint payload that contains dataloader state.
+
+    Metadata files are preferred. Full checkpoint files are skipped unless
+    explicitly allowed and under the configured size cap.
+    """
+    errors = []
+    for candidate in _checkpoint_metadata_candidates(path):
+        if not candidate.is_file():
+            continue
+        if candidate.name != "meta.pt" and not allow_full_load:
+            size_mb = candidate.stat().st_size / (1024 * 1024)
+            if size_mb > max_full_load_mb:
+                errors.append(f"skipped large non-meta checkpoint {candidate} ({size_mb:.1f} MiB)")
+                continue
+        try:
+            payload = _torch_load(candidate)
+        except Exception as exc:  # pragma: no cover - depends on checkpoint format
+            errors.append(f"{candidate}: {exc}")
+            continue
+        progress, _ = extract_progress(payload)
+        if progress:
+            return payload, candidate
+        errors.append(f"{candidate}: loaded but no dataloader progress state found")
+    detail = "\n".join(errors[-10:])
+    raise RuntimeError(f"Could not find dataloader state for {path}.\n{detail}")
+
+
+def load_config(path: Path | None, checkpoint: Path) -> tuple[dict[str, Any] | None, Path | None, list[str]]:
+    """Load an explicit or nearby training config used to annotate the report."""
+    notes = []
+    candidates = [path] if path is not None else _auto_config_candidates(checkpoint)
+    for candidate in candidates:
+        if candidate is None or not candidate.is_file():
+            continue
+        try:
+            if candidate.suffix == ".json":
+                data = _load_json(candidate)
+            else:
+                data = _load_yaml(candidate)
+        except Exception as exc:
+            notes.append(f"failed to load config candidate {candidate}: {exc}")
+            continue
+        if isinstance(data, dict):
+            if _get_path(data, "data.train_ds") is not None:
+                return data, candidate, notes
+            notes.append(f"skipped config candidate {candidate}: no data.train_ds")
+    if path is None:
+        notes.append("no config found; pass --config for desired weights and index totals")
+    else:
+        notes.append(f"config not found or invalid: {path}")
+    return None, None, notes
+
+
+def markdown_table(rows: list[SummaryRow]) -> str:
+    """Render summary rows as a compact Markdown table for logs/stdout."""
+    headers = [
+        "idx",
+        "dataset",
+        "utilization",
+        "epochs",
+        "desired_w",
+        "observed_w",
+        "drift",
+        "items",
+        "total",
+        "parts",
+        "notes",
+    ]
+    lines = [
+        "| " + " | ".join(headers) + " |",
+        "| " + " | ".join(["---"] * len(headers)) + " |",
+    ]
+    for row in rows:
+        epoch_text = ""
+        if row.completed_epochs is not None and row.current_epoch_pct is not None:
+            epoch_text = f"{row.completed_epochs} + {row.current_epoch_pct:.2f}%"
+        values = [
+            str(row.source_index),
+            row.dataset.replace("|", "\\|"),
+            _fmt_pct(row.utilization_pct),
+            epoch_text,
+            _fmt_float(row.desired_weight),
+            _fmt_float(row.observed_weight),
+            _fmt_float(row.drift_abs),
+            "" if row.consumed_items is None else str(row.consumed_items),
+            "" if row.total_items is None else str(row.total_items),
+            str(row.partitions_seen),
+            row.notes.replace("|", "\\|"),
+        ]
+        lines.append("| " + " | ".join(values) + " |")
+    return "\n".join(lines) + "\n"
+
+
+def write_outputs(summary: dict[str, Any], rows: list[SummaryRow], args: argparse.Namespace) -> None:
+    """Write requested JSON, Markdown, and CSV artifacts."""
+    output_dir = Path(args.output_dir) if args.output_dir else None
+    if output_dir is not None:
+        output_dir.mkdir(parents=True, exist_ok=True)
+    json_path = Path(args.output_json) if args.output_json else (output_dir / "summary.json" if output_dir else None)
+    md_path = Path(args.output_md) if args.output_md else (output_dir / "summary.md" if output_dir else None)
+    csv_path = Path(args.output_csv) if args.output_csv else (output_dir / "summary.csv" if output_dir else None)
+    if json_path is not None:
+        json_path.parent.mkdir(parents=True, exist_ok=True)
+        json_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+    if md_path is not None:
+        md_path.parent.mkdir(parents=True, exist_ok=True)
+        body = [
+            f"# Resumable Dataloader Checkpoint Analysis",
+            "",
+            f"- checkpoint: `{summary['checkpoint_input']}`",
+            f"- metadata_loaded: `{summary.get('checkpoint_metadata_loaded')}`",
+            f"- config: `{summary.get('config_path') or ''}`",
+            f"- generated_at: `{summary['generated_at']}`",
+            "",
+            markdown_table(rows),
+        ]
+        md_path.write_text("\n".join(body), encoding="utf-8")
+    if csv_path is not None:
+        csv_path.parent.mkdir(parents=True, exist_ok=True)
+        with csv_path.open("w", encoding="utf-8", newline="") as f:
+            writer = csv.DictWriter(f, fieldnames=list(asdict(rows[0]).keys()) if rows else ["source_index"])
+            writer.writeheader()
+            for row in rows:
+                writer.writerow(asdict(row))
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse CLI flags for local or cluster-submitted analysis runs."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--checkpoint", "--checkpoint-path", dest="checkpoint", help="Checkpoint file/dir or eval-step dir."
+    )
+    parser.add_argument("--state-json", help="JSON payload to analyze instead of loading a torch checkpoint.")
+    parser.add_argument("--config", help="Training YAML/JSON for desired weights and source index totals.")
+    parser.add_argument("--data-blend-dir", help="Override ${data_blend_dir} while resolving nested blend YAMLs.")
+    parser.add_argument("--indexes-root", help="Root containing mirrored .idx sidecars, e.g. /tmp/idx.")
+    parser.add_argument("--output-dir", help="Directory for summary.json/summary.md/summary.csv.")
+    parser.add_argument("--output-json", help="Explicit JSON output path.")
+    parser.add_argument("--output-md", help="Explicit Markdown output path.")
+    parser.add_argument("--output-csv", help="Explicit CSV output path.")
+    parser.add_argument("--allow-full-ckpt-load", action="store_true", help="Allow loading non-meta checkpoint files.")
+    parser.add_argument("--max-full-load-mb", type=int, default=512, help="Safety cap for non-meta checkpoint files.")
+    parser.add_argument("--print-table", action="store_true", help="Print Markdown table to stdout.")
+    return parser.parse_args()
+
+
+def main() -> int:
+    """CLI entrypoint for loading inputs, computing rows, and writing outputs."""
+    args = parse_args()
+    if not args.checkpoint and not args.state_json:
+        raise SystemExit("Pass --checkpoint or --state-json.")
+
+    checkpoint = Path(args.checkpoint).expanduser() if args.checkpoint else Path(args.state_json).expanduser()
+    loaded_path: Path | None = None
+    if args.state_json:
+        payload = _load_json(Path(args.state_json).expanduser())
+        loaded_path = Path(args.state_json).expanduser()
+    else:
+        payload, loaded_path = load_checkpoint_payload(
+            checkpoint,
+            allow_full_load=args.allow_full_ckpt_load,
+            max_full_load_mb=args.max_full_load_mb,
+        )
+
+    progress, notes = extract_progress(payload)
+    config_path = Path(args.config).expanduser() if args.config else None
+    config, loaded_config_path, config_notes = load_config(config_path, checkpoint)
+    notes.extend(config_notes)
+    specs = collect_dataset_specs(
+        config,
+        config_path=loaded_config_path,
+        indexes_root=args.indexes_root,
+        data_blend_dir=args.data_blend_dir,
+    )
+    if specs and len(specs) != len({leaf.source_index for leaf in progress}):
+        notes.append(
+            f"config source count ({len(specs)}) differs from checkpoint source count "
+            f"({len({leaf.source_index for leaf in progress})}); mapping is by source order only"
+        )
+    rows = summarize(progress, specs)
+    summary = {
+        "checkpoint_input": str(checkpoint),
+        "checkpoint_metadata_loaded": str(loaded_path) if loaded_path else None,
+        "config_path": str(loaded_config_path) if loaded_config_path else None,
+        "indexes_root": args.indexes_root,
+        "generated_at": dt.datetime.now(dt.timezone.utc).isoformat(),
+        "notes": notes,
+        "num_leaf_progress_states": len(progress),
+        "num_summary_rows": len(rows),
+        "rows": [asdict(row) for row in rows],
+        "leaf_progress": [asdict(leaf) for leaf in progress],
+        "dataset_specs": [asdict(spec) for spec in specs],
+    }
+    write_outputs(summary, rows, args)
+    if args.print_table or not (args.output_dir or args.output_json or args.output_md or args.output_csv):
+        sys.stdout.write(markdown_table(rows))
+        if notes:
+            sys.stdout.write("\nNotes:\n")
+            for note in notes:
+                sys.stdout.write(f"- {note}\n")
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# Private helpers
+# ---------------------------------------------------------------------------
+
+
+def _load_yaml(path: Path) -> Any:
+    """Read ``path`` as UTF-8 text and parse it with ``yaml.safe_load``."""
+    with path.open(mode="r", encoding="utf-8") as stream:
+        document = yaml.safe_load(stream)
+    return document
+
+
+def _load_json(path: Path) -> Any:
+    """Read ``path`` as UTF-8 text and return the decoded JSON document."""
+    with path.open(mode="r", encoding="utf-8") as stream:
+        document = json.load(stream)
+    return document
+
+
+def _safe_float(value: Any) -> float | None:
+    """Convert ``value`` to ``float`` when possible.
+
+    ``None`` and booleans are rejected outright; any other value that
+    ``float()`` refuses with ``TypeError`` or ``ValueError`` gives ``None``.
+    """
+    if value is None or isinstance(value, bool):
+        return None
+    if not isinstance(value, (int, float)):
+        try:
+            return float(value)
+        except (TypeError, ValueError):
+            return None
+    return float(value)
+
+
+def _get_path(data: Any, dotted: str) -> Any:
+    """Follow a dot-separated key path through nested dicts.
+
+    Returns the value found at the end of the path, or ``None`` as soon as a
+    key is missing or a non-dict is reached before the path is exhausted.
+    """
+    node = data
+    for key in dotted.split("."):
+        if isinstance(node, dict):
+            node = node.get(key)
+        else:
+            return None
+    return node
+
+
+def _normalize_weight_vector(weights: list[float], temperature: float = 1.0) -> list[float]:
+    """Raise each weight to ``temperature`` and rescale the results to sum to one.
+
+    An empty input gives an empty list. When the powered weights do not sum
+    to a positive number, a uniform distribution is returned instead.
+    """
+    count = len(weights)
+    if count == 0:
+        return []
+    powered = [weight**temperature for weight in weights]
+    denominator = sum(powered)
+    if denominator <= 0:
+        return [1.0 / count] * count
+    return [value / denominator for value in powered]
+
+
+def _strip_url_scheme(path: str) -> str:
+    """Drop a leading ``scheme://`` from ``path``; otherwise drop its leading slashes."""
+    scheme_match = re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://(.+)$", path)
+    if scheme_match is None:
+        return path.lstrip("/")
+    return scheme_match.group(1)
+
+
+def _index_path_for(data_path: str, indexes_root: str | None) -> Path | None:
+    """Locate the ``.idx`` file expected for ``data_path``.
+
+    With ``indexes_root`` set, the index lives under that root at the data
+    path with its URL scheme (or leading slashes) stripped. Without it, the
+    index sits beside the data file; ``scheme://`` paths then give ``None``.
+    """
+    if indexes_root:
+        mirrored = _strip_url_scheme(data_path) + ".idx"
+        return Path(indexes_root) / mirrored
+    has_scheme = re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", data_path) is not None
+    return None if has_scheme else Path(data_path + ".idx")
+
+
+def _indexed_file_size(idx_path: Path) -> int | None:
+    """Return the byte size of ``idx_path``, or ``None`` if it is not a file in its directory.
+
+    The parent directory is listed once and the ``{name: size}`` mapping is
+    kept in ``INDEX_DIR_CACHE``; a directory that does not exist is cached as
+    ``None`` so it is not listed again.
+    """
+    directory = str(idx_path.parent)
+    if directory not in INDEX_DIR_CACHE:
+        try:
+            listing = {entry.name: entry.stat().st_size for entry in os.scandir(idx_path.parent) if entry.is_file()}
+        except FileNotFoundError:
+            listing = None
+        INDEX_DIR_CACHE[directory] = listing
+    sizes = INDEX_DIR_CACHE[directory]
+    if sizes is None:
+        return None
+    return sizes.get(idx_path.name)
+
+
+def _fallback_brace_expand(path: str) -> list[str]:
+    """Expand numeric ``{start..end[..step]}`` ranges without the ``braceexpand`` package.
+
+    The first range found by ``BRACE_RANGE_PATTERN`` is expanded and the
+    function recurses on every result, so several ranges in one path are all
+    expanded. As in bash and ``braceexpand``, numbers are zero-padded only
+    when one of the bounds is itself written with a leading zero:
+    ``{0..127}`` gives ``0 .. 127`` while ``{000..127}`` gives ``000 .. 127``.
+
+    Args:
+        path: Path that may contain brace ranges.
+
+    Returns:
+        The expanded paths, or ``[path]`` unchanged when no range is found,
+        the step is zero, or the step points away from ``end``.
+    """
+    match = BRACE_RANGE_PATTERN.search(path)
+    if not match:
+        return [path]
+    start_text, end_text, step_text = match.group(1), match.group(2), match.group(3)
+    start, end = int(start_text), int(end_text)
+    step = int(step_text) if step_text is not None else (1 if start <= end else -1)
+    if step == 0 or (start < end and step < 0) or (start > end and step > 0):
+        return [path]
+    start_digits, end_digits = start_text.lstrip("-"), end_text.lstrip("-")
+    # Padding unconditionally to the widest bound would turn ``{8..10}`` into
+    # 08, 09, 10 and point at shard files that do not exist.
+    zero_padded = any(len(digits) > 1 and digits.startswith("0") for digits in (start_digits, end_digits))
+    width = max(len(start_digits), len(end_digits)) if zero_padded else 0
+    stop = end + (1 if step > 0 else -1)
+    prefix, suffix = path[: match.start()], path[match.end() :]
+    expanded: list[str] = []
+    for idx in range(start, stop, step):
+        sign = "-" if idx < 0 else ""
+        expanded.extend(_fallback_brace_expand(f"{prefix}{sign}{str(abs(idx)).zfill(width)}{suffix}"))
+    return expanded
+
+
+def _expand_op_path(path: str) -> list[str]:
+    """Expand a sharded path spec into the individual paths it denotes.
+
+    Mirrors NeMo ``expand_sharded_filepaths()``: ``(``, ``[``, ``<`` and
+    ``_OP_`` are rewritten to ``{`` and ``)``, ``]``, ``>`` and ``_CL_`` to
+    ``}`` before brace expansion. ``braceexpand`` is used when importable,
+    otherwise the local fallback.
+    """
+    # Openers are rewritten before closers, in this order.
+    rewrites = (
+        ("(", "{"),
+        ("[", "{"),
+        ("<", "{"),
+        ("_OP_", "{"),
+        (")", "}"),
+        ("]", "}"),
+        (">", "}"),
+        ("_CL_", "}"),
+    )
+    normalized = path
+    for token, brace in rewrites:
+        normalized = normalized.replace(token, brace)
+    try:
+        import braceexpand
+
+        return list(braceexpand.braceexpand(normalized, escape=False))
+    except ImportError:
+        return _fallback_brace_expand(normalized)
+
+
+def _flatten_path_values(value: Any) -> list[str]:
+    """Collect the path strings held by a config value.
+
+    Accepts a string, a list/tuple whose items are strings, ``[path, ...]``
+    sequences (only the leading string is kept) or dicts, and dicts whose
+    values are flattened recursively. Anything else contributes nothing.
+    """
+    if isinstance(value, str):
+        return [value]
+    collected: list[str] = []
+    if isinstance(value, (list, tuple)):
+        for element in value:
+            if isinstance(element, str):
+                collected.append(element)
+            elif isinstance(element, (list, tuple)) and element and isinstance(element[0], str):
+                collected.append(element[0])
+            elif isinstance(element, dict):
+                collected += _flatten_path_values(element)
+    elif isinstance(value, dict):
+        for nested in value.values():
+            collected += _flatten_path_values(nested)
+    return collected
+
+
+def _count_indexed_items(paths: list[str], indexes_root: str | None) -> tuple[int | None, list[str]]:
+    """Sum the item counts recorded by the ``.idx`` files of ``paths``.
+
+    Every path is expanded first. An index contributes ``size // 8 - 1`` items
+    when its size is at least 8 bytes and a multiple of 8; otherwise it is
+    listed as missing.
+
+    Returns:
+        ``(total, missing)``, where ``total`` is ``None`` if no index could be
+        counted and ``missing`` names the unusable index files.
+    """
+    counted = 0
+    unusable: list[str] = []
+    saw_valid = False
+    for pattern in paths:
+        for shard in _expand_op_path(pattern):
+            index_file = _index_path_for(shard, indexes_root)
+            if index_file is None:
+                unusable.append(f"{shard}.idx")
+                continue
+            nbytes = _indexed_file_size(index_file)
+            if nbytes is None or nbytes < 8 or nbytes % 8 != 0:
+                unusable.append(str(index_file))
+                continue
+            counted += nbytes // 8 - 1
+            saw_valid = True
+    return (counted if saw_valid else None), unusable
+
+
+def _resolve_ref(ref: str, *, data_blend_dir: str | None, current_dir: Path | None) -> Path:
+    """Turn a config reference string into a filesystem path.
+
+    ``${data_blend_dir}`` is substituted when ``data_blend_dir`` is given,
+    then environment variables are expanded. A relative result is anchored at
+    ``current_dir``, or at the working directory when that is ``None``.
+    """
+    substituted = ref.replace("${data_blend_dir}", data_blend_dir) if data_blend_dir else ref
+    candidate = Path(os.path.expandvars(substituted))
+    if candidate.is_absolute():
+        return candidate
+    base = current_dir if current_dir is not None else Path.cwd()
+    return base / candidate
+
+
+def _looks_like_yaml_ref(value: Any) -> bool:
+    """Tell whether ``value`` is a string ending in ``.yaml`` or ``.yml``."""
+    if not isinstance(value, str):
+        return False
+    return value.endswith((".yaml", ".yml"))
+
+
+def _load_ref_if_yaml(value: Any, *, data_blend_dir: str | None, current_dir: Path | None) -> tuple[Any, Path | None]:
+    """Load ``value`` when it names a YAML file; otherwise pass it through.
+
+    Returns:
+        ``(content, directory)``: the loaded document together with the
+        directory of the file it came from, or ``value`` and ``current_dir``
+        unchanged when ``value`` is not a YAML reference.
+    """
+    if _looks_like_yaml_ref(value):
+        resolved = _resolve_ref(value, data_blend_dir=data_blend_dir, current_dir=current_dir)
+        return _load_yaml(resolved), resolved.parent
+    return value, current_dir
+
+
+def _source_path_groups_for_item(item: dict[str, Any]) -> list[list[str]]:
+    """List the non-empty path groups of a dataset entry, one group per path key.
+
+    Manifest-style keys come before the tarred-audio keys, except for entries
+    whose ``type`` contains ``nemo_tarred`` and that set a tarred-audio key:
+    there the tarred-audio groups come first.
+    """
+    tar_keys = ["tarred_audio_filepaths", "tarred_audio_filepath"]
+    other_keys = ["manifest_filepath", "cuts_path", "source_paths", "source_path", "shar_path"]
+    kind = str(item.get("type", ""))
+    if "nemo_tarred" in kind and (item.get("tarred_audio_filepaths") or item.get("tarred_audio_filepath")):
+        ordered_keys = tar_keys + other_keys
+    else:
+        ordered_keys = other_keys + tar_keys
+    return [paths for key in ordered_keys if (paths := _flatten_path_values(item.get(key)))]
+
+
+def _source_paths_for_item(item: dict[str, Any]) -> list[str]:
+    """Return the first path group of ``item``, or an empty list when it has none."""
+    for group in _source_path_groups_for_item(item):
+        return group
+    return []
+
+
+def _dataset_name(item: dict[str, Any], source_path: str | None, fallback_index: int) -> str:
+    """Build a display name for a dataset entry.
+
+    Scalar values of ``corpus``, ``language``, ``dataset``, ``name`` and
+    ``type`` are joined with ``/``. With none of them present the source path
+    is used, and failing that ``source-<fallback_index>``.
+    """
+    labels: list[str] = []
+    for field in ("corpus", "language", "dataset", "name", "type"):
+        candidate = item.get(field)
+        if candidate is None or isinstance(candidate, (dict, list)):
+            continue
+        labels.append(str(candidate))
+    if labels:
+        return "/".join(labels)
+    return source_path if source_path else f"source-{fallback_index}"
+
+
+def _temperature_list(train_ds: dict[str, Any]) -> list[float] | None:
+    """Read ``reweight_temperature`` from ``train_ds`` as a list of floats.
+
+    A single number is repeated 16 times, a list or tuple is converted
+    element-wise, and anything else (including a missing key or a boolean)
+    gives ``None``.
+    """
+    raw = train_ds.get("reweight_temperature")
+    if isinstance(raw, bool):
+        return None
+    if isinstance(raw, (int, float)):
+        return [float(raw)] * 16
+    if isinstance(raw, (list, tuple)):
+        return [float(entry) for entry in raw]
+    return None
+
+
+def _iter_children(obj: Any, path: str = "$") -> Iterable[tuple[str, Any]]:
+    """Yield ``(child_path, child)`` for each direct child of a dict or list.
+
+    Dict children are labelled ``<path>.<key>`` and list children
+    ``<path>[<index>]``; any other object yields nothing.
+    """
+    if isinstance(obj, dict):
+        yield from ((f"{path}.{key}", value) for key, value in obj.items())
+    elif isinstance(obj, list):
+        yield from ((f"{path}[{idx}]", value) for idx, value in enumerate(obj))
+
+
+def _find_stateful_payloads(obj: Any, path: str = "$") -> list[tuple[str, list[Any]]]:
+    """Search nested dicts and lists for list values stored under ``STATEFUL_KEY``.
+
+    Returns ``(path, payload)`` pairs: a dict's own payload comes first,
+    followed by whatever is found beneath each of its children in order.
+    """
+    hits: list[tuple[str, list[Any]]] = []
+    if isinstance(obj, dict):
+        payload = obj.get(STATEFUL_KEY)
+        if isinstance(payload, list):
+            hits.append((f"{path}.{STATEFUL_KEY}", payload))
+    if isinstance(obj, (dict, list)):
+        for child_path, child in _iter_children(obj, path):
+            hits += _find_stateful_payloads(child, child_path)
+    return hits
+
+
+def _find_sampler_states(obj: Any, path: str = "$") -> list[tuple[str, dict[str, Any]]]:
+    """Search nested dicts and lists for sampler state dicts.
+
+    A dict contributes its ``sampler_state`` value when that is a dict;
+    otherwise the dict itself is reported if it has both ``cuts_state`` and
+    ``diagnostics`` keys. The search continues into every child except the
+    ``sampler_state`` value itself.
+    """
+    hits: list[tuple[str, dict[str, Any]]] = []
+    if isinstance(obj, dict):
+        nested_state = obj.get("sampler_state")
+        if isinstance(nested_state, dict):
+            hits.append((f"{path}.sampler_state", nested_state))
+        elif "cuts_state" in obj and "diagnostics" in obj:
+            hits.append((path, obj))
+        for child_path, child in _iter_children(obj, path):
+            if child is not nested_state:
+                hits += _find_sampler_states(child, child_path)
+    elif isinstance(obj, list):
+        for child_path, child in _iter_children(obj, path):
+            hits += _find_sampler_states(child, child_path)
+    return hits
+
+
+def _worker_from_path(path: str) -> str | None:
+    """Extract the digits following the first ``worker``/``worker_``/``worker-`` in ``path``."""
+    found = re.search(r"worker[_-]?(\d+)", path)
+    return found.group(1) if found else None
+
+
+def _state_total_len(state: dict[str, Any]) -> int | None:
+    """Read the total item count recorded in an iterator state dict.
+
+    ``state["range"]["n"]`` is preferred. Otherwise the first of
+    ``total_len``, ``_total_len`` and ``n`` that holds a non-negative integer
+    is used.
+
+    Args:
+        state: State dict of a single iterator node.
+
+    Returns:
+        The recorded length (``0`` is a valid length), or ``None`` when no
+        key holds a non-negative integer.
+    """
+    range_state = state.get("range")
+    if isinstance(range_state, dict):
+        n = range_state.get("n")
+        if isinstance(n, int) and n >= 0:
+            return n
+    # Check each key explicitly: chaining the lookups with ``or`` would treat a
+    # recorded length of 0 as absent and fall through to ``None``.
+    for key in ("total_len", "_total_len", "n"):
+        n = state.get(key)
+        if isinstance(n, int) and n >= 0:
+            return int(n)
+    return None
+
+
+def _leaf_from_state(
+    source_index: int,
+    rank: int | None,
+    worker: str | None,
+    path: str,
+    node_type: str,
+    state: dict[str, Any],
+) -> LeafProgress | None:
+    """Build a ``LeafProgress`` from ``state`` when it looks like a position record.
+
+    ``state`` qualifies when it has every key in ``POSITION_KEYS`` and an
+    integer ``position``; otherwise ``None`` is returned. A non-integer
+    ``epoch`` is replaced by ``0``, and non-integer ``shard_id`` /
+    ``num_shards`` by ``None``.
+    """
+    if not POSITION_KEYS.issubset(state.keys()):
+        return None
+    position = state.get("position")
+    if not isinstance(position, int):
+        return None
+
+    def int_or_none(key: str) -> int | None:
+        candidate = state.get(key)
+        return candidate if isinstance(candidate, int) else None
+
+    raw_epoch = state.get("epoch", 0)
+    return LeafProgress(
+        source_index=source_index,
+        rank=rank,
+        worker=worker,
+        state_type=node_type,
+        epoch=raw_epoch if isinstance(raw_epoch, int) else 0,
+        position=position,
+        shard_id=int_or_none("shard_id"),
+        num_shards=int_or_none("num_shards"),
+        total_len=_state_total_len(state),
+        state_path=path,
+    )
+
+
+def _collect_leaf_states(tree: Any, *, rank: int | None, worker: str | None, path: str = "$") -> list[LeafProgress]:
+    """Walk a nested state tree depth-first and collect every node that parses as a leaf.
+
+    A node becomes a leaf when ``_leaf_from_state`` accepts either its
+    ``_state`` dict or the node dict itself. Each leaf's ``source_index`` is
+    the number of leaves collected before it.
+
+    Args:
+        tree: Nesting of dicts and lists; any other node type is ignored.
+        rank: Passed through to every ``_leaf_from_state`` call.
+        worker: Passed through to every ``_leaf_from_state`` call.
+        path: Path label of ``tree``; child labels are derived from it.
+
+    Returns:
+        The collected leaves in discovery order.
+    """
+    leaves: list[LeafProgress] = []
+
+    def walk(node: Any, node_path: str) -> None:
+        if isinstance(node, dict):
+            # Wrapper form: ``{"_type": ..., "_state": {...}}``.
+            node_type = str(node.get("_type", "state"))
+            state = node.get("_state")
+            if isinstance(state, dict):
+                leaf = _leaf_from_state(len(leaves), rank, worker, f"{node_path}._state", node_type, state)
+                if leaf is not None:
+                    leaves.append(leaf)
+                    return
+                # ``_state`` is not a leaf itself: descend into its wrapped source(s).
+                # NOTE(review): there is no ``return`` after this loop, so the generic
+                # child loop below reaches ``_state`` again and re-walks the same
+                # ``source``/``sources`` entries under identical paths, collecting each
+                # leaf twice. Presumably ``_deduplicate_progress`` removes the
+                # repeats downstream; confirm.
+                for key in ("source", "sources"):
+                    if key in state:
+                        walk(state[key], f"{node_path}._state.{key}")
+            # Bare form: the dict itself may hold the position keys.
+            leaf = _leaf_from_state(len(leaves), rank, worker, node_path, node_type, node)
+            if leaf is not None:
+                leaves.append(leaf)
+                return
+            for child_path, child in _iter_children(node, node_path):
+                walk(child, child_path)
+        elif isinstance(node, list):
+            for child_path, child in _iter_children(node, node_path):
+                walk(child, child_path)
+
+    walk(tree, path)
+    return leaves
+
+
+def _collect_leaves_from_sampler(
+    sampler_state: dict[str, Any], *, rank: int | None, worker: str | None, path: str
+) -> list[LeafProgress]:
+    """Collect leaf progress records from one sampler state dict.
+
+    Nested sampler lists (``samplers`` or ``bucket_samplers``) are searched
+    first. If they yield nothing, the sampler's own ``cuts_state`` is walked.
+    The returned leaves are renumbered so ``source_index`` runs from 0 in
+    list order.
+    """
+
+    def renumber(items: list[LeafProgress]) -> list[LeafProgress]:
+        for position, leaf in enumerate(items):
+            leaf.source_index = position
+        return items
+
+    nested = sampler_state.get("samplers") or sampler_state.get("bucket_samplers")
+    if isinstance(nested, list):
+        gathered: list[LeafProgress] = []
+        for idx, sub in enumerate(nested):
+            if not isinstance(sub, dict):
+                continue
+            gathered += _collect_leaves_from_sampler(
+                sub,
+                rank=rank,
+                worker=worker,
+                path=f"{path}.samplers[{idx}]",
+            )
+        if gathered:
+            return renumber(gathered)
+    cuts_state = sampler_state.get("cuts_state")
+    if cuts_state is None:
+        return []
+    return renumber(_collect_leaf_states(cuts_state, rank=rank, worker=worker, path=f"{path}.cuts_state"))
+
+
+def _deduplicate_progress(progress: list[LeafProgress]) -> tuple[list[LeafProgress], int]:
+    """Drop leaves that repeat an earlier ``(rank, worker, state_path, shard_id, num_shards)``.
+
+    Returns:
+        The first occurrence of each key in original order, and the number
+        of leaves that were dropped.
+    """
+    first_by_key: dict[tuple[Any, ...], LeafProgress] = {}
+    for leaf in progress:
+        identity = (leaf.rank, leaf.worker, leaf.state_path, leaf.shard_id, leaf.num_shards)
+        first_by_key.setdefault(identity, leaf)
+    unique = list(first_by_key.values())
+    return unique, len(progress) - len(unique)
+
+
+def _shard_len(total_len: int, shard_id: int | None, num_shards: int | None) -> int | None:
+    """Count the indices in ``range(total_len)`` taken by one shard of a strided split.
+
+    The shard takes indices ``shard_id, shard_id + num_shards, ...``. Returns
+    ``None`` when the shard layout is unknown or ``num_shards`` is not positive.
+    """
+    if shard_id is None or num_shards is None or num_shards <= 0:
+        return None
+    remaining = total_len - shard_id
+    if remaining <= 0:
+        return 0
+    # Ceiling division on integers.
+    return -(-remaining // num_shards)
+
+
+def _consumed_items(leaf: LeafProgress, total_len: int | None) -> int | None:
+    """Estimate how many items ``leaf`` has consumed across all of its epochs.
+
+    The leaf's own ``total_len`` takes precedence over the ``total_len``
+    argument. When the per-shard length cannot be determined, only a leaf in
+    epoch 0 has a known count (its ``position``).
+    """
+    total = leaf.total_len if leaf.total_len is not None else total_len
+    per_shard = None if total is None else _shard_len(total, leaf.shard_id, leaf.num_shards)
+    if per_shard is None:
+        return leaf.position if leaf.epoch == 0 else None
+    return leaf.epoch * per_shard + leaf.position
+
+
+def _eval_step_candidates(path: Path) -> list[Path]:
+    """Propose checkpoint files for a directory whose name matches ``EVAL_STEP_PATTERN``.
+
+    The step captured from the directory name is looked up under the sibling
+    ``checkpoints`` directory in four naming variants. A name that does not
+    match gives an empty list.
+    """
+    matched = EVAL_STEP_PATTERN.fullmatch(path.name)
+    if not matched:
+        return []
+    step = matched.group(1)
+    root = path.parent / "checkpoints"
+    names = (
+        f"step={step}.ckpt",
+        f"step={step}-last.ckpt",
+        f"step-{step}.ckpt",
+        f"step-{step}-last.ckpt",
+    )
+    return [root / name for name in names]
+
+
+def _checkpoint_metadata_candidates(path: Path) -> list[Path]:
+    """List, in priority order, the files that may hold checkpoint metadata for ``path``.
+
+    A directory contributes its eval-step checkpoint guesses, ``meta.pt`` and
+    ``checkpoint/meta.pt``, its ``*.ckpt`` files and every ``meta.pt`` below
+    it; anything else contributes itself. Candidates that are directories are
+    preceded by their own ``meta.pt`` locations. Duplicates are removed,
+    keeping the first occurrence.
+    """
+    if path.is_dir():
+        seeds = [
+            *_eval_step_candidates(path),
+            path / "meta.pt",
+            path / "checkpoint" / "meta.pt",
+            *sorted(path.glob("*.ckpt")),
+            *sorted(path.glob("**/meta.pt")),
+        ]
+    else:
+        seeds = [path]
+    # A dict keyed by the path text keeps the first occurrence in insertion order.
+    ordered: dict[str, Path] = {}
+    for seed in seeds:
+        if seed.is_dir():
+            for nested in (seed / "meta.pt", seed / "checkpoint" / "meta.pt"):
+                ordered.setdefault(str(nested), nested)
+        ordered.setdefault(str(seed), seed)
+    return list(ordered.values())
+
+
+def _torch_load(path: Path) -> Any:
+    """Deserialize ``path`` with ``torch.load``, mapping tensors to the CPU.
+
+    ``torch`` is imported inside the function, so the import only happens when
+    a checkpoint is actually loaded.
+
+    NOTE(security): ``weights_only=False`` lets ``torch.load`` unpickle
+    arbitrary objects, which can execute code embedded in the file. Only call
+    this on checkpoints from a trusted source.
+    """
+    import torch
+
+    try:
+        return torch.load(path, map_location="cpu", weights_only=False)
+    except TypeError:
+        # Retry without the keyword, presumably for torch versions whose
+        # ``load`` does not accept ``weights_only`` (confirm).
+        return torch.load(path, map_location="cpu")
+
+
+def _auto_config_candidates(checkpoint: Path) -> list[Path]:
+    """Guess where the training config belonging to ``checkpoint`` may live.
+
+    Four file names are tried in ``checkpoint`` itself, plus its parent when
+    it is a file, or its parent and grandparent when it is a directory. Roots
+    that are files are skipped and duplicates removed, keeping the first
+    occurrence.
+    """
+    search_roots = [checkpoint]
+    if checkpoint.is_file():
+        search_roots.append(checkpoint.parent)
+    if checkpoint.is_dir():
+        search_roots += [checkpoint.parent, checkpoint.parent.parent]
+    filenames = ("exp_config.yaml", "config.yaml", "hparams.yaml", "config.json")
+    ordered: dict[str, Path] = {}
+    for root in search_roots:
+        if root.is_file():
+            continue
+        for filename in filenames:
+            candidate = root / filename
+            ordered.setdefault(str(candidate), candidate)
+    return list(ordered.values())
+
+
+def _fmt_pct(value: float | None) -> str:
+    """Render ``value`` with two decimals and a ``%`` suffix; ``None`` renders as ``""``."""
+    if value is None:
+        return ""
+    return f"{value:.2f}%"
+
+
+def _fmt_float(value: float | None) -> str:
+    """Render ``value`` with up to six significant digits; ``None`` renders as ``""``."""
+    if value is None:
+        return ""
+    return f"{value:.6g}"
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/scripts/dataloading/build_indexes.py b/scripts/dataloading/build_indexes.py
new file mode 100644
index 000000000000..b46c3bc5342d
--- /dev/null
+++ b/scripts/dataloading/build_indexes.py
@@ -0,0 +1,473 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Build O(1)-restore index sidecars for an arbitrary NeMo Lhotse ``input_cfg``.
+
+Walks a NeMo dataloading config (``input_cfg`` YAML, including nested ``group``
+entries and per-entry YAML references), discovers every JSONL/tar file an
+indexed dataloader will need, and creates the corresponding ``.idx`` sidecars
+next to each data file.
+
+Two tar layouts are dispatched correctly:
+
+* NeMo tarred audio (one regular member per sample, name-keyed) — uses
+  ``nemo.collections.common.data.lhotse.indexed_adapters.create_tar_index``
+  which records one offset per *basename group*.
+* WebDataset/Shar tars (json + payload pairs) — uses
+  ``lhotse.indexing.create_tar_index`` which records one offset per *member
+  pair*.
+
+Local files and remote URIs are both supported via lhotse's ``open_best``
+(which routes to ``smart_open`` / AIStore SDK when available). By default the
+``.idx`` is written next to its source path, so the storage backend must accept
+writes at that location — for read-only object stores, pass ``--indexes-root``
+to write the indexes to a mirror tree instead, materialize the data locally
+first, or pre-build indexes at upload time.
+
+Examples::
+
+    # Build indexes for everything referenced by an input_cfg.yaml.
+    python scripts/dataloading/build_indexes.py path/to/input_cfg.yaml
+
+    # Multiple configs at once.
+    python scripts/dataloading/build_indexes.py train.yaml validation.yaml
+
+    # Show what would be built without writing anything.
+    python scripts/dataloading/build_indexes.py --dry-run path/to/input_cfg.yaml
+
+    # Rebuild even when an .idx already exists; parallelize across 16 workers.
+    python scripts/dataloading/build_indexes.py --force --workers 16 path/to/input_cfg.yaml
+"""
+
+import json
+import logging
+import sys
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional
+
+import click
+from lhotse.indexing import index_file_path
+from omegaconf import DictConfig, ListConfig, OmegaConf
+
+from nemo.collections.common.data.lhotse.indexed_adapters import create_tar_index as create_nemo_tar_index
+from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths
+
+# --------------------------------------------------------------------------- #
+# Tar layout taxonomy.
+# --------------------------------------------------------------------------- #
+# NEMO_TAR  — one regular member per sample, indexed by basename. Used by
+#             nemo / nemo_tarred / multimodal_conversation / share_gpt audio
+#             tars (read via IndexedTarMemberReader).
+# WDS_TAR   — WebDataset-style: each sample is a pair of consecutive members
+#             (e.g. {N}.json + {N}.<audio>). Used by lhotse_shar tars and
+#             share_gpt_webdataset tars (read via IndexedTarSampleReader).
+NEMO_TAR = "nemo_tar"
+WDS_TAR = "wds_tar"
+JSONL = "jsonl"
+
+
+@dataclass(frozen=True)
+class IndexJob:
+    """One data file that needs an index, together with the kind of index to build."""
+
+    path: str  # data file to index
+    kind: str  # one of {JSONL, NEMO_TAR, WDS_TAR}
+    indexes_root: Optional[str] = None  # forwarded to ``index_file_path``
+
+    def idx_path(self):
+        """Return ``index_file_path(self.path, self.indexes_root)``, the location of this job's index."""
+        return index_file_path(self.path, self.indexes_root)
+
+
+# --------------------------------------------------------------------------- #
+# Path discovery.
+# --------------------------------------------------------------------------- #
+
+
+def _as_list(val) -> list:
+    """Wrap ``val`` in a list: ``None`` gives ``[]``, sequences are copied, scalars become one-element lists."""
+    if val is None:
+        return []
+    return list(val) if isinstance(val, (list, tuple, ListConfig)) else [val]
+
+
+def _flatten_path_spec(spec) -> list[str]:
+    """
+    NeMo's manifest_filepath / tarred_audio_filepaths accept several layouts:
+      str, list[str], list[list[str]], list[tuple[str, weight]], ...
+    Flatten any of those into a list of plain string paths.
+
+    A nested sequence that starts with a path is read as ``[path]`` or
+    ``[path, weight]`` and contributes only that path; one that starts with
+    another sequence is flattened recursively. Empty nested sequences and
+    items of any other type contribute nothing.
+    """
+    out: list[str] = []
+    for item in _as_list(spec):
+        if isinstance(item, (str, Path)):
+            out.append(str(item))
+        elif isinstance(item, (list, tuple, ListConfig)):
+            if len(item) == 0:
+                # An empty entry names no path; indexing it would raise IndexError.
+                continue
+            # [path] or [path, weight] or [[path], [path], ...]
+            head = item[0]
+            if isinstance(head, (str, Path)):
+                out.append(str(head))
+            else:
+                out.extend(_flatten_path_spec(item))
+    return out
+
+
+def _expand_jsonl(spec) -> list[str]:
+    """Flatten a manifest path spec and expand every sharded pattern in it."""
+    expanded: list[str] = []
+    for raw in _flatten_path_spec(spec):
+        expanded.extend(expand_sharded_filepaths(raw))
+    return expanded
+
+
+def _expand_tars(spec) -> list[str]:
+    """Flatten a tar path spec and expand every sharded pattern in it."""
+    expanded: list[str] = []
+    for raw in _flatten_path_spec(spec):
+        expanded.extend(expand_sharded_filepaths(raw))
+    return expanded
+
+
+def _resolve_input_cfg(val) -> ListConfig | None:
+    """Return an ``input_cfg`` value as a config object.
+
+    An inline list is returned as-is, a string or ``Path`` is loaded with
+    ``OmegaConf.load``, and any other value gives ``None``.
+    """
+    if isinstance(val, (list, ListConfig)):
+        return val
+    if not isinstance(val, (str, Path)):
+        return None
+    return OmegaConf.load(str(val))
+
+
+# Types that don't read any data themselves — they delegate to
+# ``read_cutset_from_config(config)`` and accept *any* underlying source's keys
+# (``cuts_path``, ``shar_path``, ``manifest_filepath`` [+ ``tarred_audio_filepaths``],
+# nested ``input_cfg``, …). Treat them as transparent passthroughs.
+_TRANSFORM_TYPES = frozenset(
+    {
+        "lhotse_as_conversation",
+        "sqa_as_conversation",
+        "s2s_as_conversation",
+        "s2s_duplex_overlap_as_s2s_duplex",
+        "s2s_duplex_reverse_role",
+        "lhotse_magpietts_data_as_continuation",
+        "nemo_tarred_to_duplex",
+    }
+)
+
+# Types that index nothing on their own.
+_NO_INDEX_TYPES = frozenset({"txt", "txt_pair", "parquet", "multi_speaker_simulator"})
+
+
+def _discover_keys(entry, jobs: list[IndexJob], indexes_root: Optional[str]) -> None:
+    """
+    Emit IndexJobs for whichever source keys ``entry`` carries, ignoring its
+    ``type``: ``cuts_path``, ``shar_path``, ``manifest_filepath`` (together
+    with ``tarred_audio_filepaths``), ``paths`` and a nested ``input_cfg``.
+    An ``indexes_root`` set on the entry replaces the inherited value.
+    """
+    root = entry.get("indexes_root", indexes_root)
+
+    cuts_path = entry.get("cuts_path")
+    if cuts_path is not None:
+        jobs.extend(IndexJob(p, JSONL, root) for p in _expand_jsonl(cuts_path))
+
+    shar_path = entry.get("shar_path")
+    if shar_path is not None:
+        _discover_shar(shar_path, jobs, root)
+
+    manifest = entry.get("manifest_filepath")
+    if manifest is not None:
+        jobs.extend(IndexJob(p, JSONL, root) for p in _expand_jsonl(manifest))
+        # Tar shards are only considered alongside a manifest.
+        jobs.extend(IndexJob(p, NEMO_TAR, root) for p in _expand_tars(entry.get("tarred_audio_filepaths")))
+
+    paths = entry.get("paths")
+    if paths is not None:
+        _discover_paths(paths, jobs, root)
+
+    nested_cfg = _resolve_input_cfg(entry.get("input_cfg"))
+    if nested_cfg is not None:
+        discover(nested_cfg, jobs, root)
+
+
+def _discover_paths(paths, jobs: list[IndexJob], indexes_root: Optional[str]) -> None:
+    """Emit IndexJobs for a ``paths`` spec.
+
+    A directory contributes every ``*.tar`` below it as NEMO_TAR, a ``.tar``
+    path is a NEMO_TAR job, and any other path is a JSONL job.
+    """
+    for raw in _expand_jsonl(paths):
+        candidate = Path(raw)
+        if candidate.is_dir():
+            jobs.extend(IndexJob(str(tar), NEMO_TAR, indexes_root) for tar in sorted(candidate.rglob("*.tar")))
+            continue
+        kind = NEMO_TAR if candidate.suffix == ".tar" else JSONL
+        jobs.append(IndexJob(raw, kind, indexes_root))
+
+
+def _discover_share_gpt_webdataset(data_dir, jobs: list[IndexJob], indexes_root: Optional[str]) -> None:
+    """
+    Emit WDS_TAR jobs for the shards of a ShareGPT webdataset directory.
+
+    Follows NeMoMultimodalConversationShareGPTWebdatasetAdapter shard
+    discovery: shards listed in ``wids-meta.json`` are used when that file
+    exists; otherwise ``data_dir`` is scanned recursively for ``*.tar``, since
+    Energon exports commonly nest shards (e.g.
+    ``0/sharded_manifests/shard-0.tar``). Root-level ``*.jsonl`` files are
+    indexed as well, without recursing.
+    """
+    if data_dir is None:
+        return
+
+    for raw in _flatten_path_spec(data_dir):
+        root = Path(raw)
+        meta_path = root / "wids-meta.json"
+        if meta_path.is_file():
+            with open(meta_path) as f:
+                meta = json.load(f)
+            urls = (shard.get("url") for shard in meta.get("shardlist", []) if isinstance(shard, dict))
+            jobs.extend(IndexJob(str(root / url), WDS_TAR, indexes_root) for url in urls if url)
+        elif root.is_dir():
+            jobs.extend(IndexJob(str(tar), WDS_TAR, indexes_root) for tar in sorted(root.rglob("*.tar")))
+
+        # Optional sidecar manifests: root level only, so unrelated nested
+        # metadata files are not indexed.
+        if root.is_dir():
+            jobs.extend(IndexJob(str(manifest), JSONL, indexes_root) for manifest in sorted(root.glob("*.jsonl")))
+
+
+def discover(entry, jobs: list[IndexJob], indexes_root: Optional[str] = None) -> None:
+    """Append to ``jobs`` every IndexJob required by one ``input_cfg`` node.
+
+    Lists are walked element by element and non-mapping values are ignored.
+    A mapping is dispatched on its ``type``; one without a ``type`` is treated
+    as a wrapper whose values are all walked.
+    """
+    if isinstance(entry, (list, ListConfig)):
+        for child in entry:
+            discover(child, jobs, indexes_root)
+        return
+    if not isinstance(entry, (dict, DictConfig)):
+        return
+
+    # An entry may carry its own ``indexes_root``; it replaces the inherited one.
+    root = entry.get("indexes_root", indexes_root)
+    typ = entry.get("type")
+
+    if typ is None:
+        # Wrapper mapping such as ``input_cfg: [...]``.
+        for value in entry.values():
+            discover(value, jobs, root)
+    elif typ in _NO_INDEX_TYPES:
+        pass
+    elif typ == "group" or typ in _TRANSFORM_TYPES:
+        # Groups and transform passthroughs are dispatched by the keys they carry.
+        _discover_keys(entry, jobs, root)
+    elif typ in ("nemo", "nemo_tarred", "multimodal_conversation", "share_gpt"):
+        jobs.extend(IndexJob(p, JSONL, root) for p in _expand_jsonl(entry.get("manifest_filepath")))
+        jobs.extend(IndexJob(p, NEMO_TAR, root) for p in _expand_tars(entry.get("tarred_audio_filepaths")))
+    elif typ == "share_gpt_webdataset":
+        # Layout: data_dir/wids-meta.json or recursive **/*.tar.
+        _discover_share_gpt_webdataset(entry.get("data_dir"), jobs, root)
+    elif typ == "lhotse":
+        cuts_path = entry.get("cuts_path")
+        if cuts_path is not None:
+            jobs.extend(IndexJob(p, JSONL, root) for p in _expand_jsonl(cuts_path))
+        shar_path = entry.get("shar_path")
+        if shar_path is not None:
+            _discover_shar(shar_path, jobs, root)
+    elif typ == "lhotse_shar":
+        _discover_shar(entry.get("shar_path"), jobs, root)
+    elif typ in ("txt_jsonl", "nemotron_text_converation"):
+        _discover_paths(entry.get("paths"), jobs, root)
+    # Any other type contributes no jobs.
+
+
+def _discover_shar(shar_path, jobs: list[IndexJob], indexes_root: Optional[str]) -> None:
+    """Emit JSONL / WDS_TAR jobs for the uncompressed shards of a Shar spec.
+
+    ``shar_path`` may be a directory, a list of directories (optionally as
+    ``[path, weight]`` pairs), or a ``{field: [shard, ...]}`` mapping of
+    explicit shard paths. Directory entries that are not existing directories
+    are skipped.
+    """
+    if shar_path is None:
+        return
+
+    if isinstance(shar_path, (dict, DictConfig)):
+        # Explicit layout: index every shard listed under every field.
+        for field_spec in shar_path.values():
+            for raw in _flatten_path_spec(field_spec):
+                for shard in expand_sharded_filepaths(raw):
+                    if shard.endswith(".jsonl"):
+                        jobs.append(IndexJob(shard, JSONL, indexes_root))
+                    elif shard.endswith(".tar"):
+                        jobs.append(IndexJob(shard, WDS_TAR, indexes_root))
+        return
+
+    if isinstance(shar_path, (str, Path)):
+        directories = [shar_path]
+    elif isinstance(shar_path, (list, ListConfig)):
+        directories = []
+        for item in shar_path:
+            if isinstance(item, (str, Path)):
+                directories.append(item)
+            elif isinstance(item, (list, tuple, ListConfig)) and item:
+                directories.append(item[0])  # [path, weight] form
+    else:
+        return
+
+    for directory in (Path(str(raw)) for raw in directories):
+        if not directory.is_dir():
+            continue
+        for shard in sorted(directory.iterdir()):
+            if shard.suffix == ".jsonl":
+                jobs.append(IndexJob(str(shard), JSONL, indexes_root))
+            elif shard.suffix == ".tar":
+                jobs.append(IndexJob(str(shard), WDS_TAR, indexes_root))
+
+
+# --------------------------------------------------------------------------- #
+# Index builders.
+# --------------------------------------------------------------------------- #
+
+
+def _build_one(job: IndexJob) -> tuple[IndexJob, str]:
+    """Build the index for ``job`` with the indexer matching its kind.
+
+    Returns ``(job, "built")``. Raises ``ValueError`` for an unknown kind.
+    """
+    from lhotse.indexing import create_jsonl_index
+    from lhotse.indexing import create_tar_index as create_wds_tar_index
+
+    target = job.idx_path()
+    remote_prefixes = ("ais://", "s3://", "http://", "https://", "gs://")
+    if not str(target).startswith(remote_prefixes):
+        # Mirrored layouts may point at directories that do not exist yet.
+        Path(target).parent.mkdir(parents=True, exist_ok=True)
+
+    kind = job.kind
+    if kind == NEMO_TAR:
+        # NeMo's create_tar_index takes (tar_path, idx_path) positionally.
+        create_nemo_tar_index(job.path, target)
+    elif kind == WDS_TAR:
+        create_wds_tar_index(job.path, output_path=target)
+    elif kind == JSONL:
+        create_jsonl_index(job.path, output_path=target)
+    else:
+        raise ValueError(f"Unknown index kind: {job.kind!r}")
+    return job, "built"
+
+
+def _is_indexed(job: IndexJob) -> bool:
+    """Report whether *job*'s ``.idx`` path is an existing, non-empty file on the local filesystem."""
+    sidecar = Path(job.idx_path())
+    try:
+        return sidecar.stat().st_size > 0 if sidecar.is_file() else False
+    except OSError:
+        return False
+
+
+# --------------------------------------------------------------------------- #
+# CLI.
+# --------------------------------------------------------------------------- #
+
+
+@click.command(context_settings={"show_default": True})
+@click.argument("input_cfgs", type=click.Path(exists=True, dir_okay=False), nargs=-1, required=True)
+@click.option("--force", is_flag=True, help="Rebuild .idx files even if they already exist.")
+@click.option("--workers", type=int, default=4, help="Number of parallel index builders.")
+@click.option("--dry-run", is_flag=True, help="List the jobs without writing anything.")
+@click.option(
+    "--executor",
+    type=click.Choice(["process", "thread"]),
+    default="process",
+    help=(
+        "Worker pool kind. ``process`` (default) gives true CPU-level parallelism by "
+        "running each indexer in its own interpreter — required for tar indexing where "
+        "tarfile.next() and the read-and-discard for data members hold the GIL and "
+        "would otherwise serialize all workers onto one core. ``thread`` is useful for "
+        "debugging or when indexing only JSONLs over a slow network."
+    ),
+)
+@click.option(
+    "--indexes-root",
+    type=str,
+    default=None,
+    help=(
+        "Write .idx sidecars to a mirror under this root (preserving the data files' "
+        "directory structure) instead of next to each data file. CLI value overrides "
+        "any 'indexes_root' present in the YAML."
+    ),
+)
+def main(
+    input_cfgs: tuple[str, ...],
+    force: bool,
+    workers: int,
+    dry_run: bool,
+    executor: str,
+    indexes_root: Optional[str],
+):
+    """
+    Build .idx sidecars for every JSONL/tar referenced by INPUT_CFGS.
+
+    INPUT_CFGS are NeMo Lhotse dataloading configs (``input_cfg`` YAML).
+    """
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    jobs: list[IndexJob] = []
+    for cfg_path in input_cfgs:
+        cfg = OmegaConf.load(cfg_path)
+        discover(cfg, jobs, indexes_root=indexes_root)  # appends the discovered jobs to ``jobs``
+
+    # Deduplicate while preserving order.
+    seen: set[tuple[str, str, Optional[str]]] = set()
+    unique: list[IndexJob] = []
+    for j in jobs:
+        key = (j.path, j.kind, j.indexes_root)
+        if key not in seen:
+            seen.add(key)
+            unique.append(j)
+
+    todo = unique if force else [j for j in unique if not _is_indexed(j)]  # --force rebuilds everything
+    skipped = len(unique) - len(todo)
+
+    logging.info("Discovered %d files (%d already indexed, %d to build).", len(unique), skipped, len(todo))
+
+    if dry_run or not todo:  # a dry run lists the pending jobs; an empty todo lists nothing
+        for j in todo:
+            logging.info("  [%s] %s -> %s", j.kind, j.path, j.idx_path())
+        return
+
+    # Per-file success logging is suppressed: building 80k-400k indexes would
+    # otherwise emit one log line per file, swamping the SLURM stdout buffer.
+    # Failures are logged inline as they happen; progress is a periodic
+    # "built <done>/<total>" heartbeat (every 5% of total or 5000 files,
+    # whichever is smaller, and always after the last job) plus a failure recap.
+    failures: list[tuple[IndexJob, Exception]] = []
+    total = len(todo)
+    log_every = max(1, min(5000, total // 20))
+    pool_cls = ProcessPoolExecutor if executor == "process" else ThreadPoolExecutor
+    with pool_cls(max_workers=max(1, workers)) as ex:
+        futures = {ex.submit(_build_one, j): j for j in todo}
+        done = 0  # counts completed jobs, successful or not
+        for fut in as_completed(futures):
+            done += 1
+            j = futures[fut]
+            try:
+                _, _status = fut.result()
+            except Exception as e:  # surface worker failures but let interrupts/system exits propagate
+                failures.append((j, e))
+                logging.error("  [FAIL] %s %s: %s", j.kind, j.path, e)
+                # Deliberately no ``continue``: the heartbeat check below must also run for failed jobs.
+            if done % log_every == 0 or done == total:
+                logging.info(
+                    "  built %d/%d (%.1f%%)  failures=%d",
+                    done,
+                    total,
+                    100.0 * done / total,
+                    len(failures),
+                )
+
+    if failures:
+        logging.error("\n%d index build(s) failed:", len(failures))
+        for j, e in failures:
+            logging.error("  %s (%s): %s", j.path, j.kind, e)
+        sys.exit(1)  # non-zero exit status when any index failed to build
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/dataloading/prefetch_indexes.py b/scripts/dataloading/prefetch_indexes.py
new file mode 100644
index 000000000000..694b695b923d
--- /dev/null
+++ b/scripts/dataloading/prefetch_indexes.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Copy existing ``.idx`` sidecars from their source locations into a local
+mirrored ``indexes_root``.
+
+Use this when your data lives on shared storage (NFS, S3, AIStore) and you
+want a local-disk copy of the indexes for fast random access during
+training, without ever touching the data files themselves.
+
+The script walks an arbitrary NeMo Lhotse ``input_cfg`` YAML (same machinery
+as ``build_indexes.py``), enumerates every ``.idx`` file the dataloader will
+need, and downloads each into ``<indexes_root>/<rel-path>.idx`` preserving
+the data files' directory structure. Source paths are read via lhotse's
+``open_best``, which routes ``ais://``, ``s3://``, ``http(s)://``, and local
+paths to the correct backend.
+
+Examples::
+
+    # Local data, mirror indexes onto a fast local SSD.
+    python scripts/dataloading/prefetch_indexes.py \\
+        --indexes-root /scratch/idx \\
+        path/to/input_cfg.yaml
+
+    # Indexes live next to data on AIStore; pull them down.
+    AIS_ENDPOINT=http://aistore.example.com \\
+        python scripts/dataloading/prefetch_indexes.py \\
+            --indexes-root /scratch/idx \\
+            path/to/input_cfg.yaml
+
+    # Skip files that are already in the mirror; re-run safely.
+    python scripts/dataloading/prefetch_indexes.py \\
+        --indexes-root /scratch/idx --workers 16 train.yaml validation.yaml
+
+After prefetch, point your training config at the mirror via the top-level
+``indexes_root: /scratch/idx`` option (no per-source changes required).
+"""
+
+import logging
+import shutil
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from contextlib import suppress
+from pathlib import Path
+from typing import Optional
+
+import click
+from lhotse.indexing import index_file_path
+from omegaconf import OmegaConf
+
+# Reuse the discovery + IndexJob machinery from build_indexes.py.
+sys.path.insert(0, str(Path(__file__).parent))
+from build_indexes import IndexJob, discover  # type: ignore[import-not-found]
+
+
+def _copy_idx(src: str, dst: str) -> None:
+    """Fetch one ``.idx`` from *src* (local path or URL) and store it at the local path *dst*.
+
+    *src* is opened via lhotse's ``open_best``; missing parent directories of *dst* are created.
+    """
+    from lhotse.serialization import open_best
+
+    Path(dst).parent.mkdir(parents=True, exist_ok=True)
+    # Write into a sibling staging file and rename it over *dst* at the end,
+    # so an interrupted copy never leaves a truncated .idx at the final path.
+    staging = Path(f"{dst}.tmp.{Path(dst).name}.partial")
+    try:
+        with open_best(src, "rb") as reader, open(staging, "wb") as writer:
+            shutil.copyfileobj(reader, writer, length=8 * 1024 * 1024)
+        staging.replace(dst)
+    finally:
+        # After a successful rename the staging file no longer exists; otherwise drop the leftover.
+        with suppress(FileNotFoundError):
+            staging.unlink()
+
+
+def _is_present(local_idx: str) -> bool:
+    """Return True when *local_idx* names an existing regular file of non-zero size."""
+    try:
+        return Path(local_idx).is_file() and Path(local_idx).stat().st_size > 0
+    except OSError:
+        return False
+
+
+@click.command(context_settings={"show_default": True})
+@click.argument("input_cfgs", type=click.Path(exists=True, dir_okay=False), nargs=-1, required=True)
+@click.option(
+    "--indexes-root",
+    type=str,
+    required=True,
+    help="Local directory the .idx mirror is written to. The data files' directory structure is preserved underneath.",
+)
+@click.option(
+    "--source-indexes-root",
+    type=str,
+    default=None,
+    help=(
+        "If the source ``.idx`` files do not live next to the data (e.g. they "
+        "are themselves under another mirror — possibly remote), set this to "
+        "that root. Defaults to ``None`` meaning sidecars are read from "
+        "next to each data file."
+    ),
+)
+@click.option("--force", is_flag=True, help="Re-download even when a non-empty mirrored .idx already exists.")
+@click.option("--workers", type=int, default=8, help="Number of parallel copies.")
+@click.option("--dry-run", is_flag=True, help="List the (src, dst) pairs without copying anything.")
+def main(
+    input_cfgs: tuple[str, ...],
+    indexes_root: str,
+    source_indexes_root: Optional[str],
+    force: bool,
+    workers: int,
+    dry_run: bool,
+):
+    """
+    Prefetch .idx sidecars referenced by INPUT_CFGS into a local mirror.
+
+    INPUT_CFGS are NeMo Lhotse dataloading configs (``input_cfg`` YAML).
+    """
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    jobs: list[IndexJob] = []
+    for cfg_path in input_cfgs:
+        cfg = OmegaConf.load(cfg_path)
+        # Walk with no inherited indexes_root — we want the *natural* data paths,
+        # then we compute (source, destination) idx paths ourselves below.
+        discover(cfg, jobs, indexes_root=None)
+
+    # Deduplicate by (data_path, kind).
+    seen: set[tuple[str, str]] = set()
+    unique: list[IndexJob] = []
+    for j in jobs:
+        key = (j.path, j.kind)
+        if key not in seen:
+            seen.add(key)
+            unique.append(j)
+
+    pairs: list[tuple[str, str]] = []
+    for j in unique:
+        src = index_file_path(j.path, source_indexes_root)  # where the sidecar is read from
+        dst = index_file_path(j.path, indexes_root)  # where the mirrored copy is written
+        pairs.append((src, dst))
+
+    todo = pairs if force else [(s, d) for (s, d) in pairs if not _is_present(d)]  # --force re-copies everything
+    skipped = len(pairs) - len(todo)
+    logging.info(
+        "Discovered %d sidecars (%d already present locally, %d to copy).",
+        len(pairs),
+        skipped,
+        len(todo),
+    )
+
+    if dry_run or not todo:  # a dry run lists the pending pairs; an empty todo lists nothing
+        for s, d in todo:
+            logging.info("  %s  ->  %s", s, d)
+        return
+
+    # Per-file success logging is suppressed (80k-400k sidecars would swamp
+    # stdout); failures are logged inline, progress is a periodic "copied
+    # <done>/<total>" heartbeat (always after the last copy) plus a failure recap.
+    failures: list[tuple[str, str, Exception]] = []
+    total = len(todo)
+    log_every = max(1, min(5000, total // 20))
+    with ThreadPoolExecutor(max_workers=max(1, workers)) as ex:
+        futures = {ex.submit(_copy_idx, s, d): (s, d) for (s, d) in todo}
+        done = 0  # counts completed copies, successful or not
+        for fut in as_completed(futures):
+            done += 1
+            s, d = futures[fut]
+            try:
+                fut.result()
+            except Exception as e:
+                failures.append((s, d, e))
+                logging.error("  [FAIL] %s  ->  %s: %s", s, d, e)
+                # Deliberately no ``continue``: the heartbeat check below must also run for failed copies.
+            if done % log_every == 0 or done == total:
+                logging.info(
+                    "  copied %d/%d (%.1f%%)  failures=%d",
+                    done,
+                    total,
+                    100.0 * done / total,
+                    len(failures),
+                )
+
+    if failures:
+        logging.error("\n%d copy operation(s) failed:", len(failures))
+        for s, d, e in failures:
+            logging.error("  %s -> %s: %s", s, d, e)
+        sys.exit(1)  # non-zero exit status when any copy failed
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/dataloading/validate_dataloader.py b/scripts/dataloading/validate_dataloader.py
new file mode 100644
index 000000000000..4fdfaf14bcc6
--- /dev/null
+++ b/scripts/dataloading/validate_dataloader.py
@@ -0,0 +1,348 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Validate a Lhotse + indexed dataloader config end-to-end.
+
+Per-rank entry point launched under torchrun. Builds the **exact** dataloader
+the SALM training builds (via ``get_lhotse_dataloader_from_config``) on top
+of a no-op ``CutIdDataset`` and dumps per-batch cut.id JSONL. Phase-aware:
+
+* ``baseline`` — iterate ``--steps`` batches from a fresh dataloader; at
+  ``--checkpoint-at`` save ``dl.state_dict()`` to ``state_rank_NNN.pt``.
+* ``resumed``  — load the saved state and iterate the rest; downstream
+  consolidation diffs the post-checkpoint window against the baseline tail.
+* ``groundtruth`` — single-rank, single-worker enumeration of every cut
+  the configured input_cfg yields under force_finite + metadata_only.
+
+Launch as a step in a multi-phase pipeline; downstream aggregator is
+``_validate_dataloader/consolidate.py``.
+
+Example::
+
+    torchrun --standalone --nnodes=1 --nproc-per-node=4 \\
+        scripts/dataloading/validate_dataloader.py \\
+        --config 0909-en-only-id2.yaml \\
+        --data-blend-dir /lustre/.../data_blends/ord \\
+        --output-dir validation_out \\
+        --phase baseline --run-idx 0 \\
+        --steps 200 --checkpoint-at 100
+"""
+
+import json
+import logging
+import os
+import statistics
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+import click
+import torch
+import torch.utils.data
+from omegaconf import OmegaConf
+
+# Local helpers — same directory.
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from _validate_dataloader.config_inject import inject_validator_flags  # noqa: E402
+from _validate_dataloader.cut_id_dataset import CutIdDataset  # noqa: E402
+
+LOG = logging.getLogger(__name__)
+
+
+PHASE_BASELINE = "baseline"
+PHASE_RESUMED = "resumed"
+PHASE_GROUNDTRUTH = "groundtruth"
+
+
+@click.command(help=__doc__)
+@click.option("--config", "config_path", required=True, type=click.Path(exists=True))
+@click.option("--data-blend-dir", default=None, help="Substituted into ${data_blend_dir} in the config.")
+@click.option("--section", default="train_ds", show_default=True)
+@click.option("--output-dir", required=True, type=click.Path())
+@click.option("--phase", type=click.Choice([PHASE_BASELINE, PHASE_RESUMED, PHASE_GROUNDTRUTH]), required=True)
+@click.option(
+    "--run-idx",
+    type=int,
+    default=0,
+    show_default=True,
+    help="Which determinism re-run this is. Only used with --phase=baseline.",
+)
+@click.option(
+    "--steps",
+    type=int,
+    default=200,
+    show_default=True,
+    help="Batches to iterate. Ignored in groundtruth phase (iterates until exhaustion).",
+)
+@click.option(
+    "--checkpoint-at",
+    type=int,
+    default=-1,
+    show_default=True,
+    help="Step index at which to save state in baseline phase. -1 = don't save.",
+)
+@click.option(
+    "--state-dir",
+    default=None,
+    type=click.Path(),
+    help="In --phase=resumed: directory containing state_rank_NNN.pt files.",
+)
+@click.option("--force-finite/--no-force-finite", default=True, show_default=True)
+@click.option("--metadata-only/--no-metadata-only", default=True, show_default=True)
+@click.option("--num-workers-override", type=int, default=None, help="Override config.{section}.num_workers.")
+@click.option(
+    "--mode",
+    type=click.Choice(["fast", "full"]),
+    default="fast",
+    show_default=True,
+    help="fast: CutIdDataset (default). full: stub-only in v1, raises.",
+)
+@click.option("-v", "--verbose", is_flag=True, default=False)
+def cli(
+    config_path: str,
+    data_blend_dir: Optional[str],
+    section: str,
+    output_dir: str,
+    phase: str,
+    run_idx: int,
+    steps: int,
+    checkpoint_at: int,
+    state_dir: Optional[str],
+    force_finite: bool,
+    metadata_only: bool,
+    num_workers_override: Optional[int],
+    mode: str,
+    verbose: bool,
+) -> None:
+    if mode == "full":
+        raise click.ClickException("--mode=full is not implemented in v1; use --mode=fast.")
+
+    rank = int(os.environ.get("RANK", "0"))  # falls back to a single-process layout when unset
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", str(rank)))
+    os.environ["RANK"] = str(rank)  # write the (possibly defaulted) values back into the environment
+    os.environ["WORLD_SIZE"] = str(world_size)
+    os.environ["LOCAL_RANK"] = str(local_rank)
+
+    logging.basicConfig(
+        level=logging.DEBUG if verbose else logging.INFO,
+        format=f"[rank{rank}/{world_size} %(asctime)s %(levelname)s] %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
+    if phase == PHASE_GROUNDTRUTH and world_size != 1:
+        raise click.ClickException(f"--phase=groundtruth requires nproc-per-node=1 (got world_size={world_size})")
+
+    cfg = OmegaConf.load(config_path)
+    if data_blend_dir is not None:
+        cfg.data_blend_dir = data_blend_dir  # must be set before resolve() so ${data_blend_dir} interpolates
+    OmegaConf.resolve(cfg)
+    section_cfg = cfg.data[section]
+
+    inject_validator_flags(section_cfg, force_finite=force_finite, metadata_only=metadata_only)
+    if num_workers_override is not None:
+        LOG.info("override num_workers: %s -> %s", section_cfg.get("num_workers"), num_workers_override)
+        section_cfg.num_workers = num_workers_override
+    # Groundtruth overrides any earlier worker setting: single-process, non-stateful, map-style iteration.
+    if phase == PHASE_GROUNDTRUTH:
+        section_cfg.num_workers = 0
+        section_cfg.use_stateful_dataloader = False
+        section_cfg.force_map_dataset = True
+        LOG.info("groundtruth: forced num_workers=0, use_stateful_dataloader=False, force_map_dataset=True")
+
+    # Defer import until env vars and config injections are in place.
+    from nemo.collections.common.data.lhotse.dataloader import get_lhotse_dataloader_from_config
+
+    tokenizer = _build_tokenizer_if_needed(cfg, section_cfg)  # None unless use_multimodal_sampling is set
+    dataset = CutIdDataset()
+    dataloader = get_lhotse_dataloader_from_config(
+        config=section_cfg,
+        global_rank=rank,
+        world_size=world_size,
+        dataset=dataset,
+        tokenizer=tokenizer,
+    )
+
+    if phase == PHASE_RESUMED:
+        _load_state(dataloader, state_dir=state_dir, rank=rank)  # restore before the first batch is drawn
+
+    out_dir = Path(output_dir)
+    phase_dir = _phase_dir(out_dir, phase, run_idx)
+    phase_dir.mkdir(parents=True, exist_ok=True)
+
+    if phase == PHASE_GROUNDTRUTH:
+        out_path = phase_dir / "cuts.jsonl"  # single file: groundtruth runs with world_size == 1
+    else:
+        out_path = phase_dir / f"rank_{rank:03d}.jsonl"
+
+    LOG.info("phase=%s run_idx=%d steps=%d checkpoint_at=%d -> %s", phase, run_idx, steps, checkpoint_at, out_path)
+
+    t_total_samples: list[float] = []  # per-step times in ms, excluding step 0
+    t_first_batch_ms: Optional[float] = None
+    iter_t0 = time.monotonic_ns()
+    with open(out_path, "w") as fout:
+        for step, batch in enumerate(dataloader):
+            t_step_end = time.monotonic_ns()
+            if step == 0:
+                t_first_batch_ms = (t_step_end - iter_t0) / 1e6
+            t_total_ms = (t_step_end - iter_t0) / 1e6  # ns -> ms since the previous batch arrived (or loop start)
+            iter_t0 = t_step_end  # restart the clock: the next sample also covers this step's writing/saving below
+
+            if phase != PHASE_GROUNDTRUTH and step > 0:  # step 0 is reported separately as t_first_batch_ms
+                t_total_samples.append(t_total_ms)
+
+            cut_ids, worker_id = _extract_cuts(batch)
+            row = {
+                "step": step,
+                "rank": rank,
+                "world_size": world_size,
+                "worker_id": worker_id,
+                "cut_ids": cut_ids,
+                "batch_size": len(cut_ids),
+                "t_total_ms": round(t_total_ms, 3),
+                "t_first_batch_ms": round(t_first_batch_ms, 3) if step == 0 else None,
+            }
+            fout.write(json.dumps(row) + "\n")  # one JSON object per batch
+
+            if step % 50 == 0:
+                LOG.info(
+                    "step=%d cuts=%d t_total=%.1fms (first cut: %s)",
+                    step,
+                    len(cut_ids),
+                    t_total_ms,
+                    cut_ids[0] if cut_ids else "<empty>",
+                )
+
+            if phase == PHASE_BASELINE and step == checkpoint_at:  # i.e. after batch ``checkpoint_at`` was consumed
+                state_path = phase_dir / f"state_rank_{rank:03d}.pt"
+                LOG.info("saving state_dict at step=%d -> %s", step, state_path)
+                torch.save(dataloader.state_dict(), state_path)
+
+            if phase != PHASE_GROUNDTRUTH and step + 1 >= steps:  # groundtruth runs until the loader is exhausted
+                break
+
+    if phase == PHASE_BASELINE and run_idx == 0:  # throughput is summarized for the first baseline run only
+        _write_throughput_summary(
+            phase_dir / f"throughput_rank_{rank:03d}.json",
+            t_total_samples=t_total_samples,
+            t_first_batch_ms=t_first_batch_ms,
+            num_workers=section_cfg.get("num_workers", 0),
+        )
+
+    LOG.info("DONE")
+
+
+# --------------------------------------------------------------------------- #
+# Helpers.
+# --------------------------------------------------------------------------- #
+
+
+def _phase_dir(output_dir: Path, phase: str, run_idx: int) -> Path:
+    """Directory for *phase* output: ``<output_dir>/<phase>`` for groundtruth, else ``.../run<run_idx>``."""
+    base = output_dir / phase
+    return base if phase == PHASE_GROUNDTRUTH else base / f"run{run_idx}"
+
+
+def _extract_cuts(batch) -> tuple[list[str], int]:
+    """Pull ``(cut_ids, worker_id)`` out of a ``{"cut_ids": [...], "worker_id": W}`` batch dict.
+    The batch may arrive untouched or after a collate step that wraps each id in a
+    list/tuple and turns ``worker_id`` into a list, tuple or tensor; all of these are
+    normalized to ``(list[str], int)``. Any non-dict batch yields ``([], -1)``."""
+    if isinstance(batch, dict):
+        cuts = batch.get("cut_ids", [])
+        worker = batch.get("worker_id", 0)
+        # torch's default collate hands string groups back as tuples, so unwrap list *or* tuple.
+        if cuts and isinstance(cuts[0], (list, tuple)):
+            cuts = [c for sub in cuts for c in sub]
+        if isinstance(worker, (list, tuple)):
+            worker = int(worker[0]) if worker else 0
+        elif isinstance(worker, torch.Tensor):
+            worker = int(worker.flatten()[0]) if worker.numel() else 0  # .item() would raise unless numel == 1
+        return [str(c) for c in cuts], int(worker)
+    # Fallback: unknown shape.
+    return [], -1
+
+
+def _build_tokenizer_if_needed(full_cfg, section_cfg):
+    """Return ``None`` unless the section sets ``use_multimodal_sampling``; otherwise load the tokenizer for
+    ``model.pretrained_llm`` (a ``ClickException`` if unset) and register ``model.audio_locator_tag`` if given."""
+    if not section_cfg.get("use_multimodal_sampling", False):
+        return None
+    model_cfg = full_cfg.get("model", {})
+    llm_name = model_cfg.get("pretrained_llm")
+    if not llm_name:
+        raise click.ClickException(
+            "use_multimodal_sampling=True requires model.pretrained_llm in the config to load a tokenizer."
+        )
+    from nemo.collections.common.tokenizers import AutoTokenizer
+
+    allow_remote_code = bool(model_cfg.get("trust_remote_code", False))
+    LOG.info("loading tokenizer for %s (trust_remote_code=%s)", llm_name, allow_remote_code)
+    tokenizer = AutoTokenizer(llm_name, use_fast=True, trust_remote_code=allow_remote_code)
+    locator_tag = model_cfg.get("audio_locator_tag")
+    if locator_tag:
+        tokenizer.add_special_tokens({"additional_special_tokens": [locator_tag]})
+    return tokenizer
+
+
+def _load_state(dataloader, *, state_dir: Optional[str], rank: int) -> None:
+    """Restore *dataloader* from ``<state_dir>/state_rank_<rank>.pt``; a missing dir or file is a ClickException."""
+    if state_dir is None:
+        raise click.ClickException("--state-dir is required for --phase=resumed")
+    checkpoint = Path(state_dir) / f"state_rank_{rank:03d}.pt"
+    if not checkpoint.exists():
+        raise click.ClickException(f"state file missing: {checkpoint}")
+    LOG.info("loading state_dict from %s", checkpoint)
+    dataloader.load_state_dict(torch.load(checkpoint, map_location="cpu", weights_only=False))
+
+
+def _write_throughput_summary(
+    out_path: Path, *, t_total_samples: list[float], t_first_batch_ms: Optional[float], num_workers: int
+) -> None:
+    """Write per-step latency statistics to *out_path* as indented JSON.
+
+    Keys written: ``p50_ms``, ``p95_ms``, ``mean_ms``, ``count``, ``t_first_batch_ms``, ``num_workers``.
+
+    Args:
+        out_path: Destination file; ``write_text`` replaces any existing content.
+        t_total_samples: Per-step wall times in milliseconds. When empty, ``p50_ms``,
+            ``p95_ms`` and ``mean_ms`` are written as ``null`` and ``count`` as 0.
+        t_first_batch_ms: Time until the first batch in milliseconds, or ``None`` when
+            no batch was produced. Written rounded to 3 decimals; ``None`` stays ``null``.
+        num_workers: Worker count to record alongside the timings; ``None`` stays ``null``.
+    """
+    samples = sorted(t_total_samples)
+    if samples:
+        p50 = round(statistics.median(samples), 3)
+        # Floor-indexed order statistic: no interpolation between neighbouring samples.
+        p95 = round(samples[int(0.95 * (len(samples) - 1))], 3)
+        mean = round(statistics.fmean(samples), 3)
+    else:
+        p50 = p95 = mean = None
+    # One payload for both the empty and non-empty case, so rounding and casting
+    # of the shared fields cannot drift apart between them.
+    summary = {
+        "p50_ms": p50,
+        "p95_ms": p95,
+        "mean_ms": mean,
+        "count": len(samples),
+        # Explicit None check: a measured 0.0 ms is a value, not a missing reading.
+        "t_first_batch_ms": round(t_first_batch_ms, 3) if t_first_batch_ms is not None else None,
+        "num_workers": int(num_workers) if num_workers is not None else None,
+    }
+    out_path.write_text(json.dumps(summary, indent=2))
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/tests/collections/asr/test_asr_lhotse_dataset.py b/tests/collections/asr/test_asr_lhotse_dataset.py
index 618eb2d89c30..4d7edba2dc3f 100644
--- a/tests/collections/asr/test_asr_lhotse_dataset.py
+++ b/tests/collections/asr/test_asr_lhotse_dataset.py
@@ -105,29 +105,45 @@ def test_lhotse_asr_dataset_metadata(tokenizer):
 def test_lhotse_asr_dataset_ais_batch_loading_enabled(tokenizer, monkeypatch):
     """Test that USE_AIS_GET_BATCH=true passes use_batch_loader=True to AudioSamples."""
     monkeypatch.setenv("USE_AIS_GET_BATCH", "true")
+    monkeypatch.delenv("USE_AIS_INDIVIDUAL_GETS", raising=False)
 
     with patch.object(AudioSamples, "__init__", return_value=None) as mock_init:
         mock_init.side_effect = lambda *args, **kwargs: None
         try:
-            dataset = LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
+            LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
         except Exception:
             pass
         # Check that AudioSamples was called with use_batch_loader=True
-        mock_init.assert_called_with(fault_tolerant=True, use_batch_loader=True)
+        mock_init.assert_called_with(fault_tolerant=True, use_batch_loader=True, ais_force_individual=False)
+
+
+def test_lhotse_asr_dataset_ais_batch_loading_force_individual(tokenizer, monkeypatch):
+    """With USE_AIS_GET_BATCH and USE_AIS_INDIVIDUAL_GETS both "true", AudioSamples gets ais_force_individual=True."""
+    monkeypatch.setenv("USE_AIS_GET_BATCH", "true")
+    monkeypatch.setenv("USE_AIS_INDIVIDUAL_GETS", "true")
+
+    with patch.object(AudioSamples, "__init__", return_value=None) as mock_init:
+        mock_init.side_effect = lambda *args, **kwargs: None
+        try:
+            LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
+        except Exception:  # any constructor error is ignored; only the recorded AudioSamples call is checked
+            pass
+        mock_init.assert_called_with(fault_tolerant=True, use_batch_loader=True, ais_force_individual=True)
 
 
 def test_lhotse_asr_dataset_ais_batch_loading_disabled(tokenizer, monkeypatch):
     """Test that without USE_AIS_GET_BATCH, use_batch_loader=False is passed to AudioSamples."""
     monkeypatch.delenv("USE_AIS_GET_BATCH", raising=False)
+    monkeypatch.delenv("USE_AIS_INDIVIDUAL_GETS", raising=False)
 
     with patch.object(AudioSamples, "__init__", return_value=None) as mock_init:
         mock_init.side_effect = lambda *args, **kwargs: None
         try:
-            dataset = LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
+            LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
         except Exception:
             pass
         # Check that AudioSamples was called with use_batch_loader=False
-        mock_init.assert_called_with(fault_tolerant=True, use_batch_loader=False)
+        mock_init.assert_called_with(fault_tolerant=True, use_batch_loader=False, ais_force_individual=False)
 
 
 def test_lhotse_asr_dataset_ais_batch_loading_fallback(tokenizer, monkeypatch):
@@ -145,8 +161,30 @@ def mock_init(self, *args, **kwargs):
         return original_init(self, *args, **kwargs)
 
     with patch.object(AudioSamples, "__init__", mock_init):
-        dataset = LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
+        LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
 
-    # First call should have use_batch_loader=True, second call should not
-    assert call_args[0] == {"fault_tolerant": True, "use_batch_loader": True}
+    # First call should use AIS batch options, second call should not.
+    assert call_args[0] == {"fault_tolerant": True, "use_batch_loader": True, "ais_force_individual": False}
     assert call_args[1] == {"fault_tolerant": True}
+
+
+def test_lhotse_asr_dataset_ais_force_individual_fallback(tokenizer, monkeypatch):
+    """Test fallback when Lhotse supports use_batch_loader but not ais_force_individual."""
+    monkeypatch.setenv("USE_AIS_GET_BATCH", "true")
+    monkeypatch.setenv("USE_AIS_INDIVIDUAL_GETS", "true")
+
+    call_args = []  # kwargs of every AudioSamples.__init__ call, in call order
+
+    original_init = AudioSamples.__init__
+
+    def mock_init(self, *args, **kwargs):
+        call_args.append(kwargs.copy())
+        if "ais_force_individual" in kwargs:  # emulate an AudioSamples that rejects this keyword
+            raise TypeError("unexpected keyword argument 'ais_force_individual'")
+        return original_init(self, *args, **kwargs)
+
+    with patch.object(AudioSamples, "__init__", mock_init):
+        LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
+
+    assert call_args[0] == {"fault_tolerant": True, "use_batch_loader": True, "ais_force_individual": True}
+    assert call_args[1] == {"fault_tolerant": True, "use_batch_loader": True}  # retry drops only the rejected kwarg
diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py
index 73b94a3e7682..e1b45274d0de 100644
--- a/tests/collections/common/test_lhotse_dataloading.py
+++ b/tests/collections/common/test_lhotse_dataloading.py
@@ -1025,7 +1025,7 @@ def test_lazy_nemo_iterator_with_offset_field(tmp_path: Path):
     assert cut.supervisions[0].text == "irrelevant"
     audio = cut.load_audio()
     assert audio.shape == (1, 8000)
-    np.testing.assert_equal(audio[0], expected_audio[:8000])
+    np.testing.assert_allclose(audio[0], expected_audio[:8000], atol=5e-5)
 
     cut = cuts[1]
     assert isinstance(cut, lhotse.MonoCut)
@@ -1073,7 +1073,7 @@ def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path):
     assert cut.num_samples == 8000
     assert cut.supervisions[0].text == "irrelevant"
     assert audio.shape == (1, 8000)
-    np.testing.assert_equal(audio[0], expected_audio[:8000])
+    np.testing.assert_allclose(audio[0], expected_audio[:8000], atol=5e-5)
 
 
 def test_lhotse_cuts_resolve_relative_paths(tmp_path: Path):
diff --git a/tests/collections/common/test_lhotse_indexed_partition.py b/tests/collections/common/test_lhotse_indexed_partition.py
new file mode 100644
index 000000000000..25e2ab480275
--- /dev/null
+++ b/tests/collections/common/test_lhotse_indexed_partition.py
@@ -0,0 +1,493 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+"""Regression tests: every NeMo indexed adapter must produce disjoint slices
+across (DP rank x DataLoader worker) shards.
+
+The bug this guards against: each adapter's ``_iter_indexed`` previously
+iterated ``range(0, total_len)`` with no call to ``get_worker_partition()``,
+so under multi-rank training every rank yielded the same items
+(see ``sweeps/0909/debugging-duplication.md``). All 7 buggy adapters now
+delegate position+topology to ``PartitionedIndexedIterator``; this file
+asserts that contract at the adapter level so the next refactor can't quietly
+regress it.
+
+Each test simulates the env-var setup ``worker_init_fn`` would perform in a
+DataLoader worker subprocess, builds the adapter with ``indexed=True``, walks
+every (rank in range(world_size)) instance, and asserts:
+
+* per-rank slices are pairwise disjoint;
+* union over all ranks equals the full manifest (each example seen exactly
+  once across the world).
+"""
+from __future__ import annotations
+
+import json
+import os
+import tarfile
+from contextlib import contextmanager
+from io import BytesIO
+from pathlib import Path
+
+import pytest
+from lhotse import CutSet
+from lhotse.dataset.dataloading import LHOTSE_USE_WORKER_PARTITION
+from lhotse.testing.dummies import DummyManifest
+
+from nemo.collections.common.data.lhotse import nemo_adapters, text_adapters
+
+_PARTITION_ENV_KEYS = ("RANK", "WORLD_SIZE", LHOTSE_USE_WORKER_PARTITION)
+
+
+@contextmanager
+def _env_partition(rank: int, world_size: int):
+    """Temporarily export the partition env vars that ``worker_init_fn`` would set;
+    whatever was there before (including "unset") is put back on exit."""
+    previous = {name: os.environ.get(name) for name in _PARTITION_ENV_KEYS}
+    overrides = {"RANK": str(rank), "WORLD_SIZE": str(world_size), LHOTSE_USE_WORKER_PARTITION: "1"}
+    os.environ.update(overrides)
+    try:
+        yield
+    finally:
+        for name, old_value in previous.items():
+            if old_value is not None:
+                os.environ[name] = old_value
+            else:
+                os.environ.pop(name, None)
+
+
+def _collect_disjoint_per_rank(build_iter_for_rank, world_size: int) -> tuple[list, set]:
+    """Run an adapter once per rank in ``range(world_size)`` and return
+    ``(per_rank_id_lists, union_of_all_ids)``. Asserts no id repeats within or across ranks."""
+    per_rank: list[list] = []
+    union: set = set()
+    for rank in range(world_size):
+        with _env_partition(rank=rank, world_size=world_size):
+            ids = list(build_iter_for_rank())
+        # ``union`` holds every id seen by prior ranks, so one check covers them all.
+        overlap = union.intersection(ids)
+        assert not overlap, f"rank {rank} slice overlaps prior rank: {sorted(overlap)}"
+        # A rank must not yield the same example twice either ("exactly once" contract).
+        assert len(set(ids)) == len(ids), f"rank {rank} yielded duplicate ids"
+        per_rank.append(ids)
+        union.update(ids)
+    return per_rank, union
+
+
+# ---------------------------------------------------------------------------
+# Fixture: 20 single-channel cuts saved as one NeMo manifest + one tar file.
+# Used by the LazyNeMoTarredIterator tests (the parquet test builds its own fixture).
+# ---------------------------------------------------------------------------
+
+N_CUTS = 20
+
+
+@pytest.fixture
+def tmp_audio_root(tmp_path_factory) -> Path:
+    return tmp_path_factory.mktemp("audio")  # default fixture scope: each test gets its own directory
+
+
+@pytest.fixture
+def nemo_tarred_manifest(tmp_audio_root) -> tuple[Path, Path]:
+    """Write ``N_CUTS`` dummy utterances into one NeMo tarred shard and return
+    ``(manifest_filepath, tarred_audio_filepath)``."""
+    from lhotse.serialization import SequentialJsonlWriter
+    from lhotse.shar.writers import TarWriter
+
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
+        tmp_audio_root, progress_bar=False
+    )
+    shard_dir = tmp_audio_root / "tarred"
+    shard_dir.mkdir(exist_ok=True)
+    with (
+        TarWriter(f"{shard_dir}/audios_0.tar", shard_size=None) as tar_writer,
+        SequentialJsonlWriter(shard_dir / "manifest_0.jsonl") as mft_writer,
+    ):
+        for cut in cuts:
+            audio_src = cut.recording.sources[0].source
+            member_name = Path(audio_src).name
+            # The tar member is stored under the bare file name ...
+            tar_writer.write(member_name, BytesIO(Path(audio_src).read_bytes()))
+            # ... and the manifest line refers to it by that same name.
+            entry = {
+                "audio_filepath": member_name,
+                "text": "irrelevant",
+                "duration": cut.duration,
+                "lang": "en",
+                "shard_id": 0,
+                "cut_id": cut.id,
+            }
+            mft_writer.write(entry)
+    return Path(mft_writer.path), shard_dir / "audios_0.tar"
+
+
+# ---------------------------------------------------------------------------
+# 1. LazyNeMoTarredIterator
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_lazy_nemo_tarred_iterator_indexed_partition(nemo_tarred_manifest, world_size):
+    manifest_path, tar_path = nemo_tarred_manifest
+
+    def cut_ids_for_current_rank():
+        adapter = nemo_adapters.LazyNeMoTarredIterator(
+            manifest_path=str(manifest_path),
+            tar_paths=str(tar_path),
+            indexed=True,
+        )
+        return [cut.id for cut in adapter]
+
+    slices, seen = _collect_disjoint_per_rank(cut_ids_for_current_rank, world_size)
+    assert len(seen) == N_CUTS, f"missing {N_CUTS - len(seen)} items at world_size={world_size}"
+    # The union covers the manifest and the slices add up to it: every cut is seen exactly once.
+    assert sum(map(len, slices)) == N_CUTS
+
+
+@pytest.fixture
+def nemo_tarred_duplicate_bucket_manifest(tmp_audio_root) -> tuple[list[Path], list[Path]]:
+    """Build two bucket directories, each holding its own manifest_0.jsonl/audios_0.tar.
+
+    The indexed LazyNeMoTarredIterator once keyed both paths by the numeric shard id 0 and
+    silently overwrote the first bucket, so the expected dataset size is 2*N_CUTS.
+    Returns ``(manifest_paths, tar_paths)`` in bucket order."""
+    from lhotse.serialization import SequentialJsonlWriter
+    from lhotse.shar.writers import TarWriter
+
+    buckets_root = tmp_audio_root / "tarred_duplicate_buckets"
+    buckets_root.mkdir(exist_ok=True)
+    manifest_paths: list[Path] = []
+    tar_paths: list[Path] = []
+    for bucket_idx in range(2):
+        first_id, last_id = bucket_idx * N_CUTS, (bucket_idx + 1) * N_CUTS
+        # Disjoint id ranges keep cut ids unique across the two buckets.
+        cuts = DummyManifest(CutSet, begin_id=first_id, end_id=last_id, with_data=True).save_audios(
+            tmp_audio_root / f"bucket_audio_{bucket_idx}",
+            progress_bar=False,
+        )
+        bucket_dir = buckets_root / f"bucket_{bucket_idx}"
+        bucket_dir.mkdir(exist_ok=True)
+        manifest_path = bucket_dir / "manifest_0.jsonl"
+        tar_path = bucket_dir / "audios_0.tar"
+        with (
+            TarWriter(str(tar_path), shard_size=None) as tar_writer,
+            SequentialJsonlWriter(manifest_path) as mft_writer,
+        ):
+            for cut in cuts:
+                audio_src = cut.recording.sources[0].source
+                member_name = Path(audio_src).name
+                # Tar member and manifest line share the bare file name.
+                tar_writer.write(member_name, BytesIO(Path(audio_src).read_bytes()))
+                mft_writer.write(
+                    {
+                        "audio_filepath": member_name,
+                        "text": "irrelevant",
+                        "duration": cut.duration,
+                        "lang": "en",
+                        "shard_id": 0,
+                        "cut_id": cut.id,
+                    }
+                )
+        manifest_paths.append(manifest_path)
+        tar_paths.append(tar_path)
+    return manifest_paths, tar_paths
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_lazy_nemo_tarred_iterator_indexed_preserves_duplicate_bucket_shard_ids(
+    nemo_tarred_duplicate_bucket_manifest, world_size
+):
+    manifest_paths, tar_paths = nemo_tarred_duplicate_bucket_manifest
+
+    def cut_ids_for_current_rank():
+        adapter = nemo_adapters.LazyNeMoTarredIterator(
+            manifest_path=list(map(str, manifest_paths)),
+            tar_paths=list(map(str, tar_paths)),
+            indexed=True,
+        )
+        assert len(adapter) == 2 * N_CUTS
+        assert len(adapter.shard_id_to_tar_path) == 2
+        return [cut.custom["cut_id"] for cut in adapter]
+
+    slices, seen = _collect_disjoint_per_rank(cut_ids_for_current_rank, world_size)
+    assert len(seen) == 2 * N_CUTS, f"missing {2 * N_CUTS - len(seen)} items at world_size={world_size}"
+    assert sum(map(len, slices)) == 2 * N_CUTS
+
+
+def test_lazy_nemo_tarred_iterator_streaming_preserves_duplicate_bucket_shard_ids(
+    nemo_tarred_duplicate_bucket_manifest,
+):
+    manifest_paths, tar_paths = nemo_tarred_duplicate_bucket_manifest
+    adapter = nemo_adapters.LazyNeMoTarredIterator(
+        manifest_path=list(map(str, manifest_paths)),
+        tar_paths=list(map(str, tar_paths)),
+        indexed=False,
+    )
+
+    cut_ids = [cut.custom["cut_id"] for cut in adapter]
+    # Both buckets must contribute: 2*N_CUTS items, all distinct.
+    assert len(cut_ids) == len(set(cut_ids)) == 2 * N_CUTS
+
+
+# ---------------------------------------------------------------------------
+# 2. LazyParquetIterator
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def parquet_manifest(tmp_audio_root) -> Path:
+    """Parquet file with ``N_CUTS`` rows: id, embedded audio bytes, text, duration and lang."""
+    pytest.importorskip("pyarrow")
+    pytest.importorskip("pyarrow.parquet")
+    import pandas as pd
+
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
+        tmp_audio_root / "parquet_audio", progress_bar=False
+    )
+    # One row per cut; the audio file content is embedded under ``audio["bytes"]``.
+    rows = [
+        {
+            "id": cut.id,
+            "audio": {"bytes": Path(cut.recording.sources[0].source).read_bytes()},
+            "text": "irrelevant",
+            "duration": cut.duration,
+            "lang": "en",
+        }
+        for cut in cuts
+    ]
+    parquet_path = tmp_audio_root / "data.parquet"
+    frame = pd.DataFrame(rows)
+    # A row group size smaller than the row count yields more than one row group.
+    frame.to_parquet(parquet_path, engine="pyarrow", row_group_size=7)
+    return parquet_path
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_lazy_parquet_iterator_indexed_partition(parquet_manifest, world_size):
+    pytest.importorskip("pyarrow")
+
+    def cut_ids_for_current_rank():
+        adapter = nemo_adapters.LazyParquetIterator(path=str(parquet_manifest), indexed=True)
+        return [cut.id for cut in adapter]
+
+    _, seen = _collect_disjoint_per_rank(cut_ids_for_current_rank, world_size)
+    assert len(seen) == N_CUTS
+
+
+# ---------------------------------------------------------------------------
+# 3. LhotseTextJsonlAdapter
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def text_jsonl(tmp_path) -> Path:
+    """``N_CUTS`` JSON lines, each carrying a unique ``id`` and ``text``."""
+    jsonl_path = tmp_path / "text.jsonl"
+    lines = [json.dumps({"id": f"t-{i:04d}", "text": f"line {i}"}) + "\n" for i in range(N_CUTS)]
+    jsonl_path.write_text("".join(lines))
+    return jsonl_path
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_lhotse_text_jsonl_adapter_indexed_partition(text_jsonl, world_size):
+    def texts_for_current_rank():
+        adapter = text_adapters.LhotseTextJsonlAdapter(paths=str(text_jsonl), language="en", indexed=True)
+        return [example.text for example in adapter]
+
+    _, seen = _collect_disjoint_per_rank(texts_for_current_rank, world_size)
+    assert len(seen) == N_CUTS
+
+
+# ---------------------------------------------------------------------------
+# 4. NeMoSFTJsonlAdapter
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def sft_jsonl(tmp_path) -> Path:
+    """``N_CUTS`` minimal NeMo-SFT JSON lines, each with a unique ``id``; the adapter is
+    expected to wrap each line without parsing it."""
+    jsonl_path = tmp_path / "sft.jsonl"
+    lines = [json.dumps({"id": f"sft-{i:04d}", "marker": i}) + "\n" for i in range(N_CUTS)]
+    jsonl_path.write_text("".join(lines))
+    return jsonl_path
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_nemo_sft_jsonl_adapter_indexed_partition(sft_jsonl, world_size):
+    def ids_for_current_rank():
+        adapter = text_adapters.NeMoSFTJsonlAdapter(paths=str(sft_jsonl), language="en", indexed=True)
+        # Each example keeps the raw JSON dict in ``.data``; use its "id" as the key.
+        return [example.data["id"] for example in adapter]
+
+    _, seen = _collect_disjoint_per_rank(ids_for_current_rank, world_size)
+    assert len(seen) == N_CUTS
+
+
+# ---------------------------------------------------------------------------
+# 5. NeMoMultimodalConversationJsonlAdapter — non-tarred path
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def mm_conversation_jsonl(tmp_audio_root) -> Path:
+    """``N_CUTS`` JSON lines, each a two-turn conversation.
+
+    Turn 1 is a User audio turn pointing at a local audio file (with its duration and
+    a zero offset); turn 2 is an Assistant text turn."""
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
+        tmp_audio_root / "mm_audio", progress_bar=False
+    )
+    jsonl_path = tmp_audio_root / "mm_conversations.jsonl"
+    with open(jsonl_path, "w") as out:
+        for i, cut in enumerate(cuts):
+            audio_path = cut.recording.sources[0].source
+            # First turn: the user's audio, referenced by its path on disk.
+            audio_turn = {
+                "type": "audio",
+                "from": "User",
+                "value": audio_path,
+                "duration": cut.duration,
+                "offset": 0.0,
+            }
+            # Second turn: the assistant's text reply.
+            text_turn = {
+                "type": "text",
+                "from": "Assistant",
+                "value": f"answer {i}",
+            }
+            record = {
+                "id": f"mm-{i:04d}",
+                "conversations": [audio_turn, text_turn],
+            }
+            line = json.dumps(record)
+            out.write(line + "\n")
+    return jsonl_path
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_nemo_multimodal_conversation_jsonl_adapter_indexed_partition(mm_conversation_jsonl, world_size):
+    def conversation_ids_for_current_rank():
+        adapter = text_adapters.NeMoMultimodalConversationJsonlAdapter(
+            manifest_filepath=[str(mm_conversation_jsonl)],
+            audio_locator_tag="<audio>",
+            token_equivalent_duration=0.08,
+            indexed=True,
+        )
+        return [conversation.id for conversation in adapter]
+
+    _, seen = _collect_disjoint_per_rank(conversation_ids_for_current_rank, world_size)
+    assert len(seen) == N_CUTS
+
+
+# ---------------------------------------------------------------------------
+# 6. NeMoMultimodalConversationShareGPTJsonlAdapter — non-tarred path
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def sharegpt_conversation_jsonl(tmp_audio_root) -> Path:
+    """``N_CUTS`` ShareGPT-style JSON lines: one User turn with an ``<audio>`` placeholder
+    and one Assistant turn per line.
+
+    The audio path is written to the ``sound`` field rather than ``audio_filepath``
+    (see ``_transform_sharegpt`` in nemo.collections.common.data.lhotse.text_adapters)
+    because the adapter intentionally treats ShareGPT distinctly from NeMo manifests."""
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
+        tmp_audio_root / "sharegpt_audio", progress_bar=False
+    )
+    jsonl_path = tmp_audio_root / "sharegpt.jsonl"
+    with open(jsonl_path, "w") as out:
+        for i, cut in enumerate(cuts):
+            audio_path = cut.recording.sources[0].source
+            # The User turn embeds the ``<audio>`` placeholder directly in its text.
+            turns = [
+                {"from": "User", "value": f"<audio>describe {i}"},
+                {"from": "Assistant", "value": f"this is example {i}"},
+            ]
+            # The audio itself is referenced at record level through ``sound``.
+            record = {
+                "id": f"sgpt-{i:04d}",
+                "conversations": turns,
+                "sound": audio_path,
+                "duration": cut.duration,
+            }
+            line = json.dumps(record)
+            out.write(line + "\n")
+    return jsonl_path
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_sharegpt_jsonl_adapter_indexed_partition(sharegpt_conversation_jsonl, world_size):
+    def conversation_ids_for_current_rank():
+        adapter = text_adapters.NeMoMultimodalConversationShareGPTJsonlAdapter(
+            manifest_filepath=[str(sharegpt_conversation_jsonl)],
+            audio_locator_tag="<audio>",
+            audio_placeholders=["<audio>"],
+            token_equivalent_duration=0.08,
+            indexed=True,
+        )
+        return [conversation.id for conversation in adapter]
+
+    _, seen = _collect_disjoint_per_rank(conversation_ids_for_current_rank, world_size)
+    assert len(seen) == N_CUTS
+
+
+# ---------------------------------------------------------------------------
+# 7. NeMoMultimodalConversationShareGPTWebdatasetAdapter
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def sharegpt_webdataset_tar(tmp_audio_root) -> Path:
+    """ShareGPT WebDataset tar with ``N_CUTS`` samples; each sample is a ``.json`` member
+    followed by a ``.wav`` member with the same stem (the adapter pairs alternating
+    members). The ``.idx`` sidecar is built here as well because IndexedTarSampleReader
+    requires it and does not auto-create indexes, unlike the JSONL reader."""
+    from lhotse.indexing import create_tar_index
+
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
+        tmp_audio_root / "wds_audio", progress_bar=False
+    )
+    tar_path = tmp_audio_root / "shard_0.tar"
+    with tarfile.open(tar_path, "w") as archive:
+        for i, cut in enumerate(cuts):
+            stem = f"swds-{i:04d}"
+            audio_path = cut.recording.sources[0].source
+            audio_bytes = Path(audio_path).read_bytes()
+            # JSON payload for this sample; its ``id`` doubles as the member stem.
+            conversation = {
+                "id": stem,
+                "conversations": [
+                    {"from": "User", "value": f"<audio>q{i}"},
+                    {"from": "Assistant", "value": f"a{i}"},
+                ],
+            }
+            members = {".json": json.dumps(conversation).encode(), ".wav": audio_bytes}
+            # Dict order is insertion order, so the .json member always precedes the .wav.
+            for extension, payload in members.items():
+                member = tarfile.TarInfo(stem + extension)
+                member.size = len(payload)
+                archive.addfile(member, BytesIO(payload))
+    create_tar_index(str(tar_path), output_path=str(tar_path) + ".idx")
+    return tar_path
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_sharegpt_webdataset_adapter_indexed_partition(sharegpt_webdataset_tar, world_size):
+    def conversation_ids_for_current_rank():
+        adapter = text_adapters.NeMoMultimodalConversationShareGPTWebdatasetAdapter(
+            data_dir=str(sharegpt_webdataset_tar.parent),
+            audio_locator_tag="<audio>",
+            audio_placeholders=["<audio>"],
+            token_equivalent_duration=0.08,
+            indexed=True,
+        )
+        return [conversation.id for conversation in adapter]
+
+    _, seen = _collect_disjoint_per_rank(conversation_ids_for_current_rank, world_size)
+    assert len(seen) == N_CUTS
diff --git a/tests/collections/common/test_lhotse_multimodal_ais_get_batch.py b/tests/collections/common/test_lhotse_multimodal_ais_get_batch.py
index b0c9c3569c37..ee3650283539 100644
--- a/tests/collections/common/test_lhotse_multimodal_ais_get_batch.py
+++ b/tests/collections/common/test_lhotse_multimodal_ais_get_batch.py
@@ -374,7 +374,9 @@ def test_salm_dataset_batch_loader_enabled(monkeypatch):
     with patch("nemo.collections.speechlm2.data.salm_dataset.AudioSamples") as audio_samples:
         ds = SALMDataset(tokenizer=_FakeTokenizer())
 
-    audio_samples.assert_called_once_with(fault_tolerant=True, use_batch_loader=True, mono_downmix=True)
+    audio_samples.assert_called_once_with(
+        fault_tolerant=True, use_batch_loader=True, ais_force_individual=False, mono_downmix=True
+    )
     assert ds.load_audio is audio_samples.return_value
 
 
@@ -386,5 +388,7 @@ def test_salm_dataset_batch_loader_disabled(monkeypatch):
     with patch("nemo.collections.speechlm2.data.salm_dataset.AudioSamples") as audio_samples:
         ds = SALMDataset(tokenizer=_FakeTokenizer())
 
-    audio_samples.assert_called_once_with(fault_tolerant=True, use_batch_loader=False, mono_downmix=True)
+    audio_samples.assert_called_once_with(
+        fault_tolerant=True, use_batch_loader=False, ais_force_individual=False, mono_downmix=True
+    )
     assert ds.load_audio is audio_samples.return_value
diff --git a/tests/collections/common/test_lhotse_multimodal_dataloading.py b/tests/collections/common/test_lhotse_multimodal_dataloading.py
index f7ae02d6d706..4d66fb57d7ae 100644
--- a/tests/collections/common/test_lhotse_multimodal_dataloading.py
+++ b/tests/collections/common/test_lhotse_multimodal_dataloading.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 import os
 import random
 from itertools import islice
@@ -21,17 +22,14 @@
 import pytest
 import torch
 from lhotse import CutSet, SupervisionSegment, compute_num_samples
+from lhotse.audio import AudioLoadingError
+from lhotse.indexing import create_jsonl_index
 from lhotse.shar import JsonlShardWriter
 from lhotse.testing.dummies import dummy_cut, dummy_recording
 from omegaconf import OmegaConf
 
 from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config
-from nemo.collections.common.data.lhotse.indexed_adapters import (
-    IndexedTarSampleReader,
-    LazyShuffledRange,
-    create_index,
-    create_tar_index,
-)
+from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarSampleReader, create_tar_index
 from nemo.collections.common.data.lhotse.sampling import (
     DurationFilter,
     MultimodalFixedBucketBatchSizeConstraint2D,
@@ -319,6 +317,211 @@ def test_multimodal_conversation_input_sharegpt(sharegpt_conversations_path):
     assert t.cut.load_audio().shape == (1, 39200)
 
 
+def test_multimodal_conversation_input_sharegpt_list_audio_paths(tmp_path):
+    manifest_path = tmp_path / "sharegpt_list_manifest.jsonl"
+    # Three clips of distinct durations so each audio turn can be told apart below.
+    for rec_id, (duration, clip) in enumerate([(1.0, "clip_a.wav"), (1.5, "clip_b.wav"), (2.0, "clip_c.wav")]):
+        dummy_recording(rec_id, duration, with_data=True).to_cut().save_audio(tmp_path / clip)
+    records = [
+        {
+            "id": "single_list_path",
+            "sound": ["clip_a.wav"],
+            "conversations": [
+                {"from": "human", "value": "Listen <sound>"},
+                {"from": "gpt", "value": "done"},
+            ],
+        },
+        {
+            "id": "multi_list_path",
+            "sound": ["clip_b.wav", "clip_c.wav"],
+            "conversations": [
+                {"from": "human", "value": "Compare <sound> now"},
+                {"from": "gpt", "value": "done"},
+            ],
+        },
+    ]
+    lhotse.serialization.save_to_jsonl(records, manifest_path)
+
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<sound>"],
+    )
+
+    single_convo, multi_convo = list(adapter)
+    single_audio = [turn for turn in single_convo.turns if isinstance(turn, AudioTurn)]
+    assert len(single_audio) == 1
+    assert single_audio[0].cut.duration == 1.0
+    assert single_audio[0].cut.load_audio().shape == (1, 16000)
+
+    assert list(map(type, multi_convo.turns)) == [TextTurn, AudioTurn, AudioTurn, TextTurn, TextTurn]
+    assert multi_convo.turns[0].value == "Compare"
+    assert multi_convo.turns[3].value == "now"
+    multi_audio = [turn for turn in multi_convo.turns if isinstance(turn, AudioTurn)]
+    assert [turn.cut.duration for turn in multi_audio] == [1.5, 2.0]
+
+
+def test_multimodal_conversation_input_sharegpt_nested_audio_path_list_raises(tmp_path):
+    manifest_path = tmp_path / "sharegpt_bad_list_manifest.jsonl"
+    # ``sound`` is a list nested inside a list, which the adapter is expected to reject.
+    bad_record = {
+        "id": "bad_nested_path",
+        "sound": [["clip_a.wav"]],
+        "conversations": [{"from": "human", "value": "Listen <sound>"}],
+    }
+    lhotse.serialization.save_to_jsonl(
+        [bad_record],
+        manifest_path,
+    )
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<sound>"],
+    )
+
+    with pytest.raises(ValueError, match=r"unsupported sound\[0\]"):
+        list(adapter)
+
+
+def test_multimodal_conversation_input_sharegpt_ignores_assistant_literal_audio_tag(tmp_path):
+    manifest_path = tmp_path / "sharegpt_assistant_literal_audio_manifest.jsonl"
+    # Three clips of distinct durations so the audio turns can be matched to their paths.
+    for rec_id, (duration, clip) in enumerate([(1.0, "clip_a.wav"), (1.5, "clip_b.wav"), (2.0, "clip_c.wav")]):
+        dummy_recording(rec_id, duration, with_data=True).to_cut().save_audio(tmp_path / clip)
+    # Only the human turns carry the ``<sound>`` placeholder; the first gpt reply mentions a
+    # literal ``<audio>`` HTML tag that must stay plain text although it is a configured placeholder.
+    turns = [
+        {"from": "human", "value": "First prompt <sound>"},
+        {"from": "gpt", "value": "Use an HTML <audio> tag in the page."},
+        {"from": "human", "value": "Second prompt <sound>"},
+        {"from": "gpt", "value": "Then wire audio.play() to a button."},
+        {"from": "human", "value": "Third prompt <sound>"},
+        {"from": "gpt", "value": "done"},
+    ]
+    # Three audio paths for exactly three human placeholders.
+    record = {
+        "id": "assistant_literal_audio_tag",
+        "sound": ["clip_a.wav", "clip_b.wav", "clip_c.wav"],
+        "conversations": turns,
+    }
+    lhotse.serialization.save_to_jsonl([record], manifest_path)
+
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<audio>", "<sound>", "<speech>"],
+    )
+
+    (conversation,) = list(adapter)
+    audio_turns = [turn for turn in conversation.turns if isinstance(turn, AudioTurn)]
+    assert [turn.cut.duration for turn in audio_turns] == [1.0, 1.5, 2.0]
+    assistant_texts = [turn.value for turn in conversation.turns if isinstance(turn, TextTurn) and turn.role == "assistant"]
+    assert "Use an HTML <audio> tag in the page." in assistant_texts
+
+
+def test_multimodal_conversation_input_sharegpt_user_audio_path_placeholder_mismatch_raises(tmp_path):
+    manifest_path = tmp_path / "sharegpt_user_mismatch_manifest.jsonl"
+    # Three audio paths against four ``<sound>`` placeholders in the human turn.
+    mismatched_record = {
+        "id": "bad_user_mismatch",
+        "sound": ["clip_a.wav", "clip_b.wav", "clip_c.wav"],
+        "conversations": [
+            {"from": "human", "value": "A <sound> B <sound> C <sound> D <sound>"},
+            {"from": "gpt", "value": "done"},
+        ],
+    }
+    lhotse.serialization.save_to_jsonl(
+        [mismatched_record],
+        manifest_path,
+    )
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<sound>"],
+    )
+
+    with pytest.raises(ValueError, match="3 audio paths but 4 audio placeholders"):
+        list(adapter)
+
+
+def test_multimodal_conversation_input_sharegpt_missing_audio_path_raises(tmp_path):
+    manifest_path = tmp_path / "sharegpt_missing_audio_manifest.jsonl"
+    # ``missing.wav`` is never written to ``tmp_path``, so reading the audio has to fail.
+    dangling_record = {
+        "id": "missing_audio",
+        "sound": "missing.wav",
+        "conversations": [
+            {"from": "human", "value": "Listen <sound>"},
+            {"from": "gpt", "value": "done"},
+        ],
+    }
+    lhotse.serialization.save_to_jsonl(
+        [dangling_record],
+        manifest_path,
+    )
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<sound>"],
+    )
+
+    with pytest.raises(AudioLoadingError):
+        list(adapter)
+
+
+@pytest.mark.parametrize("indexed", [False, True])
+def test_multimodal_conversation_input_sharegpt_missing_audio_path_skips_when_enabled(tmp_path, caplog, indexed):
+    manifest_path = tmp_path / "sharegpt_skip_missing_audio_manifest.jsonl"
+    for rec_id, (duration, clip) in enumerate([(1.0, "good_a.wav"), (1.5, "good_b.wav")]):
+        dummy_recording(rec_id, duration, with_data=True).to_cut().save_audio(tmp_path / clip)
+    # Two loadable samples with a dangling one in between.
+    records = [
+        {
+            "id": "good_a",
+            "sound": "good_a.wav",
+            "conversations": [
+                {"from": "human", "value": "Listen <sound>"},
+                {"from": "gpt", "value": "done"},
+            ],
+        },
+        # This sample points at a file that is never written.
+        {
+            "id": "missing_audio",
+            "sound": "missing.wav",
+            "conversations": [
+                {"from": "human", "value": "Listen <sound>"},
+                {"from": "gpt", "value": "done"},
+            ],
+        },
+        {
+            "id": "good_b",
+            "sound": "good_b.wav",
+            "conversations": [
+                {"from": "human", "value": "Listen <sound>"},
+                {"from": "gpt", "value": "done"},
+            ],
+        },
+    ]
+    lhotse.serialization.save_to_jsonl(records, manifest_path)
+    if indexed:
+        create_jsonl_index(str(manifest_path))
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<sound>"],
+        indexed=indexed,
+        skip_missing_manifest_entries=True,
+    )
+
+    with caplog.at_level(logging.WARNING):
+        kept = list(adapter)
+
+    assert [convo.id for convo in kept] == ["good_a", "good_b"]
+    # The warning must name both the skipped sample and the audio path that failed.
+    for fragment in ("Skipping ShareGPT sample due to audio loading failure", "missing_audio", "missing.wav"):
+        assert fragment in caplog.text
+
+
 @pytest.fixture
 def tokenizer(tmp_path_factory, multimodal_conversations_path):
     tmpdir = tmp_path_factory.mktemp("multi_convo_tokenizer")
@@ -952,7 +1155,7 @@ def indexed_sharegpt_conversations_path(tmp_path_factory):
         for i in range(10)
     ]
     lhotse.serialization.save_to_jsonl(data, manifest_path)
-    create_index(str(manifest_path), str(manifest_path) + ".idx")
+    create_jsonl_index(str(manifest_path))
     return manifest_path
 
 
@@ -964,47 +1167,12 @@ def test_sharegpt_indexed_sequential_no_shuffle(indexed_sharegpt_conversations_p
         shuffle_shards=False,
         shard_seed=0,
     )
-    assert adapter._has_index is True
     conversations = list(adapter)
     assert len(conversations) == 10
     ids = [c.id for c in conversations]
     assert ids == [f"convo_{i}" for i in range(10)]
 
 
-def test_sharegpt_indexed_shuffle_uses_random_access(indexed_sharegpt_conversations_path):
-    """When shuffle is on and .idx files exist, all items are yielded in shuffled order."""
-    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
-        manifest_filepath=str(indexed_sharegpt_conversations_path),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=0,
-    )
-    assert adapter._has_index is True
-    conversations = list(adapter)
-    assert len(conversations) == 10
-    ids = [c.id for c in conversations]
-    # All items present
-    assert sorted(ids) == [f"convo_{i}" for i in range(10)]
-    # Order is shuffled (with 10 items the chance of identical order is 1/10! ≈ 0)
-    assert ids != [f"convo_{i}" for i in range(10)]
-
-
-def test_sharegpt_indexed_different_epochs_different_order(indexed_sharegpt_conversations_path):
-    """Different epochs produce different shuffled orders."""
-    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
-        manifest_filepath=str(indexed_sharegpt_conversations_path),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=0,
-    )
-    epoch0_ids = [c.id for c in adapter]
-    epoch1_ids = [c.id for c in adapter]
-    # Both epochs have all items
-    assert sorted(epoch0_ids) == sorted(epoch1_ids)
-    # But in different order (epoch counter increments the seed)
-    assert epoch0_ids != epoch1_ids
-
-
 def test_sharegpt_no_index_falls_back_to_in_memory_shuffle(tmp_path_factory):
     """When .idx files don't exist, shuffle_shards still works via in-memory shuffle."""
     tmp_path = tmp_path_factory.mktemp("sharegpt_no_idx")
@@ -1020,7 +1188,6 @@ def test_sharegpt_no_index_falls_back_to_in_memory_shuffle(tmp_path_factory):
         for i in range(10)
     ]
     lhotse.serialization.save_to_jsonl(data, manifest_path)
-    # No .idx file created
 
     adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
         manifest_filepath=str(manifest_path),
@@ -1028,7 +1195,6 @@ def test_sharegpt_no_index_falls_back_to_in_memory_shuffle(tmp_path_factory):
         shuffle_shards=True,
         shard_seed=0,
     )
-    assert adapter._has_index is False
     conversations = list(adapter)
     assert len(conversations) == 10
     ids = [c.id for c in conversations]
@@ -1037,75 +1203,6 @@ def test_sharegpt_no_index_falls_back_to_in_memory_shuffle(tmp_path_factory):
     assert ids != [f"convo_{i}" for i in range(10)]
 
 
-def test_sharegpt_indexed_with_audio(tmp_path_factory):
-    """Indexed reading works correctly with audio turns (ShareGPT format with <sound> placeholders)."""
-    tmp_path = tmp_path_factory.mktemp("indexed_sharegpt_audio")
-    manifest_path = tmp_path / "manifest.jsonl"
-
-    # Create audio files
-    for i in range(5):
-        dummy_recording(i, duration=1.0 + i * 0.5, with_data=True).to_cut().save_audio(tmp_path / f"audio_{i}.wav")
-
-    data = [
-        {
-            "id": f"audio_convo_{i}",
-            "sound": f"audio_{i}.wav",
-            "conversations": [
-                {"from": "human", "value": f"Listen to this: <sound> What do you think?"},
-                {"from": "gpt", "value": f"Response {i}"},
-            ],
-        }
-        for i in range(5)
-    ]
-    lhotse.serialization.save_to_jsonl(data, manifest_path)
-    create_index(str(manifest_path), str(manifest_path) + ".idx")
-
-    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
-        manifest_filepath=str(manifest_path),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=42,
-    )
-    assert adapter._has_index is True
-    conversations = list(adapter)
-    assert len(conversations) == 5
-
-    ids = sorted([c.id for c in conversations])
-    assert ids == [f"audio_convo_{i}" for i in range(5)]
-
-    # Verify audio turns were created correctly
-    for conv in conversations:
-        assert conv.has_audio_turns
-        audio_turns = [t for t in conv.turns if isinstance(t, AudioTurn)]
-        assert len(audio_turns) == 1
-        assert audio_turns[0].audio_locator_tag == "[audio]"
-        assert audio_turns[0].cut.load_audio().shape[0] == 1  # mono audio
-
-
-@pytest.mark.parametrize("n", [0, 1, 2, 3, 5, 10, 100, 1000, 1023, 1024, 1025])
-def test_lazy_shuffled_range_is_a_permutation(n):
-    """LazyShuffledRange must yield every element of [0, n) exactly once."""
-    rng = random.Random(42)
-    result = list(LazyShuffledRange(n, rng))
-    assert len(result) == n
-    assert sorted(result) == list(range(n))
-
-
-def test_lazy_shuffled_range_is_shuffled():
-    """LazyShuffledRange should not produce the identity permutation (for non-trivial n)."""
-    rng = random.Random(0)
-    result = list(LazyShuffledRange(50, rng))
-    assert result != list(range(50))
-
-
-def test_lazy_shuffled_range_different_seeds():
-    """Different RNG seeds produce different permutations."""
-    a = list(LazyShuffledRange(100, random.Random(0)))
-    b = list(LazyShuffledRange(100, random.Random(1)))
-    assert a != b
-    assert sorted(a) == sorted(b) == list(range(100))
-
-
 # ─── WebDataset ShareGPT adapter tests ──────────────────────────────────────
 
 
@@ -1248,37 +1345,6 @@ def test_webdataset_sequential_turn_structure(webdataset_dir):
     assert conv.turns[3].value == "Response for sample 0"
 
 
-def test_webdataset_indexed_shuffle(webdataset_dir):
-    """When shuffle is on and .idx files exist, all items are yielded in shuffled order."""
-    adapter = NeMoMultimodalConversationShareGPTWebdatasetAdapter(
-        data_dir=str(webdataset_dir),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=0,
-    )
-    assert adapter._has_index is True
-    conversations = list(adapter)
-    assert len(conversations) == 6
-    ids = [c.id for c in conversations]
-    assert sorted(ids) == [f"sample_{i}" for i in range(6)]
-    # Order is shuffled (1/6! ≈ 0 chance of identity)
-    assert ids != [f"sample_{i}" for i in range(6)]
-
-
-def test_webdataset_indexed_different_epochs(webdataset_dir):
-    """Different epochs produce different shuffled orders."""
-    adapter = NeMoMultimodalConversationShareGPTWebdatasetAdapter(
-        data_dir=str(webdataset_dir),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=0,
-    )
-    epoch0_ids = [c.id for c in adapter]
-    epoch1_ids = [c.id for c in adapter]
-    assert sorted(epoch0_ids) == sorted(epoch1_ids)
-    assert epoch0_ids != epoch1_ids
-
-
 def test_webdataset_no_index_falls_back_to_sequential_shuffle(webdataset_dir_no_idx):
     """Without .idx files, shuffle_shards still works (shard-level shuffle, sequential within)."""
     adapter = NeMoMultimodalConversationShareGPTWebdatasetAdapter(
@@ -1287,7 +1353,6 @@ def test_webdataset_no_index_falls_back_to_sequential_shuffle(webdataset_dir_no_
         shuffle_shards=True,
         shard_seed=0,
     )
-    assert adapter._has_index is False
     conversations = list(adapter)
     assert len(conversations) == 6
     ids = [c.id for c in conversations]
@@ -1310,25 +1375,6 @@ def test_webdataset_audio_loads_correctly(webdataset_dir):
         assert audio.shape[0] == 1  # mono
 
 
-def test_webdataset_indexed_audio_loads_correctly(webdataset_dir):
-    """Audio loaded via indexed random access is valid and decodable."""
-    adapter = NeMoMultimodalConversationShareGPTWebdatasetAdapter(
-        data_dir=str(webdataset_dir),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=42,
-    )
-    assert adapter._has_index is True
-    conversations = list(adapter)
-    assert len(conversations) == 6
-    for conv in conversations:
-        audio_turns = [t for t in conv.turns if isinstance(t, AudioTurn)]
-        assert len(audio_turns) == 1
-        audio = audio_turns[0].cut.load_audio()
-        assert audio.shape[0] == 1
-        assert audio.shape[1] > 0
-
-
 def test_sharegpt_audio_root(tmp_path_factory):
     """When audio_root is set, audio files are resolved relative to it, not the manifest directory."""
     manifest_dir = tmp_path_factory.mktemp("sharegpt_manifest_dir")
@@ -1423,7 +1469,6 @@ def test_webdataset_auto_discover_shards_no_meta(tmp_path_factory, create_idx):
         audio_locator_tag="[audio]",
         shuffle_shards=False,
     )
-    assert adapter._has_index == create_idx
     conversations = list(adapter)
     assert len(conversations) == 4
     ids = sorted(c.id for c in conversations)
diff --git a/tests/collections/common/test_lhotse_nemo_adapters.py b/tests/collections/common/test_lhotse_nemo_adapters.py
index 98511118b645..5c7bc433964e 100644
--- a/tests/collections/common/test_lhotse_nemo_adapters.py
+++ b/tests/collections/common/test_lhotse_nemo_adapters.py
@@ -280,7 +280,7 @@ def nemo_manifest_path_sample_rate(tmp_path_factory):
                 "lang": "en",
             }
         )
-    p = tmpdir / "nemo_manifest_sr.json"
+    p = tmpdir / "nemo_manifest_sr.jsonl"
     save_to_jsonl(nemo, p)
     return p
 
@@ -304,3 +304,18 @@ def test_lazy_nemo_iterator_sample_rate_fallback(nemo_manifest_path_sample_rate)
 
         # sample_rate should not leak into custom fields
         assert "sample_rate" not in (c.custom or {})
+
+
+def test_lazy_nemo_iterator_sample_rate_fallback_indexed(nemo_manifest_path_sample_rate):
+    """The shared dict-to-cut helper must also consume 'sample_rate' in indexed mode."""
+    indexing = pytest.importorskip("lhotse.indexing")
+    indexing.create_jsonl_index(str(nemo_manifest_path_sample_rate))
+
+    cuts = CutSet(LazyNeMoIterator(nemo_manifest_path_sample_rate, indexed=True))
+
+    assert len(cuts) == 2
+    for c in cuts:
+        assert isinstance(c, MonoCut)
+        assert c.sampling_rate == 16000
+        assert c.recording.sampling_rate == 16000
+        assert "sample_rate" not in (c.custom or {})
diff --git a/tests/collections/common/test_lhotse_per_rank_stateful_loader.py b/tests/collections/common/test_lhotse_per_rank_stateful_loader.py
new file mode 100644
index 000000000000..9eb6dc200c58
--- /dev/null
+++ b/tests/collections/common/test_lhotse_per_rank_stateful_loader.py
@@ -0,0 +1,243 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+"""Regression tests for ``_PerRankStatefulDataLoader``.
+
+The wrapper exists because Lightning's ``FitLoop`` saves
+``CombinedLoader._state_dicts()`` (which captures rank-0's
+``StatefulDataLoader.state_dict()`` only) and replays it on every rank on
+resume — broadcasting rank-0's iterator state to every rank and corrupting
+per-shard partitioning. The wrapper intercepts that pipeline so the saved
+payload is a per-rank list and the load picks the right entry.
+
+These tests intentionally do not spin up torch.distributed; unless a test
+monkeypatches it, the all_gather path is the trivial 1-rank fallback. The
+:func:`test_load_picks_correct_rank_entry` test simulates the multi-rank
+case by handing the wrapper an externally-built per-rank state dict and
+asserting the right inner state lands on the inner loader (proxied by a
+stub that records what ``load_state_dict`` was called with).
+"""
+
+from __future__ import annotations
+
+import sys
+import types
+
+import pytest
+import torch
+
+from nemo.collections.common.data.lhotse.dataloader import _build_dataloader, _PerRankStatefulDataLoader
+
+
+class _StubStatefulDataLoader:
+    """Stand-in for ``torchdata.stateful_dataloader.StatefulDataLoader``.
+
+    The wrapper's tests only need ``state_dict()`` and ``load_state_dict()``
+    to be observable; they don't care about iteration. We install this stub
+    as the ``StatefulDataLoader`` import inside the wrapper module so the
+    test runs without ``torchdata`` and stays focused on the gather/scatter
+    logic we own.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        self.kwargs = kwargs
+        self._state: dict = {"position": 0, "shard_id": None}
+        self.load_calls: list[dict] = []
+
+    def state_dict(self) -> dict:
+        return dict(self._state)
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        # record the call so tests can assert what was applied.
+        self.load_calls.append(state_dict)
+        self._state.update(state_dict)
+
+
+@pytest.fixture(autouse=True)
+def _patch_stateful_loader(monkeypatch):
+    """Make ``from torchdata.stateful_dataloader import StatefulDataLoader``
+    inside the wrapper resolve to our stub."""
+    fake_module = types.ModuleType("torchdata.stateful_dataloader")
+    fake_module.StatefulDataLoader = _StubStatefulDataLoader
+    fake_pkg = types.ModuleType("torchdata")
+    fake_pkg.stateful_dataloader = fake_module
+    monkeypatch.setitem(sys.modules, "torchdata", fake_pkg)
+    monkeypatch.setitem(sys.modules, "torchdata.stateful_dataloader", fake_module)
+
+
+def _new_wrapper(dp_rank: int, dp_world_size: int, dp_group=None) -> _PerRankStatefulDataLoader:
+    return _PerRankStatefulDataLoader(
+        dp_rank=dp_rank,
+        dp_world_size=dp_world_size,
+        dp_group=dp_group,
+        # The stub only stores constructor kwargs and never acts on them; we
+        # still pass representative values so the call mirrors real usage.
+        dataset=object(),
+        num_workers=4,
+    )
+
+
+def test_build_dataloader_forwards_dp_group():
+    dp_group = object()
+
+    dl = _build_dataloader(
+        use_stateful_dataloader=True,
+        dp_rank=1,
+        dp_world_size=2,
+        dp_group=dp_group,
+        dataset=object(),
+        batch_size=None,
+        num_workers=0,
+    )
+
+    assert isinstance(dl, _PerRankStatefulDataLoader)
+    assert dl._dp_group is dp_group
+
+
+def test_state_dict_all_gather_uses_dp_group(monkeypatch):
+    dp_group = object()
+    dl = _new_wrapper(dp_rank=1, dp_world_size=2, dp_group=dp_group)
+    dl._inner._state = {"position": 43, "shard_id": 1}
+    calls = []
+
+    monkeypatch.setattr(torch.distributed, "is_available", lambda: True)
+    monkeypatch.setattr(torch.distributed, "is_initialized", lambda: True)
+
+    def fake_all_gather_object(per_rank, tagged, group=None):
+        calls.append(group)
+        per_rank[0] = {"dp_rank": 0, "dp_world_size": 2, "state": {"position": 42, "shard_id": 0}}
+        per_rank[1] = tagged
+
+    monkeypatch.setattr(torch.distributed, "all_gather_object", fake_all_gather_object)
+
+    sd = dl.state_dict()
+
+    assert calls == [dp_group]
+    assert sd["train_dataloader_per_rank"][1]["state"] == {"position": 43, "shard_id": 1}
+
+
+def test_state_dict_single_rank_wraps_with_per_rank_list():
+    dl = _new_wrapper(dp_rank=0, dp_world_size=1)
+    dl._inner._state = {"position": 42, "shard_id": 0}
+
+    sd = dl.state_dict()
+
+    assert list(sd.keys()) == ["train_dataloader_per_rank"]
+    per_rank = sd["train_dataloader_per_rank"]
+    assert isinstance(per_rank, list) and len(per_rank) == 1
+    assert per_rank[0] == {
+        "dp_rank": 0,
+        "dp_world_size": 1,
+        "state": {"position": 42, "shard_id": 0},
+    }
+
+
+def test_load_state_dict_single_rank_unwraps_and_applies():
+    dl = _new_wrapper(dp_rank=0, dp_world_size=1)
+
+    dl.load_state_dict(
+        {
+            "train_dataloader_per_rank": [
+                {"dp_rank": 0, "dp_world_size": 1, "state": {"position": 99, "shard_id": 0}},
+            ]
+        }
+    )
+
+    assert dl._inner.load_calls == [{"position": 99, "shard_id": 0}]
+
+
+def test_load_picks_correct_rank_entry():
+    """Hand a 32-rank per_rank list to a wrapper bound to rank 28; assert
+    the inner loader receives rank 28's entry only.
+
+    This is the regression for the 2026-05-14 LOAD-side bug: Lightning's
+    FitLoop replays the saved state on every rank, and historically rank 28
+    ended up applying rank 0's worker-0 state because the broadcast came
+    from FitLoop AFTER our DataModule's per-rank load. With the wrapper,
+    even the FitLoop's broadcast goes through the right per-rank scatter.
+    """
+    world = 32
+    rank = 28
+    per_rank = [
+        {
+            "dp_rank": r,
+            "dp_world_size": world,
+            "state": {"position": 100 + r, "shard_id": r * 4},
+        }
+        for r in range(world)
+    ]
+
+    dl = _new_wrapper(dp_rank=rank, dp_world_size=world)
+    dl.load_state_dict({"train_dataloader_per_rank": per_rank})
+
+    assert len(dl._inner.load_calls) == 1
+    applied = dl._inner.load_calls[0]
+    assert applied == {"position": 100 + rank, "shard_id": rank * 4}, (
+        "Wrapper must consume per_rank[self._dp_rank] — bug would manifest "
+        "as applying per_rank[0] (rank-0 broadcast collapse)."
+    )
+
+
+def test_load_rejects_world_size_mismatch():
+    dl = _new_wrapper(dp_rank=0, dp_world_size=32)
+    with pytest.raises(RuntimeError, match="dp_world_size"):
+        dl.load_state_dict(
+            {
+                "train_dataloader_per_rank": [
+                    {"dp_rank": 0, "dp_world_size": 4, "state": {}},
+                    {"dp_rank": 1, "dp_world_size": 4, "state": {}},
+                    {"dp_rank": 2, "dp_world_size": 4, "state": {}},
+                    {"dp_rank": 3, "dp_world_size": 4, "state": {}},
+                ]
+            }
+        )
+
+
+def test_load_rejects_tag_mismatch():
+    dl = _new_wrapper(dp_rank=0, dp_world_size=2)
+    with pytest.raises(RuntimeError, match=r"tagged \(dp_rank=1"):
+        dl.load_state_dict(
+            {
+                "train_dataloader_per_rank": [
+                    # the entry at index 0 claims to be rank 1 — must reject.
+                    {"dp_rank": 1, "dp_world_size": 2, "state": {}},
+                    {"dp_rank": 1, "dp_world_size": 2, "state": {}},
+                ]
+            }
+        )
+
+
+def test_load_rejects_bare_inner_state():
+    """Strict wire format: a state dict without the
+    ``train_dataloader_per_rank`` top-level key is rejected. This guards
+    against the legacy code path (``DataModule.load_state_dict`` calling
+    ``dl.load_state_dict(entry["state"])`` with the raw inner state) and
+    against Lightning's FitLoop broadcasting rank-0's
+    ``StatefulDataLoader.state_dict()`` — both would otherwise look like
+    valid bare inner state and produce wrong, silently-corrupt resumes.
+    """
+    dl = _new_wrapper(dp_rank=0, dp_world_size=1)
+
+    with pytest.raises(RuntimeError, match="train_dataloader_per_rank"):
+        dl.load_state_dict({"position": 7, "shard_id": 0})
+
+    # an inner-shaped state (with ``_snapshot._worker_snapshots`` etc.) —
+    # what Lightning's FitLoop used to feed back — must be rejected too.
+    with pytest.raises(RuntimeError, match="train_dataloader_per_rank"):
+        dl.load_state_dict(
+            {
+                "_iterator_finished": False,
+                "_snapshot": {"_worker_snapshots": {"worker_0": {}}},
+                "_steps_since_snapshot": 0,
+            }
+        )
+
+
+def test_empty_state_is_a_noop():
+    dl = _new_wrapper(dp_rank=0, dp_world_size=1)
+    dl.load_state_dict({})
+    assert dl._inner.load_calls == []
diff --git a/tests/collections/common/test_validate_dataloader.py b/tests/collections/common/test_validate_dataloader.py
new file mode 100644
index 000000000000..8f0c1687cce4
--- /dev/null
+++ b/tests/collections/common/test_validate_dataloader.py
@@ -0,0 +1,366 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+"""Unit tests for ``scripts/dataloading/_validate_dataloader/{config_inject,pre_validation,consolidate}``.
+
+These cover the parts that can run without a SLURM cluster or real
+Lhotse manifests:
+
+  * pre-validation static checks across hand-crafted config snippets
+  * consolidate() against synthesized JSONL rows (PASS / FAIL / SKIP)
+  * config_inject recursive walker
+"""
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+import pytest
+from omegaconf import OmegaConf
+
+# The validator lives under scripts/, which isn't on PYTHONPATH by default.
+REPO_ROOT = Path(__file__).resolve().parents[3]
+sys.path.insert(0, str(REPO_ROOT / "scripts" / "dataloading"))
+
+from _validate_dataloader import config_inject
+from _validate_dataloader import consolidate as cons  # noqa: E402
+from _validate_dataloader import pre_validation as pv
+
+# --------------------------------------------------------------------------- #
+# config_inject
+# --------------------------------------------------------------------------- #
+
+
+@pytest.mark.unit
+def test_config_inject_top_level_and_nested():
+    cfg = OmegaConf.create(
+        {
+            "input_cfg": [
+                {
+                    "type": "lhotse_as_conversation",
+                    "input_cfg": [
+                        {"type": "lhotse_shar", "weight": 1.0},
+                        {"type": "nemo_tarred", "weight": 0.5},
+                    ],
+                },
+                {"type": "group", "input_cfg": [{"type": "lhotse_shar", "weight": 0.3}]},
+            ],
+        }
+    )
+    config_inject.inject_validator_flags(cfg, force_finite=True, metadata_only=True)
+    assert cfg["force_finite"] is True
+    assert cfg["metadata_only"] is True
+    for transform in cfg["input_cfg"]:
+        assert transform["force_finite"] is True
+        assert transform["metadata_only"] is True
+        for leaf in transform["input_cfg"]:
+            assert leaf["force_finite"] is True
+            assert leaf["metadata_only"] is True
+
+
+@pytest.mark.unit
+def test_config_inject_preserves_existing_explicit_value():
+    cfg = OmegaConf.create({"input_cfg": [{"type": "lhotse_shar", "force_finite": False}]})
+    config_inject.inject_validator_flags(cfg, force_finite=True, metadata_only=False)
+    # Leaf had explicit override — preserve it.
+    assert cfg["input_cfg"][0]["force_finite"] is False
+
+
+# --------------------------------------------------------------------------- #
+# pre_validation
+# --------------------------------------------------------------------------- #
+
+
+def _base_cfg():
+    return OmegaConf.create(
+        {
+            "seed": 42,
+            "shard_seed": 42,
+            "use_stateful_dataloader": True,
+            "indexed": True,
+            "indexes_root": "/tmp/idx_does_not_exist_locally",
+            "use_bucketing": True,
+            "num_buckets": 20,
+            "bucket_buffer_size": 20000,
+            "force_map_dataset": False,
+            "text_field": "answer",
+            "input_cfg": [
+                {
+                    "type": "lhotse_as_conversation",
+                    "input_cfg": [
+                        {"type": "lhotse_shar", "weight": 1.0, "corpus": "ami"},
+                        {
+                            "type": "nemo_tarred",
+                            "weight": 0.13,
+                            "corpus": "librilight",
+                            "text_field": "answer",
+                            "manifest_filepath": "s3://x/manifest__OP_0..15_CL_.jsonl",
+                            "tarred_audio_filepaths": "s3://x/audio__OP_0..15_CL_.tar",
+                        },
+                    ],
+                },
+            ],
+        }
+    )
+
+
+@pytest.mark.unit
+def test_pre_validation_passing_config():
+    report = pv.run_pre_validation(_base_cfg())
+    fails = [c for c in report.checks if c.status == pv.FAIL]
+    assert not fails, f"unexpected FAILs: {[(c.check_id, c.detail) for c in fails]}"
+
+
+@pytest.mark.unit
+def test_pre_validation_seed_int_fail():
+    cfg = _base_cfg()
+    cfg.seed = "randomized"
+    report = pv.run_pre_validation(cfg)
+    seed_check = next(c for c in report.checks if c.check_id == "seed-int")
+    assert seed_check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_shard_seed_int_fail():
+    cfg = _base_cfg()
+    cfg.shard_seed = "randomized"
+    report = pv.run_pre_validation(cfg)
+    shard_check = next(c for c in report.checks if c.check_id == "shard-seed-int")
+    assert shard_check.status == pv.FAIL
+    mux_check = next(c for c in report.checks if c.check_id == "mux-seed-not-randomized")
+    # force_map_dataset is False in base config, so this also fires.
+    assert mux_check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_stateful_off_fail():
+    cfg = _base_cfg()
+    cfg.use_stateful_dataloader = False
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "stateful-on")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_indexed_implies_root_fail():
+    cfg = _base_cfg()
+    cfg.indexes_root = None
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "indexed-implies-root")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_constant_time_leaves_fail_when_streaming():
+    cfg = _base_cfg()
+    cfg.indexed = False  # turns off propagation -> all leaves go streaming
+    cfg.indexes_root = None  # avoid the dependent indexed-implies-root failing on its own.
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "constant-time-leaves")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_constant_time_leaves_fail_for_map_style_too():
+    """Constant-time leaves are required for both map-style
+    (force_map_dataset=True) and iterable (force_map_dataset=False) datasets."""
+    cfg = _base_cfg()
+    cfg.force_map_dataset = True
+    cfg.indexed = False
+    cfg.indexes_root = None
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "constant-time-leaves")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_slice_length_with_indexed_fail():
+    cfg = _base_cfg()
+    cfg["input_cfg"][0]["input_cfg"][0]["slice_length"] = 50
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "slice-length-vs-indexed")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_mux_weights_sum_fail():
+    cfg = _base_cfg()
+    cfg["input_cfg"][0]["input_cfg"][0]["weight"] = -1.0
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "mux-weights-sum")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_ignore_fail_downgrades_to_warn():
+    cfg = _base_cfg()
+    cfg.seed = "randomized"
+    report = pv.run_pre_validation(cfg, ignore_fail=["seed-int"])
+    check = next(c for c in report.checks if c.check_id == "seed-int")
+    assert check.status == pv.WARN
+
+
+@pytest.mark.unit
+def test_pre_validation_bucketer_buffer_warn():
+    cfg = _base_cfg()
+    cfg.bucket_buffer_size = 50  # < 20 * 10
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "bucketer-buffer")
+    assert check.status == pv.WARN
+
+
+# --------------------------------------------------------------------------- #
+# consolidate
+# --------------------------------------------------------------------------- #
+
+
+def _write_jsonl(path: Path, rows: list[dict]):
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w") as f:
+        for r in rows:
+            f.write(json.dumps(r) + "\n")
+
+
+def _row(rank, step, cut_ids, *, worker_id=0):
+    return {
+        "step": step,
+        "rank": rank,
+        "world_size": 2,
+        "worker_id": worker_id,
+        "cut_ids": cut_ids,
+        "batch_size": len(cut_ids),
+        "t_total_ms": 1.0,
+        "t_first_batch_ms": None,
+    }
+
+
+@pytest.mark.unit
+def test_consolidate_q1_q3_pass(tmp_path):
+    """Two ranks, disjoint cuts, no duplication."""
+    base = tmp_path / "baseline" / "run0"
+    _write_jsonl(
+        base / "rank_000.jsonl",
+        [
+            _row(0, 0, ["a", "b"]),
+            _row(0, 1, ["c"]),
+        ],
+    )
+    _write_jsonl(
+        base / "rank_001.jsonl",
+        [
+            _row(1, 0, ["d", "e"]),
+            _row(1, 1, ["f"]),
+        ],
+    )
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q_by_id = {q.q_id: q for q in report.questions}
+    assert q_by_id["Q1"].status == cons.PASS
+    assert q_by_id["Q3"].status == cons.PASS
+
+
+@pytest.mark.unit
+def test_consolidate_q1_cross_rank_leak(tmp_path):
+    base = tmp_path / "baseline" / "run0"
+    _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, ["shared", "a"])])
+    _write_jsonl(base / "rank_001.jsonl", [_row(1, 0, ["shared", "b"])])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q1 = next(q for q in report.questions if q.q_id == "Q1")
+    assert q1.status == cons.FAIL
+    assert q1.tag == "partition-rank-leak"
+
+
+@pytest.mark.unit
+def test_consolidate_q3_full_broadcast(tmp_path):
+    """Every rank sees the same cuts → Q3 FAILs and its detail reports BROADCAST."""
+    base = tmp_path / "baseline" / "run0"
+    same = ["a", "b", "c"]
+    _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, same)])
+    _write_jsonl(base / "rank_001.jsonl", [_row(1, 0, same)])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q3 = next(q for q in report.questions if q.q_id == "Q3")
+    assert q3.status == cons.FAIL
+    assert "BROADCAST" in q3.detail
+
+
+@pytest.mark.unit
+def test_consolidate_q2_skip_without_groundtruth(tmp_path):
+    base = tmp_path / "baseline" / "run0"
+    _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, ["a"])])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q2 = next(q for q in report.questions if q.q_id == "Q2")
+    assert q2.status == cons.SKIP
+
+
+@pytest.mark.unit
+def test_consolidate_q2_skip_detects_missing(tmp_path):
+    base = tmp_path / "baseline" / "run0"
+    _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, ["a", "b"])])
+    _write_jsonl(tmp_path / "groundtruth" / "cuts.jsonl", [{"cut_ids": ["a", "b", "c"]}])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q2 = next(q for q in report.questions if q.q_id == "Q2")
+    assert q2.status == cons.FAIL
+    assert q2.tag == "skip"
+
+
+@pytest.mark.unit
+def test_consolidate_q4_resume_match(tmp_path):
+    """State is saved AFTER yielding baseline step ``checkpoint_at``, so
+    resumed[0] should match baseline[checkpoint_at + 1]."""
+    base = tmp_path / "baseline" / "run0"
+    res = tmp_path / "resumed" / "run0"
+    _write_jsonl(
+        base / "rank_000.jsonl",
+        [
+            _row(0, 0, ["a"]),
+            _row(0, 1, ["b"]),
+            _row(0, 2, ["c"]),
+        ],
+    )
+    # checkpoint_at=0 -> resumed[0] == baseline[1] == ["b"], resumed[1] == baseline[2] == ["c"]
+    _write_jsonl(
+        res / "rank_000.jsonl",
+        [
+            _row(0, 0, ["b"]),
+            _row(0, 1, ["c"]),
+        ],
+    )
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q4 = next(q for q in report.questions if q.q_id == "Q4")
+    assert q4.status == cons.PASS
+
+
+@pytest.mark.unit
+def test_consolidate_q4_resume_diverges(tmp_path):
+    base = tmp_path / "baseline" / "run0"
+    res = tmp_path / "resumed" / "run0"
+    _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, ["a"]), _row(0, 1, ["b"]), _row(0, 2, ["c"])])
+    # checkpoint_at=0 -> resumed[0] should == baseline[1] == ["b"], but it's "DIFFERENT".
+    _write_jsonl(res / "rank_000.jsonl", [_row(0, 0, ["DIFFERENT"])])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q4 = next(q for q in report.questions if q.q_id == "Q4")
+    assert q4.status == cons.FAIL
+    assert q4.tag == "resume-rng-divergence"
+
+
+@pytest.mark.unit
+def test_consolidate_q5_determinism_match(tmp_path):
+    for run in ("run0", "run1"):
+        _write_jsonl(tmp_path / "baseline" / run / "rank_000.jsonl", [_row(0, 0, ["a"]), _row(0, 1, ["b"])])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=2)
+    q5 = next(q for q in report.questions if q.q_id == "Q5")
+    assert q5.status == cons.PASS
+
+
+@pytest.mark.unit
+def test_consolidate_q5_determinism_diverges(tmp_path):
+    _write_jsonl(tmp_path / "baseline" / "run0" / "rank_000.jsonl", [_row(0, 0, ["a"])])
+    _write_jsonl(tmp_path / "baseline" / "run1" / "rank_000.jsonl", [_row(0, 0, ["DIFFERENT"])])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=2)
+    q5 = next(q for q in report.questions if q.q_id == "Q5")
+    assert q5.status == cons.FAIL
+    assert q5.tag == "non-determinism"
diff --git a/tests/collections/speechlm2/test_datamodule.py b/tests/collections/speechlm2/test_datamodule.py
index 253ded844aa1..ca5e5dceee7d 100644
--- a/tests/collections/speechlm2/test_datamodule.py
+++ b/tests/collections/speechlm2/test_datamodule.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from types import SimpleNamespace
+
 import pytest
 import torch
 from lhotse import CutSet
@@ -18,6 +20,7 @@
 from lightning.pytorch.utilities import CombinedLoader
 from omegaconf import DictConfig
 
+import nemo.collections.speechlm2.data.datamodule as datamodule_module
 from nemo.collections.common.data.lhotse.broadcasting import BroadcastingDataLoader
 from nemo.collections.common.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer, create_spt_model
 from nemo.collections.speechlm2.data import DataModule
@@ -100,6 +103,59 @@ def test_datamodule_train_dataloader(data_config, tokenizer):
     assert all(c.tag == "train" for c in batch)
 
 
+def test_datamodule_train_dataloader_caches_broadcast_wrapper_and_passes_dp_group(data_config, tokenizer, monkeypatch):
+    data = DataModule(data_config, tokenizer=tokenizer, dataset=Identity())
+    mesh = SimpleNamespace(mesh_dim_names=())
+    dp_group = object()  # opaque sentinels, compared by identity below
+    source = object()
+    calls = []
+    # Stub the distributed context (DP rank 3, world size 8); is_dp_source_rank is true only for `mesh`.
+    monkeypatch.setattr(data, "_get_device_mesh", lambda: mesh)
+    monkeypatch.setattr(data, "_get_dp_rank", lambda: 3)
+    monkeypatch.setattr(data, "_get_world_size", lambda: 8)
+    monkeypatch.setattr(data, "_get_dp_group", lambda: dp_group)
+    monkeypatch.setattr(datamodule_module, "is_dp_source_rank", lambda candidate: candidate is mesh)
+
+    def fake_get_lhotse_dataloader_from_config(**kwargs):
+        calls.append(kwargs)  # remember the keyword arguments of every build request
+        return source
+
+    monkeypatch.setattr(
+        datamodule_module,
+        "get_lhotse_dataloader_from_config",
+        fake_get_lhotse_dataloader_from_config,
+    )
+    # Two calls must yield the same wrapper, with the source loader built exactly once.
+    dl1 = data.train_dataloader()
+    dl2 = data.train_dataloader()
+
+    assert dl1 is dl2
+    assert isinstance(dl1, BroadcastingDataLoader)
+    assert dl1._source is source
+    assert len(calls) == 1
+    assert calls[0]["global_rank"] == 3  # the stubbed DP rank is forwarded as global_rank
+    assert calls[0]["world_size"] == 8
+    assert calls[0]["dp_group"] is dp_group
+
+
+def test_datamodule_train_dataloader_non_source_rank_does_not_build_source(data_config, tokenizer, monkeypatch):
+    data = DataModule(data_config, tokenizer=tokenizer, dataset=Identity())
+    mesh = SimpleNamespace(mesh_dim_names=())
+    # The mesh lookup is stubbed and is_dp_source_rank always answers False.
+    monkeypatch.setattr(data, "_get_device_mesh", lambda: mesh)
+    monkeypatch.setattr(datamodule_module, "is_dp_source_rank", lambda candidate: False)
+
+    def fail_if_called(**kwargs):  # any attempt to build a source loader fails the test
+        raise AssertionError("non-source CP/TP ranks must not build a Lhotse source loader")
+
+    monkeypatch.setattr(datamodule_module, "get_lhotse_dataloader_from_config", fail_if_called)
+    # A wrapper must still be returned, just without a source loader behind it.
+    dl = data.train_dataloader()
+
+    assert isinstance(dl, BroadcastingDataLoader)
+    assert dl._source is None
+
+
 def test_datamodule_validation_dataloader(data_config, tokenizer):
     val_sets = {"val_set_0", "val_set_1"}
     data = DataModule(data_config, tokenizer=tokenizer, dataset=Identity())
diff --git a/tests/collections/speechlm2/test_salm_automodel.py b/tests/collections/speechlm2/test_salm_automodel.py
index 2f007671a8bc..25c0885f0123 100644
--- a/tests/collections/speechlm2/test_salm_automodel.py
+++ b/tests/collections/speechlm2/test_salm_automodel.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import inspect
 import os
 
 import pytest
@@ -192,12 +193,32 @@ def test_salm_automodel_training_step(model, dataset, prompt_formatter, training
     training_cutset_batch = training_cutset_batch.map(lambda c: c.apply_prompt_format(prompt_formatter), apply_fn=None)
     batch = dataset[training_cutset_batch]
     batch = move_data_to_device(batch, device=model.device)
-    results = model.training_step(batch, batch_idx=0)
+    results = model._training_step_batch(batch, batch_idx=0)
     assert torch.is_tensor(results["loss"])
     assert not torch.isnan(results["loss"])
     assert results["loss"] > 0
 
 
+def test_salm_automodel_training_step_uses_dataloader_iter_signature():  # pins the exact parameter list
+    assert list(inspect.signature(SALMAutomodel.training_step).parameters) == ["self", "dataloader_iter"]
+
+
+def test_salm_automodel_record_training_stats_uses_thd_metadata():  # counts must come from `inputs`
+    model = SALMAutomodel.__new__(SALMAutomodel)  # bare instance; __init__ is bypassed
+    batch = {"input_ids": torch.zeros(3, 7, dtype=torch.long)}
+    inputs = {
+        "input_embeds": torch.zeros(5, 4),
+        "attention_mask": None,
+        "num_tokens": torch.tensor(11),
+        "num_examples": torch.tensor(3),
+    }
+
+    model._record_training_stats(batch, inputs)
+
+    assert model._last_batch_num_tokens == 11  # equals inputs["num_tokens"], not the 3x7 input_ids size
+    assert model._last_batch_num_examples == 3
+
+
 @requires_cuda
 def test_salm_automodel_validation_step(model, dataset, prompt_formatter, training_cutset_batch):
     model.on_validation_epoch_start()
diff --git a/tests/collections/speechlm2/test_salm_automodel_lora.py b/tests/collections/speechlm2/test_salm_automodel_lora.py
index 16f907e4228b..b11585f2a75a 100644
--- a/tests/collections/speechlm2/test_salm_automodel_lora.py
+++ b/tests/collections/speechlm2/test_salm_automodel_lora.py
@@ -335,7 +335,7 @@ def test_lora_training_step(salm_with_lora, dataset, prompt_formatter, training_
     training_cutset_batch = training_cutset_batch.map(lambda c: c.apply_prompt_format(prompt_formatter), apply_fn=None)
     batch = dataset[training_cutset_batch]
     batch = move_data_to_device(batch, device=salm_with_lora.device)
-    results = salm_with_lora.training_step(batch, batch_idx=0)
+    results = salm_with_lora._training_step_batch(batch, batch_idx=0)
     assert torch.is_tensor(results["loss"])
     assert not torch.isnan(results["loss"])
     assert results["loss"] > 0
diff --git a/tests/collections/speechlm2/test_salm_automodel_pee.py b/tests/collections/speechlm2/test_salm_automodel_pee.py
index f98f0c8a57b5..f4cd8a4083d7 100644
--- a/tests/collections/speechlm2/test_salm_automodel_pee.py
+++ b/tests/collections/speechlm2/test_salm_automodel_pee.py
@@ -247,7 +247,7 @@ def test_salm_automodel_pee_training_step(model, dataset, prompt_formatter, trai
     batch = dataset[training_cutset_batch]
     assert "spk_targets" in batch  # injected as spk_targets into the PE encoder during training
     batch = move_data_to_device(batch, device=model.device)
-    results = model.training_step(batch, batch_idx=0)
+    results = model._training_step_batch(batch, batch_idx=0)
     assert torch.is_tensor(results["loss"])
     assert not torch.isnan(results["loss"])
     assert results["loss"] > 0
diff --git a/tests/collections/speechlm2/test_salm_cp_helpers.py b/tests/collections/speechlm2/test_salm_cp_helpers.py
index 669f7bfdf8e4..4fe29832061e 100644
--- a/tests/collections/speechlm2/test_salm_cp_helpers.py
+++ b/tests/collections/speechlm2/test_salm_cp_helpers.py
@@ -153,13 +153,19 @@ class _TrainablePerceptionStub(torch.nn.Module):
     def __init__(self):
         super().__init__()
         self.scale = torch.nn.Parameter(torch.tensor(2.0))
+        self.num_calls = 0
+        self.last_input_signal_shape = None
+        self.last_input_signal_length = None
         self.spk_targets_calls = []
 
     def forward(self, *, input_signal, input_signal_length, spk_targets=None):
+        self.num_calls += 1
+        self.last_input_signal_shape = tuple(input_signal.shape)
+        self.last_input_signal_length = input_signal_length.detach().cpu().tolist()
         self.spk_targets_calls.append(None if spk_targets is None else spk_targets.detach().clone())
         B = input_signal.shape[0]
-        embs = input_signal[:, :2].unsqueeze(-1) * self.scale
-        lens = torch.full((B,), 2, dtype=input_signal_length.dtype, device=input_signal_length.device)
+        embs = input_signal[:, : max(1, min(2, input_signal.shape[1]))].unsqueeze(-1) * self.scale
+        lens = torch.full((B,), embs.shape[1], dtype=input_signal_length.dtype, device=input_signal_length.device)
         return embs, lens
 
 
@@ -199,3 +205,101 @@ def fake_lens_all_gather(gathered_lens, local_lens, group):
     embs[0].sum().backward()
     assert perception.scale.grad is not None
     assert perception.scale.grad.item() == pytest.approx(3.0)
+
+
+def test_encode_audio_empty_rank_runs_dummy_when_fsdp_group_has_audio(monkeypatch):
+    perception = _TrainablePerceptionStub()
+    audios = torch.zeros(0, 1600, dtype=torch.float32)  # zero rows: this rank holds no audio
+    audio_lens = torch.zeros(0, dtype=torch.long)
+    all_reduce_calls = []
+
+    def fake_all_reduce(tensor, op=None, group=None):
+        all_reduce_calls.append((int(tensor.item()), group))  # record the value seen before "reducing"
+        tensor.fill_(1)  # NOTE(review): presumably emulates "another rank in the group has audio"
+
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_available", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_initialized", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.all_reduce", fake_all_reduce)
+
+    embs, dummy_audio_loss = encode_audio_with_cp_distribution(
+        perception,
+        audios,
+        audio_lens,
+        chunk_size_seconds=None,
+        sampling_rate=16000,
+        cp_mesh=None,
+        fsdp_sync_group="fake-fsdp-group",
+        return_dummy_loss=True,
+    )
+    # No real embeddings, but the encoder ran once on a (1, 16000) dummy whose zero loss still reaches `scale`.
+    assert embs == []
+    assert perception.num_calls == 1
+    assert perception.last_input_signal_shape == (1, 16000)
+    assert perception.last_input_signal_length == [16000]
+    assert all_reduce_calls == [(0, "fake-fsdp-group")]  # one reduce; the local value was 0 going in
+    assert dummy_audio_loss is not None
+    assert dummy_audio_loss.requires_grad
+    assert dummy_audio_loss.item() == pytest.approx(0.0)
+    dummy_audio_loss.backward()
+    assert perception.scale.grad is not None
+    assert perception.scale.grad.item() == pytest.approx(0.0)
+
+
+def test_encode_audio_empty_rank_skips_dummy_when_fsdp_group_has_no_audio(monkeypatch):
+    perception = _TrainablePerceptionStub()
+    audios = torch.zeros(0, 1600, dtype=torch.float32)  # zero rows: this rank holds no audio
+    audio_lens = torch.zeros(0, dtype=torch.long)
+    all_reduce_calls = []
+
+    def fake_all_reduce(tensor, op=None, group=None):
+        all_reduce_calls.append((int(tensor.item()), group))  # record only; the tensor is not modified
+
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_available", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_initialized", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.all_reduce", fake_all_reduce)
+
+    embs, dummy_audio_loss = encode_audio_with_cp_distribution(
+        perception,
+        audios,
+        audio_lens,
+        chunk_size_seconds=None,
+        sampling_rate=16000,
+        cp_mesh=None,
+        fsdp_sync_group="fake-fsdp-group",
+        return_dummy_loss=True,
+    )
+    # The probe value stays 0, so the encoder must not be invoked and no dummy loss is returned.
+    assert embs == []
+    assert perception.num_calls == 0
+    assert all_reduce_calls == [(0, "fake-fsdp-group")]
+    assert dummy_audio_loss is None
+
+
+def test_encode_audio_nonempty_rank_participates_in_fsdp_audio_probe(monkeypatch):
+    perception = _TrainablePerceptionStub()
+    audios = torch.tensor([[1.0, 2.0, 0.0]])  # one row of three samples
+    audio_lens = torch.tensor([3], dtype=torch.long)
+    all_reduce_calls = []
+
+    def fake_all_reduce(tensor, op=None, group=None):
+        all_reduce_calls.append((int(tensor.item()), group))  # record only; the tensor is not modified
+
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_available", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_initialized", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.all_reduce", fake_all_reduce)
+
+    embs, dummy_audio_loss = encode_audio_with_cp_distribution(
+        perception,
+        audios,
+        audio_lens,
+        chunk_size_seconds=None,
+        sampling_rate=16000,
+        cp_mesh=None,
+        fsdp_sync_group="fake-fsdp-group",
+        return_dummy_loss=True,
+    )
+    # A rank that has audio still joins the all_reduce (with value 1) and needs no dummy loss.
+    assert len(embs) == 1
+    assert perception.num_calls == 1
+    assert all_reduce_calls == [(1, "fake-fsdp-group")]
+    assert dummy_audio_loss is None
diff --git a/tests/core_ptl/test_resumable_dataloader_iter.py b/tests/core_ptl/test_resumable_dataloader_iter.py
new file mode 100644
index 000000000000..62f96fa77dab
--- /dev/null
+++ b/tests/core_ptl/test_resumable_dataloader_iter.py
@@ -0,0 +1,303 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Regression tests for ``training_step(dataloader_iter)`` resumability."""
+
+from __future__ import annotations
+
+import time
+from datetime import timedelta
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Type
+
+import lightning.pytorch as pl
+import pytest
+import torch
+import torch.nn.functional as F
+from lightning.pytorch.callbacks import ModelCheckpoint
+from lightning.pytorch.utilities.exceptions import _TunerExitException
+from torchdata.stateful_dataloader import StatefulDataLoader
+
+from nemo.core.utils.lightning_utils import read_batch
+from nemo.utils.exp_manager import StatelessTimer, configure_no_restart_validation_training_loop
+
+
+class _RangeDataset(torch.utils.data.Dataset):
+    """Tiny deterministic map-style dataset: item ``i`` reports ``i`` as its ``sample_id``."""
+
+    def __init__(self, size: int = 1000) -> None:
+        self.size = size
+
+    def __len__(self) -> int:
+        return self.size
+
+    def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
+        # "x" is index mod 7 and "y" is half of that, each a one-element float32 tensor.
+        residue = index % 7
+        features = torch.tensor([float(residue)], dtype=torch.float32)
+        targets = torch.tensor([float(residue * 0.5)], dtype=torch.float32)
+        # "sample_id" is the raw index as a 0-d int64 tensor.
+        position = torch.tensor(index, dtype=torch.long)
+        return {"sample_id": position, "x": features, "y": targets}
+
+
+class _BaseParityModel(pl.LightningModule):  # shared fixture model; subclasses add training_step
+    def __init__(self, seen: list[dict[str, int]], sleep_sec: float = 0.0) -> None:
+        super().__init__()
+        self.seen = seen  # caller-owned list; _step appends one record per call
+        self.sleep_sec = sleep_sec
+        self.proj = torch.nn.Linear(1, 1)
+        torch.nn.init.constant_(self.proj.weight, 0.25)  # constant init => identical starting weights
+        torch.nn.init.constant_(self.proj.bias, 0.0)
+
+    def train_dataloader(self):
+        return StatefulDataLoader(_RangeDataset(), batch_size=1, num_workers=0)  # one sample per batch
+
+    def val_dataloader(self):
+        return torch.utils.data.DataLoader(_RangeDataset(size=2), batch_size=1, num_workers=0)
+
+    def configure_optimizers(self):
+        return torch.optim.SGD(self.parameters(), lr=0.01)
+
+    def validation_step(self, batch, batch_idx):
+        loss = self._loss(batch)
+        self.log("val_loss", loss)
+        return loss
+
+    def _step(self, batch: dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
+        if self.sleep_sec:  # optional per-step delay (skipped when 0)
+            time.sleep(self.sleep_sec)
+        self.seen.append(  # log which sample was consumed and at what loop position
+            {
+                "sample_id": int(batch["sample_id"].item()),
+                "epoch": int(self.current_epoch),
+                "batch_idx": int(batch_idx),
+                "global_step": int(self.global_step),
+            }
+        )
+        return self._loss(batch)
+
+    def _loss(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
+        return F.mse_loss(self.proj(batch["x"].float()), batch["y"].float())  # MSE of proj(x) against y
+
+
+class _BatchStepModel(_BaseParityModel):  # variant using training_step(batch, batch_idx)
+    def training_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx)
+
+
+class _DataloaderIterStepModel(_BaseParityModel):  # variant using training_step(dataloader_iter)
+    def training_step(self, dataloader_iter):
+        batch, batch_idx = read_batch(dataloader_iter, self)  # read_batch yields the batch and its index
+        return self._step(batch, batch_idx)
+
+
+class _StopBeforeNextBatchTimer(StatelessTimer):  # timer stub whose time check always aborts
+    def _check_time_remaining(self, trainer: pl.Trainer) -> None:
+        raise _TunerExitException()
+
+
+class _CountingIterator:  # iterator that the code under test must never advance
+    consumed = False  # class-level default; __next__ sets it to True on the instance
+
+    def __next__(self):
+        self.consumed = True
+        raise AssertionError("read_batch consumed a sample after the timer requested stop")
+
+
+class _PreemptedCallback:  # minimal stand-in exposing both preemption flags as True
+    preemption_enabled = True
+    interrupted = True
+
+
+def _make_checkpoint_callback(root: Path) -> ModelCheckpoint:
+    # Build the ModelCheckpoint callback shared by every training phase.
+    # Files are written under <root>/checkpoints and named with the "{step}"
+    # template; save_last=True, save_top_k=-1 and every_n_epochs=1 are passed
+    # through unchanged (see the Lightning ModelCheckpoint docs for semantics).
+    ckpt_dir = str(root / "checkpoints")
+    policy = {"filename": "{step}", "save_last": True, "save_top_k": -1, "every_n_epochs": 1}
+    return ModelCheckpoint(dirpath=ckpt_dir, **policy)
+
+
+def _make_trainer(root: Path, max_steps: int, callbacks: list | None = None) -> pl.Trainer:  # CPU, 1 device
+    trainer = pl.Trainer(
+        accelerator="cpu",
+        devices=1,
+        default_root_dir=str(root),
+        callbacks=callbacks or [],
+        max_steps=max_steps,
+        max_epochs=10,
+        limit_train_batches=5,  # five training batches per epoch
+        val_check_interval=5,
+        num_sanity_val_steps=0,
+        logger=False,
+        enable_checkpointing=bool(callbacks),  # checkpointing only when callbacks were passed
+        enable_model_summary=False,
+        enable_progress_bar=False,
+    )
+    configure_no_restart_validation_training_loop(trainer)  # NOTE(review): NeMo loop patch; see exp_manager
+    return trainer
+
+
+def _fit(  # run one trainer.fit() and return (seen records, trainer, model)
+    root: Path,
+    model_cls: Type[_BaseParityModel],
+    max_steps: int,
+    ckpt_path: str | None = None,
+    callbacks: list | None = None,
+    sleep_sec: float = 0.0,
+) -> tuple[list[dict[str, int]], pl.Trainer, _BaseParityModel]:
+    seen: list[dict[str, int]] = []  # filled in by the model's _step during fit
+    model = model_cls(seen=seen, sleep_sec=sleep_sec)
+    trainer = _make_trainer(root, max_steps=max_steps, callbacks=callbacks)
+    trainer.fit(model, ckpt_path=ckpt_path)  # ckpt_path=None starts fresh; otherwise resumes from it
+    return seen, trainer, model
+
+
+def _fit_two_phase(root: Path, model_cls: Type[_BaseParityModel]) -> tuple[list[dict[str, int]], list[dict[str, int]]]:
+    first_callback = _make_checkpoint_callback(root / "first")
+    first, _, _ = _fit(root / "first", model_cls, max_steps=5, callbacks=[first_callback])  # phase one
+    assert first_callback.last_model_path  # phase one must have written a "last" checkpoint
+    # Phase two: a fresh trainer resumes from that checkpoint with max_steps raised to 10.
+    second_callback = _make_checkpoint_callback(root / "second")
+    second, _, _ = _fit(
+        root / "second",
+        model_cls,
+        max_steps=10,
+        ckpt_path=first_callback.last_model_path,
+        callbacks=[second_callback],
+    )
+    return first, second
+
+
+def _project(records: list[dict[str, int]], key: str) -> list[int]:
+    return list(map(lambda entry: entry[key], records))  # the `key` column, in record order
+
+
+@pytest.mark.unit
+def test_uninterrupted_dataloader_iter_matches_batch_step(tmp_path):  # both step styles, no resume
+    batch_seen, _, _ = _fit(tmp_path / "batch", _BatchStepModel, max_steps=10)
+    iter_seen, _, _ = _fit(tmp_path / "iter", _DataloaderIterStepModel, max_steps=10)
+
+    expected = list(range(5)) + list(range(5))  # samples 0..4, twice
+    assert _project(iter_seen, "sample_id") == _project(batch_seen, "sample_id") == expected
+    assert _project(iter_seen, "global_step") == _project(batch_seen, "global_step")
+    assert _project(iter_seen, "epoch") == _project(batch_seen, "epoch")
+
+
+@pytest.mark.unit
+def test_interrupted_resume_dataloader_iter_matches_batch_step(tmp_path):  # both step styles, with resume
+    batch_first, batch_second = _fit_two_phase(tmp_path / "batch", _BatchStepModel)
+    iter_first, iter_second = _fit_two_phase(tmp_path / "iter", _DataloaderIterStepModel)
+    # Phase one must see samples 0..4; the resumed phase must continue with 5..9 at global steps 5..9.
+    assert _project(iter_first, "sample_id") == _project(batch_first, "sample_id") == list(range(5))
+    assert _project(iter_second, "sample_id") == _project(batch_second, "sample_id") == list(range(5, 10))
+    assert _project(iter_second, "global_step") == _project(batch_second, "global_step") == list(range(5, 10))
+    assert _project(iter_second, "epoch") == _project(batch_second, "epoch")
+
+
+@pytest.mark.unit
+def test_resume_boundary_does_not_replay_old_epoch_batch(tmp_path):  # resume exactly at step 5
+    first_callback = _make_checkpoint_callback(tmp_path / "first")
+    first, _, _ = _fit(tmp_path / "first", _DataloaderIterStepModel, max_steps=5, callbacks=[first_callback])
+    assert _project(first, "sample_id") == list(range(5))
+    # Resume from the step-5 checkpoint in a fresh trainer and train on to step 10.
+    second_callback = _make_checkpoint_callback(tmp_path / "second")
+    second, _, _ = _fit(
+        tmp_path / "second",
+        _DataloaderIterStepModel,
+        max_steps=10,
+        ckpt_path=first_callback.last_model_path,
+        callbacks=[second_callback],
+    )
+    # The first resumed record must be sample 5 at epoch 1, batch 0, global step 5 (no replay of 0..4).
+    assert _project(second, "sample_id") == list(range(5, 10))
+    assert second[0] == {
+        "sample_id": 5,
+        "epoch": 1,
+        "batch_idx": 0,
+        "global_step": 5,
+    }
+
+
+@pytest.mark.unit
+def test_read_batch_checks_timer_before_consuming_next_sample():  # timer stop wins over next()
+    iterator = _CountingIterator()
+    model = SimpleNamespace(trainer=SimpleNamespace(callbacks=[_StopBeforeNextBatchTimer(timedelta(seconds=1))]))
+    # read_batch must raise the exit exception without ever calling next() on the iterator.
+    with pytest.raises(_TunerExitException):
+        read_batch(iterator, model)
+
+    assert not iterator.consumed
+
+
+@pytest.mark.unit
+def test_read_batch_checks_preemption_before_consuming_next_sample():  # preemption wins over next()
+    iterator = _CountingIterator()
+    trainer = SimpleNamespace(callbacks=[_PreemptedCallback()], checkpoint_callback=None)
+    model = SimpleNamespace(trainer=trainer)  # fake module exposing only `.trainer`
+    # read_batch must raise the exit exception without ever calling next() on the iterator.
+    with pytest.raises(_TunerExitException):
+        read_batch(iterator, model)
+
+    assert not iterator.consumed
+
+
+@pytest.mark.unit
+def test_read_batch_checks_lightning_sigterm_before_consuming_next_sample():  # SIGTERM flag wins over next()
+    iterator = _CountingIterator()
+    trainer = SimpleNamespace(callbacks=[], checkpoint_callback=None, received_sigterm=True)
+    model = SimpleNamespace(trainer=trainer)  # fake module exposing only `.trainer`
+    # read_batch must raise the exit exception without ever calling next() on the iterator.
+    with pytest.raises(_TunerExitException):
+        read_batch(iterator, model)
+
+    assert not iterator.consumed
+
+
+@pytest.mark.unit
+def test_timer_checkpoint_resume_has_consistent_progress_and_no_sample_drift(tmp_path):
+    checkpoint_callback = _make_checkpoint_callback(tmp_path / "timer")
+    callbacks = [checkpoint_callback, StatelessTimer(duration=timedelta(seconds=0.15))]  # 0.15 s budget
+    first, _, _ = _fit(
+        tmp_path / "timer",
+        _DataloaderIterStepModel,
+        max_steps=50,
+        callbacks=callbacks,
+        sleep_sec=0.05,  # each training step sleeps 50 ms
+    )
+
+    assert 0 < len(first) < 50  # some steps ran, but the run ended before max_steps
+    assert checkpoint_callback.last_model_path
+    ckpt = torch.load(checkpoint_callback.last_model_path, map_location="cpu", weights_only=False)
+    batch_progress = ckpt["loops"]["fit_loop"]["epoch_loop.batch_progress"]
+    saved_step = int(ckpt["global_step"])
+    # The saved progress counters must agree with the number of steps actually executed.
+    assert saved_step == len(first)
+    assert batch_progress["total"]["completed"] == saved_step
+    assert batch_progress["total"]["ready"] == saved_step
+    assert batch_progress["current"]["completed"] == batch_progress["current"]["ready"]
+
+    resumed_callback = _make_checkpoint_callback(tmp_path / "timer-resume")
+    resumed, _, _ = _fit(
+        tmp_path / "timer-resume",
+        _DataloaderIterStepModel,
+        max_steps=saved_step + 3,  # run exactly three more steps after resuming
+        ckpt_path=checkpoint_callback.last_model_path,
+        callbacks=[resumed_callback],
+    )
+    # Sample ids cycle through 0..4; the resumed run must continue the cycle where the first run stopped.
+    assert _project(first, "sample_id") == [idx % 5 for idx in range(saved_step)]
+    assert _project(resumed, "sample_id") == [(saved_step + idx) % 5 for idx in range(3)]
diff --git a/tests/utils/test_training_stats_callback.py b/tests/utils/test_training_stats_callback.py
new file mode 100644
index 000000000000..a99ccb2ff10b
--- /dev/null
+++ b/tests/utils/test_training_stats_callback.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from types import SimpleNamespace
+
+import torch
+
+from nemo.utils.callbacks.training_stats import TrainingStatsCallback
+
+
+class _DummyModule:  # minimal stand-in passed to the callback as the module argument
+    device = torch.device("cpu")
+    _last_batch_num_tokens = 5  # fixed per-batch counts that the tests expect in the totals
+    _last_batch_num_examples = 2
+
+    def __init__(self, device_mesh=None):
+        self._device_mesh = device_mesh
+        self.logged = {}  # collects everything passed to log_dict
+
+    def log_dict(self, values, **kwargs):
+        self.logged.update(values)  # extra keyword arguments are accepted and ignored
+
+
+class _FakeSubMesh:  # result of indexing _FakeMesh; only get_group() is provided
+    def __init__(self, group):
+        self._group = group
+
+    def get_group(self):
+        return self._group  # hands back the object given at construction
+
+
+class _FakeMesh:  # fake device mesh that can only be indexed by "data_parallel"
+    mesh_dim_names = ("data_parallel", "tensor_parallel")
+
+    def __init__(self, group):
+        self._group = group
+
+    def __getitem__(self, item):
+        if item != "data_parallel":  # any other dimension name is rejected
+            raise KeyError(item)
+        return _FakeSubMesh(self._group)
+
+
+def test_training_stats_callback_reduces_with_device_mesh_dp_group(monkeypatch):
+    dp_group = object()  # opaque sentinel, compared by equality/identity below
+    module = _DummyModule(device_mesh=_FakeMesh(dp_group))
+    callback = TrainingStatsCallback()
+    seen = []
+    # Make torch.distributed look initialized so the callback attempts a reduction.
+    monkeypatch.setattr(torch.distributed, "is_available", lambda: True)
+    monkeypatch.setattr(torch.distributed, "is_initialized", lambda: True)
+
+    def fake_all_reduce(tensor, op=None, group=None):
+        seen.append(group)  # record the group; the tensor is left untouched
+
+    monkeypatch.setattr(torch.distributed, "all_reduce", fake_all_reduce)
+
+    callback.on_train_batch_end(SimpleNamespace(), module, outputs=None, batch={}, batch_idx=0)
+
+    assert seen == [dp_group]  # a single reduce, on the mesh's "data_parallel" group
+    assert callback.num_tokens_total == 5
+    assert callback.num_examples_total == 2
+    assert module.logged["num_tokens_total"] == 5.0
+    assert module.logged["num_examples_total"] == 2.0
+
+
+def test_training_stats_callback_plain_ddp_uses_default_group(monkeypatch):
+    module = _DummyModule()  # no device mesh attached
+    callback = TrainingStatsCallback()
+    seen = []
+    # Make torch.distributed look initialized so the callback attempts a reduction.
+    monkeypatch.setattr(torch.distributed, "is_available", lambda: True)
+    monkeypatch.setattr(torch.distributed, "is_initialized", lambda: True)
+
+    def fake_all_reduce(tensor, op=None, group=None):
+        seen.append(group)  # record the group; the tensor is left untouched
+
+    monkeypatch.setattr(torch.distributed, "all_reduce", fake_all_reduce)
+
+    callback.on_train_batch_end(SimpleNamespace(), module, outputs=None, batch={}, batch_idx=0)
+
+    assert seen == [None]  # without a mesh the reduce is issued with group=None
diff --git a/uv.lock b/uv.lock
index a6be21cea190..a5d7b5c0646a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -3002,7 +3002,7 @@ wheels = [
 
 [[package]]
 name = "lhotse"
-version = "1.33.0"
+version = "2.0.0a2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "audioread" },
@@ -3021,9 +3021,9 @@ dependencies = [
     { name = "torch", version = "2.12.0+cu132", source = { registry = "https://download.pytorch.org/whl/cu132" }, marker = "(sys_platform == 'linux' and extra != 'extra-12-nemo-toolkit-compiled-a100' and extra == 'extra-12-nemo-toolkit-cu13') or (sys_platform == 'linux' and extra != 'extra-12-nemo-toolkit-compiled' and extra == 'extra-12-nemo-toolkit-cu13') or (extra == 'extra-12-nemo-toolkit-cu12' and extra == 'extra-12-nemo-toolkit-cu13') or (extra == 'extra-12-nemo-toolkit-compiled' and extra == 'extra-12-nemo-toolkit-compiled-a100')" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/9d/5a/b606c87b0a50322200aafb0f0682e719890bf0f045152b53e161090a6e8f/lhotse-1.33.0.tar.gz", hash = "sha256:3e91fca8531fc4c1798d0a6de1b3c7ea6bf2e181df70e5985927a131761c67f5", size = 686482, upload-time = "2026-04-20T13:11:08.579Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ed/2c/8235b4820a0a22402f6ce1ed3b3ba9de2c50a82704f63ca2d63bb7ae65bb/lhotse-2.0.0a2.tar.gz", hash = "sha256:f0fe285179060f5bcd96a289a8c3238d623c4842450f3acbdbd81ed08086de28", size = 736543, upload-time = "2026-06-22T21:37:30.175Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ad/e2/fbcb65dfed851f28ea15eca62cf449bc0b36378b005e6bec720714a9fb19/lhotse-1.33.0-py3-none-any.whl", hash = "sha256:8697bc74a8f3101594fca5661c7318c30899f3fdb132a44c7e99e794be6ac061", size = 903925, upload-time = "2026-04-20T13:11:07.027Z" },
+    { url = "https://files.pythonhosted.org/packages/73/a1/7d5d04df1a815866681bd133d3653530de15b5a44f10108bdb3b60505c75/lhotse-2.0.0a2-py3-none-any.whl", hash = "sha256:a20d934727929c42715d8d3a7f0aa68b8f7f89509dc8104bed6108a508aac9cf", size = 959069, upload-time = "2026-06-22T21:37:28.51Z" },
 ]
 
 [[package]]
@@ -4579,12 +4579,12 @@ requires-dist = [
     { name = "kaldialign", marker = "extra == 'asr-only'" },
     { name = "kaldialign", marker = "extra == 'speechlm2'" },
     { name = "kaldialign", marker = "extra == 'tts'" },
-    { name = "lhotse", marker = "extra == 'all'", specifier = ">=1.33.0" },
-    { name = "lhotse", marker = "extra == 'asr'", specifier = ">=1.33.0" },
-    { name = "lhotse", marker = "extra == 'asr-only'", specifier = ">=1.33.0" },
-    { name = "lhotse", marker = "extra == 'audio'", specifier = ">=1.33.0" },
-    { name = "lhotse", marker = "extra == 'speechlm2'", specifier = ">=1.33.0" },
-    { name = "lhotse", marker = "extra == 'tts'", specifier = ">=1.33.0" },
+    { name = "lhotse", marker = "extra == 'all'", specifier = "==2.0.0a2" },
+    { name = "lhotse", marker = "extra == 'asr'", specifier = "==2.0.0a2" },
+    { name = "lhotse", marker = "extra == 'asr-only'", specifier = "==2.0.0a2" },
+    { name = "lhotse", marker = "extra == 'audio'", specifier = "==2.0.0a2" },
+    { name = "lhotse", marker = "extra == 'speechlm2'", specifier = "==2.0.0a2" },
+    { name = "lhotse", marker = "extra == 'tts'", specifier = "==2.0.0a2" },
     { name = "librosa", marker = "extra == 'all'" },
     { name = "librosa", marker = "extra == 'all'", specifier = ">=0.10.0" },
     { name = "librosa", marker = "extra == 'all'", specifier = ">=0.10.1" },