From 905bf9983baa0f6fb7d4770f9acbb85180798b4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Fri, 1 May 2026 16:20:57 -0700
Subject: [PATCH 01/30] First draft of indexed lhotse datasets integration +
 checkpointable dataloader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 CLAUDE.md                                     |  2 +
 examples/speechlm2/salm_train.py              |  3 ++
 nemo/collections/common/data/lhotse/cutset.py |  4 +-
 .../common/data/lhotse/dataloader.py          | 48 +++++++++++--------
 nemo/collections/speechlm2/data/datamodule.py | 31 +++++++++---
 5 files changed, 59 insertions(+), 29 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index 7a1cd849f9e7..74e05265fc92 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -24,6 +24,8 @@ Requires Python 3.10+, PyTorch 2.6+.
 - Check: `python setup.py style --scope <path>`
 - Fix: `python setup.py style --scope <path> --fix`
 - **Incremental reformatting**: most collections are excluded from black (see `extend-exclude` in pyproject.toml). The files are reformatted when somebody makes changes to avoid a single big reformatting PR. Do not reformat files outside your changes.
+- **Helper placement**: keep public APIs and top-level classes/functions near the top of a file; place private
+  helpers and utilities at the bottom of the file unless a local module convention requires otherwise.
 
 ## Testing
 
diff --git a/examples/speechlm2/salm_train.py b/examples/speechlm2/salm_train.py
index 4fddb61985de..05a013c69d86 100644
--- a/examples/speechlm2/salm_train.py
+++ b/examples/speechlm2/salm_train.py
@@ -51,6 +51,9 @@ def train(cfg):
 
     trainer.fit(model, datamodule)
 
+    if torch.distributed.is_initialized():
+        torch.distributed.destroy_process_group()
+
 
 if __name__ == "__main__":
     train()
diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py
index 8e613091b7e1..48e69001bccc 100644
--- a/nemo/collections/common/data/lhotse/cutset.py
+++ b/nemo/collections/common/data/lhotse/cutset.py
@@ -715,7 +715,9 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
     else:
         # Regular Lhotse manifest points to individual audio files (like native NeMo manifest).
         path = config.cuts_path
-        cuts = CutSet.from_file(path).map(partial(resolve_relative_paths, manifest_path=path))
+        cuts = CutSet.from_file(path, indexed=config.get("indexed", None)).map(
+            partial(resolve_relative_paths, manifest_path=path)
+        )
     return cuts, is_tarred
 
 
diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py
index 5af5f5d004d7..f062978cc151 100644
--- a/nemo/collections/common/data/lhotse/dataloader.py
+++ b/nemo/collections/common/data/lhotse/dataloader.py
@@ -254,6 +254,16 @@ class LhotseDataLoadingConfig:
     # The first K examples will actually be read and then discarded, incurring the IO cost, due to
     # our support of object stores and gzipped files that generally don't have indexes of byte offsets per line.
     slice_length: Optional[int] = None
+    # Forwarded to ``CutSet.from_file(path, indexed=...)`` for plain JSONL ``cuts_path`` inputs.
+    # ``None`` = lhotse auto-detect (uses .idx if present, falls back to streaming).
+    # ``True`` = require indexed reads (errors if .idx is missing).
+    # ``False`` = streaming reads only.
+    indexed: Optional[bool] = None
+    # When True, build the dataloader with ``torchdata.stateful_dataloader.StatefulDataLoader``
+    # instead of ``torch.utils.data.DataLoader``. Combined with a checkpointable lhotse sampler
+    # (DynamicBucketingSampler / DynamicCutSampler), this enables exact resume from the next batch
+    # within the current epoch via the standard PyTorch state_dict / load_state_dict protocol.
+    use_stateful_dataloader: bool = False
 
 
 def determine_use_iterable_dataset(use_iterable_dataset: bool, config: DictConfig) -> bool:
@@ -265,6 +275,18 @@ def determine_use_iterable_dataset(use_iterable_dataset: bool, config: DictConfi
     return use_iterable_dataset
 
 
+def _build_dataloader(use_stateful_dataloader: bool, **kwargs) -> torch.utils.data.DataLoader:
+    """
+    Construct a DataLoader, optionally using ``torchdata.stateful_dataloader.StatefulDataLoader``
+    so that resume picks up at the exact next batch via ``state_dict()`` / ``load_state_dict()``.
+    """
+    if use_stateful_dataloader:
+        from torchdata.stateful_dataloader import StatefulDataLoader
+
+        return StatefulDataLoader(**kwargs)
+    return torch.utils.data.DataLoader(**kwargs)
+
+
 def get_lhotse_dataloader_from_config(
     config: Union[dict, DictConfig],
     global_rank: int,
@@ -369,7 +391,8 @@ def get_lhotse_dataloader_from_single_config(
         # reads only light-weight JSON objects; it samples mini-batches and passes
         # the meta-data to Dataset, which performs the actual I/O inside its __getitem__ method.
         dloader_kwargs = dict(dataset=dataset, sampler=sampler)
-    dloader = torch.utils.data.DataLoader(
+    dloader = _build_dataloader(
+        use_stateful_dataloader=config.use_stateful_dataloader,
         **dloader_kwargs,
         batch_size=None,
         num_workers=config.num_workers,
@@ -420,6 +443,7 @@ def gather_shared_opts():
             "multi_config",
             "metadata_only",
             "force_finite",
+            "use_stateful_dataloader",
         ]
         defaults = OmegaConf.structured(LhotseDataLoadingConfig)
         top_level_config["seed"] = resolve_seed(top_level_config["seed"])
@@ -493,7 +517,8 @@ def gather_shared_opts():
         # reads only light-weight JSON objects; it samples mini-batches and passes
         # the meta-data to Dataset, which performs the actual I/O inside its __getitem__ method.
         dloader_kwargs = dict(dataset=dataset, sampler=sampler)
-    dloader = torch.utils.data.DataLoader(
+    dloader = _build_dataloader(
+        use_stateful_dataloader=shared_opts.use_stateful_dataloader,
         **dloader_kwargs,
         batch_size=None,
         num_workers=shared_opts.num_workers,
@@ -519,9 +544,6 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No
     # Resample as a safeguard; it's a no-op when SR is already OK
     cuts = cuts.map(partial(resample, sampling_rate=config.sample_rate), apply_fn=None)
 
-    # Expands cuts if multiple translations are provided.
-    cuts = CutSet(LazyFlattener(cuts.map(_flatten_alt_text, apply_fn=None)))
-
     if config.use_multimodal_sampling:
         assert tokenizer is not None, (
             "You must pass a tokenizer to `get_lhotse_dataloader_from_config` in order to"
@@ -938,22 +960,6 @@ def _merge_supervisions(cuts: CutSet) -> CutSet:
     return cuts.merge_supervisions()
 
 
-def _flatten_alt_text(cut) -> list:
-    ans = [cut]
-    if not isinstance(cut, Cut) or cut.custom is None or cut.custom.get("alt_text") is None:
-        return ans
-    cut = cut.move_to_memory(audio_format="wav")  # performs I/O once and holds audio in memory from now on
-    # Popping to ease eyesight on debug.
-    paired_text = cut.custom.pop("alt_text")
-    for data in paired_text.values():
-        # Copy to avoid lazy dataloading issues
-        data = data.copy()
-        text_instance = cut.map_supervisions(lambda s: fastcopy(s, text=data["text"], language=data["lang"]))
-        text_instance.custom = {"text": data.pop("text"), "lang": data.pop("lang"), **data}
-        ans.append(text_instance)
-    return ans
-
-
 def maybe_set_cuda_expandable_segments(enabled: bool):
     """
     Configures PyTorch memory allocator to expand existing allocated segments
diff --git a/nemo/collections/speechlm2/data/datamodule.py b/nemo/collections/speechlm2/data/datamodule.py
index 0e95542e4ede..fd5364bdab05 100644
--- a/nemo/collections/speechlm2/data/datamodule.py
+++ b/nemo/collections/speechlm2/data/datamodule.py
@@ -68,17 +68,34 @@ def __init__(self, cfg, tokenizer: TokenizerSpec, dataset: torch.utils.data.Data
                     getattr(self.cfg, k).force_map_dataset = True
         self.tokenizer = tokenizer
         self.dataset = dataset
+        self._train_dl = None
 
     def train_dataloader(self):
         if "train_ds" not in self.cfg:
             return None
-        return get_lhotse_dataloader_from_config(
-            config=self.cfg.train_ds,
-            global_rank=self._get_dp_rank(),
-            world_size=self._get_world_size(),
-            dataset=FallbackDataset(self.dataset),
-            tokenizer=self.tokenizer,
-        )
+        if self._train_dl is None:
+            self._train_dl = get_lhotse_dataloader_from_config(
+                config=self.cfg.train_ds,
+                global_rank=self._get_dp_rank(),
+                world_size=self._get_world_size(),
+                dataset=FallbackDataset(self.dataset),
+                tokenizer=self.tokenizer,
+            )
+        return self._train_dl
+
+    def state_dict(self) -> dict:
+        # Persist the train dataloader state when it's stateful (e.g. torchdata's StatefulDataLoader
+        # paired with a checkpointable lhotse sampler). This enables exact-batch resume.
+        if self._train_dl is not None and hasattr(self._train_dl, "state_dict"):
+            return {"train_dataloader": self._train_dl.state_dict()}
+        return {}
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        if "train_dataloader" not in state_dict:
+            return
+        dl = self.train_dataloader()
+        if dl is not None and hasattr(dl, "load_state_dict"):
+            dl.load_state_dict(state_dict["train_dataloader"])
 
     def val_dataloader(self):
         if "validation_ds" not in self.cfg:

From 48818f5375f6f8b4c0c8e78ee6d16025decab292 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Tue, 5 May 2026 09:16:02 -0700
Subject: [PATCH 02/30] Support new Lhotse's indexed iterators across NeMo
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 nemo/collections/common/data/lhotse/cutset.py |  23 +-
 .../common/data/lhotse/indexed_adapters.py    | 122 +++
 .../common/data/lhotse/nemo_adapters.py       | 694 +++++++++++++++---
 .../common/data/lhotse/text_adapters.py       | 617 +++++++++++++++-
 .../speechlm2/models/salm_automodel.py        |  23 +-
 5 files changed, 1379 insertions(+), 100 deletions(-)

diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py
index 48e69001bccc..84b74804e4fd 100644
--- a/nemo/collections/common/data/lhotse/cutset.py
+++ b/nemo/collections/common/data/lhotse/cutset.py
@@ -285,6 +285,7 @@ def read_dataset_config(config) -> tuple[CutSet, bool]:
         "force_map_dataset": config.get("force_map_dataset", False),
         "force_iterable_dataset": config.get("force_iterable_dataset", False),
         "slice_length": config.get("slice_length", None),
+        "indexed": config.get("indexed", False),
         # Temperature for re-weighting datasets. 1 is a neutral value. Lower temperature over-samples smaller datasets, and vice versa.
         "reweight_temperature": config.get("reweight_temperature", None),
     }
@@ -348,6 +349,7 @@ def read_txt_jsonl_paths(config: DictConfig) -> tuple[CutSet, bool]:
             text_field=config.text_field,
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
+            indexed=config.get("indexed", False),
         )
     )
     if not config.get("force_finite", False):
@@ -384,6 +386,7 @@ def read_nemo_sft_jsonl(config: DictConfig) -> tuple[CutSet, bool]:
             language=config.get("language"),
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
+            indexed=config.get("indexed", False),
         )
     )
     if not config.get("force_finite", False):
@@ -405,6 +408,7 @@ def read_multimodal_conversation_jsonl(config: DictConfig) -> tuple[CutSet, bool
             system_prompt=config.get("tags", {}).get("system_prompt"),
             context=config.get("tags", {}).get("context"),
             slice_length=config.get("slice_length"),
+            indexed=config.get("indexed", False),
         )
     )
     if not config.get("force_finite", False):
@@ -426,6 +430,7 @@ def read_share_gpt_as_conversation(config) -> tuple[CutSet, bool]:
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
             slice_length=config.get("slice_length"),
+            indexed=config.get("indexed", False),
         )
     )
     if not config.get("force_finite", False):
@@ -444,6 +449,7 @@ def read_share_gpt_webdataset_as_conversation(config) -> tuple[CutSet, bool]:
             token_equivalent_duration=config.get("token_equivalent_duration"),
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
+            indexed=config.get("indexed", False),
         )
     )
     # When force_finite is False (default), repeat the dataset infinitely so that
@@ -751,6 +757,7 @@ def read_parquet_manifest(config: DictConfig) -> tuple[CutSet, bool]:
     # Extract shuffling options (CRITICAL for distributed training)
     shuffle_shards = config.get("shuffle", False)
     shard_seed = config.get("shard_seed", "trng")
+    indexed = config.get("indexed", False)
 
     # 3. Create Iterators for each file
     iterators = []
@@ -763,6 +770,7 @@ def read_parquet_manifest(config: DictConfig) -> tuple[CutSet, bool]:
             duration_field=duration_field,
             lang_field=lang_field,
             sampling_rate=sampling_rate,
+            indexed=indexed,
         )
         iterators.append(adapter)
 
@@ -1461,6 +1469,8 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
                 common_kwargs["shuffle_shards"] = config[key]
             else:
                 common_kwargs[key] = config[key]
+    indexed = config.get("indexed", False)
+    notar_kwargs_extra = {"indexed": indexed} if indexed else {}
     # The option below is to allow a special case of NeMo manifest iteration as Lhotse CutSet
     # without performing any I/O. NeMo manifests typically don't have sampling_rate information required by Lhotse,
     # so lhotse has to look up the headers of audio files to fill it on-the-fly.
@@ -1470,6 +1480,7 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
     metadata_only = config.get("metadata_only", False)
     force_finite = config.get("force_finite", False)
     notar_kwargs = {"metadata_only": metadata_only}
+    tar_kwargs_extra = {"indexed": indexed} if indexed else {}
     is_tarred = config.get("tarred_audio_filepaths") is not None
     if isinstance(config.manifest_filepath, (str, Path)):
         if is_tarred and not metadata_only:
@@ -1479,13 +1490,18 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
                     tar_paths=config.tarred_audio_filepaths,
                     skip_missing_manifest_entries=config.get("skip_missing_manifest_entries", False),
                     slice_length=config.get("slice_length", None),
+                    **tar_kwargs_extra,
                     **common_kwargs,
                 )
             )
             if not force_finite:
                 cuts = cuts.repeat(preserve_id=True)
         else:
-            cuts = CutSet(LazyNeMoIterator(config.manifest_filepath, **notar_kwargs, **common_kwargs))
+            cuts = CutSet(
+                LazyNeMoIterator(
+                    config.manifest_filepath, **notar_kwargs, **notar_kwargs_extra, **common_kwargs
+                )
+            )
     else:
         # Format option 1:
         #   Assume it's [[path1], [path2], ...] (same for tarred_audio_filepaths).
@@ -1519,10 +1535,13 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
                     tar_paths=tar_path,
                     skip_missing_manifest_entries=config.get("skip_missing_manifest_entries", False),
                     slice_length=config.get("slice_length", None),
+                    **tar_kwargs_extra,
                     **common_kwargs,
                 )
             else:
-                nemo_iter = LazyNeMoIterator(manifest_path, **notar_kwargs, **common_kwargs)
+                nemo_iter = LazyNeMoIterator(
+                    manifest_path, **notar_kwargs, **notar_kwargs_extra, **common_kwargs
+                )
             # Then, determine the weight or use one provided
             if isinstance(manifest_info, str) or len(manifest_info) == 1:
                 weight = len(nemo_iter)
diff --git a/nemo/collections/common/data/lhotse/indexed_adapters.py b/nemo/collections/common/data/lhotse/indexed_adapters.py
index 831edf0b1f54..edb7e6e86400 100644
--- a/nemo/collections/common/data/lhotse/indexed_adapters.py
+++ b/nemo/collections/common/data/lhotse/indexed_adapters.py
@@ -245,6 +245,128 @@ def __getitem__(self, idx):
         return _split_json_audio_pair(name_a, bytes_a, name_b, bytes_b)
 
 
+class IndexedTarMemberReader:
+    """
+    Random access to a NeMo-style tar archive that stores **one regular member
+    per sample** (e.g. ``<cut_id>.flac`` per line of an external NeMo manifest).
+
+    Uses the same ``.idx`` format as :class:`IndexedJSONLReader` and
+    :class:`IndexedTarSampleReader`: little-endian uint64 byte offsets, with
+    a sentinel equal to the tar file size at the end. Each entry points at
+    one tar header, and the corresponding payload starts ``512`` bytes later.
+
+    Two access patterns:
+
+    * Positional: ``reader[idx]`` returns ``(member_name, payload_bytes)``.
+    * Name-keyed: ``reader.get(name)`` returns just the payload bytes. The
+      name → position map is built lazily on first use by walking the tar
+      headers (no payload reads), then cached for subsequent calls.
+    """
+
+    def __init__(self, tar_path: str | Path, idx_path: str | Path | None = None, auto_create_index: bool = True):
+        """Load the offsets index for ``tar_path``, creating it first when missing and permitted."""
+        # Initialise before any fallible call: ``__del__`` -> ``close()`` reads ``_fh`` even when
+        # index creation/loading raises, and would otherwise fail with AttributeError.
+        self._fh = None
+        self._name_to_idx: dict[str, int] | None = None
+        self.data_path = str(tar_path)
+        # An explicit ``idx_path`` wins; otherwise the index is looked up at ``<tar_path>.idx``.
+        resolved_idx = str(idx_path) if idx_path else self.data_path + ".idx"
+        if auto_create_index and not os.path.exists(resolved_idx):
+            create_tar_index(self.data_path, resolved_idx)
+        # ``_len`` backs ``__len__`` and bounds the indices accepted by ``__getitem__``.
+        self.offsets, self._len = _load_index(self.data_path, resolved_idx)
+
+    def _ensure_open(self):
+        if self._fh is None:
+            self._fh = open(self.data_path, "rb")
+
+    def close(self):
+        if self._fh is not None:
+            self._fh.close()
+            self._fh = None
+
+    def __del__(self):
+        self.close()
+
+    def __getstate__(self):
+        s = self.__dict__.copy()
+        s["_fh"] = None  # file handles are not picklable
+        return s
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+
+    def __len__(self) -> int:
+        return self._len
+
+    def __getitem__(self, idx: int) -> tuple[str, bytes]:
+        idx = _resolve_idx(idx, self._len)
+        offset = int(self.offsets[idx])
+        self._ensure_open()
+        self._fh.seek(offset)
+        try:
+            name, data = _read_tar_member(self._fh)
+        except (EOFError, tarfile.TarError) as e:
+            raise type(e)(
+                f"{e} — reading sample {idx}/{self._len} at offset {offset} "
+                f"in {self.data_path}"
+            ) from e
+        return name, data
+
+    def _build_name_index(self) -> dict[str, int]:
+        """Walk the tar headers once to build a name → sample-index map.
+
+        Reads only the 512-byte tar headers (no payloads), so this is
+        relatively cheap even on remote storage. Done lazily on first
+        :meth:`get` call.
+
+        ``tar.add`` writes a PAX extended header (``@PaxHeader``) before any
+        member with a long path or extended attributes. We skip those and
+        record the *regular* file's name at each indexed offset.
+        """
+        name_to_idx: dict[str, int] = {}
+        self._ensure_open()
+        for i in range(self._len):
+            offset = int(self.offsets[i])
+            self._fh.seek(offset)
+            while True:
+                header = self._fh.read(512)
+                if len(header) < 512 or header == b"\0" * 512:
+                    break
+                info = tarfile.TarInfo.frombuf(
+                    header, tarfile.ENCODING, "surrogateescape"
+                )
+                if info.type in (tarfile.REGTYPE, tarfile.AREGTYPE):
+                    name_to_idx[info.name] = i
+                    break
+                # Non-regular (PAX header, GNU long-name, etc.):
+                # skip its data + 512-byte padding and continue.
+                size_blocks = (info.size + 511) // 512 * 512
+                self._fh.seek(size_blocks, 1)
+        return name_to_idx
+
+    def get(self, name: str) -> bytes:
+        """Return the payload bytes of the tar member named ``name``."""
+        if self._name_to_idx is None:
+            self._name_to_idx = self._build_name_index()
+        try:
+            idx = self._name_to_idx[name]
+        except KeyError as e:
+            raise KeyError(
+                f"Tar {self.data_path} has no member named '{name}'. "
+                f"The .idx may be stale or the manifest is referencing a "
+                f"different tar."
+            ) from e
+        _, data = self[idx]
+        return data
+
+    def __contains__(self, name: str) -> bool:
+        if self._name_to_idx is None:
+            self._name_to_idx = self._build_name_index()
+        return name in self._name_to_idx
+
+
 def _read_tar_member(f):
     """Read the next regular-file tar member, skipping non-regular entries
     (PAX headers, GNU long-name headers, directory entries, etc.).
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index 69ca3d66c041..e506e8077324 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 """Lhotse adapters for NeMo datasets including Parquet support."""
+import bisect
+import json
 import os
 import random
 import re
@@ -34,7 +36,13 @@
 from lhotse.audio.backend import LibsndfileBackend
 from lhotse.cut import Cut
 from lhotse.dataset.dataloading import resolve_seed
-from lhotse.lazy import LazyIteratorChain, LazyJsonlIterator
+from lhotse.lazy import (
+    IteratorNode,
+    LazyIteratorChain,
+    LazyJsonlIterator,
+    attach_graph_origin,
+    normalize_graph_token,
+)
 from lhotse.serialization import open_best
 from lhotse.utils import compute_num_samples, ifnone
 
@@ -43,7 +51,7 @@
 from nemo.utils.data_utils import is_datastore_path
 
 
-class LazyNeMoIterator:
+class LazyNeMoIterator(IteratorNode):
     """
     ``LazyNeMoIterator`` reads a NeMo (non-tarred) JSON manifest and converts it on the fly to an ``Iterable[Cut]``.
     It's used to create a ``lhotse.CutSet``.
@@ -85,6 +93,24 @@ class LazyNeMoIterator:
         ...     "nemo_manifests/train.json",
         ...     extra_fields=[{"type": "text_sample", "name": "question", "path": "questions.txt"}],
         ... ))
+
+    Indexed mode (``indexed=True``)
+    -------------------------------
+
+    When the underlying manifest is uncompressed JSONL, set ``indexed=True`` to enable
+    O(1) random access and exact graph-token checkpointing through
+    :class:`lhotse.indexing.IndexedJsonlReader`. In indexed mode this iterator becomes
+    an indexed ``IteratorNode`` that can be combined with ``StatefulDataLoader`` for
+    bit-exact mid-epoch resume.
+
+    Indexed mode requires:
+
+    * the manifest path(s) to use ``.jsonl`` extension and be uncompressed;
+    * ``extra_fields`` to be unset (lookup-based fields are positional and cannot be
+      reproduced after a Feistel-permuted random access).
+
+    Sharded indexed inputs are composed via :class:`lhotse.lazy.LazyIteratorChain`,
+    which picks a Feistel cross-shard permutation for true item-level shuffling.
     """
 
     def __init__(
@@ -96,62 +122,126 @@ def __init__(
         shuffle_shards: bool = False,
         shard_seed: int | Literal["randomized", "trng"] = "trng",
         extra_fields: list[dict[str, str]] | None = None,
+        indexed: bool = False,
     ) -> None:
         self.path = path
         self.shuffle_shards = shuffle_shards
         self.shard_seed = shard_seed
-        paths = expand_sharded_filepaths(path)
-
-        if len(paths) == 1:
-            self.source = LazyJsonlIterator(paths[0])
-        else:
-            self.source = LazyIteratorChain(
-                *(LazyJsonlIterator(p) for p in paths), shuffle_iters=self.shuffle_shards, seed=self.shard_seed
-            )
         self.text_field = text_field
         self.lang_field = lang_field
         self.metadata_only = metadata_only
         self.extra_fields = extra_fields
+        self.indexed = indexed
         validate_extra_fields(self.extra_fields)
+        paths = expand_sharded_filepaths(path)
+
+        if indexed:
+            if extra_fields:
+                raise ValueError(
+                    "LazyNeMoIterator(indexed=True) does not support 'extra_fields' because "
+                    "their values are positional/streaming and cannot be reconstructed under "
+                    "graph-token random access."
+                )
+            seed = resolve_seed(shard_seed) if shard_seed not in (None, "trng", "randomized") else 0
+            indexed_sources = [_LazyIndexedJsonlDictNode(p) for p in paths]
+            if len(indexed_sources) == 1:
+                self.source = indexed_sources[0]
+            else:
+                self.source = LazyIteratorChain(
+                    *indexed_sources, shuffle_iters=shuffle_shards, seed=seed
+                )
+        else:
+            if len(paths) == 1:
+                self.source = LazyJsonlIterator(paths[0])
+            else:
+                self.source = LazyIteratorChain(
+                    *(LazyJsonlIterator(p) for p in paths),
+                    shuffle_iters=self.shuffle_shards,
+                    seed=self.shard_seed,
+                )
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
 
     def __iter__(self) -> Generator[Cut, None, None]:
         seed = resolve_seed(self.shard_seed)
         # Propagate the random seed
         extra_fields = [ExtraField.from_dict({"seed": seed, **field_cfg}) for field_cfg in self.extra_fields or ()]
         for data in self.source:
+            graph_token = getattr(data, "_graph_origin", None) if self.indexed else None
             # filter out entries with valid "_skipme" values.
             if data.get("_skipme", False):
                 continue
-            audio_path = get_full_path(str(data.pop("audio_filepath")), str(self.path), force_cache=False)
-            duration = data.pop("duration")
-            offset = data.pop("offset", None)
-            cut = self._create_cut(
-                audio_path=audio_path, offset=offset, duration=duration, sampling_rate=data.pop("sampling_rate", None)
-            )
-            # Note that start=0 and not start=offset because supervision's start if relative to the
-            # start of the cut; and cut.start is already set to offset
-            cut.supervisions.append(
-                SupervisionSegment(
-                    id=cut.id,
-                    recording_id=cut.recording_id,
-                    start=0,
-                    duration=cut.duration,
-                    channel=cut.channel,
-                    text=data.get(self.text_field),
-                    language=data.get(self.lang_field),
-                )
-            )
-            cut.custom = data
+            cut = self._build_cut_from_dict(data)
             for extra_field in extra_fields:
                 extra_field.attach_to(cut)
+            if graph_token is not None:
+                attach_graph_origin(cut, graph_token)
             yield cut
 
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "LazyNeMoIterator only supports __getitem__ when constructed with indexed=True."
+            )
+        token = normalize_graph_token(token)
+        data = self.source[token]
+        cut = self._build_cut_from_dict(data)
+        return attach_graph_origin(cut, token)
+
     def __len__(self) -> int:
         return len(self.source)
 
     def __add__(self, other):
         return LazyIteratorChain(self, other)
 
+    def state_dict(self) -> dict:
+        if not self.indexed:
+            return {}
+        return {"source": self.source.state_dict()}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        if "source" in sd:
+            self.source.load_state_dict(sd["source"])
+
+    def _build_cut_from_dict(self, data: dict) -> Cut:
+        """Build a ``Cut`` from one manifest entry, append one supervision; leftover keys go to ``cut.custom``."""
+        # Shallow-copy first so the ``pop`` calls below never mutate the dict owned by the caller.
+        data = dict(data)
+        audio_path = get_full_path(str(data.pop("audio_filepath")), str(self.path), force_cache=False)
+        duration = data.pop("duration")
+        offset = data.pop("offset", None)
+        cut = self._create_cut(
+            audio_path=audio_path,
+            offset=offset,
+            duration=duration,
+            sampling_rate=data.pop("sampling_rate", None),
+        )
+        cut.supervisions.append(
+            SupervisionSegment(
+                id=cut.id,
+                recording_id=cut.recording_id,
+                start=0,
+                duration=cut.duration,
+                channel=cut.channel,
+                text=data.get(self.text_field),
+                language=data.get(self.lang_field),
+            )
+        )
+        cut.custom = data
+        return cut
+
     def _create_cut(
         self,
         audio_path: str,
@@ -210,7 +300,67 @@ def _create_recording(
             return Recording.from_file(audio_path)
 
 
-class LazyNeMoTarredIterator:
+class _GraphOriginDict(dict):
+    """``dict`` subclass that can carry runtime attributes (e.g. ``_graph_origin``)."""
+
+    __slots__ = ("_graph_origin",)
+
+
+class _LazyIndexedJsonlDictNode(IteratorNode):
+    """
+    Internal helper: a graph-restorable indexed JSONL reader that yields raw dicts
+    (not Cuts). Built on top of :class:`lhotse.indexing.IndexedJsonlReader`.
+
+    Used as the source iterator for :class:`LazyNeMoIterator` (and other adapters)
+    when ``indexed=True``. Yielded items carry ``_graph_origin`` set to their
+    integer line index, which allows downstream nodes (e.g. ``LazyIteratorChain``,
+    ``LazyShuffler``) to compose graph tokens for exact restore.
+    """
+
+    is_checkpointable = True
+    is_indexed = True
+    has_constant_time_access = True
+
+    def __init__(self, path: str | Path) -> None:
+        from lhotse.indexing import IndexedJsonlReader
+
+        self.path = path
+        self._reader = IndexedJsonlReader(path)
+        self._position = 0
+        self._restored = False
+
+    def __getitem__(self, idx):
+        idx = int(normalize_graph_token(idx))
+        item = _GraphOriginDict(self._reader[idx])
+        return attach_graph_origin(item, idx)
+
+    def __len__(self) -> int:
+        return len(self._reader)
+
+    def __iter__(self):
+        start = self._position if self._restored else 0
+        self._restored = False
+        n = len(self._reader)
+        for i in range(start, n):
+            self._position = i + 1
+            item = _GraphOriginDict(self._reader[i])
+            attach_graph_origin(item, i)
+            yield item
+
+    def state_dict(self) -> dict:
+        return {"position": self._position}
+
+    def load_state_dict(self, sd: dict) -> None:
+        self._position = sd["position"]
+        self._restored = True
+
+
+# NeMo-tar indexed access is delegated to ``IndexedTarMemberReader`` from
+# ``indexed_adapters`` — the same canonical .idx format (uint64 LE offsets +
+# sentinel) used everywhere else in NeMo and lhotse for indexed access.
+
+
+class LazyNeMoTarredIterator(IteratorNode):
     r"""
     ``LazyNeMoTarredIterator`` reads a NeMo tarred JSON manifest and converts it on the fly to an ``Iterable[Cut]``.
     It's used to create a ``lhotse.CutSet``.
@@ -294,19 +444,27 @@ def __init__(
         skip_missing_manifest_entries: bool = False,
         extra_fields: list[dict[str, str]] | None = None,
         slice_length: int = None,
+        indexed: bool = False,
     ) -> None:
         self.skip_missing_manifest_entries = skip_missing_manifest_entries
+        self.indexed = indexed
         self.shard_id_to_manifest: dict[int, Iterable[dict]]
         self.paths = expand_sharded_filepaths(manifest_path)
         if len(self.paths) == 1:
-            logging.warning(
-                f"You are using Lhotse dataloading for tarred audio with a non-sharded manifest. "
-                f"This will incur significant memory overhead. To prevent this, please shard file "
-                f"'{self.paths[0]}' using 'scripts/speech_recognition/convert_to_tarred_audio_dataset.py' "
-                f"WITHOUT '--no_shard_manifest'"
-            )
+            if not indexed:
+                logging.warning(
+                    f"You are using Lhotse dataloading for tarred audio with a non-sharded manifest. "
+                    f"This will incur significant memory overhead. To prevent this, please shard file "
+                    f"'{self.paths[0]}' using 'scripts/speech_recognition/convert_to_tarred_audio_dataset.py' "
+                    f"WITHOUT '--no_shard_manifest'"
+                )
             self.source = LazyJsonlIterator(self.paths[0])
-            self.shard_id_to_manifest = groupby("shard_id", self.source)
+            if indexed:
+                # In indexed mode we will not consume self.source for grouping — the per-shard
+                # IndexedJsonlReaders below take over, keyed by the position-derived shard_id 0.
+                self.shard_id_to_manifest = {0: self.source}
+            else:
+                self.shard_id_to_manifest = groupby("shard_id", self.source)
         else:
             json_pattern = re.compile(r"manifest[^/]*_(\d+)[^/]*\.json")
             shard_ids = []
@@ -342,6 +500,74 @@ def __init__(
         self._validate()
         self.use_ais_get_batch = os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true"
 
+        if indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        """Build per-shard IndexedJsonlReaders + audio-tar index for indexed/random access."""
+        from lhotse.indexing import IndexedJsonlReader
+
+        from nemo.collections.common.data.lhotse.indexed_adapters import (
+            IndexedTarMemberReader,
+        )
+
+        if self.extra_fields:
+            raise ValueError(
+                "LazyNeMoTarredIterator(indexed=True) does not support 'extra_fields' "
+                "because their values are positional and cannot be reproduced under "
+                "graph-token random access."
+            )
+        if self.slice_length is not None:
+            raise ValueError(
+                "LazyNeMoTarredIterator(indexed=True) does not support 'slice_length'."
+            )
+
+        # Order shards by their integer shard_id so that global indices are stable.
+        self._sorted_shard_ids = sorted(self.shard_id_to_tar_path.keys())
+        self._cuts_readers: dict[int, IndexedJsonlReader] = {}
+        # In USE_AIS_GET_BATCH mode we never open the tar files locally — audio is
+        # fetched lazily via URL/file AudioSource by AudioSamples (typically batched).
+        self._tar_readers: dict[int, IndexedTarMemberReader] = {}
+
+        # Map shard_id → manifest path. NOTE(review): a single manifest is reused for every tar shard — confirm.
+        if len(self.paths) == 1:
+            shard_id_to_manifest_path = {sid: self.paths[0] for sid in self._sorted_shard_ids}
+        else:
+            json_pattern = re.compile(r"manifest[^/]*_(\d+)[^/]*\.json")
+            shard_id_to_manifest_path = {}
+            for p in self.paths:
+                m = json_pattern.search(p)
+                assert m is not None
+                shard_id_to_manifest_path[int(m.group(1))] = p
+
+        cum = 0
+        cum_lens = [0]
+        for sid in self._sorted_shard_ids:
+            jsonl_path = shard_id_to_manifest_path[sid]
+            tar_path = self.shard_id_to_tar_path[sid]
+            self._cuts_readers[sid] = IndexedJsonlReader(jsonl_path)
+            if not self.use_ais_get_batch:
+                self._tar_readers[sid] = IndexedTarMemberReader(tar_path)
+            cum += len(self._cuts_readers[sid])
+            cum_lens.append(cum)
+        self._cum_lens = cum_lens
+        self._total_len = cum
+        self._position = 0
+        self._restored = False
+        self._offset_pattern = re.compile(r'^(?P<stem>.+)(?P<sub>-sub\d+)(?P<ext>\.\w+)?$')
+
     def to_shards(self) -> List["LazyNeMoTarredIterator"]:
         """Convert this iterator to a list of separate iterators for each shard."""
         if len(self.paths) == 1:
@@ -362,6 +588,13 @@ def to_shards(self) -> List["LazyNeMoTarredIterator"]:
             ]
 
     def _validate(self) -> None:
+        if self.indexed:
+            # Indexed mode keys shards by the tar path's shard_id and pairs them with
+            # the jsonl manifest of the same numeric id (see ``_init_indexed``); the
+            # streaming-time shard_id consistency check below would otherwise reject
+            # single-file inputs when the jsonl groups by a different shard_id field.
+            validate_extra_fields(self.extra_fields)
+            return
         shard_ids_tars = set(self.shard_id_to_tar_path)
         shard_ids_manifest = set(self.shard_id_to_manifest)
         assert shard_ids_tars == shard_ids_manifest, (
@@ -496,7 +729,184 @@ def _iter_sequential(
                             f"Cannot locate JSON entry for tar file '{tar_info.name}'"
                         ) from e
 
+    # ---------------------------------------------------------------------- indexed
+    def _resolve_global_idx(self, idx: int) -> tuple[int, int]:
+        if idx < 0:
+            idx += self._total_len
+        if idx < 0 or idx >= self._total_len:
+            raise IndexError(
+                f"index {idx} out of range for LazyNeMoTarredIterator with {self._total_len} cuts"
+            )
+        shard_pos = bisect.bisect_right(self._cum_lens, idx) - 1
+        sid = self._sorted_shard_ids[shard_pos]
+        return sid, idx - self._cum_lens[shard_pos]
+
+    def _build_indexed_cut(self, data: dict, audio_bytes: bytes, manifest_path: str, tar_path: str) -> Cut | None:
+        """Decode a single (manifest_entry, audio_bytes) pair into a Cut, mirroring the streaming path."""
+        if data.get("_skipme", False):
+            return None
+        try:
+            meta = soundfile.info(BytesIO(audio_bytes))
+        except Exception:
+            logging.warning(f"Skipped corrupted audio member referenced by '{data.get('audio_filepath')}' in {tar_path=}.")
+            return None
+        recording = Recording(
+            id=str(data["audio_filepath"]),
+            sources=[
+                AudioSource(type="memory", channels=list(range(meta.channels)), source=audio_bytes)
+            ],
+            sampling_rate=int(meta.samplerate),
+            num_samples=meta.frames,
+            duration=meta.duration,
+        )
+        cut = make_cut_with_subset_inmemory_recording(
+            recording, offset=data.get("offset", 0.0), duration=data.get("duration")
+        )
+        cut.supervisions.append(
+            SupervisionSegment(
+                id=cut.id,
+                recording_id=cut.recording_id,
+                start=0,
+                duration=cut.duration,
+                text=data.get(self.text_field),
+                language=data.get(self.lang_field),
+            )
+        )
+        cut.custom = _to_custom_attr_dict(data)
+        cut.manifest_origin = manifest_path
+        cut.tar_origin = tar_path
+        return cut
+
+    def _audio_member_name_from_entry(self, entry: dict) -> str:
+        af = entry["audio_filepath"]
+        m = self._offset_pattern.match(af)
+        if m is None:
+            return af
+        return m.group("stem") + ifnone(m.group("ext"), "")
+
+    def _build_indexed_url_cut(self, data: dict, manifest_path: str, tar_path: str) -> Cut | None:
+        """
+        AIS GetBatch counterpart of ``_build_indexed_cut``: produces a Cut backed
+        by a URL/file AudioSource (no audio bytes loaded), so that
+        ``AudioSamples(use_batch_loader=True)`` can fetch the entire minibatch in
+        a single AIS GetBatch request. Mirrors the streaming path in
+        ``_iter_batch_for_ais_get_batch``.
+        """
+        if data.get("_skipme", False):
+            return None
+        duration = data.get("duration")
+        if duration is None:
+            logging.warning(
+                f"Skipping '{data.get('audio_filepath')}' - missing duration in manifest"
+            )
+            return None
+        audio_filename = self._audio_member_name_from_entry(data)
+        audio_url = f"{tar_path.rstrip('/')}/{audio_filename.lstrip('/')}"
+        # Use type="url" when the tar path carries a scheme (ais://, http(s)://, ...) and type="file" otherwise.
+        # NOTE(review): the streaming path is said to always use "url" (open_best() handles both); confirm.
+        # AudioSamples' GetBatch loader inspects the URL scheme to dispatch to AIS.
+        source_type = "url" if "://" in tar_path else "file"
+        offset = data.get("offset", 0.0)
+        sampling_rate = data.get("sampling_rate", 16000)
+        recording = Recording(
+            id=audio_filename,
+            sources=[AudioSource(type=source_type, channels=[0], source=audio_url)],
+            sampling_rate=sampling_rate,
+            num_samples=compute_num_samples(duration, sampling_rate),
+            duration=duration,
+        )
+        cut = recording.to_cut()
+        if offset > 0:
+            cut = cut.truncate(offset=offset, duration=duration, preserve_id=True)
+            cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}"
+        cut.supervisions.append(
+            SupervisionSegment(
+                id=cut.id,
+                recording_id=cut.recording_id,
+                start=0,
+                duration=cut.duration,
+                text=data.get(self.text_field),
+                language=data.get(self.lang_field),
+            )
+        )
+        cut.custom = _to_custom_attr_dict(data)
+        cut.manifest_origin = manifest_path
+        cut.tar_origin = tar_path
+        return cut
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "LazyNeMoTarredIterator only supports __getitem__ when constructed with indexed=True."
+            )
+        idx = int(normalize_graph_token(token))
+        sid, local_idx = self._resolve_global_idx(idx)
+        data = self._cuts_readers[sid][local_idx]
+        manifest_path = self._cuts_readers[sid].path
+        tar_path = self.shard_id_to_tar_path[sid]
+        if self.use_ais_get_batch:
+            cut = self._build_indexed_url_cut(data, manifest_path, tar_path)
+        else:
+            member_name = self._audio_member_name_from_entry(data)
+            audio_bytes = self._tar_readers[sid].get(member_name)
+            cut = self._build_indexed_cut(data, audio_bytes, manifest_path, tar_path)
+        if cut is None:
+            raise RuntimeError(
+                f"Cut at global index {idx} (shard {sid}, local {local_idx}) is not decodable; "
+                f"cannot satisfy random-access __getitem__."
+            )
+        return attach_graph_origin(cut, idx)
+
+    def __len__(self) -> int:
+        if self.indexed:
+            return self._total_len
+        return len(self.source)
+
+    def state_dict(self) -> dict:
+        if not self.indexed:
+            return {}
+        return {"position": self._position, "epoch": self.epoch}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._position = sd.get("position", 0)
+        self.epoch = sd.get("epoch", 0)
+        self._restored = True
+
+    def _iter_indexed(self) -> Generator[Cut, None, None]:
+        start = self._position if self._restored else 0
+        self._restored = False
+        n = self._total_len
+        for i in range(start, n):
+            self._position = i + 1
+            sid, local_idx = self._resolve_global_idx(i)
+            data = self._cuts_readers[sid][local_idx]
+            manifest_path = self._cuts_readers[sid].path
+            tar_path = self.shard_id_to_tar_path[sid]
+            if self.use_ais_get_batch:
+                cut = self._build_indexed_url_cut(data, manifest_path, tar_path)
+            else:
+                member_name = self._audio_member_name_from_entry(data)
+                try:
+                    audio_bytes = self._tar_readers[sid].get(member_name)
+                except KeyError:
+                    if self.skip_missing_manifest_entries:
+                        continue
+                    raise
+                cut = self._build_indexed_cut(data, audio_bytes, manifest_path, tar_path)
+            if cut is None:
+                continue
+            attach_graph_origin(cut, i)
+            yield cut
+        self.epoch += 1
+
+    # ---------------------------------------------------------------- streaming
     def __iter__(self) -> Generator[Cut, None, None]:
+        if self.indexed:
+            yield from self._iter_indexed()
+            return
+
         shard_ids = self.shard_ids
 
         seed = self._get_seed()
@@ -579,9 +989,6 @@ def basename(d: dict) -> str:
 
         self.epoch += 1
 
-    def __len__(self) -> int:
-        return len(self.source)
-
     def __add__(self, other):
         return LazyIteratorChain(self, other)
 
@@ -737,7 +1144,7 @@ def _to_custom_attr_dict(d: dict, _excluded_fields: set[str] = {"duration", "aud
     return {k: v for k, v in d.items() if k not in _excluded_fields}
 
 
-class LazyParquetIterator:
+class LazyParquetIterator(IteratorNode):
     """
     LazyParquetIterator reads a Parquet file (local or remote) and yields Lhotse Cut objects.
     It streams data using PyArrow's iter_batches to avoid loading the full file into memory.
@@ -749,6 +1156,13 @@ class LazyParquetIterator:
         duration_field (str): Name of the column containing duration (default: "duration").
         lang_field (str): Name of the column containing language (default: "lang").
         sampling_rate (int): Fallback sampling rate if not found in metadata (default: 16000).
+        indexed (bool): When True, enable O(1) random access via row-group lookup
+            and graph-token checkpointing. Requires the parquet file to expose
+            row-group statistics (the default for files written by pyarrow/pandas).
+
+    Indexed mode reads one row group at a time on demand and caches the most
+    recently used row group, so unshuffled or locality-friendly access patterns
+    avoid repeated decompression.
     """
 
     def __init__(
@@ -759,6 +1173,7 @@ def __init__(
         duration_field: str = "duration",
         lang_field: str = "lang",
         sampling_rate: int = 16000,
+        indexed: bool = False,
     ) -> None:
         # SAFETY CHECK: Ensure pyarrow is actually installed
         if not HAVE_PYARROW:
@@ -772,8 +1187,153 @@ def __init__(
         self.duration_field = duration_field
         self.lang_field = lang_field
         self.sampling_rate = sampling_rate
+        self.indexed = indexed
+        self._row_group_offsets: list[int] | None = None
+        self._cached_row_group_idx: int | None = None
+        self._cached_row_group: list[dict] | None = None
+        self._position = 0
+        self._restored = False
+        if indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        try:
+            parquet_file = pq.ParquetFile(self.path)
+        except Exception as e:
+            raise RuntimeError(f"Failed to open Parquet file: {self.path}") from e
+        offsets = [0]
+        for i in range(parquet_file.num_row_groups):
+            offsets.append(offsets[-1] + parquet_file.metadata.row_group(i).num_rows)
+        self._row_group_offsets = offsets
+        self._num_row_groups = parquet_file.num_row_groups
+        self._total_rows = offsets[-1]
+        del parquet_file  # close handle; reopened lazily in workers
+
+    def _load_row_group(self, rg_idx: int) -> list[dict]:
+        if self._cached_row_group_idx == rg_idx and self._cached_row_group is not None:
+            return self._cached_row_group
+        parquet_file = pq.ParquetFile(self.path)
+        try:
+            df = parquet_file.read_row_group(rg_idx).to_pandas()
+        finally:
+            del parquet_file
+        rows = df.to_dict("records")
+        self._cached_row_group_idx = rg_idx
+        self._cached_row_group = rows
+        return rows
+
+    def _resolve_row_group(self, idx: int) -> tuple[int, int]:
+        # Map a global row ``idx`` to ``(row_group_index, index_within_row_group)``.
+        if not 0 <= idx < self._total_rows:
+            raise IndexError(f"index {idx} out of range for parquet file with {self._total_rows} rows")
+        offsets = self._row_group_offsets
+        # Binary search over the cumulative row counts; bisect_right skips empty row groups.
+        rg_idx = bisect.bisect_right(offsets, idx) - 1
+        return rg_idx, idx - offsets[rg_idx]
+
+    def _build_cut_from_row(self, row: dict, fallback_idx: int) -> Cut | None:
+        audio_data = row.get(self.audio_field)
+        if isinstance(audio_data, dict) and 'bytes' in audio_data:
+            audio_bytes = audio_data['bytes']
+        elif isinstance(audio_data, bytes):
+            audio_bytes = audio_data
+        else:
+            logging.warning(
+                f"Skipping row {fallback_idx}: Audio column '{self.audio_field}' format unrecognized."
+            )
+            return None
+
+        text = row.get(self.text_field, "")
+        language = row.get(self.lang_field, None)
+        row_id = str(row.get('id', f"{Path(self.path).stem}_{fallback_idx}"))
+        try:
+            recording = Recording.from_bytes(data=audio_bytes, recording_id=row_id)
+        except (RuntimeError, ValueError, TypeError) as e:
+            logging.warning(f"Skipping row {row_id}: Failed to decode audio bytes. {e}")
+            return None
+        cut = recording.to_cut()
+        cut.supervisions.append(
+            SupervisionSegment(
+                id=row_id,
+                recording_id=row_id,
+                start=0.0,
+                duration=cut.duration,
+                channel=0,
+                text=text,
+                language=language,
+            )
+        )
+        cut.custom = {k: v for k, v in row.items() if k != self.audio_field}
+        return cut
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "LazyParquetIterator only supports __getitem__ when constructed with indexed=True."
+            )
+        idx = int(normalize_graph_token(token))
+        if idx < 0:
+            idx += self._total_rows
+        if idx < 0 or idx >= self._total_rows:
+            raise IndexError(f"index {token} out of range for parquet file with {self._total_rows} rows")
+        rg_idx, local_idx = self._resolve_row_group(idx)
+        rows = self._load_row_group(rg_idx)
+        cut = self._build_cut_from_row(rows[local_idx], fallback_idx=idx)
+        if cut is None:
+            raise RuntimeError(
+                f"Row {idx} in {self.path} is not decodable; cannot satisfy random-access __getitem__."
+            )
+        return attach_graph_origin(cut, idx)
+
+    def __len__(self) -> int:
+        if self.indexed:
+            return self._total_rows
+        raise TypeError("LazyParquetIterator has unknown length unless constructed with indexed=True.")
+
+    def state_dict(self) -> dict:
+        if not self.indexed:
+            return {}
+        return {"position": self._position}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._position = sd.get("position", 0)
+        self._restored = True
 
     def __iter__(self) -> Generator[Cut, None, None]:
+        if self.indexed:
+            yield from self._iter_indexed()
+        else:
+            yield from self._iter_streaming()
+
+    def _iter_indexed(self) -> Generator[Cut, None, None]:
+        start = self._position if self._restored else 0
+        self._restored = False
+        n = self._total_rows
+        for i in range(start, n):
+            self._position = i + 1
+            rg_idx, local_idx = self._resolve_row_group(i)
+            rows = self._load_row_group(rg_idx)
+            cut = self._build_cut_from_row(rows[local_idx], fallback_idx=i)
+            if cut is None:
+                continue
+            attach_graph_origin(cut, i)
+            yield cut
+
+    def _iter_streaming(self) -> Generator[Cut, None, None]:
         # Open Parquet file in streaming mode inside __iter__
         # This ensures each DataLoader worker gets its own file handle.
         try:
@@ -786,53 +1346,7 @@ def __iter__(self) -> Generator[Cut, None, None]:
             df = batch.to_pandas()
 
             for idx, row in df.iterrows():
-                # 1. Extract Audio Bytes
-                # Handle HuggingFace format: {'bytes': b'...', 'path': '...'} or raw bytes
-                audio_data = row.get(self.audio_field)
-                if isinstance(audio_data, dict) and 'bytes' in audio_data:
-                    audio_bytes = audio_data['bytes']
-                elif isinstance(audio_data, bytes):
-                    audio_bytes = audio_data
-                else:
-                    logging.warning(f"Skipping row {idx}: Audio column '{self.audio_field}' format unrecognized.")
-                    continue
-
-                # 2. Extract Metadata
-                text = row.get(self.text_field, "")
-                language = row.get(self.lang_field, None)
-
-                # 3. Create Unique ID
-                # Use 'id' column if exists, else combine filename + index
-                row_id = str(row.get('id', f"{Path(self.path).stem}_{idx}"))
-
-                # 4. Create Lhotse Recording
-                try:
-                    recording = Recording.from_bytes(
-                        data=audio_bytes,
-                        recording_id=row_id,
-                    )
-                except (RuntimeError, ValueError, TypeError) as e:
-                    logging.warning(f"Skipping row {row_id}: Failed to decode audio bytes. {e}")
+                cut = self._build_cut_from_row(row, fallback_idx=idx)
+                if cut is None:
                     continue
-
-                # 5. Create Cut
-                cut = recording.to_cut()
-
-                # Add Supervision (Transcript)
-                cut.supervisions.append(
-                    SupervisionSegment(
-                        id=row_id,
-                        recording_id=row_id,
-                        start=0.0,
-                        duration=cut.duration,
-                        channel=0,
-                        text=text,
-                        language=language,
-                    )
-                )
-
-                # Attach any extra metadata from the row to cut.custom
-                # (Exclude the heavy audio bytes to save RAM)
-                cut.custom = {k: v for k, v in row.items() if k != self.audio_field}
-
                 yield cut
diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py
index 8022e9c9e61e..f2d161528bb3 100644
--- a/nemo/collections/common/data/lhotse/text_adapters.py
+++ b/nemo/collections/common/data/lhotse/text_adapters.py
@@ -33,6 +33,8 @@
 from lhotse.shar import AudioTarWriter, JsonlShardWriter
 from lhotse.utils import Pathlike, compute_num_samples, is_valid_url
 
+from lhotse.lazy import IteratorNode, attach_graph_origin, normalize_graph_token
+
 from nemo.collections.common.data.lhotse.indexed_adapters import (
     IndexedJSONLReader,
     IndexedTarSampleReader,
@@ -132,10 +134,13 @@ def __iter__(self) -> Iterator[TextExample]:
 
 
 @dataclass
-class LhotseTextJsonlAdapter:
+class LhotseTextJsonlAdapter(IteratorNode):
     """
     ``LhotseTextJsonlAdapter`` is used to read a JSONL file and wrap
     the text field of each line into a ``TextExample``.
+
+    Set ``indexed=True`` to enable O(1) random access plus graph-token
+    checkpointing (requires uncompressed ``.jsonl`` paths).
     """
 
     paths: Union[Pathlike, list[Pathlike]]
@@ -143,11 +148,97 @@ class LhotseTextJsonlAdapter:
     text_field: str = "text"
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
+    indexed: bool = False
 
     def __post_init__(self):
         self.paths = expand_sharded_filepaths(self.paths)
+        self._readers: list = []
+        self._cum_lens: list[int] = []
+        self._position = 0
+        self._restored = False
+        if self.indexed:
+            from lhotse.indexing import IndexedJsonlReader
+
+            for p in self.paths:
+                self._readers.append(IndexedJsonlReader(p))
+            cum = 0
+            self._cum_lens.append(cum)
+            for r in self._readers:
+                cum += len(r)
+                self._cum_lens.append(cum)
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def __len__(self) -> int:
+        if not self.indexed:
+            raise TypeError("LhotseTextJsonlAdapter has unknown length unless constructed with indexed=True.")
+        return self._cum_lens[-1] if self._cum_lens else 0
+
+    def _resolve(self, idx: int) -> tuple[int, int]:  # global index -> (reader index, local index)
+        if idx < 0:
+            idx += self._cum_lens[-1]  # negative indices count from the end
+        for s in range(len(self._readers)):
+            if 0 <= idx < self._cum_lens[s + 1]:  # lower bound: a still-negative idx is out of range
+                return s, idx - self._cum_lens[s]
+        raise IndexError(idx)
+
+    def _data_to_example(self, data: dict) -> TextExample | None:
+        if self.text_field not in data:
+            return None
+        return TextExample(data[self.text_field], language=self.language)
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError("LhotseTextJsonlAdapter only supports __getitem__ when indexed=True.")
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        ex = self._data_to_example(self._readers[shard_idx][local_idx])
+        if ex is None:
+            raise RuntimeError(
+                f"Index {idx} in {self.paths[shard_idx]} has no '{self.text_field}' field; "
+                f"cannot satisfy random-access __getitem__."
+            )
+        return attach_graph_origin(ex, idx)
+
+    def state_dict(self) -> dict:
+        return {"position": self._position} if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._position = sd.get("position", 0)
+        self._restored = True
 
     def __iter__(self) -> Iterator[TextExample]:
+        if self.indexed:
+            yield from self._iter_indexed()
+        else:
+            yield from self._iter_streaming()
+
+    def _iter_indexed(self) -> Iterator[TextExample]:
+        start = self._position if self._restored else 0
+        self._restored = False
+        n = self._cum_lens[-1] if self._cum_lens else 0
+        for i in range(start, n):
+            self._position = i + 1
+            shard_idx, local_idx = self._resolve(i)
+            ex = self._data_to_example(self._readers[shard_idx][local_idx])
+            if ex is None:
+                continue
+            attach_graph_origin(ex, i)
+            yield ex
+
+    def _iter_streaming(self) -> Iterator[TextExample]:
         paths = self.paths
         if self.shuffle_shards:
             seed = resolve_seed(self.shard_seed)
@@ -296,7 +387,7 @@ def default_sft_prompt_format_fn(example: NeMoSFTExample, prompt):
 
 
 @dataclass
-class NeMoSFTJsonlAdapter:
+class NeMoSFTJsonlAdapter(IteratorNode):
     """
     ``NeMoSFTJsonlAdapter`` is used to read a NeMo LM SFT Chat JSONL file and yield objects of type
     ``NeMoSFTExample`` that can be sampled with Lhotse.
@@ -318,17 +409,94 @@ class NeMoSFTJsonlAdapter:
             "dataset": str,
             "category": str,
         }
+
+    Set ``indexed=True`` to enable O(1) random access plus graph-token
+    checkpointing (requires uncompressed ``.jsonl`` paths).
     """
 
     paths: Union[Pathlike, list[Pathlike]]
     language: str | None = None
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
+    indexed: bool = False
 
     def __post_init__(self):
         self.paths = expand_sharded_filepaths(self.paths)
+        self._readers: list = []
+        self._cum_lens: list[int] = []
+        self._position = 0
+        self._restored = False
+        if self.indexed:
+            from lhotse.indexing import IndexedJsonlReader
+
+            for p in self.paths:
+                self._readers.append(IndexedJsonlReader(p))
+            cum = 0
+            self._cum_lens.append(cum)
+            for r in self._readers:
+                cum += len(r)
+                self._cum_lens.append(cum)
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def __len__(self) -> int:
+        if not self.indexed:
+            raise TypeError("NeMoSFTJsonlAdapter has unknown length unless constructed with indexed=True.")
+        return self._cum_lens[-1] if self._cum_lens else 0
+
+    def _resolve(self, idx: int) -> tuple[int, int]:  # global index -> (reader index, local index)
+        if idx < 0:
+            idx += self._cum_lens[-1]  # negative indices count from the end
+        for s in range(len(self._readers)):
+            if 0 <= idx < self._cum_lens[s + 1]:  # a still-negative idx must raise, not hit shard 0
+                return s, idx - self._cum_lens[s]
+        raise IndexError(idx)
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError("NeMoSFTJsonlAdapter only supports __getitem__ when indexed=True.")
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        ex = NeMoSFTExample(self._readers[shard_idx][local_idx], language=self.language)
+        return attach_graph_origin(ex, idx)
+
+    def state_dict(self) -> dict:
+        return {"position": self._position} if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._position = sd.get("position", 0)
+        self._restored = True
 
     def __iter__(self) -> Iterator[NeMoSFTExample]:
+        if self.indexed:
+            yield from self._iter_indexed()
+        else:
+            yield from self._iter_streaming()
+
+    def _iter_indexed(self) -> Iterator[NeMoSFTExample]:
+        start = self._position if self._restored else 0
+        self._restored = False
+        n = self._cum_lens[-1] if self._cum_lens else 0
+        for i in range(start, n):
+            self._position = i + 1
+            shard_idx, local_idx = self._resolve(i)
+            ex = NeMoSFTExample(self._readers[shard_idx][local_idx], language=self.language)
+            attach_graph_origin(ex, i)
+            yield ex
+
+    def _iter_streaming(self) -> Iterator[NeMoSFTExample]:
         paths = self.paths
         if self.shuffle_shards:
             seed = resolve_seed(self.shard_seed)
@@ -596,7 +764,7 @@ def _make_url_cut(
 
 
 @dataclass
-class NeMoMultimodalConversationJsonlAdapter:
+class NeMoMultimodalConversationJsonlAdapter(IteratorNode):
     """
     ``NeMoMultimodalConversationJsonlAdapter`` is used to read a NeMo multimodal conversation JSONL
     and yield objects of type ``NeMoMultimodalConversation`` that can be sampled with Lhotse.
@@ -615,6 +783,11 @@ class NeMoMultimodalConversationJsonlAdapter:
                 ...
             ],
         }
+
+    Set ``indexed=True`` to enable O(1) random access plus graph-token
+    checkpointing. Indexed mode requires uncompressed JSONL manifests; for the
+    tarred path it additionally requires uncompressed tar shards (the canonical
+    ``.idx`` sidecars are built lazily on first construction).
     """
 
     manifest_filepath: str | list[str]
@@ -626,6 +799,7 @@ class NeMoMultimodalConversationJsonlAdapter:
     system_prompt: str | None = None
     context: str | None = None
     slice_length: int | None = None
+    indexed: bool = False
 
     def __post_init__(self):
         self.manifest_filepath = expand_sharded_filepaths(self.manifest_filepath)
@@ -635,13 +809,226 @@ def __post_init__(self):
                 self.tarred_audio_filepaths
             ), f"{len(self.manifest_filepath)} != {len(self.tarred_audio_filepaths)}"
         self.epoch = 0
+        self._cuts_readers: list = []
+        self._tar_readers: list = []
+        self._cum_lens: list[int] = []
+        self._total_len = 0
+        self._position = 0
+        self._restored = False
+        if self.indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        from lhotse.indexing import IndexedJsonlReader
+
+        if self.slice_length is not None:
+            raise ValueError(
+                "NeMoMultimodalConversationJsonlAdapter(indexed=True) does not support slice_length."
+            )
+        for p in self.manifest_filepath:
+            self._cuts_readers.append(IndexedJsonlReader(p))
+        if self.tarred_audio_filepaths is not None:
+            from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
+
+            for p in self.tarred_audio_filepaths:
+                self._tar_readers.append(IndexedTarMemberReader(p))
+        cum = 0
+        self._cum_lens.append(cum)
+        for r in self._cuts_readers:
+            cum += len(r)
+            self._cum_lens.append(cum)
+        self._total_len = cum
+
+    def __len__(self) -> int:
+        if self.indexed:
+            return self._total_len
+        raise TypeError(
+            "NeMoMultimodalConversationJsonlAdapter has unknown length unless constructed with indexed=True."
+        )
+
+    def _resolve(self, idx: int) -> tuple[int, int]:
+        if idx < 0:  # negative indices count back from the global end, like list indexing
+            idx += self._total_len
+        for s in range(len(self._cuts_readers)):
+            if 0 <= idx < self._cum_lens[s + 1]:  # lower bound: idx < -len must raise, not hit shard 0
+                return s, idx - self._cum_lens[s]  # (shard index, offset within that shard)
+        raise IndexError(idx)
+
+    def state_dict(self) -> dict:
+        return {"position": self._position, "epoch": self.epoch} if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._position = sd.get("position", 0)
+        self.epoch = sd.get("epoch", 0)
+        self._restored = True
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "NeMoMultimodalConversationJsonlAdapter only supports __getitem__ when indexed=True."
+            )
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        data = self._cuts_readers[shard_idx][local_idx]
+        if self._tar_readers:
+            convo = self._build_conversation_tarred(
+                data,
+                tar_reader=self._tar_readers[shard_idx],
+                tar_path=self.tarred_audio_filepaths[shard_idx],
+            )
+        else:
+            convo = self._build_conversation_local(
+                data, manifest_path=self._cuts_readers[shard_idx].path
+            )
+        if convo is None:
+            raise RuntimeError(
+                f"Conversation at index {idx} (shard {shard_idx}, local {local_idx}) "
+                f"could not be built; cannot satisfy random-access __getitem__."
+            )
+        return attach_graph_origin(convo, idx)
+
+    def _build_conversation_local(self, data: dict, manifest_path: str) -> NeMoMultimodalConversation | None:
+        if self._should_skip(data):
+            return None
+        turns = [
+            (
+                TextTurn(
+                    value=turn["value"],
+                    role=turn["from"].lower(),
+                )
+                if turn["type"] == "text"
+                else AudioTurn(
+                    cut=(
+                        cut := Recording.from_file(get_full_path(turn["value"], manifest_path))
+                        .to_cut()
+                        .truncate(offset=turn.get("offset", 0.0), duration=turn.get("duration"))
+                    ).with_id(self._make_cut_id(cut, turn)),
+                    text=cut.supervisions[0].text if cut.supervisions else None,
+                    role=turn["from"].lower(),
+                    audio_locator_tag=self.audio_locator_tag,
+                )
+            )
+            for turn in data["conversations"]
+        ]
+        if self.context is not None and turns[0].role == "user" and isinstance(turns[0], AudioTurn):
+            turns = [TextTurn(role="user", value=self.context)] + turns
+        if self.system_prompt is not None and turns[0].role != "system":
+            turns = [TextTurn(role="system", value=self.system_prompt)] + turns
+        return NeMoMultimodalConversation(
+            id=data["id"],
+            turns=turns,
+            token_equivalent_duration=self.token_equivalent_duration,
+            custom=data.get("custom"),
+        )
+
+    def _build_conversation_tarred(
+        self, data: dict, tar_reader, tar_path: str
+    ) -> NeMoMultimodalConversation | None:
+        import io as _io
+
+        import soundfile as _sf
+        from lhotse import AudioSource as _AudioSource
+        from lhotse import Recording as _Recording
+
+        if self._should_skip(data):
+            return None
+        cuts: list = []
+        for turn in data["conversations"]:
+            if turn["type"] != "audio":
+                continue
+            audio_bytes = tar_reader.get(turn["value"])
+            try:
+                meta = _sf.info(_io.BytesIO(audio_bytes))
+            except Exception:
+                logging.warning(f"Skipped corrupted audio member '{turn['value']}' in {tar_path=}.")
+                return None
+            recording = _Recording(
+                id=turn["value"],
+                sources=[_AudioSource(type="memory", channels=list(range(meta.channels)), source=audio_bytes)],
+                sampling_rate=int(meta.samplerate),
+                num_samples=meta.frames,
+                duration=meta.duration,
+            )
+            cut = recording.to_cut().truncate(
+                offset=turn.get("offset", 0.0), duration=turn.get("duration")
+            )
+            cut = cut.with_id(self._make_cut_id(cut, turn))
+            cuts.append(cut)
+        cuts = deque(cuts)
+        turns = [
+            (
+                TextTurn(
+                    value=turn["value"],
+                    role=turn["from"].lower(),
+                )
+                if turn["type"] == "text"
+                else AudioTurn(
+                    cut=(c := cuts.popleft()),
+                    text=c.supervisions[0].text if c.supervisions else None,
+                    role=turn["from"].lower(),
+                    audio_locator_tag=self.audio_locator_tag,
+                )
+            )
+            for turn in data["conversations"]
+        ]
+        if self.context is not None and turns[0].role == "user" and isinstance(turns[0], AudioTurn):
+            turns = [TextTurn(role="user", value=self.context)] + turns
+        if self.system_prompt is not None and turns[0].role != "system":
+            turns = [TextTurn(role="system", value=self.system_prompt)] + turns
+        return NeMoMultimodalConversation(
+            id=data["id"],
+            turns=turns,
+            token_equivalent_duration=self.token_equivalent_duration,
+            custom=data.get("custom"),
+        )
 
     def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
+        if self.indexed:
+            yield from self._iter_indexed()
+            return
         if self.tarred_audio_filepaths is not None:
             yield from self._iter_tar()
         else:
             yield from self._iter_jsonl()
 
+    def _iter_indexed(self) -> Iterator[NeMoMultimodalConversation]:
+        start = self._position if self._restored else 0
+        self._restored = False
+        n = self._total_len
+        for i in range(start, n):
+            self._position = i + 1
+            shard_idx, local_idx = self._resolve(i)
+            data = self._cuts_readers[shard_idx][local_idx]
+            if self._tar_readers:
+                convo = self._build_conversation_tarred(
+                    data,
+                    tar_reader=self._tar_readers[shard_idx],
+                    tar_path=self.tarred_audio_filepaths[shard_idx],
+                )
+            else:
+                convo = self._build_conversation_local(
+                    data, manifest_path=self._cuts_readers[shard_idx].path
+                )
+            if convo is None:
+                continue
+            attach_graph_origin(convo, i)
+            yield convo
+        self.epoch += 1
+
     def _should_skip(self, example: dict) -> bool:
         custom = example.get("custom")
         if custom is None:
@@ -845,7 +1232,7 @@ def _create_sharegpt_turns(audio_locator_tag: str, conversations: list[dict], re
 
 
 @dataclass
-class NeMoMultimodalConversationShareGPTJsonlAdapter:
+class NeMoMultimodalConversationShareGPTJsonlAdapter(IteratorNode):
     """
     ``NeMoMultimodalConversationShareGPTJsonlAdapter`` is used to read a ShareGPT format multimodal
     conversation JSONL and yield objects of type ``NeMoMultimodalConversation`` that can be sampled with Lhotse.
@@ -878,6 +1265,7 @@ class NeMoMultimodalConversationShareGPTJsonlAdapter:
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
     slice_length: int | None = None
+    indexed: bool = False
 
     def __post_init__(self):
         self.manifest_filepath = expand_sharded_filepaths(self.manifest_filepath)
@@ -889,8 +1277,132 @@ def __post_init__(self):
         self.audio_placeholders = _normalize_audio_placeholders(self.audio_placeholders)
         self._has_index = all(Path(p + ".idx").exists() for p in self.manifest_filepath)
         self.epoch = 0
+        self._cuts_readers: list = []
+        self._tar_readers: list = []
+        self._cum_lens: list[int] = []
+        self._total_len = 0
+        self._position = 0
+        self._restored = False
+        if self.indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        from lhotse.indexing import IndexedJsonlReader
+
+        if self.slice_length is not None:
+            raise ValueError(
+                "NeMoMultimodalConversationShareGPTJsonlAdapter(indexed=True) does not support slice_length."
+            )
+        for p in self.manifest_filepath:
+            self._cuts_readers.append(IndexedJsonlReader(p))
+        if self.tarred_audio_filepaths is not None:
+            from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
+
+            for p in self.tarred_audio_filepaths:
+                self._tar_readers.append(IndexedTarMemberReader(p))
+        cum = 0
+        self._cum_lens.append(cum)
+        for r in self._cuts_readers:
+            cum += len(r)
+            self._cum_lens.append(cum)
+        self._total_len = cum
+
+    def __len__(self) -> int:
+        if self.indexed:
+            return self._total_len
+        raise TypeError(
+            "NeMoMultimodalConversationShareGPTJsonlAdapter has unknown length unless constructed with indexed=True."
+        )
+
+    def _resolve(self, idx: int) -> tuple[int, int]:
+        if idx < 0:  # negative indices count back from the global end, like list indexing
+            idx += self._total_len
+        for s in range(len(self._cuts_readers)):
+            if 0 <= idx < self._cum_lens[s + 1]:  # lower bound: idx < -len must raise, not hit shard 0
+                return s, idx - self._cum_lens[s]  # (shard index, offset within that shard)
+        raise IndexError(idx)
+
+    def state_dict(self) -> dict:
+        return {"position": self._position, "epoch": self.epoch} if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._position = sd.get("position", 0)
+        self.epoch = sd.get("epoch", 0)
+        self._restored = True
+
+    def _build_one(self, data: dict, shard_idx: int) -> NeMoMultimodalConversation:
+        conversations = _transform_sharegpt(self.audio_placeholders, data)
+        if self._tar_readers:
+            tar_reader = self._tar_readers[shard_idx]
+            tar_path = self.tarred_audio_filepaths[shard_idx]
+            return NeMoMultimodalConversation(
+                id=data.get("id", "missing-example-id"),
+                turns=_create_sharegpt_turns(
+                    self.audio_locator_tag,
+                    conversations,
+                    lambda t: self._resolve_cut_from_indexed_tar(t, tar_reader, tar_path),
+                ),
+                token_equivalent_duration=self.token_equivalent_duration,
+            )
+        manifest_path = self._cuts_readers[shard_idx].path
+        return NeMoMultimodalConversation(
+            id=data.get("id", "missing-example-id"),
+            turns=_create_sharegpt_turns(
+                self.audio_locator_tag,
+                conversations,
+                lambda t, _p=manifest_path: self._resolve_cut_from_path(t, _p),
+            ),
+            token_equivalent_duration=self.token_equivalent_duration,
+        )
+
+    def _resolve_cut_from_indexed_tar(self, turn, tar_reader, tar_path):
+        import io as _io
+
+        import soundfile as _sf
+        from lhotse import AudioSource as _AudioSource
+        from lhotse import Recording as _Recording
+
+        audio_bytes = tar_reader.get(turn["value"])
+        meta = _sf.info(_io.BytesIO(audio_bytes))
+        recording = _Recording(
+            id=turn["value"],
+            sources=[_AudioSource(type="memory", channels=list(range(meta.channels)), source=audio_bytes)],
+            sampling_rate=int(meta.samplerate),
+            num_samples=meta.frames,
+            duration=meta.duration,
+        )
+        cut = recording.to_cut().truncate(offset=turn.get("offset", 0.0), duration=turn.get("duration"))
+        return cut.with_id(self._make_cut_id(cut, turn))
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "NeMoMultimodalConversationShareGPTJsonlAdapter only supports __getitem__ when indexed=True."
+            )
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        data = self._cuts_readers[shard_idx][local_idx]
+        convo = self._build_one(data, shard_idx)
+        return attach_graph_origin(convo, idx)
 
     def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
+        if self.indexed:
+            yield from self._iter_indexed_node()
+            return
         if self.tarred_audio_filepaths is not None:
             yield from self._iter_tar()
         elif self.shuffle_shards and self._has_index:
@@ -898,6 +1410,19 @@ def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
         else:
             yield from self._iter_jsonl()
 
+    def _iter_indexed_node(self) -> Iterator[NeMoMultimodalConversation]:
+        start = self._position if self._restored else 0
+        self._restored = False
+        n = self._total_len
+        for i in range(start, n):
+            self._position = i + 1
+            shard_idx, local_idx = self._resolve(i)
+            data = self._cuts_readers[shard_idx][local_idx]
+            convo = self._build_one(data, shard_idx)
+            attach_graph_origin(convo, i)
+            yield convo
+        self.epoch += 1
+
     def _get_rng(self) -> random.Random:
         return random.Random(resolve_seed(self.shard_seed) + self.epoch)
 
@@ -1024,7 +1549,7 @@ def _iter_jsonl_indexed(self):
 
 
 @dataclass
-class NeMoMultimodalConversationShareGPTWebdatasetAdapter:
+class NeMoMultimodalConversationShareGPTWebdatasetAdapter(IteratorNode):
     """
     ``NeMoMultimodalConversationShareGPTWebdatasetAdapter`` reads ShareGPT format multimodal
     conversations from WebDataset tar archives and yields ``NeMoMultimodalConversation`` objects.
@@ -1059,6 +1584,7 @@ class NeMoMultimodalConversationShareGPTWebdatasetAdapter:
     token_equivalent_duration: float = None
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
+    indexed: bool = False
 
     def __post_init__(self):
         import json as _json
@@ -1075,13 +1601,94 @@ def __post_init__(self):
         self.audio_placeholders = _normalize_audio_placeholders(self.audio_placeholders)
         self._has_index = all(Path(p + ".idx").exists() for p in self._shard_paths)
         self.epoch = 0
+        self._tar_readers: list = []
+        self._cum_lens: list[int] = []
+        self._total_len = 0
+        self._position = 0
+        self._restored = False
+        if self.indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        for p in self._shard_paths:
+            self._tar_readers.append(IndexedTarSampleReader(p))
+        cum = 0
+        self._cum_lens.append(cum)
+        for r in self._tar_readers:
+            cum += len(r)
+            self._cum_lens.append(cum)
+        self._total_len = cum
+
+    def __len__(self) -> int:
+        if self.indexed:
+            return self._total_len
+        raise TypeError(
+            "NeMoMultimodalConversationShareGPTWebdatasetAdapter has unknown length unless constructed with indexed=True."
+        )
+
+    def _resolve(self, idx: int) -> tuple[int, int]:
+        if idx < 0:  # negative indices count back from the global end, like list indexing
+            idx += self._total_len
+        for s in range(len(self._tar_readers)):
+            if 0 <= idx < self._cum_lens[s + 1]:  # lower bound: idx < -len must raise, not hit shard 0
+                return s, idx - self._cum_lens[s]  # (shard index, offset within that shard)
+        raise IndexError(idx)
+
+    def state_dict(self) -> dict:
+        return {"position": self._position, "epoch": self.epoch} if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._position = sd.get("position", 0)
+        self.epoch = sd.get("epoch", 0)
+        self._restored = True
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "NeMoMultimodalConversationShareGPTWebdatasetAdapter only supports __getitem__ when indexed=True."
+            )
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        json_data, audio_bytes, audio_name = self._tar_readers[shard_idx][local_idx]
+        convo = self._yield_from_sample(json_data, audio_bytes, audio_name)
+        return attach_graph_origin(convo, idx)
 
     def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
+        if self.indexed:
+            yield from self._iter_indexed_node()
+            return
         if self.shuffle_shards and self._has_index:
             yield from self._iter_indexed()
         else:
             yield from self._iter_sequential()
 
+    def _iter_indexed_node(self) -> Iterator[NeMoMultimodalConversation]:
+        start = self._position if self._restored else 0
+        self._restored = False
+        n = self._total_len
+        for i in range(start, n):
+            self._position = i + 1
+            shard_idx, local_idx = self._resolve(i)
+            json_data, audio_bytes, audio_name = self._tar_readers[shard_idx][local_idx]
+            convo = self._yield_from_sample(json_data, audio_bytes, audio_name)
+            attach_graph_origin(convo, i)
+            yield convo
+        self.epoch += 1
+
     def _get_rng(self) -> random.Random:
         return random.Random(resolve_seed(self.shard_seed) + self.epoch)
 
diff --git a/nemo/collections/speechlm2/models/salm_automodel.py b/nemo/collections/speechlm2/models/salm_automodel.py
index f759ac01bcc7..ac9000404a94 100644
--- a/nemo/collections/speechlm2/models/salm_automodel.py
+++ b/nemo/collections/speechlm2/models/salm_automodel.py
@@ -234,7 +234,22 @@ def on_fit_start(self) -> None:
         averaging (see ``_configure_moe_aux_loss_scaler``)."""
         self._configure_moe_aux_loss_scaler()
 
-    def training_step(self, batch: dict, batch_idx: int):
+    def training_step(self, dataloader_iter):
+        # Use the explicit ``dataloader_iter`` signature so Lightning selects
+        # ``_DataLoaderIterDataFetcher`` (no upfront prefetch). With
+        # ``_PrefetchDataFetcher`` Lightning re-primes one batch from the
+        # dataloader every time iteration starts (including on resume), which
+        # advances the StatefulDataLoader past the saved snapshot point and
+        # breaks bit-identical resumption. The dataloader_iter path consumes
+        # one batch per training step, so save/restore captures the exact
+        # next-batch position.
+        batch, batch_idx, _ = next(dataloader_iter)
+        # Move to device + apply precision conversions normally done by Lightning
+        # for the prefetch fetcher path.
+        batch = self.trainer.precision_plugin.convert_input(batch)
+        batch = self._on_before_batch_transfer(batch, dataloader_idx=0)
+        batch = self.trainer.strategy.batch_to_device(batch, dataloader_idx=0)
+
         self._current_batch_idx = batch_idx
         for m in (self.perception.preprocessor, self.perception.encoder, self.llm):
             if is_frozen(m):
@@ -286,8 +301,10 @@ def training_step(self, batch: dict, batch_idx: int):
             "target_to_input_ratio": num_frames / (B * T),
             "padding_ratio": (batch["input_ids"] != self.text_pad_id).long().sum() / batch["input_ids"].numel(),
         }
-        self.log("loss", loss_display, on_step=True, prog_bar=True)
-        self.log_dict({k: v for k, v in ans.items() if k != "loss"}, on_step=True)
+        # batch_size kwarg is required by Lightning when training_step uses
+        # the ``dataloader_iter`` signature (it can't auto-infer otherwise).
+        self.log("loss", loss_display, on_step=True, prog_bar=True, batch_size=B)
+        self.log_dict({k: v for k, v in ans.items() if k != "loss"}, on_step=True, batch_size=B)
         self.maybe_log_moe_metrics(batch_idx)
         return ans
 

From 8a482e437ccfde7fb8222174a2c5a075fac30d3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Tue, 5 May 2026 09:23:10 -0700
Subject: [PATCH 03/30] refactor read_batch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../speechlm2/models/salm_automodel.py        | 20 ++------
 nemo/core/utils/lightning_utils.py            | 49 +++++++++++++++++++
 2 files changed, 54 insertions(+), 15 deletions(-)
 create mode 100644 nemo/core/utils/lightning_utils.py

diff --git a/nemo/collections/speechlm2/models/salm_automodel.py b/nemo/collections/speechlm2/models/salm_automodel.py
index ac9000404a94..8cf8ee7b3eec 100644
--- a/nemo/collections/speechlm2/models/salm_automodel.py
+++ b/nemo/collections/speechlm2/models/salm_automodel.py
@@ -40,6 +40,7 @@
     update_perception_output_dim,
 )
 from nemo.core.neural_types import AudioSignal, LabelsType, LengthsType, MaskType, NeuralType
+from nemo.core.utils.lightning_utils import read_batch
 
 
 class SALMAutomodel(LightningModule, HFHubMixin):
@@ -235,21 +236,10 @@ def on_fit_start(self) -> None:
         self._configure_moe_aux_loss_scaler()
 
     def training_step(self, dataloader_iter):
-        # Use the explicit ``dataloader_iter`` signature so Lightning selects
-        # ``_DataLoaderIterDataFetcher`` (no upfront prefetch). With
-        # ``_PrefetchDataFetcher`` Lightning re-primes one batch from the
-        # dataloader every time iteration starts (including on resume), which
-        # advances the StatefulDataLoader past the saved snapshot point and
-        # breaks bit-identical resumption. The dataloader_iter path consumes
-        # one batch per training step, so save/restore captures the exact
-        # next-batch position.
-        batch, batch_idx, _ = next(dataloader_iter)
-        # Move to device + apply precision conversions normally done by Lightning
-        # for the prefetch fetcher path.
-        batch = self.trainer.precision_plugin.convert_input(batch)
-        batch = self._on_before_batch_transfer(batch, dataloader_idx=0)
-        batch = self.trainer.strategy.batch_to_device(batch, dataloader_idx=0)
-
+        # ``dataloader_iter`` signature → Lightning selects
+        # ``_DataLoaderIterDataFetcher`` (no prefetch) which is required for
+        # bit-identical checkpoint resumption. See ``read_batch`` docstring.
+        batch, batch_idx = read_batch(dataloader_iter, self)
         self._current_batch_idx = batch_idx
         for m in (self.perception.preprocessor, self.perception.encoder, self.llm):
             if is_frozen(m):
diff --git a/nemo/core/utils/lightning_utils.py b/nemo/core/utils/lightning_utils.py
new file mode 100644
index 000000000000..77c88942ac9f
--- /dev/null
+++ b/nemo/core/utils/lightning_utils.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Helpers for working with PyTorch Lightning's ``training_step``."""
+from typing import Any, Iterator, Tuple
+
+import lightning.pytorch as pl
+
+
+def read_batch(dataloader_iter: Iterator, model: pl.LightningModule) -> Tuple[Any, int]:
+    """Pull the next batch from a Lightning ``dataloader_iter`` and apply the
+    device/precision conversions that ``_PrefetchDataFetcher`` would have
+    applied for the default ``training_step(batch, batch_idx)`` signature.
+
+    Use this from a ``training_step(self, dataloader_iter)``-style step. That
+    signature makes Lightning select ``_DataLoaderIterDataFetcher`` (no
+    prefetch), which is required for bit-identical checkpoint resumption with
+    a stateful dataloader: the default ``_PrefetchDataFetcher`` re-primes one
+    batch on every iter init (including on resume), advancing the stateful
+    dataloader past the saved snapshot point and giving the resumed run a
+    one-batch drift versus the continuous run.
+
+    Args:
+        dataloader_iter: The iterator passed by Lightning into a
+            ``training_step(self, dataloader_iter)`` (an instance of
+            ``_DataFetcherWrapper``). Yields ``(batch, batch_idx, dataloader_idx)``.
+        model: The ``LightningModule`` whose ``trainer`` carries the precision
+            plugin and strategy used to move the batch to device.
+
+    Returns:
+        ``(batch, batch_idx)`` — batch is already converted to the right
+        precision and moved to the model's device, ready for forward.
+    """
+    batch, batch_idx, dataloader_idx = next(dataloader_iter)
+    trainer = model.trainer
+    batch = trainer.precision_plugin.convert_input(batch)
+    batch = model._on_before_batch_transfer(batch, dataloader_idx=dataloader_idx)
+    batch = trainer.strategy.batch_to_device(batch, dataloader_idx=dataloader_idx)
+    return batch, batch_idx

From 086f0e3491e263a5f9b2a5c9d5f5dcded644b7de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Tue, 5 May 2026 16:53:32 -0700
Subject: [PATCH 04/30] refactor/cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../common/data/lhotse/indexed_adapters.py    |  49 +++--
 .../common/data/lhotse/nemo_adapters.py       | 205 ++++++------------
 2 files changed, 100 insertions(+), 154 deletions(-)

diff --git a/nemo/collections/common/data/lhotse/indexed_adapters.py b/nemo/collections/common/data/lhotse/indexed_adapters.py
index edb7e6e86400..597e6c1f4726 100644
--- a/nemo/collections/common/data/lhotse/indexed_adapters.py
+++ b/nemo/collections/common/data/lhotse/indexed_adapters.py
@@ -24,6 +24,10 @@
 # Knuth's multiplicative hash constant (golden-ratio derived, 32-bit).
 _KNUTH_HASH = 2654435761
 
+# Tar block size + the all-zeros block that marks end-of-archive in tar.
+_TAR_BLOCK_SIZE = 512
+_TAR_ZERO_BLOCK = b'\0' * _TAR_BLOCK_SIZE
+
 
 class LazyShuffledRange:
     """
@@ -184,8 +188,8 @@ def _validate_index(self):
             last = int(self.offsets[self._len - 1])
             with open(self.data_path, 'rb') as f:
                 f.seek(last)
-                buf = f.read(512)
-            if len(buf) < 512 or buf == b'\0' * 512:
+                buf = f.read(_TAR_BLOCK_SIZE)
+            if len(buf) < _TAR_BLOCK_SIZE or buf == _TAR_ZERO_BLOCK:
                 self._len -= 1
             else:
                 break
@@ -193,13 +197,13 @@ def _validate_index(self):
     def _check_offset_is_tar_header(self, offset: int, label: str = ""):
         with open(self.data_path, 'rb') as f:
             f.seek(offset)
-            buf = f.read(512)
-        if len(buf) < 512:
+            buf = f.read(_TAR_BLOCK_SIZE)
+        if len(buf) < _TAR_BLOCK_SIZE:
             raise ValueError(
                 f"Tar index for {self.data_path}: {label} offset {offset} "
                 f"is too close to EOF (file size {self._data_size})."
             )
-        if buf == b'\0' * 512:
+        if buf == _TAR_ZERO_BLOCK:
             raise ValueError(
                 f"Tar index for {self.data_path}: {label} offset {offset} "
                 f"points to a zero block (end-of-archive marker), not a tar header. "
@@ -328,11 +332,10 @@ def _build_name_index(self) -> dict[str, int]:
         name_to_idx: dict[str, int] = {}
         self._ensure_open()
         for i in range(self._len):
-            offset = int(self.offsets[i])
-            self._fh.seek(offset)
+            self._fh.seek(int(self.offsets[i]))
             while True:
-                header = self._fh.read(512)
-                if len(header) < 512 or header == b"\0" * 512:
+                header = self._fh.read(_TAR_BLOCK_SIZE)
+                if len(header) < _TAR_BLOCK_SIZE or header == _TAR_ZERO_BLOCK:
                     break
                 info = tarfile.TarInfo.frombuf(
                     header, tarfile.ENCODING, "surrogateescape"
@@ -340,9 +343,8 @@ def _build_name_index(self) -> dict[str, int]:
                 if info.type in (tarfile.REGTYPE, tarfile.AREGTYPE):
                     name_to_idx[info.name] = i
                     break
-                # Non-regular (PAX header, GNU long-name, etc.):
-                # skip its data + 512-byte padding and continue.
-                size_blocks = (info.size + 511) // 512 * 512
+                # Skip non-regular member (PAX/GNU long-name) data + padding.
+                size_blocks = -(-info.size // _TAR_BLOCK_SIZE) * _TAR_BLOCK_SIZE
                 self._fh.seek(size_blocks, 1)
         return name_to_idx
 
@@ -378,16 +380,16 @@ def _read_tar_member(f):
     arbitrary byte offset and read just the members we need in O(1).
     """
     while True:
-        header_buf = f.read(512)
-        if len(header_buf) < 512 or header_buf == b'\0' * 512:
+        header_buf = f.read(_TAR_BLOCK_SIZE)
+        if len(header_buf) < _TAR_BLOCK_SIZE or header_buf == _TAR_ZERO_BLOCK:
             raise EOFError("End of tar archive or unexpected EOF")
         info = tarfile.TarInfo.frombuf(header_buf, tarfile.ENCODING, "surrogateescape")
         data = f.read(info.size)
         if len(data) < info.size:
             raise EOFError("Unexpected end of tar file while reading data")
-        remainder = info.size % 512
+        remainder = info.size % _TAR_BLOCK_SIZE
         if remainder:
-            f.seek(512 - remainder, 1)
+            f.seek(_TAR_BLOCK_SIZE - remainder, 1)
         if info.type not in (tarfile.REGTYPE, tarfile.AREGTYPE):
             continue
         return info.name, data
@@ -399,10 +401,14 @@ def create_index(jsonl_path, idx_path):
 
     Format: sequence of little-endian uint64 values
     ``[Offset_0, Offset_1, ..., Offset_N, File_Size]``
+
+    Written atomically (tmp + ``os.replace``) so concurrent writers can't
+    observe a half-written ``.idx``.
     """
     # Flush the write buffer every 8 MiB to limit memory usage on large files.
     flush_threshold = 8 * 1024 * 1024
-    with open(jsonl_path, 'rb') as f_in, open(idx_path, 'wb') as f_out:
+    tmp_path = f"{idx_path}.tmp.{os.getpid()}"
+    with open(jsonl_path, 'rb') as f_in, open(tmp_path, 'wb') as f_out:
         current_offset = 0
         write_buffer = bytearray()
         write_buffer.extend(struct.pack('<Q', current_offset))
@@ -414,6 +420,7 @@ def create_index(jsonl_path, idx_path):
                 write_buffer.clear()
         if write_buffer:
             f_out.write(write_buffer)
+    os.replace(tmp_path, idx_path)
 
 
 def create_tar_index(tar_path, idx_path):
@@ -422,6 +429,10 @@ def create_tar_index(tar_path, idx_path):
     Stores the byte offset of the first member of each sample (grouped by basename),
     followed by a sentinel equal to the tar file size.
     Format is identical to :func:`create_index`.
+
+    Written atomically: data is staged in a per-process temp file next to
+    ``idx_path`` and then ``os.replace()``-d into place, so concurrent writers
+    can't observe a half-written ``.idx``.
     """
     offsets = []
     prev_stem = None
@@ -433,9 +444,11 @@ def create_tar_index(tar_path, idx_path):
             if stem != prev_stem:
                 offsets.append(member.offset)
                 prev_stem = stem
-    with open(idx_path, 'wb') as f:
+    tmp_path = f"{idx_path}.tmp.{os.getpid()}"
+    with open(tmp_path, 'wb') as f:
         buf = bytearray()
         for off in offsets:
             buf.extend(struct.pack('<Q', off))
         buf.extend(struct.pack('<Q', os.path.getsize(tar_path)))
         f.write(buf)
+    os.replace(tmp_path, idx_path)
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index e506e8077324..db3eea4c80d9 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -37,7 +37,9 @@
 from lhotse.cut import Cut
 from lhotse.dataset.dataloading import resolve_seed
 from lhotse.lazy import (
+    GraphOriginDict,
     IteratorNode,
+    LazyIndexedManifestIterator,
     LazyIteratorChain,
     LazyJsonlIterator,
     attach_graph_origin,
@@ -50,6 +52,11 @@
 from nemo.utils import logging
 from nemo.utils.data_utils import is_datastore_path
 
+# NeMo tarred manifests support per-recording offsets via "-subN" audio_filepath
+# suffixes. We use this pattern in both indexed and streaming code paths to
+# recover the actual tar member name (offsets share a single member).
+_OFFSET_PATTERN = re.compile(r'^(?P<stem>.+)(?P<sub>-sub\d+)(?P<ext>\.\w+)?$')
+
 
 class LazyNeMoIterator(IteratorNode):
     """
@@ -143,7 +150,9 @@ def __init__(
                     "graph-token random access."
                 )
             seed = resolve_seed(shard_seed) if shard_seed not in (None, "trng", "randomized") else 0
-            indexed_sources = [_LazyIndexedJsonlDictNode(p) for p in paths]
+            indexed_sources = [
+                LazyIndexedManifestIterator(p, decode=GraphOriginDict) for p in paths
+            ]
             if len(indexed_sources) == 1:
                 self.source = indexed_sources[0]
             else:
@@ -300,66 +309,6 @@ def _create_recording(
             return Recording.from_file(audio_path)
 
 
-class _GraphOriginDict(dict):
-    """``dict`` subclass that can carry runtime attributes (e.g. ``_graph_origin``)."""
-
-    __slots__ = ("_graph_origin",)
-
-
-class _LazyIndexedJsonlDictNode(IteratorNode):
-    """
-    Internal helper: a graph-restorable indexed JSONL reader that yields raw dicts
-    (not Cuts). Built on top of :class:`lhotse.indexing.IndexedJsonlReader`.
-
-    Used as the source iterator for :class:`LazyNeMoIterator` (and other adapters)
-    when ``indexed=True``. Yielded items carry ``_graph_origin`` set to their
-    integer line index, which allows downstream nodes (e.g. ``LazyIteratorChain``,
-    ``LazyShuffler``) to compose graph tokens for exact restore.
-    """
-
-    is_checkpointable = True
-    is_indexed = True
-    has_constant_time_access = True
-
-    def __init__(self, path: str | Path) -> None:
-        from lhotse.indexing import IndexedJsonlReader
-
-        self.path = path
-        self._reader = IndexedJsonlReader(path)
-        self._position = 0
-        self._restored = False
-
-    def __getitem__(self, idx):
-        idx = int(normalize_graph_token(idx))
-        item = _GraphOriginDict(self._reader[idx])
-        return attach_graph_origin(item, idx)
-
-    def __len__(self) -> int:
-        return len(self._reader)
-
-    def __iter__(self):
-        start = self._position if self._restored else 0
-        self._restored = False
-        n = len(self._reader)
-        for i in range(start, n):
-            self._position = i + 1
-            item = _GraphOriginDict(self._reader[i])
-            attach_graph_origin(item, i)
-            yield item
-
-    def state_dict(self) -> dict:
-        return {"position": self._position}
-
-    def load_state_dict(self, sd: dict) -> None:
-        self._position = sd["position"]
-        self._restored = True
-
-
-# NeMo-tar indexed access is delegated to ``IndexedTarMemberReader`` from
-# ``indexed_adapters`` — the same canonical .idx format (uint64 LE offsets +
-# sentinel) used everywhere else in NeMo and lhotse for indexed access.
-
-
 class LazyNeMoTarredIterator(IteratorNode):
     r"""
     ``LazyNeMoTarredIterator`` reads a NeMo tarred JSON manifest and converts it on the fly to an ``Iterable[Cut]``.
@@ -566,7 +515,6 @@ def _init_indexed(self) -> None:
         self._total_len = cum
         self._position = 0
         self._restored = False
-        self._offset_pattern = re.compile(r'^(?P<stem>.+)(?P<sub>-sub\d+)(?P<ext>\.\w+)?$')
 
     def to_shards(self) -> List["LazyNeMoTarredIterator"]:
         """Convert this iterator to a list of separate iterators for each shard."""
@@ -741,6 +689,31 @@ def _resolve_global_idx(self, idx: int) -> tuple[int, int]:
         sid = self._sorted_shard_ids[shard_pos]
         return sid, idx - self._cum_lens[shard_pos]
 
+    def _audio_member_name_from_entry(self, entry: dict) -> str:
+        af = entry["audio_filepath"]
+        m = _OFFSET_PATTERN.match(af)
+        if m is None:
+            return af
+        return m.group("stem") + ifnone(m.group("ext"), "")
+
+    def _attach_supervision_and_metadata(
+        self, cut: Cut, data: dict, manifest_path: str, tar_path: str
+    ) -> Cut:
+        cut.supervisions.append(
+            SupervisionSegment(
+                id=cut.id,
+                recording_id=cut.recording_id,
+                start=0,
+                duration=cut.duration,
+                text=data.get(self.text_field),
+                language=data.get(self.lang_field),
+            )
+        )
+        cut.custom = _to_custom_attr_dict(data)
+        cut.manifest_origin = manifest_path
+        cut.tar_origin = tar_path
+        return cut
+
     def _build_indexed_cut(self, data: dict, audio_bytes: bytes, manifest_path: str, tar_path: str) -> Cut | None:
         """Decode a single (manifest_entry, audio_bytes) pair into a Cut, mirroring the streaming path."""
         if data.get("_skipme", False):
@@ -762,35 +735,14 @@ def _build_indexed_cut(self, data: dict, audio_bytes: bytes, manifest_path: str,
         cut = make_cut_with_subset_inmemory_recording(
             recording, offset=data.get("offset", 0.0), duration=data.get("duration")
         )
-        cut.supervisions.append(
-            SupervisionSegment(
-                id=cut.id,
-                recording_id=cut.recording_id,
-                start=0,
-                duration=cut.duration,
-                text=data.get(self.text_field),
-                language=data.get(self.lang_field),
-            )
-        )
-        cut.custom = _to_custom_attr_dict(data)
-        cut.manifest_origin = manifest_path
-        cut.tar_origin = tar_path
-        return cut
-
-    def _audio_member_name_from_entry(self, entry: dict) -> str:
-        af = entry["audio_filepath"]
-        m = self._offset_pattern.match(af)
-        if m is None:
-            return af
-        return m.group("stem") + ifnone(m.group("ext"), "")
+        return self._attach_supervision_and_metadata(cut, data, manifest_path, tar_path)
 
     def _build_indexed_url_cut(self, data: dict, manifest_path: str, tar_path: str) -> Cut | None:
         """
         AIS GetBatch counterpart of ``_build_indexed_cut``: produces a Cut backed
         by a URL/file AudioSource (no audio bytes loaded), so that
         ``AudioSamples(use_batch_loader=True)`` can fetch the entire minibatch in
-        a single AIS GetBatch request. Mirrors the streaming path in
-        ``_iter_batch_for_ais_get_batch``.
+        a single AIS GetBatch request. Mirrors ``_iter_batch_for_ais_get_batch``.
         """
         if data.get("_skipme", False):
             return None
@@ -802,9 +754,8 @@ def _build_indexed_url_cut(self, data: dict, manifest_path: str, tar_path: str)
             return None
         audio_filename = self._audio_member_name_from_entry(data)
         audio_url = f"{tar_path.rstrip('/')}/{audio_filename.lstrip('/')}"
-        # Mirror the streaming path's convention: use type="url" since open_best()
-        # transparently handles both local paths and remote URLs (ais://, http(s)://, ...).
-        # AudioSamples' GetBatch loader inspects the URL scheme to dispatch to AIS.
+        # ``open_best`` handles ais://, http(s)://, and local paths uniformly;
+        # the AIS GetBatch loader still keys off the URL scheme.
         source_type = "url" if "://" in tar_path else "file"
         offset = data.get("offset", 0.0)
         sampling_rate = data.get("sampling_rate", 16000)
@@ -819,41 +770,40 @@ def _build_indexed_url_cut(self, data: dict, manifest_path: str, tar_path: str)
         if offset > 0:
             cut = cut.truncate(offset=offset, duration=duration, preserve_id=True)
             cut.id = f"{cut.id}-{round(offset * 1e2):06d}-{round(duration * 1e2):06d}"
-        cut.supervisions.append(
-            SupervisionSegment(
-                id=cut.id,
-                recording_id=cut.recording_id,
-                start=0,
-                duration=cut.duration,
-                text=data.get(self.text_field),
-                language=data.get(self.lang_field),
-            )
-        )
-        cut.custom = _to_custom_attr_dict(data)
-        cut.manifest_origin = manifest_path
-        cut.tar_origin = tar_path
-        return cut
+        return self._attach_supervision_and_metadata(cut, data, manifest_path, tar_path)
 
-    def __getitem__(self, token):
-        if not self.indexed:
-            raise NotImplementedError(
-                "LazyNeMoTarredIterator only supports __getitem__ when constructed with indexed=True."
-            )
-        idx = int(normalize_graph_token(token))
+    def _decode_cut_at(self, idx: int) -> Cut | None:
+        """Build the Cut for a global index in indexed mode (AIS or local).
+
+        Returns ``None`` if the audio member is missing and
+        ``skip_missing_manifest_entries`` is set, or if the entry has
+        ``_skipme=True`` / undecodable audio.
+        """
         sid, local_idx = self._resolve_global_idx(idx)
         data = self._cuts_readers[sid][local_idx]
         manifest_path = self._cuts_readers[sid].path
         tar_path = self.shard_id_to_tar_path[sid]
         if self.use_ais_get_batch:
-            cut = self._build_indexed_url_cut(data, manifest_path, tar_path)
-        else:
-            member_name = self._audio_member_name_from_entry(data)
+            return self._build_indexed_url_cut(data, manifest_path, tar_path)
+        member_name = self._audio_member_name_from_entry(data)
+        try:
             audio_bytes = self._tar_readers[sid].get(member_name)
-            cut = self._build_indexed_cut(data, audio_bytes, manifest_path, tar_path)
+        except KeyError:
+            if self.skip_missing_manifest_entries:
+                return None
+            raise
+        return self._build_indexed_cut(data, audio_bytes, manifest_path, tar_path)
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError(
+                "LazyNeMoTarredIterator only supports __getitem__ when constructed with indexed=True."
+            )
+        idx = int(normalize_graph_token(token))
+        cut = self._decode_cut_at(idx)
         if cut is None:
             raise RuntimeError(
-                f"Cut at global index {idx} (shard {sid}, local {local_idx}) is not decodable; "
-                f"cannot satisfy random-access __getitem__."
+                f"Cut at global index {idx} is not decodable; cannot satisfy random-access __getitem__."
             )
         return attach_graph_origin(cut, idx)
 
@@ -877,24 +827,9 @@ def load_state_dict(self, sd: dict) -> None:
     def _iter_indexed(self) -> Generator[Cut, None, None]:
         start = self._position if self._restored else 0
         self._restored = False
-        n = self._total_len
-        for i in range(start, n):
+        for i in range(start, self._total_len):
             self._position = i + 1
-            sid, local_idx = self._resolve_global_idx(i)
-            data = self._cuts_readers[sid][local_idx]
-            manifest_path = self._cuts_readers[sid].path
-            tar_path = self.shard_id_to_tar_path[sid]
-            if self.use_ais_get_batch:
-                cut = self._build_indexed_url_cut(data, manifest_path, tar_path)
-            else:
-                member_name = self._audio_member_name_from_entry(data)
-                try:
-                    audio_bytes = self._tar_readers[sid].get(member_name)
-                except KeyError:
-                    if self.skip_missing_manifest_entries:
-                        continue
-                    raise
-                cut = self._build_indexed_cut(data, audio_bytes, manifest_path, tar_path)
+            cut = self._decode_cut_at(i)
             if cut is None:
                 continue
             attach_graph_origin(cut, i)
@@ -917,17 +852,15 @@ def __iter__(self) -> Generator[Cut, None, None]:
         # Propagate the random seed
         extra_fields = [ExtraField.from_dict({"seed": seed, **field_cfg}) for field_cfg in self.extra_fields or ()]
 
-        # Handle NeMo tarred manifests with offsets.
-        # They have multiple JSONL entries where audio paths end with '-sub1', '-sub2', etc. for each offset.
-        offset_pattern = re.compile(r'^(?P<stem>.+)(?P<sub>-sub\d+)(?P<ext>\.\w+)?$')
-
+        # NeMo tarred manifests can have multiple JSONL entries pointing at the
+        # same audio member with -subN audio_filepath suffixes (per-offset cuts).
         for sid in shard_ids:
             manifest_path = self.paths[sid] if len(self.paths) > 1 else self.paths[0]
 
             def basename(d: dict) -> str:
                 return (
                     m.group("stem") + ifnone(m.group("ext"), "")
-                    if (m := offset_pattern.match(k := d["audio_filepath"])) is not None
+                    if (m := _OFFSET_PATTERN.match(k := d["audio_filepath"])) is not None
                     else k
                 )
 

From ad6861a02fe075dc27d1bced2353efd8b0cfd048 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Wed, 6 May 2026 16:06:42 -0700
Subject: [PATCH 05/30] Documentation update to reflect indexed/checkpointable
 things + general gap coverage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 docs/source/asr/datasets.rst         |   6 +
 docs/source/audio/datasets.rst       |   6 +
 docs/source/dataloaders.rst          | 797 ++++++++++++++++++++++++++-
 docs/source/speechlm2/datasets.rst   |  64 +++
 scripts/dataloading/build_indexes.py | 377 +++++++++++++
 5 files changed, 1229 insertions(+), 21 deletions(-)
 create mode 100644 scripts/dataloading/build_indexes.py

diff --git a/docs/source/asr/datasets.rst b/docs/source/asr/datasets.rst
index 09ff87ea180c..620194c53727 100644
--- a/docs/source/asr/datasets.rst
+++ b/docs/source/asr/datasets.rst
@@ -3,6 +3,12 @@ Datasets
 
 NeMo ASR models expect data as a set of audio files plus a manifest file describing each utterance.
 
+.. seealso::
+
+   For Lhotse-based dataloading (the recommended path for new ASR
+   recipes — dynamic bucketing, multi-source mixing, indexed/resumable
+   dataloading), see :doc:`/dataloaders`.
+
 .. _section-with-manifest-format-explanation:
 
 Manifest Format
diff --git a/docs/source/audio/datasets.rst b/docs/source/audio/datasets.rst
index 4c023961a29e..781b0a9e99d8 100644
--- a/docs/source/audio/datasets.rst
+++ b/docs/source/audio/datasets.rst
@@ -3,6 +3,12 @@ Datasets
 
 The `audio` collection expect the training, validation and tests datasets in either NeMo format or Lhotse format.
 
+.. seealso::
+
+   For the Lhotse dataloader's full surface — supported ``input_cfg``
+   types, bucketing, indexed manifests + resumable dataloading, and the
+   ``LhotseDataLoadingConfig`` field reference — see :doc:`/dataloaders`.
+
 NeMo Format
 -----------
 
diff --git a/docs/source/dataloaders.rst b/docs/source/dataloaders.rst
index 8a7ed848b8a8..7ef1ffbc761d 100644
--- a/docs/source/dataloaders.rst
+++ b/docs/source/dataloaders.rst
@@ -24,26 +24,6 @@ NeMo supports using `Lhotse`_, a speech data handling library, as a dataloading
     constant in time (i.e., stationary); in fact, each mini-batch will have roughly the same ratio of data coming from each source.
     Since the multiplexing is done dynamically, it is very easy to tune the sampling weights.
 
-Lhotse dataloading supports the following types of inputs:
-
-* NeMo manifests
-    Regular NeMo JSON manifests.
-* NeMo tarred data
-    Tarred NeMo JSON manifests + audio tar files; we also support combination of multiple NeMo
-    tarred data sources (e.g., multiple buckets of NeMo data or multiple datasets) via dynamic multiplexing.
-
-    We support using a subset of Tarred NeMo JSON manifests along with audio tar files without disrupting the alignment between the tarred files and their corresponding manifests.
-    This feature is essential because large datasets often consist of numerous tar files and multiple versions of Tarred NeMo JSON manifest subsets, which may contain only a portion of the audio files due to filtering for various reasons.
-    To skip specific entries in the manifests without repeatedly copying and retarring audio files, the entries must include a ``_skipme`` key. This key should be set to ``True``, ``1``, or a reason for skipping (e.g., ``low character-rate``).
-
-* Lhotse CutSet manifests
-    Regular Lhotse CutSet manifests (typically gzipped JSONL).
-    See `Lhotse Cuts documentation`_ to learn more about Lhotse data formats.
-* Lhotse Shar data
-    Lhotse Shar is a data format that also uses tar files for sequential data loading,
-    but is designed to be modular (i.e., easily extensible with new data sources and with new feature fields).
-    More details can be found here: |tutorial_shar|
-
 .. caution:: As of now, Lhotse is mainly supported in most ASR model configurations. We aim to gradually extend this support to other speech tasks.
 
 .. _Lhotse: https://github.com/lhotse-speech/lhotse
@@ -51,6 +31,269 @@ Lhotse dataloading supports the following types of inputs:
 .. |tutorial_shar| image:: https://colab.research.google.com/assets/colab-badge.svg
     :target: https://colab.research.google.com/github/lhotse-speech/lhotse/blob/master/examples/04-lhotse-shar.ipynb
 
+Architecture overview
+---------------------
+
+The Lhotse dataloader is a pipeline of small components. Each YAML option you
+set lands in exactly one of them, so it pays to know which is which::
+
+    input_cfg entry  ──►  parser_fn  ──►  Adapter (IteratorNode)
+                          (registered                 │
+                           via @data_type_parser)     ▼
+                                            CutSet (lazy iterator graph)
+                                                      │
+                              SamplingConstraint  ──► CutSampler
+                                                      │
+                                                      ▼
+                                          IterableDatasetWrapper
+                                                      │
+                                                      ▼
+                                            user-defined Dataset
+                                                      │
+                                                      ▼
+                                                 DataLoader
+                                                 (or StatefulDataLoader)
+
+Components, top to bottom:
+
+* **input_cfg entry** — one YAML dict identified by ``type:`` (e.g.
+  ``type: nemo_tarred``). Listed below in :ref:`lhotse-format-reference`.
+* **parser_fn** — registered with the ``@data_type_parser`` decorator in
+  ``nemo/collections/common/data/lhotse/cutset.py``. Reads the entry and
+  returns ``(CutSet, is_tarred)``. Users can add their own (see
+  :ref:`lhotse-extension-hooks`).
+* **Adapter** — a class that knows how to iterate one specific on-disk
+  format (e.g. ``LazyNeMoTarredIterator``, ``LazyParquetIterator``,
+  ``NeMoMultimodalConversationJsonlAdapter``). All recent adapters are
+  Lhotse :class:`~lhotse.lazy.IteratorNode` subclasses and support
+  ``indexed=True`` for O(1) random access — see
+  :ref:`indexed-resumable-dataloading`.
+* **CutSet** — Lhotse's lazy manifest wrapper. Composing multiple sources
+  produces a graph of iterator nodes (mux, mix, map, filter, …) underneath.
+* **SamplingConstraint** — defines what "length" means for batch packing:
+  :class:`~lhotse.dataset.sampling.base.TimeConstraint` (audio duration,
+  default), :class:`~lhotse.dataset.sampling.base.TokenConstraint` (token
+  count, multimodal), ``MultimodalSamplingConstraint`` /
+  ``FixedBucketBatchSizeConstraint2D`` (NeMo extensions; see
+  :ref:`lhotse-sampling-constraints`).
+* **CutSampler** — :class:`~lhotse.dataset.sampling.DynamicCutSampler` or
+  :class:`~lhotse.dataset.sampling.DynamicBucketingSampler`, picked
+  automatically based on ``use_bucketing``.
+* **IterableDatasetWrapper** — Lhotse helper that turns the sampler-produced
+  ``CutSet`` mini-batches into a stream the PyTorch ``DataLoader`` can
+  consume.
+* **Dataset class** — supplied by the model code; converts a ``CutSet``
+  mini-batch into a ``dict[str, Tensor]``. The same dataset class can serve
+  multiple model architectures because all batching is upstream.
+
+.. _lhotse-format-reference:
+
+Supported input formats
+-----------------------
+
+Every entry in ``input_cfg`` is identified by ``type:``. The table below is
+the canonical list of every type the dataloader understands today, what it
+returns, and the on-disk shape it expects.
+
+.. list-table::
+   :header-rows: 1
+   :widths: 18 32 14 8 8 10 10
+
+   * - ``type:``
+     - Purpose
+     - Yields
+     - Audio
+     - Tarred
+     - Indexable
+     - Adapter / parser
+   * - ``nemo``
+     - NeMo non-tarred JSON manifest (per-file audio)
+     - ``Cut``
+     - yes
+     - no
+     - yes
+     - ``LazyNeMoIterator``
+   * - ``nemo_tarred``
+     - NeMo tarred manifest + audio tar shards
+     - ``Cut``
+     - yes
+     - yes
+     - yes
+     - ``LazyNeMoTarredIterator``
+   * - ``lhotse``
+     - Plain Lhotse cuts JSONL
+     - ``Cut``
+     - yes
+     - no
+     - yes
+     - lhotse ``LazyJsonlIterator`` / ``LazyIndexedManifestIterator``
+   * - ``lhotse_shar``
+     - Lhotse Shar (sharded archive directory)
+     - ``Cut``
+     - yes
+     - yes
+     - yes
+     - lhotse ``LazySharIterator``
+   * - ``parquet``
+     - Parquet file with audio bytes column
+     - ``Cut``
+     - yes
+     - no
+     - yes (row groups)
+     - ``LazyParquetIterator``
+   * - ``txt``
+     - One example per line, raw text
+     - ``TextExample``
+     - no
+     - n/a
+     - no
+     - ``LhotseTextAdapter``
+   * - ``txt_jsonl``
+     - One JSON object per line; configurable text field
+     - ``TextExample``
+     - no
+     - n/a
+     - yes
+     - ``LhotseTextJsonlAdapter``
+   * - ``txt_pair``
+     - Source + target text files for translation
+     - ``SourceTargetTextExample``
+     - no
+     - n/a
+     - no
+     - ``LhotseTextPairAdapter``
+   * - ``multimodal_conversation``
+     - Multi-turn chat with mixed text/audio turns (JSONL)
+     - ``NeMoMultimodalConversation``
+     - optional
+     - optional
+     - yes
+     - ``NeMoMultimodalConversationJsonlAdapter``
+   * - ``share_gpt``
+     - ShareGPT-format JSONL → conversation
+     - ``NeMoMultimodalConversation``
+     - optional
+     - optional
+     - yes
+     - ``NeMoMultimodalConversationShareGPTJsonlAdapter``
+   * - ``share_gpt_webdataset``
+     - ShareGPT in WebDataset tar shards
+     - ``NeMoMultimodalConversation``
+     - optional
+     - yes
+     - yes
+     - ``NeMoMultimodalConversationShareGPTWebdatasetAdapter``
+   * - ``lhotse_as_conversation``
+     - Read ASR data and emit it as ASR conversation
+     - ``NeMoMultimodalConversation``
+     - yes
+     - inherits
+     - inherits
+     - transform on ``read_cutset_from_config``
+   * - ``sqa_as_conversation``
+     - Spoken-QA → 3-turn conversation (question / audio / answer)
+     - ``NeMoMultimodalConversation``
+     - yes
+     - inherits
+     - inherits
+     - transform
+   * - ``s2s_as_conversation``
+     - Duplex S2S → conversation
+     - ``NeMoMultimodalConversation``
+     - yes
+     - inherits
+     - inherits
+     - transform
+   * - ``s2s_duplex_overlap_as_s2s_duplex``
+     - Overlapping agent/user segments → unified S2S timeline
+     - ``Cut``
+     - yes
+     - inherits
+     - inherits
+     - transform
+   * - ``s2s_duplex_reverse_role``
+     - Swap user and agent in a duplex cut
+     - ``Cut``
+     - yes
+     - inherits
+     - inherits
+     - transform
+   * - ``lhotse_magpietts_data_as_continuation``
+     - MagpieTTS dataset → S2S duplex continuation
+     - ``Cut``
+     - yes
+     - inherits
+     - inherits
+     - transform
+   * - ``nemo_tarred_to_duplex``
+     - Single-supervision NeMo → duplex (user speech + agent silence)
+     - ``Cut``
+     - yes
+     - yes
+     - inherits
+     - transform
+   * - ``multi_speaker_simulator``
+     - Synthetic multi-speaker mixtures from a manifest
+     - ``Cut``
+     - yes
+     - n/a
+     - no
+     - ``MultiSpeakerMixtureGenerator``
+   * - ``group``
+     - Wrap a list of entries with a shared ``weight`` and ``tags``
+     - (nested)
+     - n/a
+     - n/a
+     - n/a
+     - n/a
+
+Notes:
+
+* "inherits" means the type is a transform that wraps an underlying
+  source via ``read_cutset_from_config(config)``. Such entries accept the
+  underlying source's keys (e.g. ``cuts_path`` or ``manifest_filepath``)
+  *in addition to* their own.
+* Tarred NeMo manifests support a ``_skipme`` key to omit specific manifest
+  rows without repacking tars (set to ``True``, ``1``, or a reason string).
+* Lhotse Shar is documented in the upstream tutorial: |tutorial_shar|.
+
+Conversation / multimodal types — when to use which
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Six types yield ``NeMoMultimodalConversation`` from very different sources.
+Pick by the shape of your input data:
+
+.. list-table::
+   :header-rows: 1
+   :widths: 35 25 40
+
+   * - Your data
+     - ``type:``
+     - Notes
+   * - JSONL of multi-turn chats with mixed text/audio turns
+     - ``multimodal_conversation``
+     - Native chat schema; audio turns reference paths or tar members
+   * - JSONL in ShareGPT chat schema
+     - ``share_gpt``
+     - Adds ShareGPT-specific role/value parsing
+   * - ShareGPT data packed in WebDataset tar shards
+     - ``share_gpt_webdataset``
+     - Same parsing as ``share_gpt``, reads tarred shards
+   * - ASR data in NeMo or Lhotse format
+     - ``lhotse_as_conversation``
+     - Builds a 2-turn (instruction+audio / transcript) conversation per cut
+   * - Spoken-QA data with ``question`` / ``answer`` fields
+     - ``sqa_as_conversation``
+     - Builds a 3-turn (question / audio / answer) conversation per cut
+   * - Duplex S2S data with user/agent supervisions
+     - ``s2s_as_conversation``
+     - Maps duplex roles onto chat turns
+
+The last three (``*_as_conversation``) are *transforms*: they delegate to
+``read_cutset_from_config(config)`` for the underlying audio source, so that
+source's keys (``manifest_filepath``, ``cuts_path``, or ``shar_path``) are set
+directly on the same entry.
+
 Enabling Lhotse via configuration
 ----------------------------------
 
@@ -128,6 +371,16 @@ Some other Lhotse related arguments we support:
     When ``batch_duration`` is not set, it acts as a static batch size.
 * ``seed`` sets a random seed for the shuffle buffer.
 
+* ``indexed`` (default ``False``) opts the dataloader into Lhotse's indexed-manifest
+  path, giving every adapter O(1) random access and graph-token-based exact restore.
+  Requires ``.idx`` sidecars next to every JSONL/tar file. See
+  :ref:`indexed-resumable-dataloading` below.
+
+* ``use_stateful_dataloader`` (default ``False``) swaps PyTorch's
+  ``DataLoader`` for ``torchdata.stateful_dataloader.StatefulDataLoader`` so
+  that per-worker iterator state is captured in checkpoints and restored
+  exactly on resume. Pair with ``indexed: true`` for full O(1) restore.
+
 The full and always up-to-date list of supported options can be found in ``LhotseDataLoadingConfig`` class.
 
 .. _asr-dataset-config-format:
@@ -147,6 +400,29 @@ The dataset class which converts these examples to tensors can partition the min
 different processing to each group.
 For example, you may want to construct different prompts for the model using metadata in ``tags``.
 
+How ``tags`` is applied
+^^^^^^^^^^^^^^^^^^^^^^^
+
+Every key/value pair in ``tags`` becomes an attribute on every cut produced
+by that entry. The dataloader walks the cuts via ``cuts.map(...)`` and runs::
+
+    for key, val in tags.items():
+        setattr(cut, key, val)
+
+So in your dataset class you read them back as ordinary attributes::
+
+    def __getitem__(self, cuts):
+        for cut in cuts:
+            lang   = cut.lang
+            task   = cut.task
+            ctx    = cut.context
+            ...
+
+Tags set on a ``group`` apply to every nested entry; tags set on an inner
+entry override the outer ones for that source. Conflicts with built-in cut
+fields (``id``, ``duration``, ``supervisions``, …) silently overwrite the
+built-in — pick tag names that don't collide.
+
 .. note:: When fine-tuning a model that was trained with ``input_cfg`` option, typically you'd only need
     to override the following options: ``input_cfg=null`` and ``manifest_filepath=path/to/manifest.json``.
 
@@ -384,6 +660,12 @@ Python dataloader instantiation example::
         tokenizer=my_tokenizer,
     )
 
+**Indexed mode for text/multimodal sources.** All of the parsers above
+(``txt_jsonl``, ``nemo_sft_jsonl``, ``multimodal_conversation``, ``share_gpt``,
+``share_gpt_webdataset``) accept ``indexed: true`` and integrate with
+``StatefulDataLoader``-based exact resume. ``txt`` and ``txt_pair`` are
+intentionally streaming-only. See :ref:`indexed-resumable-dataloading`.
+
 **Dataloading and bucketing of text and multimodal data.** When dataloading text or multimodal data, pay attention to the following config options (we provide example values for convenience):
 
 * ``use_multimodal_sampling: true`` tells Lhotse to switch from measuring audio duration to measuring token counts; required for text.
@@ -419,6 +701,25 @@ To enable bucketing, set ``batch_size: null`` and use the following options:
 **Joint dataloading of text/audio/multimodal data.** The key strength of this approach is that we can easily combine audio datasets and text datasets,
 and benefit from every other technique we described in this doc, such as: dynamic data mixing, data weighting, dynamic bucketing, and so on.
 
+Single-config vs. ``multi_config: true``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default the dataloader builds **one** ``CutSet`` and **one** sampler from
+the top-level config. Setting ``multi_config: true`` switches to a
+**multi-modality** layout where each named sub-block (typically ``audio:``
+and ``text:``) is parsed as its own dataloader config, with its own
+sampling/bucketing options, and the per-modality samplers are fused at the
+batch level.
+
+When ``multi_config: true`` is set:
+
+* Top-level keys (``num_workers``, ``shuffle``, ``seed``, ``sample_rate``,
+  …) apply globally and are inherited by every sub-block.
+* Per-modality overrides — including the ``input_cfg`` itself — go inside
+  the named sub-block (``audio: ...`` / ``text: ...``).
+* The per-modality samplers are combined into one stream by
+  ``sampler_fusion``.
+
 This approach is described in the `EMMeTT`_ paper. There's also a notebook tutorial called Multimodal Lhotse Dataloading. We construct a separate sampler (with its own batching settings) for each modality,
 and specify how the samplers should be fused together via the option ``sampler_fusion``:
 
@@ -481,6 +782,162 @@ Example. Combine an ASR (audio-text) dataset with an MT (text-only) dataset so t
 
 .. caution:: We strongly recommend to use multiple shards for text files as well so that different nodes and dataloading workers are able to randomize the order of text iteration. Otherwise, multi-GPU training has a high risk of duplication of text examples.
 
+.. _lhotse-sampling-constraints:
+
+Sampling constraints
+--------------------
+
+A :class:`~lhotse.dataset.sampling.base.SamplingConstraint` decides what
+"length" means when the sampler packs a mini-batch. NeMo uses four:
+
+* :class:`~lhotse.dataset.sampling.base.TimeConstraint` — default.
+  Length = audio duration in seconds. Enforces ``max_duration`` /
+  ``batch_duration`` / ``quadratic_duration``.
+* :class:`~lhotse.dataset.sampling.base.TokenConstraint` — activated by
+  ``use_multimodal_sampling: true`` for text-only flows. Length = token
+  count after applying the tokenizer (and optionally the prompt format).
+  Enforces ``max_tokens`` / ``batch_tokens`` / ``quadratic_factor``.
+* ``MultimodalSamplingConstraint`` — Lhotse-style mixed-modality
+  packing. Activated by setting both ``use_multimodal_sampling: true``
+  and a ``token_equivalent_duration`` so audio cuts are measured in
+  equivalent-token units alongside text. Enforces all of the above plus
+  ``min_tpt``/``max_tpt`` (token-per-token ratio filtering).
+* ``FixedBucketBatchSizeConstraint2D`` — activated automatically when
+  ``bucket_duration_bins`` is given as a list of ``[duration, tokens]``
+  pairs **and** ``bucket_batch_size`` is set. Each bucket gets its own
+  fixed batch size; this is the layout produced by
+  ``estimate_duration_bins_2d.py`` and the OOMptimizer.
+
+You usually don't pick a constraint by name — it's inferred from the
+combination of YAML options. The names matter when you read NeMo's source,
+extend the system with a custom constraint, or interpret error messages.
+
+.. _indexed-resumable-dataloading:
+
+Resumable / indexed dataloading
+-------------------------------
+
+Setting ``indexed: true`` (per-source or top-level) plus
+``use_stateful_dataloader: true`` (top-level) opts NeMo's Lhotse dataloader
+into Lhotse's indexed iterator graph and torchdata's
+``StatefulDataLoader``. The combination gives you:
+
+* O(1) checkpoint/restore of the *whole* dataloading pipeline — sampler RNG,
+  bucketer state, multiplexer choice RNG, per-source iterator cursors, and
+  per-worker prefetch queues — without any replay from the start of the epoch.
+* Random access (``__getitem__``) over every supported adapter.
+
+When set at the top level, ``indexed: true`` is propagated by
+``read_dataset_config`` through the ``propagate_attrs`` cascade, so a single
+top-level flag covers every nested ``input_cfg`` group. You can still override
+it per-source if needed.
+
+Per-adapter support
+^^^^^^^^^^^^^^^^^^^
+
+The following ``input_cfg`` types accept ``indexed: true`` today. Unless noted
+otherwise, each requires an ``.idx`` sidecar next to every data file:
+
+* ``nemo`` / ``nemo_tarred`` — JSONL manifest gets ``manifest.json.idx``;
+  every audio tar in ``tarred_audio_filepaths`` gets ``shard.tar.idx``.
+* ``lhotse`` (plain) — ``cuts.jsonl`` gets ``cuts.jsonl.idx``.
+* ``lhotse_shar`` — every uncompressed ``cuts.<NNNNNN>.jsonl`` and field tar
+  inside the Shar dir.
+* ``parquet`` — no sidecar required, but the file must expose row-group
+  statistics (the default for files written by pyarrow / pandas).
+* ``txt_jsonl`` — every file in ``paths``.
+* ``multimodal_conversation`` and ``share_gpt`` — JSONL manifest plus optional
+  audio tars in ``tarred_audio_filepaths``.
+* ``share_gpt_webdataset`` — every ``shard-*.tar`` inside ``data_dir``.
+
+``txt`` and ``txt_pair`` remain streaming-only (no random-access support).
+
+Two caveats to be aware of:
+
+* ``indexed: true`` is incompatible with ``extra_fields`` and ``slice_length``
+  on ``nemo``/``nemo_tarred``: those features mutate or expand cuts in a way
+  that has no stable index. Pre-process the manifest offline if you need them
+  in an indexed pipeline.
+* Only **uncompressed** files can be indexed (no ``.jsonl.gz``,
+  ``.tar.gz``, etc.) and only files on a backend that supports indexed reads
+  (local FS, S3-compatible object stores, AIStore).
+
+Building ``.idx`` sidecars
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Two equivalent ways:
+
+1. Lhotse's CLI per file::
+
+       lhotse index jsonl path/to/cuts.jsonl
+       lhotse index tar  path/to/shard.tar
+       lhotse index shar path/to/shar_dir/
+
+2. NeMo's batch helper that takes a config and indexes everything it
+   references in one shot::
+
+       python scripts/dataloading/build_indexes.py path/to/input_cfg.yaml
+
+   The script walks ``input_cfg`` (including nested ``group`` entries and
+   per-entry YAML references), dispatches the right tar layout for each
+   adapter (NeMo one-member-per-sample vs. WebDataset/Shar pair format), and
+   skips files that already have an up-to-date ``.idx``. Use ``--force`` to
+   rebuild, ``--workers N`` for parallelism, ``--dry-run`` to preview.
+
+End-to-end YAML example
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: yaml
+
+    model:
+      train_ds:
+        # Top-level switches enable indexed restore for every source below.
+        indexed: true
+        use_stateful_dataloader: true
+        force_finite: true
+        force_map_dataset: true
+
+        sample_rate: 16000
+        num_workers: 4
+        seed: 42
+        shard_seed: randomized
+
+        # Bucketing and the rest of the dataloader knobs work exactly as before.
+        use_bucketing: true
+        num_buckets: 30
+        batch_duration: 1100
+        quadratic_duration: 30
+
+        input_cfg:
+          - type: nemo_tarred
+            manifest_filepath: /data/asr/manifest__OP_0..127_CL_.jsonl
+            tarred_audio_filepaths: /data/asr/audio__OP_0..127_CL_.tar
+            weight: 0.7
+          - type: lhotse
+            cuts_path: /data/extra/cuts.jsonl
+            weight: 0.3
+
+Resume contract
+^^^^^^^^^^^^^^^
+
+When ``use_stateful_dataloader: true`` is set, Lightning's checkpoint will
+contain the full Lhotse iterator graph state under the dataloader key. On
+resume:
+
+* iterator positions advance to where they were at save time (no replay from
+  position 0);
+* ``set_epoch`` is a no-op while restored state is pending, so the resumed run
+  continues the same epoch instead of starting a new one;
+* ``num_workers`` and ``world_size`` must match between save and restore (a
+  hard requirement of ``StatefulDataLoader``).
+
+Non-indexed pipelines fall back to Lhotse's ``_fast_forward()`` replay (O(N)
+in batches consumed before the checkpoint). That restore is replay-based, not
+exact, and only requires ``num_workers`` to stay consistent across the resume.
+
+For the iterator graph contract itself, see Lhotse's
+`indexed manifests guide <https://lhotse.readthedocs.io/en/latest/indexed-manifests.html>`_.
+
 Pre-computing bucket duration bins
 ------------------------------------
 
@@ -594,7 +1051,7 @@ For Canary-1B, we'll also provide the special tokens tokenizer. Example:
         input_cfg.yaml
 
 Pushing GPU utilization to the limits with bucketing and OOMptimizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The default approach of specifying a ``batch_duration``, ``bucket_duration_bins`` and ``quadratic_duration``
 is quite flexible, but is not maximally efficient. We observed that in practice it often leads to under-utilization
@@ -685,3 +1142,301 @@ Other, more exotic configurations:
 * With ``seed="trng"``, the base random seed itself will be drawn using a TRNG. It will be different on each GPU training process. This setting is not recommended.
 
 * With ``seed="randomized"``, the base random seed is set to Python's global RNG seed. It might be different on each GPU training process. This setting is not recommended.
+
+Train vs. validation / test configs
+-----------------------------------
+
+The training and validation/test sections of a NeMo recipe use the same
+underlying dataloader builder but have a different shape and a different
+default behavior.
+
+**Training** (``train_ds``). A single config that produces one infinite
+``CutSet``. The dataloader is wrapped to never run out of data, so
+``trainer.max_steps`` (and ``limit_train_batches`` for tarred sources)
+controls the run length:
+
+.. code-block:: yaml
+
+    model:
+      train_ds:
+        sample_rate: 16000
+        num_workers: 4
+        shuffle: true
+        use_bucketing: true
+        num_buckets: 30
+        batch_duration: 1100
+        input_cfg:
+          - type: nemo_tarred
+            manifest_filepath: /data/asr/manifest__OP_0..127_CL_.json
+            tarred_audio_filepaths: /data/asr/audio__OP_0..127_CL_.tar
+
+**Validation / test** (``validation_ds`` / ``test_ds``). A *named* dict of
+configs — one per evaluation set — that produces finite iteration:
+
+.. code-block:: yaml
+
+    model:
+      validation_ds:
+        sample_rate: 16000
+        batch_size: 16
+        # Per-set entries; keys become the metric prefixes in logging.
+        datasets:
+          dev_clean:
+            cuts_path: /data/dev-clean/cuts.jsonl
+          dev_other:
+            cuts_path: /data/dev-other/cuts.jsonl
+
+The most common eval-side overrides:
+
+* ``shuffle: false`` — deterministic order.
+* ``force_finite: true`` — break out of the infinite-mux that's safe for
+  training but would loop forever in eval.
+* ``use_bucketing: false`` — bucketing cuts padding at the cost of batch
+  randomness; on a small eval set the savings are negligible and a fixed
+  batch size makes results easier to interpret.
+* ``num_workers: 0`` (or a small number) — eval is short, the worker
+  startup cost matters more.
+
+When the model code expects a single eval set, use the plain ``cuts_path`` /
+``manifest_filepath`` form at the same level as ``train_ds`` instead of the
+``datasets:`` dict.
+
+Preparing your data
+-------------------
+
+Three minimal recipes covering the main on-disk formats.
+
+**NeMo manifest** — one JSON object per line, fields read by ``LazyNeMoIterator``::
+
+    {"audio_filepath": "/data/utt_0001.wav", "duration": 3.42, "text": "hello world", "lang": "en"}
+    {"audio_filepath": "/data/utt_0002.wav", "duration": 5.10, "text": "another example", "lang": "en"}
+
+For tarred NeMo manifests, see
+``scripts/speech_recognition/convert_to_tarred_audio_dataset.py`` in the NeMo
+repo.
+
+**Lhotse cuts JSONL** — build a ``CutSet`` from raw recordings + supervisions:
+
+.. code-block:: python
+
+    from lhotse import CutSet, Recording, SupervisionSegment
+
+    cuts = []
+    for path, transcript in pairs:
+        rec = Recording.from_file(path)
+        sup = SupervisionSegment(
+            id=rec.id, recording_id=rec.id,
+            start=0.0, duration=rec.duration,
+            text=transcript, language="en",
+        )
+        cut = rec.to_cut()
+        cut.supervisions = [sup]
+        cuts.append(cut)
+
+    CutSet.from_cuts(cuts).to_file("cuts.jsonl")  # uncompressed!
+
+For Lhotse Shar (sharded archive), see the upstream tutorial: |tutorial_shar|.
+
+**Parquet** — write a ``pyarrow`` table with the column names the
+``LazyParquetIterator`` reads (``audio``, ``text``, ``duration``,
+optional ``lang``):
+
+.. code-block:: python
+
+    import pyarrow as pa, pyarrow.parquet as pq
+
+    table = pa.table({
+        "audio":    [open(p, "rb").read() for p in paths],
+        "text":     transcripts,
+        "duration": durations,
+        "lang":     ["en"] * len(paths),
+    })
+    pq.write_table(table, "shard_000.parquet")  # row-group stats kept by default
+
+Once your manifests are written, build the indexed sidecars in one shot::
+
+    python scripts/dataloading/build_indexes.py path/to/input_cfg.yaml
+
+See :ref:`indexed-resumable-dataloading` for the resumable side.
+
+.. _lhotse-storage-backends:
+
+Storage backends: local, object store, AIStore
+----------------------------------------------
+
+Every input path the dataloader reads goes through Lhotse's ``open_best``,
+which routes file paths and URIs to the right backend automatically:
+
+* **Local files** — paths like ``/data/...`` work out of the box, no
+  configuration needed.
+* **Generic object stores** via ``smart_open`` — ``s3://``, ``gs://``,
+  ``http://``, ``https://`` URIs work after ``pip install smart_open``.
+  Authentication uses the underlying SDK's defaults (e.g. AWS env vars).
+* **AIStore** — ``ais://bucket/key`` URIs work after ``pip install aistore``
+  and ``export AIS_ENDPOINT=http://...``. Optional tuning env vars
+  ``AIS_CONNECT_TIMEOUT`` and ``AIS_READ_TIMEOUT`` are honored by the SDK.
+
+The same routing applies to ``.idx`` sidecars: they are read and written
+next to the data file, so the backend must accept writes at that location
+or the indexes need to be pre-built locally and uploaded.
+
+AIStore GetBatch (separate optimization)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For tarred multimodal-conversation manifests, NeMo also supports AIStore's
+batched object-fetch API (``GetBatch``) via ``USE_AIS_GET_BATCH=true``,
+which issues one batched fetch per minibatch instead of per-cut tar reads.
+This is independent of using AIStore as a generic backend — see
+:doc:`speechlm2/datasets` for the speech-LM-specific details, including
+how it composes with ``indexed: true``.
+
+.. _lhotse-extension-hooks:
+
+Registering a custom format
+---------------------------
+
+Adding a new ``type:`` to the ``input_cfg`` registry is one decorator and
+one function:
+
+.. code-block:: python
+
+    from nemo.collections.common.data.lhotse.cutset import data_type_parser
+    from lhotse import CutSet
+
+    @data_type_parser("my_format")
+    def read_my_format(config) -> tuple[CutSet, bool]:
+        cuts = CutSet(MyAdapter(path=config.path, ...))
+        is_tarred = True  # True ⇒ IterableDataset path; False ⇒ map-style
+        return cuts, is_tarred
+
+The parser must accept arbitrary keys: ``read_dataset_config`` cascades
+options like ``indexed``, ``shard_seed``, ``metadata_only``,
+``force_finite``, ``audio_locator_tag`` from the top of the YAML down into
+every entry via ``propagate_attrs``. Missing keys should fall back to
+sensible defaults via ``config.get(...)``.
+
+To make ``MyAdapter`` participate in the indexed/resumable path
+(:ref:`indexed-resumable-dataloading`), implement Lhotse's
+:class:`~lhotse.lazy.IteratorNode` contract — see
+`indexed manifests guide <https://lhotse.readthedocs.io/en/latest/indexed-manifests.html>`_
+for the requirements.
+
+Common pitfalls
+---------------
+
+The most common foot-guns when standing up a NeMo Lhotse recipe:
+
+1. **Forgetting** ``trainer.use_distributed_sampler=false``. NeMo's Lhotse
+   integration handles distributed sampling itself; leaving Lightning's
+   default on causes silent batch duplication across DP ranks.
+
+2. **No** ``max_steps`` **with tarred / Shar data.** Tarred sources are
+   infinite by design, so without ``trainer.max_steps`` (and
+   ``limit_train_batches`` for the periodic validation cadence) training
+   never completes the first "epoch". Always set both.
+
+3. **Compressed inputs cannot be indexed.** ``.jsonl.gz`` and ``.tar.gz``
+   work for streaming, but ``indexed: true`` requires uncompressed,
+   seekable files. Re-extract or re-write before building ``.idx``.
+
+4. **Mismatched** ``num_workers`` / ``world_size`` **on resume.** Exact
+   per-worker resume with ``StatefulDataLoader`` requires both to match
+   between save and restore. Replay-based restore with the regular
+   ``DataLoader`` is more lenient.
+
+5. ``indexed: true`` **is incompatible with** ``extra_fields`` **and**
+   ``slice_length`` on ``nemo`` / ``nemo_tarred``. Both expand or rewrite
+   cuts in a way that has no stable index. Pre-process the manifest
+   offline if you need them in an indexed pipeline.
+
+6. ``shard_seed: "trng"`` **deadlocks under TP/PP.** Tensor- and pipeline-
+   parallel ranks must see the same shard order, but ``"trng"`` draws an
+   independent seed per worker. Use ``shard_seed: "randomized"`` whenever
+   you have model parallelism on top of DDP.
+
+7. **Missing** ``force_finite: true`` **on validation.** Validation configs
+   that reuse training infrastructure inherit the infinite-mux behavior;
+   without ``force_finite: true`` the validation loop never terminates.
+
+.. _lhotse-config-reference:
+
+``LhotseDataLoadingConfig`` field reference
+-------------------------------------------
+
+The complete option schema lives in ``LhotseDataLoadingConfig``
+(``nemo/collections/common/data/lhotse/dataloader.py``). It carries ~80
+fields; the categorization below mirrors the source order and groups
+options by what they control.
+
+**Inputs.** ``input_cfg``, ``manifest_filepath``,
+``tarred_audio_filepaths``, ``cuts_path``, ``shar_path``,
+``skip_missing_manifest_entries``.
+
+**Sampling — basic.** ``batch_size``, ``batch_duration``,
+``quadratic_duration``, ``min_duration``, ``max_duration``, ``min_tps``,
+``max_tps``.
+
+**Sampling — bucketing.** ``use_bucketing``, ``num_buckets``,
+``bucket_duration_bins``, ``bucket_batch_size``, ``bucket_buffer_size``,
+``num_cuts_for_bins_estimate``, ``concurrent_bucketing``.
+
+**Sampling — multimodal.** ``use_multimodal_sampling``, ``prompt_format``,
+``pretokenize``, ``audio_locator_tag``, ``token_equivalent_duration``,
+``batch_tokens``, ``quadratic_factor``, ``min_tokens``, ``max_tokens``,
+``min_tpt``, ``max_tpt``, ``measure_total_length``.
+
+**Sampling — fusion (multi-config).** ``multi_config``, ``sampler_fusion``,
+``sampler_weights``.
+
+**Indexed / resumable.** ``indexed``, ``use_stateful_dataloader``. See
+:ref:`indexed-resumable-dataloading`.
+
+**Mixing & weighting.** ``reweight_temperature``, ``max_open_streams``.
+
+**I/O & distributed.** ``num_workers``, ``pin_memory``, ``shard_seed``,
+``seed``, ``shuffle``, ``shuffle_buffer_size``, ``drop_last``,
+``force_finite``, ``force_map_dataset``, ``force_iterable_dataset``,
+``metadata_only``, ``cuda_expandable_segments``.
+
+**On-the-fly augmentation.**
+
+* Speed/RIR — ``perturb_speed``, ``rir_enabled``, ``rir_path``, ``rir_prob``.
+* Noise — ``noise_path``, ``noise_snr``, ``noise_mix_prob``.
+* Lowpass — ``lowpass_enabled``, ``lowpass_frequencies_interval``,
+  ``lowpass_prob``.
+* Compression — ``compression_enabled``, ``compression_prob``,
+  ``compression_level_interval``, ``compression_codecs``,
+  ``compression_codec_weights``, ``compression_enable_for_custom_fields``.
+* Clipping — ``clipping_enabled``, ``clipping_gain_db``,
+  ``clipping_normalize``, ``clipping_oversampling``, ``clipping_prob``,
+  ``clipping_prob_hard``.
+* Concatenation — ``concatenate_samples``, ``concatenate_gap_seconds``,
+  ``concatenate_duration_factor``, ``concatenate_merge_supervisions``,
+  ``db_norm``.
+
+**Cut transforms.** ``truncate_duration``, ``truncate_offset_type``,
+``cut_into_windows_duration``, ``cut_into_windows_hop``,
+``pad_min_duration``, ``pad_direction``, ``cut_text_into_windows_tokens``,
+``keep_excessive_supervisions``.
+
+**Field-name overrides.** ``text_field``, ``lang_field``,
+``channel_selector``, ``sample_rate``.
+
+**Filtering.** ``max_cer``, ``min_context_speaker_similarity``, ``keep``.
+
+For exact types and defaults, see the dataclass definition in the source
+file — it is the single source of truth.
+
+See also
+--------
+
+* :doc:`speechlm2/datasets` — speech-LM-specific data classes, AIStore
+  GetBatch with indexed mode, and the SpeechLM ``DataModule`` resume
+  contract.
+* :doc:`asr/datasets` — ASR-specific data preparation conventions.
+* :doc:`audio/datasets` — audio (codec, enhancement) data flows.
+* `Lhotse PyTorch Datasets <https://lhotse.readthedocs.io/en/latest/datasets.html>`_
+  — upstream sampler API, ``StatefulDataLoader`` integration, custom RNG
+  state in batch transforms.
+* `Lhotse indexed manifests <https://lhotse.readthedocs.io/en/latest/indexed-manifests.html>`_
+  — the iterator-graph contract that makes O(1) restore work.
diff --git a/docs/source/speechlm2/datasets.rst b/docs/source/speechlm2/datasets.rst
index 2006fbc59cc6..f408458b86c3 100644
--- a/docs/source/speechlm2/datasets.rst
+++ b/docs/source/speechlm2/datasets.rst
@@ -4,6 +4,16 @@ Datasets
 The speechlm2 collection supports datasets that contain both audio and text data for training models that can understand speech and generate appropriate responses.
 This section describes the dataset format, preparation, and usage with the speechlm2 models.
 
+.. seealso::
+
+   :doc:`/dataloaders` is the canonical reference for the underlying Lhotse
+   dataloader: ``input_cfg`` shape, supported formats, sampling/bucketing
+   options, indexed manifests + resumable dataloading, and
+   ``LhotseDataLoadingConfig`` field schema. The page below covers what's
+   speech-LM-specific on top of that — datamodule resume contract,
+   AIStore GetBatch, conversation type semantics in the SALM/duplex
+   recipes.
+
 Dataset Format
 --------------
 
@@ -228,6 +238,27 @@ When enabled:
 
 Leave the env var unset to keep the original tar-iterating loader.
 
+Combining with ``indexed: true``
+""""""""""""""""""""""""""""""""
+
+``USE_AIS_GET_BATCH=true`` coexists with ``indexed: true`` on
+``LazyNeMoTarredIterator`` (and on the multimodal-conversation adapters).
+Indexed mode keeps the JSONL-driven O(1) global indexing and graph-token
+checkpointing, while AIStore GetBatch handles the actual audio fetch:
+
+* The audio-tar ``.idx`` sidecar is **not** required when GetBatch is enabled
+  — the iterator skips opening tar files entirely and emits URL-backed cuts
+  whose ``AudioSource`` points at ``{tar_path}/{audio_filename}``
+  (``type="url"`` for ``ais://...`` paths, ``type="file"`` otherwise).
+* Manifest JSONLs still need their ``.idx`` sidecars; they drive the indexed
+  iterator graph and the ``state_dict`` / ``load_state_dict`` round-trip.
+* Audio bytes are fetched lazily by ``AudioSamples(use_batch_loader=True)`` at
+  collation time, which issues one batched GetBatch request per minibatch.
+
+Use this combination when shards live on AIStore and you want both the
+network efficiency of GetBatch and the exact-resume guarantees of the
+indexed/stateful pipeline.
+
 DuplexSTTDataset
 ****************
 
@@ -264,6 +295,39 @@ The DataModule takes care of:
 1. Setting up proper data parallel ranks for dataloaders
 2. Instantiating the dataloaders with configuration from YAML
 3. Managing multiple datasets for validation/testing
+4. Persisting the train dataloader's iterator state across checkpoints
+   (when ``use_stateful_dataloader: true``)
+
+Checkpointed / resumable training
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The DataModule caches the train dataloader on first ``train_dataloader()``
+call and exposes ``state_dict()`` / ``load_state_dict()`` that delegate to the
+cached dataloader when it supports them. Lightning's trainer wires those into
+every checkpoint automatically, so an experiment configured with::
+
+    data:
+      train_ds:
+        indexed: true
+        use_stateful_dataloader: true
+        ...
+
+resumes in O(1) — sampler RNG, bucketer state, multiplexer choice RNG,
+per-source iterator cursors, and per-worker prefetch queues are all restored
+exactly without replay.
+
+With a regular ``DataLoader`` (``use_stateful_dataloader`` unset or
+``False``) ``state_dict``/``load_state_dict`` become no-ops and resume falls
+back to Lhotse's ``_fast_forward()`` replay path.
+
+Two constraints to keep in mind across save/restore:
+
+* ``num_workers`` and ``world_size`` must match between save and restore
+  (a hard requirement of ``StatefulDataLoader``).
+* All data files must be **uncompressed** and accompanied by ``.idx``
+  sidecars. Build them in one shot with ``scripts/dataloading/build_indexes.py``
+  (see :ref:`indexed-resumable-dataloading` in the main Lhotse dataloading
+  guide).
 
 Bucketing for Efficient Training
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/scripts/dataloading/build_indexes.py b/scripts/dataloading/build_indexes.py
new file mode 100644
index 000000000000..9a39d38a0cda
--- /dev/null
+++ b/scripts/dataloading/build_indexes.py
@@ -0,0 +1,377 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Build O(1)-restore index sidecars for an arbitrary NeMo Lhotse ``input_cfg``.
+
+Walks a NeMo dataloading config (``input_cfg`` YAML, including nested ``group``
+entries and per-entry YAML references), discovers every JSONL/tar file an
+indexed dataloader will need, and creates the corresponding ``.idx`` sidecars
+next to each data file.
+
+Two tar layouts are dispatched correctly:
+
+* NeMo tarred audio (one regular member per sample, name-keyed) — uses
+  ``nemo.collections.common.data.lhotse.indexed_adapters.create_tar_index``
+  which records one offset per *basename group*.
+* WebDataset/Shar tars (json + payload pairs) — uses
+  ``lhotse.indexing.create_tar_index`` which records one offset per *member
+  pair*.
+
+Local files and remote URIs are both supported via lhotse's ``open_best``
+(which routes to ``smart_open`` / AIStore SDK when available). The ``.idx`` is
+written next to its source path, so the storage backend must accept writes at
+that location — for read-only object stores, materialize the data locally
+first or pre-build indexes at upload time.
+
+Examples::
+
+    # Build indexes for everything referenced by an input_cfg.yaml.
+    python scripts/dataloading/build_indexes.py path/to/input_cfg.yaml
+
+    # Multiple configs at once.
+    python scripts/dataloading/build_indexes.py train.yaml validation.yaml
+
+    # Show what would be built without writing anything.
+    python scripts/dataloading/build_indexes.py --dry-run path/to/input_cfg.yaml
+
+    # Rebuild even when an .idx already exists; parallelize across 16 workers.
+    python scripts/dataloading/build_indexes.py --force --workers 16 path/to/input_cfg.yaml
+"""
+
+import logging
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable, Iterable, Iterator
+
+import click
+from omegaconf import DictConfig, ListConfig, OmegaConf
+
+from nemo.collections.common.data.lhotse.indexed_adapters import (
+    create_tar_index as create_nemo_tar_index,
+)
+from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths
+
+
+# --------------------------------------------------------------------------- #
+# Tar layout taxonomy.
+# --------------------------------------------------------------------------- #
+# NEMO_TAR  — one regular member per sample, indexed by basename. Used by
+#             nemo / nemo_tarred / multimodal_conversation / share_gpt audio
+#             tars (read via IndexedTarMemberReader).
+# WDS_TAR   — WebDataset-style: each sample is a pair of consecutive members
+#             (e.g. {N}.json + {N}.<audio>). Used by lhotse_shar tars and
+#             share_gpt_webdataset tars (read via IndexedTarSampleReader).
+NEMO_TAR = "nemo_tar"
+WDS_TAR = "wds_tar"
+JSONL = "jsonl"
+
+
+@dataclass(frozen=True)
+class IndexJob:
+    path: str  # data file that needs a sidecar
+    kind: str  # layout tag, one of {JSONL, NEMO_TAR, WDS_TAR}
+
+    def idx_path(self) -> str:
+        return f"{self.path}.idx"
+
+
+# --------------------------------------------------------------------------- #
+# Path discovery.
+# --------------------------------------------------------------------------- #
+
+
+def _as_list(val) -> list:
+    """Coerce ``None`` / a scalar / a sequence into a plain ``list``."""
+    is_sequence = isinstance(val, (list, tuple, ListConfig))
+    if is_sequence:
+        return list(val)
+    return [] if val is None else [val]
+
+
+def _flatten_path_spec(spec) -> list[str]:
+    """
+    NeMo's manifest_filepath / tarred_audio_filepaths accept several layouts:
+      str, list[str], list[list[str]], list[tuple[str, weight]], ...
+    Flatten any of those into plain string paths; empty nested entries are skipped.
+    """
+    out: list[str] = []
+    for item in _as_list(spec):
+        if isinstance(item, (str, Path)):
+            out.append(str(item))
+        elif isinstance(item, (list, tuple, ListConfig)) and len(item) > 0:
+            # [path] or [path, weight] or [[path], [path], ...]
+            head = item[0]
+            if isinstance(head, (str, Path)):
+                out.append(str(head))
+            else:
+                out.extend(_flatten_path_spec(item))
+    return out
+
+
+def _expand_jsonl(spec) -> list[str]:
+    return [p for shards in map(expand_sharded_filepaths, _flatten_path_spec(spec)) for p in shards]
+
+
+def _expand_tars(spec) -> list[str]:
+    return _expand_jsonl(spec)  # tar specs use the same flatten-then-expand rules
+
+
+def _resolve_input_cfg(val) -> ListConfig | None:
+    """Return an inline ``input_cfg`` as-is, load one given as a YAML path, else ``None``."""
+    if isinstance(val, (str, Path)):
+        return OmegaConf.load(str(val))
+    if not isinstance(val, (list, ListConfig)):
+        return None
+    return val
+
+
+# Types that don't read any data themselves — they delegate to
+# ``read_cutset_from_config(config)`` and accept *any* underlying source's keys
+# (``cuts_path``, ``shar_path``, ``manifest_filepath`` [+ ``tarred_audio_filepaths``],
+# nested ``input_cfg``, …). Treat them as transparent passthroughs.
+_TRANSFORM_TYPES = frozenset({
+    "lhotse_as_conversation",
+    "sqa_as_conversation",
+    "s2s_as_conversation",
+    "s2s_duplex_overlap_as_s2s_duplex",
+    "s2s_duplex_reverse_role",
+    "lhotse_magpietts_data_as_continuation",
+    "nemo_tarred_to_duplex",
+})
+
+# Types that index nothing on their own.
+_NO_INDEX_TYPES = frozenset({"txt", "txt_pair", "parquet", "multi_speaker_simulator"})
+
+
+def _discover_keys(entry, jobs: list[IndexJob]) -> None:
+    """
+    Append IndexJobs for every data-source key present on *entry*, whatever
+    its ``type``: ``cuts_path``, ``shar_path``, ``manifest_filepath`` (plus
+    ``tarred_audio_filepaths``), ``paths`` and a nested ``input_cfg``.
+    """
+    cuts_path = entry.get("cuts_path")
+    if cuts_path is not None:
+        jobs.extend(IndexJob(p, JSONL) for p in _expand_jsonl(cuts_path))
+    shar_path = entry.get("shar_path")
+    if shar_path is not None:
+        _discover_shar(shar_path, jobs)
+    manifests = entry.get("manifest_filepath")
+    if manifests is not None:
+        jobs.extend(IndexJob(p, JSONL) for p in _expand_jsonl(manifests))
+        jobs.extend(IndexJob(p, NEMO_TAR) for p in _expand_tars(entry.get("tarred_audio_filepaths")))
+    text_paths = entry.get("paths")
+    if text_paths is not None:
+        jobs.extend(IndexJob(p, JSONL) for p in _expand_jsonl(text_paths))
+    nested = _resolve_input_cfg(entry.get("input_cfg"))
+    if nested is not None:
+        discover(nested, jobs)
+
+
+def discover(entry, jobs: list[IndexJob]) -> None:
+    """Recursively walk an ``input_cfg`` node, appending to *jobs* every IndexJob it requires."""
+    if isinstance(entry, (list, ListConfig)):
+        for sub in entry:
+            discover(sub, jobs)
+        return
+    if not isinstance(entry, (dict, DictConfig)):
+        return
+
+    typ = entry.get("type")
+    if typ is None:
+        # Top-level wrapper (``input_cfg: [...]``) — recurse into every value.
+        for v in entry.values():
+            discover(v, jobs)
+        return
+
+    if typ in _NO_INDEX_TYPES:
+        return
+
+    if typ == "group" or typ in _TRANSFORM_TYPES:
+        # Group and transform passthroughs: dispatch by keys.
+        _discover_keys(entry, jobs)
+        return
+
+    if typ in ("nemo", "nemo_tarred", "multimodal_conversation", "share_gpt"):
+        for p in _expand_jsonl(entry.get("manifest_filepath")):
+            jobs.append(IndexJob(p, JSONL))
+        for p in _expand_tars(entry.get("tarred_audio_filepaths")):
+            jobs.append(IndexJob(p, NEMO_TAR))
+        return
+
+    if typ == "share_gpt_webdataset":
+        # Layout: data_dir/shard-N.tar [+ optional shard-N.tar.idx, manifest jsonl].
+        data_dir = entry.get("data_dir")
+        if data_dir is None:
+            return
+        for ext, kind in ((".tar", WDS_TAR), (".jsonl", JSONL)):
+            for p in sorted(Path(data_dir).glob(f"*{ext}")):
+                jobs.append(IndexJob(str(p), kind))
+        return
+
+    if typ == "lhotse":
+        if (cuts_path := entry.get("cuts_path")) is not None:
+            for p in _expand_jsonl(cuts_path):
+                jobs.append(IndexJob(p, JSONL))
+        if (shar_path := entry.get("shar_path")) is not None:
+            _discover_shar(shar_path, jobs)
+        return
+
+    if typ == "lhotse_shar":
+        _discover_shar(entry.get("shar_path"), jobs)
+        return
+
+    if typ == "txt_jsonl":
+        for p in _expand_jsonl(entry.get("paths")):
+            jobs.append(IndexJob(p, JSONL))
+        return
+
+    # Unknown type: warn so that a silently unindexed source does not go unnoticed.
+    logging.warning("Skipping entry of unrecognized type %r: no index jobs derived.", typ)
+
+
+def _discover_shar(shar_path, jobs: list[IndexJob]) -> None:
+    """Append a job per ``.jsonl``/``.tar`` shard for a Shar dir, a list of dirs, or a field->shards dict."""
+    if shar_path is None:
+        return
+    if isinstance(shar_path, (str, Path)):
+        candidates = [shar_path]  # a single Shar directory
+    elif isinstance(shar_path, (list, ListConfig)):
+        candidates = []
+        for item in shar_path:
+            if isinstance(item, (str, Path)):
+                candidates.append(item)
+            elif isinstance(item, (list, tuple, ListConfig)) and item:
+                candidates.append(item[0])  # [path, weight] form: keep the path, drop the weight
+    elif isinstance(shar_path, (dict, DictConfig)):
+        # {field: [shard, ...]} layout — shards are listed explicitly, so no directory scan is needed.
+        for v in shar_path.values():
+            for raw in _flatten_path_spec(v):
+                for p in expand_sharded_filepaths(raw):
+                    if p.endswith(".jsonl"):
+                        jobs.append(IndexJob(p, JSONL))
+                    elif p.endswith(".tar"):
+                        jobs.append(IndexJob(p, WDS_TAR))
+        return
+    else:
+        return  # unsupported spec type: nothing is discovered
+
+    for d in candidates:
+        d = Path(str(d))
+        if not d.is_dir():
+            continue  # anything that is not an existing local directory is skipped silently
+        for p in sorted(d.iterdir()):
+            if p.suffix == ".jsonl":  # other suffixes (e.g. ``.jsonl.gz``) match neither branch
+                jobs.append(IndexJob(str(p), JSONL))
+            elif p.suffix == ".tar":
+                jobs.append(IndexJob(str(p), WDS_TAR))
+
+
+# --------------------------------------------------------------------------- #
+# Index builders.
+# --------------------------------------------------------------------------- #
+
+
+def _build_one(job: IndexJob) -> tuple[IndexJob, str]:
+    """Index *job* with the builder matching its kind; return ``(job, "built")``."""
+    from lhotse.indexing import create_jsonl_index, create_tar_index as create_wds_tar_index
+
+    if job.kind == NEMO_TAR:
+        # NeMo's create_tar_index has a (tar_path, idx_path) signature.
+        create_nemo_tar_index(job.path, job.idx_path())
+        return job, "built"
+
+    # The lhotse indexers are called with the data path only.
+    path_only_builders: dict[str, Callable[[str], object]] = {
+        JSONL: create_jsonl_index,
+        WDS_TAR: create_wds_tar_index,
+    }
+    path_only_builders[job.kind](job.path)
+    return job, "built"
+
+
+def _is_indexed(job: IndexJob) -> bool:
+    """Report whether *job*'s ``.idx`` exists as a non-empty local file."""
+    try:
+        sidecar = Path(job.idx_path())
+        return bool(sidecar.stat().st_size) if sidecar.is_file() else False
+    except OSError:
+        return False
+
+
+# --------------------------------------------------------------------------- #
+# CLI.
+# --------------------------------------------------------------------------- #
+
+
+@click.command(context_settings={"show_default": True})
+@click.argument("input_cfgs", type=click.Path(exists=True, dir_okay=False), nargs=-1, required=True)
+@click.option("--force", is_flag=True, help="Rebuild .idx files even if they already exist.")
+@click.option("--workers", type=int, default=4, help="Number of parallel index builders.")
+@click.option("--dry-run", is_flag=True, help="List the jobs without writing anything.")
+def main(input_cfgs: tuple[str, ...], force: bool, workers: int, dry_run: bool):
+    """
+    Build .idx sidecars for every JSONL/tar referenced by INPUT_CFGS.
+
+    INPUT_CFGS are NeMo Lhotse dataloading configs (``input_cfg`` YAML).
+    """
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    jobs: list[IndexJob] = []
+    for cfg_path in input_cfgs:
+        cfg = OmegaConf.load(cfg_path)
+        discover(cfg, jobs)
+
+    # Deduplicate on (path, kind) while preserving discovery order.
+    seen: set[tuple[str, str]] = set()
+    unique: list[IndexJob] = []
+    for j in jobs:
+        key = (j.path, j.kind)
+        if key not in seen:
+            seen.add(key)
+            unique.append(j)
+
+    todo = unique if force else [j for j in unique if not _is_indexed(j)]
+    skipped = len(unique) - len(todo)
+
+    logging.info("Discovered %d files (%d already indexed, %d to build).",
+                 len(unique), skipped, len(todo))
+
+    if dry_run or not todo:
+        for j in todo:
+            logging.info("  [%s] %s", j.kind, j.path)
+        return
+
+    failures: list[tuple[IndexJob, Exception]] = []
+    with ThreadPoolExecutor(max_workers=max(1, workers)) as ex:
+        futures = {ex.submit(_build_one, j): j for j in todo}
+        for fut in as_completed(futures):
+            j = futures[fut]
+            try:
+                _, status = fut.result()
+                logging.info("  [%s] %s -> %s", status, j.kind, j.path)
+            except Exception as e:  # noqa: BLE001 — record and keep building; Ctrl-C / SystemExit still abort
+                failures.append((j, e))
+                logging.error("  [FAIL] %s %s: %s", j.kind, j.path, e)
+
+    if failures:
+        logging.error("\n%d index build(s) failed:", len(failures))
+        for j, e in failures:
+            logging.error("  %s (%s): %s", j.path, j.kind, e)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()

From ffc53c797087d9d7b45560165e44d18e7d7455cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Thu, 7 May 2026 09:16:26 -0700
Subject: [PATCH 06/30] Add support for external index directory
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 docs/source/dataloaders.rst                   |  66 +++++-
 nemo/collections/common/data/lhotse/cutset.py |  20 +-
 .../common/data/lhotse/dataloader.py          |   6 +
 .../common/data/lhotse/indexed_adapters.py    |  48 ++++-
 .../common/data/lhotse/nemo_adapters.py       |  20 +-
 .../common/data/lhotse/text_adapters.py       |  49 ++++-
 scripts/dataloading/build_indexes.py          | 107 ++++++----
 scripts/dataloading/prefetch_indexes.py       | 190 ++++++++++++++++++
 8 files changed, 447 insertions(+), 59 deletions(-)
 create mode 100644 scripts/dataloading/prefetch_indexes.py

diff --git a/docs/source/dataloaders.rst b/docs/source/dataloaders.rst
index 7ef1ffbc761d..c4b9a332d07a 100644
--- a/docs/source/dataloaders.rst
+++ b/docs/source/dataloaders.rst
@@ -884,6 +884,67 @@ Two equivalent ways:
    skips files that already have an up-to-date ``.idx``. Use ``--force`` to
    rebuild, ``--workers N`` for parallelism, ``--dry-run`` to preview.
 
+   Pass ``--indexes-root /path/to/mirror`` to write the sidecars to a
+   separate directory tree that mirrors the data files' layout instead of
+   placing them next to the data — see :ref:`lhotse-indexes-root` below.
+
+.. _lhotse-indexes-root:
+
+Storing ``.idx`` sidecars in a separate directory (``indexes_root``)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default, every ``.idx`` lives next to its data file
+(``cuts.jsonl`` ↔ ``cuts.jsonl.idx``). If your data sits on shared, slow,
+or read-only storage (NFS, S3, AIStore), you may want to keep the indexes
+on a fast local disk instead. Set ``indexes_root`` at the top of the
+dataloader config:
+
+.. code-block:: yaml
+
+    data:
+      train_ds:
+        indexed: true
+        use_stateful_dataloader: true
+        indexes_root: /scratch/idx     # mirror lives here
+        input_cfg:
+          - type: nemo_tarred
+            manifest_filepath: /shared/data/asr/manifest__OP_0..127_CL_.jsonl
+            tarred_audio_filepaths: ais://bucket/asr/audio__OP_0..127_CL_.tar
+
+Index lookups for each data file ``D`` resolve to
+``<indexes_root>/<D-with-scheme-stripped>.idx``. Examples::
+
+    /shared/data/asr/manifest_0.jsonl    -> /scratch/idx/shared/data/asr/manifest_0.jsonl.idx
+    ais://bucket/asr/audio_0.tar        -> /scratch/idx/bucket/asr/audio_0.tar.idx
+
+The setting cascades through ``read_dataset_config`` to every nested
+``input_cfg`` entry, so a single top-level value covers the whole pipeline.
+You can override it per-source on any entry that needs a different mirror.
+
+Two ways to populate the mirror:
+
+1. **Build the indexes there to begin with**::
+
+       python scripts/dataloading/build_indexes.py \
+           --indexes-root /scratch/idx path/to/input_cfg.yaml
+
+   The script reads each data file in place, computes the offsets, and
+   writes the ``.idx`` directly to the mirrored target.
+
+2. **Prefetch existing remote indexes** when sidecars already live next to
+   the data on shared/object storage and you just want a local copy::
+
+       python scripts/dataloading/prefetch_indexes.py \
+           --indexes-root /scratch/idx path/to/input_cfg.yaml
+
+   ``prefetch_indexes.py`` walks the same ``input_cfg``, locates every
+   sidecar at its natural location (via lhotse's ``open_best``, so
+   ``ais://`` / ``s3://`` / ``http://`` are all supported as sources),
+   and copies it into the local mirror. Use ``--source-indexes-root``
+   when the source sidecars themselves live under another mirror.
+
+Both scripts accept ``--force``, ``--workers N``, and ``--dry-run``.
+
 End-to-end YAML example
 ^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -1388,8 +1449,9 @@ options by what they control.
 **Sampling — fusion (multi-config).** ``multi_config``, ``sampler_fusion``,
 ``sampler_weights``.
 
-**Indexed / resumable.** ``indexed``, ``use_stateful_dataloader``. See
-:ref:`indexed-resumable-dataloading`.
+**Indexed / resumable.** ``indexed``, ``use_stateful_dataloader``,
+``indexes_root``. See :ref:`indexed-resumable-dataloading` and
+:ref:`lhotse-indexes-root`.
 
 **Mixing & weighting.** ``reweight_temperature``, ``max_open_streams``.
 
diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py
index 84b74804e4fd..18243e71deda 100644
--- a/nemo/collections/common/data/lhotse/cutset.py
+++ b/nemo/collections/common/data/lhotse/cutset.py
@@ -286,6 +286,7 @@ def read_dataset_config(config) -> tuple[CutSet, bool]:
         "force_iterable_dataset": config.get("force_iterable_dataset", False),
         "slice_length": config.get("slice_length", None),
         "indexed": config.get("indexed", False),
+        "indexes_root": config.get("indexes_root", None),
         # Temperature for re-weighting datasets. 1 is a neutral value. Lower temperature over-samples smaller datasets, and vice versa.
         "reweight_temperature": config.get("reweight_temperature", None),
     }
@@ -350,6 +351,7 @@ def read_txt_jsonl_paths(config: DictConfig) -> tuple[CutSet, bool]:
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
             indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
         )
     )
     if not config.get("force_finite", False):
@@ -387,6 +389,7 @@ def read_nemo_sft_jsonl(config: DictConfig) -> tuple[CutSet, bool]:
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
             indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
         )
     )
     if not config.get("force_finite", False):
@@ -409,6 +412,7 @@ def read_multimodal_conversation_jsonl(config: DictConfig) -> tuple[CutSet, bool
             context=config.get("tags", {}).get("context"),
             slice_length=config.get("slice_length"),
             indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
         )
     )
     if not config.get("force_finite", False):
@@ -431,6 +435,7 @@ def read_share_gpt_as_conversation(config) -> tuple[CutSet, bool]:
             shard_seed=config.shard_seed,
             slice_length=config.get("slice_length"),
             indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
         )
     )
     if not config.get("force_finite", False):
@@ -450,6 +455,7 @@ def read_share_gpt_webdataset_as_conversation(config) -> tuple[CutSet, bool]:
             shuffle_shards=config.shuffle,
             shard_seed=config.shard_seed,
             indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
         )
     )
     # When force_finite is False (default), repeat the dataset infinitely so that
@@ -721,7 +727,13 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
     else:
         # Regular Lhotse manifest points to individual audio files (like native NeMo manifest).
         path = config.cuts_path
-        cuts = CutSet.from_file(path, indexed=config.get("indexed", None)).map(
+        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
+        indexes_root = config.get("indexes_root", None)
+        from_file_kwargs = {"indexed": config.get("indexed", None)}
+        if indexes_root is not None:
+            from_file_kwargs["index_path"] = resolve_idx_path(path, indexes_root)
+        cuts = CutSet.from_file(path, **from_file_kwargs).map(
             partial(resolve_relative_paths, manifest_path=path)
         )
     return cuts, is_tarred
@@ -1470,7 +1482,9 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
             else:
                 common_kwargs[key] = config[key]
     indexed = config.get("indexed", False)
-    notar_kwargs_extra = {"indexed": indexed} if indexed else {}
+    indexes_root = config.get("indexes_root", None)
+    indexed_extra = {"indexes_root": indexes_root} if (indexed and indexes_root is not None) else {}
+    notar_kwargs_extra = {"indexed": indexed, **indexed_extra} if indexed else {}
     # The option below is to allow a special case of NeMo manifest iteration as Lhotse CutSet
     # without performing any I/O. NeMo manifests typically don't have sampling_rate information required by Lhotse,
     # so lhotse has to look up the headers of audio files to fill it on-the-fly.
@@ -1480,7 +1494,7 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
     metadata_only = config.get("metadata_only", False)
     force_finite = config.get("force_finite", False)
     notar_kwargs = {"metadata_only": metadata_only}
-    tar_kwargs_extra = {"indexed": indexed} if indexed else {}
+    tar_kwargs_extra = {"indexed": indexed, **indexed_extra} if indexed else {}
     is_tarred = config.get("tarred_audio_filepaths") is not None
     if isinstance(config.manifest_filepath, (str, Path)):
         if is_tarred and not metadata_only:
diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py
index f062978cc151..6a3a6a5939e6 100644
--- a/nemo/collections/common/data/lhotse/dataloader.py
+++ b/nemo/collections/common/data/lhotse/dataloader.py
@@ -259,6 +259,12 @@ class LhotseDataLoadingConfig:
     # ``True`` = require indexed reads (errors if .idx is missing).
     # ``False`` = streaming reads only.
     indexed: Optional[bool] = None
+    # When set, ``.idx`` sidecars are read from a mirror under this root that
+    # preserves the data files' directory structure (URL schemes are stripped,
+    # leading separators dropped). Use this to keep indexes on a fast local
+    # disk while the data lives on shared / object storage. Cascades through
+    # ``read_dataset_config`` to every nested ``input_cfg`` entry.
+    indexes_root: Optional[str] = None
     # When True, build the dataloader with ``torchdata.stateful_dataloader.StatefulDataLoader``
     # instead of ``torch.utils.data.DataLoader``. Combined with a checkpointable lhotse sampler
     # (DynamicBucketingSampler / DynamicCutSampler), this enables exact resume from the next batch
diff --git a/nemo/collections/common/data/lhotse/indexed_adapters.py b/nemo/collections/common/data/lhotse/indexed_adapters.py
index 597e6c1f4726..3af89186bccf 100644
--- a/nemo/collections/common/data/lhotse/indexed_adapters.py
+++ b/nemo/collections/common/data/lhotse/indexed_adapters.py
@@ -14,10 +14,11 @@
 import json
 import os
 import random
+import re
 import struct
 import tarfile
 from pathlib import Path
-from typing import NamedTuple
+from typing import NamedTuple, Optional
 
 import numpy as np
 
@@ -28,6 +29,51 @@
 _TAR_BLOCK_SIZE = 512
 _TAR_ZERO_BLOCK = b'\0' * _TAR_BLOCK_SIZE
 
+# Recognized URL schemes whose authority ("host" component) is part of the
+# logical path (e.g. the bucket name). Stripping just the scheme keeps the
+# bucket+key in the relative path used to mirror under indexes_root.
+_URL_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://")
+
+
+def resolve_idx_path(data_path: str | Path, indexes_root: Optional[str | Path] = None) -> str:
+    """
+    Compute the ``.idx`` sidecar path for *data_path*.
+
+    When ``indexes_root`` is ``None`` (the default), return ``data_path + ".idx"``
+    so the sidecar lives next to the data file, matching the conventional
+    layout.
+
+    When ``indexes_root`` is set, return a path under that root that mirrors
+    the data file's directory structure. URL schemes are stripped (so the
+    bucket/key remains as the relative key); leading separators on local paths
+    are dropped. Examples::
+
+        /data/foo/bar.jsonl       + indexes_root=/cache/idx
+            -> /cache/idx/data/foo/bar.jsonl.idx
+        ais://bucket/key/m.jsonl  + indexes_root=/cache/idx
+            -> /cache/idx/bucket/key/m.jsonl.idx
+        s3://b/path/data.tar      + indexes_root=/cache/idx
+            -> /cache/idx/b/path/data.tar.idx
+
+    The indexes_root argument itself can be local or a URL — joining respects
+    URL semantics so e.g. mirroring into ``ais://cache/idx`` works the same way.
+    """
+    data_str = str(data_path)
+    if indexes_root is None:
+        return data_str + ".idx"
+
+    # Normalize the data path into a relative "key" by stripping a URL scheme,
+    # any leading slashes, and Windows-style drive letters (best-effort).
+    rel = _URL_RE.sub("", data_str).lstrip("/\\")
+    # Strip "C:" or "C:/" style drive prefixes.
+    if len(rel) >= 2 and rel[1] == ":":
+        rel = rel[2:].lstrip("/\\")
+
+    root_str = str(indexes_root).rstrip("/\\") or str(indexes_root)  # keep a bare "/" root absolute
+    if _URL_RE.match(root_str):
+        return f"{root_str}/{rel}.idx"
+    return str(Path(root_str) / (rel + ".idx"))
+
 
 class LazyShuffledRange:
     """
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index db3eea4c80d9..f29746128468 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -130,6 +130,7 @@ def __init__(
         shard_seed: int | Literal["randomized", "trng"] = "trng",
         extra_fields: list[dict[str, str]] | None = None,
         indexed: bool = False,
+        indexes_root: str | Path | None = None,
     ) -> None:
         self.path = path
         self.shuffle_shards = shuffle_shards
@@ -139,6 +140,7 @@ def __init__(
         self.metadata_only = metadata_only
         self.extra_fields = extra_fields
         self.indexed = indexed
+        self.indexes_root = indexes_root
         validate_extra_fields(self.extra_fields)
         paths = expand_sharded_filepaths(path)
 
@@ -149,9 +151,14 @@ def __init__(
                     "their values are positional/streaming and cannot be reconstructed under "
                     "graph-token random access."
                 )
+            from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
             seed = resolve_seed(shard_seed) if shard_seed not in (None, "trng", "randomized") else 0
             indexed_sources = [
-                LazyIndexedManifestIterator(p, decode=GraphOriginDict) for p in paths
+                LazyIndexedManifestIterator(
+                    p, index_path=resolve_idx_path(p, indexes_root), decode=GraphOriginDict
+                )
+                for p in paths
             ]
             if len(indexed_sources) == 1:
                 self.source = indexed_sources[0]
@@ -394,9 +401,11 @@ def __init__(
         extra_fields: list[dict[str, str]] | None = None,
         slice_length: int = None,
         indexed: bool = False,
+        indexes_root: str | Path | None = None,
     ) -> None:
         self.skip_missing_manifest_entries = skip_missing_manifest_entries
         self.indexed = indexed
+        self.indexes_root = indexes_root
         self.shard_id_to_manifest: dict[int, Iterable[dict]]
         self.paths = expand_sharded_filepaths(manifest_path)
         if len(self.paths) == 1:
@@ -470,6 +479,7 @@ def _init_indexed(self) -> None:
 
         from nemo.collections.common.data.lhotse.indexed_adapters import (
             IndexedTarMemberReader,
+            resolve_idx_path,
         )
 
         if self.extra_fields:
@@ -506,9 +516,13 @@ def _init_indexed(self) -> None:
         for sid in self._sorted_shard_ids:
             jsonl_path = shard_id_to_manifest_path[sid]
             tar_path = self.shard_id_to_tar_path[sid]
-            self._cuts_readers[sid] = IndexedJsonlReader(jsonl_path)
+            self._cuts_readers[sid] = IndexedJsonlReader(
+                jsonl_path, index_path=resolve_idx_path(jsonl_path, self.indexes_root)
+            )
             if not self.use_ais_get_batch:
-                self._tar_readers[sid] = IndexedTarMemberReader(tar_path)
+                self._tar_readers[sid] = IndexedTarMemberReader(
+                    tar_path, idx_path=resolve_idx_path(tar_path, self.indexes_root)
+                )
             cum += len(self._cuts_readers[sid])
             cum_lens.append(cum)
         self._cum_lens = cum_lens
diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py
index f2d161528bb3..621fd7188e9a 100644
--- a/nemo/collections/common/data/lhotse/text_adapters.py
+++ b/nemo/collections/common/data/lhotse/text_adapters.py
@@ -149,6 +149,7 @@ class LhotseTextJsonlAdapter(IteratorNode):
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
     indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
 
     def __post_init__(self):
         self.paths = expand_sharded_filepaths(self.paths)
@@ -159,8 +160,10 @@ def __post_init__(self):
         if self.indexed:
             from lhotse.indexing import IndexedJsonlReader
 
+            from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
             for p in self.paths:
-                self._readers.append(IndexedJsonlReader(p))
+                self._readers.append(IndexedJsonlReader(p, index_path=resolve_idx_path(p, self.indexes_root)))
             cum = 0
             self._cum_lens.append(cum)
             for r in self._readers:
@@ -419,6 +422,7 @@ class NeMoSFTJsonlAdapter(IteratorNode):
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
     indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
 
     def __post_init__(self):
         self.paths = expand_sharded_filepaths(self.paths)
@@ -429,8 +433,10 @@ def __post_init__(self):
         if self.indexed:
             from lhotse.indexing import IndexedJsonlReader
 
+            from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
             for p in self.paths:
-                self._readers.append(IndexedJsonlReader(p))
+                self._readers.append(IndexedJsonlReader(p, index_path=resolve_idx_path(p, self.indexes_root)))
             cum = 0
             self._cum_lens.append(cum)
             for r in self._readers:
@@ -800,6 +806,7 @@ class NeMoMultimodalConversationJsonlAdapter(IteratorNode):
     context: str | None = None
     slice_length: int | None = None
     indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
 
     def __post_init__(self):
         self.manifest_filepath = expand_sharded_filepaths(self.manifest_filepath)
@@ -833,17 +840,21 @@ def has_constant_time_access(self) -> bool:
     def _init_indexed(self) -> None:
         from lhotse.indexing import IndexedJsonlReader
 
+        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
         if self.slice_length is not None:
             raise ValueError(
                 "NeMoMultimodalConversationJsonlAdapter(indexed=True) does not support slice_length."
             )
         for p in self.manifest_filepath:
-            self._cuts_readers.append(IndexedJsonlReader(p))
+            self._cuts_readers.append(IndexedJsonlReader(p, index_path=resolve_idx_path(p, self.indexes_root)))
         if self.tarred_audio_filepaths is not None:
             from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
 
             for p in self.tarred_audio_filepaths:
-                self._tar_readers.append(IndexedTarMemberReader(p))
+                self._tar_readers.append(
+                    IndexedTarMemberReader(p, idx_path=resolve_idx_path(p, self.indexes_root))
+                )
         cum = 0
         self._cum_lens.append(cum)
         for r in self._cuts_readers:
@@ -1266,8 +1277,11 @@ class NeMoMultimodalConversationShareGPTJsonlAdapter(IteratorNode):
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
     slice_length: int | None = None
     indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
 
     def __post_init__(self):
+        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
         self.manifest_filepath = expand_sharded_filepaths(self.manifest_filepath)
         if self.tarred_audio_filepaths is not None:
             self.tarred_audio_filepaths = expand_sharded_filepaths(self.tarred_audio_filepaths)
@@ -1275,7 +1289,9 @@ def __post_init__(self):
                 self.tarred_audio_filepaths
             ), f"{len(self.manifest_filepath)} != {len(self.tarred_audio_filepaths)}"
         self.audio_placeholders = _normalize_audio_placeholders(self.audio_placeholders)
-        self._has_index = all(Path(p + ".idx").exists() for p in self.manifest_filepath)
+        self._has_index = all(
+            Path(resolve_idx_path(p, self.indexes_root)).exists() for p in self.manifest_filepath
+        )
         self.epoch = 0
         self._cuts_readers: list = []
         self._tar_readers: list = []
@@ -1301,17 +1317,21 @@ def has_constant_time_access(self) -> bool:
     def _init_indexed(self) -> None:
         from lhotse.indexing import IndexedJsonlReader
 
+        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
         if self.slice_length is not None:
             raise ValueError(
                 "NeMoMultimodalConversationShareGPTJsonlAdapter(indexed=True) does not support slice_length."
             )
         for p in self.manifest_filepath:
-            self._cuts_readers.append(IndexedJsonlReader(p))
+            self._cuts_readers.append(IndexedJsonlReader(p, index_path=resolve_idx_path(p, self.indexes_root)))
         if self.tarred_audio_filepaths is not None:
             from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
 
             for p in self.tarred_audio_filepaths:
-                self._tar_readers.append(IndexedTarMemberReader(p))
+                self._tar_readers.append(
+                    IndexedTarMemberReader(p, idx_path=resolve_idx_path(p, self.indexes_root))
+                )
         cum = 0
         self._cum_lens.append(cum)
         for r in self._cuts_readers:
@@ -1585,10 +1605,13 @@ class NeMoMultimodalConversationShareGPTWebdatasetAdapter(IteratorNode):
     shuffle_shards: bool = False
     shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
     indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
 
     def __post_init__(self):
         import json as _json
 
+        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
         meta_path = Path(self.data_dir) / "wids-meta.json"
         if meta_path.exists():
             with open(meta_path) as f:
@@ -1599,7 +1622,9 @@ def __post_init__(self):
             if not self._shard_paths:
                 raise FileNotFoundError(f"No wids-meta.json and no .tar files found under {self.data_dir}")
         self.audio_placeholders = _normalize_audio_placeholders(self.audio_placeholders)
-        self._has_index = all(Path(p + ".idx").exists() for p in self._shard_paths)
+        self._has_index = all(
+            Path(resolve_idx_path(p, self.indexes_root)).exists() for p in self._shard_paths
+        )
         self.epoch = 0
         self._tar_readers: list = []
         self._cum_lens: list[int] = []
@@ -1622,8 +1647,10 @@ def has_constant_time_access(self) -> bool:
         return self.indexed
 
     def _init_indexed(self) -> None:
+        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
         for p in self._shard_paths:
-            self._tar_readers.append(IndexedTarSampleReader(p))
+            self._tar_readers.append(IndexedTarSampleReader(p, idx_path=resolve_idx_path(p, self.indexes_root)))
         cum = 0
         self._cum_lens.append(cum)
         for r in self._tar_readers:
@@ -1726,11 +1753,13 @@ def _iter_sequential(self):
         self.epoch += 1
 
     def _iter_indexed(self):
+        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
         shard_paths = list(self._shard_paths)
         rng = self._get_rng()
         rng.shuffle(shard_paths)
         for tar_path in shard_paths:
-            reader = IndexedTarSampleReader(tar_path)
+            reader = IndexedTarSampleReader(tar_path, idx_path=resolve_idx_path(tar_path, self.indexes_root))
             for idx in LazyShuffledRange(len(reader), rng):
                 json_data, audio_bytes, audio_name = reader[idx]
                 yield self._yield_from_sample(json_data, audio_bytes, audio_name)
diff --git a/scripts/dataloading/build_indexes.py b/scripts/dataloading/build_indexes.py
index 9a39d38a0cda..eacbaed2fb39 100644
--- a/scripts/dataloading/build_indexes.py
+++ b/scripts/dataloading/build_indexes.py
@@ -54,13 +54,14 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Callable, Iterable, Iterator
+from typing import Callable, Iterable, Iterator, Optional
 
 import click
 from omegaconf import DictConfig, ListConfig, OmegaConf
 
 from nemo.collections.common.data.lhotse.indexed_adapters import (
     create_tar_index as create_nemo_tar_index,
+    resolve_idx_path,
 )
 from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths
 
@@ -83,9 +84,10 @@
 class IndexJob:
     path: str
     kind: str  # one of {JSONL, NEMO_TAR, WDS_TAR}
+    indexes_root: Optional[str] = None
 
     def idx_path(self) -> str:
-        return self.path + ".idx"
+        return resolve_idx_path(self.path, self.indexes_root)
 
 
 # --------------------------------------------------------------------------- #
@@ -156,44 +158,49 @@ def _resolve_input_cfg(val) -> ListConfig | None:
 _NO_INDEX_TYPES = frozenset({"txt", "txt_pair", "parquet", "multi_speaker_simulator"})
 
 
-def _discover_keys(entry, jobs: list[IndexJob]) -> None:
+def _discover_keys(entry, jobs: list[IndexJob], indexes_root: Optional[str]) -> None:
     """
     Key-based dispatch: emit IndexJobs based on which underlying-source keys
     are present, regardless of ``type``. Used for transform types that
     delegate to ``read_cutset_from_config``, and as the inner step for
-    concrete types that name them directly.
+    concrete types that name them directly. Per-entry ``indexes_root``
+    overrides the inherited value when set.
     """
+    indexes_root = entry.get("indexes_root", indexes_root)
     if (cuts_path := entry.get("cuts_path")) is not None:
         for p in _expand_jsonl(cuts_path):
-            jobs.append(IndexJob(p, JSONL))
+            jobs.append(IndexJob(p, JSONL, indexes_root))
     if (shar_path := entry.get("shar_path")) is not None:
-        _discover_shar(shar_path, jobs)
+        _discover_shar(shar_path, jobs, indexes_root)
     if (mfp := entry.get("manifest_filepath")) is not None:
         for p in _expand_jsonl(mfp):
-            jobs.append(IndexJob(p, JSONL))
+            jobs.append(IndexJob(p, JSONL, indexes_root))
         for p in _expand_tars(entry.get("tarred_audio_filepaths")):
-            jobs.append(IndexJob(p, NEMO_TAR))
+            jobs.append(IndexJob(p, NEMO_TAR, indexes_root))
     if (paths := entry.get("paths")) is not None:
         for p in _expand_jsonl(paths):
-            jobs.append(IndexJob(p, JSONL))
+            jobs.append(IndexJob(p, JSONL, indexes_root))
     if (sub := _resolve_input_cfg(entry.get("input_cfg"))) is not None:
-        discover(sub, jobs)
+        discover(sub, jobs, indexes_root)
 
 
-def discover(entry, jobs: list[IndexJob]) -> None:
+def discover(entry, jobs: list[IndexJob], indexes_root: Optional[str] = None) -> None:
     """Walk one entry of an ``input_cfg`` and append every required IndexJob."""
     if isinstance(entry, (list, ListConfig)):
         for sub in entry:
-            discover(sub, jobs)
+            discover(sub, jobs, indexes_root)
         return
     if not isinstance(entry, (dict, DictConfig)):
         return
 
+    # Per-entry override: a nested entry can carry its own ``indexes_root``.
+    indexes_root = entry.get("indexes_root", indexes_root)
+
     typ = entry.get("type")
     if typ is None:
         # Top-level wrapper (``input_cfg: [...]``) — recurse into every value.
         for v in entry.values():
-            discover(v, jobs)
+            discover(v, jobs, indexes_root)
         return
 
     if typ in _NO_INDEX_TYPES:
@@ -201,14 +208,14 @@ def discover(entry, jobs: list[IndexJob]) -> None:
 
     if typ == "group" or typ in _TRANSFORM_TYPES:
         # Group and transform passthroughs: dispatch by keys.
-        _discover_keys(entry, jobs)
+        _discover_keys(entry, jobs, indexes_root)
         return
 
     if typ in ("nemo", "nemo_tarred", "multimodal_conversation", "share_gpt"):
         for p in _expand_jsonl(entry.get("manifest_filepath")):
-            jobs.append(IndexJob(p, JSONL))
+            jobs.append(IndexJob(p, JSONL, indexes_root))
         for p in _expand_tars(entry.get("tarred_audio_filepaths")):
-            jobs.append(IndexJob(p, NEMO_TAR))
+            jobs.append(IndexJob(p, NEMO_TAR, indexes_root))
         return
 
     if typ == "share_gpt_webdataset":
@@ -218,31 +225,31 @@ def discover(entry, jobs: list[IndexJob]) -> None:
             return
         for ext, kind in ((".tar", WDS_TAR), (".jsonl", JSONL)):
             for p in sorted(Path(data_dir).glob(f"*{ext}")):
-                jobs.append(IndexJob(str(p), kind))
+                jobs.append(IndexJob(str(p), kind, indexes_root))
         return
 
     if typ == "lhotse":
         if (cuts_path := entry.get("cuts_path")) is not None:
             for p in _expand_jsonl(cuts_path):
-                jobs.append(IndexJob(p, JSONL))
+                jobs.append(IndexJob(p, JSONL, indexes_root))
         if (shar_path := entry.get("shar_path")) is not None:
-            _discover_shar(shar_path, jobs)
+            _discover_shar(shar_path, jobs, indexes_root)
         return
 
     if typ == "lhotse_shar":
-        _discover_shar(entry.get("shar_path"), jobs)
+        _discover_shar(entry.get("shar_path"), jobs, indexes_root)
         return
 
     if typ == "txt_jsonl":
         for p in _expand_jsonl(entry.get("paths")):
-            jobs.append(IndexJob(p, JSONL))
+            jobs.append(IndexJob(p, JSONL, indexes_root))
         return
 
     # Unknown type — nothing to do.
     return
 
 
-def _discover_shar(shar_path, jobs: list[IndexJob]) -> None:
+def _discover_shar(shar_path, jobs: list[IndexJob], indexes_root: Optional[str]) -> None:
     """Index every uncompressed JSONL/tar shard inside one or more Shar dirs."""
     if shar_path is None:
         return
@@ -261,9 +268,9 @@ def _discover_shar(shar_path, jobs: list[IndexJob]) -> None:
             for raw in _flatten_path_spec(v):
                 for p in expand_sharded_filepaths(raw):
                     if p.endswith(".jsonl"):
-                        jobs.append(IndexJob(p, JSONL))
+                        jobs.append(IndexJob(p, JSONL, indexes_root))
                     elif p.endswith(".tar"):
-                        jobs.append(IndexJob(p, WDS_TAR))
+                        jobs.append(IndexJob(p, WDS_TAR, indexes_root))
         return
     else:
         return
@@ -274,9 +281,9 @@ def _discover_shar(shar_path, jobs: list[IndexJob]) -> None:
             continue
         for p in sorted(d.iterdir()):
             if p.suffix == ".jsonl":
-                jobs.append(IndexJob(str(p), JSONL))
+                jobs.append(IndexJob(str(p), JSONL, indexes_root))
             elif p.suffix == ".tar":
-                jobs.append(IndexJob(str(p), WDS_TAR))
+                jobs.append(IndexJob(str(p), WDS_TAR, indexes_root))
 
 
 # --------------------------------------------------------------------------- #
@@ -288,17 +295,21 @@ def _build_one(job: IndexJob) -> tuple[IndexJob, str]:
     """Run the right indexer for *job*. Returns (job, status)."""
     from lhotse.indexing import create_jsonl_index, create_tar_index as create_wds_tar_index
 
-    builders: dict[str, Callable[[str], object]] = {
-        JSONL: create_jsonl_index,
-        WDS_TAR: create_wds_tar_index,
-        NEMO_TAR: create_nemo_tar_index,
-    }
-    builder = builders[job.kind]
-    if job.kind == NEMO_TAR:
+    idx = job.idx_path()
+    # Ensure the parent directory exists for mirrored layouts.
+    idx_parent = Path(idx).parent
+    if not str(idx).startswith(("ais://", "s3://", "http://", "https://", "gs://")):
+        idx_parent.mkdir(parents=True, exist_ok=True)
+
+    if job.kind == JSONL:
+        create_jsonl_index(job.path, output_path=idx)
+    elif job.kind == WDS_TAR:
+        create_wds_tar_index(job.path, output_path=idx)
+    elif job.kind == NEMO_TAR:
         # NeMo's create_tar_index has a (tar_path, idx_path) signature.
-        builder(job.path, job.idx_path())
+        create_nemo_tar_index(job.path, idx)
     else:
-        builder(job.path)
+        raise ValueError(f"Unknown index kind: {job.kind!r}")
     return job, "built"
 
 
@@ -321,7 +332,23 @@ def _is_indexed(job: IndexJob) -> bool:
 @click.option("--force", is_flag=True, help="Rebuild .idx files even if they already exist.")
 @click.option("--workers", type=int, default=4, help="Number of parallel index builders.")
 @click.option("--dry-run", is_flag=True, help="List the jobs without writing anything.")
-def main(input_cfgs: tuple[str, ...], force: bool, workers: int, dry_run: bool):
+@click.option(
+    "--indexes-root",
+    type=str,
+    default=None,
+    help=(
+        "Write .idx sidecars to a mirror under this root (preserving the data files' "
+        "directory structure) instead of next to each data file. An 'indexes_root' set "
+        "on an entry in the YAML takes precedence over this CLI value for that entry."
+    ),
+)
+def main(
+    input_cfgs: tuple[str, ...],
+    force: bool,
+    workers: int,
+    dry_run: bool,
+    indexes_root: Optional[str],
+):
     """
     Build .idx sidecars for every JSONL/tar referenced by INPUT_CFGS.
 
@@ -332,13 +359,13 @@ def main(input_cfgs: tuple[str, ...], force: bool, workers: int, dry_run: bool):
     jobs: list[IndexJob] = []
     for cfg_path in input_cfgs:
         cfg = OmegaConf.load(cfg_path)
-        discover(cfg, jobs)
+        discover(cfg, jobs, indexes_root=indexes_root)
 
     # Deduplicate while preserving order.
-    seen: set[tuple[str, str]] = set()
+    seen: set[tuple[str, str, Optional[str]]] = set()
     unique: list[IndexJob] = []
     for j in jobs:
-        key = (j.path, j.kind)
+        key = (j.path, j.kind, j.indexes_root)
         if key not in seen:
             seen.add(key)
             unique.append(j)
@@ -351,7 +378,7 @@ def main(input_cfgs: tuple[str, ...], force: bool, workers: int, dry_run: bool):
 
     if dry_run or not todo:
         for j in todo:
-            logging.info("  [%s] %s", j.kind, j.path)
+            logging.info("  [%s] %s -> %s", j.kind, j.path, j.idx_path())
         return
 
     failures: list[tuple[IndexJob, BaseException]] = []
diff --git a/scripts/dataloading/prefetch_indexes.py b/scripts/dataloading/prefetch_indexes.py
new file mode 100644
index 000000000000..126491aa0f41
--- /dev/null
+++ b/scripts/dataloading/prefetch_indexes.py
@@ -0,0 +1,190 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Copy existing ``.idx`` sidecars from their source locations into a local
+mirrored ``indexes_root``.
+
+Use this when your data lives on shared storage (NFS, S3, AIStore) and you
+want a local-disk copy of the indexes for fast random access during
+training, without ever touching the data files themselves.
+
+The script walks an arbitrary NeMo Lhotse ``input_cfg`` YAML (same machinery
+as ``build_indexes.py``), enumerates every ``.idx`` file the dataloader will
+need, and downloads each into ``<indexes_root>/<rel-path>.idx`` preserving
+the data files' directory structure. Source paths are read via lhotse's
+``open_best``, which routes ``ais://``, ``s3://``, ``http(s)://``, and local
+paths to the correct backend.
+
+Examples::
+
+    # Local data, mirror indexes onto a fast local SSD.
+    python scripts/dataloading/prefetch_indexes.py \\
+        --indexes-root /scratch/idx \\
+        path/to/input_cfg.yaml
+
+    # Indexes live next to data on AIStore; pull them down.
+    AIS_ENDPOINT=http://aistore.example.com \\
+        python scripts/dataloading/prefetch_indexes.py \\
+            --indexes-root /scratch/idx \\
+            path/to/input_cfg.yaml
+
+    # Skip files that are already in the mirror; re-run safely.
+    python scripts/dataloading/prefetch_indexes.py \\
+        --indexes-root /scratch/idx --workers 16 train.yaml validation.yaml
+
+After prefetch, point your training config at the mirror via the top-level
+``indexes_root: /scratch/idx`` option (no per-source changes required).
+"""
+
+import logging
+import shutil
+import sys
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from pathlib import Path
+from typing import Optional
+
+import click
+from omegaconf import OmegaConf
+
+from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+
+# Reuse the discovery + IndexJob machinery from build_indexes.py.
+sys.path.insert(0, str(Path(__file__).parent))
+from build_indexes import IndexJob, discover  # type: ignore[import-not-found]
+
+
+def _copy_idx(src: str, dst: str) -> None:
+    """Copy a single ``.idx`` from *src* (local or URL) to *dst* (local).
+
+    Uses lhotse's ``open_best`` so URL schemes are routed to the right
+    backend (smart_open / AIStore SDK).
+    """
+    from lhotse.serialization import open_best
+
+    target = Path(dst)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # Stage to a sibling tmp file then rename, so partial writes never
+    # leave a half-baked .idx in place. The suffix is kept short so long
+    # file names do not overflow the filesystem's per-component limit.
+    tmp = target.with_name(target.name + ".partial")
+    try:
+        with open_best(src, "rb") as src_f, open(tmp, "wb") as dst_f:
+            shutil.copyfileobj(src_f, dst_f, length=8 * 1024 * 1024)
+        tmp.replace(target)
+    finally:
+        # No-op after a successful rename; removes the stale partial
+        # file when the copy or rename raised.
+        tmp.unlink(missing_ok=True)
+
+
+def _is_present(local_idx: str) -> bool:
+    """Return True iff *local_idx* is an existing, non-empty regular file."""
+    try:
+        return (candidate := Path(local_idx)).is_file() and candidate.stat().st_size > 0
+    except OSError:
+        return False
+
+
+@click.command(context_settings={"show_default": True})
+@click.argument("input_cfgs", type=click.Path(exists=True, dir_okay=False), nargs=-1, required=True)
+@click.option(
+    "--indexes-root",
+    type=str,
+    required=True,
+    help="Local directory the .idx mirror is written to. The data files' directory structure is preserved underneath.",
+)
+@click.option(
+    "--source-indexes-root",
+    type=str,
+    default=None,
+    help=(
+        "If the source ``.idx`` files do not live next to the data (e.g. they "
+        "are themselves under another mirror — possibly remote), set this to "
+        "that root. Defaults to ``None`` meaning sidecars are read from "
+        "next to each data file."
+    ),
+)
+@click.option("--force", is_flag=True, help="Re-download even when a non-empty mirrored .idx already exists.")
+@click.option("--workers", type=int, default=8, help="Number of parallel copies.")
+@click.option("--dry-run", is_flag=True, help="List the (src, dst) pairs without copying anything.")
+def main(
+    input_cfgs: tuple[str, ...],
+    indexes_root: str,
+    source_indexes_root: Optional[str],
+    force: bool,
+    workers: int,
+    dry_run: bool,
+):
+    """
+    Prefetch .idx sidecars referenced by INPUT_CFGS into a local mirror.
+
+    INPUT_CFGS are NeMo Lhotse dataloading configs (``input_cfg`` YAML).
+    """
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    jobs: list[IndexJob] = []
+    for cfg_path in input_cfgs:
+        cfg = OmegaConf.load(cfg_path)
+        # Walk with no inherited ``indexes_root``: only each job's data path is used
+        # below, where the (source, destination) idx paths are computed from it.
+        discover(cfg, jobs, indexes_root=None)
+
+    # Deduplicate by (data_path, kind), keeping the first occurrence in order.
+    seen: set[tuple[str, str]] = set()
+    unique: list[IndexJob] = []
+    for j in jobs:
+        key = (j.path, j.kind)
+        if key not in seen:
+            seen.add(key)
+            unique.append(j)
+
+    pairs: list[tuple[str, str]] = []
+    for j in unique:
+        src = resolve_idx_path(j.path, source_indexes_root)
+        dst = resolve_idx_path(j.path, indexes_root)
+        pairs.append((src, dst))
+
+    todo = pairs if force else [(s, d) for (s, d) in pairs if not _is_present(d)]
+    skipped = len(pairs) - len(todo)
+    logging.info(
+        "Discovered %d sidecars (%d already present locally, %d to copy).",
+        len(pairs), skipped, len(todo),
+    )
+
+    if dry_run or not todo:
+        for s, d in todo:
+            logging.info("  %s  ->  %s", s, d)
+        return
+
+    failures: list[tuple[str, str, BaseException]] = []
+    with ThreadPoolExecutor(max_workers=max(1, workers)) as ex:
+        futures = {ex.submit(_copy_idx, s, d): (s, d) for (s, d) in todo}
+        for fut in as_completed(futures):
+            s, d = futures[fut]
+            try:
+                fut.result()
+                logging.info("  [ok]   %s  ->  %s", s, d)
+            except BaseException as e:  # noqa: BLE001
+                failures.append((s, d, e))
+                logging.error("  [FAIL] %s  ->  %s: %s", s, d, e)
+
+    if failures:
+        logging.error("\n%d copy operation(s) failed:", len(failures))
+        for s, d, e in failures:
+            logging.error("  %s -> %s: %s", s, d, e)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()

From d7b68555324709f3b35ffea149e7489ff6e924f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Tue, 12 May 2026 09:23:52 -0400
Subject: [PATCH 07/30] total token/examples logging; force individual GET
 instead of GetBatch; changes to allow byte-range reading from AIS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 examples/speechlm2/salm_train.py              |   6 +
 .../asr/data/audio_to_text_lhotse.py          |   7 +-
 .../asr/data/audio_to_text_lhotse_prompted.py |   7 +-
 .../common/data/lhotse/dataloader.py          |  31 +++
 .../common/data/lhotse/indexed_adapters.py    | 237 ++++++++++++++++--
 .../speechlm2/data/salm_dataset.py            |   5 +
 nemo/collections/speechlm2/models/salm.py     |   5 +
 .../speechlm2/models/salm_automodel.py        |  15 +-
 nemo/utils/callbacks/training_stats.py        | 186 ++++++++++++++
 scripts/dataloading/build_indexes.py          |  36 ++-
 scripts/dataloading/prefetch_indexes.py       |  14 +-
 11 files changed, 520 insertions(+), 29 deletions(-)
 create mode 100644 nemo/utils/callbacks/training_stats.py

diff --git a/examples/speechlm2/salm_train.py b/examples/speechlm2/salm_train.py
index 05a013c69d86..83c5b1639c5d 100644
--- a/examples/speechlm2/salm_train.py
+++ b/examples/speechlm2/salm_train.py
@@ -19,6 +19,7 @@
 
 from nemo.collections.speechlm2 import SALM, DataModule, SALMDataset
 from nemo.core.config import hydra_runner
+from nemo.utils.callbacks.training_stats import TrainingStatsCallback
 from nemo.utils.exp_manager import exp_manager
 from nemo.utils.trainer_utils import resolve_trainer_cfg
 
@@ -35,6 +36,11 @@ def train(cfg):
     torch.set_float32_matmul_precision("medium")
     trainer = Trainer(**resolve_trainer_cfg(cfg.trainer))
     log_dir = exp_manager(trainer, cfg.get("exp_manager", None))
+    # Insert at position 0 so our ``on_train_batch_end`` runs BEFORE the
+    # StatelessTimer's hook (which can trigger a checkpoint save mid-
+    # batch-end). Without this, the saved ``state_dict`` would lag the
+    # accumulators by one batch on every wall-time-induced save.
+    trainer.callbacks.insert(0, TrainingStatsCallback())
     OmegaConf.save(cfg, log_dir / "exp_config.yaml")
 
     model_cls = SALM
diff --git a/nemo/collections/asr/data/audio_to_text_lhotse.py b/nemo/collections/asr/data/audio_to_text_lhotse.py
index 46c301be0822..a80ef6dfe7e4 100644
--- a/nemo/collections/asr/data/audio_to_text_lhotse.py
+++ b/nemo/collections/asr/data/audio_to_text_lhotse.py
@@ -53,10 +53,15 @@ def __init__(self, tokenizer: TokenizerSpec, return_cuts: bool = False):
         super().__init__()
         self.tokenizer = TokenizerWrapper(tokenizer)
         self.use_ais_get_batch = os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true"
+        self.ais_prefer_individual = os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true"
 
         # Try to use use_batch_loader if available (Lhotse >= 1.32.0)
         try:
-            self.load_audio = AudioSamples(fault_tolerant=True, use_batch_loader=self.use_ais_get_batch)
+            self.load_audio = AudioSamples(
+                fault_tolerant=True,
+                use_batch_loader=self.use_ais_get_batch,
+                ais_prefer_individual=self.ais_prefer_individual,
+            )
         except TypeError:
             # Lhotse < 1.32.0 doesn't support use_batch_loader
             if self.use_ais_get_batch:
diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
index a3510a78836a..01c91fc8c4a8 100644
--- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
+++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
@@ -83,10 +83,15 @@ def __init__(
         super().__init__()
         self.tokenizer = tokenizer
         self.use_ais_get_batch = os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true"
+        self.ais_prefer_individual = os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true"
 
         # Try to use use_batch_loader if available (Lhotse >= 1.32.0)
         try:
-            self.load_audio = AudioSamples(fault_tolerant=True, use_batch_loader=self.use_ais_get_batch)
+            self.load_audio = AudioSamples(
+                fault_tolerant=True,
+                use_batch_loader=self.use_ais_get_batch,
+                ais_prefer_individual=self.ais_prefer_individual,
+            )
         except TypeError:
             # Lhotse < 1.32.0 doesn't support use_batch_loader
             if self.use_ais_get_batch:
diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py
index 6a3a6a5939e6..450a1adefe32 100644
--- a/nemo/collections/common/data/lhotse/dataloader.py
+++ b/nemo/collections/common/data/lhotse/dataloader.py
@@ -540,6 +540,37 @@ def get_lhotse_sampler_from_config(config, global_rank, world_size, tokenizer=No
     cuts, use_iterable_dataset = read_cutset_from_config(config)
     use_iterable_dataset = determine_use_iterable_dataset(use_iterable_dataset, config)
 
+    # Map-style + StatefulDataLoader requires shard_seed to be a fixed integer:
+    #   * On the map path, cross-rank de-duplication is by ``rank/world_size``
+    #     index slicing (passed below to DynamicBucketingSampler/DynamicCutSampler),
+    #     NOT by per-rank seed differentiation. ``shard_seed="randomized"`` is
+    #     iterable-path machinery that injects worker-PID-derived seeding;
+    #     across resume boundaries the new process has a different PID, so the
+    #     freshly-initialised sampler RNG diverges from the saved snapshot.
+    #     ``StatefulDataLoader.load_state_dict`` overrides that init RNG state
+    #     in practice, but it's a footgun: any RNG draw before the first
+    #     ``__iter__`` (e.g. shuffle of shards in the parent process) is lost.
+    # If the user sets ``shard_seed="randomized"`` AND ``force_map_dataset=True``
+    # AND ``use_stateful_dataloader=True``, warn loudly and auto-overwrite with
+    # the fixed ``seed`` integer so resume semantics stay clean.
+    if (
+        getattr(config, "force_map_dataset", False)
+        and getattr(config, "use_stateful_dataloader", False)
+        and isinstance(config.get("shard_seed"), str)
+        and str(config.shard_seed).lower() == "randomized"
+    ):
+        fixed_seed = int(config.seed)
+        logging.warning(
+            "shard_seed=%r is incompatible with force_map_dataset=True + "
+            "use_stateful_dataloader=True (the map path doesn't need per-rank "
+            "seed differentiation; cross-rank de-dup is by index slicing). "
+            "Auto-overriding shard_seed -> %d (the value of `seed`) for "
+            "deterministic StatefulDataLoader resume. Pin shard_seed to an "
+            "integer in your YAML to silence this warning.",
+            config.shard_seed, fixed_seed,
+        )
+        config.shard_seed = fixed_seed
+
     _auto_detect_bucketing_and_validate_batch_size(config)
 
     # Apply channel selector
diff --git a/nemo/collections/common/data/lhotse/indexed_adapters.py b/nemo/collections/common/data/lhotse/indexed_adapters.py
index 3af89186bccf..a88c533a1e5b 100644
--- a/nemo/collections/common/data/lhotse/indexed_adapters.py
+++ b/nemo/collections/common/data/lhotse/indexed_adapters.py
@@ -35,6 +35,138 @@
 _URL_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.\-]*://")
 
 
+def _is_remote_path(path) -> bool:
+    """Report whether ``str(path)`` begins with a ``scheme://`` prefix (per ``_URL_RE``)."""
+    return _URL_RE.match(str(path)) is not None
+
+
+class _AISRangeReader:
+    """
+    Pseudo file-like object backed by AIStore HTTP byte-range reads.
+
+    Translates ``seek()`` + ``read(n)`` into ``Object.get_reader(byte_range=…)``
+    requests so the indexed-tar readers can do random access into ``s3://`` /
+    ``ais://`` archives the same way they would into a local file. Each
+    ``read()`` corresponds to one HTTP range request, which AIStore serves in
+    O(1); the index already tells us exactly which byte ranges we need (one
+    per tar member or sample), so the request count per training sample is
+    small and bounded.
+
+    The aistore SDK is imported lazily so ``indexed_adapters`` doesn't have to
+    take a hard dependency on it for local-only code paths.
+
+    Notes
+    -----
+    * ``seek()`` accepts whence ∈ {0, 1, 2} and rejects negative targets; for
+      whence=2 the size cached from ``Object.props.size`` is used, so at most
+      the first size lookup has to resolve the object.
+    * The instance is **not** safe to share across threads. Pickling drops
+      the cached ``_obj`` / ``_size`` and resets the position to 0, so each
+      worker process re-resolves the URL lazily.
+    """
+
+    def __init__(self, url: str):
+        # Defer the aistore import — pure-local installs don't need it.
+        from aistore import Client  # noqa: F401  (presence-check only)
+
+        self._url = url
+        self._obj = None
+        self._size: Optional[int] = None
+        self._pos = 0
+
+    def _ensure_obj(self):
+        if self._obj is not None:
+            return
+        # Same client/env wiring as ``lhotse.serialization.AIStoreIOBackend``
+        # — import locally so build_indexes / training don't require lhotse
+        # for non-remote files.
+        from lhotse.serialization import get_aistore_client
+
+        client, _version = get_aistore_client()
+        self._obj = client.get_object_from_url(self._url)
+        self._size = int(self._obj.props.size)
+
+    @property
+    def size(self) -> int:
+        self._ensure_obj()
+        return self._size  # type: ignore[return-value]
+
+    def seekable(self) -> bool:
+        return True
+
+    def readable(self) -> bool:
+        return True
+
+    def seek(self, offset: int, whence: int = 0) -> int:
+        if whence == 0:
+            target = int(offset)
+        elif whence == 1:
+            target = self._pos + int(offset)
+        elif whence == 2:
+            target = self.size + int(offset)
+        else:
+            raise ValueError(f"Unsupported whence: {whence}")
+        if target < 0:
+            # A negative position would yield a malformed ``bytes=-N-…`` range.
+            raise ValueError(f"Negative seek position: {target}")
+        self._pos = target
+        return self._pos
+
+    def tell(self) -> int:
+        return self._pos
+
+    def read(self, n: Optional[int] = -1) -> bytes:
+        self._ensure_obj()
+        if n == 0 or self._pos >= self._size:
+            return b""
+        if n is None or n < 0:  # "read to EOF", as for regular files
+            end_inclusive = self._size - 1
+        else:
+            end_inclusive = min(self._pos + n - 1, self._size - 1)
+        # AIStore expects the HTTP Range syntax: ``bytes=START-END`` with
+        # END inclusive. ``read_all()`` drains the entire response into bytes.
+        byte_range = f"bytes={self._pos}-{end_inclusive}"
+        reader = self._obj.get_reader(byte_range=byte_range)
+        data = reader.read_all()
+        self._pos += len(data)
+        return data
+
+    def close(self) -> None:
+        self._obj = None
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc):
+        self.close()
+
+    def __getstate__(self):
+        # Drop the resolved AIStore Object handle so a forked DataLoader
+        # worker re-creates it lazily against the worker's own connection
+        # pool / HTTP session.
+        return {"_url": self._url, "_pos": 0, "_obj": None, "_size": None}
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+
+
+def _open_data_path(path: str):
+    """
+    Open *path* for binary random access and return the file-like object.
+
+    Anything without a ``scheme://`` prefix is treated as a local file and
+    opened with the builtin ``open(path, "rb")``. Every URL/URI, whatever its
+    scheme, is wrapped in an :class:`_AISRangeReader`, which serves
+    ``seek`` + ``read`` through AIStore range requests. There is no
+    per-scheme dispatch yet; add it here if another seekable remote backend
+    becomes available. Closing the returned object is up to the caller.
+    """
+    if not _is_remote_path(path):
+        # Plain filesystem path.
+        return open(path, "rb")
+    return _AISRangeReader(str(path))
+
+
 def resolve_idx_path(data_path: str | Path, indexes_root: Optional[str | Path] = None) -> str:
     """
     Compute the ``.idx`` sidecar path for *data_path*.
@@ -143,16 +275,38 @@ def _load_index(data_path: str, idx_path: str | None = None):
     (appended if absent in the on-disk index).
 
     Validates that all sample offsets fall within the data file.
+
+    For remote ``data_path`` URIs (``s3://`` / ``ais://`` / ``http(s)://`` /
+    ``gs://``) ``os.path.getsize`` cannot be used; we trust the size
+    sentinel that ``create_tar_index`` / ``create_jsonl_index`` recorded as
+    the last offset in the on-disk index. The same indexes are emitted for
+    local and remote sources, so the on-disk format is identical — only the
+    file-size cross-check is skipped.
     """
     if idx_path is None:
         idx_path = data_path + '.idx'
-    offsets = np.memmap(idx_path, dtype=np.dtype('<u8'), mode='r')
-    data_size = os.path.getsize(data_path)
-    if offsets[-1] == data_size:
+    # Use np.fromfile (resident memory) rather than np.memmap so that NeMo
+    # blends with tens of thousands of shards don't exhaust the kernel's
+    # ``vm.max_map_count`` budget (~65k by default) and subsequently raise
+    # ``OSError: [Errno 12] Cannot allocate memory``. Indexes are small
+    # (a uint64 per record + a sentinel; typically O(KB) per shard), so the
+    # resident-memory cost across an entire blend is in the hundreds of MB.
+    offsets = np.fromfile(idx_path, dtype=np.dtype('<u8'))
+    if _URL_RE.match(str(data_path)):
+        if offsets.shape[0] < 1:
+            raise ValueError(
+                f"Index for remote source {data_path} is empty; expected at "
+                f"least a size sentinel. Rebuild via build_indexes.py."
+            )
+        data_size = int(offsets[-1])
         num_samples = offsets.shape[0] - 1
     else:
-        num_samples = offsets.shape[0]
-        offsets = np.append(offsets, np.uint64(data_size))
+        data_size = os.path.getsize(data_path)
+        if offsets[-1] == data_size:
+            num_samples = offsets.shape[0] - 1
+        else:
+            num_samples = offsets.shape[0]
+            offsets = np.append(offsets, np.uint64(data_size))
     if num_samples > 0:
         max_offset = int(offsets[:num_samples].max())
         if max_offset >= data_size:
@@ -185,7 +339,7 @@ def __getitem__(self, idx):
         idx = _resolve_idx(idx, self._len)
         start = int(self.offsets[idx])
         end = int(self.offsets[idx + 1])
-        with open(self.data_path, 'rb') as f:
+        with _open_data_path(self.data_path) as f:
             f.seek(start)
             data = f.read(end - start)
         return json.loads(data.decode('utf-8'))
@@ -232,7 +386,7 @@ def _validate_index(self):
         # file size (which _load_index already handles).
         while self._len > 0:
             last = int(self.offsets[self._len - 1])
-            with open(self.data_path, 'rb') as f:
+            with _open_data_path(self.data_path) as f:
                 f.seek(last)
                 buf = f.read(_TAR_BLOCK_SIZE)
             if len(buf) < _TAR_BLOCK_SIZE or buf == _TAR_ZERO_BLOCK:
@@ -241,7 +395,7 @@ def _validate_index(self):
                 break
 
     def _check_offset_is_tar_header(self, offset: int, label: str = ""):
-        with open(self.data_path, 'rb') as f:
+        with _open_data_path(self.data_path) as f:
             f.seek(offset)
             buf = f.read(_TAR_BLOCK_SIZE)
         if len(buf) < _TAR_BLOCK_SIZE:
@@ -273,7 +427,7 @@ def __len__(self):
     def __getitem__(self, idx):
         idx = _resolve_idx(idx, self._len)
         offset = int(self.offsets[idx])
-        with open(self.data_path, 'rb') as f:
+        with _open_data_path(self.data_path) as f:
             f.seek(offset)
             try:
                 name_a, bytes_a = _read_tar_member(f)
@@ -329,7 +483,7 @@ def __init__(
 
     def _ensure_open(self):
         if self._fh is None:
-            self._fh = open(self.data_path, "rb")
+            self._fh = _open_data_path(self.data_path)
 
     def close(self):
         if self._fh is not None:
@@ -469,6 +623,35 @@ def create_index(jsonl_path, idx_path):
     os.replace(tmp_path, idx_path)
 
 
+class _CountingReader:
+    """
+    Read-only pass-through around a binary stream that tallies consumed bytes.
+
+    ``read()`` forwards to the wrapped object and adds the length of each
+    returned chunk to ``bytes_read``. :func:`create_tar_index` reads that
+    total as the tar size sentinel, since the underlying stream may be a
+    remote, non-seekable one on which ``tell()`` / ``os.path.getsize`` are
+    not available.
+    """
+
+    def __init__(self, fileobj):
+        self.bytes_read = 0
+        self._f = fileobj
+
+    def read(self, n=-1):
+        chunk = self._f.read(n)
+        self.bytes_read = self.bytes_read + len(chunk)
+        return chunk
+
+    def readable(self):
+        return True
+
+    def seekable(self):
+        # Advertise a forward-only stream: the wrapper offers no seek()/tell(),
+        # and tarfile's ``r|`` stream mode gets by with sequential reads alone.
+        return False
+
+
 def create_tar_index(tar_path, idx_path):
     """
     Creates a raw binary index file for a WebDataset tar archive.
@@ -476,25 +659,37 @@ def create_tar_index(tar_path, idx_path):
     followed by a sentinel equal to the tar file size.
     Format is identical to :func:`create_index`.
 
+    Reads ``tar_path`` via ``lhotse.serialization.open_best`` so the function
+    works for local files as well as ``s3://`` / ``ais://`` / ``http(s)://``
+    URIs. The tar is opened in streaming mode (``r|``) — remote backends are
+    not seekable — and the sentinel records the total bytes read through a
+    ``_CountingReader`` wrapper rather than ``os.path.getsize`` /
+    ``f.tell()``, both of which fail on non-seekable URI streams.
+
     Written atomically: data is staged in a per-process temp file next to
     ``idx_path`` and then ``os.replace()``-d into place, so concurrent writers
     can't observe a half-written ``.idx``.
     """
+    from lhotse.serialization import open_best
+
     offsets = []
     prev_stem = None
-    with tarfile.open(tar_path, 'r:') as tar:
-        for member in tar:
-            if not member.isreg():
-                continue
-            stem = Path(member.name).stem
-            if stem != prev_stem:
-                offsets.append(member.offset)
-                prev_stem = stem
+    with open_best(tar_path, "rb") as f:
+        counter = _CountingReader(f)
+        with tarfile.open(fileobj=counter, mode='r|') as tar:
+            for member in tar:
+                stem = Path(member.name).stem
+                if member.isreg() and stem != prev_stem:
+                    offsets.append(member.offset)
+                    prev_stem = stem
+        while counter.read(1 << 20):  # tarfile stops at the end-of-archive marker; drain to EOF
+            pass
+        file_size = counter.bytes_read
     tmp_path = f"{idx_path}.tmp.{os.getpid()}"
-    with open(tmp_path, 'wb') as f:
+    with open(tmp_path, 'wb') as f_out:
         buf = bytearray()
         for off in offsets:
             buf.extend(struct.pack('<Q', off))
-        buf.extend(struct.pack('<Q', os.path.getsize(tar_path)))
-        f.write(buf)
+        buf.extend(struct.pack('<Q', file_size))
+        f_out.write(buf)
     os.replace(tmp_path, idx_path)
diff --git a/nemo/collections/speechlm2/data/salm_dataset.py b/nemo/collections/speechlm2/data/salm_dataset.py
index 003205c79333..93a8f5b4d109 100644
--- a/nemo/collections/speechlm2/data/salm_dataset.py
+++ b/nemo/collections/speechlm2/data/salm_dataset.py
@@ -74,9 +74,14 @@ def __init__(self, tokenizer: AutoTokenizer) -> None:
         # Setting USE_AIS_GET_BATCH=true makes the loader issue a single AIStore GetBatch
         # call per minibatch, paired with URL-backed cuts produced by the multimodal
         # conversation adapters (NeMoMultimodalConversation{Jsonl,ShareGPTJsonl}Adapter).
+        # USE_AIS_INDIVIDUAL_GETS=true (only meaningful when USE_AIS_GET_BATCH=true) forces
+        # the underlying AISBatchLoader to skip MOSS GetBatch and issue one
+        # ``Object.get_reader().read_all()`` per object — useful when the deployment
+        # doesn't support GetBatch or its performance is degraded.
         self.load_audio = AudioSamples(
             fault_tolerant=True,
             use_batch_loader=os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true",
+            ais_prefer_individual=os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true",
             mono_downmix=True,
         )
 
diff --git a/nemo/collections/speechlm2/models/salm.py b/nemo/collections/speechlm2/models/salm.py
index 5e1533993ce0..1884e3c6fd9a 100644
--- a/nemo/collections/speechlm2/models/salm.py
+++ b/nemo/collections/speechlm2/models/salm.py
@@ -214,6 +214,11 @@ def training_step(self, batch: dict, batch_idx: int):
                 m.eval()
 
         inputs = self.prepare_inputs(batch)
+        # Counters consumed by TrainingStatsCallback. ``attention_mask`` is 1
+        # for every real LLM input position (text non-pad + audio frames
+        # post-perception) and 0 for padding.
+        self._last_batch_num_tokens = int(inputs["attention_mask"].long().sum().item())
+        self._last_batch_num_examples = int(inputs["input_embeds"].shape[0])
         forward_outputs = self(inputs["input_embeds"], attention_mask=inputs["attention_mask"])
         num_frames = (inputs["target_ids"] != -100).long().sum()
         with loss_parallel():
diff --git a/nemo/collections/speechlm2/models/salm_automodel.py b/nemo/collections/speechlm2/models/salm_automodel.py
index 8cf8ee7b3eec..1706f2bddc40 100644
--- a/nemo/collections/speechlm2/models/salm_automodel.py
+++ b/nemo/collections/speechlm2/models/salm_automodel.py
@@ -246,6 +246,12 @@ def training_step(self, dataloader_iter):
                 m.eval()
 
         inputs = self.prepare_inputs(batch)
+        # Counters consumed by TrainingStatsCallback. ``attention_mask`` is 1
+        # for every real LLM input position (text non-pad + audio frames
+        # post-perception) and 0 for padding, so its sum is exactly the
+        # "text non-pad + audio frames" definition.
+        self._last_batch_num_tokens = int(inputs["attention_mask"].long().sum().item())
+        self._last_batch_num_examples = int(inputs["input_embeds"].shape[0])
         forward_outputs = self(inputs["input_embeds"], attention_mask=inputs["attention_mask"])
         num_frames = (inputs["target_ids"] != -100).long().sum()
 
@@ -609,7 +615,14 @@ def maybe_log_moe_metrics(self, step: int):
         else:
             metrics = compute_brief_metrics(layer_loads, top_k=top_k)
 
-        self.log_dict(metrics, on_step=True)
+        # ``batch_size=1`` is required when training_step uses the
+        # ``dataloader_iter`` flavor: Lightning cannot infer the batch size
+        # from the closure, and these MoE metrics are model-internal
+        # aggregates (load fractions, top-k expert utilization), so the
+        # per-call batch_size is just a logging-aggregation hint, not a true
+        # sample count. Without it Lightning raises
+        # ``MisconfigurationException`` on the very first training step.
+        self.log_dict(metrics, on_step=True, batch_size=1)
 
     def _get_moe_dp_group(self):
         """Return the DP process group for MoE metrics all-reduce.
diff --git a/nemo/utils/callbacks/training_stats.py b/nemo/utils/callbacks/training_stats.py
new file mode 100644
index 000000000000..98444b86d3aa
--- /dev/null
+++ b/nemo/utils/callbacks/training_stats.py
@@ -0,0 +1,186 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Training-throughput metrics that are not specific to a single model.
+
+Three metrics are logged (``dataloader_wait_s`` at ``on_train_batch_start``, the totals at ``on_train_batch_end``):
+
+* ``dataloader_wait_s`` — wall-clock seconds spent between the previous
+  batch's ``on_train_batch_end`` and the current batch's
+  ``on_train_batch_start``. With PTL's prefetcher this is normally near
+  zero; large values mean the dataloader couldn't keep up. Useful for
+  catching AIS / lustre stalls before they crater the run.
+* ``num_tokens_total`` — running sum across the whole training of every
+  non-padding token position fed into the LLM (text non-pad + audio
+  frames after perception subsampling). Includes loss-masked tokens.
+  Survives job restarts via callback ``state_dict``.
+* ``num_examples_total`` — running sum across the whole training of
+  per-batch example counts. Also restart-safe.
+
+The model is expected to populate two short-lived attributes inside its
+``training_step`` so the callback can pick them up without parsing the
+batch a second time::
+
+    pl_module._last_batch_num_tokens = int(...)
+    pl_module._last_batch_num_examples = int(...)
+
+If either attribute is missing the callback falls back to counting from
+``batch["input_ids"]`` (non-pad text tokens only) — useful for non-SALM
+models, but loses audio-frame contribution.
+"""
+
+import time
+from typing import Any, Dict, Optional
+
+import torch
+import torch.distributed as dist
+from lightning.pytorch import Callback, LightningModule, Trainer
+
+__all__ = ["TrainingStatsCallback"]
+
+
+class TrainingStatsCallback(Callback):
+    """Logs dataloader wait time and accumulates token/example counts.
+
+    Persists ``num_tokens_total`` and ``num_examples_total`` via the
+    Lightning checkpoint state-dict mechanism so the counters survive
+    job restarts. The per-step ``dataloader_wait_s`` gauge is
+    intentionally NOT persisted (it has no meaningful value across a
+    process boundary).
+
+    The first batch after a fresh process start has no meaningful
+    ``dataloader_wait_s`` (no preceding ``on_train_batch_end``); the
+    callback skips logging it for that step.
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Persisted state — survives checkpoint resume.
+        self.num_tokens_total: int = 0
+        self.num_examples_total: int = 0
+        # Per-process state — not persisted.
+        self._prev_batch_end_monotonic: Optional[float] = None
+
+    # ------------------------------------------------------------------ state
+    def state_dict(self) -> Dict[str, Any]:
+        return {
+            "num_tokens_total": int(self.num_tokens_total),
+            "num_examples_total": int(self.num_examples_total),
+        }
+
+    def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+        self.num_tokens_total = int(state_dict.get("num_tokens_total", 0))
+        self.num_examples_total = int(state_dict.get("num_examples_total", 0))
+
+    # ----------------------------------------------------------------- hooks
+    def on_train_batch_start(
+        self,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        batch: Any,
+        batch_idx: int,
+    ) -> None:
+        if self._prev_batch_end_monotonic is None:
+            # First batch of the process — no previous end timestamp to
+            # diff against; skip emitting a misleading value.
+            return
+        wait_s = time.monotonic() - self._prev_batch_end_monotonic
+        # ``batch_size`` is required when the LightningModule uses
+        # ``def training_step(self, dataloader_iter)`` (SALMAutomodel does):
+        # Lightning can't auto-infer it from a ``dataloader_iter`` arg.
+        # The value is only used for epoch-level aggregation; we log
+        # ``on_step=True, on_epoch=False`` so the actual number is
+        # irrelevant — pass 1 as a sentinel.
+        pl_module.log(
+            "dataloader_wait_s",
+            wait_s,
+            on_step=True,
+            on_epoch=False,
+            prog_bar=False,
+            rank_zero_only=True,
+            batch_size=1,
+        )
+
+    def on_train_batch_end(
+        self,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        outputs: Any,
+        batch: Any,
+        batch_idx: int,
+    ) -> None:
+        # Pull per-batch counts the model exposed in training_step.
+        local_tokens = int(getattr(pl_module, "_last_batch_num_tokens", -1))
+        local_examples = int(getattr(pl_module, "_last_batch_num_examples", -1))
+        if local_tokens < 0 or local_examples < 0:
+            # Either attribute is missing — replace both with a generic
+            # estimate from the batch. Counts non-pad text tokens only;
+            # audio frame contribution is lost. Better than zero.
+            local_tokens, local_examples = self._fallback_counts(batch, pl_module)
+
+        # Sum over the default process group so every rank holds the same
+        # cumulative value (state_dict consistency on save). NOTE(review): with
+        # model-parallel ranks sharing a batch this may double-count — confirm.
+        if dist.is_available() and dist.is_initialized():
+            buf = torch.tensor(
+                [local_tokens, local_examples],
+                dtype=torch.long,
+                device=pl_module.device,
+            )
+            dist.all_reduce(buf, op=dist.ReduceOp.SUM)
+            global_tokens, global_examples = buf.tolist()
+        else:
+            global_tokens, global_examples = local_tokens, local_examples
+
+        self.num_tokens_total += global_tokens
+        self.num_examples_total += global_examples
+
+        pl_module.log_dict(
+            {
+                "num_tokens_total": float(self.num_tokens_total),
+                "num_examples_total": float(self.num_examples_total),
+            },
+            on_step=True,
+            on_epoch=False,
+            prog_bar=False,
+            rank_zero_only=True,
+            batch_size=max(local_examples, 1),
+        )
+
+        self._prev_batch_end_monotonic = time.monotonic()
+
+    # ------------------------------------------------------------ fallbacks
+    @staticmethod
+    def _fallback_counts(batch: Any, pl_module: LightningModule) -> tuple[int, int]:
+        """Best-effort ``(num_tokens, num_examples)`` from a generic ``batch``.
+
+        Used only when the model didn't expose ``_last_batch_num_tokens`` /
+        ``_last_batch_num_examples``. Returns ``(0, 0)`` when ``batch`` has no
+        tensor ``input_ids``; otherwise counts entries != ``text_pad_id`` (every
+        element if the module has no ``text_pad_id``) and uses ``shape[0]`` as
+        the example count. Audio-frame contribution is not visible from here.
+        """
+        try:
+            ids = batch["input_ids"]
+        except (KeyError, TypeError):
+            return 0, 0
+        if not torch.is_tensor(ids):
+            return 0, 0
+        pad_id = getattr(pl_module, "text_pad_id", None)
+        if pad_id is None:
+            n_tokens = int(ids.numel())
+        else:
+            n_tokens = int((ids != pad_id).long().sum().item())
+        n_examples = int(ids.shape[0])
+        return n_tokens, n_examples
diff --git a/scripts/dataloading/build_indexes.py b/scripts/dataloading/build_indexes.py
index eacbaed2fb39..50479b6ef13a 100644
--- a/scripts/dataloading/build_indexes.py
+++ b/scripts/dataloading/build_indexes.py
@@ -51,7 +51,7 @@
 
 import logging
 import sys
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Callable, Iterable, Iterator, Optional
@@ -332,6 +332,18 @@ def _is_indexed(job: IndexJob) -> bool:
 @click.option("--force", is_flag=True, help="Rebuild .idx files even if they already exist.")
 @click.option("--workers", type=int, default=4, help="Number of parallel index builders.")
 @click.option("--dry-run", is_flag=True, help="List the jobs without writing anything.")
+@click.option(
+    "--executor",
+    type=click.Choice(["process", "thread"]),
+    default="process",
+    help=(
+        "Worker pool kind. ``process`` (default) gives true CPU-level parallelism by "
+        "running each indexer in its own interpreter — required for tar indexing where "
+        "tarfile.next() and the read-and-discard for data members hold the GIL and "
+        "would otherwise serialize all workers onto one core. ``thread`` is useful for "
+        "debugging or when indexing only JSONLs over a slow network."
+    ),
+)
 @click.option(
     "--indexes-root",
     type=str,
@@ -347,6 +359,7 @@ def main(
     force: bool,
     workers: int,
     dry_run: bool,
+    executor: str,
     indexes_root: Optional[str],
 ):
     """
@@ -381,17 +394,32 @@ def main(
             logging.info("  [%s] %s -> %s", j.kind, j.path, j.idx_path())
         return
 
+    # Per-file success logging is suppressed: building 80k-400k indexes would
+    # otherwise emit one log line per file, swamping the SLURM stdout buffer.
+    # Failures are still logged inline; success only emits a periodic
+    # "<built>/<total> processed" heartbeat (~every 5% of total or 5000 files,
+    # whichever is smaller) plus a final summary.
     failures: list[tuple[IndexJob, BaseException]] = []
-    with ThreadPoolExecutor(max_workers=max(1, workers)) as ex:
+    total = len(todo)
+    log_every = max(1, min(5000, total // 20))
+    pool_cls = ProcessPoolExecutor if executor == "process" else ThreadPoolExecutor
+    with pool_cls(max_workers=max(1, workers)) as ex:
         futures = {ex.submit(_build_one, j): j for j in todo}
+        done = 0
         for fut in as_completed(futures):
+            done += 1
             j = futures[fut]
             try:
-                _, status = fut.result()
-                logging.info("  [%s] %s -> %s", status, j.kind, j.path)
+                _, _status = fut.result()
             except BaseException as e:  # noqa: BLE001 — surface any failure
                 failures.append((j, e))
                 logging.error("  [FAIL] %s %s: %s", j.kind, j.path, e)
+                continue
+            if done % log_every == 0 or done == total:
+                logging.info(
+                    "  built %d/%d (%.1f%%)  failures=%d",
+                    done, total, 100.0 * done / total, len(failures),
+                )
 
     if failures:
         logging.error("\n%d index build(s) failed:", len(failures))
diff --git a/scripts/dataloading/prefetch_indexes.py b/scripts/dataloading/prefetch_indexes.py
index 126491aa0f41..f743db758b53 100644
--- a/scripts/dataloading/prefetch_indexes.py
+++ b/scripts/dataloading/prefetch_indexes.py
@@ -167,17 +167,29 @@ def main(
             logging.info("  %s  ->  %s", s, d)
         return
 
+    # Per-file success logging is suppressed (80k-400k sidecars would swamp
+    # stdout); failures are still logged inline, success emits a periodic
+    # progress heartbeat plus a final summary.
     failures: list[tuple[str, str, BaseException]] = []
+    total = len(todo)
+    log_every = max(1, min(5000, total // 20))
     with ThreadPoolExecutor(max_workers=max(1, workers)) as ex:
         futures = {ex.submit(_copy_idx, s, d): (s, d) for (s, d) in todo}
+        done = 0
         for fut in as_completed(futures):
+            done += 1
             s, d = futures[fut]
             try:
                 fut.result()
-                logging.info("  [ok]   %s  ->  %s", s, d)
             except BaseException as e:  # noqa: BLE001
                 failures.append((s, d, e))
                 logging.error("  [FAIL] %s  ->  %s: %s", s, d, e)
+                continue
+            if done % log_every == 0 or done == total:
+                logging.info(
+                    "  copied %d/%d (%.1f%%)  failures=%d",
+                    done, total, 100.0 * done / total, len(failures),
+                )
 
     if failures:
         logging.error("\n%d copy operation(s) failed:", len(failures))

From d057dee184709ff78363209802d4c749f79100d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Tue, 12 May 2026 09:24:23 -0400
Subject: [PATCH 08/30] Agentic skill for performing migration to the new
 dataloader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../migrate-to-resumable-dataloader/SKILL.md  | 257 +++++++++++++
 .../references/aistore-vs-non-aistore.md      | 200 ++++++++++
 .../references/best-practices.md              | 141 +++++++
 .../references/conflict-matrix.md             |  33 ++
 .../references/failure-modes.md               | 359 ++++++++++++++++++
 .../references/option-reference.md            |  96 +++++
 .../templates/migration-report.md             | 143 +++++++
 7 files changed, 1229 insertions(+)
 create mode 100644 .claude/skills/migrate-to-resumable-dataloader/SKILL.md
 create mode 100644 .claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md
 create mode 100644 .claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
 create mode 100644 .claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
 create mode 100644 .claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
 create mode 100644 .claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
 create mode 100644 .claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md

diff --git a/.claude/skills/migrate-to-resumable-dataloader/SKILL.md b/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
new file mode 100644
index 000000000000..76b9a21cac87
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
@@ -0,0 +1,257 @@
+---
+name: migrate-to-resumable-dataloader
+description: This skill should be used when the user asks to "migrate to the resumable dataloader", "switch to indexed Lhotse", "adopt the indexed + resumable pipeline", "make my training resumable", "set up StatefulDataLoader for SALM", "use AIStore GetBatch", or "convert this YAML to the resumable path". Walks a NeMo training YAML (and optional launcher / blend / cluster info) through the indexed + resumable Lhotse migration; lints every interacting field, auto-patches the YAML and any blends, emits a migration report and a pre-flight checklist, and produces a one-shot `submit_build_indexes.py` invocation. Static analysis only; never runs jobs.
+argument-hint: '<config.yaml> [launcher.py] [blend.yaml] [--cluster=<name>]'
+---
+
+# Migrate a NeMo training YAML to the indexed + resumable Lhotse dataloader
+
+The repo's resumable path (replacing the streaming/replay loader with O(1)
+checkpoint-restore via `torchdata.StatefulDataLoader` + `.idx` sidecars) has
+~15 distinct ways to silently corrupt or hard-fail. This skill runs every
+one of those checks against a concrete YAML, auto-patches what it can, and
+emits a teaching-style migration report so the user understands every
+decision and the user-only steps to run before launching.
+
+## When to apply
+
+Trigger phrases listed in the frontmatter. Three common entry modes:
+
+1. **New migration**: user points at an experiment YAML and asks to migrate.
+   Walk every field, write patched YAML + report + pre-flight + build-indexes
+   command.
+2. **Sanity-check existing migration**: user says "audit this YAML, is it
+   resumable-correct?". Same workflow but emit only the report (no patched
+   files unless errors found).
+3. **AIStore-aware variant**: cluster has `AIS_ENDPOINT` and the blend has
+   `s3://` / `ais://` / `http(s)://` paths. Skill switches to the AIStore
+   workflow (sets `USE_AIS_GET_BATCH=true`, optionally
+   `USE_AIS_INDIVIDUAL_GETS=true`, requires `aistore` SDK in container).
+
+## Inputs
+
+| input | required | source | purpose |
+|---|---|---|---|
+| Training YAML | yes | argument or `--config=` | every `data.train_ds` / `data.validation_ds` / `model` / `trainer` / `exp_manager` field that interacts with the resumable path |
+| Launcher script | no | argument or auto-detect (`train_and_eval.py`, `pretrain.sh`, raw `python salm_train.py …`, `torchrun …`) | grep for per-chunk seed rotation, missing prefetch preamble, etc. If absent, skill emits "launcher review SKIPPED — manual review required" with the things to check by hand |
+| Data-blend YAML | no | resolved from `data.train_ds.input_cfg` if it references `${data_blend_dir}/...` | walked for unindexable entries (`extra_fields`, `slice_length`, `.jsonl.gz`, `.tar.gz`, AMI Shar) |
+| Cluster name | no | `--cluster=<name>` or detected from `data_blend_dir` path | reads `cluster_configs/<cluster>.yaml` env_vars to detect AIS_ENDPOINT and pick the right code paths |
+
+## Outputs
+
+Every output lands in a fresh directory `migrate-resumable/<config-stem>/`
+in the repo root (so multiple migrations stay self-contained):
+
+| output | purpose |
+|---|---|
+| `migration-report.md` | exhaustive walkthrough — every field touched, every option explained, every pitfall checked, severity-classified findings, links into MIGRATION_GUIDE.md and codebase |
+| `<config-stem>-resumable.yaml` | the patched config, ready to drop in. Preserves comments where possible; explicit `# NOTE:` block at every changed line citing the rationale |
+| `<blend-stem>-resumable.yaml` | patched blend (drops unindexable entries with rationale comments) — only emitted when blend was inspected |
+| `pre-flight-checklist.md` | manual steps before launch: build indexes, verify SDK, verify cluster fits the workflow, etc. |
+| `build-indexes-cmd.sh` | concrete one-shot shell command invoking `submit_build_indexes.py` (or a generic equivalent if the repo doesn't have it) |
+
+## Workflow
+
+### 1. Discover and parse inputs
+
+1. Resolve the training YAML path. Read it with OmegaConf.
+2. If `data.train_ds.input_cfg` references `${data_blend_dir}/<file>.yaml`,
+   try to resolve `data_blend_dir` from the YAML's top-level scalar. Locate
+   the blend on disk (try `data_blends/<cluster>/<file>.yaml` first, fall
+   back to glob across `data_blends/`).
+3. If a launcher path was given, read it as text. Otherwise inspect the
+   repo root for any of `train_and_eval.py` / `pretrain.sh` / `salm_train.sh`
+   and pick the most-likely match.
+4. If `--cluster=<name>` was passed, read `cluster_configs/<name>.yaml`.
+   Otherwise match `data_blend_dir` against the known cluster path prefixes
+   (e.g. `/lustre/fsw/portfolios/llmservice/users/...`) → `iad`, `nrt`, or `ord`.
+5. Detect AIStore presence: cluster `env_vars` contains `AIS_ENDPOINT=...`,
+   AND the blend has `s3://` or `ais://` or `http(s)://` paths in
+   `tarred_audio_filepaths` / `manifest_filepath` / `cuts_path`.
+
+### 2. Run lint pipeline
+
+Run every check in `references/option-reference.md`,
+`references/conflict-matrix.md`, and `references/failure-modes.md` against
+the YAML and (when present) the blend and launcher. Each check emits a
+finding entry:
+
+```
+{
+  severity: fatal | error | warning | note,
+  field:    "data.train_ds.shard_seed",
+  current:  "randomized",
+  recommended: 42,
+  rationale: <one paragraph explaining the *why*>,
+  link:     "MIGRATION_GUIDE.md §… / file:line",
+}
+```
+
+Severities:
+- **fatal** — auto-patch impossible (e.g. blend uses `extra_fields` on a
+  `nemo_tarred` entry). Skill exits non-zero with explanation; user must
+  pre-process the manifest offline.
+- **error** — auto-patch produces a working config (e.g.
+  `shard_seed: "randomized"` → fixed integer). Skill applies the patch.
+- **warning** — auto-patch optional or context-dependent; emitted as a
+  comment in the patched YAML and a section in the report.
+- **note** — informational; report-only.
+
+### 3. Emit patched YAML + blend
+
+Apply every `error`-severity patch. For each, leave a `# NOTE:` comment
+above the changed line citing the finding. Comment-preserving YAML round-trip
+uses `ruamel.yaml`; if that's not available, fall back to `omegaconf`
+serialization (loses comments, but the report still documents every change
+in detail).
+
+For the blend: drop every entry that fails the indexability checks. Each
+dropped entry leaves a `# DROPPED: <rationale>` block in its place. If the
+drop empties out the blend or removes a domain entirely, surface that in
+the report.
+
+### 4. Generate `migration-report.md`
+
+Use `templates/migration-report.md`. Sections:
+
+1. **Summary** — one paragraph: AIStore vs non-AIStore, count of changes,
+   any fatal blockers.
+2. **Inputs** — paths to training YAML / launcher / blend / cluster config
+   that were inspected.
+3. **Findings table** — every finding with severity / field / current /
+   recommended / link.
+4. **Per-section walkthrough** — `data.train_ds`, `data.validation_ds`,
+   `exp_manager`, `trainer`, AIStore-specific section. For each, the table
+   from `references/option-reference.md` filtered to fields that exist in
+   this YAML, with current vs. recommended values inline.
+5. **Pitfalls / failure modes encountered** — link to
+   `references/failure-modes.md` for each that fired.
+6. **Conflict matrix** — link to `references/conflict-matrix.md` and call
+   out any conflicts found.
+7. **Best practices reminder** — copy of `references/best-practices.md`.
+8. **Verification recipe** — the bit-exact verification snippet from
+   `MIGRATION_GUIDE.md` §3, with the user's actual config path filled in.
+
+### 5. Generate `pre-flight-checklist.md`
+
+Use `templates/pre-flight-checklist.md`. Required steps:
+
+- Build indexes via `submit_build_indexes.py` (or generic equivalent);
+  print the exact command.
+- If AIStore in play: verify `aistore` SDK ≥ 1.18 in the container; verify
+  `AIS_ENDPOINT` is set; warn about the MOSS GetBatch issue for
+  multilingual / non-EN-replicated data and recommend
+  `USE_AIS_INDIVIDUAL_GETS=true` for those.
+- If launcher absent or only a stub: list the launcher items the user must
+  hand-check (single-seed across chain, prefetch preamble, num_workers /
+  world_size invariance).
+- Recommend running the bit-exact verification snippet from MIGRATION_GUIDE
+  §3 once before sweeping.
+- Recommend the 1-node single-chunk → 1-node multi-chunk → 4-node test
+  sequence.
+
+### 6. Generate `build-indexes-cmd.sh`
+
+Single executable shell file with the exact `submit_build_indexes.py`
+invocation, using:
+- `--cluster=<detected or user-supplied cluster>`
+- `--blend=<every blend referenced from the training YAML>` (training +
+  validation blends)
+- `--bypass-nvidia-hook` if cluster is NRT or any cluster whose `cpu_partition`
+  is documented to lack `nvidia-container-cli`
+- A comment block at the top with the rationale for each flag
+
+If the repo doesn't have `submit_build_indexes.py`, emit a generic
+equivalent that does:
+```bash
+python <NeMo>/scripts/dataloading/build_indexes.py \
+    --indexes-root <local-mirror> \
+    --workers <effective> \
+    <blend>.yaml <validation-blend>.yaml
+```
+plus a SLURM wrapper sketch and call out that `submit_build_indexes.py` in
+the speechlm-2026h1 repo is the canonical version.
+
+### 7. Print final summary to chat
+
+Short recap (under 10 lines): output dir, count of fatal/error/warning/note
+findings, link to migration report, the next single command the user
+should run (`bash migrate-resumable/<stem>/build-indexes-cmd.sh` then the
+launcher).
+
+## Knowledge base — references baked into this skill
+
+- **`references/option-reference.md`** — exhaustive field-by-field table
+  (every YAML key that interacts with the resumable path, required value,
+  rationale, see-also link). Read this for every finding.
+- **`references/failure-modes.md`** — 18 catalogued failure modes with log
+  signatures, tracebacks, and fixes. Plus an "Open investigation" section.
+- **`references/conflict-matrix.md`** — the option pairs that don't work
+  together and what to do about each.
+- **`references/best-practices.md`** — distilled checklist (priority-ordered).
+- **`references/aistore-vs-non-aistore.md`** — the two parallel workflows.
+
+- **`examples/iad-english-granary/`** — IAD English training (Granary 1.1,
+  lustre manifests, S3 tars, AIStore). Before/after pair.
+- **`examples/nrt-lustre-only/`** — NRT lustre-only training (no AIStore).
+  Before/after pair, includes the `--bypass-nvidia-hook` build-index
+  invocation.
+- **`examples/multilingual-mixed/`** — multilingual blend with mixed
+  S3/lustre. Demonstrates `USE_AIS_INDIVIDUAL_GETS=true` and the
+  AMI-Shar-drop pattern.
+
+- **`templates/migration-report.md`** — output template, fill-in-the-blank.
+- **`templates/pre-flight-checklist.md`** — output template.
+
+- **`scripts/analyze.py`** — the analysis engine. Reads YAML, runs every
+  lint check, emits findings + writes patched YAML. Pure static analysis;
+  no cluster calls.
+
+## Constraints
+
+- **Read `MIGRATION_GUIDE.md`** (at the root of the `speechlm-2026h1` repo checkout)
+  in full before running. The references in this skill cite specific
+  sections of that doc.
+- **Cross-check against the actual code** at:
+  - `lhotse_resumable/lhotse/serialization.py` (`open_best`, AIStore backend, MSC backend)
+  - `lhotse_resumable/lhotse/indexing.py` (`create_jsonl_index`, `create_tar_index`, `indexed_path_kind`, `IndexedJsonlReader`, `read_index`)
+  - `lhotse_resumable/lhotse/ais/batch_loader.py` (`AISBatchLoader`, `prefer_individual`, `_moss_attrs`)
+  - `lhotse_resumable/lhotse/dataset/input_strategies.py` (`AudioSamples`)
+  - `NeMo_resumable/nemo/collections/common/data/lhotse/indexed_adapters.py` (`IndexedTarMemberReader`, `_AISRangeReader`, `_CountingReader`, `_open_data_path`, `_load_index`, `resolve_idx_path`)
+  - `NeMo_resumable/nemo/collections/common/data/lhotse/dataloader.py` (`get_lhotse_sampler_from_config`, `get_lhotse_dataloader_from_config`, `force_map_dataset` handling, the auto-overwrite of `shard_seed`)
+  - `NeMo_resumable/nemo/collections/common/data/lhotse/nemo_adapters.py` (`LazyNeMoTarredIterator`, `_init_indexed`, `_iter_batch_for_ais_get_batch`, `USE_AIS_GET_BATCH` gate)
+  - `NeMo_resumable/scripts/dataloading/build_indexes.py` and `prefetch_indexes.py`
+- **Cross-check against the latest debug docs** at:
+  - `agent-debug-workspace/0909-summary.md`
+  - `agent-debug-workspace/0909-multiling-failures.md`
+  - `agent-debug-workspace/0909-longform-failures.md`
+  - `agent-debug-workspace/nano-v3-1node-resumable-tests.md`
+  These contain the freshest evidence-based knowledge. Cite file:line
+  pointers when emitting findings whose rationale traces back to them.
+- **Mention but do not duplicate** the existing `submit_build_indexes.py`
+  in the speechlm-2026h1 repo; this skill references it as the canonical
+  builder for that repo and provides a generic equivalent for users on
+  other repos.
+- **Don't write code that runs jobs on the cluster.** Static-analysis +
+  migration tool, not a job runner.
+- **Identify gaps clearly.** If something is unknown (e.g., why MOSS
+  GetBatch returns empty for multilingual data), say so explicitly in
+  `failure-modes.md` under "Open investigation" and surface that in the
+  report when relevant.
+
+## Non-goals
+
+- Do not run `submit_build_indexes.py` automatically; emit the command and
+  let the user invoke it.
+- Do not modify upstream code (NeMo_resumable / lhotse_resumable). The
+  skill works around upstream bugs via YAML / env-var settings.
+- Do not invent fields the user didn't ask about. If a value is ambiguous
+  (e.g. `seed` was unset and there's no default we can read), prompt with
+  one batched `AskUserQuestion`.
+
+## Style
+
+Match the tone of `hyperparam-sweep/SKILL.md` and `debug-cluster-run/SKILL.md`.
+Crisp, evidence-based, no fluff. Inline rationale at every decision. The
+skill is a teaching tool as well as an automated migrator — every patched
+line should land with a citation the user can verify.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md b/.claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md
new file mode 100644
index 000000000000..90371a02f2d1
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md
@@ -0,0 +1,200 @@
+# AIStore vs non-AIStore workflows
+
+The indexed + resumable Lhotse pipeline supports two storage backends for
+the audio tar files. Manifests can be on lustre regardless. The choice of
+backend changes which env vars / flags / container deps are required.
+
+## Detection
+
+The skill picks the workflow based on the **blend's `tarred_audio_filepaths`
+scheme**, NOT the cluster name:
+
+| signal | workflow |
+|---|---|
+| `tarred_audio_filepaths: s3://...` or `ais://...` or `http(s)://...` | **AIStore** workflow |
+| `tarred_audio_filepaths: /lustre/...` (or any local-FS path) | **non-AIStore** workflow |
+| Both in the same blend | **AIStore** workflow (the local files are still loadable; AIS path is the strictly-larger superset) |
+
+Cluster `env_vars` containing `AIS_ENDPOINT=...` is a necessary but not
+sufficient signal — the blend may still be all-lustre, in which case
+`AIS_ENDPOINT` is unused.
+
+## Workflow A — AIStore (s3:// / ais:// audio)
+
+### Required setup
+
+- **`aistore` SDK installed** in the training container. Either pre-baked
+  or `pip install aistore` in the preamble (no version pin needed; the
+  lhotse_resumable code's `_moss_attrs` normalizer handles both
+  pre-/post-MossOut-rename SDKs).
+- **`AIS_ENDPOINT` exported** in cluster env_vars (and forwarded into the
+  container via `--container-env=AIS_ENDPOINT,...`). Optionally
+  `AIS_AUTHN_URL` and `AIS_AUTHN_TOKEN` for authenticated AIStore
+  deployments (MOSS GetBatch requires the token).
+- **`USE_AIS_GET_BATCH=true`** env var in the training step (set
+  automatically by `--enable-indexes-prefetch` via
+  `train_and_eval.py`). This short-circuits eager
+  `IndexedTarMemberReader` construction: the indexed tar readers would
+  otherwise instantiate one per shard at startup, which on a 41k-shard
+  blend means 41k AIS HTTP connections opened before training begins.
+  With `USE_AIS_GET_BATCH=true`, audio is fetched lazily at sample time
+  via AIStore's MOSS GetBatch — one batched HTTP call per minibatch.
+
+### Optional setup
+
+- **`USE_AIS_INDIVIDUAL_GETS=true`** (or
+  `--enable-ais-individual-gets`): bypass MOSS GetBatch entirely and
+  fetch each object via `Object.get_reader(archive_config=...).read_all()`.
+  Slower (one HTTP call per object instead of one per minibatch) but
+  works around MOSS-specific server-side issues — e.g. empty-content
+  returns for non-replicated multilingual data on iad AIS, which crashes
+  the GetBatch path's empty-content retry logic.
+
+### Required code paths
+
+| component | role |
+|---|---|
+| `lhotse.serialization.AIStoreIOBackend` | turns `s3://` / `ais://` into actual HTTP fetches via aistore SDK; gated on `AIS_ENDPOINT` env var presence |
+| `nemo.collections.common.data.lhotse.indexed_adapters._AISRangeReader` | seekable file-like wrapper that translates `seek()` + `read(n)` into AIS byte-range HTTP requests; used by the indexed tar member readers when `data_path` is a URL |
+| `nemo.collections.common.data.lhotse.indexed_adapters._open_data_path` | factory that returns either a regular `open(path, "rb")` for local paths or `_AISRangeReader` for URL paths |
+| `lhotse.ais.batch_loader.AISBatchLoader` | minibatch-time MOSS GetBatch client; aggregates all URLs from a CutSet into one request and demultiplexes the response back into manifests |
+| `lhotse.ais.batch_loader._moss_attrs` | normalizer for AIS SDK MossIn-vs-MossOut attribute differences (older `.bck` / `.provider` / `.obj_name` vs newer `.bucket_name` / `.bucket_provider` / `.object_name`); handles both transparently |
+
+### Index building
+
+- `submit_build_indexes.py` runs once per blend on a CPU SLURM job.
+- Build reads tar files via AIS (HTTP GET with byte-range), parses tar
+  headers, writes `.idx` sidecars to `<workspace>/indexes_mirror/`
+  (lustre) — mirroring the data files' s3 paths.
+- Successful index build proves the data IS on AIS. If indexing
+  succeeds for a path but training fails to fetch via MOSS GetBatch with
+  empty content, the data is replicated for individual GET but not for
+  MOSS — switch the run to `USE_AIS_INDIVIDUAL_GETS=true`.
+
+### Prefetch pipeline
+
+1. **Indexes mirror → local SSD**: `prefetch_indexes_to_ssd.sh` copies
+   `<workspace>/indexes_mirror/` to `/tmp/idx` on each node. Reads via
+   `lhotse.serialization.open_best`, so source can be lustre or remote.
+2. **Manifests** (optional): `prefetch_manifests_to_ssd.sh` pulls AIS
+   manifests to `/tmp/manifests/` and rewrites blend YAMLs to point at
+   the local copies. Only useful when `manifest_filepath` is `s3://`;
+   no-op if manifests are already on lustre.
+3. **HF cache** (optional): `cache_pretrained_to_ssd.sh` copies the
+   pretrained LLM/ASR weights from `$HF_HOME/hub/` to local SSD to avoid
+   N-rank concurrent reads from lustre at training start.
+
+All three preambles are now run **in parallel** by `train_and_eval.py`
+(each in a backgrounded subshell with PID capture; `wait` propagates any
+non-zero exit). Each prefetch script is flock-guarded, so only one rank
+per node does the actual work; the other ranks on that node (7 on an
+8-GPU node) wait for the lock-holder to finish.
+
+### Container requirements
+
+- `aistore` Python SDK (any version ≥1.18; the `_moss_attrs` normalizer
+  handles MossIn↔MossOut renames in 1.19+).
+- `nvidia-container-cli` on every node the build/training runs on. Some
+  cpu partitions don't have it (NRT cpu partition is a known case);
+  workaround is the `--bypass-nvidia-hook` flag in
+  `submit_build_indexes.py`, which injects
+  `--export=ALL,NVIDIA_VISIBLE_DEVICES=void` so enroot's
+  `98-nvidia.sh` hook short-circuits.
+
+### Failure modes specific to AIStore
+
+See `references/failure-modes.md` §3 (`f.tell()` on non-seekable
+ObjectFileReader), §4 (`os.path.getsize` on URL paths), §5 (`open()`
+builtin on URL paths), §10 (`MossOut.bck` AttributeError), §16 (MOSS
+GetBatch returns empty content for non-replicated data).
+
+## Workflow B — non-AIStore (lustre-only)
+
+### Required setup
+
+- **All `tarred_audio_filepaths` resolve to local-FS paths** (typically
+  `/lustre/...`).
+- **`AIS_ENDPOINT` UNSET** in cluster env_vars — when present and the
+  blend has any URL paths, `AISBatchLoader` would otherwise be
+  instantiated and try to MOSS-fetch local-FS paths, causing confusing
+  errors. Comment out the env var or use a different cluster_config
+  variant.
+- **`USE_AIS_GET_BATCH=false`** (the default; `--enable-indexes-prefetch`
+  sets it to `true` so use a different launcher invocation, OR pass
+  `--no-enable-indexes-prefetch` if your launcher exposes that, OR call
+  `salm_train.py` directly without the env var set).
+
+### Required code paths
+
+| component | role |
+|---|---|
+| `lhotse.serialization.BuiltinIOBackend` | trivial `open(path, "rb")` for local files |
+| `nemo.collections.common.data.lhotse.indexed_adapters._open_data_path` | falls through to `open()` for paths that don't match `_URL_RE` |
+| `nemo.collections.common.data.lhotse.indexed_adapters.IndexedTarMemberReader` | regular seekable random access into local tars |
+| **NOT used**: `_AISRangeReader`, `AISBatchLoader`, `aistore` SDK, MOSS GetBatch, archpath-based archive member fetch |
+
+### Index building
+
+- Same `submit_build_indexes.py` invocation.
+- Build reads tar files via local `open(path, "rb")` (the
+  `_open_data_path` factory's local branch). No HTTP, no AIS.
+- Faster than the AIStore workflow per file (no network round-trip),
+  but lustre I/O can be the bottleneck with high worker counts.
+
+### Prefetch pipeline
+
+1. **Indexes mirror → local SSD**: same `prefetch_indexes_to_ssd.sh`,
+   but the source is the lustre mirror (no AIS to traverse).
+2. **Manifests prefetch**: not needed (manifests are already on
+   lustre).
+3. **HF cache**: same as AIStore workflow.
+
+### Container requirements
+
+- `aistore` SDK NOT required. Container can be slim.
+- `nvidia-container-cli` still required for the GPU portion (training
+  itself); for the CPU-only index build, the `--bypass-nvidia-hook`
+  flag still applies.
+
+### Failure modes specific to non-AIStore
+
+Mostly the local-FS-only failure modes of §1, §2, §6, §7, §8, §11-§15,
+§17 from `references/failure-modes.md`. The AIS-specific modes (§3-§5,
+§10, §16) don't fire.
+
+## Decision tree
+
+```
+                 [is `tarred_audio_filepaths` a URL?]
+                          /                 \
+                        no                  yes
+                        /                     \
+              [non-AIStore workflow]   [is AIS_ENDPOINT set?]
+                                          /          \
+                                         no          yes
+                                         /            \
+                              [ERROR: blend uses     [AIStore workflow]
+                               URLs but cluster                  \
+                               doesn't expose AIS]      [does MOSS GetBatch
+                                                         work for this data?]
+                                                              /         \
+                                                         yes              no
+                                                          /                \
+                                          [USE_AIS_GET_BATCH=true]   [USE_AIS_GET_BATCH=true
+                                          (default)                    USE_AIS_INDIVIDUAL_GETS=true]
+```
+
+## Common gotchas in mode-switching
+
+- **Same blend across clusters**: a blend with `s3://` paths only works
+  on clusters with `AIS_ENDPOINT` configured. Maintain per-cluster
+  blend variants (`data_blends/<cluster>/...`) when porting.
+- **Lustre mounts identical?** Don't assume — verify with `ls` on the
+  cluster login node before assuming a `/lustre/...` path resolves on a
+  new cluster. NRT and IAD have similar mount roots but disjoint data
+  trees.
+- **`indexes_root` is shared across both workflows**. The `.idx` file
+  format is identical (uint64 offsets + sentinel); the source-data
+  resolution is what differs. You can re-use a mirror across an AIS
+  → lustre migration as long as the blend's data file paths are
+  identical strings.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md b/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
new file mode 100644
index 000000000000..ff3dce7ae572
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
@@ -0,0 +1,141 @@
+# Best practices — indexed + resumable Lhotse migration
+
+A short, prioritised checklist distilled from the failure-mode catalog and
+real-world adoption pain. Apply these before sweeping any new recipe.
+
+## Tier 1 — non-negotiable
+
+1. **Pin BOTH `seed` and `shard_seed` to fixed integers** when
+   `force_map_dataset: true` and `use_stateful_dataloader: true`. The
+   sampler RNG (`shard_seed`) is checkpointed into `meta.pt` and restored
+   verbatim on resume; if `shard_seed: "randomized"`, each new chunk derives
+   a fresh worker-PID-hashed seed at init that diverges from the saved
+   snapshot. NeMo's auto-overwrite (`dataloader.py`
+   `get_lhotse_sampler_from_config`) papers over this with a warning, but
+   pinning up front gives reviewers an obvious signal of intent.
+
+2. **Same seed across every chunk of a chain.** Lightning re-seeds
+   Python/torch/numpy global RNGs at chunk start using
+   `data.train_ds.seed`. If your launcher rotates this per chunk
+   (FIXED_SEEDS-style — what `train_and_eval.py` did historically), every
+   resume silently breaks bit-exactness in dropout / augmentation /
+   aux-loss permutations. The repo's `train_and_eval.py` now pins a single
+   seed when `--enable-indexes-prefetch` is set; for other launchers, do
+   it manually.
+
+3. **Match `num_workers` AND `world_size` between save and restore.**
+   `torchdata.StatefulDataLoader` enforces this as a hard contract. Any
+   mismatch raises immediately at load. Document the values in your
+   training script header so a re-submission can't accidentally drift.
+
+4. **Build the index mirror once per blend; reuse across experiments.**
+   `submit_build_indexes.py` skips already-indexed files (checks for
+   non-empty `.idx`), so re-runs are cheap. Pin a stable
+   `indexes_root: <workspace>/indexes_mirror` and don't move it.
+
+5. **Set `concurrent_bucketing: false` in `data.train_ds`.** Default is
+   `true`, which spawns a daemon producer thread inside
+   `DynamicBucketingSampler` that races the main thread on
+   `cuts_iter`. The main thread is the one `StatefulDataLoader`
+   checkpoints; the producer is invisible to the snapshot. After resume
+   the producer's pre-fetched cuts are lost and the per-step batch
+   composition silently diverges from the non-resumed run. The
+   throughput cost of the synchronous path is negligible at steady
+   state; the determinism gain is non-negotiable for resumable
+   training. See `failure-modes.md §19`.
+
+## Tier 2 — strongly recommended
+
+6. **Run the bit-exact verification from `MIGRATION_GUIDE.md` §3 before
+   sweeping.** ~10 sec, model-free: take 5 batches → `state_dict` →
+   take 5 more (ground-truth); fresh process loads `state_dict`, takes 5,
+   asserts equal. Catches sampler/bucketer state-dict bugs that schema
+   inspection of `meta.pt` (just confirming the keys exist) won't.
+
+7. **Pick exactly ONE checkpoint trigger** in
+   `exp_manager.checkpoint_callback_params` — `every_n_train_steps`,
+   `every_n_epochs`, OR `train_time_interval`. Lightning's
+   `ModelCheckpoint.__validate_init_configuration` raises
+   `MisconfigurationException` if more than one is set. External
+   preemption (cluster scheduler kills mid-chunk) doesn't go through
+   NeMo's `max_time_per_run`-based PreemptionCallback, so progress
+   between the last save and the kill is lost — pick whichever trigger
+   matches your chunk's reachable progress: `every_n_train_steps: 50`
+   if chunks reach only ~80-100 steps; `every_n_epochs: 1` if you
+   reliably get full 1000-step epochs; `train_time_interval: "00:30:00"`
+   if you prefer wall-clock semantics.
+
+8. **Test 1-node single-chunk first, then 1-node multi-chunk (resume),
+   then full N-node.** The 1-node smoke isolates dataloader/IO bugs from
+   distributed/EP issues. The multi-chunk-on-1-node test exercises the
+   resume path before scale changes. The repo's
+   `nano-v3-granary1p1-en-1node-resumable.yaml` is a working template.
+
+9. **For multilingual / non-English data on AIStore that fails MOSS
+   GetBatch with "empty content"**, switch to
+   `USE_AIS_INDIVIDUAL_GETS=true` (or
+   `train_and_eval.py --enable-ais-individual-gets`). Slower per batch
+   but bypasses the buggy MOSS path. See `references/aistore-vs-non-aistore.md`.
+
+10. **Keep `.idx` mirror on lustre, prefetch destination on local SSD.**
+    Building indexes writes to lustre (cluster-shared, persistent). The
+    training preamble copies the mirror to `/tmp/idx` on each node's local
+    SSD via `prefetch_indexes_to_ssd.sh` for fast mmap. Don't store the
+    mirror on `/tmp` — it would be lost between jobs.
+
+## Tier 3 — nice to have
+
+11. **Use `--bypass-nvidia-hook`** for clusters whose cpu partition
+    lacks `nvidia-container-cli` (e.g. NRT). The launcher injects
+    `--export=ALL,NVIDIA_VISIBLE_DEVICES=void` so enroot's
+    `98-nvidia.sh` short-circuits instead of failing the container start.
+
+12. **`--exclusive --cpus_per_task=96`** for the index build job. The
+    container's unsquashfs needs the full memory budget on first
+    extraction; without exclusive, the default per-CPU memory allocation
+    can OOM-kill the container before `build_indexes.py` even starts.
+
+13. **`--workers $((cpus - 1))`** for the index ProcessPool, leaving one
+    core for OS/scheduler. Indexing is I/O bound when manifests are on
+    s3, but the tar-header parse is GIL-heavy (so threads serialize) —
+    process pool is the right call. If you OOM, drop workers; with 96
+    workers chewing big s3 manifests we've seen `BrokenProcessPool` on
+    the very large all-asr blend.
+
+14. **Drop AMI from English Granary blends until uncompressed Shar
+    exists.** AMI's Lhotse Shar uses `.jsonl.gz` cuts which can't be
+    indexed; either re-export with `compress_jsonl=False`, or use the
+    `granary1p1-en-resumable.yaml` blend which omits AMI entirely.
+
+15. **Run preambles in parallel.** `train_and_eval.py` now backgrounds
+    each preamble (HF SSD cache / manifest prefetch / index prefetch)
+    with PID capture and `wait`-with-error-propagation. Each script's
+    flock guards cross-rank de-duplication, so backgrounding from each
+    rank is safe.
+    each preamble (HF SSD cache / manifest prefetch / index prefetch)
+    with PID capture and `wait`-with-error-propagation. Each script's
+    flock guards cross-rank de-duplication, so backgrounding from each
+    rank is safe.
+
+## What NOT to do
+
+- **Don't skip the bit-exact verification** because "schema looks right".
+  Schema-only verification (presence of `_snapshot/_steps_since_snapshot/
+  _iterator_finished` in `meta.pt`) confirms the StatefulDataLoader is
+  being asked to checkpoint, NOT that the snapshot bytes are restored
+  correctly.
+
+- **Don't pin `aistore` SDK to an old version** "to avoid the MossOut
+  bug" — the lhotse code already handles both shapes via
+  `_moss_attrs`. Use the latest SDK; track future SDK churn with the
+  same defensive normalizer pattern.
+
+- **Don't set more than one of `every_n_train_steps` / `every_n_epochs` /
+  `train_time_interval`** in one `checkpoint_callback_params`. Lightning
+  raises `MisconfigurationException` at startup. Pick one trigger.
+
+- **Don't enable `concurrent_bucketing=True` on the resumable path** (see
+  Tier 1), and never with custom samplers that spawn non-daemon threads.
+  The built-in `DynamicBucketingSampler` uses a `daemon=True` background
+  thread, but that producer is still invisible to the checkpoint snapshot.
+
+- **Don't move `indexes_root` between training and prefetch.** If the
+  YAML says `indexes_root: /tmp/idx` and the prefetch script writes to
+  `/tmp/idx2`, training silently can't find any index, falls back to
+  building on first access (slow).
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md b/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
new file mode 100644
index 000000000000..662023ee6eca
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
@@ -0,0 +1,33 @@
+# Conflict matrix — option pairs that don't work together
+
+Table format: `A | B | conflict | severity | resolution`.
+
+Severities:
+- **fatal** = auto-patch impossible; requires offline manifest pre-processing
+  or data ingestion. Skill exits non-zero with explanation.
+- **error** = auto-patchable.
+- **warning** = patchable but context-dependent; the skill emits a comment
+  in the patched YAML and a section in the report.
+
+| A | B | conflict | severity | resolution |
+|---|---|---|---|---|
+| `data.train_ds.indexed: true` | `extra_fields:` on a `nemo` / `nemo_tarred` / `multimodal_conversation` entry | `LazyNeMoTarredIterator(indexed=True)` raises `RuntimeError` (`nemo_adapters.py:485-487`). Graph-token random access has no stable index. | fatal | Pre-process the manifest offline to materialize the extra fields, drop the `extra_fields` key. |
+| `data.train_ds.indexed: true` | `slice_length:` on a `nemo` / `nemo_tarred` entry | Sliced cuts have no stable index — slicing rewrites the cut sequence. | fatal | Re-shard the audio offline to the target slice length, drop the `slice_length` key. |
+| `data.train_ds.indexed: true` | Lhotse Shar `cuts.*.jsonl.gz` (compressed cuts) | `lhotse/indexing.py:88-110` rejects compressed paths in `indexed_path_kind`. AMI's stock distribution hits this. | fatal | Drop the corpus from the blend, OR re-export the Shar with `compress_jsonl=False`, OR convert to `nemo_tarred` format. |
+| `data.train_ds.indexed: true` | `tarred_audio_filepaths: *.tar.gz` | Compressed tars can't be indexed. | fatal | Re-pack the tars uncompressed. |
+| `data.train_ds.indexed: true` | `pipe:cmd \| cmd2` paths | Pipe commands aren't seekable; `validate_indexed_access` raises `ValueError`. | fatal | Materialize the upstream of the pipe to a real file, then point at that. |
+| `data.train_ds.force_map_dataset: true` | `data.train_ds.force_iterable_dataset: true` | `dataloader.py:278-280` asserts these are mutually exclusive. | error | Keep only `force_map_dataset: true`. |
+| `data.train_ds.force_map_dataset: true` + `data.train_ds.use_stateful_dataloader: true` | `data.train_ds.shard_seed: "randomized"` | Map path doesn't need per-rank seed differentiation; `"randomized"` adds worker-PID-derived seeding that breaks across resume. NeMo's `dataloader.py:556-572` warns + auto-overwrites with `seed`. | error | Set `shard_seed: <int>` (typically equal to `seed`). |
+| `data.train_ds.use_stateful_dataloader: true` | per-chunk seed rotation in launcher | Silent corruption: model RNG (dropout, aux-loss, random-init) diverges across chunks even though sampler state restores correctly. | error | Pin a single seed across the entire chain. `train_and_eval.py:925-952` does this when `--enable-indexes-prefetch` is set. For arbitrary launchers, set the same seed in every chunk's command. |
+| `data.train_ds.use_stateful_dataloader: true` | `num_workers` change between save and restore | Hard error from `torchdata.StatefulDataLoader`. | error | Document `num_workers` in the YAML / launcher header; never change between chunks. |
+| `data.train_ds.use_stateful_dataloader: true` | `world_size` change between save and restore (`num_nodes * devices_per_node`) | Hard error from torchdata. | error | Restart from a converted HuggingFace checkpoint if you need to scale (no resume in that case). |
+| AIStore MOSS GetBatch (`USE_AIS_GET_BATCH=true`, `USE_AIS_INDIVIDUAL_GETS` unset) | non-EN-replicated multilingual data on `s3://FLEURS/...`, `s3://MCV/...`, etc. | MOSS returns 200 + empty body for missing objects. Triggers the empty-content retry path which then crashes (§10 / §16 in failure-modes.md). | warning | Set `USE_AIS_INDIVIDUAL_GETS=true` until the data is replicated to AIS, OR replicate the data, OR switch the blend to lustre tar paths if available. |
+| `data.validation_ds.force_finite: true` | training (`data.train_ds`) | `force_finite` caps the infinite-mux behavior that training requires. | error | `force_finite: true` is a validation-only flag; don't propagate it to `data.train_ds`. |
+| `exp_manager.checkpoint_callback_params.every_n_train_steps: null` | external preemption (`svc-hwinf-cs-sched`, NODE_FAIL, etc.) at < 1 epoch | No mid-epoch save; chunk progress is lost on every preemption. | warning | Set `every_n_train_steps: 50-250` or `train_time_interval: "00:30:00"` in place of `every_n_epochs`. Configure exactly one trigger: Lightning raises `MisconfigurationException` when more than one of the three is set. |
+| `exp_manager.max_time_per_run` ≥ SLURM walltime | SLURM SIGKILL during teardown | The internal preemption save never fires; teardown is killed mid-write. | error | Set `max_time_per_run` to `<SLURM walltime - 10min>` (e.g. `00:03:50:00` for a 4h walltime). |
+| `data.train_ds.indexes_root` | `prefetch_indexes_to_ssd.sh` destination | Mismatch → manifests fail to find their `.idx` neighbors at training time. | error | Keep both in sync. The prefetch script's default is `/tmp/idx`; the YAML's `indexes_root` must match. |
+| `submit_build_indexes.py` (no `--bypass-nvidia-hook`) | NRT cpu partition (lacks `nvidia-container-cli`) | enroot's `98-nvidia.sh` hook hard-fails container start. | error | Pass `--bypass-nvidia-hook` for any cluster whose cpu partition lacks `nvidia-container-cli`. |
+| Container `aistore` SDK < 1.17 | AIStore in play | `lhotse_resumable/lhotse/ais/batch_loader.py:75` requires `>=1.17.0`. | error | Pin `aistore>=1.17` in the build/training container preamble; `submit_build_indexes.py:227` does this. |
+| `data.train_ds.seed` | per-chunk seed rotation in launcher | Same as the `use_stateful_dataloader` × per-chunk seed rotation row above — silent model-level divergence. | error | Pin `seed` in YAML AND in launcher; both must be invariant across the chain. |
+| `pretrained_llm` change | resume from a chain | `init_from_checkpoint` resharding issues; tokenizer mismatch. | warning | Don't change the LLM mid-chain. Start fresh if you need a different LLM (+ optionally `init_from_checkpoint: <previous_run.ckpt>` for transfer). |
+| `model.aux_loss_coeff > 0` | `model.activation_checkpointing_llm: true` | AC + MoE aux-loss recompute dtype flip (debug-cluster-run §6(16)). `CheckpointError: Recomputed values ... different metadata`. Orthogonal to resumable, but a frequent recipe pitfall. | error | Set `aux_loss_coeff: 0`, OR disable `activation_checkpointing_llm` (perception AC alone is fine). |
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md b/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
new file mode 100644
index 000000000000..b4ded608bf9f
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
@@ -0,0 +1,359 @@
+# Failure-mode catalog
+
+Every failure mode observed during the speechlm-2026h1 migration to
+indexed + resumable dataloading. Each entry: **signature** (what you grep
+for in logs), **trigger** (the YAML/launcher condition that produces it),
+**fix**, and **see-also** pointers.
+
+## §1 — `.jsonl.gz` AMI shar in blend
+
+**Signature**: index build fails with
+`ValueError: <ctx> requires uncompressed JSONL or tar data, but got a compressed path: <file>.jsonl.gz`
+from `lhotse/indexing.py:130-135`.
+
+**Trigger**: blend YAML references AMI's stock distribution (Lhotse Shar
+with `cuts.*.jsonl.gz`).
+
+**Fix**: drop AMI from the blend until an uncompressed Shar export (or a
+`nemo_tarred` re-export) is available. The repo's
+`data_blends/iad/granary1p1-en-resumable.yaml` does exactly this — see its
+header comment.
+
+## §2 — `extra_fields` / `slice_length` on `nemo_tarred` entry
+
+**Signature**:
+`RuntimeError: LazyNeMoTarredIterator(indexed=True) does not support 'extra_fields' because <ctx>` from `nemo_adapters.py:485-487`,
+or
+`RuntimeError: LazyNeMoIterator(indexed=True) does not support 'extra_fields'` from `nemo_adapters.py:148-152`.
+
+**Trigger**: blend entry has `extra_fields:` block (typically attaching
+text-iter / text-sample / graph-token features to a `nemo` /
+`nemo_tarred`) or `slice_length: N`.
+
+**Fix**: pre-process the manifest offline to materialize the extra fields
+into the manifest, then drop the `extra_fields` key. For `slice_length`,
+re-shard the audio to the target slice and drop the key.
+
+## §3 — `f.tell()` on AIStore `ObjectFileReader`
+
+**Signature**: `io.UnsupportedOperation: seek` on first read of an
+`ais://` / `s3://` tar source.
+
+**Trigger**: AIStore SDK's `ObjectFileReader` doesn't implement
+`tell()` / `seek()`. The indexer uses `_CountingReader` to accumulate bytes
+manually; if your code path bypasses that, this fires.
+
+**Fix**: ensure the `aistore` SDK is installed in the container so lhotse
+routes via `AIStoreIOBackend`. The indexer's `create_jsonl_index` /
+`create_tar_index` accumulate bytes via `len(line)` and `_CountingReader`
+in `lhotse/indexing.py`. `submit_build_indexes.py:227` does the SDK install
+preamble.
+
+## §4 — `os.path.getsize(s3://…)`
+
+**Signature**: `FileNotFoundError: [Errno 2] No such file or directory: 's3://...'`
+
+**Trigger**: legacy code path computing index file size from disk for an
+`s3://` URL.
+
+**Fix**: `IndexedJsonlReader._load_index` / `IndexedTarMemberReader._load_index`
+now read the size sentinel from the `.idx` file itself for URL paths.
+Confirmed at `NeMo_resumable/nemo/collections/common/data/lhotse/indexed_adapters.py:269-294`
+(uses `np.fromfile` with `<u8` dtype; final entry is the file-size
+sentinel).
+
+## §5 — `open(s3://…)` in tar member readers
+
+**Signature**: `FileNotFoundError: [Errno 2] No such file or directory: 's3://...'`
+on first audio fetch.
+
+**Trigger**: `IndexedTarMemberReader` calling stdlib `open()` instead of
+the AIS-aware reader on a remote tar.
+
+**Fix**: `_open_data_path` at `indexed_adapters.py:159-166` returns
+`_AISRangeReader(str(path))` for any path with a `://` scheme. The
+`_AISRangeReader` translates `seek + read` into AIStore HTTP range requests.
+
+## §6 — `np.memmap` exhausts `vm.max_map_count`
+
+**Signature**: `OSError: [Errno 12] Cannot allocate memory` during
+training startup, with 80k+ shards.
+
+**Trigger**: legacy `np.memmap` per `.idx` file. With
+`vm.max_map_count = 65530` (Linux default), 80k shards × 1 mmap each
+exceeds the limit.
+
+**Fix**: switched to `np.fromfile` (resident array). Indexes are tiny
+(KB-scale per shard), so the memory cost is negligible. Confirmed at
+`indexed_adapters.py:288-294` ("Use np.fromfile (resident memory) rather
+than np.memmap so that NeMo blends with 80k+ shards don't exhaust
+vm.max_map_count").
+
+## §7 — Validation manifest with `.json` extension
+
+**Signature**: `ValueError: <ctx> path is not indexable: <file>.json`
+from `validate_indexed_access`.
+
+**Trigger**: NeMo convention to ship some manifests as `.json` (one JSON
+object per line) rather than `.jsonl`. The first version of
+`indexed_path_kind` rejected `.json`.
+
+**Fix**: `lhotse/indexing.py:99-107` now accepts both `.jsonl` and
+`.json` since the indexer only relies on newline-separated records.
+
+## §8 — ProcessPool OOM during indexing
+
+**Signature**: `concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly`
+in the build-indexes log, often after several minutes of forward progress.
+
+**Trigger**: 95 workers on a 96-cpu node + huge S3 manifests + Granary 1.1
+audio tars. The forks each load a manifest + tar header into RAM; with
+176 GiB total and 95 workers, peak per-worker RAM crosses ~1.8 GiB and
+the kernel OOM-killer fires.
+
+**Fix**: drop to `--workers 48`, or split the blend across multiple
+build-indexes invocations. `submit_build_indexes.py:99-104` defaults
+`cpus_per_task=96`, so the automatic worker count is `cpus_per_task - 1 = 95`.
+Override with `--workers 48`.
+
+## §9 — Container `nvidia-container-cli` missing on cpu partition
+
+**Signature**: enroot's `98-nvidia.sh` hook hard-fails container start;
+sbatch.log shows `nvidia-container-cli: command not found` or similar.
+
+**Trigger**: NRT cluster's `cpu` / `cpu_interactive` / `cpu_datamover`
+partitions lack `nvidia-container-cli`. IAD's cpu partition has it.
+
+**Fix**: pass `--bypass-nvidia-hook` to `submit_build_indexes.py`
+(`:122-129, 240-245`). Sets `--export=ALL,NVIDIA_VISIBLE_DEVICES=void` on
+the sbatch line, which makes enroot's hook short-circuit.
+
+## §10 — AIStore SDK `MossOut.bck` AttributeError
+
+**Signature**:
+`AttributeError: 'MossOut' object has no attribute 'bck'` in the empty-
+content retry path of `AISBatchLoader.__call__`. Cascade:
+`Error collating conversations: 'MossOut' object has no attribute 'bck'`,
+then `FallbackDataset received None`, then
+`TypeError: 'NoneType' object is not subscriptable` in
+`salm_automodel.training_step`, then DeepEP's `'unspecified launch failure'`.
+
+**Trigger**: `aistore>=1.20` (we're on 1.23.0) renames the MossIn-shaped
+`info.bck/.provider/.obj_name/.archpath` → MossOut-shaped
+`info.bucket_name/.bucket_provider/.obj_name/.archpath`. Triggered when
+the underlying object is missing on AIS and the SDK returns 200 + empty
+body, kicking the retry path that then crashes on attribute access.
+
+**Fix**: `_moss_attrs` normalizer at
+`lhotse_resumable/lhotse/ais/batch_loader.py:81` returns a 4-tuple
+`(bck, provider, obj_name, archpath)` for both shapes. Every consumer site
+must use it; raw `info.bck` references are bugs.
+
+**See also**: `agent-debug-workspace/0909-multiling-failures.md` for the
+full causal chain (multilingual Granary 1.1 audio not on iad AIS → empty
+content → retry path crash).
+
+## §11 — `shard_seed: "randomized"` + `force_map_dataset: true` + `use_stateful_dataloader: true`
+
+**Signature**: silent — no crash. Each fork re-derives a worker-PID-hashed
+seed at `worker_init_fn` time, but `StatefulDataLoader.load_state_dict`
+overrides the sampler state from the checkpoint. The mismatch produces
+non-bit-exact resume at the data level (within the saved snapshot
+window).
+
+**Trigger**: `shard_seed: "randomized"` literal in YAML, paired with
+`force_map_dataset: true` + `use_stateful_dataloader: true`.
+
+**Fix**: pin `shard_seed: <int>` (typically equal to `seed`).
+**NeMo's `dataloader.py:556-572` now warns + auto-overwrites** with the
+`seed` integer, so this is a safety net; explicit pinning in YAML keeps
+the rationale visible.
+
+## §12 — Per-chunk seed rotation in launcher
+
+**Signature**: silent (and worse than §11). On each chunk, Lightning
+calls `pl.seed_everything(run_seed)`, re-seeding Python/numpy/torch global
+RNG with a different value. Dropout, aux-loss, model random-init RNG draws
+diverge across chunks. The data-iteration level is correct (StatefulDataLoader
+wins the seed race for sampler state); the model level is not.
+
+**Trigger**: `train_and_eval.py`'s `FIXED_SEEDS[seed_offset+i]` rotation
+(pre-fix), or any launcher that picks a fresh seed per chunk
+(`seed = randint(...)`, `seed = run_idx`, etc.).
+
+**Fix**: pin a single seed for the entire chain. `train_and_eval.py:925-952`
+now does this when `--enable-indexes-prefetch` is set:
+`invariant_seed = seed if seed is not None else FIXED_SEEDS[seed_offset]`,
+and all chunks use `invariant_seed`. For arbitrary launchers, grep for
+seed-per-chunk patterns and warn.
+
+**See also**: `agent-debug-workspace/0909-longform-failures.md` Cause A
+(the original investigation).
+
+## §13 — `every_n_epochs: 1` only, no `every_n_train_steps`
+
+**Signature**: visible — only `step=N.ckpt` (where N is one
+`limit_train_batches`-aligned boundary) on disk after many hours of
+compute, with the rest of the chain producing no new checkpoints.
+
+**Trigger**: `checkpoint_callback_params.every_n_train_steps: null` AND
+`every_n_epochs: 1`. With `limit_train_batches: 1000`, 1 epoch = 1000
+steps. If chunks get preempted at ~1h before reaching the next 1000-step
+boundary, NO save happens (the preemption callback's `step=N-last.ckpt` is
+the only fallback).
+
+**Fix**: replace `every_n_epochs: 1` with `every_n_train_steps: 50-250` (or
+`train_time_interval: "00:30:00"`). Set exactly one of the three triggers:
+Lightning raises `MisconfigurationException` when they are combined.
+
+**See also**: `agent-debug-workspace/0909-longform-failures.md` Cause B.
+
+## §14 — `max_time_per_run` doesn't fire on external SIGTERM
+
+**Signature**: SLURM SIGTERM kills the job; no extra `step=N-last.ckpt`
+written; chunk progresses through 75–150 steps but loses them all on
+restart.
+
+**Trigger**: any external preemption (`svc-hwinf-cs-sched`, NODE_FAIL, QOS
+preemption, manual cancel). NeMo's `PreemptionCallback` fires only on its
+own internal timer (`max_time_per_run`).
+
+**Fix**: none for the root issue; mitigate it with frequent step/time-based
+saves (§13). Set `max_time_per_run` to `<SLURM walltime - 10min>` so the
+internal-timer save completes before SLURM SIGKILLs the teardown.
+
+## §15 — `num_workers` mismatch on resume
+
+**Signature**: torchdata `StatefulDataLoader` raises a hard error at
+`load_state_dict` time, complaining the snapshot has different
+`num_workers`.
+
+**Trigger**: chain config changes `num_workers` between chunks (e.g. saved
+under `num_workers: 4`, restored with `num_workers: 8`).
+
+**Fix**: keep `num_workers` invariant across the chain. Same rule for
+`world_size` (= `num_nodes * devices_per_node`).
+
+## §16 — AIS MOSS GetBatch returns empty content for non-replicated data
+
+**Signature**: `_inject_data_into_manifest` retries with empty content; on
+old SDK shape (`info.bck`) crashes with §10 AttributeError. With the §10
+patch in place: `Error collating conversations: <object>/<archpath> from
+bucket <provider>://<bck> returned empty content` → `FallbackDataset
+received None` → `TypeError: 'NoneType' object is not subscriptable`.
+
+**Trigger**: data path is `s3://FLEURS/tarred/<lang>/...`,
+`s3://MCV/MCV4/.../<lang>/...`, etc., and the cluster's AIS doesn't have
+that data replicated. Confirmed for non-EN multilingual on IAD AIS as of
+2026-05-09.
+
+**Fix (workaround)**: set `USE_AIS_INDIVIDUAL_GETS=true` to bypass MOSS
+GetBatch and use per-object `Object.get_reader(archive_config=...).read_all()`
+(slower but works). `lhotse_resumable/lhotse/ais/batch_loader.py:67, 218`
+implements `prefer_individual=True`.
+
+**Fix (proper)**: replicate the missing data to AIS. Quick check from
+inside an iad container:
+```python
+from aistore import Client
+import os
+c = Client(os.environ["AIS_ENDPOINT"])
+for url in ["s3://FLEURS/tarred/bg/audio_0.tar"]:
+    try:
+        print(url, c.get_object_from_url(url).head_v2().size)
+    except Exception as e:
+        print(url, "MISSING:", e)
+```
+
+**See also**: `agent-debug-workspace/0909-multiling-failures.md`.
+
+### Open investigation
+
+**Why does MOSS GetBatch return empty content for non-EN multilingual
+data?** Most likely answer: data not replicated to AIS. But worth
+confirming via the head_v2 probe above before adopting
+`USE_AIS_INDIVIDUAL_GETS` as the permanent workaround. If the data IS on
+AIS but unreadable for some other reason, a different fix is needed (and
+the lhotse fallback would benefit from raising an explicit
+`AISBatchLoaderError: object missing on AIS` instead of letting the
+empty-content path lead to a `TypeError` 6 frames down).
+
+## §17 — `indexes_root` mismatch between training YAML and prefetch script
+
+**Signature**: training startup fails with
+`FileNotFoundError: <path>/<manifest>.idx` or, in the indexed adapter,
+`ValueError: ... .idx file not found ...` from `IndexedJsonlReader._load_index`.
+
+**Trigger**: `data.train_ds.indexes_root: /tmp/idx` in YAML but
+`prefetch_indexes_to_ssd.sh` writes to `/scratch/idx`, or vice versa.
+
+**Fix**: keep both in sync. In `submit_build_indexes.py` the mirror
+defaults to `<workspace>/indexes_mirror/`; the prefetch script then pulls
+onto each node's `/tmp/idx` (the default is `/tmp/idx` per
+`prefetch_indexes_to_ssd.sh`). The training YAML's `indexes_root` must
+match the prefetch destination.
+
+## §19 — `concurrent_bucketing: true` (default) breaks resume bit-exactness
+
+**Signature**: silent. Loss curves and per-sample order across resume
+boundaries diverge from a single-run reference; no exception fires.
+Spot-check by saving `state_dict` mid-run, restoring in a fresh
+process, and asserting batches 0..K are bit-identical (the
+`MIGRATION_GUIDE.md` §3 recipe). Without the fix, you'll see byte-level
+mismatches starting from the very first restored batch.
+
+**Trigger**: any resumable training run with `force_map_dataset: true`
+and `use_stateful_dataloader: true` but `concurrent_bucketing` left at
+its default `True`.
+
+**Cause**: `DynamicBucketingSampler` spawns a daemon producer thread
+(`lhotse_resumable/lhotse/dataset/sampling/dynamic_bucketing.py:924-944`)
+that pre-pulls cuts from `self.cuts_iter` into per-bucket queues. The
+main thread is the one `StatefulDataLoader` checkpoints; the producer
+operates concurrently. At `state_dict` time, the saved cursor reflects
+the main thread's position, NOT the producer's pre-fetched cuts. On
+resume the producer is gone; its pre-fetched cuts are lost. The
+bucketing decisions and per-step batch composition diverge from the
+non-resumed run. As a side effect the same config is also not
+bit-reproducible between two fresh runs (producer scheduling is
+OS-dependent).
+
+**Fix**: set `concurrent_bucketing: false` in `data.train_ds`. NeMo
+falls through to the synchronous `_collect_cuts_in_buckets` path
+(same file, `:954-965`) which advances the iterator only from the
+main thread. Slight throughput hit during bucket warm-up; negligible
+in steady state since the bucket buffer is normally well-stocked.
+
+**Cross-refs**: `option-reference.md` `data.train_ds.concurrent_bucketing`
+row; `best-practices.md` Tier 1.
+
+---
+
+## §18 — `prefetch_indexes.py` PYTHONPATH
+
+**Signature**: `ImportError: cannot import name 'create_jsonl_index'` or
+`ModuleNotFoundError: No module named 'lhotse.indexing'` — the
+container's stock `lhotse` lacks the resumable extensions.
+
+**Trigger**: prefetch / build_indexes preamble doesn't prepend
+`lhotse_resumable/` and `NeMo_resumable/` to PYTHONPATH.
+
+**Fix**: `submit_build_indexes.py:225` does
+`export PYTHONPATH={lhotse_remote}:{code_dir}:$PYTHONPATH` before invoking
+`build_indexes.py`. Arbitrary launchers must do the same.
+
+---
+
+## Cascading symptoms (NOT root causes)
+
+Distributed failures cascade — one bad rank's exception triggers an NCCL
+timeout 30 min later that kills the rest. When the loud error is one of:
+
+- `EPException what(): 'unspecified launch failure'` at `deep_ep.cpp:155`
+- `DeepEP timeout check failed: rank=X, thread=Y, value=…`
+- `Watchdog caught collective operation timeout: WorkNCCL(...)`
+
+…look upstream for the Python traceback that fired first. The DeepEP /
+NCCL chatter is only the cascade. The 0909-multiling chains had this exact
+pattern: `TypeError: 'NoneType' object is not subscriptable` (origin) →
+DeepEP `'unspecified launch failure'` (cascade).
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md b/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
new file mode 100644
index 000000000000..4166a0536789
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
@@ -0,0 +1,96 @@
+# Option reference — every YAML/launcher field that interacts with the resumable path
+
+Exhaustive field-by-field reference. Each row gives the required value, the
+rationale, a source-code pointer, and a see-also link to MIGRATION_GUIDE.md
+and (when relevant) to the 0909-debug docs that motivated the field.
+
+## `data.train_ds` — required for the indexed + resumable path
+
+| field | required value | purpose | see also |
+|---|---|---|---|
+| `indexed` | `true` | Routes every nested `input_cfg` source to its indexed adapter (`LazyNeMoTarredIterator(indexed=True)`, `IndexedJsonlReader`, etc.). Without this flag, the streaming/replay path is used. Defined in `LhotseDataLoadingConfig` (`NeMo_resumable/nemo/collections/common/data/lhotse/dataloader.py:261`). | MIGRATION_GUIDE.md "Step 2 — Flip two flags" |
+| `use_stateful_dataloader` | `true` | Swaps PyTorch `DataLoader` → `torchdata.StatefulDataLoader` so iterator state is checkpointed in `meta.pt` under `DataModule.train_dataloader` (3 keys: `_snapshot`, `_steps_since_snapshot`, `_iterator_finished`). Verified via `inspect_meta.py` against `step=2000.ckpt` / `step=3000.ckpt` / `step=N-last.ckpt` (see `agent-debug-workspace/nano-v3-1node-resumable-tests.md`). | `dataloader.py:272`, MIGRATION_GUIDE.md "Step 2" |
+| `force_map_dataset` | `true` | Consume sources map-style (random `__getitem__`) rather than iterable. Required for full per-worker resume state — without it, the iterator-graph state isn't restorable in O(1). On the map path, cross-rank de-dup is via `rank/world_size` slicing in `DynamicBucketingSampler` (`dataloader.py:680-681` constructs the sampler with `rank=global_rank, world_size=world_size`), NOT via per-rank seed differentiation. | `dataloader.py:247-279` |
+| `indexes_root` | local SSD path (e.g. `/tmp/idx`) matching `prefetch_indexes.py` destination | Where the prefetched `.idx` mirror is read from at training time. Mirror tree preserves the data-file paths (`<indexes_root>/lustre/...` mirroring the blend's lustre paths). Resolved by `resolve_idx_path` in `NeMo_resumable/nemo/collections/common/data/lhotse/indexed_adapters.py:170`. **Must match the prefetch script's destination**, otherwise manifests fail to find their `.idx` neighbors at training time. | MIGRATION_GUIDE.md "keep indexes on a separate fast disk" |
+| `seed` | a fixed integer, **invariant across chunks** | Controls Python/numpy/torch global RNG via `pl.seed_everything(seed)` at chunk start. **MUST NOT change on resume**, otherwise dropout / aux-loss / random-init diverge across chunks even though `StatefulDataLoader.load_state_dict` restores sampler state correctly. The 0909 longform chains (see `agent-debug-workspace/0909-longform-failures.md`) hit this exact silent-corruption bug because `train_and_eval.py` rotated `FIXED_SEEDS[seed_offset+i]` per chunk. Fixed in `train_and_eval.py:925-952` — when `--enable-indexes-prefetch` is set, all chunks use the same seed. | MIGRATION_GUIDE.md "Operational constraints" §1, `0909-longform-failures.md` Cause A |
+| `shard_seed` | a fixed integer (NOT `"randomized"`) when `force_map_dataset: true` | Sampler RNG for `DynamicBucketingSampler`. On the map path, cross-rank de-dup is by index slicing (`rank=global_rank, world_size=world_size` at `dataloader.py:680-681`), so per-rank seed differentiation is unneeded. `"randomized"` is iterable-path machinery; on the map path it adds worker-PID-derived seeding that breaks across resume boundaries. **NeMo's dataloader.py auto-overwrites `shard_seed: "randomized"` → `shard_seed: <seed>` with a warning when `force_map_dataset + use_stateful_dataloader` are both true** (`dataloader.py:556-572`). The auto-overwrite is a safety net; pin it explicitly in YAML so the rationale is visible in code review. | `0909-summary.md` R2, `dataloader.py:543-572` |
+| `num_workers` | match between save and restore | `StatefulDataLoader` hard requirement: changing `num_workers` between save and restore raises a hard error from torchdata. Document the value in the YAML / launcher header. | MIGRATION_GUIDE.md "Operational constraints" §1 |
+| `concurrent_bucketing` | **`false`** when `force_map_dataset + use_stateful_dataloader` are both true | The default (`true`) spawns a `daemon=True` producer thread inside `DynamicBucketingSampler` (`lhotse_resumable/lhotse/dataset/sampling/dynamic_bucketing.py:924-944`) that pre-pulls cuts from the source iterator and fills per-bucket queues. The main thread (which `StatefulDataLoader` checkpoints) and the producer thread BOTH advance `self.cuts_iter`, so the cursor saved at `state_dict` time does NOT reflect the cuts the producer has already pre-fetched. On resume, the next-cut cursor is correct from the main thread's view but the producer's pre-fetched cuts are gone, so the bucketing/order across resume boundaries is nondeterministic. Also breaks bit-exact reproducibility between two uninterrupted runs of the same config because the producer's scheduling is OS-thread-dependent. Set `concurrent_bucketing: false` in `data.train_ds` for any resumable run. | `lhotse_resumable/lhotse/dataset/sampling/dynamic_bucketing.py:924-944`, `failure-modes.md §17`, observed in `0909-multiling-*` (2026-05-11) |
+| `force_iterable_dataset` | unset (or `false`) | Mutually exclusive with `force_map_dataset: true`. `dataloader.py:278-280` asserts `not (force_map_dataset and force_iterable_dataset)`. | `dataloader.py:278-280` |
+| `force_finite` | unset / `false` (training only) | Setting this to `true` would cap the infinite-mux behavior that training requires. Only validation_ds needs `force_finite: true`. | MIGRATION_GUIDE.md "Operational constraints" §4 |
+| `extra_fields` (on any nested `nemo` / `nemo_tarred` / `multimodal_conversation`) | unset | `LazyNeMoTarredIterator(indexed=True)` raises `RuntimeError` if `extra_fields` is set (`nemo_adapters.py:485-487`: "LazyNeMoTarredIterator(indexed=True) does not support 'extra_fields'"). Same constraint on `LazyNeMoIterator` at `nemo_adapters.py:148-152`. Pre-process the manifest offline. | `nemo_adapters.py:148-152, 485-487` |
+| `slice_length` (on any nested `nemo` / `nemo_tarred`) | unset | Slicing rewrites cuts in a way that has no stable index. The dataloader still threads `slice_length` through (`dataloader.py:253-256`, `cutset.py:413, 436, 662, 678, 693, 717, 1506, 1551`), but the indexed reader does not honor it. Pre-process offline if needed. | MIGRATION_GUIDE.md "Prerequisites" §3 |
+| compressed `.jsonl.gz` / `.tar.gz` paths | reject | `lhotse.indexing.indexed_path_kind` returns `None` for any path matching `_is_compressed_path` (`lhotse_resumable/lhotse/indexing.py:88-110`); `validate_indexed_access` raises `ValueError("...requires uncompressed JSONL or tar...")`. Re-extract or re-export with `compress_jsonl=False` for Shar. | MIGRATION_GUIDE.md "Prerequisites" §1, `lhotse/indexing.py:88-110, 130-135` |
+| `pipe:` paths (`pipe:cmd \| cmd`) | reject | Pipe commands aren't seekable. `validate_indexed_access` raises `ValueError("...requires seekable data sources...")`. | `lhotse/indexing.py:126-128` |
+| `.json` extension on a JSONL manifest | accepted | NeMo ships many ASR/SLM manifests as `*.json` (one JSON object per line). `lhotse/indexing.py:99-107` accepts both `.json` and `.jsonl` since the indexer only relies on newline-separated records. (Pretty-printed multi-line JSON would produce a bogus index, but that's not a supported NeMo manifest layout.) | `lhotse/indexing.py:99-107` |
+
+## `data.validation_ds` — finite map access required
+
+| field | required value | purpose | see also |
+|---|---|---|---|
+| `indexed` | `true` (inherited from train_ds OR per-validation set) | Same as `data.train_ds.indexed`. | MIGRATION_GUIDE.md Step 2 |
+| `force_map_dataset` | `true` | Map-style finite access. | MIGRATION_GUIDE.md Step 2 |
+| `force_finite` | `true` | **Caps the infinite-mux behavior that training uses**. Without this, validation loops forever (the multiplexer never raises StopIteration). MIGRATION_GUIDE.md "Operational constraints" §4 calls this out explicitly. | MIGRATION_GUIDE.md "Operational constraints" §4 |
+| `use_stateful_dataloader` | `false` (or `true`, doesn't matter) | Validation never resumes from mid-eval; eval is run-to-completion. Either value works. | — |
+| `indexes_root` | same path as train_ds | Same — must match the prefetch destination. | — |
+| `seed` / `shard_seed` | same fixed integers as train_ds (or any fixed value) | Determinism for eval. Doesn't need to be invariant across chunks the way the training seed does. | — |
+
+## `exp_manager` — Lightning resume contract
+
+| field | required value | purpose | see also |
+|---|---|---|---|
+| `resume_if_exists` | `true` | Lightning auto-finds the latest `step=N-last.ckpt` and loads model + optimizer + dataloader state from DCP shards + `meta.pt`. Without this, every chunk starts from scratch. | MIGRATION_GUIDE.md "Lightning resume contract" |
+| `resume_ignore_no_checkpoint` | `true` | First chunk runs without prior ckpt; without this flag, the first run errors. | — |
+| `checkpoint_callback_params.every_n_train_steps` | small int (50–250 recommended) | Mid-chunk saves so external preemption (`svc-hwinf-cs-sched`, NODE_FAIL, etc.) doesn't waste 80–150 steps of progress. The 0909 longform chains (`0909-longform-failures.md` Cause B) accumulated **0** progress past `step=1000` because the only save trigger was `every_n_epochs: 1` and chunks averaged 75–150 steps after preemption. | `0909-longform-failures.md` Cause B |
+| `checkpoint_callback_params.train_time_interval` | `"00:30:00"` (suggested) | Belt-and-braces wall-clock save trigger. Lightning ORs the per-step and per-time triggers, so both can coexist. | best-practices.md §4 |
+| `checkpoint_callback_params.every_n_epochs` | `null` or `1` | If you keep `every_n_epochs: 1`, *also* set `every_n_train_steps`; do not rely on epochs alone. | — |
+| `checkpoint_callback_params.save_top_k` | `-1` (no pruning) | Prevents Lightning from deleting old checkpoints when `monitor` doesn't fire. With `every_n_train_steps + every_n_epochs` saves you want all of them on disk. | — |
+| `max_time_per_run` | `<SLURM walltime - 10min>` | NeMo's `PreemptionCallback` fires here, leaving a 10-minute buffer for the teardown tail. **Does NOT fire on external SIGTERM** (only on its own timer) — external cancels can still lose progress. Mitigated by frequent step/time-based saves. | debug-cluster-run §6(11) |
+
+## `trainer` — Lightning + parallelism
+
+| field | constraint | purpose | see also |
+|---|---|---|---|
+| `devices` / `num_nodes` | match between save and restore | StatefulDataLoader is sensitive to `world_size`; changing it between save and restore raises a hard error. To scale a chain mid-flight you must restart from a converted HuggingFace checkpoint (no resume). | MIGRATION_GUIDE.md "Operational constraints" §1 |
+| `max_steps` | unchanged across chain | Chain semantics: each chunk advances `global_step`; `max_steps` is the chain target. Don't reduce it mid-chain or Lightning will think training is finished. | — |
+| `limit_train_batches` | usually `1000` | Defines an "epoch". With `every_n_epochs: 1` this is also the only save trigger if `every_n_train_steps` is unset. See `every_n_train_steps` above. | — |
+
+## Launcher contract — `train_and_eval.py` and equivalents
+
+| concern | requirement | purpose | see also |
+|---|---|---|---|
+| Per-chunk seed | **invariant across all chunks of a chain** when `use_stateful_dataloader: true` | StatefulDataLoader contract: model RNG must be the same on resume so dropout/aux-loss/random-init are bit-exact across chunks. The 0909 longform chains hit this with `FIXED_SEEDS[0..9]` rotation. Fixed in `train_and_eval.py:925-952`: when `--enable-indexes-prefetch` is set, `seeds = [seed_or_default] * num_runs`. The skill should grep for any `FIXED_SEEDS[i]` / `seed = randint(...)` / `seed=run_idx` patterns in arbitrary launchers and warn. | `train_and_eval.py:925-952`, `0909-longform-failures.md` Cause A |
+| Indexes prefetch preamble | every chunk's container startup runs `prefetch_indexes.py` (or the equivalent rsync) onto each node's local SSD, populating `<indexes_root>` before `salm_train.py` starts | `train_and_eval.py:577-578` does this via `prefetch_indexes_to_ssd.sh`; if missing, training reads `.idx` files from lustre on every `__getitem__` call (slow; defeats the purpose). | `train_and_eval.py:577-578` |
+| `num_workers`, `world_size` | invariant across chain | Hard requirement of StatefulDataLoader (see above). Launcher should NOT change `--num-nodes` or `--num-workers` between chunks. | MIGRATION_GUIDE.md "Operational constraints" §1 |
+| `--bypass-nvidia-hook` for cpu partitions | required on clusters whose `cpu_partition` lacks `nvidia-container-cli` (e.g. NRT) | Without it, enroot's `98-nvidia.sh` hook hard-fails the container start on cpu partitions of those clusters. Sets `--export=ALL,NVIDIA_VISIBLE_DEVICES=void` on the sbatch line. Used by `submit_build_indexes.py:122-129, 240-245` and `train_and_eval.py`. | `submit_build_indexes.py:122-129` |
+| PYTHONPATH | must include both `lhotse_resumable/` and `NeMo_resumable/` | Without it, the in-container default `lhotse` / `nemo` are loaded and lack the resumable code. `submit_build_indexes.py:225` does this; arbitrary launchers must too. | `submit_build_indexes.py:225` |
+
+## AIStore env vars
+
+| env var | required when | purpose | see also |
+|---|---|---|---|
+| `USE_AIS_GET_BATCH` | training data is on `s3://`, `ais://`, or `http(s)://` AND the cluster has `AIS_ENDPOINT` | Skip eager `IndexedTarMemberReader` per shard; defer audio fetch to AIS at sample time via `AISBatchLoader`. Read at `nemo_adapters.py:459`. | aistore-vs-non-aistore.md |
+| `USE_AIS_INDIVIDUAL_GETS` | non-EN-replicated multilingual data on AIS, or any time MOSS GetBatch returns empty content | Routes through per-object `Object.get_reader(archive_config=...).read_all()` instead of MOSS GetBatch. Slower but bypasses MOSS-side issues. `lhotse_resumable/lhotse/ais/batch_loader.py:67, 218` (the `prefer_individual` flag). | failure-modes.md §16 |
+| `AIS_ENDPOINT` | always when AIStore in play | The AIS proxy URL. IAD: `http://asr.iad.oci.aistore.nvidia.com:51080`. Set in `cluster_configs/<cluster>.yaml` under `env_vars`. | `cluster_configs/iad.yaml:31` |
+| `aistore` SDK version | ≥ 1.17 | `lhotse_resumable/lhotse/ais/batch_loader.py:75` requires `aistore>=1.17.0`. As of 2026-05-10, latest is 1.23.0. The `_moss_attrs` normalizer at `batch_loader.py:81` handles both MossIn (≤1.18) and MossOut (≥1.20) attribute namings. | `lhotse_resumable/lhotse/ais/batch_loader.py:75-89` |
+
+## Index building
+
+| concern | requirement | purpose | see also |
+|---|---|---|---|
+| Uncompressed sources only | `.jsonl` / `.tar` (NOT `.jsonl.gz` / `.tar.gz`); Shar `cuts.*.jsonl` not `cuts.*.jsonl.gz` | See `lhotse/indexing.py:88-110, 130-135`. AMI's stock distribution as `.jsonl.gz` Shar fails — drop AMI from the blend until an uncompressed export is available. | `data_blends/iad/granary1p1-en-resumable.yaml` header comment, MIGRATION_GUIDE.md "Prerequisites" §1 |
+| No `extra_fields` | every `nemo` / `nemo_tarred` / `multimodal_conversation` entry must omit `extra_fields` | `LazyNeMoTarredIterator(indexed=True)` raises explicitly. | `nemo_adapters.py:485-487` |
+| No `slice_length` | every `nemo` / `nemo_tarred` entry must omit `slice_length` | Sliced cuts have no stable index. | dataloader.py:253-256 |
+| Workers | 95 (on 96-cpu node) for 80k–400k files; 48 if OOM | Tar parsing is GIL-bound (process executor required). 96-cpu / 95-worker / `--exclusive` is the sweet spot. ProcessPool OOM signature: `concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly`. Drop to 48 workers if 95 OOMs. | failure-modes.md §8 |
+| Time | ~90 min for 80k files; ~2-3 h for 360k files | `submit_build_indexes.py:131` defaults `time_min=04:00:00`. | submit_build_indexes.py:131 |
+| Mirror destination | lustre under `<workspace>/indexes_mirror/` (writable, fast enough for prefetch source); NOT S3 | `prefetch_indexes.py` then pulls onto each node's local SSD at `/tmp/idx` (or whatever `indexes_root` resolves to). | submit_build_indexes.py:88-92, prefetch_indexes.py |
+| `aistore` SDK in builder container | required if any source is `s3://` / `ais://` | `submit_build_indexes.py:227` does `pip install --quiet --disable-pip-version-check aistore`. Without it, lhotse falls back to smart_open's AWS S3 client and fails with `io.UnsupportedOperation: seek`. Pin the SDK version range to match the lhotse code (`aistore>=1.17`). | submit_build_indexes.py:218-227 |
+| Reusability | once per blend; reuse across experiments | Already-indexed files are skipped; `--force` to rebuild. Re-runs are safe. | build_indexes.py:386 |
+
+## Cluster info
+
+| concern | requirement | purpose | see also |
+|---|---|---|---|
+| `cluster_configs/<cluster>.yaml` must exist | always | `submit_build_indexes.py` and `train_and_eval.py` read SSH creds, partition, container, env_vars from it. | TEMPLATE.yaml |
+| `nvidia-container-cli` on cpu partitions | NRT lacks it (cpu / cpu_interactive / cpu_datamover); IAD has it | If absent, use `--bypass-nvidia-hook` (sets `--export=ALL,NVIDIA_VISIBLE_DEVICES=void`). | submit_build_indexes.py:122-129 |
+| `AIS_ENDPOINT` env var | required when AIStore is the audio backend | Set in `env_vars:` block of cluster config. IAD has it; lustre-only clusters (typically NRT) won't. | cluster_configs/iad.yaml:31 |
diff --git a/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md b/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md
new file mode 100644
index 000000000000..c8db819c7fa6
--- /dev/null
+++ b/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md
@@ -0,0 +1,143 @@
+# Migration report — `<config-stem>`
+
+- **Generated**: <YYYY-MM-DD HH:MM>
+- **Source YAML**: `<path/to/source-config.yaml>`
+- **Patched YAML**: `<path/to/source-config-resumable.yaml>`
+- **Source blend** (if inspected): `<path/to/blend.yaml>`
+- **Patched blend** (if emitted): `<path/to/blend-resumable.yaml>`
+- **Launcher** (if inspected): `<path/to/launcher.py>` (or "skipped — no launcher provided")
+- **Cluster**: `<cluster>` (AIStore: yes/no)
+
+## Summary
+
+<One paragraph: what was changed, severity counts (e.g. "1 fatal, 4 errors,
+3 warnings, 2 notes"), whether the patched YAML is ready-to-launch or
+requires further user action.>
+
+## Findings
+
+### Fatal (must fix; auto-patching not possible)
+
+- _none_  — OR —
+- **`<field-path>`** (`<file>:<line>`): <one-paragraph explanation>
+  - **Current**: `<value>`
+  - **Recommended**: `<value>` (or "manual rewrite")
+  - **Why fatal**: <reason auto-patch isn't possible>
+  - **References**: [option-reference §X], [failure-modes §Y]
+
+### Errors (auto-patched; review the diff)
+
+- **`data.train_ds.indexed`** (`<file>:<line>`): <description>
+  - **Was**: `false` → **now**: `true`
+  - **Why**: <one paragraph>
+  - **References**: [option-reference §train_ds.indexed]
+
+- _(more)_
+
+### Warnings (auto-patched OR commented inline; verify intent)
+
+- **`data.train_ds.shard_seed`** (`<file>:<line>`): <description>
+  - **Was**: `"randomized"` → **now**: `42`
+  - **Why**: NeMo `dataloader.py` would auto-overwrite at runtime with a
+    `WARNING` log; pinning at the YAML layer makes intent obvious to
+    reviewers and avoids the runtime warning.
+  - **References**: [conflict-matrix row 3], [failure-modes §11]
+
+- _(more)_
+
+### Notes (informational; no patch)
+
+- **`data.validation_ds.use_stateful_dataloader`** (`<file>:<line>`):
+  Not strictly required for validation (eval doesn't checkpoint), but
+  setting it `false` matches the working 1-node smoke recipe. No change
+  needed.
+
+- _(more)_
+
+## Cross-cuts
+
+### Data blend audit
+
+<Drop in `references/failure-modes.md` §1 / §2 callouts: blend entries with
+`.jsonl.gz`, `.tar.gz`, `extra_fields`, `slice_length`. List the entries
+that were removed from the patched blend and the per-entry rationale.>
+
+| corpus | reason for exclusion | upstream fix |
+|---|---|---|
+| ami | `cuts: *.jsonl.gz` (compressed Lhotse Shar) | re-export with `compress_jsonl=False` OR convert to `nemo_tarred` |
+| _(more)_ | _(more)_ | _(more)_ |
+
+### Launcher review
+
+<If launcher script provided: list grep findings. Otherwise: "skipped".>
+
+- **Per-chunk seed rotation**: <not detected | DETECTED at `<file>:<line>` —
+  the launcher pulls from a FIXED_SEEDS-like array; this MUST be pinned
+  to a single value when the resumable path is on. See
+  `failure-modes.md §12`. Manual fix required.>
+- **Prefetch preamble wired**: <yes / NO — `--enable-indexes-prefetch`
+  flag not set; manual addition needed. See `option-reference.md §Launcher
+  contract`.>
+- **`--bypass-nvidia-hook`**: <not needed | needed for `<cluster>` cpu
+  partition — see `failure-modes.md §9`>
+
+### AIStore vs lustre
+
+<One paragraph: which workflow this migration follows (per
+`aistore-vs-non-aistore.md` decision tree), and any cross-cluster
+caveats.>
+
+## Patched output diff
+
+### `<config>.yaml` → `<config>-resumable.yaml`
+
+```diff
+-  data.train_ds:
+-    indexed: false
+-    use_stateful_dataloader: false
+-    shard_seed: "randomized"
++  data.train_ds:
++    indexed: true
++    use_stateful_dataloader: true
++    force_map_dataset: true
++    indexes_root: /tmp/idx
++    shard_seed: 42  # NOTE: pinned for StatefulDataLoader resume; see
++                    # MIGRATION_GUIDE.md §"Operational constraints"
+```
+
+_(full diff inline)_
+
+### `<blend>.yaml` → `<blend>-resumable.yaml`
+
+```diff
+-  - corpus: ami
+-    shar_path:
+-      cuts: s3://AMI/lhotse_shar/cuts._OP_*_CL_.jsonl.gz
+-    type: lhotse_shar
+-    weight: 0.2
++  # AMI dropped — Lhotse Shar `cuts.*.jsonl.gz` cannot be indexed
++  # (uncompressed sources only). Re-export with `compress_jsonl=False`
++  # or convert to `nemo_tarred` to re-include.
+```
+
+_(full diff inline)_
+
+## Pre-flight checklist
+
+See `pre-flight-checklist.md` next to this report. The TL;DR:
+
+1. Build indexes via the generated `build-indexes-cmd.sh`.
+2. Run the `MIGRATION_GUIDE.md §3` bit-exact verification once on this
+   recipe.
+3. Confirm `aistore` SDK present in the container (AIStore workflow only).
+4. 1-node single-chunk → 1-node multi-chunk → full N-node smoke ladder.
+5. Submit the real run.
+
+## References
+
+- `MIGRATION_GUIDE.md` (repo root): canonical migration walkthrough.
+- `references/option-reference.md`: every YAML field, every flag.
+- `references/conflict-matrix.md`: option pairs that conflict.
+- `references/failure-modes.md`: 18-entry failure-mode catalog.
+- `references/best-practices.md`: prioritised checklist.
+- `references/aistore-vs-non-aistore.md`: workflow selection.

From 31de72733dabab3365b5a754a88ab48fc4b82c2f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Tue, 12 May 2026 15:53:09 -0400
Subject: [PATCH 09/30] iterable+indexed glue; ais_force_individual rename;
 skill doc updates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

dataloader.py: _maybe_init_main_process_for_iterable eagerly calls
worker_init_fn(0, ...) when num_workers=0 so the iterable path with
no worker subprocesses still gets DP×worker partition wired up.
Wired into both get_lhotse_dataloader_from_{single,multi}_config
iterable branches.

ASR + SpeechLM2 dataset wrappers: ais_prefer_individual ->
ais_force_individual; tracks the lhotse-side rename. The
USE_AIS_INDIVIDUAL_GETS env-var contract is unchanged.

Skill docs updated for: the rename, the new byte-range shar_ptr
fallback in AISBatchLoader, iterable-mode partition semantics, the
0-byte .idx race repro + atomic _write_index fix, and the
prefetch-manifests / prefetch-indexes co-design (merged into the
manifest prefetch script so .idx sidecars land at the resolve_idx_path
slot the rewritten YAML will look up).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../migrate-to-resumable-dataloader/SKILL.md  |  28 ++++-
 .../references/best-practices.md              |  43 +++++++
 .../references/conflict-matrix.md             |   4 +
 .../references/failure-modes.md               | 106 +++++++++++++++++-
 .../references/option-reference.md            |  21 +++-
 .../templates/migration-report.md             |  33 +++++-
 .../asr/data/audio_to_text_lhotse.py          |   4 +-
 .../asr/data/audio_to_text_lhotse_prompted.py |   4 +-
 .../common/data/lhotse/dataloader.py          |  15 +++
 .../speechlm2/data/salm_dataset.py            |   2 +-
 10 files changed, 242 insertions(+), 18 deletions(-)

diff --git a/.claude/skills/migrate-to-resumable-dataloader/SKILL.md b/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
index 76b9a21cac87..8c0d10e27b80 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
@@ -8,11 +8,28 @@ argument-hint: '<config.yaml> [launcher.py] [blend.yaml] [--cluster=<name>]'
 
 The repo's resumable path (replacing the streaming/replay loader with O(1)
 checkpoint-restore via `torchdata.StatefulDataLoader` + `.idx` sidecars) has
-~15 distinct ways to silently corrupt or hard-fail. This skill runs every
+~20 distinct ways to silently corrupt or hard-fail. This skill runs every
 one of those checks against a concrete YAML, auto-patches what it can, and
 emits a teaching-style migration report so the user understands every
 decision and the user-only steps to run before launching.
 
+**Map-style vs iterable-style for indexed sources.** The resumable path supports
+two dedup modes:
+
+1. **`force_map_dataset: true`** (default; safest) — sampler runs in the main
+   GPU process, over-samples `world_size` batches per step, and discards
+   `world_size - 1` of them. Works for any source type. Costs `W×` redundant
+   sampler/manifest I/O per step.
+2. **`force_map_dataset: false`** (optimization for indexed-only configs at
+   high `world_size`) — sampler runs co-located with the dataset inside CPU
+   worker subprocesses; sample indices are partitioned across
+   `(DP rank × DataLoader worker)` via `LazyShuffledRange(shard_id, num_shards)`
+   so each shard yields a disjoint slice. Resolved at iteration time via the
+   `LHOTSE_USE_WORKER_PARTITION` env-var signal that `worker_init_fn` sets.
+   Eliminates the `W×` redundant work; near-`W×` step-time improvement at
+   scale. **Requires all sources to be indexed** (or use other dedup
+   mechanisms — see `references/failure-modes.md` §20-§23).
+
 ## When to apply
 
 Trigger phrases listed in the frontmatter. Three common entry modes:
@@ -214,13 +231,16 @@ launcher).
   sections of that doc.
 - **Cross-check against the actual code** at:
   - `lhotse_resumable/lhotse/serialization.py` (`open_best`, AIStore backend, MSC backend)
-  - `lhotse_resumable/lhotse/indexing.py` (`create_jsonl_index`, `create_tar_index`, `indexed_path_kind`, `IndexedJsonlReader`, `read_index`)
-  - `lhotse_resumable/lhotse/ais/batch_loader.py` (`AISBatchLoader`, `prefer_individual`, `_moss_attrs`)
+  - `lhotse_resumable/lhotse/indexing.py` (`create_jsonl_index`, `create_tar_index`, `indexed_path_kind`, `IndexedJsonlReader`, `read_index`, `LazyShuffledRange` with `(shard_id, num_shards)` partition)
+  - `lhotse_resumable/lhotse/lazy.py` (`LazyIndexedManifestIterator.__iter__` defers `LazyShuffledRange` construction to resolve partition at iter time; `LazyIteratorChain._iter_globally_shuffled` partitions the combined range; `LazyIteratorMultiplexer.__iter__` rejects `seed='randomized'` under multi-shard partition)
+  - `lhotse_resumable/lhotse/dataset/dataloading.py` (`worker_init_fn` sets the `LHOTSE_USE_WORKER_PARTITION` signal; `get_worker_partition()` returns the trivial `(0, 1)` when that signal is absent — keeps map-style mode unaffected even under torchrun)
+  - `lhotse_resumable/lhotse/ais/batch_loader.py` (`AISBatchLoader`, `force_individual`, byte-range `shar_ptr` fallback, `_moss_attrs`)
   - `lhotse_resumable/lhotse/dataset/input_strategies.py` (`AudioSamples`)
   - `NeMo_resumable/nemo/collections/common/data/lhotse/indexed_adapters.py` (`IndexedTarMemberReader`, `_AISRangeReader`, `_CountingReader`, `_open_data_path`, `_load_index`, `resolve_idx_path`)
-  - `NeMo_resumable/nemo/collections/common/data/lhotse/dataloader.py` (`get_lhotse_sampler_from_config`, `get_lhotse_dataloader_from_config`, `force_map_dataset` handling, the auto-overwrite of `shard_seed`)
+  - `NeMo_resumable/nemo/collections/common/data/lhotse/dataloader.py` (`get_lhotse_sampler_from_config`, `get_lhotse_dataloader_from_config`, `force_map_dataset` handling, the auto-overwrite of `shard_seed`, `_maybe_init_main_process_for_iterable` for `num_workers=0` eager `worker_init_fn` call)
   - `NeMo_resumable/nemo/collections/common/data/lhotse/nemo_adapters.py` (`LazyNeMoTarredIterator`, `_init_indexed`, `_iter_batch_for_ais_get_batch`, `USE_AIS_GET_BATCH` gate)
   - `NeMo_resumable/scripts/dataloading/build_indexes.py` and `prefetch_indexes.py`
+  - `lhotse_resumable/test/test_partition.py` (49 tests pinning every partition edge case: map-style regression, empty/tiny manifests, composition with shuffler/mapper/filter/repeater, multiplexer state-dict roundtrip, chain topology mismatch, etc.)
 - **Cross-check against today's debug docs** at:
   - `agent-debug-workspace/0909-summary.md`
   - `agent-debug-workspace/0909-multiling-failures.md`
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md b/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
index ff3dce7ae572..dbc25c40e6cb 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
@@ -52,6 +52,34 @@ real-world adoption pain. Apply these before sweeping any new recipe.
    asserts equal. Catches sampler/bucketer state-dict bugs that schema
    inspection of `meta.pt` (just confirming the keys exist) won't.
 
+5b. **Consider `force_map_dataset: false` for indexed-only configs at high
+   `world_size` (≥ 16-ish).** The default map-style path over-samples
+   `world_size` batches per step and discards `world_size - 1` — at 32 DP
+   ranks that's 32× redundant sampler/manifest I/O on the main GPU
+   process per step, and the 0909 profiling showed it nearly doubling
+   training step time. The iterable path co-locates the sampler with the
+   dataset inside CPU worker subprocesses and partitions sample indices
+   across `(DP rank × DataLoader worker)` via `LazyShuffledRange(shard_id,
+   num_shards)`. Single partition level, no double-counting; near-`W×`
+   step-time improvement at scale.
+
+   Preconditions before flipping:
+   - Every nested `input_cfg` source must be indexed (no plain
+     `LazyJsonlIterator` / `LazyManifestIterator` in the chain — see
+     `failure-modes.md §21`).
+   - Every `LazyIteratorMultiplexer.seed` is a fixed integer
+     (`shard_seed` typically pins this). `seed='randomized'` raises a
+     loud `ValueError` at iter time under multi-shard partition
+     (§22).
+   - `(world_size, num_workers)` invariant across the chain (§23).
+   - Validation still uses `force_map_dataset: true` (small, finite,
+     no perf benefit from partitioning).
+
+   The 0909 sweep flipped `0909-longform5pct.yaml` as a canary;
+   `force_map_dataset: true` remains the safe default for any config
+   that mixes indexed + non-indexed sources or that you haven't
+   profiled yet.
+
 6. **Pick exactly ONE checkpoint trigger** in
    `exp_manager.checkpoint_callback_params` — `every_n_train_steps`,
    `every_n_epochs`, OR `train_time_interval`. Lightning's
@@ -139,3 +167,18 @@ real-world adoption pain. Apply these before sweeping any new recipe.
   YAML says `indexes_root: /tmp/idx` and the prefetch script writes to
   `/tmp/idx2`, training silently can't find any index, falls back to
   building on first access (slow).
+
+- **Don't flip to `force_map_dataset: false` without auditing every
+  source in the chain.** A single non-indexed source (plain
+  `LazyJsonlIterator`, `LazyManifestIterator`, compressed Shar that
+  silently fell back, etc.) yields its full content on every rank under
+  iterable mode — silent data duplication that won't show up until you
+  inspect cut-ID coverage across ranks. See `failure-modes.md §21`. When
+  in doubt, keep `force_map_dataset: true`; the over-sample-and-discard
+  dedup works regardless of source type.
+
+- **Don't set `LHOTSE_USE_WORKER_PARTITION` manually.** It's a signal
+  set by `worker_init_fn` to indicate iterable-mode partition is active.
+  Setting it from outside (e.g. in a launcher script or `.env` file)
+  while running map-style mode would re-introduce the under-sampling bug
+  described in `failure-modes.md §20`.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md b/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
index 662023ee6eca..d17f60d44cdd 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
@@ -31,3 +31,7 @@ Severities:
 | `data.train_ds.seed` | per-chunk seed rotation in launcher | Same as above — silent model-level divergence. | error | Pin `seed` in YAML AND in launcher; both must be invariant across the chain. |
 | `pretrained_llm` change | resume from a chain | `init_from_checkpoint` resharding issues; tokenizer mismatch. | warning | Don't change the LLM mid-chain. Start fresh if you need a different LLM (+ optionally `init_from_checkpoint: <previous_run.ckpt>` for transfer). |
 | `model.aux_loss_coeff > 0` | `model.activation_checkpointing_llm: true` | AC + MoE aux-loss recompute dtype flip (debug-cluster-run §6(16)). `CheckpointError: Recomputed values ... different metadata`. Orthogonal to resumable, but a frequent recipe pitfall. | error | Set `aux_loss_coeff: 0`, OR disable `activation_checkpointing_llm` (perception AC alone is fine). |
+| `data.train_ds.force_map_dataset: false` | `LazyIteratorMultiplexer(seed="randomized")` anywhere in the chain | Under iterable-mode partition all ranks must pick the same source at each multiplex step (else the global weighted distribution drifts). `LazyIteratorMultiplexer.__iter__` raises `ValueError` at iter time. | error | Use a fixed integer seed for every multiplexer in the chain (`shard_seed` is typically the one to pin). |
+| `data.train_ds.force_map_dataset: false` | non-indexed sources in the chain (plain `LazyJsonlIterator`, `LazyManifestIterator`, anything `is_indexed=False`) | Non-indexed sources don't partition — every rank reads them in full, silently duplicating data. The chain's `is_indexed` is `False` when any source is non-indexed, so `_iter_globally_shuffled` won't fire either. | warning | (a) Convert the non-indexed sources to indexed (rebuild with `submit_build_indexes.py`); OR (b) split into separate dataloaders; OR (c) revert to `force_map_dataset: true` for that config. |
+| `data.train_ds.force_map_dataset: false` | resume with different `(world_size, num_workers)` | `LazyShuffledRange.load_state_dict` validates the full topology including `shard_id` and `num_shards`. Mismatch raises `ValueError` loudly. Same contract as map-style StatefulDataLoader, but the check fires at the iterator level. | error | Keep `(world_size, num_workers)` invariant across the chain. To scale, restart from a converted HuggingFace checkpoint (no resume). |
+| `data.train_ds.force_map_dataset: false` | `force_iterable_dataset: true` | Redundant but not an error — `dataloader.py:280` resolves to iterable in both cases; only `force_map_dataset=True` overrides. The `assert not (force_map_dataset and force_iterable_dataset)` still holds. | note | Pick one form. `force_map_dataset: false` is sufficient; setting both is just noise. |
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md b/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
index b4ded608bf9f..849929f2881e 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
@@ -249,8 +249,8 @@ that data replicated. Confirmed for non-EN multilingual on IAD AIS as of
 
 **Fix (workaround)**: set `USE_AIS_INDIVIDUAL_GETS=true` to bypass MOSS
 GetBatch and use per-object `Object.get_reader(archive_config=...).read_all()`
-(slower but works). `lhotse_resumable/lhotse/ais/batch_loader.py:67, 218`
-implements `prefer_individual=True`.
+(slower but works). `lhotse_resumable/lhotse/ais/batch_loader.py` implements
+this via the `force_individual=True` ctor arg.
 
 **Fix (proper)**: replicate the missing data to AIS. Quick check from
 inside an iad container:
@@ -329,6 +329,108 @@ row; `best-practices.md` Tier 1.
 
 ---
 
+## §20 — Iterable mode (`force_map_dataset: false`) silent under-sampling when partition signal missing
+
+**Signature**: silent. Step time looks normal but training runs through far
+fewer data points than expected; loss curves are wrong (each rank ends up
+training on a sliver of its already-sliced shard). Inspect with: take a
+fresh process under torchrun (so RANK/WORLD_SIZE are set), construct a
+`LazyIndexedManifestIterator(...)` directly without going through NeMo's
+dataloader (or with `worker_init_fn` somehow not running), and assert
+`len(list(iter(it))) == n`. Pre-fix this returned `n / world_size`.
+
+**Trigger**: previously (before the env-var signal was added), `LazyIndexedManifestIterator.__iter__`
+called `get_worker_partition()` which read RANK/WORLD_SIZE directly. Under
+torchrun, those env vars are set in the main process even in map-style mode
+— so the iterator applied partition even though the sampler was about to
+over-sample-and-discard, causing 1/world_size² effective coverage per rank.
+
+**Fix**: `worker_init_fn` now sets `LHOTSE_USE_WORKER_PARTITION=1`, and
+`get_worker_partition()` returns the trivial `(0, 1)` partition when that
+flag is absent. Map-style mode never calls `worker_init_fn`, so the flag
+stays unset and partition is bypassed. For iterable mode the NeMo dataloader
+passes `worker_init_fn` to the DataLoader (when `num_workers > 0`) or calls
+it eagerly via `_maybe_init_main_process_for_iterable()` (when `num_workers == 0`).
+
+**Cross-refs**: `lhotse_resumable/lhotse/dataset/dataloading.py:22` (constant
+definition), `:82` (set in `worker_init_fn`), `:139-170` (`get_worker_partition`
+checks the flag, returns `(0, 1)` if unset);
+`lhotse_resumable/test/test_partition.py::test_map_style_path_yields_all_items_under_torchrun`
+pins the regression.
+
+---
+
+## §21 — Iterable mode with non-indexed source in the chain → silent duplication
+
+**Signature**: silent. Each rank reads the non-indexed source(s) in full.
+Inspect with the bit-exact verification recipe (`MIGRATION_GUIDE.md §3`) on
+a config containing a mixed-indexed chain — items from the non-indexed
+source(s) show up on every rank.
+
+**Trigger**: `force_map_dataset: false` plus a `LazyIteratorChain` mixing
+`LazyIndexedManifestIterator` (indexed) with `LazyJsonlIterator` /
+`LazyManifestIterator` (non-indexed). The chain's `is_indexed` is `False`
+when any source is non-indexed, so the chain falls back to
+`_iter_sequential` which delegates to each source's `__iter__`. Indexed
+sources partition themselves; non-indexed ones don't.
+
+**Fix**: either (a) convert the non-indexed sources to indexed via
+`submit_build_indexes.py`; (b) split the non-indexed sources into a separate
+dataloader; (c) revert to `force_map_dataset: true` for this config — its
+over-sample-and-discard dedup works regardless of source type.
+
+**Cross-refs**: `lhotse_resumable/test/test_partition.py::test_chain_mixed_indexed_non_indexed_only_indexed_partitions`
+pins the documented behaviour.
+
+---
+
+## §22 — Iterable mode + `LazyIteratorMultiplexer(seed="randomized")`
+
+**Signature**: loud `ValueError: LazyIteratorMultiplexer cannot use
+seed='randomized' under multi-shard (DP rank x DataLoader worker)
+iteration: each shard would draw a different RNG state and pick a different
+source at the same step, causing the global weighted source distribution to
+drift across ranks. Use a fixed integer seed.` from
+`lhotse_resumable/lhotse/lazy.py:960-970`.
+
+**Trigger**: `force_map_dataset: false` and a multiplexer somewhere in the
+iteration graph has `seed='randomized'` (or unset and inheriting the
+default randomized seed propagation).
+
+**Fix**: pin the multiplexer's `seed` (or the top-level `shard_seed` that
+flows in) to a fixed integer. Map-style mode is unaffected since partition
+collapses to `(0, 1)` and the `ValueError` check never fires.
+
+**Cross-refs**: `lhotse_resumable/test/test_partition.py::test_multiplexer_rejects_randomized_seed_under_multishard`
+and `test_multiplexer_allows_randomized_seed_single_shard`.
+
+---
+
+## §23 — Iterable mode resume topology mismatch
+
+**Signature**: loud `ValueError: LazyShuffledRange state mismatch: expected
+n=…, seed=…, shard_id=…, num_shards=…; got … Resuming with a different
+DP/worker topology is not supported — drop dataloader state if the topology
+changed.` from `lhotse_resumable/lhotse/indexing.py:507-540`. For chains
+under global shuffle: `ValueError: LazyIteratorChain global-shuffle
+partition mismatch on resume: ...`.
+
+**Trigger**: a chunk saved with `(world_size=W1, num_workers=NW1)` is
+restored under `(world_size=W2, num_workers=NW2)` where
+`W1 * NW1 != W2 * NW2` or the rank/worker_id assignment differs. Common
+sources: launcher changed `--num-nodes` or `--num-workers` between chunks,
+or elastic-cluster behaviour silently re-shuffled ranks.
+
+**Fix**: keep `(world_size, num_workers)` invariant across the chain — same
+hard contract as map-style `StatefulDataLoader` (which raises analogously).
+If you must scale, restart from a converted HuggingFace checkpoint (no
+resume of dataloader state).
+
+**Cross-refs**: `lhotse_resumable/test/test_partition.py::test_chain_globally_shuffled_topology_mismatch_on_resume`
+and `test_indexed_manifest_iterator_partition_resume_topology_mismatch_raises`.
+
+---
+
 ## §18 — `prefetch_indexes.py` PYTHONPATH
 
 **Signature**: `ImportError: cannot import name 'create_jsonl_index'` or
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md b/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
index 4166a0536789..900bb13b968f 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
@@ -10,10 +10,10 @@ pointer, see-also link to MIGRATION_GUIDE.md and (when relevant) to the
 |---|---|---|---|
 | `indexed` | `true` | Routes every nested `input_cfg` source to its indexed adapter (`LazyNeMoTarredIterator(indexed=True)`, `IndexedJsonlReader`, etc.). Without this flag, the streaming/replay path is used. Defined in `LhotseDataLoadingConfig` (`NeMo_resumable/nemo/collections/common/data/lhotse/dataloader.py:261`). | MIGRATION_GUIDE.md "Step 2 — Flip two flags" |
 | `use_stateful_dataloader` | `true` | Swaps PyTorch `DataLoader` → `torchdata.StatefulDataLoader` so iterator state is checkpointed in `meta.pt` under `DataModule.train_dataloader` (3 keys: `_snapshot`, `_steps_since_snapshot`, `_iterator_finished`). Verified via `inspect_meta.py` against `step=2000.ckpt` / `step=3000.ckpt` / `step=N-last.ckpt` (see `agent-debug-workspace/nano-v3-1node-resumable-tests.md`). | `dataloader.py:272`, MIGRATION_GUIDE.md "Step 2" |
-| `force_map_dataset` | `true` | Consume sources map-style (random `__getitem__`) rather than iterable. Required for full per-worker resume state — without it, the iterator-graph state isn't restorable in O(1). On the map path, cross-rank de-dup is via `rank/world_size` slicing in `DynamicBucketingSampler` (`dataloader.py:680-681` constructs the sampler with `rank=global_rank, world_size=world_size`), NOT via per-rank seed differentiation. | `dataloader.py:247-279` |
+| `force_map_dataset` | `true` (safe default) **OR** `false` (optimization for indexed-only configs at high `world_size`) | Two viable modes. **`true`**: sampler runs in the main GPU process; cross-rank dedup is over-sample-and-discard inside `DynamicBucketingSampler` (sampler generates `world_size` batches per step, picks `batches[rank]`, discards the rest). Works for any source type. Costs `W×` redundant sampler/manifest reads per step. **`false`**: sampler runs co-located with the dataset inside CPU worker subprocesses (`IterableDatasetWrapper`); sample indices are partitioned across `(DP rank × DataLoader worker)` via `LazyShuffledRange(shard_id, num_shards)`. Eliminates the `W×` redundant work — near-`W×` step-time improvement at scale. **Requires every source to be indexed** (lhotse-indexed JSONL, nemo_tarred with indexed mode, etc.); non-indexed sources mixed into the chain are NOT deduplicated and may be silently duplicated across ranks. The partition is gated by the `LHOTSE_USE_WORKER_PARTITION` env var that `worker_init_fn` sets (and `dataloader.py:_maybe_init_main_process_for_iterable` sets eagerly for the `num_workers=0` case). | `dataloader.py:247-279`, `lhotse_resumable/lhotse/indexing.py:396-571` (`LazyShuffledRange` with `(shard_id, num_shards)`; constructor L423, `state_dict` L497, `load_state_dict` L507 validates topology), `lhotse_resumable/lhotse/lazy.py:548+` (`LazyIndexedManifestIterator.__iter__` at L606), `failure-modes.md §20-§23` |
 | `indexes_root` | local SSD path (e.g. `/tmp/idx`) matching `prefetch_indexes.py` destination | Where the prefetched `.idx` mirror is read from at training time. Mirror tree preserves the data-file paths (`<indexes_root>/lustre/...` mirroring the blend's lustre paths). Resolved by `resolve_idx_path` in `NeMo_resumable/nemo/collections/common/data/lhotse/indexed_adapters.py:170`. **Must match the prefetch script's destination**, otherwise manifests fail to find their `.idx` neighbors at training time. | MIGRATION_GUIDE.md "keep indexes on a separate fast disk" |
 | `seed` | a fixed integer, **invariant across chunks** | Controls Python/numpy/torch global RNG via `pl.seed_everything(seed)` at chunk start. **MUST NOT change on resume**, otherwise dropout / aux-loss / random-init diverge across chunks even though `StatefulDataLoader.load_state_dict` restores sampler state correctly. The 0909 longform chains (see `agent-debug-workspace/0909-longform-failures.md`) hit this exact silent-corruption bug because `train_and_eval.py` rotated `FIXED_SEEDS[seed_offset+i]` per chunk. Fixed in `train_and_eval.py:925-952` — when `--enable-indexes-prefetch` is set, all chunks use the same seed. | MIGRATION_GUIDE.md "Operational constraints" §1, `0909-longform-failures.md` Cause A |
-| `shard_seed` | a fixed integer (NOT `"randomized"`) when `force_map_dataset: true` | Sampler RNG for `DynamicBucketingSampler`. On the map path, cross-rank de-dup is by index slicing (`rank=global_rank, world_size=world_size` at `dataloader.py:680-681`), so per-rank seed differentiation is unneeded. `"randomized"` is iterable-path machinery; on the map path it adds worker-PID-derived seeding that breaks across resume boundaries. **NeMo's dataloader.py auto-overwrites `shard_seed: "randomized"` → `shard_seed: <seed>` with a warning when `force_map_dataset + use_stateful_dataloader` are both true** (`dataloader.py:556-572`). The auto-overwrite is a safety net; pin it explicitly in YAML so the rationale is visible in code review. | `0909-summary.md` R2, `dataloader.py:543-572` |
+| `shard_seed` | a fixed integer (NOT `"randomized"`) under either `force_map_dataset` value | Sampler RNG for `DynamicBucketingSampler`. **Map path**: cross-rank dedup is over-sample-and-discard inside the sampler (constructed with `rank=global_rank, world_size=world_size` at `dataloader.py:680-681`); per-rank seed differentiation is unneeded, and `"randomized"` adds worker-PID-derived seeding that breaks across resume boundaries. NeMo's `dataloader.py:556-572` auto-overwrites `shard_seed: "randomized"` → `shard_seed: <seed>` with a warning when `force_map_dataset + use_stateful_dataloader` are both true. **Iterable path** (`force_map_dataset: false`): the multiplexer inside the sampler graph (`LazyIteratorMultiplexer`) requires all DP ranks to pick the same source at each multiplex step so the global weighted source distribution stays coherent. `seed='randomized'` would derive a different per-(rank, worker) seed and break this — `LazyIteratorMultiplexer.__iter__` (`lhotse_resumable/lhotse/lazy.py:960-970`) raises `ValueError` if `seed='randomized'` under multi-shard partition. Either mode: pin `shard_seed: <int>` explicitly in YAML. | `0909-summary.md` R2, `dataloader.py:543-572`, `failure-modes.md §22` |
 | `num_workers` | match between save and restore | `StatefulDataLoader` hard requirement: changing `num_workers` between save and restore raises a hard error from torchdata. Document the value in the YAML / launcher header. | MIGRATION_GUIDE.md "Operational constraints" §1 |
 | `concurrent_bucketing` | **`false`** when `force_map_dataset + use_stateful_dataloader` are both true | The default (`true`) spawns a `daemon=True` producer thread inside `DynamicBucketingSampler` (`lhotse_resumable/lhotse/dataset/sampling/dynamic_bucketing.py:924-944`) that pre-pulls cuts from the source iterator and fills per-bucket queues. The main thread (which `StatefulDataLoader` checkpoints) and the producer thread BOTH advance `self.cuts_iter`, so the cursor saved at `state_dict` time does NOT reflect the cuts the producer has already pre-fetched. On resume, the next-cut cursor is correct from the main thread's view but the producer's pre-fetched cuts are gone, so the bucketing/order across resume boundaries is nondeterministic. Also breaks single-run bit-exact reproducibility between two runs of the same config because the producer's scheduling is OS-thread-dependent. Set `concurrent_bucketing: false` in `data.train_ds` for any resumable run. | `lhotse_resumable/lhotse/dataset/sampling/dynamic_bucketing.py:924-944`, `failure-modes.md §<new>`, observed in `0909-multiling-*` (2026-05-11) |
 | `force_iterable_dataset` | unset (or `false`) | Mutually exclusive with `force_map_dataset: true`. `dataloader.py:278-280` asserts `not (force_map_dataset and force_iterable_dataset)`. | `dataloader.py:278-280` |
@@ -24,6 +24,21 @@ pointer, see-also link to MIGRATION_GUIDE.md and (when relevant) to the
 | `pipe:` paths (`pipe:cmd \| cmd`) | reject | Pipe commands aren't seekable. `validate_indexed_access` raises `ValueError("...requires seekable data sources...")`. | `lhotse/indexing.py:126-128` |
 | `.json` extension on a JSONL manifest | accepted | NeMo ships many ASR/SLM manifests as `*.json` (one JSON object per line). `lhotse/indexing.py:99-107` accepts both `.json` and `.jsonl` since the indexer only relies on newline-separated records. (Pretty-printed multi-line JSON would produce a bogus index, but that's not a supported NeMo manifest layout.) | `lhotse/indexing.py:99-107` |
 
+## Iterable-mode partition — only when `force_map_dataset: false`
+
+These concerns only apply when you've opted into the iterable path for
+indexed sources. Skip this whole section if you've kept the default
+`force_map_dataset: true`.
+
+| concern | requirement | purpose | see also |
+|---|---|---|---|
+| `LHOTSE_USE_WORKER_PARTITION` env var | set automatically by `worker_init_fn`; never set manually | Signals to `get_worker_partition()` that worker-level partition is active. In iterable mode NeMo passes `worker_init_fn` to the DataLoader (when `num_workers > 0`) or calls it eagerly in `_maybe_init_main_process_for_iterable()` (when `num_workers == 0`). Map-style mode never calls `worker_init_fn`, so the signal stays unset and partition collapses to `(0, 1)` — this is what keeps the map-style path correct under torchrun (where RANK/WORLD_SIZE are already in env). | `lhotse_resumable/lhotse/dataset/dataloading.py:22` (constant), L82 (set in `worker_init_fn`), L139-170 (`get_worker_partition`), `failure-modes.md §20` |
+| All sources in the iteration graph are indexed | required | The partition is implemented in `LazyShuffledRange` and reaches `LazyIndexedManifestIterator` and `LazyIteratorChain._iter_globally_shuffled`. Non-indexed sources (plain `LazyJsonlIterator`, `LazyManifestIterator`) do NOT partition; they yield all items on every rank. If the chain mixes indexed + non-indexed sources, the non-indexed parts are duplicated across ranks — silently. Inspect every nested input_cfg entry to confirm it lands on an indexed adapter (`indexed: true` cascades, but compressed paths / `pipe:` paths / certain blends fall back to non-indexed). | `lhotse_resumable/lhotse/lazy.py` (`LazyShuffler` / `LazyMapper` / `LazyFilter` / `LazyRepeater` all delegate to the source's `__iter__`, so partition propagates), `failure-modes.md §21` |
+| `LazyIteratorMultiplexer.seed` | fixed integer (NOT `"randomized"`) | Under multi-shard partition, all ranks must pick the same source at each step (else the global weighted source distribution drifts across ranks). The multiplexer enforces this at iter time by raising `ValueError`. Map-style mode is unaffected (partition is `(0, 1)` so the check never fires). | `lhotse_resumable/lhotse/lazy.py:960-970`, `failure-modes.md §22` |
+| Resume topology (DP rank × num_workers) | invariant between save and restore | `LazyShuffledRange.load_state_dict` validates `(n, seed, shard_id, num_shards)`; `LazyIteratorChain._iter_globally_shuffled` validates `(shard_id, num_shards)` against saved values. Topology mismatch raises a loud `ValueError`. The same hard contract as map-style `StatefulDataLoader` — but check fires earlier (at iterator level) and includes the worker dimension. | `lhotse_resumable/lhotse/indexing.py:497-540` (`LazyShuffledRange.state_dict` / `load_state_dict`), `failure-modes.md §23` |
+| `num_workers` | invariant; same as map-style | StatefulDataLoader contract. Additionally: `num_shards = world_size * num_workers`, so `num_workers` is part of the partition identity. Changing it would force a different shard assignment per rank. | MIGRATION_GUIDE.md "Operational constraints" §1 |
+| Mixed indexed/non-indexed chain | warn | Non-indexed sources in the chain are duplicated across ranks (see "All sources indexed" above). Either move them to a separate dataloader, convert them to indexed format, or revert to `force_map_dataset: true` for that config. | `failure-modes.md §21` |
+
 ## `data.validation_ds` — finite map access required
 
 | field | required value | purpose | see also |
@@ -70,7 +85,7 @@ pointer, see-also link to MIGRATION_GUIDE.md and (when relevant) to the
 | env var | required when | purpose | see also |
 |---|---|---|---|
 | `USE_AIS_GET_BATCH` | training data is on `s3://`, `ais://`, or `http(s)://` AND the cluster has `AIS_ENDPOINT` | Skip eager `IndexedTarMemberReader` per shard; defer audio fetch to AIS at sample time via `AISBatchLoader`. Read at `nemo_adapters.py:459`. | aistore-vs-non-aistore.md |
-| `USE_AIS_INDIVIDUAL_GETS` | non-EN-replicated multilingual data on AIS, or any time MOSS GetBatch returns empty content | Routes through per-object `Object.get_reader(archive_config=...).read_all()` instead of MOSS GetBatch. Slower but bypasses MOSS-side issues. `lhotse_resumable/lhotse/ais/batch_loader.py:67, 218` (the `prefer_individual` flag). | failure-modes.md §16 |
+| `USE_AIS_INDIVIDUAL_GETS` | non-EN-replicated multilingual data on AIS, or any time MOSS GetBatch returns empty content | Routes through per-object `Object.get_reader(archive_config=...).read_all()` instead of MOSS GetBatch (the `force_individual` flag on `AISBatchLoader`). Slower but bypasses MOSS-side issues, and on `shar_ptr` sources falls back to per-object byte-range `get_reader` so non-gzipped lhotse-shar cuts work even when MOSS lacks byte-range support. `lhotse_resumable/lhotse/ais/batch_loader.py`. | failure-modes.md §16 |
 | `AIS_ENDPOINT` | always when AIStore in play | The AIS proxy URL. IAD: `http://asr.iad.oci.aistore.nvidia.com:51080`. Set in `cluster_configs/<cluster>.yaml` under `env_vars`. | `cluster_configs/iad.yaml:31` |
 | `aistore` SDK version | ≥ 1.17 | `lhotse_resumable/lhotse/ais/batch_loader.py:75` requires `aistore>=1.17.0`. As of 2026-05-10, latest is 1.23.0. The `_moss_attrs` normalizer at `batch_loader.py:81` handles both MossIn (≤1.18) and MossOut (≥1.20) attribute namings. | `lhotse_resumable/lhotse/ais/batch_loader.py:75-89` |
 
diff --git a/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md b/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md
index c8db819c7fa6..650c49706c5d 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md
@@ -54,6 +54,28 @@ requires further user action.>
 
 - _(more)_
 
+## Dedup mode
+
+<One paragraph: which `force_map_dataset` value this config uses and why.>
+
+- **`force_map_dataset: true`** (safe default; over-sample-and-discard
+  inside `DynamicBucketingSampler`) — works for any source type. Costs
+  `W×` redundant sampler/manifest I/O per step.
+- **`force_map_dataset: false`** (iterable + worker partition; suitable
+  for indexed-only configs at high `world_size`) — sample indices are
+  partitioned across `(DP rank × DataLoader worker)` via
+  `LazyShuffledRange(shard_id, num_shards)`. Near-`W×` step-time
+  improvement at scale. Audit required: every source must be indexed
+  (`failure-modes.md §21`), every `LazyIteratorMultiplexer.seed` must
+  be a fixed integer (§22), `(world_size, num_workers)` invariant
+  across the chain (§23).
+
+If `false` was selected, list:
+- Sources confirmed indexed: <list>
+- Multiplexer seeds confirmed integer: <list>
+- World-size / num-workers commitment: `<W>` × `<NW>` for the entire
+  chain.
+
 ## Cross-cuts
 
 ### Data blend audit
@@ -136,8 +158,11 @@ See `pre-flight-checklist.md` next to this report. The TL;DR:
 ## References
 
 - `MIGRATION_GUIDE.md` (repo root): canonical migration walkthrough.
-- `references/option-reference.md`: every YAML field, every flag.
-- `references/conflict-matrix.md`: option pairs that conflict.
-- `references/failure-modes.md`: 18-entry failure-mode catalog.
-- `references/best-practices.md`: prioritised checklist.
+- `references/option-reference.md`: every YAML field, every flag, including
+  the iterable-mode partition concerns.
+- `references/conflict-matrix.md`: option pairs that conflict (includes
+  iterable-mode constraints; see `failure-modes.md` §20–§23).
+- `references/failure-modes.md`: 23-entry failure-mode catalog (§20–§23 cover iterable-mode partition concerns).
+- `references/best-practices.md`: prioritised checklist (tier 2 §5b covers
+  when to prefer `force_map_dataset: false`).
 - `references/aistore-vs-non-aistore.md`: workflow selection.
diff --git a/nemo/collections/asr/data/audio_to_text_lhotse.py b/nemo/collections/asr/data/audio_to_text_lhotse.py
index a80ef6dfe7e4..38ce7a0e8f9e 100644
--- a/nemo/collections/asr/data/audio_to_text_lhotse.py
+++ b/nemo/collections/asr/data/audio_to_text_lhotse.py
@@ -53,14 +53,14 @@ def __init__(self, tokenizer: TokenizerSpec, return_cuts: bool = False):
         super().__init__()
         self.tokenizer = TokenizerWrapper(tokenizer)
         self.use_ais_get_batch = os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true"
-        self.ais_prefer_individual = os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true"
+        self.ais_force_individual = os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true"
 
         # Try to use use_batch_loader if available (Lhotse >= 1.32.0)
         try:
             self.load_audio = AudioSamples(
                 fault_tolerant=True,
                 use_batch_loader=self.use_ais_get_batch,
-                ais_prefer_individual=self.ais_prefer_individual,
+                ais_force_individual=self.ais_force_individual,
             )
         except TypeError:
             # Lhotse < 1.32.0 doesn't support use_batch_loader
diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
index 01c91fc8c4a8..2fa10edd8cf9 100644
--- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
+++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
@@ -83,14 +83,14 @@ def __init__(
         super().__init__()
         self.tokenizer = tokenizer
         self.use_ais_get_batch = os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true"
-        self.ais_prefer_individual = os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true"
+        self.ais_force_individual = os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true"
 
         # Try to use use_batch_loader if available (Lhotse >= 1.32.0)
         try:
             self.load_audio = AudioSamples(
                 fault_tolerant=True,
                 use_batch_loader=self.use_ais_get_batch,
-                ais_prefer_individual=self.ais_prefer_individual,
+                ais_force_individual=self.ais_force_individual,
             )
         except TypeError:
             # Lhotse < 1.32.0 doesn't support use_batch_loader
diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py
index 450a1adefe32..8fe350260a1f 100644
--- a/nemo/collections/common/data/lhotse/dataloader.py
+++ b/nemo/collections/common/data/lhotse/dataloader.py
@@ -293,6 +293,19 @@ def _build_dataloader(use_stateful_dataloader: bool, **kwargs) -> torch.utils.da
     return torch.utils.data.DataLoader(**kwargs)
 
 
+def _maybe_init_main_process_for_iterable(num_workers: int, global_rank: int, world_size: int, seed: int) -> None:
+    """When ``num_workers == 0`` the iterable-path sampler runs in the main training
+    process; PyTorch's DataLoader never invokes ``worker_init_fn`` in that case.
+    Call it eagerly so env vars (``RANK``/``WORLD_SIZE``/``LHOTSE_PROCESS_SEED``) and
+    the per-process random seed are set before any iterator is consumed — required so
+    ``get_worker_partition`` returns the correct DP-rank shard inside lhotse's lazy
+    indexed iterators (e.g. ``LazyShuffledRange``)."""
+    if num_workers == 0:
+        from lhotse.dataset.dataloading import worker_init_fn
+
+        worker_init_fn(0, rank=global_rank, world_size=world_size, seed=seed)
+
+
 def get_lhotse_dataloader_from_config(
     config: Union[dict, DictConfig],
     global_rank: int,
@@ -387,6 +400,7 @@ def get_lhotse_dataloader_from_single_config(
         # We use lhotse's own worker_init_fn which leverages information such as rank, world_size,
         # worker_id, etc. to set a different random seed for each (node, worker) combination.
         # This together with infinite datasets removes the need to split data across nodes/workers.
+        _maybe_init_main_process_for_iterable(config.num_workers, global_rank, world_size, config.seed)
         dloader_kwargs = dict(
             dataset=IterableDatasetWrapper(dataset=dataset, sampler=sampler),
             worker_init_fn=make_worker_init_fn(rank=global_rank, world_size=world_size, seed=config.seed),
@@ -513,6 +527,7 @@ def gather_shared_opts():
         # We use lhotse's own worker_init_fn which leverages information such as rank, world_size,
         # worker_id, etc. to set a different random seed for each (node, worker) combination.
         # This together with infinite datasets removes the need to split data across nodes/workers.
+        _maybe_init_main_process_for_iterable(shared_opts.num_workers, global_rank, world_size, shared_opts.seed)
         dloader_kwargs = dict(
             dataset=IterableDatasetWrapper(dataset=dataset, sampler=sampler),
             worker_init_fn=make_worker_init_fn(rank=global_rank, world_size=world_size, seed=shared_opts.seed),
diff --git a/nemo/collections/speechlm2/data/salm_dataset.py b/nemo/collections/speechlm2/data/salm_dataset.py
index 93a8f5b4d109..735cc68cf1b7 100644
--- a/nemo/collections/speechlm2/data/salm_dataset.py
+++ b/nemo/collections/speechlm2/data/salm_dataset.py
@@ -81,7 +81,7 @@ def __init__(self, tokenizer: AutoTokenizer) -> None:
         self.load_audio = AudioSamples(
             fault_tolerant=True,
             use_batch_loader=os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true",
-            ais_prefer_individual=os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true",
+            ais_force_individual=os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true",
             mono_downmix=True,
         )
 

From fc9f3663a229140057e28a497cce92c38539936c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Tue, 12 May 2026 20:19:19 -0400
Subject: [PATCH 10/30] =?UTF-8?q?Fix=20DP=C3=97worker=20dedup=20in=20index?=
 =?UTF-8?q?ed=20adapters=20via=20PartitionedIndexedIterator;=20dedupe=20ag?=
 =?UTF-8?q?ainst=20lhotse=20primitives?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The bug: 7 of the 8 indexed adapters iterated range(0, total_len) in their
_iter_indexed / _iter_indexed_node loops with no get_worker_partition()
call, so under multi-rank training every DP rank x DataLoader worker
yielded the same item sequence. Observed in the 0909-id sweep: 32 ranks all
skipped the same 7 bad-path librilight cuts instead of 32x7=224 disjoint
ones (full diagnosis in
agent-debug-workspace/0909-en-only-id-4node/DIAGNOSIS.md).

Adapter refactor (7 leaves now partition-aware):
* LazyNeMoTarredIterator (nemo_adapters.py)
* LazyParquetIterator (nemo_adapters.py)
* LhotseTextJsonlAdapter, NeMoSFTJsonlAdapter (text_adapters.py)
* NeMoMultimodalConversationJsonlAdapter (text_adapters.py)
* NeMoMultimodalConversationShareGPTJsonlAdapter (text_adapters.py)
* NeMoMultimodalConversationShareGPTWebdatasetAdapter (text_adapters.py)
Each holds self._iter_state = PartitionedIndexedIterator() and replaces its
local _position/_restored/range(start, n) loop with a one-line delegation:
``for global_idx in self._iter_state.iterate(self._total_len): ...``.
state_dict/load_state_dict forward to the helper; the adapters that track
an epoch counter (LazyNeMoTarredIterator and the three multimodal
conversation adapters) also save/restore it next to the helper's state.
The 8th indexed adapter (LazyNeMoIterator) was already correct because it
delegates to lhotse's LazyIndexedManifestIterator.

Dedup against lhotse primitives (indexed_adapters.py, -170 LOC):
* Drop local LazyShuffledRange and IndexedJSONLReader; use the lhotse-side
  classes (lhotse.indexing.LazyShuffledRange / IndexedJsonlReader).
* Drop create_index; use lhotse.indexing.create_jsonl_index.
* _load_index now delegates the offsets-loading to lhotse.indexing.read_index
  and layers only the NeMo-specific validation (file-size cross-check +
  legacy-format sentinel handling) on top.
Kept: resolve_idx_path, IndexedTarSampleReader, IndexedTarMemberReader,
create_tar_index (NeMo-style tar with basename-grouped members).

Remove non-partition-aware index code paths (user directive):
* Delete _iter_jsonl_indexed in NeMoMultimodalConversationShareGPTJsonlAdapter
  and _iter_indexed in NeMoMultimodalConversationShareGPTWebdatasetAdapter.
  Both used .idx files for shuffling without partitioning the result. The
  __iter__ dispatch now routes index-driven access exclusively through the
  partition-aware _iter_indexed_node (indexed=True). Non-indexed paths
  keep using shard-level DP partitioning via streaming. Remove the now-
  unused self._has_index plumbing.

Tests:
* New tests/collections/common/test_lhotse_indexed_partition.py: 28 tests
  (7 adapters x 4 world sizes {1,2,4,5}) asserting per-rank slices are
  pairwise disjoint and the union covers the manifest exactly once.
* test_lhotse_multimodal_dataloading.py: delete 6 tests that asserted the
  removed non-partition-aware fallback paths, plus the 3 NeMoLazyShuffledRange
  tests now covered by lhotse-side test_indexing.py. Drop _has_index
  assertions from the surviving tests.

58/58 NeMo tests + 179/179 lhotse tests pass under nemo312-hf5 with
PYTHONPATH=lhotse_resumable:NeMo_resumable.

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../common/data/lhotse/indexed_adapters.py    | 139 +-----
 .../common/data/lhotse/nemo_adapters.py       |  39 +-
 .../common/data/lhotse/text_adapters.py       | 146 ++----
 .../common/test_lhotse_indexed_partition.py   | 416 ++++++++++++++++++
 .../test_lhotse_multimodal_dataloading.py     | 164 +------
 5 files changed, 484 insertions(+), 420 deletions(-)
 create mode 100644 tests/collections/common/test_lhotse_indexed_partition.py

diff --git a/nemo/collections/common/data/lhotse/indexed_adapters.py b/nemo/collections/common/data/lhotse/indexed_adapters.py
index a88c533a1e5b..8952948ce052 100644
--- a/nemo/collections/common/data/lhotse/indexed_adapters.py
+++ b/nemo/collections/common/data/lhotse/indexed_adapters.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 import json
 import os
-import random
 import re
 import struct
 import tarfile
@@ -22,8 +21,7 @@
 
 import numpy as np
 
-# Knuth's multiplicative hash constant (golden-ratio derived, 32-bit).
-_KNUTH_HASH = 2654435761
+from lhotse.indexing import read_index
 
 # Tar block size + the all-zeros block that marks end-of-archive in tar.
 _TAR_BLOCK_SIZE = 512
@@ -207,72 +205,15 @@ def resolve_idx_path(data_path: str | Path, indexes_root: Optional[str | Path] =
     return str(Path(root_str) / (rel + ".idx"))
 
 
-class LazyShuffledRange:
+def _load_index(data_path: str, idx_path: Optional[str] = None):
     """
-    Generates a permutation of ``range(n)`` lazily using a Feistel cipher,
-    without materializing the full index list. Each element is computed on
-    the fly in O(1) time and the object itself uses O(1) memory regardless
-    of ``n``.
-
-    The technique is known as *cycle-walking* format-preserving encryption:
-    a Feistel network is a bijection on ``[0, 2^k)``, and repeatedly applying
-    it until the output falls within ``[0, n)`` restricts it to a bijection
-    on the desired domain.
-
-    Args:
-        n: Size of the range to permute.
-        rng: A ``random.Random`` instance used to derive round keys.
-        num_rounds: Number of Feistel rounds (more rounds = better uniformity,
-            6 is a good default for typical dataset sizes).
-    """
-
-    def __init__(self, n: int, rng: random.Random, num_rounds: int = 6):
-        self.n = n
-        if n <= 1:
-            return
-        bits = (n - 1).bit_length()
-        if bits < 2:
-            bits = 2
-        if bits % 2:
-            bits += 1
-        self._half = bits // 2
-        self._mask = (1 << self._half) - 1
-        self._num_rounds = num_rounds
-        self._keys = [rng.getrandbits(64) for _ in range(num_rounds)]
-
-    def _permute_one(self, x: int) -> int:
-        left = (x >> self._half) & self._mask
-        right = x & self._mask
-        for key in self._keys:
-            left, right = right, left ^ (((right * _KNUTH_HASH) ^ key) >> 32 & self._mask)
-        return (left << self._half) | right
-
-    def __len__(self) -> int:
-        return self.n
-
-    def __iter__(self):
-        n = self.n
-        if n <= 0:
-            return
-        if n == 1:
-            yield 0
-            return
-        for i in range(n):
-            x = i
-            while True:
-                x = self._permute_one(x)
-                if x < n:
-                    yield x
-                    break
-
-
-def _load_index(data_path: str, idx_path: str | None = None):
-    """
-    Load a memmap'd offset index for *data_path*.
+    Load an offset index for *data_path*, layering NeMo-specific validation
+    on top of :func:`lhotse.indexing.read_index`.
 
     Returns ``(offsets, num_samples)`` where ``offsets`` always has
     ``num_samples + 1`` entries — the last one being the data file size
-    (appended if absent in the on-disk index).
+    (appended if absent in the on-disk index, for legacy ``.idx`` files
+    written before the sentinel convention was added).
 
     Validates that all sample offsets fall within the data file.
 
@@ -285,13 +226,7 @@ def _load_index(data_path: str, idx_path: str | None = None):
     """
     if idx_path is None:
         idx_path = data_path + '.idx'
-    # Use np.fromfile (resident memory) rather than np.memmap so that NeMo
-    # blends with tens of thousands of shards don't exhaust the kernel's
-    # ``vm.max_map_count`` budget (~65k by default) and subsequently raise
-    # ``OSError: [Errno 12] Cannot allocate memory``. Indexes are small
-    # (a uint64 per record + a sentinel; typically O(KB) per shard), so the
-    # resident-memory cost across an entire blend is in the hundreds of MB.
-    offsets = np.fromfile(idx_path, dtype=np.dtype('<u8'))
+    offsets = read_index(idx_path)
     if _URL_RE.match(str(data_path)):
         if offsets.shape[0] < 1:
             raise ValueError(
@@ -327,24 +262,6 @@ def _resolve_idx(idx: int, length: int) -> int:
     return idx
 
 
-class IndexedJSONLReader:
-    def __init__(self, jsonl_path: Path | str, idx_path: Path | str | None = None):
-        self.data_path = str(jsonl_path)
-        self.offsets, self._len = _load_index(self.data_path, str(idx_path) if idx_path else None)
-
-    def __len__(self):
-        return self._len
-
-    def __getitem__(self, idx):
-        idx = _resolve_idx(idx, self._len)
-        start = int(self.offsets[idx])
-        end = int(self.offsets[idx + 1])
-        with _open_data_path(self.data_path) as f:
-            f.seek(start)
-            data = f.read(end - start)
-        return json.loads(data.decode('utf-8'))
-
-
 class TarSample(NamedTuple):
     """A single sample extracted from a WebDataset tar archive."""
 
@@ -365,8 +282,9 @@ def _split_json_audio_pair(name_a, bytes_a, name_b, bytes_b) -> TarSample:
 class IndexedTarSampleReader:
     """
     Random access to WebDataset tar samples (``N.json`` + ``N.<audio>``) via an index file.
-    Index format is identical to ``IndexedJSONLReader``: little-endian uint64 offsets,
-    optionally followed by a sentinel equal to the tar file size.
+    Index format is the same little-endian ``uint64`` offsets as
+    :class:`lhotse.indexing.IndexedJsonlReader`, optionally followed by a
+    sentinel equal to the tar file size.
     """
 
     def __init__(self, tar_path: str | Path, idx_path: str | Path | None = None):
@@ -454,8 +372,8 @@ class IndexedTarMemberReader:
     Random access to a NeMo-style tar archive that stores **one regular member
     per sample** (e.g. ``<cut_id>.flac`` per line of an external NeMo manifest).
 
-    Uses the same ``.idx`` format as :class:`IndexedJSONLReader` and
-    :class:`IndexedTarSampleReader`: little-endian uint64 byte offsets, with
+    Uses the same ``.idx`` format as :class:`lhotse.indexing.IndexedJsonlReader`
+    and :class:`IndexedTarSampleReader`: little-endian uint64 byte offsets, with
     a sentinel equal to the tar file size at the end. Each entry points at
     one tar header, and the corresponding payload starts ``512`` bytes later.
 
@@ -595,34 +513,6 @@ def _read_tar_member(f):
         return info.name, data
 
 
-def create_index(jsonl_path, idx_path):
-    """
-    Creates a raw binary index file compatible with Megatron-Energon (CrudeJsonlDataset).
-
-    Format: sequence of little-endian uint64 values
-    ``[Offset_0, Offset_1, ..., Offset_N, File_Size]``
-
-    Written atomically (tmp + ``os.replace``) so concurrent writers can't
-    observe a half-written ``.idx``.
-    """
-    # Flush the write buffer every 8 MiB to limit memory usage on large files.
-    flush_threshold = 8 * 1024 * 1024
-    tmp_path = f"{idx_path}.tmp.{os.getpid()}"
-    with open(jsonl_path, 'rb') as f_in, open(tmp_path, 'wb') as f_out:
-        current_offset = 0
-        write_buffer = bytearray()
-        write_buffer.extend(struct.pack('<Q', current_offset))
-        for line in f_in:
-            current_offset += len(line)
-            write_buffer.extend(struct.pack('<Q', current_offset))
-            if len(write_buffer) > flush_threshold:
-                f_out.write(write_buffer)
-                write_buffer.clear()
-        if write_buffer:
-            f_out.write(write_buffer)
-    os.replace(tmp_path, idx_path)
-
-
 class _CountingReader:
     """
     Minimal file-like wrapper that delegates everything to an inner stream
@@ -656,8 +546,9 @@ def create_tar_index(tar_path, idx_path):
     """
     Creates a raw binary index file for a WebDataset tar archive.
     Stores the byte offset of the first member of each sample (grouped by basename),
-    followed by a sentinel equal to the tar file size.
-    Format is identical to :func:`create_index`.
+    followed by a sentinel equal to the tar file size. On-disk format matches
+    :func:`lhotse.indexing.create_jsonl_index` and the other readers in this
+    module: a sequence of little-endian uint64 byte offsets.
 
     Reads ``tar_path`` via ``lhotse.serialization.open_best`` so the function
     works for local files as well as ``s3://`` / ``ais://`` / ``http(s)://``
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index f29746128468..6aca474ad105 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -35,7 +35,7 @@
 from lhotse import AudioSource, MonoCut, Recording, SupervisionSegment
 from lhotse.audio.backend import LibsndfileBackend
 from lhotse.cut import Cut
-from lhotse.dataset.dataloading import resolve_seed
+from lhotse.dataset.dataloading import PartitionedIndexedIterator, resolve_seed
 from lhotse.lazy import (
     GraphOriginDict,
     IteratorNode,
@@ -527,8 +527,7 @@ def _init_indexed(self) -> None:
             cum_lens.append(cum)
         self._cum_lens = cum_lens
         self._total_len = cum
-        self._position = 0
-        self._restored = False
+        self._iter_state = PartitionedIndexedIterator()
 
     def to_shards(self) -> List["LazyNeMoTarredIterator"]:
         """Convert this iterator to a list of separate iterators for each shard."""
@@ -829,24 +828,20 @@ def __len__(self) -> int:
     def state_dict(self) -> dict:
         if not self.indexed:
             return {}
-        return {"position": self._position, "epoch": self.epoch}
+        return {**self._iter_state.state_dict(), "epoch": self.epoch}
 
     def load_state_dict(self, sd: dict) -> None:
         if not self.indexed:
             return
-        self._position = sd.get("position", 0)
+        self._iter_state.load_state_dict(sd)
         self.epoch = sd.get("epoch", 0)
-        self._restored = True
 
     def _iter_indexed(self) -> Generator[Cut, None, None]:
-        start = self._position if self._restored else 0
-        self._restored = False
-        for i in range(start, self._total_len):
-            self._position = i + 1
-            cut = self._decode_cut_at(i)
+        for global_idx in self._iter_state.iterate(self._total_len):
+            cut = self._decode_cut_at(global_idx)
             if cut is None:
                 continue
-            attach_graph_origin(cut, i)
+            attach_graph_origin(cut, global_idx)
             yield cut
         self.epoch += 1
 
@@ -1138,8 +1133,7 @@ def __init__(
         self._row_group_offsets: list[int] | None = None
         self._cached_row_group_idx: int | None = None
         self._cached_row_group: list[dict] | None = None
-        self._position = 0
-        self._restored = False
+        self._iter_state = PartitionedIndexedIterator()
         if indexed:
             self._init_indexed()
 
@@ -1252,13 +1246,12 @@ def __len__(self) -> int:
     def state_dict(self) -> dict:
         if not self.indexed:
             return {}
-        return {"position": self._position}
+        return self._iter_state.state_dict()
 
     def load_state_dict(self, sd: dict) -> None:
         if not self.indexed:
             return
-        self._position = sd.get("position", 0)
-        self._restored = True
+        self._iter_state.load_state_dict(sd)
 
     def __iter__(self) -> Generator[Cut, None, None]:
         if self.indexed:
@@ -1267,17 +1260,13 @@ def __iter__(self) -> Generator[Cut, None, None]:
             yield from self._iter_streaming()
 
     def _iter_indexed(self) -> Generator[Cut, None, None]:
-        start = self._position if self._restored else 0
-        self._restored = False
-        n = self._total_rows
-        for i in range(start, n):
-            self._position = i + 1
-            rg_idx, local_idx = self._resolve_row_group(i)
+        for global_idx in self._iter_state.iterate(self._total_rows):
+            rg_idx, local_idx = self._resolve_row_group(global_idx)
             rows = self._load_row_group(rg_idx)
-            cut = self._build_cut_from_row(rows[local_idx], fallback_idx=i)
+            cut = self._build_cut_from_row(rows[local_idx], fallback_idx=global_idx)
             if cut is None:
                 continue
-            attach_graph_origin(cut, i)
+            attach_graph_origin(cut, global_idx)
             yield cut
 
     def _iter_streaming(self) -> Generator[Cut, None, None]:
diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py
index 621fd7188e9a..12b27980f194 100644
--- a/nemo/collections/common/data/lhotse/text_adapters.py
+++ b/nemo/collections/common/data/lhotse/text_adapters.py
@@ -28,17 +28,17 @@
 from lhotse.custom import CustomFieldMixin
 from lhotse.cut import Cut
 from lhotse.dataset import AudioSamples
-from lhotse.dataset.dataloading import resolve_seed
+from lhotse.dataset.dataloading import PartitionedIndexedIterator, resolve_seed
 from lhotse.serialization import load_jsonl, open_best
 from lhotse.shar import AudioTarWriter, JsonlShardWriter
 from lhotse.utils import Pathlike, compute_num_samples, is_valid_url
 
 from lhotse.lazy import IteratorNode, attach_graph_origin, normalize_graph_token
 
+from lhotse.indexing import IndexedJsonlReader
+
 from nemo.collections.common.data.lhotse.indexed_adapters import (
-    IndexedJSONLReader,
     IndexedTarSampleReader,
-    LazyShuffledRange,
     _split_json_audio_pair,
 )
 from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths
@@ -155,8 +155,7 @@ def __post_init__(self):
         self.paths = expand_sharded_filepaths(self.paths)
         self._readers: list = []
         self._cum_lens: list[int] = []
-        self._position = 0
-        self._restored = False
+        self._iter_state = PartitionedIndexedIterator()
         if self.indexed:
             from lhotse.indexing import IndexedJsonlReader
 
@@ -214,13 +213,12 @@ def __getitem__(self, token):
         return attach_graph_origin(ex, idx)
 
     def state_dict(self) -> dict:
-        return {"position": self._position} if self.indexed else {}
+        return self._iter_state.state_dict() if self.indexed else {}
 
     def load_state_dict(self, sd: dict) -> None:
         if not self.indexed:
             return
-        self._position = sd.get("position", 0)
-        self._restored = True
+        self._iter_state.load_state_dict(sd)
 
     def __iter__(self) -> Iterator[TextExample]:
         if self.indexed:
@@ -229,16 +227,13 @@ def __iter__(self) -> Iterator[TextExample]:
             yield from self._iter_streaming()
 
     def _iter_indexed(self) -> Iterator[TextExample]:
-        start = self._position if self._restored else 0
-        self._restored = False
-        n = self._cum_lens[-1] if self._cum_lens else 0
-        for i in range(start, n):
-            self._position = i + 1
-            shard_idx, local_idx = self._resolve(i)
+        total = self._cum_lens[-1] if self._cum_lens else 0
+        for global_idx in self._iter_state.iterate(total):
+            shard_idx, local_idx = self._resolve(global_idx)
             ex = self._data_to_example(self._readers[shard_idx][local_idx])
             if ex is None:
                 continue
-            attach_graph_origin(ex, i)
+            attach_graph_origin(ex, global_idx)
             yield ex
 
     def _iter_streaming(self) -> Iterator[TextExample]:
@@ -428,8 +423,7 @@ def __post_init__(self):
         self.paths = expand_sharded_filepaths(self.paths)
         self._readers: list = []
         self._cum_lens: list[int] = []
-        self._position = 0
-        self._restored = False
+        self._iter_state = PartitionedIndexedIterator()
         if self.indexed:
             from lhotse.indexing import IndexedJsonlReader
 
@@ -477,13 +471,12 @@ def __getitem__(self, token):
         return attach_graph_origin(ex, idx)
 
     def state_dict(self) -> dict:
-        return {"position": self._position} if self.indexed else {}
+        return self._iter_state.state_dict() if self.indexed else {}
 
     def load_state_dict(self, sd: dict) -> None:
         if not self.indexed:
             return
-        self._position = sd.get("position", 0)
-        self._restored = True
+        self._iter_state.load_state_dict(sd)
 
     def __iter__(self) -> Iterator[NeMoSFTExample]:
         if self.indexed:
@@ -492,14 +485,11 @@ def __iter__(self) -> Iterator[NeMoSFTExample]:
             yield from self._iter_streaming()
 
     def _iter_indexed(self) -> Iterator[NeMoSFTExample]:
-        start = self._position if self._restored else 0
-        self._restored = False
-        n = self._cum_lens[-1] if self._cum_lens else 0
-        for i in range(start, n):
-            self._position = i + 1
-            shard_idx, local_idx = self._resolve(i)
+        total = self._cum_lens[-1] if self._cum_lens else 0
+        for global_idx in self._iter_state.iterate(total):
+            shard_idx, local_idx = self._resolve(global_idx)
             ex = NeMoSFTExample(self._readers[shard_idx][local_idx], language=self.language)
-            attach_graph_origin(ex, i)
+            attach_graph_origin(ex, global_idx)
             yield ex
 
     def _iter_streaming(self) -> Iterator[NeMoSFTExample]:
@@ -820,8 +810,7 @@ def __post_init__(self):
         self._tar_readers: list = []
         self._cum_lens: list[int] = []
         self._total_len = 0
-        self._position = 0
-        self._restored = False
+        self._iter_state = PartitionedIndexedIterator()
         if self.indexed:
             self._init_indexed()
 
@@ -878,14 +867,13 @@ def _resolve(self, idx: int) -> tuple[int, int]:
         raise IndexError(idx)
 
     def state_dict(self) -> dict:
-        return {"position": self._position, "epoch": self.epoch} if self.indexed else {}
+        return {**self._iter_state.state_dict(), "epoch": self.epoch} if self.indexed else {}
 
     def load_state_dict(self, sd: dict) -> None:
         if not self.indexed:
             return
-        self._position = sd.get("position", 0)
+        self._iter_state.load_state_dict(sd)
         self.epoch = sd.get("epoch", 0)
-        self._restored = True
 
     def __getitem__(self, token):
         if not self.indexed:
@@ -1017,12 +1005,8 @@ def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
             yield from self._iter_jsonl()
 
     def _iter_indexed(self) -> Iterator[NeMoMultimodalConversation]:
-        start = self._position if self._restored else 0
-        self._restored = False
-        n = self._total_len
-        for i in range(start, n):
-            self._position = i + 1
-            shard_idx, local_idx = self._resolve(i)
+        for global_idx in self._iter_state.iterate(self._total_len):
+            shard_idx, local_idx = self._resolve(global_idx)
             data = self._cuts_readers[shard_idx][local_idx]
             if self._tar_readers:
                 convo = self._build_conversation_tarred(
@@ -1036,7 +1020,7 @@ def _iter_indexed(self) -> Iterator[NeMoMultimodalConversation]:
                 )
             if convo is None:
                 continue
-            attach_graph_origin(convo, i)
+            attach_graph_origin(convo, global_idx)
             yield convo
         self.epoch += 1
 
@@ -1289,16 +1273,12 @@ def __post_init__(self):
                 self.tarred_audio_filepaths
             ), f"{len(self.manifest_filepath)} != {len(self.tarred_audio_filepaths)}"
         self.audio_placeholders = _normalize_audio_placeholders(self.audio_placeholders)
-        self._has_index = all(
-            Path(resolve_idx_path(p, self.indexes_root)).exists() for p in self.manifest_filepath
-        )
         self.epoch = 0
         self._cuts_readers: list = []
         self._tar_readers: list = []
         self._cum_lens: list[int] = []
         self._total_len = 0
-        self._position = 0
-        self._restored = False
+        self._iter_state = PartitionedIndexedIterator()
         if self.indexed:
             self._init_indexed()
 
@@ -1355,14 +1335,13 @@ def _resolve(self, idx: int) -> tuple[int, int]:
         raise IndexError(idx)
 
     def state_dict(self) -> dict:
-        return {"position": self._position, "epoch": self.epoch} if self.indexed else {}
+        return {**self._iter_state.state_dict(), "epoch": self.epoch} if self.indexed else {}
 
     def load_state_dict(self, sd: dict) -> None:
         if not self.indexed:
             return
-        self._position = sd.get("position", 0)
+        self._iter_state.load_state_dict(sd)
         self.epoch = sd.get("epoch", 0)
-        self._restored = True
 
     def _build_one(self, data: dict, shard_idx: int) -> NeMoMultimodalConversation:
         conversations = _transform_sharegpt(self.audio_placeholders, data)
@@ -1425,21 +1404,15 @@ def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
             return
         if self.tarred_audio_filepaths is not None:
             yield from self._iter_tar()
-        elif self.shuffle_shards and self._has_index:
-            yield from self._iter_jsonl_indexed()
         else:
             yield from self._iter_jsonl()
 
     def _iter_indexed_node(self) -> Iterator[NeMoMultimodalConversation]:
-        start = self._position if self._restored else 0
-        self._restored = False
-        n = self._total_len
-        for i in range(start, n):
-            self._position = i + 1
-            shard_idx, local_idx = self._resolve(i)
+        for global_idx in self._iter_state.iterate(self._total_len):
+            shard_idx, local_idx = self._resolve(global_idx)
             data = self._cuts_readers[shard_idx][local_idx]
             convo = self._build_one(data, shard_idx)
-            attach_graph_origin(convo, i)
+            attach_graph_origin(convo, global_idx)
             yield convo
         self.epoch += 1
 
@@ -1547,27 +1520,6 @@ def _iter_jsonl(self):
                 )
         self.epoch += 1
 
-    def _iter_jsonl_indexed(self):
-        paths = list(self.manifest_filepath)
-        rng = self._get_rng()
-        rng.shuffle(paths)
-        for path in paths:
-            reader = IndexedJSONLReader(path)
-            for idx in LazyShuffledRange(len(reader), rng):
-                data = reader[idx]
-                conversations = _transform_sharegpt(self.audio_placeholders, data)
-                yield NeMoMultimodalConversation(
-                    id=data.get("id", "missing-example-id"),
-                    turns=_create_sharegpt_turns(
-                        self.audio_locator_tag,
-                        conversations,
-                        lambda t, _p=path: self._resolve_cut_from_path(t, _p),
-                    ),
-                    token_equivalent_duration=self.token_equivalent_duration,
-                )
-        self.epoch += 1
-
-
 @dataclass
 class NeMoMultimodalConversationShareGPTWebdatasetAdapter(IteratorNode):
     """
@@ -1622,15 +1574,11 @@ def __post_init__(self):
             if not self._shard_paths:
                 raise FileNotFoundError(f"No wids-meta.json and no .tar files found under {self.data_dir}")
         self.audio_placeholders = _normalize_audio_placeholders(self.audio_placeholders)
-        self._has_index = all(
-            Path(resolve_idx_path(p, self.indexes_root)).exists() for p in self._shard_paths
-        )
         self.epoch = 0
         self._tar_readers: list = []
         self._cum_lens: list[int] = []
         self._total_len = 0
-        self._position = 0
-        self._restored = False
+        self._iter_state = PartitionedIndexedIterator()
         if self.indexed:
             self._init_indexed()
 
@@ -1674,14 +1622,13 @@ def _resolve(self, idx: int) -> tuple[int, int]:
         raise IndexError(idx)
 
     def state_dict(self) -> dict:
-        return {"position": self._position, "epoch": self.epoch} if self.indexed else {}
+        return {**self._iter_state.state_dict(), "epoch": self.epoch} if self.indexed else {}
 
     def load_state_dict(self, sd: dict) -> None:
         if not self.indexed:
             return
-        self._position = sd.get("position", 0)
+        self._iter_state.load_state_dict(sd)
         self.epoch = sd.get("epoch", 0)
-        self._restored = True
 
     def __getitem__(self, token):
         if not self.indexed:
@@ -1698,21 +1645,14 @@ def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
         if self.indexed:
             yield from self._iter_indexed_node()
             return
-        if self.shuffle_shards and self._has_index:
-            yield from self._iter_indexed()
-        else:
-            yield from self._iter_sequential()
+        yield from self._iter_sequential()
 
     def _iter_indexed_node(self) -> Iterator[NeMoMultimodalConversation]:
-        start = self._position if self._restored else 0
-        self._restored = False
-        n = self._total_len
-        for i in range(start, n):
-            self._position = i + 1
-            shard_idx, local_idx = self._resolve(i)
+        for global_idx in self._iter_state.iterate(self._total_len):
+            shard_idx, local_idx = self._resolve(global_idx)
             json_data, audio_bytes, audio_name = self._tar_readers[shard_idx][local_idx]
             convo = self._yield_from_sample(json_data, audio_bytes, audio_name)
-            attach_graph_origin(convo, i)
+            attach_graph_origin(convo, global_idx)
             yield convo
         self.epoch += 1
 
@@ -1752,20 +1692,6 @@ def _iter_sequential(self):
                     yield self._yield_from_sample(json_data, audio_bytes, audio_name)
         self.epoch += 1
 
-    def _iter_indexed(self):
-        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
-
-        shard_paths = list(self._shard_paths)
-        rng = self._get_rng()
-        rng.shuffle(shard_paths)
-        for tar_path in shard_paths:
-            reader = IndexedTarSampleReader(tar_path, idx_path=resolve_idx_path(tar_path, self.indexes_root))
-            for idx in LazyShuffledRange(len(reader), rng):
-                json_data, audio_bytes, audio_name = reader[idx]
-                yield self._yield_from_sample(json_data, audio_bytes, audio_name)
-        self.epoch += 1
-
-
 class TarIterator:
     """
     Copy of lhotse.shar.readers.tar.TarIterator, modified to read both Lhotse-Shar style audio tar files
diff --git a/tests/collections/common/test_lhotse_indexed_partition.py b/tests/collections/common/test_lhotse_indexed_partition.py
new file mode 100644
index 000000000000..bb69393fcdf4
--- /dev/null
+++ b/tests/collections/common/test_lhotse_indexed_partition.py
@@ -0,0 +1,416 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+"""Regression tests: every NeMo indexed adapter must produce disjoint slices
+across (DP rank x DataLoader worker) shards.
+
+The bug this guards against: each adapter's ``_iter_indexed`` previously
+iterated ``range(0, total_len)`` with no call to ``get_worker_partition()``,
+so under multi-rank training every rank yielded the same items
+(see ``sweeps/0909/debugging-duplication.md``). All 7 buggy adapters now
+delegate position+topology to ``PartitionedIndexedIterator``; this file
+asserts that contract at the adapter level so the next refactor can't quietly
+regress it.
+
+Each test simulates the env-var setup ``worker_init_fn`` would perform in a
+DataLoader worker subprocess, builds the adapter with ``indexed=True``, walks
+every (rank in range(world_size)) instance, and asserts:
+
+* per-rank slices are pairwise disjoint;
+* union over all ranks equals the full manifest (each example seen exactly
+  once across the world).
+"""
+from __future__ import annotations
+
+import json
+import os
+import tarfile
+from contextlib import contextmanager
+from io import BytesIO
+from pathlib import Path
+
+import pytest
+from lhotse import CutSet
+from lhotse.dataset.dataloading import LHOTSE_USE_WORKER_PARTITION
+from lhotse.serialization import save_to_jsonl
+from lhotse.testing.dummies import DummyManifest
+
+from nemo.collections.common.data.lhotse import nemo_adapters, text_adapters
+
+_PARTITION_ENV_KEYS = ("RANK", "WORLD_SIZE", LHOTSE_USE_WORKER_PARTITION)
+
+
+@contextmanager
+def _env_partition(rank: int, world_size: int):
+    """Mimic the worker-subprocess env that ``worker_init_fn`` sets."""
+    saved = {k: os.environ.get(k) for k in _PARTITION_ENV_KEYS}
+    os.environ["RANK"] = str(rank)
+    os.environ["WORLD_SIZE"] = str(world_size)
+    os.environ[LHOTSE_USE_WORKER_PARTITION] = "1"
+    try:
+        yield
+    finally:
+        for k, v in saved.items():
+            if v is None:
+                os.environ.pop(k, None)
+            else:
+                os.environ[k] = v
+
+
+def _collect_disjoint_per_rank(build_iter_for_rank, world_size: int) -> tuple[list, set]:
+    """Run an adapter across every rank in ``range(world_size)`` and return
+    ``(per_rank_id_lists, union_of_all_ids)``. Asserts ids are unique within and across ranks."""
+    per_rank: list[list] = []
+    union: set = set()
+    for rank in range(world_size):
+        with _env_partition(rank=rank, world_size=world_size):
+            ids = list(build_iter_for_rank())
+        # A set-based union cannot see repeats inside one rank, so check them here.
+        assert len(set(ids)) == len(ids), f"rank {rank} yielded duplicate items"
+        # Everything already in ``union`` came from a prior rank, so one
+        # intersection covers disjointness against all of them.
+        overlap = union.intersection(ids)
+        assert not overlap, f"rank {rank} slice overlaps prior rank: {sorted(overlap)}"
+        per_rank.append(ids)
+        union.update(ids)
+    return per_rank, union
+
+
+# ---------------------------------------------------------------------------
+# Fixture: 20 single-channel cuts saved as one NeMo manifest + one tar file.
+# Used by the LazyNeMoTarredIterator test; N_CUTS is shared by every fixture below.
+# ---------------------------------------------------------------------------
+
+N_CUTS = 20
+
+
+@pytest.fixture
+def tmp_audio_root(tmp_path_factory) -> Path:
+    return tmp_path_factory.mktemp("audio")
+
+
+@pytest.fixture
+def nemo_tarred_manifest(tmp_audio_root) -> tuple[Path, Path]:
+    """20-utterance NeMo tarred manifest (single shard) as
+    (manifest_filepath, tarred_audio_filepath)."""
+    from lhotse.shar.writers import TarWriter
+    from lhotse.serialization import SequentialJsonlWriter
+
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
+        tmp_audio_root, progress_bar=False
+    )
+    root = tmp_audio_root / "tarred"
+    root.mkdir(exist_ok=True)
+    with (
+        TarWriter(f"{root}/audios_0.tar", shard_size=None) as tar_writer,
+        SequentialJsonlWriter(root / "manifest_0.jsonl") as mft_writer,
+    ):
+        for cut in cuts:
+            src = cut.recording.sources[0].source
+            name = Path(src).name  # tar member name doubles as the manifest's audio_filepath
+            with open(src, "rb") as f:
+                tar_writer.write(name, BytesIO(f.read()))
+            mft_writer.write(
+                {
+                    "audio_filepath": name,
+                    "text": "irrelevant",
+                    "duration": cut.duration,
+                    "lang": "en",
+                    "shard_id": 0,
+                    "cut_id": cut.id,
+                }
+            )
+    return Path(mft_writer.path), root / "audios_0.tar"
+
+
+# ---------------------------------------------------------------------------
+# 1. LazyNeMoTarredIterator
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_lazy_nemo_tarred_iterator_indexed_partition(nemo_tarred_manifest, world_size):
+    manifest_path, tar_path = nemo_tarred_manifest
+
+    def build():
+        it = nemo_adapters.LazyNeMoTarredIterator(
+            manifest_path=str(manifest_path),
+            tar_paths=str(tar_path),
+            indexed=True,
+        )
+        return [cut.id for cut in it]
+
+    per_rank, union = _collect_disjoint_per_rank(build, world_size)
+    assert len(union) == N_CUTS, f"missing {N_CUTS - len(union)} items at world_size={world_size}"
+    # All items get covered at least once (each exactly once due to disjointness).
+    assert sum(len(r) for r in per_rank) == N_CUTS
+
+
+# ---------------------------------------------------------------------------
+# 2. LazyParquetIterator
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def parquet_manifest(tmp_audio_root) -> Path:
+    """20-row parquet file with columns id, audio ({"bytes": ...}), text, duration, lang."""
+    pa = pytest.importorskip("pyarrow")  # skip (rather than fail) when pyarrow is missing
+    pq = pytest.importorskip("pyarrow.parquet")
+    import pandas as pd
+
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
+        tmp_audio_root / "parquet_audio", progress_bar=False
+    )
+    rows = []
+    for cut in cuts:
+        with open(cut.recording.sources[0].source, "rb") as f:
+            rows.append(
+                {
+                    "id": cut.id,
+                    "audio": {"bytes": f.read()},
+                    "text": "irrelevant",
+                    "duration": cut.duration,
+                    "lang": "en",
+                }
+            )
+    df = pd.DataFrame(rows)
+    p = tmp_audio_root / "data.parquet"
+    df.to_parquet(p, engine="pyarrow", row_group_size=7)  # row_group_size < N_CUTS -> multiple row groups
+    return p
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_lazy_parquet_iterator_indexed_partition(parquet_manifest, world_size):
+    pytest.importorskip("pyarrow")
+
+    def build():
+        it = nemo_adapters.LazyParquetIterator(path=str(parquet_manifest), indexed=True)
+        return [cut.id for cut in it]
+
+    per_rank, union = _collect_disjoint_per_rank(build, world_size)
+    assert len(union) == sum(len(r) for r in per_rank) == N_CUTS  # each item exactly once
+
+
+# ---------------------------------------------------------------------------
+# 3. LhotseTextJsonlAdapter
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def text_jsonl(tmp_path) -> Path:
+    p = tmp_path / "text.jsonl"
+    with open(p, "w") as f:
+        for i in range(N_CUTS):
+            f.write(json.dumps({"id": f"t-{i:04d}", "text": f"line {i}"}) + "\n")
+    return p
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_lhotse_text_jsonl_adapter_indexed_partition(text_jsonl, world_size):
+    def build():
+        it = text_adapters.LhotseTextJsonlAdapter(
+            paths=str(text_jsonl), language="en", indexed=True
+        )
+        return [ex.text for ex in it]
+
+    per_rank, union = _collect_disjoint_per_rank(build, world_size)
+    assert len(union) == sum(len(r) for r in per_rank) == N_CUTS  # each item exactly once
+
+
+# ---------------------------------------------------------------------------
+# 4. NeMoSFTJsonlAdapter
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def sft_jsonl(tmp_path) -> Path:
+    """Minimal NeMo-SFT-chat JSONL — adapter wraps each line, doesn't parse."""
+    p = tmp_path / "sft.jsonl"
+    with open(p, "w") as f:
+        for i in range(N_CUTS):
+            f.write(json.dumps({"id": f"sft-{i:04d}", "marker": i}) + "\n")
+    return p
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_nemo_sft_jsonl_adapter_indexed_partition(sft_jsonl, world_size):
+    def build():
+        it = text_adapters.NeMoSFTJsonlAdapter(
+            paths=str(sft_jsonl), language="en", indexed=True
+        )
+        # NeMoSFTExample stores the raw dict in .data; key by "id".
+        return [ex.data["id"] for ex in it]
+
+    per_rank, union = _collect_disjoint_per_rank(build, world_size)
+    assert len(union) == sum(len(r) for r in per_rank) == N_CUTS  # each item exactly once
+
+
+# ---------------------------------------------------------------------------
+# 5. NeMoMultimodalConversationJsonlAdapter — non-tarred path
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def mm_conversation_jsonl(tmp_audio_root) -> Path:
+    """20-line JSONL where each line is a 2-turn user/assistant conversation
+    referring to a local audio file."""
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
+        tmp_audio_root / "mm_audio", progress_bar=False
+    )
+    p = tmp_audio_root / "mm_conversations.jsonl"
+    with open(p, "w") as f:
+        for i, cut in enumerate(cuts):
+            audio_filepath = cut.recording.sources[0].source
+            f.write(
+                json.dumps(
+                    {
+                        "id": f"mm-{i:04d}",
+                        "conversations": [
+                            {
+                                "type": "audio",
+                                "from": "User",
+                                "value": audio_filepath,
+                                "duration": cut.duration,
+                                "offset": 0.0,
+                            },
+                            {
+                                "type": "text",
+                                "from": "Assistant",
+                                "value": f"answer {i}",
+                            },
+                        ],
+                    }
+                )
+                + "\n"
+            )
+    return p
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_nemo_multimodal_conversation_jsonl_adapter_indexed_partition(
+    mm_conversation_jsonl, world_size
+):
+    def build():
+        it = text_adapters.NeMoMultimodalConversationJsonlAdapter(
+            manifest_filepath=[str(mm_conversation_jsonl)],
+            audio_locator_tag="<audio>",
+            token_equivalent_duration=0.08,
+            indexed=True,
+        )
+        return [convo.id for convo in it]
+
+    per_rank, union = _collect_disjoint_per_rank(build, world_size)
+    assert len(union) == sum(len(r) for r in per_rank) == N_CUTS  # each item exactly once
+
+
+# ---------------------------------------------------------------------------
+# 6. NeMoMultimodalConversationShareGPTJsonlAdapter — non-tarred path
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def sharegpt_conversation_jsonl(tmp_audio_root) -> Path:
+    """ShareGPT-format JSONL with a single user audio + assistant turn each.
+
+    Schema note: the audio path lives in the ``sound`` field (see
+    ``_transform_sharegpt`` in nemo.collections.common.data.lhotse.text_adapters),
+    not in ``audio_filepath`` — the adapter intentionally treats ShareGPT
+    distinctly from NeMo manifests."""
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
+        tmp_audio_root / "sharegpt_audio", progress_bar=False
+    )
+    p = tmp_audio_root / "sharegpt.jsonl"
+    with open(p, "w") as f:
+        for i, cut in enumerate(cuts):
+            audio_filepath = cut.recording.sources[0].source
+            f.write(
+                json.dumps(
+                    {
+                        "id": f"sgpt-{i:04d}",
+                        "conversations": [
+                            {"from": "User", "value": f"<audio>describe {i}"},
+                            {"from": "Assistant", "value": f"this is example {i}"},
+                        ],
+                        "sound": audio_filepath,
+                        "duration": cut.duration,
+                    }
+                )
+                + "\n"
+            )
+    return p
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_sharegpt_jsonl_adapter_indexed_partition(sharegpt_conversation_jsonl, world_size):
+    def build():
+        it = text_adapters.NeMoMultimodalConversationShareGPTJsonlAdapter(
+            manifest_filepath=[str(sharegpt_conversation_jsonl)],
+            audio_locator_tag="<audio>",
+            audio_placeholders=["<audio>"],
+            token_equivalent_duration=0.08,
+            indexed=True,
+        )
+        return [convo.id for convo in it]
+
+    per_rank, union = _collect_disjoint_per_rank(build, world_size)
+    assert len(union) == sum(len(r) for r in per_rank) == N_CUTS  # each item exactly once
+
+
+# ---------------------------------------------------------------------------
+# 7. NeMoMultimodalConversationShareGPTWebdatasetAdapter
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def sharegpt_webdataset_tar(tmp_audio_root) -> Path:
+    """20-sample ShareGPT WebDataset tar: each example is a (.json, .wav) pair
+    with matching stem. The adapter pairs alternating members. We also build
+    the ``.idx`` sidecar that IndexedTarSampleReader requires (it does not
+    auto-create indexes, unlike the JSONL reader)."""
+    from lhotse.indexing import create_tar_index
+
+    cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
+        tmp_audio_root / "wds_audio", progress_bar=False
+    )
+    p = tmp_audio_root / "shard_0.tar"
+    with tarfile.open(p, "w") as tar:
+        for i, cut in enumerate(cuts):
+            stem = f"swds-{i:04d}"
+            audio_path = cut.recording.sources[0].source
+            with open(audio_path, "rb") as f:
+                audio_bytes = f.read()
+            payload = json.dumps(
+                {
+                    "id": stem,
+                    "conversations": [
+                        {"from": "User", "value": f"<audio>q{i}"},
+                        {"from": "Assistant", "value": f"a{i}"},
+                    ],
+                }
+            ).encode()
+            for ext, data in ((".json", payload), (".wav", audio_bytes)):
+                info = tarfile.TarInfo(stem + ext)
+                info.size = len(data)
+                tar.addfile(info, BytesIO(data))
+    create_tar_index(str(p), output_path=str(p) + ".idx")
+    return p
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_sharegpt_webdataset_adapter_indexed_partition(sharegpt_webdataset_tar, world_size):
+    def build():
+        it = text_adapters.NeMoMultimodalConversationShareGPTWebdatasetAdapter(
+            data_dir=str(sharegpt_webdataset_tar.parent),
+            audio_locator_tag="<audio>",
+            audio_placeholders=["<audio>"],
+            token_equivalent_duration=0.08,
+            indexed=True,
+        )
+        return [convo.id for convo in it]
+
+    per_rank, union = _collect_disjoint_per_rank(build, world_size)
+    assert len(union) == sum(len(r) for r in per_rank) == N_CUTS  # each item exactly once
diff --git a/tests/collections/common/test_lhotse_multimodal_dataloading.py b/tests/collections/common/test_lhotse_multimodal_dataloading.py
index f7ae02d6d706..dd1cc7134d8d 100644
--- a/tests/collections/common/test_lhotse_multimodal_dataloading.py
+++ b/tests/collections/common/test_lhotse_multimodal_dataloading.py
@@ -26,10 +26,10 @@
 from omegaconf import OmegaConf
 
 from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config
+from lhotse.indexing import create_jsonl_index
+
 from nemo.collections.common.data.lhotse.indexed_adapters import (
     IndexedTarSampleReader,
-    LazyShuffledRange,
-    create_index,
     create_tar_index,
 )
 from nemo.collections.common.data.lhotse.sampling import (
@@ -952,7 +952,7 @@ def indexed_sharegpt_conversations_path(tmp_path_factory):
         for i in range(10)
     ]
     lhotse.serialization.save_to_jsonl(data, manifest_path)
-    create_index(str(manifest_path), str(manifest_path) + ".idx")
+    create_jsonl_index(str(manifest_path))
     return manifest_path
 
 
@@ -964,47 +964,12 @@ def test_sharegpt_indexed_sequential_no_shuffle(indexed_sharegpt_conversations_p
         shuffle_shards=False,
         shard_seed=0,
     )
-    assert adapter._has_index is True
     conversations = list(adapter)
     assert len(conversations) == 10
     ids = [c.id for c in conversations]
     assert ids == [f"convo_{i}" for i in range(10)]
 
 
-def test_sharegpt_indexed_shuffle_uses_random_access(indexed_sharegpt_conversations_path):
-    """When shuffle is on and .idx files exist, all items are yielded in shuffled order."""
-    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
-        manifest_filepath=str(indexed_sharegpt_conversations_path),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=0,
-    )
-    assert adapter._has_index is True
-    conversations = list(adapter)
-    assert len(conversations) == 10
-    ids = [c.id for c in conversations]
-    # All items present
-    assert sorted(ids) == [f"convo_{i}" for i in range(10)]
-    # Order is shuffled (with 10 items the chance of identical order is 1/10! ≈ 0)
-    assert ids != [f"convo_{i}" for i in range(10)]
-
-
-def test_sharegpt_indexed_different_epochs_different_order(indexed_sharegpt_conversations_path):
-    """Different epochs produce different shuffled orders."""
-    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
-        manifest_filepath=str(indexed_sharegpt_conversations_path),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=0,
-    )
-    epoch0_ids = [c.id for c in adapter]
-    epoch1_ids = [c.id for c in adapter]
-    # Both epochs have all items
-    assert sorted(epoch0_ids) == sorted(epoch1_ids)
-    # But in different order (epoch counter increments the seed)
-    assert epoch0_ids != epoch1_ids
-
-
 def test_sharegpt_no_index_falls_back_to_in_memory_shuffle(tmp_path_factory):
     """When .idx files don't exist, shuffle_shards still works via in-memory shuffle."""
     tmp_path = tmp_path_factory.mktemp("sharegpt_no_idx")
@@ -1020,7 +985,6 @@ def test_sharegpt_no_index_falls_back_to_in_memory_shuffle(tmp_path_factory):
         for i in range(10)
     ]
     lhotse.serialization.save_to_jsonl(data, manifest_path)
-    # No .idx file created
 
     adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
         manifest_filepath=str(manifest_path),
@@ -1028,7 +992,6 @@ def test_sharegpt_no_index_falls_back_to_in_memory_shuffle(tmp_path_factory):
         shuffle_shards=True,
         shard_seed=0,
     )
-    assert adapter._has_index is False
     conversations = list(adapter)
     assert len(conversations) == 10
     ids = [c.id for c in conversations]
@@ -1037,75 +1000,6 @@ def test_sharegpt_no_index_falls_back_to_in_memory_shuffle(tmp_path_factory):
     assert ids != [f"convo_{i}" for i in range(10)]
 
 
-def test_sharegpt_indexed_with_audio(tmp_path_factory):
-    """Indexed reading works correctly with audio turns (ShareGPT format with <sound> placeholders)."""
-    tmp_path = tmp_path_factory.mktemp("indexed_sharegpt_audio")
-    manifest_path = tmp_path / "manifest.jsonl"
-
-    # Create audio files
-    for i in range(5):
-        dummy_recording(i, duration=1.0 + i * 0.5, with_data=True).to_cut().save_audio(tmp_path / f"audio_{i}.wav")
-
-    data = [
-        {
-            "id": f"audio_convo_{i}",
-            "sound": f"audio_{i}.wav",
-            "conversations": [
-                {"from": "human", "value": f"Listen to this: <sound> What do you think?"},
-                {"from": "gpt", "value": f"Response {i}"},
-            ],
-        }
-        for i in range(5)
-    ]
-    lhotse.serialization.save_to_jsonl(data, manifest_path)
-    create_index(str(manifest_path), str(manifest_path) + ".idx")
-
-    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
-        manifest_filepath=str(manifest_path),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=42,
-    )
-    assert adapter._has_index is True
-    conversations = list(adapter)
-    assert len(conversations) == 5
-
-    ids = sorted([c.id for c in conversations])
-    assert ids == [f"audio_convo_{i}" for i in range(5)]
-
-    # Verify audio turns were created correctly
-    for conv in conversations:
-        assert conv.has_audio_turns
-        audio_turns = [t for t in conv.turns if isinstance(t, AudioTurn)]
-        assert len(audio_turns) == 1
-        assert audio_turns[0].audio_locator_tag == "[audio]"
-        assert audio_turns[0].cut.load_audio().shape[0] == 1  # mono audio
-
-
-@pytest.mark.parametrize("n", [0, 1, 2, 3, 5, 10, 100, 1000, 1023, 1024, 1025])
-def test_lazy_shuffled_range_is_a_permutation(n):
-    """LazyShuffledRange must yield every element of [0, n) exactly once."""
-    rng = random.Random(42)
-    result = list(LazyShuffledRange(n, rng))
-    assert len(result) == n
-    assert sorted(result) == list(range(n))
-
-
-def test_lazy_shuffled_range_is_shuffled():
-    """LazyShuffledRange should not produce the identity permutation (for non-trivial n)."""
-    rng = random.Random(0)
-    result = list(LazyShuffledRange(50, rng))
-    assert result != list(range(50))
-
-
-def test_lazy_shuffled_range_different_seeds():
-    """Different RNG seeds produce different permutations."""
-    a = list(LazyShuffledRange(100, random.Random(0)))
-    b = list(LazyShuffledRange(100, random.Random(1)))
-    assert a != b
-    assert sorted(a) == sorted(b) == list(range(100))
-
-
 # ─── WebDataset ShareGPT adapter tests ──────────────────────────────────────
 
 
@@ -1248,37 +1142,6 @@ def test_webdataset_sequential_turn_structure(webdataset_dir):
     assert conv.turns[3].value == "Response for sample 0"
 
 
-def test_webdataset_indexed_shuffle(webdataset_dir):
-    """When shuffle is on and .idx files exist, all items are yielded in shuffled order."""
-    adapter = NeMoMultimodalConversationShareGPTWebdatasetAdapter(
-        data_dir=str(webdataset_dir),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=0,
-    )
-    assert adapter._has_index is True
-    conversations = list(adapter)
-    assert len(conversations) == 6
-    ids = [c.id for c in conversations]
-    assert sorted(ids) == [f"sample_{i}" for i in range(6)]
-    # Order is shuffled (1/6! ≈ 0 chance of identity)
-    assert ids != [f"sample_{i}" for i in range(6)]
-
-
-def test_webdataset_indexed_different_epochs(webdataset_dir):
-    """Different epochs produce different shuffled orders."""
-    adapter = NeMoMultimodalConversationShareGPTWebdatasetAdapter(
-        data_dir=str(webdataset_dir),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=0,
-    )
-    epoch0_ids = [c.id for c in adapter]
-    epoch1_ids = [c.id for c in adapter]
-    assert sorted(epoch0_ids) == sorted(epoch1_ids)
-    assert epoch0_ids != epoch1_ids
-
-
 def test_webdataset_no_index_falls_back_to_sequential_shuffle(webdataset_dir_no_idx):
     """Without .idx files, shuffle_shards still works (shard-level shuffle, sequential within)."""
     adapter = NeMoMultimodalConversationShareGPTWebdatasetAdapter(
@@ -1287,7 +1150,6 @@ def test_webdataset_no_index_falls_back_to_sequential_shuffle(webdataset_dir_no_
         shuffle_shards=True,
         shard_seed=0,
     )
-    assert adapter._has_index is False
     conversations = list(adapter)
     assert len(conversations) == 6
     ids = [c.id for c in conversations]
@@ -1310,25 +1172,6 @@ def test_webdataset_audio_loads_correctly(webdataset_dir):
         assert audio.shape[0] == 1  # mono
 
 
-def test_webdataset_indexed_audio_loads_correctly(webdataset_dir):
-    """Audio loaded via indexed random access is valid and decodable."""
-    adapter = NeMoMultimodalConversationShareGPTWebdatasetAdapter(
-        data_dir=str(webdataset_dir),
-        audio_locator_tag="[audio]",
-        shuffle_shards=True,
-        shard_seed=42,
-    )
-    assert adapter._has_index is True
-    conversations = list(adapter)
-    assert len(conversations) == 6
-    for conv in conversations:
-        audio_turns = [t for t in conv.turns if isinstance(t, AudioTurn)]
-        assert len(audio_turns) == 1
-        audio = audio_turns[0].cut.load_audio()
-        assert audio.shape[0] == 1
-        assert audio.shape[1] > 0
-
-
 def test_sharegpt_audio_root(tmp_path_factory):
     """When audio_root is set, audio files are resolved relative to it, not the manifest directory."""
     manifest_dir = tmp_path_factory.mktemp("sharegpt_manifest_dir")
@@ -1423,7 +1266,6 @@ def test_webdataset_auto_discover_shards_no_meta(tmp_path_factory, create_idx):
         audio_locator_tag="[audio]",
         shuffle_shards=False,
     )
-    assert adapter._has_index == create_idx
     conversations = list(adapter)
     assert len(conversations) == 4
     ids = sorted(c.id for c in conversations)

From ca40489a0e67d7897246e1c8c3c42370274db4b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Wed, 13 May 2026 14:14:54 -0400
Subject: [PATCH 11/30] Proper lhotse shar indexing support and refactoring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../migrate-to-resumable-dataloader/SKILL.md  |  3 +-
 .../references/option-reference.md            |  2 +-
 nemo/collections/common/data/lhotse/cutset.py | 14 +++-
 .../common/data/lhotse/dataloader.py          |  6 ++
 .../common/data/lhotse/indexed_adapters.py    | 40 ----------
 .../common/data/lhotse/nemo_adapters.py       | 26 ++++--
 .../common/data/lhotse/text_adapters.py       | 28 +++----
 nemo/collections/speechlm2/data/datamodule.py | 79 +++++++++++++++++--
 scripts/dataloading/build_indexes.py          |  7 +-
 scripts/dataloading/prefetch_indexes.py       |  6 +-
 10 files changed, 133 insertions(+), 78 deletions(-)

diff --git a/.claude/skills/migrate-to-resumable-dataloader/SKILL.md b/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
index 8c0d10e27b80..24caee384c49 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
@@ -236,7 +236,8 @@ launcher).
   - `lhotse_resumable/lhotse/dataset/dataloading.py` (`worker_init_fn` sets the `LHOTSE_USE_WORKER_PARTITION` signal; `get_worker_partition()` returns the trivial `(0, 1)` when that signal is absent — keeps map-style mode unaffected even under torchrun)
   - `lhotse_resumable/lhotse/ais/batch_loader.py` (`AISBatchLoader`, `force_individual`, byte-range `shar_ptr` fallback, `_moss_attrs`)
   - `lhotse_resumable/lhotse/dataset/input_strategies.py` (`AudioSamples`)
-  - `NeMo_resumable/nemo/collections/common/data/lhotse/indexed_adapters.py` (`IndexedTarMemberReader`, `_AISRangeReader`, `_CountingReader`, `_open_data_path`, `_load_index`, `resolve_idx_path`)
+  - `NeMo_resumable/nemo/collections/common/data/lhotse/indexed_adapters.py` (`IndexedTarMemberReader`, `_AISRangeReader`, `_CountingReader`, `_open_data_path`, `_load_index`)
+  - `lhotse_resumable/lhotse/indexing.py` — `index_file_path(data_path, indexes_root=None)` is the canonical `.idx` path resolver.
   - `NeMo_resumable/nemo/collections/common/data/lhotse/dataloader.py` (`get_lhotse_sampler_from_config`, `get_lhotse_dataloader_from_config`, `force_map_dataset` handling, the auto-overwrite of `shard_seed`, `_maybe_init_main_process_for_iterable` for `num_workers=0` eager `worker_init_fn` call)
   - `NeMo_resumable/nemo/collections/common/data/lhotse/nemo_adapters.py` (`LazyNeMoTarredIterator`, `_init_indexed`, `_iter_batch_for_ais_get_batch`, `USE_AIS_GET_BATCH` gate)
   - `NeMo_resumable/scripts/dataloading/build_indexes.py` and `prefetch_indexes.py`
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md b/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
index 900bb13b968f..8620793dbf9e 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
@@ -11,7 +11,7 @@ pointer, see-also link to MIGRATION_GUIDE.md and (when relevant) to the
 | `indexed` | `true` | Routes every nested `input_cfg` source to its indexed adapter (`LazyNeMoTarredIterator(indexed=True)`, `IndexedJsonlReader`, etc.). Without this flag, the streaming/replay path is used. Defined in `LhotseDataLoadingConfig` (`NeMo_resumable/nemo/collections/common/data/lhotse/dataloader.py:261`). | MIGRATION_GUIDE.md "Step 2 — Flip two flags" |
 | `use_stateful_dataloader` | `true` | Swaps PyTorch `DataLoader` → `torchdata.StatefulDataLoader` so iterator state is checkpointed in `meta.pt` under `DataModule.train_dataloader` (3 keys: `_snapshot`, `_steps_since_snapshot`, `_iterator_finished`). Verified via `inspect_meta.py` against `step=2000.ckpt` / `step=3000.ckpt` / `step=N-last.ckpt` (see `agent-debug-workspace/nano-v3-1node-resumable-tests.md`). | `dataloader.py:272`, MIGRATION_GUIDE.md "Step 2" |
 | `force_map_dataset` | `true` (safe default) **OR** `false` (optimization for indexed-only configs at high `world_size`) | Two viable modes. **`true`**: sampler runs in the main GPU process; cross-rank dedup is over-sample-and-discard inside `DynamicBucketingSampler` (sampler generates `world_size` batches per step, picks `batches[rank]`, discards the rest). Works for any source type. Costs `W×` redundant sampler/manifest reads per step. **`false`**: sampler runs co-located with the dataset inside CPU worker subprocesses (`IterableDatasetWrapper`); sample indices are partitioned across `(DP rank × DataLoader worker)` via `LazyShuffledRange(shard_id, num_shards)`. Eliminates the `W×` redundant work — near-`W×` step-time improvement at scale. **Requires every source to be indexed** (lhotse-indexed JSONL, nemo_tarred with indexed mode, etc.); non-indexed sources mixed into the chain are NOT deduplicated and may be silently duplicated across ranks. The partition is gated by the `LHOTSE_USE_WORKER_PARTITION` env var that `worker_init_fn` sets (and `dataloader.py:_maybe_init_main_process_for_iterable` sets eagerly for the `num_workers=0` case). | `dataloader.py:247-279`, `lhotse_resumable/lhotse/indexing.py:396-571` (`LazyShuffledRange` with `(shard_id, num_shards)`; constructor L423, `state_dict` L497, `load_state_dict` L507 validates topology), `lhotse_resumable/lhotse/lazy.py:548+` (`LazyIndexedManifestIterator.__iter__` at L606), `failure-modes.md §20-§23` |
-| `indexes_root` | local SSD path (e.g. `/tmp/idx`) matching `prefetch_indexes.py` destination | Where the prefetched `.idx` mirror is read from at training time. Mirror tree preserves the data-file paths (`<indexes_root>/lustre/...` mirroring the blend's lustre paths). Resolved by `resolve_idx_path` in `NeMo_resumable/nemo/collections/common/data/lhotse/indexed_adapters.py:170`. **Must match the prefetch script's destination**, otherwise manifests fail to find their `.idx` neighbors at training time. | MIGRATION_GUIDE.md "keep indexes on a separate fast disk" |
+| `indexes_root` | local SSD path (e.g. `/tmp/idx`) matching `prefetch_indexes.py` destination | Where the prefetched `.idx` mirror is read from at training time. Mirror tree preserves the data-file paths (`<indexes_root>/lustre/...` mirroring the blend's lustre paths). Resolved by `lhotse.indexing.index_file_path(data_path, indexes_root=...)` (canonical), at `lhotse_resumable/lhotse/indexing.py`. **Must match the prefetch script's destination**, otherwise manifests fail to find their `.idx` neighbors at training time. | MIGRATION_GUIDE.md "keep indexes on a separate fast disk" |
 | `seed` | a fixed integer, **invariant across chunks** | Controls Python/numpy/torch global RNG via `pl.seed_everything(seed)` at chunk start. **MUST NOT change on resume**, otherwise dropout / aux-loss / random-init diverge across chunks even though `StatefulDataLoader.load_state_dict` restores sampler state correctly. The 0909 longform chains (see `agent-debug-workspace/0909-longform-failures.md`) hit this exact silent-corruption bug because `train_and_eval.py` rotated `FIXED_SEEDS[seed_offset+i]` per chunk. Fixed in `train_and_eval.py:925-952` — when `--enable-indexes-prefetch` is set, all chunks use the same seed. | MIGRATION_GUIDE.md "Operational constraints" §1, `0909-longform-failures.md` Cause A |
 | `shard_seed` | a fixed integer (NOT `"randomized"`) under either `force_map_dataset` value | Sampler RNG for `DynamicBucketingSampler`. **Map path**: cross-rank dedup is by index slicing (`rank=global_rank, world_size=world_size` at `dataloader.py:680-681`); per-rank seed differentiation is unneeded, and `"randomized"` adds worker-PID-derived seeding that breaks across resume boundaries. NeMo's `dataloader.py:556-572` auto-overwrites `shard_seed: "randomized"` → `shard_seed: <seed>` with a warning when `force_map_dataset + use_stateful_dataloader` are both true. **Iterable path** (`force_map_dataset: false`): the multiplexer inside the sampler graph (`LazyIteratorMultiplexer`) requires all DP ranks to pick the same source at each multiplex step so the global weighted source distribution stays coherent. `seed='randomized'` would derive a different per-(rank, worker) seed and break this — `LazyIteratorMultiplexer.__iter__` (`lhotse_resumable/lhotse/lazy.py:960-970`) raises `ValueError` if `seed='randomized'` under multi-shard partition. Either mode: pin `shard_seed: <int>` explicitly in YAML. | `0909-summary.md` R2, `dataloader.py:543-572`, `failure-modes.md §22` |
 | `num_workers` | match between save and restore | `StatefulDataLoader` hard requirement: changing `num_workers` between save and restore raises a hard error from torchdata. Document the value in the YAML / launcher header. | MIGRATION_GUIDE.md "Operational constraints" §1 |
diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py
index 18243e71deda..50a3d962caac 100644
--- a/nemo/collections/common/data/lhotse/cutset.py
+++ b/nemo/collections/common/data/lhotse/cutset.py
@@ -652,6 +652,8 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
         shard_seed = config.get("shard_seed", "trng")
         metadata_only = config.get("metadata_only", False)
         force_finite = config.get("force_finite", False)
+        indexed = config.get("indexed", False)
+        indexes_root = config.get("indexes_root", None)
         if config.get("cuts_path") is not None:
             warnings.warn("Note: lhotse.cuts_path will be ignored because lhotse.shar_path was provided.")
         if isinstance(config.shar_path, (str, Path)):
@@ -660,6 +662,8 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
                 shuffle_shards=True,
                 seed=shard_seed,
                 slice_length=config.get("slice_length", None),
+                indexed=indexed,
+                indexes_root=indexes_root,
             )
             if not metadata_only and not force_finite:
                 cuts = cuts.repeat(preserve_id=True)
@@ -676,6 +680,8 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
                         shuffle_shards=True,
                         seed=shard_seed,
                         slice_length=config.get("slice_length", None),
+                        indexed=indexed,
+                        indexes_root=indexes_root,
                     )
                     weight = len(cs)
                 else:
@@ -691,6 +697,8 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
                         shuffle_shards=True,
                         seed=shard_seed,
                         slice_length=config.get("slice_length", None),
+                        indexed=indexed,
+                        indexes_root=indexes_root,
                     )
                 cutsets.append(cs)
                 weights.append(weight)
@@ -715,6 +723,8 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
                 shuffle_shards=True,
                 seed=shard_seed,
                 slice_length=config.get("slice_length", None),
+                indexed=indexed,
+                indexes_root=indexes_root,
             )
             if not metadata_only and not force_finite:
                 cuts = cuts.repeat(preserve_id=True)
@@ -727,12 +737,12 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
     else:
         # Regular Lhotse manifest points to individual audio files (like native NeMo manifest).
         path = config.cuts_path
-        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+        from lhotse.indexing import index_file_path
 
         indexes_root = config.get("indexes_root", None)
         from_file_kwargs = {"indexed": config.get("indexed", None)}
         if indexes_root is not None:
-            from_file_kwargs["index_path"] = resolve_idx_path(path, indexes_root)
+            from_file_kwargs["index_path"] = index_file_path(path, indexes_root)
         cuts = CutSet.from_file(path, **from_file_kwargs).map(
             partial(resolve_relative_paths, manifest_path=path)
         )
diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py
index 8fe350260a1f..b0bc32ba32c2 100644
--- a/nemo/collections/common/data/lhotse/dataloader.py
+++ b/nemo/collections/common/data/lhotse/dataloader.py
@@ -464,6 +464,12 @@ def gather_shared_opts():
             "metadata_only",
             "force_finite",
             "use_stateful_dataloader",
+            # Indexed dataloading flags must propagate too — otherwise a
+            # top-level ``indexed: true`` / ``indexes_root: /tmp/idx`` on the
+            # train_ds namespace silently fails to reach sub-configs, and the
+            # underlying readers fall back to streaming.
+            "indexed",
+            "indexes_root",
         ]
         defaults = OmegaConf.structured(LhotseDataLoadingConfig)
         top_level_config["seed"] = resolve_seed(top_level_config["seed"])
diff --git a/nemo/collections/common/data/lhotse/indexed_adapters.py b/nemo/collections/common/data/lhotse/indexed_adapters.py
index 8952948ce052..15b6b0f5ccae 100644
--- a/nemo/collections/common/data/lhotse/indexed_adapters.py
+++ b/nemo/collections/common/data/lhotse/indexed_adapters.py
@@ -165,46 +165,6 @@ def _open_data_path(path: str):
     return open(path, "rb")
 
 
-def resolve_idx_path(data_path: str | Path, indexes_root: Optional[str | Path] = None) -> str:
-    """
-    Compute the ``.idx`` sidecar path for *data_path*.
-
-    When ``indexes_root`` is ``None`` (the default), return ``data_path + ".idx"``
-    so the sidecar lives next to the data file, matching the conventional
-    layout.
-
-    When ``indexes_root`` is set, return a path under that root that mirrors
-    the data file's directory structure. URL schemes are stripped (so the
-    bucket/key remains as the relative key); leading separators on local paths
-    are dropped. Examples::
-
-        /data/foo/bar.jsonl       + indexes_root=/cache/idx
-            -> /cache/idx/data/foo/bar.jsonl.idx
-        ais://bucket/key/m.jsonl  + indexes_root=/cache/idx
-            -> /cache/idx/bucket/key/m.jsonl.idx
-        s3://b/path/data.tar      + indexes_root=/cache/idx
-            -> /cache/idx/b/path/data.tar.idx
-
-    The indexes_root argument itself can be local or a URL — joining respects
-    URL semantics so e.g. mirroring into ``ais://cache/idx`` works the same way.
-    """
-    data_str = str(data_path)
-    if indexes_root is None:
-        return data_str + ".idx"
-
-    # Normalize the data path into a relative "key" by stripping a URL scheme,
-    # any leading slashes, and Windows-style drive letters (best-effort).
-    rel = _URL_RE.sub("", data_str).lstrip("/\\")
-    # Strip "C:" or "C:/" style drive prefixes.
-    if len(rel) >= 2 and rel[1] == ":":
-        rel = rel[2:].lstrip("/\\")
-
-    root_str = str(indexes_root).rstrip("/\\")
-    if _URL_RE.match(root_str):
-        return f"{root_str}/{rel}.idx"
-    return str(Path(root_str) / (rel + ".idx"))
-
-
 def _load_index(data_path: str, idx_path: Optional[str] = None):
     """
     Load an offset index for *data_path*, layering NeMo-specific validation
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index 6aca474ad105..720ca5e14a5f 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -151,12 +151,12 @@ def __init__(
                     "their values are positional/streaming and cannot be reconstructed under "
                     "graph-token random access."
                 )
-            from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+            from lhotse.indexing import index_file_path
 
             seed = resolve_seed(shard_seed) if shard_seed not in (None, "trng", "randomized") else 0
             indexed_sources = [
                 LazyIndexedManifestIterator(
-                    p, index_path=resolve_idx_path(p, indexes_root), decode=GraphOriginDict
+                    p, index_path=index_file_path(p, indexes_root), decode=GraphOriginDict
                 )
                 for p in paths
             ]
@@ -475,11 +475,10 @@ def has_constant_time_access(self) -> bool:
 
     def _init_indexed(self) -> None:
         """Build per-shard IndexedJsonlReaders + audio-tar index for indexed/random access."""
-        from lhotse.indexing import IndexedJsonlReader
+        from lhotse.indexing import IndexedJsonlReader, index_file_path
 
         from nemo.collections.common.data.lhotse.indexed_adapters import (
             IndexedTarMemberReader,
-            resolve_idx_path,
         )
 
         if self.extra_fields:
@@ -517,11 +516,11 @@ def _init_indexed(self) -> None:
             jsonl_path = shard_id_to_manifest_path[sid]
             tar_path = self.shard_id_to_tar_path[sid]
             self._cuts_readers[sid] = IndexedJsonlReader(
-                jsonl_path, index_path=resolve_idx_path(jsonl_path, self.indexes_root)
+                jsonl_path, index_path=index_file_path(jsonl_path, self.indexes_root)
             )
             if not self.use_ais_get_batch:
                 self._tar_readers[sid] = IndexedTarMemberReader(
-                    tar_path, idx_path=resolve_idx_path(tar_path, self.indexes_root)
+                    tar_path, idx_path=index_file_path(tar_path, self.indexes_root)
                 )
             cum += len(self._cuts_readers[sid])
             cum_lens.append(cum)
@@ -530,7 +529,15 @@ def _init_indexed(self) -> None:
         self._iter_state = PartitionedIndexedIterator()
 
     def to_shards(self) -> List["LazyNeMoTarredIterator"]:
-        """Convert this iterator to a list of separate iterators for each shard."""
+        """Convert this iterator to a list of separate iterators for each shard.
+
+        Forwards every constructor knob (notably ``indexed``/``indexes_root``,
+        ``extra_fields``, ``slice_length``, ``skip_missing_manifest_entries``)
+        so per-shard sub-iterators behave identically to the parent. Dropping
+        these silently re-enters streaming mode, which a downstream caller
+        like ``mux(..., max_open_streams=N)`` won't notice until the bucketer
+        fails to checkpoint.
+        """
         if len(self.paths) == 1:
             # Cannot do that if the JSON manifest is a single file for all shards;
             # just return self.
@@ -544,6 +551,11 @@ def to_shards(self) -> List["LazyNeMoTarredIterator"]:
                     shard_seed=self.shard_seed,
                     text_field=self.text_field,
                     lang_field=self.lang_field,
+                    skip_missing_manifest_entries=self.skip_missing_manifest_entries,
+                    extra_fields=self.extra_fields,
+                    slice_length=self.slice_length,
+                    indexed=self.indexed,
+                    indexes_root=self.indexes_root,
                 )
                 for path, tarpath in zip(self.paths, self.shard_id_to_tar_path.values())
             ]
diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py
index 12b27980f194..8a168d264687 100644
--- a/nemo/collections/common/data/lhotse/text_adapters.py
+++ b/nemo/collections/common/data/lhotse/text_adapters.py
@@ -159,10 +159,10 @@ def __post_init__(self):
         if self.indexed:
             from lhotse.indexing import IndexedJsonlReader
 
-            from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+            from lhotse.indexing import index_file_path
 
             for p in self.paths:
-                self._readers.append(IndexedJsonlReader(p, index_path=resolve_idx_path(p, self.indexes_root)))
+                self._readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
             cum = 0
             self._cum_lens.append(cum)
             for r in self._readers:
@@ -427,10 +427,10 @@ def __post_init__(self):
         if self.indexed:
             from lhotse.indexing import IndexedJsonlReader
 
-            from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+            from lhotse.indexing import index_file_path
 
             for p in self.paths:
-                self._readers.append(IndexedJsonlReader(p, index_path=resolve_idx_path(p, self.indexes_root)))
+                self._readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
             cum = 0
             self._cum_lens.append(cum)
             for r in self._readers:
@@ -829,20 +829,20 @@ def has_constant_time_access(self) -> bool:
     def _init_indexed(self) -> None:
         from lhotse.indexing import IndexedJsonlReader
 
-        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+        from lhotse.indexing import index_file_path
 
         if self.slice_length is not None:
             raise ValueError(
                 "NeMoMultimodalConversationJsonlAdapter(indexed=True) does not support slice_length."
             )
         for p in self.manifest_filepath:
-            self._cuts_readers.append(IndexedJsonlReader(p, index_path=resolve_idx_path(p, self.indexes_root)))
+            self._cuts_readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
         if self.tarred_audio_filepaths is not None:
             from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
 
             for p in self.tarred_audio_filepaths:
                 self._tar_readers.append(
-                    IndexedTarMemberReader(p, idx_path=resolve_idx_path(p, self.indexes_root))
+                    IndexedTarMemberReader(p, idx_path=index_file_path(p, self.indexes_root))
                 )
         cum = 0
         self._cum_lens.append(cum)
@@ -1264,7 +1264,7 @@ class NeMoMultimodalConversationShareGPTJsonlAdapter(IteratorNode):
     indexes_root: Optional[Pathlike] = None
 
     def __post_init__(self):
-        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+        from lhotse.indexing import index_file_path
 
         self.manifest_filepath = expand_sharded_filepaths(self.manifest_filepath)
         if self.tarred_audio_filepaths is not None:
@@ -1297,20 +1297,20 @@ def has_constant_time_access(self) -> bool:
     def _init_indexed(self) -> None:
         from lhotse.indexing import IndexedJsonlReader
 
-        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+        from lhotse.indexing import index_file_path
 
         if self.slice_length is not None:
             raise ValueError(
                 "NeMoMultimodalConversationShareGPTJsonlAdapter(indexed=True) does not support slice_length."
             )
         for p in self.manifest_filepath:
-            self._cuts_readers.append(IndexedJsonlReader(p, index_path=resolve_idx_path(p, self.indexes_root)))
+            self._cuts_readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
         if self.tarred_audio_filepaths is not None:
             from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
 
             for p in self.tarred_audio_filepaths:
                 self._tar_readers.append(
-                    IndexedTarMemberReader(p, idx_path=resolve_idx_path(p, self.indexes_root))
+                    IndexedTarMemberReader(p, idx_path=index_file_path(p, self.indexes_root))
                 )
         cum = 0
         self._cum_lens.append(cum)
@@ -1562,7 +1562,7 @@ class NeMoMultimodalConversationShareGPTWebdatasetAdapter(IteratorNode):
     def __post_init__(self):
         import json as _json
 
-        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+        from lhotse.indexing import index_file_path
 
         meta_path = Path(self.data_dir) / "wids-meta.json"
         if meta_path.exists():
@@ -1595,10 +1595,10 @@ def has_constant_time_access(self) -> bool:
         return self.indexed
 
     def _init_indexed(self) -> None:
-        from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+        from lhotse.indexing import index_file_path
 
         for p in self._shard_paths:
-            self._tar_readers.append(IndexedTarSampleReader(p, idx_path=resolve_idx_path(p, self.indexes_root)))
+            self._tar_readers.append(IndexedTarSampleReader(p, idx_path=index_file_path(p, self.indexes_root)))
         cum = 0
         self._cum_lens.append(cum)
         for r in self._tar_readers:
diff --git a/nemo/collections/speechlm2/data/datamodule.py b/nemo/collections/speechlm2/data/datamodule.py
index fd5364bdab05..385ff1fb0247 100644
--- a/nemo/collections/speechlm2/data/datamodule.py
+++ b/nemo/collections/speechlm2/data/datamodule.py
@@ -84,18 +84,58 @@ def train_dataloader(self):
         return self._train_dl
 
     def state_dict(self) -> dict:
-        # Persist the train dataloader state when it's stateful (e.g. torchdata's StatefulDataLoader
-        # paired with a checkpointable lhotse sampler). This enables exact-batch resume.
-        if self._train_dl is not None and hasattr(self._train_dl, "state_dict"):
-            return {"train_dataloader": self._train_dl.state_dict()}
-        return {}
+        # Each DP rank has its own dataloader state (different cuts partition, different
+        # per-worker RNG positions). all_gather across the DP group so the rank-0 meta.pt
+        # that Lightning writes contains every rank's state, keyed by dp_rank.
+        if self._train_dl is None or not hasattr(self._train_dl, "state_dict"):
+            return {}
+        local_state = self._train_dl.state_dict()
+        rank = self._get_dp_rank()
+        world = self._get_world_size()
+        tagged = {"dp_rank": rank, "dp_world_size": world, "state": local_state}
+        if world <= 1 or not (torch.distributed.is_available() and torch.distributed.is_initialized()):
+            per_rank = [tagged]
+        else:
+            group = self._get_dp_group()
+            per_rank = [None] * world
+            torch.distributed.all_gather_object(per_rank, tagged, group=group)
+        return {"train_dataloader_per_rank": per_rank}
 
     def load_state_dict(self, state_dict: dict) -> None:
-        if "train_dataloader" not in state_dict:
+        # Mirrors state_dict: we expect a per-DP-rank list and consume the slot that
+        # matches our current (dp_rank, dp_world_size). Any other shape is a bug.
+        if not state_dict:
             return
+        if "train_dataloader_per_rank" not in state_dict:
+            raise RuntimeError(
+                "DataModule.load_state_dict: expected 'train_dataloader_per_rank' in "
+                f"state_dict, got keys {list(state_dict.keys())}."
+            )
+        per_rank = state_dict["train_dataloader_per_rank"]
+        rank = self._get_dp_rank()
+        world = self._get_world_size()
+        if not isinstance(per_rank, list) or len(per_rank) != world:
+            raise RuntimeError(
+                f"DataModule state has dp_world_size="
+                f"{len(per_rank) if isinstance(per_rank, list) else 'unknown'} but the "
+                f"current run has dp_world_size={world}."
+            )
+        entry = per_rank[rank]
+        if not isinstance(entry, dict) or "state" not in entry or "dp_rank" not in entry or "dp_world_size" not in entry:
+            raise RuntimeError(
+                f"Malformed per-rank dataloader state at index {rank}: expected keys "
+                f"{{'dp_rank', 'dp_world_size', 'state'}}, got "
+                f"{list(entry.keys()) if isinstance(entry, dict) else type(entry).__name__}."
+            )
+        saved_rank, saved_world = entry["dp_rank"], entry["dp_world_size"]
+        if saved_rank != rank or saved_world != world:
+            raise RuntimeError(
+                f"Dataloader state tagged (dp_rank={saved_rank}, dp_world_size={saved_world}) "
+                f"loaded on (dp_rank={rank}, dp_world_size={world})."
+            )
         dl = self.train_dataloader()
         if dl is not None and hasattr(dl, "load_state_dict"):
-            dl.load_state_dict(state_dict["train_dataloader"])
+            dl.load_state_dict(entry["state"])
 
     def val_dataloader(self):
         if "validation_ds" not in self.cfg:
@@ -201,3 +241,28 @@ def _get_world_size(self):
                 return torch.distributed.get_world_size()
         else:
             return 1  # 1 GPU
+
+    def _get_dp_group(self):
+        """Return the torch.distributed process group covering this rank's DP siblings.
+
+        Used by ``state_dict`` to gather per-rank dataloader state across DP only
+        (i.e. excluding CP / TP / PP / EP duplicates, which carry the same data
+        partition by construction). Returns ``None`` for the default world group,
+        which is correct for plain DDP and for single-process runs.
+        """
+        if not (torch.distributed.is_available() and torch.distributed.is_initialized()):
+            return None
+        if (
+            hasattr(self.trainer, "model")
+            and hasattr(self.trainer.model, "device_mesh")
+            and (dm := self.trainer.model.device_mesh) is not None
+        ):
+            if "data_parallel" in dm.mesh_dim_names:  # Lightning's ModelParallelStrategy
+                return dm["data_parallel"].get_group()
+            if "dp_shard" in dm.mesh_dim_names and "dp_replicate" in dm.mesh_dim_names:
+                # AutomodelParallelStrategy exposes a flattened "dp" submesh covering
+                # dp_replicate × dp_shard (see parallel.py docstring). Required —
+                # without it we can't restrict the all_gather to DP-only siblings and
+                # would over-gather across CP/TP/PP, producing incorrect per-rank entries.
+                return dm["dp"].get_group()
+        return None  # default = global DDP group
diff --git a/scripts/dataloading/build_indexes.py b/scripts/dataloading/build_indexes.py
index 50479b6ef13a..d1eedd1a4269 100644
--- a/scripts/dataloading/build_indexes.py
+++ b/scripts/dataloading/build_indexes.py
@@ -59,9 +59,10 @@
 import click
 from omegaconf import DictConfig, ListConfig, OmegaConf
 
+from lhotse.indexing import index_file_path
+
 from nemo.collections.common.data.lhotse.indexed_adapters import (
     create_tar_index as create_nemo_tar_index,
-    resolve_idx_path,
 )
 from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths
 
@@ -86,8 +87,8 @@ class IndexJob:
     kind: str  # one of {JSONL, NEMO_TAR, WDS_TAR}
     indexes_root: Optional[str] = None
 
-    def idx_path(self) -> str:
-        return resolve_idx_path(self.path, self.indexes_root)
+    def idx_path(self):
+        return index_file_path(self.path, self.indexes_root)
 
 
 # --------------------------------------------------------------------------- #
diff --git a/scripts/dataloading/prefetch_indexes.py b/scripts/dataloading/prefetch_indexes.py
index f743db758b53..311ebd032329 100644
--- a/scripts/dataloading/prefetch_indexes.py
+++ b/scripts/dataloading/prefetch_indexes.py
@@ -57,7 +57,7 @@
 import click
 from omegaconf import OmegaConf
 
-from nemo.collections.common.data.lhotse.indexed_adapters import resolve_idx_path
+from lhotse.indexing import index_file_path
 
 # Reuse the discovery + IndexJob machinery from build_indexes.py.
 sys.path.insert(0, str(Path(__file__).parent))
@@ -151,8 +151,8 @@ def main(
 
     pairs: list[tuple[str, str]] = []
     for j in unique:
-        src = resolve_idx_path(j.path, source_indexes_root)
-        dst = resolve_idx_path(j.path, indexes_root)
+        src = index_file_path(j.path, source_indexes_root)
+        dst = index_file_path(j.path, indexes_root)
         pairs.append((src, dst))
 
     todo = pairs if force else [(s, d) for (s, d) in pairs if not _is_present(d)]

From 89c7b51a95f8e6aa8714d658d5eb769d79ae786b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Wed, 13 May 2026 19:10:54 -0400
Subject: [PATCH 12/30] Add dataloader validator under scripts/dataloading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

End-to-end harness that exercises any train_ds-shaped Lhotse config.
A torchrun-launched per-rank entry point dumps per-batch cut.id JSONL
and saves state_dict at a configurable step; a separate resumed-phase
torchrun run loads that state and continues. After iteration,
consolidate.py verifies five properties:

  Q1 no cut.id appears on >1 (rank, worker)
  Q2 union of yielded IDs equals ground-truth enumeration
  Q3 per-rank cut sets are pairwise disjoint
  Q4 resumed cells match baseline tail bit-for-bit (off-by-one
     aware: state captures position AFTER yielding step K, so
     resumed[0] == baseline[K+1])
  Q5 two independent runs with the same seed yield identical
     (rank, step) cut sets

It also includes 16 static pre-validation checks (seed types,
indexed/stateful flags, idx-file presence, constant-time-leaves
contract, mux weight sanity, multi_config flag propagation, bucketer
buffer heuristic, text_field sanity).

The validator caught the LazyIndexedSharIterator partition leak fixed
in the companion lhotse commit (4072 AMI cuts duplicated across 8 DP
ranks under iterable+indexed mode); it now reports a clean PASS on
the 0909-en-only-id2 recipe.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../_validate_dataloader/__init__.py          |  14 +
 .../_validate_dataloader/config_inject.py     |  67 ++
 .../_validate_dataloader/consolidate.py       | 370 +++++++++++
 .../_validate_dataloader/cut_id_dataset.py    |  34 +
 .../_validate_dataloader/pre_validation.py    | 617 ++++++++++++++++++
 scripts/dataloading/validate_dataloader.py    | 299 +++++++++
 .../common/test_validate_dataloader.py        | 334 ++++++++++
 7 files changed, 1735 insertions(+)
 create mode 100644 scripts/dataloading/_validate_dataloader/__init__.py
 create mode 100644 scripts/dataloading/_validate_dataloader/config_inject.py
 create mode 100644 scripts/dataloading/_validate_dataloader/consolidate.py
 create mode 100644 scripts/dataloading/_validate_dataloader/cut_id_dataset.py
 create mode 100644 scripts/dataloading/_validate_dataloader/pre_validation.py
 create mode 100644 scripts/dataloading/validate_dataloader.py
 create mode 100644 tests/collections/common/test_validate_dataloader.py

diff --git a/scripts/dataloading/_validate_dataloader/__init__.py b/scripts/dataloading/_validate_dataloader/__init__.py
new file mode 100644
index 000000000000..845889837cb5
--- /dev/null
+++ b/scripts/dataloading/_validate_dataloader/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Internal helpers for ``scripts/dataloading/validate_dataloader.py``."""
diff --git a/scripts/dataloading/_validate_dataloader/config_inject.py b/scripts/dataloading/_validate_dataloader/config_inject.py
new file mode 100644
index 000000000000..fc1b5f63a3ef
--- /dev/null
+++ b/scripts/dataloading/_validate_dataloader/config_inject.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Recursively inject validator-specific flags into a train_ds-shaped
+OmegaConf node and every nested ``input_cfg``."""
+
+import logging
+from typing import Any
+
+from omegaconf import DictConfig, ListConfig, OmegaConf
+
+LOG = logging.getLogger(__name__)
+
+
+def inject_validator_flags(cfg: DictConfig, *, force_finite: bool, metadata_only: bool) -> DictConfig:
+    """Switch on the requested validator flags on ``cfg`` (modified in place and
+    returned) and on the nested ``input_cfg`` tree. The top level is overwritten
+    when the value differs; the actual writes are logged by ``_inject_key``."""
+    requested = {"force_finite": force_finite, "metadata_only": metadata_only}
+    for flag, enabled in requested.items():
+        if enabled:
+            _inject_key(cfg, flag, True, ctx="train_ds (top-level)")
+    _walk_input_cfg(cfg.get("input_cfg"), **requested)
+    return cfg
+
+
+def _walk_input_cfg(node: Any, *, force_finite: bool, metadata_only: bool, path: str = "input_cfg") -> None:
+    """Depth-first walk of an ``input_cfg`` tree; each mapping that does not
+    already carry a requested flag gets it set to ``True``. ``path`` is only
+    used to label log lines."""
+    if isinstance(node, (list, ListConfig)):
+        for idx, child in enumerate(node):
+            _walk_input_cfg(child, force_finite=force_finite, metadata_only=metadata_only, path=f"{path}[{idx}]")
+        return
+    # ``None``, string references to YAML files (resolved later by NeMo) and
+    # any other non-mapping value carry no flags, so there is nothing to do.
+    if node is None or isinstance(node, str) or not isinstance(node, (dict, DictConfig)):
+        return
+    label = f"{path} (type={node.get('type', '<no-type>')})"
+    wanted = (("force_finite", force_finite), ("metadata_only", metadata_only))
+    for flag, enabled in wanted:
+        # An explicit per-source setting (even ``False``) is left untouched.
+        if enabled and flag not in node:
+            _inject_key(node, flag, True, ctx=label)
+    if "input_cfg" in node:
+        nested = node["input_cfg"]
+        _walk_input_cfg(
+            nested, force_finite=force_finite, metadata_only=metadata_only, path=f"{path}.input_cfg"
+        )
+
+
+def _inject_key(node: Any, key: str, value: Any, *, ctx: str) -> None:
+    """Write ``node[key] = value`` and log the change; no-op when an equal value is already set."""
+    before = node.get(key) if isinstance(node, (dict, DictConfig)) else None
+    if before != value:
+        node[key] = value
+        LOG.info("inject %s=%s into %s (was %r)", key, value, ctx, before)
diff --git a/scripts/dataloading/_validate_dataloader/consolidate.py b/scripts/dataloading/_validate_dataloader/consolidate.py
new file mode 100644
index 000000000000..ee894da5444b
--- /dev/null
+++ b/scripts/dataloading/_validate_dataloader/consolidate.py
@@ -0,0 +1,370 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Consolidate per-rank validator JSONLs and emit PASS/FAIL on Q1..Q5.
+
+Layout the per-rank entry writes:
+
+    {output_dir}/
+        baseline/run0/rank_NNN.jsonl
+        baseline/run0/state_rank_NNN.pt
+        baseline/run0/throughput_rank_NNN.json
+        baseline/run1/rank_NNN.jsonl            # if --num-determinism-runs >= 2
+        resumed/run0/rank_NNN.jsonl             # phase=resumed
+        groundtruth/cuts.jsonl                  # phase=groundtruth (single-rank)
+        pre_validation.json                     # written by pre_validation.py
+
+This module is the post-iteration aggregator. Exit code: 0 if no check
+reports FAIL (a check that SKIPs for lack of input does not count as a
+failure), 1 if at least one check reports FAIL.
+"""
+
+import json
+import logging
+import statistics
+import sys
+from collections import Counter, defaultdict
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+import click
+
+LOG = logging.getLogger(__name__)
+
+
+PASS = "PASS"
+FAIL = "FAIL"
+WARN = "WARN"
+SKIP = "SKIP"
+
+
+@dataclass
+class QResult:  # outcome of one validation question (Q1..Q5)
+    q_id: str  # question identifier, e.g. "Q1"
+    status: str  # one of PASS / FAIL / WARN / SKIP
+    tag: Optional[str] = None  # short failure category, e.g. "partition-rank-leak"
+    detail: str = ""  # human-readable one-line explanation
+    extra: dict = field(default_factory=dict)  # extra fields merged into the report entry
+
+
+@dataclass
+class ValidationReport:
+    questions: list[QResult]  # one entry per question, in evaluation order
+    throughput: dict  # summary produced by ``_collect_throughput``
+
+    def to_dict(self):
+        """Return a plain-dict view keyed by question ID; ``extra`` keys are flattened into each entry."""
+        per_question = {}
+        for q in self.questions:
+            per_question[q.q_id] = {"status": q.status, "tag": q.tag, "detail": q.detail, **q.extra}
+        return {"questions": per_question, "throughput": self.throughput}
+
+    @property
+    def all_passed(self) -> bool:
+        return not any(q.status == FAIL for q in self.questions)
+
+
+# --------------------------------------------------------------------------- #
+# Public API.
+# --------------------------------------------------------------------------- #
+
+
+def consolidate(output_dir: Path, *, checkpoint_at: int, num_determinism_runs: int) -> ValidationReport:
+    """Evaluate Q1..Q5 on the artifacts found under ``output_dir``.
+
+    Phase directories that do not exist load as empty row lists; each question
+    decides how to report that. Throughput is summarized from ``baseline/run0``.
+    """
+    run0_dir = output_dir / "baseline" / "run0"
+    baseline = _load_phase(run0_dir)
+    resumed = _load_phase(output_dir / "resumed" / "run0")
+    questions: list[QResult] = [
+        _q1_no_duplication(baseline),
+        _q2_no_skipping(baseline, output_dir / "groundtruth" / "cuts.jsonl"),
+        _q3_partition_correctness(baseline),
+        _q4_exact_resume(baseline, resumed, checkpoint_at=checkpoint_at),
+    ]
+    if num_determinism_runs < 2:
+        questions.append(QResult("Q5", SKIP, detail="num_determinism_runs < 2"))
+    else:
+        second_run = _load_phase(output_dir / "baseline" / "run1")
+        questions.append(_q5_determinism(baseline, second_run))
+    return ValidationReport(questions=questions, throughput=_collect_throughput(run0_dir))
+
+
+# --------------------------------------------------------------------------- #
+# Question implementations.
+# --------------------------------------------------------------------------- #
+
+
+def _q1_no_duplication(rows: list[dict]) -> QResult:
+    """Q1: a cut must not be seen by more than one (rank, worker) pair.
+
+    FAIL is tagged ``partition-rank-leak`` when any cut shows up on several
+    ranks, otherwise ``partition-worker-leak`` when several workers of one
+    rank saw it. NOTE(review): repeats produced by the *same* (rank, worker)
+    collapse in the set below and are not reported — confirm this is intended.
+    """
+    if not rows:
+        return QResult("Q1", SKIP, detail="no baseline rows loaded")
+    seen_by: dict[str, set[tuple[int, int]]] = defaultdict(set)
+    for row in rows:
+        for cut_id in row["cut_ids"]:
+            # One sighting per distinct (rank, worker) origin.
+            seen_by[cut_id].add((row["rank"], row["worker_id"]))
+    cross_rank, same_rank = [], []
+    for cid, origins in seen_by.items():
+        if len(origins) > 1:
+            spans_ranks = len({rk for rk, _ in origins}) > 1
+            (cross_rank if spans_ranks else same_rank).append(cid)
+    if cross_rank:
+        return QResult("Q1", FAIL, tag="partition-rank-leak",
+                       detail=f"{len(cross_rank)} cut.id(s) appeared on multiple ranks",
+                       extra={"examples": cross_rank[:5]})
+    if same_rank:
+        return QResult("Q1", FAIL, tag="partition-worker-leak",
+                       detail=f"{len(same_rank)} cut.id(s) seen by multiple workers within one rank",
+                       extra={"examples": same_rank[:5]})
+    return QResult("Q1", PASS, detail=f"{len(seen_by)} distinct cuts, no duplicates")
+
+
+def _q2_no_skipping(rows: list[dict], groundtruth_path: Path) -> QResult:
+    """Q2: yielded ID set equals the ground-truth set (force_finite mode).
+
+    Blank lines in the ground-truth JSONL are ignored (as in ``_load_phase``) and
+    example IDs are sorted so the report is reproducible across runs."""
+    if not rows:
+        return QResult("Q2", SKIP, detail="no baseline rows loaded")
+    if not groundtruth_path.exists():
+        return QResult("Q2", SKIP, detail=f"groundtruth file missing: {groundtruth_path}")
+    expected: set[str] = set()
+    with open(groundtruth_path) as f:
+        for line in f:
+            if line.strip():  # a trailing/blank line would make json.loads raise
+                expected.update(json.loads(line).get("cut_ids", []))
+    yielded: set[str] = {cid for r in rows for cid in r["cut_ids"]}
+    missing = expected - yielded
+    unexpected = yielded - expected
+    if missing:
+        return QResult("Q2", FAIL, tag="skip",
+                       detail=f"{len(missing)} of {len(expected)} expected cut.id(s) never yielded",
+                       extra={"missing_examples": sorted(missing, key=str)[:5], "unexpected_count": len(unexpected)})
+    if unexpected:
+        return QResult("Q2", FAIL, tag="id-collision",
+                       detail=f"{len(unexpected)} cut.id(s) yielded but not in ground truth",
+                       extra={"unexpected_examples": sorted(unexpected, key=str)[:5]})
+    return QResult("Q2", PASS,
+                   detail=f"yielded ({len(yielded)}) == ground truth ({len(expected)})")
+
+
+def _q3_partition_correctness(rows: list[dict]) -> QResult:
+    """Q3: per-rank cut sets are pairwise disjoint.
+
+    Disjointness is checked by comparing the sum of per-rank distinct counts with
+    the size of the union; a ratio close to the number of ranks is reported as a
+    full broadcast, anything else as a partial overlap."""
+    if not rows:
+        return QResult("Q3", SKIP, detail="no baseline rows loaded")
+    per_rank: dict[int, set[str]] = defaultdict(set)
+    for row in rows:
+        per_rank[row["rank"]].update(row["cut_ids"])
+    union_size = len(set().union(*per_rank.values()))
+    total_distinct = sum(map(len, per_rank.values()))
+    if total_distinct == union_size:
+        return QResult("Q3", PASS, detail=f"{len(per_rank)} ranks, |union|={union_size}")
+    overlap = total_distinct - union_size
+    n_ranks = max(len(per_rank), 1)
+    ratio = total_distinct / max(union_size, 1)
+    # Within half a rank of ``n_ranks`` means (almost) every cut reached every rank.
+    if ratio >= n_ranks - 0.5:
+        detail = f"FULL BROADCAST: each cut.id appears on ~{ratio:.1f}/{n_ranks} ranks (overlap={overlap})"
+    else:
+        detail = (f"PARTIAL OVERLAP: per-rank distinct sums to {total_distinct} but |union|={union_size} "
+                  f"(overlap={overlap})")
+    return QResult("Q3", FAIL, tag="partition-rank-leak", detail=detail)
+
+
+def _q4_exact_resume(baseline: list[dict], resumed: list[dict], *, checkpoint_at: int) -> QResult:
+    """Q4: the resumed run must reproduce the tail of the baseline run.
+
+    State is saved AFTER baseline step ``checkpoint_at`` was yielded, so the
+    restored loader continues with the next batch: resumed step ``s`` is
+    compared against baseline step ``s + checkpoint_at + 1`` of the same rank.
+
+    Only cells present on both sides are compared (as sets of cut IDs, so
+    in-batch order is not checked). Cells that exist on one side only are
+    counted in ``extra`` and never cause a FAIL by themselves; having no
+    overlapping cell at all is reported as ``resume-length-mismatch``."""
+    if not resumed:
+        return QResult("Q4", SKIP, detail="no resumed rows loaded")
+    shift = checkpoint_at + 1
+    base_by_key = {(r["rank"], r["step"]): set(r["cut_ids"]) for r in baseline}
+    res_by_key = {(r["rank"], r["step"]): set(r["cut_ids"]) for r in resumed}
+    divergences: list[dict] = []
+    overlap = extra_resumed = 0
+    for (rank, rstep), res_cuts in sorted(res_by_key.items()):
+        base_step = rstep + shift
+        base_cuts = base_by_key.get((rank, base_step))
+        if base_cuts is None:
+            # The resumed run went past the end of the recorded baseline tail.
+            extra_resumed += 1
+            continue
+        overlap += 1
+        if base_cuts == res_cuts:
+            continue
+        divergences.append({
+            "rank": rank, "step": rstep, "baseline_step": base_step,
+            "only_in_baseline": list(base_cuts - res_cuts)[:3],
+            "only_in_resumed": list(res_cuts - base_cuts)[:3],
+        })
+    # Baseline-tail cells whose resumed counterpart was never produced.
+    extra_baseline_tail = sum(
+        1 for (rank, bstep) in base_by_key
+        if bstep > checkpoint_at and (rank, bstep - shift) not in res_by_key
+    )
+    extras = {
+        "overlap_cells": overlap,
+        "extra_resumed_cells": extra_resumed,
+        "extra_baseline_tail_cells": extra_baseline_tail,
+    }
+    if divergences:
+        return QResult("Q4", FAIL, tag="resume-rng-divergence",
+                       detail=f"{len(divergences)}/{overlap} overlapping cell(s) diverge after resume",
+                       extra={**extras, "examples": divergences[:5]})
+    if overlap == 0:
+        return QResult("Q4", FAIL, tag="resume-length-mismatch",
+                       detail="zero overlap between resumed and baseline-tail windows",
+                       extra=extras)
+    return QResult("Q4", PASS,
+                   detail=f"{overlap} overlapping cell(s) match baseline tail bit-for-bit",
+                   extra=extras)
+
+
+def _q5_determinism(run0: list[dict], run1: list[dict]) -> QResult:
+    """Q5: same (rank, step) cells and the same cut-ID set per cell in two independent baseline runs."""
+    if not run1:
+        return QResult("Q5", SKIP, detail="run1 missing")
+    first = {(r["rank"], r["step"]): set(r["cut_ids"]) for r in run0}
+    second = {(r["rank"], r["step"]): set(r["cut_ids"]) for r in run1}
+    if first.keys() != second.keys():
+        return QResult("Q5", FAIL, tag="non-determinism",
+                       detail="run0/run1 step coverage differs",
+                       extra={"only_in_run0": list(first.keys() - second.keys())[:3],
+                              "only_in_run1": list(second.keys() - first.keys())[:3]})
+    # Coverage is identical from here on, so every key of ``first`` exists in ``second``.
+    divergences: list[dict] = [
+        {"rank": rank, "step": step,
+         "only_run0": list(cuts0 - second[(rank, step)])[:3],
+         "only_run1": list(second[(rank, step)] - cuts0)[:3]}
+        for (rank, step), cuts0 in first.items()
+        if cuts0 != second[(rank, step)]
+    ]
+    if divergences:
+        return QResult("Q5", FAIL, tag="non-determinism",
+                       detail=f"{len(divergences)} cell(s) differ between determinism runs",
+                       extra={"examples": divergences[:5]})
+    return QResult("Q5", PASS, detail="run0 == run1 across all (rank, step) cells")
+
+
+# --------------------------------------------------------------------------- #
+# Throughput summary (v1 minimal: t_total only).
+# --------------------------------------------------------------------------- #
+
+
+def _collect_throughput(run_dir: Path) -> dict:
+    """Summarize ``throughput_rank_*.json`` files in ``run_dir`` (``{"available": False}`` if none):
+    median of per-rank p50, max of p95 / first-batch time, ``num_workers`` from the first file."""
+    paths = sorted(run_dir.glob("throughput_rank_*.json"))
+    if not paths:
+        return {"available": False}
+    per_rank = [json.loads(p.read_text()) for p in paths]
+    p50s = [a["p50_ms"] for a in per_rank if a.get("p50_ms") is not None]
+    p95s = [a["p95_ms"] for a in per_rank if a.get("p95_ms") is not None]
+    first_batch = max((a.get("t_first_batch_ms") or 0) for a in per_rank)
+    p50 = statistics.median(p50s) if p50s else None
+    num_workers = per_rank[0].get("num_workers")
+    out = {
+        "available": True,
+        "num_workers": num_workers,
+        "num_ranks": len(per_rank),
+        "p50_ms_median": p50,
+        "p95_ms_max": max(p95s) if p95s else None,
+        "batches_per_s_per_rank": (1000.0 / p50) if p50 else None,
+        "t_first_batch_ms_max": first_batch or None,
+    }
+    if p50 and num_workers:
+        out["t_gpu_min_for_overlap_ms"] = p50 / num_workers
+    return out
+
+
+# --------------------------------------------------------------------------- #
+# IO helpers.
+# --------------------------------------------------------------------------- #
+
+
+def _load_phase(phase_dir: Path) -> list[dict]:
+    """Collect the rows of all ``rank_*.jsonl`` files in ``phase_dir``.
+
+    Files are read in sorted name order, blank lines are skipped, and a
+    missing directory yields an empty list."""
+    rows: list[dict] = []
+    if not phase_dir.exists():
+        return rows
+    for jsonl_path in sorted(phase_dir.glob("rank_*.jsonl")):
+        with open(jsonl_path) as fp:
+            rows.extend(json.loads(text) for text in map(str.strip, fp) if text)
+    return rows
+
+
+# --------------------------------------------------------------------------- #
+# CLI.
+# --------------------------------------------------------------------------- #
+
+
+@click.command(help=__doc__)
+@click.option("--output-dir", required=True, type=click.Path(exists=True),
+              help="Directory written by validate_dataloader.py.")
+@click.option("--checkpoint-at", type=int, default=0, show_default=True,
+              help="Step index at which the baseline saved state. Must match the baseline run.")
+@click.option("--num-determinism-runs", type=int, default=1, show_default=True,
+              help="If >= 2, compares baseline/run0 vs baseline/run1 for Q5.")
+@click.option("-v", "--verbose", is_flag=True, default=False)
+def cli(output_dir: str, checkpoint_at: int, num_determinism_runs: int, verbose: bool) -> None:
+    """Consolidate ``output_dir``, print the report, write ``validation_report.json``; exit 1 on any FAIL."""
+    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO,
+                        format="[%(asctime)s %(levelname)s] %(message)s", datefmt="%H:%M:%S")
+    out_dir = Path(output_dir)
+    report = consolidate(out_dir, checkpoint_at=checkpoint_at, num_determinism_runs=num_determinism_runs)
+    print(f"\n=== validation report ({len(report.questions)} questions) ===")
+    for q in report.questions:
+        marker = {PASS: "  PASS", WARN: "  WARN", FAIL: "  FAIL", SKIP: "  skip"}[q.status]
+        tag = f" [{q.tag}]" if q.tag else ""
+        print(f"{marker}  {q.q_id}{tag}: {q.detail}")
+    if report.throughput.get("available"):
+        t = report.throughput
+        # Aggregates are None when no rank reported the metric; ``:.1f`` on None raises TypeError.
+        def _num(key: str, spec: str) -> str:
+            return "n/a" if t.get(key) is None else format(t[key], spec)
+        print(f"\nthroughput: p50={_num('p50_ms_median', '.1f')}ms p95={_num('p95_ms_max', '.1f')}ms "
+              f"=> {_num('batches_per_s_per_rank', '.2f')} batches/s/rank "
+              f"(num_workers={t['num_workers']}, T_gpu_min={t.get('t_gpu_min_for_overlap_ms', 0):.1f}ms)")
+    else:
+        print("\nthroughput: <not collected>")
+    (out_dir / "validation_report.json").write_text(json.dumps(report.to_dict(), indent=2))
+    print(f"wrote {out_dir / 'validation_report.json'}")
+    sys.exit(0 if report.all_passed else 1)
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/scripts/dataloading/_validate_dataloader/cut_id_dataset.py b/scripts/dataloading/_validate_dataloader/cut_id_dataset.py
new file mode 100644
index 000000000000..373b24c5bb1a
--- /dev/null
+++ b/scripts/dataloading/_validate_dataloader/cut_id_dataset.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""No-op dataset that materializes the per-batch ``cut.id`` list and the
+worker subprocess metadata. The sampler/dataloader machinery decides
+*which* cuts each call gets, which is exactly the question the
+validator answers."""
+
+import torch.utils.data
+
+
+class CutIdDataset(torch.utils.data.Dataset):
+    """Map-style dataset stub: for the batch of cuts it is indexed with, it
+    reports their ``cut.id`` strings plus the id / count of the DataLoader
+    worker serving the call (``0`` / ``1`` outside a worker process). No
+    audio, features or tokenization are touched."""
+
+    def __getitem__(self, cuts):
+        worker = torch.utils.data.get_worker_info()
+        if worker is None:
+            worker_id, num_workers = 0, 1
+        else:
+            worker_id, num_workers = int(worker.id), int(worker.num_workers)
+        return {"cut_ids": [str(c.id) for c in cuts], "worker_id": worker_id, "num_workers": num_workers}
diff --git a/scripts/dataloading/_validate_dataloader/pre_validation.py b/scripts/dataloading/_validate_dataloader/pre_validation.py
new file mode 100644
index 000000000000..af46b2d8d617
--- /dev/null
+++ b/scripts/dataloading/_validate_dataloader/pre_validation.py
@@ -0,0 +1,617 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Static pre-validation checks for a train_ds-shaped Lhotse dataloader config.
+
+Run as either a function (``run_pre_validation(cfg)``) or a CLI
+(``python pre_validation.py --config ... --output-dir ...``). All checks
+operate on the resolved OmegaConf node — no iteration, no GPUs, no
+SLURM. Intended runtime: < 5 s on a typical SALM ``train_ds`` config.
+
+The output is a structured report (``pre_validation.json``) listing each
+check's ``PASS``/``WARN``/``FAIL`` status. Exit code is ``0`` iff no
+``FAIL`` checks remain after applying ``--ignore-fail`` overrides.
+"""
+
+import json
+import logging
+import sys
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Callable, Iterable, Optional
+
+import click
+from omegaconf import DictConfig, ListConfig, OmegaConf
+
+LOG = logging.getLogger(__name__)
+
+PASS = "PASS"
+WARN = "WARN"
+FAIL = "FAIL"
+SKIP = "SKIP"
+
+_NON_INT_SEED_VALUES = {"randomized", "trng", "trng_initial", None}
+
+
+@dataclass
+class CheckResult:  # outcome of a single static pre-validation check
+    check_id: str  # identifier of the check, as listed in the registry
+    severity: str  # FAIL or WARN — the worst this check is permitted to emit
+    status: str  # PASS | WARN | FAIL | SKIP
+    detail: str = ""  # human-readable explanation of the status
+    extra: dict = field(default_factory=dict)  # extra fields merged into the report entry
+
+
+@dataclass
+class PreValidationReport:
+    checks: list[CheckResult]  # results in registry order
+    summary: dict  # per-status counters built by ``run_pre_validation``
+
+    def to_dict(self):
+        """Return a plain-dict view keyed by check ID; ``extra`` keys are flattened into each entry."""
+        by_id = {}
+        for c in self.checks:
+            by_id[c.check_id] = {"status": c.status, "severity": c.severity, "detail": c.detail, **c.extra}
+        return {"checks": by_id, "summary": self.summary}
+
+    @property
+    def all_passed(self) -> bool:
+        return all(c.status != FAIL for c in self.checks)
+
+
+# --------------------------------------------------------------------------- #
+# Public API.
+# --------------------------------------------------------------------------- #
+
+
+def run_pre_validation(cfg: DictConfig, *, ignore_fail: Iterable[str] = ()) -> PreValidationReport:
+    """Evaluate every check in ``_REGISTRY`` against ``cfg`` (a train_ds-shaped node).
+
+    A ``FAIL`` from a check whose ID is listed in ``ignore_fail`` is downgraded to
+    ``WARN``. Checks never short-circuit each other: one that raises is recorded
+    as ``FAIL`` and the loop carries on, so the report always covers all of them.
+    """
+    ignore = set(ignore_fail)
+    results: list[CheckResult] = []
+    for check_id, severity, fn in _REGISTRY:
+        # Keep going even if a check crashes, so one bug cannot hide the other results.
+        try:
+            status, detail, extra = fn(cfg)
+        except Exception as e:  # pragma: no cover — safety net
+            status, detail, extra = FAIL, f"check raised {type(e).__name__}: {e}", {}
+        # The override only ever softens FAIL; other statuses pass through untouched.
+        if status == FAIL and check_id in ignore:
+            status, detail = WARN, f"(downgraded to WARN via --ignore-fail) {detail}"
+        outcome = CheckResult(check_id, severity, status, detail, extra)
+        results.append(outcome)
+    # Tally the final (post-downgrade) statuses.
+    final_statuses = [r.status for r in results]
+    summary = {"total": len(results)}
+    for key, label in (("pass", PASS), ("warn", WARN), ("fail", FAIL), ("skip", SKIP)):
+        summary[key] = final_statuses.count(label)
+    return PreValidationReport(checks=results, summary=summary)
+
+
+# --------------------------------------------------------------------------- #
+# Individual checks. Each returns (status, detail, extra_fields).
+# --------------------------------------------------------------------------- #
+
+
+def _check_seed_int(cfg: DictConfig):  # returns (status, detail, extra)
+    seed = cfg.get("seed", None)
+    if isinstance(seed, int):  # NOTE(review): bool also satisfies this, being an int subclass
+        return PASS, f"seed={seed}", {}
+    if seed in _NON_INT_SEED_VALUES:  # known non-integer sentinels, including a missing seed (None)
+        return FAIL, (f"train_ds.seed is {seed!r}; must be an integer for reproducibility across "
+                      "launches and determinism re-runs."), {}
+    return FAIL, f"train_ds.seed={seed!r} (type={type(seed).__name__}); must be int", {}
+
+
+def _check_shard_seed_int(cfg: DictConfig):  # returns (status, detail, extra)
+    shard_seed = cfg.get("shard_seed", None)  # a missing key reads as None and therefore FAILs
+    if isinstance(shard_seed, int):  # NOTE(review): bool also satisfies this, being an int subclass
+        return PASS, f"shard_seed={shard_seed}", {}
+    return FAIL, (f"train_ds.shard_seed={shard_seed!r}; must be an integer. "
+                  "LazyIteratorMultiplexer raises under multi-shard + 'randomized'."), {}
+
+
+def _check_stateful_on(cfg: DictConfig):  # returns (status, detail, extra)
+    if cfg.get("use_stateful_dataloader", False) is True:  # identity test: truthy non-bool values do not pass
+        return PASS, "", {}
+    return FAIL, ("use_stateful_dataloader is not True; resumability validation requires the "
+                  "StatefulDataLoader path."), {}
+
+
+def _check_indexed_implies_root(cfg: DictConfig):
+    """When ``indexed`` is truthy, ``indexes_root`` must be set (not None, "" or "null"); else SKIP."""
+    indexed, indexes_root = cfg.get("indexed", False), cfg.get("indexes_root", None)
+    if not indexed:
+        return SKIP, "train_ds.indexed != True; check not applicable", {}
+    if indexes_root not in (None, "", "null"):
+        return PASS, f"indexes_root={indexes_root}", {}
+    return FAIL, ("train_ds.indexed=True but indexes_root is unset. Without indexes_root, "
+                  "LazyIndexedSharIterator falls back to looking next to (typically remote) "
+                  "data files."), {}
+
+
+def _check_indexes_root_exists(cfg: DictConfig):
+    """SKIP when ``indexes_root`` is unset, PASS when the path exists here, otherwise WARN."""
+    indexes_root = cfg.get("indexes_root", None)
+    if not indexes_root:
+        return SKIP, "indexes_root unset; check not applicable", {}
+    if not Path(indexes_root).exists():
+        # Only a WARN: the path is typically cluster-specific and absent on a developer laptop.
+        return WARN, (f"indexes_root={indexes_root!r} does not exist on this host. "
+                      "Expected on cluster; downgraded to WARN locally."), {}
+    return PASS, f"{indexes_root} exists", {}
+
+
+def _check_idx_files_present(cfg: DictConfig):
+    """Spot-check that ``.idx`` files exist (and pass ``index_exists``) for a sample of shards.
+
+    Up to two shards per leaf path and at most 64 shards overall are examined.
+    SKIP when ``indexes_root`` is not present locally; WARN when ``lhotse.indexing``
+    cannot be imported or no leaf paths are found."""
+    indexes_root = cfg.get("indexes_root", None)
+    if not indexes_root or not Path(indexes_root).exists():
+        return SKIP, "indexes_root not present locally; cluster-side check only", {}
+    try:
+        from lhotse.indexing import index_exists, index_file_path
+    except ImportError as e:
+        return WARN, f"lhotse.indexing import failed: {e}", {}
+    try:
+        from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths
+    except ImportError:
+        expand_sharded_filepaths = None  # fall back to treating each leaf as a single shard
+    leaves = _collect_leaf_paths(cfg)
+    if not leaves:
+        return WARN, "no leaf data paths found under input_cfg", {}
+    def _shards_of(raw):
+        """Expand a ``_OP_N..M_CL_`` shard pattern; on any failure use the raw path as-is."""
+        if expand_sharded_filepaths is None:
+            return [raw]
+        try:
+            return expand_sharded_filepaths(raw)
+        except Exception:
+            return [raw]
+    # Two shards per leaf cover every source cheaply; 64 is a global cap on stat() calls.
+    sampled = [shard for raw in leaves for shard in _shards_of(raw)[:2]][:64]
+    missing: list[str] = []
+    truncated: list[str] = []
+    for shard_path in sampled:
+        idx_path = str(index_file_path(shard_path, indexes_root=indexes_root))
+        if not Path(idx_path).exists():
+            missing.append(idx_path)
+        elif not index_exists(shard_path, idx_path):
+            truncated.append(idx_path)
+    if missing or truncated:
+        detail = f"{len(missing)} missing, {len(truncated)} truncated of {len(sampled)} sampled"
+        return FAIL, detail, {"missing": missing[:5], "truncated": truncated[:5]}
+    return PASS, f"sampled {len(sampled)} .idx files across {len(leaves)} leaves; all valid", {}
+
+
+def _check_constant_time_leaves(cfg: DictConfig):
+    """Every leaf source must support constant-time (indexed) access.
+
+    A leaf is flagged when its ``type`` is in ``_STREAMING_ONLY_TYPES`` or when
+    neither the leaf nor the top level sets ``indexed: true``. The check applies to
+    map-style and iterable-style alike (``force_map_dataset`` is not consulted);
+    findings are FAIL when ``use_stateful_dataloader`` is on, WARN otherwise."""
+    stateful = cfg.get("use_stateful_dataloader", False) is True
+    top_indexed = cfg.get("indexed", False) is True
+    non_indexable: list[dict] = []
+    streaming: list[dict] = []
+    for leaf in _iter_leaf_nodes(cfg):
+        typ = leaf.get("type")
+        entry = {"type": typ, "corpus": leaf.get("corpus")}
+        if typ in _STREAMING_ONLY_TYPES:
+            non_indexable.append(entry)
+        elif leaf.get("indexed", top_indexed) is not True:
+            streaming.append(entry)
+    if not (non_indexable or streaming):
+        return PASS, "all leaf sources admit constant-time access", {}
+    n = len(non_indexable) + len(streaming)
+    detail = (f"{n} leaf source(s) lack constant-time access "
+              f"({len(non_indexable)} non-indexable type, {len(streaming)} streaming-mode). "
+              "Resume falls back to O(N) replay; with force_map_dataset=False they also leak "
+              "across ranks.")
+    # Only the first few offenders of each kind are attached to the report.
+    extra = {
+        "non_indexable": non_indexable[:5],
+        "streaming": streaming[:5],
+    }
+    return (FAIL if stateful else WARN), detail, extra
+
+
+def _check_mux_weights_sum(cfg: DictConfig):
+    """Check each implicit multiplexer (a list whose dict entries carry ``weight``): every
+    weight must be a positive finite non-bool number and each group's total must be > 0."""
+    bad: list[dict] = []
+    for path, mux_entries in _iter_mux_groups(cfg):
+        total = 0.0
+        for i, e in enumerate(mux_entries):
+            w = e.get("weight")  # bool is an int subclass, so ``weight: true`` is rejected explicitly below
+            if isinstance(w, bool) or not isinstance(w, (int, float)) or w <= 0 or not _isfinite(w):
+                bad.append({"path": f"{path}[{i}]", "weight": w, "type": e.get("type")})
+            else:
+                total += float(w)
+        if total <= 0:
+            bad.append({"path": path, "weights_sum": total})
+    if bad:
+        return FAIL, f"{len(bad)} bad weight(s) found", {"examples": bad[:5]}
+    return PASS, "all mux weights sum to finite positive", {}
+
+
+def _check_mux_seed_not_randomized(cfg: DictConfig):
+    """When ``force_map_dataset`` is explicitly ``False``, require an integer ``shard_seed``."""
+    if cfg.get("force_map_dataset", True) is not False:
+        return SKIP, "force_map_dataset != False; check not applicable", {}
+    shard_seed = cfg.get("shard_seed")
+    if not isinstance(shard_seed, int):
+        return FAIL, (f"force_map_dataset=False but shard_seed={shard_seed!r}. LazyIteratorMultiplexer raises "
+                      "ValueError under multi-shard with shard_seed='randomized'."), {}
+    return PASS, f"shard_seed={shard_seed}", {}
+
+
+def _check_slice_length_vs_indexed(cfg: DictConfig):
+    """Fail when any leaf source sets ``slice_length`` while top-level ``indexed`` is truthy."""
+    if not cfg.get("indexed", False):
+        return SKIP, "train_ds.indexed != True; check not applicable", {}
+    # Only the top-level flag gates this check; per-leaf ``indexed`` keys are not examined.
+    offenders = [{"type": leaf.get("type"), "corpus": leaf.get("corpus")}
+                 for leaf in _iter_leaf_nodes(cfg) if leaf.get("slice_length") is not None]
+    if not offenders:
+        return PASS, "", {}
+    return FAIL, (f"{len(offenders)} source(s) set slice_length with indexed=True. "
+                  "Lhotse rejects: \"'slice_length' is not supported with indexed=True\"."), \
+           {"examples": offenders[:5]}
+
+
+def _check_cut_map_fns_vs_indexed(cfg: DictConfig):
+    """Fail when any leaf source sets a truthy ``cut_map_fns`` while top-level ``indexed`` is truthy."""
+    if not cfg.get("indexed", False):
+        return SKIP, "train_ds.indexed != True; check not applicable", {}
+    # Only the top-level flag gates this check; per-leaf ``indexed`` keys are not examined.
+    offenders = [{"type": leaf.get("type"), "corpus": leaf.get("corpus")}
+                 for leaf in _iter_leaf_nodes(cfg) if leaf.get("cut_map_fns")]
+    if not offenders:
+        return PASS, "", {}
+    return FAIL, (f"{len(offenders)} source(s) set cut_map_fns with indexed=True. "
+                  "Lhotse rejects: \"'cut_map_fns' is not supported with indexed=True\"."),\
+           {"examples": offenders[:5]}
+
+
+def _check_lambda_in_pipeline(cfg: DictConfig):
+    """Heuristic: dump the config to YAML (``resolve=False``) and flag every line
+    containing '<lambda>', 'lambda:' or ' lambda '. Real lambdas can't round-trip
+    through YAML, so this only looks for the textual hint; hits yield WARN with
+    up to 5 example lines and never FAIL."""
+    blob = OmegaConf.to_yaml(cfg, resolve=False)
+    hits: list[str] = []
+    for line in blob.splitlines():
+        if "<lambda>" in line or "lambda:" in line or " lambda " in line:
+            hits.append(line.strip())
+    if hits:
+        return WARN, f"{len(hits)} possible lambda reference(s) in config", {"examples": hits[:5]}
+    return PASS, "no lambda references found", {}
+
+
+def _check_bucketer_buffer(cfg: DictConfig):
+    """With bucketing on, warn if either size is missing/zero or the buffer is under 10x the bucket count."""
+    if not cfg.get("use_bucketing", False):
+        return SKIP, "use_bucketing != True; check not applicable", {}
+    n_buckets, buffer_size = cfg.get("num_buckets", 0), cfg.get("bucket_buffer_size", 0)
+    if not (n_buckets and buffer_size):
+        return WARN, f"num_buckets={n_buckets}, bucket_buffer_size={buffer_size}", {}
+    ratio = buffer_size / max(n_buckets, 1)
+    if ratio < 10:
+        message = (f"bucket_buffer_size={buffer_size} is < 10×num_buckets ({n_buckets}). "
+                   "Low buffers can cause BucketsDontHaveEnoughData mid-run.")
+        return WARN, message, {"ratio": ratio}
+    return PASS, f"bucket_buffer_size={buffer_size}, num_buckets={n_buckets}, ratio={ratio:.1f}", {}
+
+
+def _check_multi_config_flags(cfg: DictConfig):
+    """multi_config = True means input_cfg is a list of per-sub-config blocks.
+    Top-level ``indexed`` / ``indexes_root`` only flow into sub-configs via
+    the ``overwriting_opts`` list at NeMo_resumable/.../dataloader.py:455-473.
+    The 2026-05 fixes added them to that list; we verify the runtime
+    config doesn't paper over a missing entry per-sub-config."""
+    if not cfg.get("multi_config", False):
+        return SKIP, "multi_config != True; check not applicable", {}
+    # Static structural check: passes outright when the top level sets both flags;
+    # otherwise every dict sub-config must set indexed and indexes_root itself.
+    top_indexed = cfg.get("indexed")
+    top_root = cfg.get("indexes_root")
+    if top_indexed is not None and top_root is not None:
+        return PASS, "top-level indexed+indexes_root will propagate via overwriting_opts", {}
+    sub_cfgs = cfg.get("input_cfg") or []
+    if not isinstance(sub_cfgs, (list, ListConfig)):
+        return WARN, "multi_config=True but input_cfg is not a list", {}
+    missing = [i for i, sc in enumerate(sub_cfgs)  # non-dict entries are not inspected
+               if isinstance(sc, (dict, DictConfig)) and (
+                   sc.get("indexed") is None or sc.get("indexes_root") is None)]
+    if missing:
+        return FAIL, (f"multi_config=True; {len(missing)} sub-config(s) missing indexed/indexes_root "
+                      "and top-level doesn't supply both."), {"indices": missing[:5]}
+    return PASS, "every sub-config sets indexed and indexes_root", {}
+
+
+def _check_text_fields(cfg: DictConfig):
+    """Best-effort static check: warn when the top-level ``text_field`` or that of
+    any ``nemo_tarred`` leaf is set to a name outside a small known-valid set."""
+    # In v1 we just verify the field name is one of the well-known
+    # candidates. Manifest-line inspection requires network/cluster access.
+    valid = {"text", "answer", "transcript", "text_pnc", "text_normalized"}
+    suspicious: list[dict] = []
+    tf = cfg.get("text_field")
+    if tf is not None and tf not in valid:
+        suspicious.append({"path": "train_ds.text_field", "value": tf})
+    for leaf in _iter_leaf_nodes(cfg):
+        if leaf.get("type") == "nemo_tarred":
+            tf = leaf.get("text_field")
+            if tf is not None and tf not in valid:
+                suspicious.append({"corpus": leaf.get("corpus"), "value": tf})
+    if suspicious:
+        return WARN, f"{len(suspicious)} unusual text_field value(s); verify against shard 0", \
+               {"examples": suspicious[:5], "known_valid": sorted(valid)}
+    return PASS, "text_field values match known-valid set", {}
+
+
+def _check_world_size_divides_workers(cfg: DictConfig):
+    """Heuristic only — we don't yet know the runtime ``num-ranks``. Reports per-leaf shard
+    counts: SKIP if none are derivable, WARN if the smallest is below 8, PASS otherwise."""
+    counts: list[dict] = []
+    for leaf in _iter_leaf_nodes(cfg):
+        n = _count_shards(leaf)
+        if n is not None:
+            counts.append({"corpus": leaf.get("corpus") or leaf.get("type"), "shards": n})
+    if not counts:
+        return SKIP, "no leaf-shard counts derivable from config", {}
+    min_shards = min(c["shards"] for c in counts)
+    if min_shards < 8:  # arbitrary "small enough to worry about" heuristic
+        return WARN, f"smallest source has only {min_shards} shards; verify (num_ranks × num_workers) ≤ this", \
+               {"counts": counts[:10]}
+    return PASS, f"smallest source has {min_shards} shards", {"counts": counts[:10]}
+
+
+# --------------------------------------------------------------------------- #
+# Check registry. Order = output order.
+# --------------------------------------------------------------------------- #
+
+
+_REGISTRY: list[tuple[str, str, Callable[[DictConfig], tuple[str, str, dict]]]] = [
+    ("seed-int", FAIL, _check_seed_int),
+    ("shard-seed-int", FAIL, _check_shard_seed_int),
+    ("stateful-on", FAIL, _check_stateful_on),
+    ("indexed-implies-root", FAIL, _check_indexed_implies_root),
+    ("indexes-root-exists", FAIL, _check_indexes_root_exists),
+    ("idx-files-present", FAIL, _check_idx_files_present),
+    ("constant-time-leaves", FAIL, _check_constant_time_leaves),
+    ("mux-weights-sum", FAIL, _check_mux_weights_sum),
+    ("mux-seed-not-randomized", FAIL, _check_mux_seed_not_randomized),
+    ("slice-length-vs-indexed", FAIL, _check_slice_length_vs_indexed),
+    ("cut-map-fns-vs-indexed", FAIL, _check_cut_map_fns_vs_indexed),
+    ("lambda-in-pipeline", WARN, _check_lambda_in_pipeline),
+    ("bucketer-buffer", WARN, _check_bucketer_buffer),
+    ("multi-config-flags", FAIL, _check_multi_config_flags),
+    ("text-fields", WARN, _check_text_fields),
+    ("world-size-divides-workers", WARN, _check_world_size_divides_workers),
+]
+
+
+# --------------------------------------------------------------------------- #
+# Static topology helpers.
+# --------------------------------------------------------------------------- #
+
+
+# Types that read indexable underlying data.
+_LEAF_TYPES = frozenset({
+    "lhotse_shar", "nemo", "nemo_tarred", "multimodal_conversation", "share_gpt",
+})
+
+# Types that don't admit constant-time access at all.
+_STREAMING_ONLY_TYPES = frozenset({
+    "txt", "txt_pair", "parquet", "multi_speaker_simulator",
+})
+
+# Transparent passthrough types — recurse into input_cfg.
+_TRANSFORM_TYPES = frozenset({
+    "lhotse_as_conversation", "sqa_as_conversation", "s2s_as_conversation",
+    "s2s_duplex_overlap_as_s2s_duplex", "s2s_duplex_reverse_role",
+    "lhotse_magpietts_data_as_continuation", "nemo_tarred_to_duplex",
+    "group",
+})
+
+
+def _iter_leaf_nodes(cfg: DictConfig) -> Iterable[DictConfig]:
+    """Yield each leaf-source dict reachable from ``cfg.input_cfg`` (traversal is delegated to ``_walk``)."""
+    yield from _walk(cfg.get("input_cfg"))
+
+
+def _walk(node: Any) -> Iterable[DictConfig]:
+    """Depth-first walk of an ``input_cfg`` tree, yielding leaf-source nodes (and unknown-typed dicts)."""
+    if node is None:
+        return
+    if isinstance(node, (list, ListConfig)):
+        for sub in node:
+            yield from _walk(sub)
+        return
+    if isinstance(node, str):
+        # input_cfg reference to another YAML file — try to load it.
+        loaded = _try_load_yaml(node)
+        if loaded is not None:
+            yield from _walk(loaded)
+        return
+    if not isinstance(node, (dict, DictConfig)):
+        return
+    typ = node.get("type")
+    if typ in _LEAF_TYPES or typ in _STREAMING_ONLY_TYPES:
+        yield node
+        return
+    if typ in _TRANSFORM_TYPES or typ is None:
+        if "input_cfg" in node:
+            yield from _walk(node["input_cfg"])
+        return
+    # Unknown type — yield it so it's counted; callers should be defensive about its keys.
+    yield node
+
+
+def _iter_mux_groups(cfg: DictConfig) -> Iterable[tuple[str, list]]:
+    """Yield ``(path, weighted-entries)`` for each implicit multiplexer under ``cfg.input_cfg``.
+    NOTE: the reported path root is always ``train_ds.input_cfg``, whatever section ``cfg`` is."""
+    yield from _walk_mux(cfg.get("input_cfg"), path="train_ds.input_cfg")
+
+
+def _walk_mux(node: Any, path: str) -> Iterable[tuple[str, list]]:
+    """Recursively yield ``(path, weighted_entries)`` for every list under ``node`` that
+    holds at least two dict entries with a ``weight`` key.
+
+    Only the weighted entries are yielded, so positions in the yielded list can
+    differ from positions in the original config list."""
+    if isinstance(node, (list, ListConfig)):
+        weighted = [e for e in node if isinstance(e, (dict, DictConfig)) and "weight" in e]
+        if len(weighted) > 1:
+            yield path, weighted
+        for i, sub in enumerate(node):
+            yield from _walk_mux(sub, path=f"{path}[{i}]")
+    elif isinstance(node, str):
+        loaded = _try_load_yaml(node)
+        if loaded is not None:
+            yield from _walk_mux(loaded, path=f"{path}<{Path(node).name}>")
+    elif isinstance(node, (dict, DictConfig)) and "input_cfg" in node:
+        yield from _walk_mux(node["input_cfg"], path=f"{path}.input_cfg")
+
+
+def _collect_leaf_paths(cfg: DictConfig) -> list[str]:
+    """Gather the shard paths of all leaf sources into one flat list.
+
+    Leaves are visited in the order ``_iter_leaf_nodes`` yields them, and each
+    leaf contributes its paths in the order ``_leaf_to_paths`` returns them.
+    """
+    return [p for leaf in _iter_leaf_nodes(cfg) for p in _leaf_to_paths(leaf)]
+
+
+def _leaf_to_paths(leaf: DictConfig) -> list[str]:
+    """Flatten the path-like fields of ``leaf`` into a list of strings.
+
+    Order: ``shar_path`` (for a mapping, only string ``cuts``/``recording`` values),
+    then ``manifest_filepath``, ``tarred_audio_filepaths`` and ``cuts_path``."""
+    found: list[str] = []
+    shar = leaf.get("shar_path")
+    if shar and isinstance(shar, (dict, DictConfig)):
+        fields = (shar.get(key) for key in ("cuts", "recording"))
+        found.extend(v for v in fields if isinstance(v, str))
+    elif shar and isinstance(shar, str):
+        found.append(shar)
+    for field in ("manifest_filepath", "tarred_audio_filepaths", "cuts_path"):
+        value = leaf.get(field)
+        # Falsy values (missing, None, empty) are ignored rather than flattened.
+        if value:
+            found.extend(_flatten_str(value))
+    return found
+
+
+def _count_shards(leaf: DictConfig) -> Optional[int]:
+    """Best-effort shard count: sum the inclusive ``_OP_N..M_CL_`` ranges in the leaf's paths.
+
+    Only the first range in each path is counted. Returns ``None`` when the leaf
+    has no paths or the total is 0."""
+    import re
+    shard_paths = _leaf_to_paths(leaf)
+    if not shard_paths:
+        return None
+    range_rx = re.compile(r"_OP_(\d+)\.\.(\d+)_CL_")
+    found = (range_rx.search(str(p)) for p in shard_paths)
+    n_shards = sum(int(m.group(2)) - int(m.group(1)) + 1 for m in found if m)
+    return n_shards or None
+
+
+def _flatten_str(v: Any) -> list[str]:
+    """Return every string found in ``v``, descending through nested lists.
+
+    ``None`` and any other non-string, non-list value contribute nothing.
+    """
+    if isinstance(v, str):
+        return [v]
+    if not isinstance(v, (list, ListConfig)):
+        return []
+    # Depth-first, so the original left-to-right order of the strings is kept.
+    return [s for item in v for s in _flatten_str(item)]
+
+
+def _try_load_yaml(path: str) -> Optional[Any]:
+    """Best-effort load of ``path`` as a YAML config.
+
+    Returns ``None`` for a non-string, empty or non-existent path, or when loading fails."""
+    if not (isinstance(path, str) and path) or not Path(path).exists():
+        return None
+    try:
+        return OmegaConf.load(str(Path(path)))
+    except Exception as e:
+        LOG.debug("failed to load %s: %s", path, e)
+        return None
+
+
+def _isfinite(x: float) -> bool:
+    from math import isfinite  # imported locally, as the original helper does
+    return isfinite(x)
+
+
+# --------------------------------------------------------------------------- #
+# CLI.
+# --------------------------------------------------------------------------- #
+
+
+@click.command(help=__doc__)
+@click.option("--config", "config_path", required=True, type=click.Path(exists=True),
+              help="Training YAML containing data.train_ds.")
+@click.option("--data-blend-dir", default=None,
+              help="Substituted into ${data_blend_dir} in the config (optional locally).")
+@click.option("--section", default="train_ds", show_default=True,
+              help="Which data.* section to validate.")
+@click.option("--output-dir", default=None, type=click.Path(),
+              help="Write pre_validation.json under this directory.")
+@click.option("--ignore-fail", multiple=True, default=(),
+              help="Repeatable: check IDs whose FAIL outcome should be downgraded to WARN.")
+@click.option("-v", "--verbose", is_flag=True, default=False, help="Verbose logs.")
+def cli(config_path: str, data_blend_dir: Optional[str], section: str, output_dir: Optional[str],
+        ignore_fail: tuple, verbose: bool) -> None:
+    logging.basicConfig(
+        level=logging.DEBUG if verbose else logging.INFO,
+        format="[%(asctime)s %(levelname)s] %(message)s",
+        datefmt="%H:%M:%S",
+    )
+    cfg = OmegaConf.load(config_path)
+    if data_blend_dir is not None:
+        cfg.data_blend_dir = data_blend_dir  # set before resolve() so ${data_blend_dir} references pick it up
+    OmegaConf.resolve(cfg)
+    section_cfg = cfg.data[section]  # the checks receive only this data.<section> sub-config
+    report = run_pre_validation(section_cfg, ignore_fail=ignore_fail)
+
+    # Pretty-print one line per check to stdout.
+    print(f"\n=== pre-validation ({len(report.checks)} checks) ===")
+    for c in report.checks:
+        marker = {PASS: "  PASS", WARN: "  WARN", FAIL: "  FAIL", SKIP: "  skip"}[c.status]
+        print(f"{marker}  [{c.check_id}] {c.detail}")
+    print(f"\nsummary: {report.summary}")
+    if output_dir is not None:
+        out = Path(output_dir)
+        out.mkdir(parents=True, exist_ok=True)
+        (out / "pre_validation.json").write_text(json.dumps(report.to_dict(), indent=2))
+        print(f"wrote {out / 'pre_validation.json'}")
+    sys.exit(0 if report.all_passed else 1)  # exit status 1 whenever report.all_passed is falsy
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/scripts/dataloading/validate_dataloader.py b/scripts/dataloading/validate_dataloader.py
new file mode 100644
index 000000000000..313b4c48ed30
--- /dev/null
+++ b/scripts/dataloading/validate_dataloader.py
@@ -0,0 +1,299 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Validate a Lhotse + indexed dataloader config end-to-end.
+
+Per-rank entry point launched under torchrun. Builds the **exact** dataloader
+the SALM training builds (via ``get_lhotse_dataloader_from_config``) on top
+of a no-op ``CutIdDataset`` and dumps per-batch cut.id JSONL. Phase-aware:
+
+* ``baseline`` — iterate ``--steps`` batches from a fresh dataloader; at
+  ``--checkpoint-at`` save ``dl.state_dict()`` to ``state_rank_NNN.pt``.
+* ``resumed``  — load the saved state and iterate the rest; downstream
+  consolidation diffs the post-checkpoint window against the baseline tail.
+* ``groundtruth`` — single-rank, single-worker enumeration of every cut
+  the configured input_cfg yields under force_finite + metadata_only.
+
+Launch as a step in a multi-phase pipeline; downstream aggregator is
+``_validate_dataloader/consolidate.py``.
+
+Example::
+
+    torchrun --standalone --nnodes=1 --nproc-per-node=4 \\
+        scripts/dataloading/validate_dataloader.py \\
+        --config 0909-en-only-id2.yaml \\
+        --data-blend-dir /lustre/.../data_blends/ord \\
+        --output-dir validation_out \\
+        --phase baseline --run-idx 0 \\
+        --steps 200 --checkpoint-at 100
+"""
+
+import json
+import logging
+import os
+import statistics
+import sys
+import time
+from pathlib import Path
+from typing import Optional
+
+import click
+import torch
+import torch.utils.data
+from omegaconf import OmegaConf
+
+# Local helpers — same directory.
+sys.path.insert(0, str(Path(__file__).resolve().parent))
+from _validate_dataloader.config_inject import inject_validator_flags  # noqa: E402
+from _validate_dataloader.cut_id_dataset import CutIdDataset  # noqa: E402
+
+LOG = logging.getLogger(__name__)
+
+
+PHASE_BASELINE = "baseline"
+PHASE_RESUMED = "resumed"
+PHASE_GROUNDTRUTH = "groundtruth"
+
+
+@click.command(help=__doc__)
+@click.option("--config", "config_path", required=True, type=click.Path(exists=True))
+@click.option("--data-blend-dir", default=None,
+              help="Substituted into ${data_blend_dir} in the config.")
+@click.option("--section", default="train_ds", show_default=True)
+@click.option("--output-dir", required=True, type=click.Path())
+@click.option("--phase", type=click.Choice([PHASE_BASELINE, PHASE_RESUMED, PHASE_GROUNDTRUTH]),
+              required=True)
+@click.option("--run-idx", type=int, default=0, show_default=True,
+              help="Which determinism re-run this is. Only used with --phase=baseline.")
+@click.option("--steps", type=int, default=200, show_default=True,
+              help="Batches to iterate. Ignored in groundtruth phase (iterates until exhaustion).")
+@click.option("--checkpoint-at", type=int, default=-1, show_default=True,
+              help="Step index at which to save state in baseline phase. -1 = don't save.")
+@click.option("--state-dir", default=None, type=click.Path(),
+              help="In --phase=resumed: directory containing state_rank_NNN.pt files.")
+@click.option("--force-finite/--no-force-finite", default=True, show_default=True)
+@click.option("--metadata-only/--no-metadata-only", default=True, show_default=True)
+@click.option("--num-workers-override", type=int, default=None,
+              help="Override config.{section}.num_workers.")
+@click.option("--mode", type=click.Choice(["fast", "full"]), default="fast", show_default=True,
+              help="fast: CutIdDataset (default). full: stub-only in v1, raises.")
+@click.option("-v", "--verbose", is_flag=True, default=False)
+def cli(config_path: str, data_blend_dir: Optional[str], section: str, output_dir: str,
+        phase: str, run_idx: int, steps: int, checkpoint_at: int,
+        state_dir: Optional[str], force_finite: bool, metadata_only: bool,
+        num_workers_override: Optional[int], mode: str, verbose: bool) -> None:
+    if mode == "full":
+        raise click.ClickException("--mode=full is not implemented in v1; use --mode=fast.")
+
+    # Default the rank variables for non-torchrun launches, then export them back to the environment.
+    rank = int(os.environ.get("RANK", "0"))
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+    local_rank = int(os.environ.get("LOCAL_RANK", str(rank)))
+    os.environ["RANK"] = str(rank)
+    os.environ["WORLD_SIZE"] = str(world_size)
+    os.environ["LOCAL_RANK"] = str(local_rank)
+
+    logging.basicConfig(
+        level=logging.DEBUG if verbose else logging.INFO,
+        format=f"[rank{rank}/{world_size} %(asctime)s %(levelname)s] %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
+    if phase == PHASE_GROUNDTRUTH and world_size != 1:
+        raise click.ClickException(
+            f"--phase=groundtruth requires nproc-per-node=1 (got world_size={world_size})"
+        )
+    if phase == PHASE_RESUMED and state_dir is None:
+        # Fail fast: reject a missing --state-dir before the tokenizer/dataloader are built.
+        raise click.ClickException("--state-dir is required for --phase=resumed")
+
+    cfg = OmegaConf.load(config_path)
+    if data_blend_dir is not None:
+        cfg.data_blend_dir = data_blend_dir  # set before resolve() so ${data_blend_dir} references pick it up
+    OmegaConf.resolve(cfg)
+    section_cfg = cfg.data[section]
+
+    inject_validator_flags(section_cfg, force_finite=force_finite, metadata_only=metadata_only)
+    if num_workers_override is not None:
+        LOG.info("override num_workers: %s -> %s", section_cfg.get("num_workers"), num_workers_override)
+        section_cfg.num_workers = num_workers_override
+    # Groundtruth needs num_workers=0 so the single-process iteration enumerates everything.
+    if phase == PHASE_GROUNDTRUTH:
+        section_cfg.num_workers = 0
+        section_cfg.use_stateful_dataloader = False
+        section_cfg.force_map_dataset = True
+        LOG.info("groundtruth: forced num_workers=0, use_stateful_dataloader=False, force_map_dataset=True")
+
+    # Defer import until env vars and config injections are in place.
+    from nemo.collections.common.data.lhotse.dataloader import get_lhotse_dataloader_from_config
+
+    tokenizer = _build_tokenizer_if_needed(cfg, section_cfg)
+    dataset = CutIdDataset()
+    dataloader = get_lhotse_dataloader_from_config(
+        config=section_cfg,
+        global_rank=rank,
+        world_size=world_size,
+        dataset=dataset,
+        tokenizer=tokenizer,
+    )
+
+    if phase == PHASE_RESUMED:
+        _load_state(dataloader, state_dir=state_dir, rank=rank)
+
+    out_dir = Path(output_dir)
+    phase_dir = _phase_dir(out_dir, phase, run_idx)
+    phase_dir.mkdir(parents=True, exist_ok=True)
+
+    out_path = phase_dir / ("cuts.jsonl" if phase == PHASE_GROUNDTRUTH else f"rank_{rank:03d}.jsonl")
+
+    LOG.info("phase=%s run_idx=%d steps=%d checkpoint_at=%d -> %s",
+             phase, run_idx, steps, checkpoint_at, out_path)
+
+    t_total_samples: list[float] = []
+    t_first_batch_ms: Optional[float] = None
+    iter_t0 = time.monotonic_ns()
+    with open(out_path, "w") as fout:
+        for step, batch in enumerate(dataloader):
+            t_step_end = time.monotonic_ns()
+            if step == 0:
+                t_first_batch_ms = (t_step_end - iter_t0) / 1e6
+            t_total_ms = (t_step_end - iter_t0) / 1e6
+            iter_t0 = t_step_end  # next latency is measured from the arrival of this batch
+
+            # The first batch is left out of the throughput samples; it is reported as t_first_batch_ms.
+            if phase != PHASE_GROUNDTRUTH and step > 0:
+                t_total_samples.append(t_total_ms)
+
+            cut_ids, worker_id = _extract_cuts(batch)
+            row = {
+                "step": step,
+                "rank": rank,
+                "world_size": world_size,
+                "worker_id": worker_id,
+                "cut_ids": cut_ids,
+                "batch_size": len(cut_ids),
+                "t_total_ms": round(t_total_ms, 3),
+                "t_first_batch_ms": round(t_first_batch_ms, 3) if step == 0 else None,
+            }
+            fout.write(json.dumps(row) + "\n")
+
+            if step % 50 == 0:
+                LOG.info("step=%d cuts=%d t_total=%.1fms (first cut: %s)",
+                         step, len(cut_ids), t_total_ms,
+                         cut_ids[0] if cut_ids else "<empty>")
+
+            if phase == PHASE_BASELINE and step == checkpoint_at:
+                state_path = phase_dir / f"state_rank_{rank:03d}.pt"
+                LOG.info("saving state_dict at step=%d -> %s", step, state_path)
+                torch.save(dataloader.state_dict(), state_path)
+
+            if phase != PHASE_GROUNDTRUTH and step + 1 >= steps:
+                break
+
+    if phase == PHASE_BASELINE and run_idx == 0:
+        _write_throughput_summary(
+            phase_dir / f"throughput_rank_{rank:03d}.json",
+            t_total_samples=t_total_samples,
+            t_first_batch_ms=t_first_batch_ms,
+            num_workers=section_cfg.get("num_workers", 0),
+        )
+
+    LOG.info("DONE")
+
+
+# --------------------------------------------------------------------------- #
+# Helpers.
+# --------------------------------------------------------------------------- #
+
+
+def _phase_dir(output_dir: Path, phase: str, run_idx: int) -> Path:
+    # Groundtruth output sits directly under its phase directory; other phases add ``run<idx>``.
+    base = output_dir / phase
+    return base if phase == PHASE_GROUNDTRUTH else base / f"run{run_idx}"
+
+
+def _extract_cuts(batch) -> tuple[list[str], int]:
+    """Pull ``(cut_ids, worker_id)`` out of one batch emitted by ``CutIdDataset``.
+
+    Accepts cut ids either flat or nested one level in lists, and a worker id given
+    as an int, a list, or a single-element tensor. Non-dict batches give ``([], -1)``."""
+    if not isinstance(batch, dict):
+        # Fallback: unknown shape.
+        return [], -1
+    ids = batch.get("cut_ids", [])
+    wid = batch.get("worker_id", 0)
+    if ids and isinstance(ids[0], list):
+        # Collation may nest the ids one level deep; flatten them.
+        ids = [cid for group in ids for cid in group]
+    if isinstance(wid, list):
+        wid = int(wid[0]) if wid else 0
+    elif isinstance(wid, torch.Tensor):
+        wid = int(wid.item())
+    return [str(cid) for cid in ids], int(wid)
+
+
+def _build_tokenizer_if_needed(full_cfg, section_cfg):
+    """Build the tokenizer needed for length measurement under ``use_multimodal_sampling=True``.
+    Mirrors SALM's construction (``salm.py:66``) so token counts match production; returns
+    ``None`` when the section does not enable multimodal sampling."""
+    if not section_cfg.get("use_multimodal_sampling", False):
+        return None
+    model_cfg = full_cfg.get("model", {})
+    pretrained_llm = model_cfg.get("pretrained_llm")
+    if not pretrained_llm:
+        raise click.ClickException(
+            "use_multimodal_sampling=True requires model.pretrained_llm in the config to load a tokenizer."
+        )
+    from nemo.collections.common.tokenizers import AutoTokenizer
+    trust_remote_code = bool(model_cfg.get("trust_remote_code", False))
+    LOG.info("loading tokenizer for %s (trust_remote_code=%s)", pretrained_llm, trust_remote_code)
+    tokenizer = AutoTokenizer(pretrained_llm, use_fast=True, trust_remote_code=trust_remote_code)
+    audio_tag = model_cfg.get("audio_locator_tag")
+    if audio_tag:
+        tokenizer.add_special_tokens({"additional_special_tokens": [audio_tag]})
+    return tokenizer
+
+
+def _load_state(dataloader, *, state_dir: Optional[str], rank: int) -> None:  # restore this rank's saved state
+    if state_dir is None:
+        raise click.ClickException("--state-dir is required for --phase=resumed")
+    state_path = Path(state_dir) / f"state_rank_{rank:03d}.pt"  # same file name the baseline phase writes
+    if not state_path.exists():
+        raise click.ClickException(f"state file missing: {state_path}")
+    LOG.info("loading state_dict from %s", state_path)
+    # NOTE(review): weights_only=False performs a full unpickle — point --state-dir at trusted files only.
+    state = torch.load(state_path, map_location="cpu", weights_only=False)
+    dataloader.load_state_dict(state)
+
+
+def _write_throughput_summary(out_path: Path, *, t_total_samples: list[float],
+                              t_first_batch_ms: Optional[float], num_workers: int) -> None:
+    """Write p50/p95/mean batch latency (ms) plus first-batch latency as JSON to ``out_path``.
+
+    With no samples the percentile fields are ``None`` and ``count`` is 0.
+    """
+    # `is not None` (not truthiness) so a legitimately measured 0.0 ms is not reported as missing.
+    first_ms = round(t_first_batch_ms, 3) if t_first_batch_ms is not None else None
+    summary = {"p50_ms": None, "p95_ms": None, "mean_ms": None, "count": 0,
+               "t_first_batch_ms": first_ms, "num_workers": int(num_workers)}
+    if t_total_samples:
+        samples = sorted(t_total_samples)
+        summary.update({
+            "p50_ms": round(statistics.median(samples), 3),
+            # Floor-index percentile over the sorted samples.
+            "p95_ms": round(samples[int(0.95 * (len(samples) - 1))], 3),
+            "mean_ms": round(statistics.fmean(samples), 3),
+            "count": len(samples),
+        })
+    out_path.write_text(json.dumps(summary, indent=2))
+
+
+if __name__ == "__main__":
+    cli()
diff --git a/tests/collections/common/test_validate_dataloader.py b/tests/collections/common/test_validate_dataloader.py
new file mode 100644
index 000000000000..b2912e5e8527
--- /dev/null
+++ b/tests/collections/common/test_validate_dataloader.py
@@ -0,0 +1,334 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+"""Unit tests for ``scripts/dataloading/_validate_dataloader/{pre_validation,consolidate}``.
+
+These cover the parts that can run without a SLURM cluster or real
+Lhotse manifests:
+
+  * pre-validation static checks across hand-crafted config snippets
+  * consolidate() against synthesized JSONL rows (PASS / FAIL / SKIP)
+  * config_inject recursive walker
+"""
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+import pytest
+from omegaconf import OmegaConf
+
+# The validator lives under scripts/, which isn't on PYTHONPATH by default.
+REPO_ROOT = Path(__file__).resolve().parents[3]
+sys.path.insert(0, str(REPO_ROOT / "scripts" / "dataloading"))
+
+from _validate_dataloader import config_inject, consolidate as cons, pre_validation as pv  # noqa: E402
+
+
+# --------------------------------------------------------------------------- #
+# config_inject
+# --------------------------------------------------------------------------- #
+
+
+@pytest.mark.unit
+def test_config_inject_top_level_and_nested():
+    cfg = OmegaConf.create({
+        "input_cfg": [
+            {"type": "lhotse_as_conversation",
+             "input_cfg": [
+                 {"type": "lhotse_shar", "weight": 1.0},
+                 {"type": "nemo_tarred", "weight": 0.5},
+             ]},
+            {"type": "group",
+             "input_cfg": [{"type": "lhotse_shar", "weight": 0.3}]},
+        ],
+    })
+    config_inject.inject_validator_flags(cfg, force_finite=True, metadata_only=True)
+    assert cfg["force_finite"] is True
+    assert cfg["metadata_only"] is True
+    for transform in cfg["input_cfg"]:
+        assert transform["force_finite"] is True
+        assert transform["metadata_only"] is True
+        for leaf in transform["input_cfg"]:
+            assert leaf["force_finite"] is True
+            assert leaf["metadata_only"] is True
+
+
+@pytest.mark.unit
+def test_config_inject_preserves_existing_explicit_value():
+    cfg = OmegaConf.create({"input_cfg": [{"type": "lhotse_shar", "force_finite": False}]})
+    config_inject.inject_validator_flags(cfg, force_finite=True, metadata_only=False)
+    # Leaf had explicit override — preserve it.
+    assert cfg["input_cfg"][0]["force_finite"] is False
+
+
+# --------------------------------------------------------------------------- #
+# pre_validation
+# --------------------------------------------------------------------------- #
+
+
+def _base_cfg():
+    return OmegaConf.create({
+        "seed": 42,
+        "shard_seed": 42,
+        "use_stateful_dataloader": True,
+        "indexed": True,
+        "indexes_root": "/tmp/idx_does_not_exist_locally",
+        "use_bucketing": True,
+        "num_buckets": 20,
+        "bucket_buffer_size": 20000,
+        "force_map_dataset": False,
+        "text_field": "answer",
+        "input_cfg": [
+            {"type": "lhotse_as_conversation",
+             "input_cfg": [
+                 {"type": "lhotse_shar", "weight": 1.0, "corpus": "ami"},
+                 {"type": "nemo_tarred", "weight": 0.13, "corpus": "librilight",
+                  "text_field": "answer",
+                  "manifest_filepath": "s3://x/manifest__OP_0..15_CL_.jsonl",
+                  "tarred_audio_filepaths": "s3://x/audio__OP_0..15_CL_.tar"},
+             ]},
+        ],
+    })
+
+
+@pytest.mark.unit
+def test_pre_validation_passing_config():
+    report = pv.run_pre_validation(_base_cfg())
+    fails = [c for c in report.checks if c.status == pv.FAIL]
+    assert not fails, f"unexpected FAILs: {[(c.check_id, c.detail) for c in fails]}"
+
+
+@pytest.mark.unit
+def test_pre_validation_seed_int_fail():
+    cfg = _base_cfg()
+    cfg.seed = "randomized"
+    report = pv.run_pre_validation(cfg)
+    seed_check = next(c for c in report.checks if c.check_id == "seed-int")
+    assert seed_check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_shard_seed_int_fail():
+    cfg = _base_cfg()
+    cfg.shard_seed = "randomized"
+    report = pv.run_pre_validation(cfg)
+    shard_check = next(c for c in report.checks if c.check_id == "shard-seed-int")
+    assert shard_check.status == pv.FAIL
+    mux_check = next(c for c in report.checks if c.check_id == "mux-seed-not-randomized")
+    # force_map_dataset is False in base config, so this also fires.
+    assert mux_check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_stateful_off_fail():
+    cfg = _base_cfg()
+    cfg.use_stateful_dataloader = False
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "stateful-on")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_indexed_implies_root_fail():
+    cfg = _base_cfg()
+    cfg.indexes_root = None
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "indexed-implies-root")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_constant_time_leaves_fail_when_streaming():
+    cfg = _base_cfg()
+    cfg.indexed = False  # turns off propagation -> all leaves go streaming
+    cfg.indexes_root = None  # avoid the dependent indexed-implies-root failing on its own.
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "constant-time-leaves")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_constant_time_leaves_fail_for_map_style_too():
+    """User's correction: constant-time leaves are required for both
+    map (force_map_dataset=True) and iterable (force_map_dataset=False)."""
+    cfg = _base_cfg()
+    cfg.force_map_dataset = True
+    cfg.indexed = False
+    cfg.indexes_root = None
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "constant-time-leaves")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_slice_length_with_indexed_fail():
+    cfg = _base_cfg()
+    cfg["input_cfg"][0]["input_cfg"][0]["slice_length"] = 50
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "slice-length-vs-indexed")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_mux_weights_sum_fail():
+    cfg = _base_cfg()
+    cfg["input_cfg"][0]["input_cfg"][0]["weight"] = -1.0
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "mux-weights-sum")
+    assert check.status == pv.FAIL
+
+
+@pytest.mark.unit
+def test_pre_validation_ignore_fail_downgrades_to_warn():
+    cfg = _base_cfg()
+    cfg.seed = "randomized"
+    report = pv.run_pre_validation(cfg, ignore_fail=["seed-int"])
+    check = next(c for c in report.checks if c.check_id == "seed-int")
+    assert check.status == pv.WARN
+
+
+@pytest.mark.unit
+def test_pre_validation_bucketer_buffer_warn():
+    cfg = _base_cfg()
+    cfg.bucket_buffer_size = 50  # < 20 * 10
+    report = pv.run_pre_validation(cfg)
+    check = next(c for c in report.checks if c.check_id == "bucketer-buffer")
+    assert check.status == pv.WARN
+
+
+# --------------------------------------------------------------------------- #
+# consolidate
+# --------------------------------------------------------------------------- #
+
+
+def _write_jsonl(path: Path, rows: list[dict]):
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "w") as f:
+        for r in rows:
+            f.write(json.dumps(r) + "\n")
+
+
+def _row(rank, step, cut_ids, *, worker_id=0):
+    return {"step": step, "rank": rank, "world_size": 2, "worker_id": worker_id,
+            "cut_ids": cut_ids, "batch_size": len(cut_ids), "t_total_ms": 1.0,
+            "t_first_batch_ms": None}
+
+
+@pytest.mark.unit
+def test_consolidate_q1_q3_pass(tmp_path):
+    """Two ranks, disjoint cuts, no duplication."""
+    base = tmp_path / "baseline" / "run0"
+    _write_jsonl(base / "rank_000.jsonl", [
+        _row(0, 0, ["a", "b"]), _row(0, 1, ["c"]),
+    ])
+    _write_jsonl(base / "rank_001.jsonl", [
+        _row(1, 0, ["d", "e"]), _row(1, 1, ["f"]),
+    ])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q_by_id = {q.q_id: q for q in report.questions}
+    assert q_by_id["Q1"].status == cons.PASS
+    assert q_by_id["Q3"].status == cons.PASS
+
+
+@pytest.mark.unit
+def test_consolidate_q1_cross_rank_leak(tmp_path):
+    base = tmp_path / "baseline" / "run0"
+    _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, ["shared", "a"])])
+    _write_jsonl(base / "rank_001.jsonl", [_row(1, 0, ["shared", "b"])])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q1 = next(q for q in report.questions if q.q_id == "Q1")
+    assert q1.status == cons.FAIL
+    assert q1.tag == "partition-rank-leak"
+
+
+@pytest.mark.unit
+def test_consolidate_q3_full_broadcast(tmp_path):
+    """Every rank sees the same cuts → broadcast tag."""
+    base = tmp_path / "baseline" / "run0"
+    same = ["a", "b", "c"]
+    _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, same)])
+    _write_jsonl(base / "rank_001.jsonl", [_row(1, 0, same)])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q3 = next(q for q in report.questions if q.q_id == "Q3")
+    assert q3.status == cons.FAIL
+    assert "BROADCAST" in q3.detail
+
+
+@pytest.mark.unit
+def test_consolidate_q2_skip_without_groundtruth(tmp_path):
+    base = tmp_path / "baseline" / "run0"
+    _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, ["a"])])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q2 = next(q for q in report.questions if q.q_id == "Q2")
+    assert q2.status == cons.SKIP
+
+
+@pytest.mark.unit
+def test_consolidate_q2_skip_detects_missing(tmp_path):
+    base = tmp_path / "baseline" / "run0"
+    _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, ["a", "b"])])
+    _write_jsonl(tmp_path / "groundtruth" / "cuts.jsonl",
+                 [{"cut_ids": ["a", "b", "c"]}])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q2 = next(q for q in report.questions if q.q_id == "Q2")
+    assert q2.status == cons.FAIL
+    assert q2.tag == "skip"
+
+
+@pytest.mark.unit
+def test_consolidate_q4_resume_match(tmp_path):
+    """State is saved AFTER yielding baseline step ``checkpoint_at``, so
+    resumed[0] should match baseline[checkpoint_at + 1]."""
+    base = tmp_path / "baseline" / "run0"
+    res = tmp_path / "resumed" / "run0"
+    _write_jsonl(base / "rank_000.jsonl", [
+        _row(0, 0, ["a"]), _row(0, 1, ["b"]), _row(0, 2, ["c"]),
+    ])
+    # checkpoint_at=0 -> resumed[0] == baseline[1] == ["b"], resumed[1] == baseline[2] == ["c"]
+    _write_jsonl(res / "rank_000.jsonl", [
+        _row(0, 0, ["b"]), _row(0, 1, ["c"]),
+    ])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q4 = next(q for q in report.questions if q.q_id == "Q4")
+    assert q4.status == cons.PASS
+
+
+@pytest.mark.unit
+def test_consolidate_q4_resume_diverges(tmp_path):
+    base = tmp_path / "baseline" / "run0"
+    res = tmp_path / "resumed" / "run0"
+    _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, ["a"]), _row(0, 1, ["b"]), _row(0, 2, ["c"])])
+    # checkpoint_at=0 -> resumed[0] should == baseline[1] == ["b"], but it's "DIFFERENT".
+    _write_jsonl(res / "rank_000.jsonl", [_row(0, 0, ["DIFFERENT"])])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
+    q4 = next(q for q in report.questions if q.q_id == "Q4")
+    assert q4.status == cons.FAIL
+    assert q4.tag == "resume-rng-divergence"
+
+
+@pytest.mark.unit
+def test_consolidate_q5_determinism_match(tmp_path):
+    for run in ("run0", "run1"):
+        _write_jsonl(tmp_path / "baseline" / run / "rank_000.jsonl",
+                     [_row(0, 0, ["a"]), _row(0, 1, ["b"])])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=2)
+    q5 = next(q for q in report.questions if q.q_id == "Q5")
+    assert q5.status == cons.PASS
+
+
+@pytest.mark.unit
+def test_consolidate_q5_determinism_diverges(tmp_path):
+    _write_jsonl(tmp_path / "baseline" / "run0" / "rank_000.jsonl",
+                 [_row(0, 0, ["a"])])
+    _write_jsonl(tmp_path / "baseline" / "run1" / "rank_000.jsonl",
+                 [_row(0, 0, ["DIFFERENT"])])
+    report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=2)
+    q5 = next(q for q in report.questions if q.q_id == "Q5")
+    assert q5.status == cons.FAIL
+    assert q5.tag == "non-determinism"

From 3e8dab88596fbd77a653eaccbcc009fbf1b3f4b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Wed, 27 May 2026 20:30:35 -0400
Subject: [PATCH 13/30] dataloader checkpoints save/load correct per-rank
 information; stateless timer pre-emption patch (willfix)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../common/data/lhotse/dataloader.py          | 154 ++++++++++++-
 .../common/data/lhotse/indexed_adapters.py    | 126 +----------
 nemo/collections/speechlm2/data/datamodule.py |  75 ++-----
 nemo/core/utils/lightning_utils.py            |  39 +++-
 .../test_lhotse_per_rank_stateful_loader.py   | 202 ++++++++++++++++++
 5 files changed, 425 insertions(+), 171 deletions(-)
 create mode 100644 tests/collections/common/test_lhotse_per_rank_stateful_loader.py

diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py
index b0bc32ba32c2..93823a549379 100644
--- a/nemo/collections/common/data/lhotse/dataloader.py
+++ b/nemo/collections/common/data/lhotse/dataloader.py
@@ -281,18 +281,166 @@ def determine_use_iterable_dataset(use_iterable_dataset: bool, config: DictConfi
     return use_iterable_dataset
 
 
-def _build_dataloader(use_stateful_dataloader: bool, **kwargs) -> torch.utils.data.DataLoader:
+def _build_dataloader(
+    use_stateful_dataloader: bool,
+    *,
+    dp_rank: Optional[int] = None,
+    dp_world_size: Optional[int] = None,
+    dp_group: Optional[Any] = None,
+    **kwargs,
+) -> torch.utils.data.DataLoader:
     """
     Construct a DataLoader, optionally using ``torchdata.stateful_dataloader.StatefulDataLoader``
     so that resume picks up at the exact next batch via ``state_dict()`` / ``load_state_dict()``.
+
+    When ``dp_rank`` / ``dp_world_size`` are provided AND we're building a
+    stateful loader under multi-rank training, wrap ``StatefulDataLoader`` in
+    :class:`_PerRankStatefulDataLoader`. The wrapper all-gathers each rank's
+    local state at save time and scatters back the right entry at load time,
+    so Lightning's automatic ``FitLoop`` save-and-restore of
+    ``CombinedLoader._state_dicts()`` doesn't broadcast rank-0's iterator
+    state to every rank (which would corrupt per-shard partitioning — see
+    the 2026-05-14 post-mortem).
     """
     if use_stateful_dataloader:
         from torchdata.stateful_dataloader import StatefulDataLoader
 
+        if dp_world_size is not None and dp_world_size > 1:
+            return _PerRankStatefulDataLoader(
+                dp_rank=dp_rank if dp_rank is not None else 0,
+                dp_world_size=dp_world_size,
+                dp_group=dp_group,
+                **kwargs,
+            )
         return StatefulDataLoader(**kwargs)
     return torch.utils.data.DataLoader(**kwargs)
 
 
+class _PerRankStatefulDataLoader:
+    """``StatefulDataLoader`` whose ``state_dict`` is a per-rank list.
+
+    Why this exists: Lightning's ``FitLoop`` saves dataloader state via
+    ``CombinedLoader._state_dicts()`` → ``loader.state_dict()`` (collective
+    across ranks but only rank 0's return value is persisted to meta.pt),
+    then on resume calls ``loader.load_state_dict(state)`` on EVERY rank with
+    that single rank-0-only state. Per-shard partitioning (``shard_id =
+    dp_rank * num_workers + worker_id`` inside lhotse's
+    ``PartitionedIndexedIterator``) then desynchronises — rank 28 worker 0
+    loads rank 0 worker 0's ``shard_id=0`` while its own current shard_id is
+    112, the iterator's first ``iterate()`` call raises ValueError, and the
+    rest of the ranks get SIGTERMed via ``srun --kill-on-bad-exit=1``. (See
+    ``agent-debug-workspace/0909-en-only-id2-4node-postfix/DIAGNOSIS_ORD_vs_IAD.md``.)
+
+    The fix turns ``state_dict()`` into a per-rank gather and
+    ``load_state_dict(state)`` into a per-rank scatter. The serialised payload
+    on disk becomes a list of N tagged state dicts (one per DP rank); on
+    every rank, the wrapper picks ``per_rank[self._dp_rank]``. This works
+    whether the call comes from Lightning's automatic FitLoop path OR from
+    our DataModule.load_state_dict override, because both go through this
+    one method.
+
+    We delegate to a contained ``StatefulDataLoader`` rather than subclass
+    it: subclassing would inherit ``_Stateful`` via the runtime-checkable
+    Protocol AND every attribute Lightning's iterator-management code
+    introspects (``flattened``, ``persistent_workers``, etc.), which is what
+    we want; but it would also inherit ``__init__`` whose signature includes
+    parameters we don't want at this layer. Composition keeps the wrapper's
+    constructor clean and lets us forward attribute lookups via
+    ``__getattr__``.
+    """
+
+    def __init__(
+        self,
+        *,
+        dp_rank: int,
+        dp_world_size: int,
+        dp_group: Optional[Any] = None,
+        **kwargs,
+    ) -> None:
+        from torchdata.stateful_dataloader import StatefulDataLoader
+
+        self._dp_rank = int(dp_rank)
+        self._dp_world_size = int(dp_world_size)
+        self._dp_group = dp_group
+        self._inner = StatefulDataLoader(**kwargs)
+
+    def state_dict(self) -> dict:
+        local_state = self._inner.state_dict()
+        tagged = {
+            "dp_rank": self._dp_rank,
+            "dp_world_size": self._dp_world_size,
+            "state": local_state,
+        }
+        if self._dp_world_size <= 1 or not (
+            torch.distributed.is_available() and torch.distributed.is_initialized()
+        ):
+            per_rank = [tagged]
+        else:
+            per_rank: List[Optional[dict]] = [None] * self._dp_world_size
+            torch.distributed.all_gather_object(per_rank, tagged, group=self._dp_group)
+        return {"train_dataloader_per_rank": per_rank}
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        if not state_dict:
+            return
+        # We exclusively support the per-rank wire format produced by our
+        # own ``state_dict()``. Anything else — a bare inner state, a
+        # rank-0-only StatefulDataLoader payload (the shape Lightning's
+        # FitLoop used to broadcast and silently corrupt resume), an old
+        # DataModule key — must fail loudly so any partial-rollforward or
+        # checkpoint-format mismatch is caught at load time rather than
+        # producing wrong data several minutes into training.
+        if "train_dataloader_per_rank" not in state_dict:
+            raise RuntimeError(
+                "PerRankStatefulDataLoader.load_state_dict: state must use "
+                "the per-rank wire format (top-level key "
+                "'train_dataloader_per_rank'); got keys "
+                f"{sorted(state_dict.keys())}. This dataloader only supports "
+                "states produced by its own state_dict()."
+            )
+        per_rank = state_dict["train_dataloader_per_rank"]
+        if not isinstance(per_rank, list) or len(per_rank) != self._dp_world_size:
+            raise RuntimeError(
+                f"PerRankStatefulDataLoader: state has dp_world_size="
+                f"{len(per_rank) if isinstance(per_rank, list) else 'unknown'} "
+                f"but the current run has dp_world_size={self._dp_world_size}."
+            )
+        entry = per_rank[self._dp_rank]
+        if (
+            not isinstance(entry, dict)
+            or "state" not in entry
+            or "dp_rank" not in entry
+            or "dp_world_size" not in entry
+        ):
+            raise RuntimeError(
+                f"PerRankStatefulDataLoader: malformed per-rank entry at index "
+                f"{self._dp_rank}: expected keys {{'dp_rank', 'dp_world_size', "
+                f"'state'}}, got {list(entry.keys()) if isinstance(entry, dict) else type(entry).__name__}."
+            )
+        saved_rank, saved_world = entry["dp_rank"], entry["dp_world_size"]
+        if saved_rank != self._dp_rank or saved_world != self._dp_world_size:
+            raise RuntimeError(
+                f"PerRankStatefulDataLoader: state tagged (dp_rank={saved_rank}, "
+                f"dp_world_size={saved_world}) loaded on (dp_rank={self._dp_rank}, "
+                f"dp_world_size={self._dp_world_size})."
+            )
+        self._inner.load_state_dict(entry["state"])
+
+    # Forward everything else to the inner StatefulDataLoader so Lightning's introspection keeps working.
+    def __getattr__(self, name: str) -> Any:
+        # Only fires when normal lookup fails. ``_inner`` is unset on instances
+        # built without ``__init__`` (copy / unpickle); raise instead of recursing.
+        if name == "_inner":
+            raise AttributeError(name)
+        return getattr(self._inner, name)
+
+    def __iter__(self):
+        return iter(self._inner)
+
+    def __len__(self):
+        return len(self._inner)
+
+
 def _maybe_init_main_process_for_iterable(num_workers: int, global_rank: int, world_size: int, seed: int) -> None:
     """When ``num_workers == 0`` the iterable-path sampler runs in the main training
     process; PyTorch's DataLoader never invokes ``worker_init_fn`` in that case.
@@ -413,6 +561,8 @@ def get_lhotse_dataloader_from_single_config(
         dloader_kwargs = dict(dataset=dataset, sampler=sampler)
     dloader = _build_dataloader(
         use_stateful_dataloader=config.use_stateful_dataloader,
+        dp_rank=global_rank,
+        dp_world_size=world_size,
         **dloader_kwargs,
         batch_size=None,
         num_workers=config.num_workers,
@@ -546,6 +696,8 @@ def gather_shared_opts():
         dloader_kwargs = dict(dataset=dataset, sampler=sampler)
     dloader = _build_dataloader(
         use_stateful_dataloader=shared_opts.use_stateful_dataloader,
+        dp_rank=global_rank,
+        dp_world_size=world_size,
         **dloader_kwargs,
         batch_size=None,
         num_workers=shared_opts.num_workers,
diff --git a/nemo/collections/common/data/lhotse/indexed_adapters.py b/nemo/collections/common/data/lhotse/indexed_adapters.py
index 15b6b0f5ccae..1e623d4ee765 100644
--- a/nemo/collections/common/data/lhotse/indexed_adapters.py
+++ b/nemo/collections/common/data/lhotse/indexed_adapters.py
@@ -38,130 +38,24 @@ def _is_remote_path(path) -> bool:
     return bool(_URL_RE.match(str(path)))
 
 
-class _AISRangeReader:
-    """
-    Pseudo file-like object backed by AIStore HTTP byte-range reads.
-
-    Translates ``seek()`` + ``read(n)`` into ``Object.get_reader(byte_range=…)``
-    requests so the indexed-tar readers can do random access into ``s3://`` /
-    ``ais://`` archives the same way they would into a local file. Each
-    ``read()`` corresponds to one HTTP range request, which AIStore serves in
-    O(1); the index already tells us exactly which byte ranges we need (one
-    per tar member or sample), so the request count per training sample is
-    small and bounded.
-
-    The aistore SDK is imported lazily so ``indexed_adapters`` doesn't have to
-    take a hard dependency on it for local-only code paths.
-
-    Notes
-    -----
-    * ``seek()`` accepts whence ∈ {0, 1, 2}; for whence=2 the file size
-      already known via ``Object.props.size`` is used, so no extra HTTP call
-      is needed.
-    * The instance is **not** safe to share across threads — pickling support
-      drops the cached ``_obj`` so per-worker processes re-resolve the URL
-      after fork.
-    """
-
-    def __init__(self, url: str):
-        # Defer the aistore import — pure-local installs don't need it.
-        from aistore import Client  # noqa: F401  (presence-check only)
-
-        self._url = url
-        self._obj = None
-        self._size: Optional[int] = None
-        self._pos = 0
-
-    def _ensure_obj(self):
-        if self._obj is not None:
-            return
-        # Same client/env wiring as ``lhotse.serialization.AIStoreIOBackend``
-        # — import locally so build_indexes / training don't require lhotse
-        # for non-remote files.
-        from lhotse.serialization import get_aistore_client
-
-        client, _version = get_aistore_client()
-        self._obj = client.get_object_from_url(self._url)
-        self._size = int(self._obj.props.size)
-
-    @property
-    def size(self) -> int:
-        self._ensure_obj()
-        return self._size  # type: ignore[return-value]
-
-    def seekable(self) -> bool:
-        return True
-
-    def readable(self) -> bool:
-        return True
-
-    def seek(self, offset: int, whence: int = 0) -> int:
-        if whence == 0:
-            self._pos = int(offset)
-        elif whence == 1:
-            self._pos += int(offset)
-        elif whence == 2:
-            self._pos = self.size + int(offset)
-        else:
-            raise ValueError(f"Unsupported whence: {whence}")
-        return self._pos
-
-    def tell(self) -> int:
-        return self._pos
-
-    def read(self, n: int = -1) -> bytes:
-        self._ensure_obj()
-        if self._pos >= self._size:
-            return b""
-        if n == 0:
-            return b""
-        if n < 0:
-            end_inclusive = self._size - 1
-        else:
-            end_inclusive = min(self._pos + n - 1, self._size - 1)
-        if end_inclusive < self._pos:
-            return b""
-        # AIStore expects the HTTP Range syntax: ``bytes=START-END`` with
-        # END inclusive. ``read_all()`` drains the entire response into bytes.
-        byte_range = f"bytes={self._pos}-{end_inclusive}"
-        reader = self._obj.get_reader(byte_range=byte_range)
-        data = reader.read_all()
-        self._pos += len(data)
-        return data
-
-    def close(self) -> None:
-        self._obj = None
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, *exc):
-        self.close()
-
-    def __getstate__(self):
-        # Drop the resolved AIStore Object handle so a forked DataLoader
-        # worker re-creates it lazily against the worker's own connection
-        # pool / HTTP session.
-        return {"_url": self._url, "_pos": 0, "_obj": None, "_size": None}
-
-    def __setstate__(self, state):
-        self.__dict__.update(state)
-
-
 def _open_data_path(path: str):
     """
     Return a seekable file-like for *path*, suitable for the indexed
     tar readers' ``self._fh`` slot.
 
     Local paths get a regular ``open(path, "rb")``. URL/URI paths return an
-    :class:`_AISRangeReader` that turns ``seek + read`` into AIStore HTTP
-    range requests. Other URL schemes (``http://``, ``gs://``, …) currently
-    fall through to ``_AISRangeReader`` as well — the aistore SDK is the only
-    seekable remote backend lhotse exposes today; if a future backend gains a
-    seekable wrapper, dispatch here.
+    :class:`lhotse.ais.AISRangeReader` (imported from lhotse to keep the
+    seekable-AIS wrapper as a single source of truth shared with
+    :func:`lhotse.indexing._open_for_indexed_read`). Other URL schemes
+    (``http://``, ``gs://``, …) currently fall through to ``AISRangeReader``
+    as well — the aistore SDK is the only seekable remote backend lhotse
+    exposes today; if a future backend gains a seekable wrapper, dispatch
+    here.
     """
     if _is_remote_path(path):
-        return _AISRangeReader(str(path))
+        from lhotse.ais import AISRangeReader
+
+        return AISRangeReader(str(path))
     return open(path, "rb")
 
 
diff --git a/nemo/collections/speechlm2/data/datamodule.py b/nemo/collections/speechlm2/data/datamodule.py
index 385ff1fb0247..7505b298510f 100644
--- a/nemo/collections/speechlm2/data/datamodule.py
+++ b/nemo/collections/speechlm2/data/datamodule.py
@@ -83,59 +83,28 @@ def train_dataloader(self):
             )
         return self._train_dl
 
-    def state_dict(self) -> dict:
-        # Each DP rank has its own dataloader state (different cuts partition, different
-        # per-worker RNG positions). all_gather across the DP group so the rank-0 meta.pt
-        # that Lightning writes contains every rank's state, keyed by dp_rank.
-        if self._train_dl is None or not hasattr(self._train_dl, "state_dict"):
-            return {}
-        local_state = self._train_dl.state_dict()
-        rank = self._get_dp_rank()
-        world = self._get_world_size()
-        tagged = {"dp_rank": rank, "dp_world_size": world, "state": local_state}
-        if world <= 1 or not (torch.distributed.is_available() and torch.distributed.is_initialized()):
-            per_rank = [tagged]
-        else:
-            group = self._get_dp_group()
-            per_rank = [None] * world
-            torch.distributed.all_gather_object(per_rank, tagged, group=group)
-        return {"train_dataloader_per_rank": per_rank}
-
-    def load_state_dict(self, state_dict: dict) -> None:
-        # Mirrors state_dict: we expect a per-DP-rank list and consume the slot that
-        # matches our current (dp_rank, dp_world_size). Any other shape is a bug.
-        if not state_dict:
-            return
-        if "train_dataloader_per_rank" not in state_dict:
-            raise RuntimeError(
-                "DataModule.load_state_dict: expected 'train_dataloader_per_rank' in "
-                f"state_dict, got keys {list(state_dict.keys())}."
-            )
-        per_rank = state_dict["train_dataloader_per_rank"]
-        rank = self._get_dp_rank()
-        world = self._get_world_size()
-        if not isinstance(per_rank, list) or len(per_rank) != world:
-            raise RuntimeError(
-                f"DataModule state has dp_world_size="
-                f"{len(per_rank) if isinstance(per_rank, list) else 'unknown'} but the "
-                f"current run has dp_world_size={world}."
-            )
-        entry = per_rank[rank]
-        if not isinstance(entry, dict) or "state" not in entry or "dp_rank" not in entry or "dp_world_size" not in entry:
-            raise RuntimeError(
-                f"Malformed per-rank dataloader state at index {rank}: expected keys "
-                f"{{'dp_rank', 'dp_world_size', 'state'}}, got "
-                f"{list(entry.keys()) if isinstance(entry, dict) else type(entry).__name__}."
-            )
-        saved_rank, saved_world = entry["dp_rank"], entry["dp_world_size"]
-        if saved_rank != rank or saved_world != world:
-            raise RuntimeError(
-                f"Dataloader state tagged (dp_rank={saved_rank}, dp_world_size={saved_world}) "
-                f"loaded on (dp_rank={rank}, dp_world_size={world})."
-            )
-        dl = self.train_dataloader()
-        if dl is not None and hasattr(dl, "load_state_dict"):
-            dl.load_state_dict(entry["state"])
+    # state_dict / load_state_dict are intentionally NOT overridden.
+    #
+    # Per-rank dataloader state is now produced and consumed by
+    # ``_PerRankStatefulDataLoader`` (in
+    # ``nemo.collections.common.data.lhotse.dataloader``). The wrapper's
+    # ``state_dict`` all-gathers across DP ranks and its ``load_state_dict``
+    # picks the entry matching the current rank. Lightning's ``FitLoop``
+    # already round-trips ``CombinedLoader._state_dicts()`` through
+    # ``loader.state_dict()`` / ``loader.load_state_dict()`` on every rank,
+    # so the wrapper alone is sufficient to keep per-rank shard partitioning
+    # synchronised on resume.
+    #
+    # Historically this class also gathered+scattered the state at the
+    # DataModule level. That worked for the save, but on load, Lightning's
+    # automatic ``FitLoop._load_combined_loader_states`` fired AFTER
+    # ``restore_datamodule`` and overwrote our per-rank load with the
+    # rank-0-only state captured under ``loops.fit_loop.state_dict.combined_loader``
+    # — every non-zero rank's iterator ended up with ``shard_id=0`` (the
+    # rank-0 worker-0 value) and ``PartitionedIndexedIterator.iterate``
+    # raised ``topology mismatch on resume`` ~14 min into training. See
+    # ``agent-debug-workspace/0909-en-only-id2-4node-postfix/DIAGNOSIS_ORD_vs_IAD.md``
+    # for the full post-mortem.
 
     def val_dataloader(self):
         if "validation_ds" not in self.cfg:
diff --git a/nemo/core/utils/lightning_utils.py b/nemo/core/utils/lightning_utils.py
index 77c88942ac9f..96496d1a415e 100644
--- a/nemo/core/utils/lightning_utils.py
+++ b/nemo/core/utils/lightning_utils.py
@@ -30,6 +30,15 @@ def read_batch(dataloader_iter: Iterator, model: pl.LightningModule) -> Tuple[An
     dataloader past the saved snapshot point and giving the resumed run a
     one-batch drift versus the continuous run.
 
+    Also force-fires StatelessTimer's preempt check before pulling the next
+    batch. Under ``dataloader_iter`` flavor, Lightning's on_train_batch_end
+    callback dispatch silently fails to invoke StatelessTimer (likely related
+    to the "unforeseen effects on callbacks" warning Lightning emits for this
+    flavor) — so the SLURM walltime preempt-save + graceful-exit path never
+    fires, and each chunk's mid-epoch progress is lost to SIGKILL. Calling
+    ``StatelessTimer._check_time_remaining`` directly here restores the same
+    behavior the ``(batch, batch_idx)`` flavor has out of the box.
+
     Args:
         dataloader_iter: The iterator passed by Lightning into a
             ``training_step(self, dataloader_iter)`` (an instance of
@@ -41,9 +50,37 @@ def read_batch(dataloader_iter: Iterator, model: pl.LightningModule) -> Tuple[An
         ``(batch, batch_idx)`` — batch is already converted to the right
         precision and moved to the model's device, ready for forward.
     """
-    batch, batch_idx, dataloader_idx = next(dataloader_iter)
     trainer = model.trainer
+    _force_fire_stateless_timer(trainer)
+    batch, batch_idx, dataloader_idx = next(dataloader_iter)
     batch = trainer.precision_plugin.convert_input(batch)
     batch = model._on_before_batch_transfer(batch, dataloader_idx=dataloader_idx)
     batch = trainer.strategy.batch_to_device(batch, dataloader_idx=dataloader_idx)
     return batch, batch_idx
+
+
+def _force_fire_stateless_timer(trainer: pl.Trainer) -> None:
+    """Invoke ``StatelessTimer._check_time_remaining`` directly.
+
+    Workaround for Lightning's ``dataloader_iter`` step flavor: in that mode,
+    StatelessTimer's ``on_train_batch_end`` does not reliably fire, so neither
+    the time-elapsed check nor the preempt-save+exit path runs. Calling
+    ``_check_time_remaining`` from inside the user-owned training_step makes
+    the preempt behavior identical to what ``(batch, batch_idx)`` flavor
+    gets via Lightning's standard callback flow.
+
+    Idempotent on the time-not-yet-up case (cheap: one ``time_elapsed()``
+    check + one comparison). On the time-up case, ``StatelessTimer`` saves a
+    ``-last.ckpt`` via ``NeMoModelCheckpoint._save_last_checkpoint`` and
+    raises ``_TunerExitException`` to exit Lightning gracefully — that
+    exception propagates up through ``read_batch`` → ``training_step`` →
+    Lightning's epoch loop, which Lightning treats as a clean stop.
+    """
+    # Local import to avoid a circular import at module load time
+    # (exp_manager imports from various nemo submodules).
+    from nemo.utils.exp_manager import StatelessTimer
+
+    for cb in trainer.callbacks:
+        if isinstance(cb, StatelessTimer):
+            cb._check_time_remaining(trainer)
+            return
diff --git a/tests/collections/common/test_lhotse_per_rank_stateful_loader.py b/tests/collections/common/test_lhotse_per_rank_stateful_loader.py
new file mode 100644
index 000000000000..ab2f70c715aa
--- /dev/null
+++ b/tests/collections/common/test_lhotse_per_rank_stateful_loader.py
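@@ -0,0 +1,208 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.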
+"""Regression tests for ``_PerRankStatefulDataLoader``.
+
+The wrapper exists because Lightning's ``FitLoop`` saves
+``CombinedLoader._state_dicts()`` (which captures rank-0's
+``StatefulDataLoader.state_dict()`` only) and replays it on every rank on
+resume — broadcasting rank-0's iterator state to every rank and corrupting
+per-shard partitioning. The wrapper intercepts that pipeline so the saved
+payload is a per-rank list and the load picks the right entry.
+
+These tests intentionally do not spin up torch.distributed; the
+all_gather path is the trivial 1-rank fallback. The
+:func:`test_load_picks_correct_rank_entry` test simulates the multi-rank
+case by handing the wrapper an externally-built per-rank state dict and
+asserting the right inner state lands on the inner loader (proxied by a
+stub that records what ``load_state_dict`` was called with).
+"""
+
+from __future__ import annotations
+
+import sys
+import types
+
+import pytest
+
+from nemo.collections.common.data.lhotse.dataloader import _PerRankStatefulDataLoader
+
+
+class _StubStatefulDataLoader:
+    """Stand-in for ``torchdata.stateful_dataloader.StatefulDataLoader``.
+
+    The wrapper's tests only need ``state_dict()`` and ``load_state_dict()``
+    to be observable; they don't care about iteration. We install this stub
+    as the ``StatefulDataLoader`` import inside the wrapper module so the
+    test runs without ``torchdata`` and stays focused on the gather/scatter
+    logic we own.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        self.kwargs = kwargs
+        self._state: dict = {"position": 0, "shard_id": None}
+        self.load_calls: list[dict] = []
+
+    def state_dict(self) -> dict:
+        return dict(self._state)
+
+    def load_state_dict(self, state_dict: dict) -> None:
+        # record the call so tests can assert what was applied.
+        self.load_calls.append(state_dict)
+        self._state.update(state_dict)
+
+
+@pytest.fixture(autouse=True)
+def _patch_stateful_loader(monkeypatch):
+    """Make ``from torchdata.stateful_dataloader import StatefulDataLoader``
+    inside the wrapper resolve to our stub."""
+    fake_module = types.ModuleType("torchdata.stateful_dataloader")
+    fake_module.StatefulDataLoader = _StubStatefulDataLoader
+    fake_pkg = types.ModuleType("torchdata")
+    fake_pkg.stateful_dataloader = fake_module
+    monkeypatch.setitem(sys.modules, "torchdata", fake_pkg)
+    monkeypatch.setitem(sys.modules, "torchdata.stateful_dataloader", fake_module)
+
+
+def _new_wrapper(dp_rank: int, dp_world_size: int) -> _PerRankStatefulDataLoader:
+    return _PerRankStatefulDataLoader(
+        dp_rank=dp_rank,
+        dp_world_size=dp_world_size,
+        # the stub only stores constructor kwargs and never acts on them; we
+        # pass something representative so the call signature mirrors real usage.
+        dataset=object(),
+        num_workers=4,
+    )
+
+
+def test_state_dict_single_rank_wraps_with_per_rank_list():
+    dl = _new_wrapper(dp_rank=0, dp_world_size=1)
+    dl._inner._state = {"position": 42, "shard_id": 0}
+
+    sd = dl.state_dict()
+
+    assert list(sd.keys()) == ["train_dataloader_per_rank"]
+    per_rank = sd["train_dataloader_per_rank"]
+    assert isinstance(per_rank, list) and len(per_rank) == 1
+    assert per_rank[0] == {
+        "dp_rank": 0,
+        "dp_world_size": 1,
+        "state": {"position": 42, "shard_id": 0},
+    }
+
+
+def test_load_state_dict_single_rank_unwraps_and_applies():
+    dl = _new_wrapper(dp_rank=0, dp_world_size=1)
+
+    dl.load_state_dict(
+        {
+            "train_dataloader_per_rank": [
+                {"dp_rank": 0, "dp_world_size": 1, "state": {"position": 99, "shard_id": 0}},
+            ]
+        }
+    )
+
+    assert dl._inner.load_calls == [{"position": 99, "shard_id": 0}]
+
+
+def test_load_picks_correct_rank_entry():
+    """Hand a 32-rank per_rank list to a wrapper bound to rank 28; assert
+    the inner loader receives rank 28's entry only.
+
+    This is the regression test for the 2026-05-14 LOAD-side bug: Lightning's
+    FitLoop replays the saved state on every rank, and historically rank 28
+    ended up applying rank 0's worker-0 state because the broadcast came
+    from FitLoop AFTER our DataModule's per-rank load. With the wrapper,
+    even the FitLoop's broadcast goes through the right per-rank scatter.
+    """
+    world = 32
+    rank = 28
+    per_rank = [
+        {
+            "dp_rank": r,
+            "dp_world_size": world,
+            "state": {"position": 100 + r, "shard_id": r * 4},
+        }
+        for r in range(world)
+    ]
+
+    dl = _new_wrapper(dp_rank=rank, dp_world_size=world)
+    dl.load_state_dict({"train_dataloader_per_rank": per_rank})
+
+    assert len(dl._inner.load_calls) == 1
+    applied = dl._inner.load_calls[0]
+    assert applied == {"position": 100 + rank, "shard_id": rank * 4}, (
+        "Wrapper must consume per_rank[self._dp_rank] — bug would manifest "
+        "as applying per_rank[0] (rank-0 broadcast collapse)."
+    )
+
+
+def test_load_rejects_world_size_mismatch():
+    dl = _new_wrapper(dp_rank=0, dp_world_size=32)
+    with pytest.raises(RuntimeError, match="dp_world_size"):
+        dl.load_state_dict(
+            {
+                "train_dataloader_per_rank": [
+                    {"dp_rank": 0, "dp_world_size": 4, "state": {}},
+                    {"dp_rank": 1, "dp_world_size": 4, "state": {}},
+                    {"dp_rank": 2, "dp_world_size": 4, "state": {}},
+                    {"dp_rank": 3, "dp_world_size": 4, "state": {}},
+                ]
+            }
+        )
+
+
+def test_load_rejects_tag_mismatch():
+    dl = _new_wrapper(dp_rank=0, dp_world_size=2)
+    with pytest.raises(RuntimeError, match=r"tagged \(dp_rank=1"):
+        dl.load_state_dict(
+            {
+                "train_dataloader_per_rank": [
+                    # the entry at index 0 claims to be rank 1 — must reject.
+                    {"dp_rank": 1, "dp_world_size": 2, "state": {}},
+                    {"dp_rank": 1, "dp_world_size": 2, "state": {}},
+                ]
+            }
+        )
+
+
+def test_load_rejects_bare_inner_state():
+    """Strict wire format: a state dict without the
+    ``train_dataloader_per_rank`` top-level key is rejected. This guards
+    against the legacy code path (``DataModule.load_state_dict`` calling
+    ``dl.load_state_dict(entry["state"])`` with the raw inner state) and
+    against Lightning's FitLoop broadcasting rank-0's
+    ``StatefulDataLoader.state_dict()`` — both would otherwise look like
+    valid bare inner state and produce wrong, silently-corrupt resumes.
+    """
+    dl = _new_wrapper(dp_rank=0, dp_world_size=1)
+
+    with pytest.raises(RuntimeError, match="train_dataloader_per_rank"):
+        dl.load_state_dict({"position": 7, "shard_id": 0})
+
+    # an inner-shaped state (with ``_snapshot._worker_snapshots`` etc.) —
+    # what Lightning's FitLoop used to feed back — must be rejected too.
+    with pytest.raises(RuntimeError, match="train_dataloader_per_rank"):
+        dl.load_state_dict(
+            {
+                "_iterator_finished": False,
+                "_snapshot": {"_worker_snapshots": {"worker_0": {}}},
+                "_steps_since_snapshot": 0,
+            }
+        )
+
+
+def test_empty_state_is_a_noop():
+    dl = _new_wrapper(dp_rank=0, dp_world_size=1)
+    dl.load_state_dict({})
+    assert dl._inner.load_calls == []

From 563a81e0e584c061d5637cc347d3746c74908b8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Wed, 3 Jun 2026 06:56:17 -0700
Subject: [PATCH 14/30] Fix dataloader_iter resumability on preemption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 nemo/core/utils/lightning_utils.py            |  97 +++++-
 nemo/utils/callbacks/preemption.py            |  22 +-
 nemo/utils/exp_manager.py                     |  98 ++++++
 .../test_resumable_dataloader_iter.py         | 303 ++++++++++++++++++
 4 files changed, 493 insertions(+), 27 deletions(-)
 create mode 100644 tests/core_ptl/test_resumable_dataloader_iter.py

diff --git a/nemo/core/utils/lightning_utils.py b/nemo/core/utils/lightning_utils.py
index 96496d1a415e..b55f03769998 100644
--- a/nemo/core/utils/lightning_utils.py
+++ b/nemo/core/utils/lightning_utils.py
@@ -30,14 +30,12 @@ def read_batch(dataloader_iter: Iterator, model: pl.LightningModule) -> Tuple[An
     dataloader past the saved snapshot point and giving the resumed run a
     one-batch drift versus the continuous run.
 
-    Also force-fires StatelessTimer's preempt check before pulling the next
-    batch. Under ``dataloader_iter`` flavor, Lightning's on_train_batch_end
-    callback dispatch silently fails to invoke StatelessTimer (likely related
-    to the "unforeseen effects on callbacks" warning Lightning emits for this
-    flavor) — so the SLURM walltime preempt-save + graceful-exit path never
-    fires, and each chunk's mid-epoch progress is lost to SIGKILL. Calling
-    ``StatelessTimer._check_time_remaining`` directly here restores the same
-    behavior the ``(batch, batch_idx)`` flavor has out of the box.
+    Also checks shutdown conditions before pulling the next batch. Lightning
+    still calls timer and preemption callbacks in normal ``dataloader_iter``
+    runs, but checking here closes the deadline/preemption window before user
+    code advances a stateful iterator. If the time budget is already exhausted
+    or preemption was already signaled, the helper saves ``last.ckpt`` and
+    exits before another sample is consumed.
 
     Args:
         dataloader_iter: The iterator passed by Lightning into a
@@ -51,7 +49,7 @@ def read_batch(dataloader_iter: Iterator, model: pl.LightningModule) -> Tuple[An
         precision and moved to the model's device, ready for forward.
     """
     trainer = model.trainer
-    _force_fire_stateless_timer(trainer)
+    _check_shutdown_before_next_batch(trainer)
     batch, batch_idx, dataloader_idx = next(dataloader_iter)
     batch = trainer.precision_plugin.convert_input(batch)
     batch = model._on_before_batch_transfer(batch, dataloader_idx=dataloader_idx)
@@ -59,15 +57,84 @@ def read_batch(dataloader_iter: Iterator, model: pl.LightningModule) -> Tuple[An
     return batch, batch_idx
 
 
+def _check_shutdown_before_next_batch(trainer: pl.Trainer) -> None:
+    """Handle pending shutdown before advancing a stateful ``dataloader_iter``."""
+    _log_read_batch_shutdown_guards_once(trainer)
+    _force_fire_preemption_callback(trainer)
+    _save_and_exit_if_lightning_received_sigterm(trainer)
+    _force_fire_stateless_timer(trainer)
+
+
+def _log_read_batch_shutdown_guards_once(trainer: pl.Trainer) -> None:
+    """Log the active ``read_batch`` shutdown guards once per trainer."""
+    if getattr(trainer, "_nemo_read_batch_shutdown_guards_logged", False):
+        return
+    setattr(trainer, "_nemo_read_batch_shutdown_guards_logged", True)
+
+    try:
+        has_preemption = any(getattr(cb, "preemption_enabled", False) for cb in trainer.callbacks)
+        has_sigterm = hasattr(trainer, "received_sigterm")
+        has_timer = _has_stateless_timer(trainer)
+        from nemo.utils import logging
+
+        logging.info(
+            "read_batch shutdown guards active: "
+            f"stateless_timer={has_timer} preemption_callback={has_preemption} "
+            f"lightning_sigterm_state={has_sigterm}"
+        )
+    except Exception:
+        # This is observability only; never let it affect the training path.
+        return
+
+
+def _force_fire_preemption_callback(trainer: pl.Trainer) -> None:
+    """Save and exit if NeMo's preemption callback has observed SIGTERM.
+
+    ``PreemptionCallback.on_train_batch_end`` still handles the normal
+    post-batch case. This pre-fetch check covers the
+    ``training_step(dataloader_iter)`` path, where user code is responsible for
+    advancing the stateful iterator and could otherwise enter
+    ``next(dataloader_iter)`` after rank 0 already received the preemption signal.
+    """
+    for cb in trainer.callbacks:
+        if not getattr(cb, "preemption_enabled", False):
+            continue
+        if cb.interrupted:
+            from nemo.utils.exp_manager import _save_last_checkpoint_and_exit
+
+            _save_last_checkpoint_and_exit(
+                trainer,
+                "read_batch observed a pending preemption signal before consuming the next batch",
+            )
+
+
+def _save_and_exit_if_lightning_received_sigterm(trainer: pl.Trainer) -> None:
+    """Handle Lightning's own SIGTERM state before consuming a stateful batch."""
+    if not getattr(trainer, "received_sigterm", False):
+        return
+
+    from nemo.utils.exp_manager import _save_last_checkpoint_and_exit
+
+    _save_last_checkpoint_and_exit(
+        trainer,
+        "read_batch observed trainer.received_sigterm before consuming the next batch",
+    )
+
+
+def _has_stateless_timer(trainer: pl.Trainer) -> bool:
+    """Return whether trainer has NeMo's StatelessTimer callback."""
+    from nemo.utils.exp_manager import StatelessTimer
+
+    return any(isinstance(cb, StatelessTimer) for cb in trainer.callbacks)
+
+
 def _force_fire_stateless_timer(trainer: pl.Trainer) -> None:
     """Invoke ``StatelessTimer._check_time_remaining`` directly.
 
-    Workaround for Lightning's ``dataloader_iter`` step flavor: in that mode,
-    StatelessTimer's ``on_train_batch_end`` does not reliably fire, so neither
-    the time-elapsed check nor the preempt-save+exit path runs. Calling
-    ``_check_time_remaining`` from inside the user-owned training_step makes
-    the preempt behavior identical to what ``(batch, batch_idx)`` flavor
-    gets via Lightning's standard callback flow.
+    Defensive deadline check for Lightning's ``dataloader_iter`` step flavor.
+    The standard callback path checks the timer after a batch. This pre-fetch
+    check prevents a resumed stateful iterator from being advanced when the
+    deadline has already expired before the next batch is requested.
 
     Idempotent on the time-not-yet-up case (cheap: one ``time_elapsed()``
     check + one comparison). On the time-up case, ``StatelessTimer`` saves a
diff --git a/nemo/utils/callbacks/preemption.py b/nemo/utils/callbacks/preemption.py
index be4712a60241..e0a723d28bf6 100644
--- a/nemo/utils/callbacks/preemption.py
+++ b/nemo/utils/callbacks/preemption.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import signal
-import sys
 
 import torch
 from lightning.pytorch.callbacks import Callback
@@ -66,6 +65,7 @@ def on_train_start(self, trainer, pl_module):
 
             # Master handler on rank 0 only upon preemption signal to avoid deadlock conditions
             def master_handler(signum, frame):
+                logging.info("Received preemption signal on rank 0; checkpoint save will run after current batch")
                 self.release()
                 self._interrupted = True
 
@@ -76,6 +76,9 @@ def ignoring_handler(signum, frame):
             self.private_rank = torch.distributed.get_rank()
             if self.private_rank == 0:
                 signal.signal(self.sig, master_handler)
+                logging.info(
+                    f"PreemptionCallback enabled on rank 0 for signal {getattr(self.sig, 'name', self.sig)}"
+                )
             else:
                 signal.signal(self.sig, ignoring_handler)
 
@@ -96,17 +99,12 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int)
             # a regular local variable
             interrupted = self.interrupted
             if interrupted:
-                logging.info("Received SIGTERM, saving checkpoint and exiting")
-                # Same off-by-one as in StatelessTimer: on_train_batch_end fires before
-                # batch_progress.increment_completed(), but the batch's optim step has
-                # already advanced global_step. Flush the in-flight batch so resume
-                # doesn't replay it and double-count the optim step.
-                from nemo.utils.exp_manager import _flush_in_flight_batch_progress
-
-                _flush_in_flight_batch_progress(trainer)
-                monitor_candidates = self.checkpoint_callback._monitor_candidates(trainer)
-                self.checkpoint_callback._save_last_checkpoint(trainer, monitor_candidates)
-                sys.exit(0)
+                from nemo.utils.exp_manager import _save_last_checkpoint_and_exit
+
+                _save_last_checkpoint_and_exit(
+                    trainer,
+                    "PreemptionCallback observed SIGTERM at train batch end",
+                )
 
     def release(self):
         """Restore the original signal handler; returns False if already released, True otherwise."""
diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py
index ad7757b72ba5..0c23dca9fe7c 100644
--- a/nemo/utils/exp_manager.py
+++ b/nemo/utils/exp_manager.py
@@ -1452,6 +1452,13 @@ def _check_time_remaining(self, trainer: lightning.pytorch.Trainer) -> None:
         """_check_time_remaining"""
         super()._check_time_remaining(trainer)
         if trainer.should_stop:
+            before_flush = _describe_batch_progress(trainer)
+            logging.info(
+                "StatelessTimer deadline reached; saving last checkpoint "
+                f"global_step={getattr(trainer, 'global_step', None)} "
+                f"current_epoch={getattr(trainer, 'current_epoch', None)} "
+                f"batch_progress_before_flush={before_flush}"
+            )
             # PTL's TrainingEpochLoop.advance() calls the on_train_batch_end hooks (which is where
             # Timer._check_time_remaining fires) BEFORE batch_progress.increment_completed(). The
             # current batch's optim step has already advanced global_step, so saving here would
@@ -1461,16 +1468,46 @@ def _check_time_remaining(self, trainer: lightning.pytorch.Trainer) -> None:
             # global_step per wall-time resume. Flush the in-flight batch first to keep the
             # saved state self-consistent.
             _flush_in_flight_batch_progress(trainer)
+            after_flush = _describe_batch_progress(trainer)
             checkpoint_callback: Optional[NeMoModelCheckpoint] = trainer.checkpoint_callback
             if checkpoint_callback:
+                save_started = time.monotonic()
                 monitor_candidates = checkpoint_callback._monitor_candidates(trainer)
                 checkpoint_callback._save_last_checkpoint(trainer, monitor_candidates)
+                logging.info(
+                    "StatelessTimer last checkpoint save finished "
+                    f"global_step={getattr(trainer, 'global_step', None)} "
+                    f"current_epoch={getattr(trainer, 'current_epoch', None)} "
+                    f"batch_progress_after_flush={after_flush} "
+                    f"last_model_path={getattr(checkpoint_callback, 'last_model_path', None)} "
+                    f"save_duration_sec={time.monotonic() - save_started:.3f}"
+                )
+            else:
+                logging.warning("StatelessTimer deadline reached but trainer.checkpoint_callback is not configured")
             # Throw this exception to signal to Lightning to terminate gracefully.
             from lightning.pytorch.utilities.exceptions import _TunerExitException
 
             raise _TunerExitException()
 
 
+def _describe_batch_progress(trainer: lightning.pytorch.Trainer) -> Dict[str, Any]:
+    """Return a compact, log-friendly snapshot of Lightning's train batch progress."""
+    try:
+        batch_progress = trainer.fit_loop.epoch_loop.batch_progress
+    except AttributeError:
+        return {}
+
+    return {
+        "current_ready": getattr(batch_progress.current, "ready", None),
+        "current_processed": getattr(batch_progress.current, "processed", None),
+        "current_completed": getattr(batch_progress.current, "completed", None),
+        "total_ready": getattr(batch_progress.total, "ready", None),
+        "total_processed": getattr(batch_progress.total, "processed", None),
+        "total_completed": getattr(batch_progress.total, "completed", None),
+        "is_last_batch": getattr(batch_progress, "is_last_batch", None),
+    }
+
+
 def _flush_in_flight_batch_progress(trainer: lightning.pytorch.Trainer) -> None:
     """Bring batch_progress.current.completed up to .ready if a batch is in flight.
 
@@ -1487,6 +1524,44 @@ def _flush_in_flight_batch_progress(trainer: lightning.pytorch.Trainer) -> None:
         batch_progress.increment_completed()
 
 
+def _save_last_checkpoint_and_exit(trainer: lightning.pytorch.Trainer, reason: str) -> None:
+    """Save the last checkpoint for graceful shutdown and exit Lightning.
+
+    ``reason`` should describe the caller-visible shutdown trigger (it is only used in log messages). The
+    checkpoint policy itself is unchanged: the configured ``NeMoModelCheckpoint`` is merely asked to update its
+    existing ``*-last.ckpt`` target. Always raises ``_TunerExitException``, even if no checkpoint callback is set.
+    """
+    before_flush = _describe_batch_progress(trainer)
+    logging.info(
+        f"{reason}; saving last checkpoint "
+        f"global_step={getattr(trainer, 'global_step', None)} "
+        f"current_epoch={getattr(trainer, 'current_epoch', None)} "
+        f"batch_progress_before_flush={before_flush}"
+    )
+    _flush_in_flight_batch_progress(trainer)
+    after_flush = _describe_batch_progress(trainer)
+
+    checkpoint_callback: Optional[NeMoModelCheckpoint] = getattr(trainer, "checkpoint_callback", None)
+    if checkpoint_callback:
+        save_started = time.monotonic()
+        monitor_candidates = checkpoint_callback._monitor_candidates(trainer)
+        checkpoint_callback._save_last_checkpoint(trainer, monitor_candidates)
+        logging.info(
+            "Graceful shutdown last checkpoint save finished "
+            f"global_step={getattr(trainer, 'global_step', None)} "
+            f"current_epoch={getattr(trainer, 'current_epoch', None)} "
+            f"batch_progress_after_flush={after_flush} "
+            f"last_model_path={getattr(checkpoint_callback, 'last_model_path', None)} "
+            f"save_duration_sec={time.monotonic() - save_started:.3f}"
+        )
+    else:
+        logging.warning(f"{reason}; trainer.checkpoint_callback is not configured")
+
+    from lightning.pytorch.utilities.exceptions import _TunerExitException
+
+    raise _TunerExitException()
+
+
 def configure_no_restart_validation_training_loop(trainer: lightning.pytorch.Trainer) -> None:
     """configure_no_restart_validation_training_loop"""
     if type(trainer.fit_loop.epoch_loop) != _TrainingEpochLoop:
@@ -1504,8 +1579,31 @@ class SkipResumeTrainingValidationLoop(_TrainingEpochLoop):
     the training state before validation has run.
     """
 
+    def __init__(self, *args, **kwargs) -> None:
+        """Initialize skip-validation bookkeeping."""
+        super().__init__(*args, **kwargs)
+        self._skip_resume_validation_once = False
+
+    def advance(self, data_fetcher) -> None:
+        """Skip restart validation without replaying an already-completed train batch."""
+        if self.restarting and super()._should_check_val_fx(data_fetcher):
+            logging.info("Skipping restart validation without replaying a completed training batch")
+            self._skip_resume_validation_once = True
+            self.restarting = False
+            return
+        return super().advance(data_fetcher)
+
+    def on_advance_end(self, data_fetcher) -> None:
+        """Clear the one-shot restart-validation skip after normal epoch-loop bookkeeping."""
+        try:
+            return super().on_advance_end(data_fetcher)
+        finally:
+            self._skip_resume_validation_once = False
+
     def _should_check_val_fx(self, data_fetcher) -> bool:
         """_should_check_val_fx"""
+        if self._skip_resume_validation_once:
+            return False
         if self.restarting:
             return False
         return super()._should_check_val_fx(data_fetcher)
diff --git a/tests/core_ptl/test_resumable_dataloader_iter.py b/tests/core_ptl/test_resumable_dataloader_iter.py
new file mode 100644
index 000000000000..62f96fa77dab
--- /dev/null
+++ b/tests/core_ptl/test_resumable_dataloader_iter.py
@@ -0,0 +1,303 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Regression tests for ``training_step(dataloader_iter)`` resumability."""
+
+from __future__ import annotations
+
+import time
+from datetime import timedelta
+from pathlib import Path
+from types import SimpleNamespace
+from typing import Type
+
+import lightning.pytorch as pl
+import pytest
+import torch
+import torch.nn.functional as F
+from lightning.pytorch.callbacks import ModelCheckpoint
+from lightning.pytorch.utilities.exceptions import _TunerExitException
+from torchdata.stateful_dataloader import StatefulDataLoader
+
+from nemo.core.utils.lightning_utils import read_batch
+from nemo.utils.exp_manager import StatelessTimer, configure_no_restart_validation_training_loop
+
+
+class _RangeDataset(torch.utils.data.Dataset):
+    """Small deterministic dataset whose sample id is also its stream position."""
+
+    def __init__(self, size: int = 1000) -> None:
+        self.size = size
+
+    def __len__(self) -> int:
+        return self.size
+
+    def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
+        x = torch.tensor([float(index % 7)], dtype=torch.float32)
+        y = torch.tensor([float((index % 7) * 0.5)], dtype=torch.float32)
+        return {
+            "sample_id": torch.tensor(index, dtype=torch.long),
+            "x": x,
+            "y": y,
+        }
+
+
+class _BaseParityModel(pl.LightningModule):
+    def __init__(self, seen: list[dict[str, int]], sleep_sec: float = 0.0) -> None:
+        super().__init__()
+        self.seen = seen
+        self.sleep_sec = sleep_sec
+        self.proj = torch.nn.Linear(1, 1)
+        torch.nn.init.constant_(self.proj.weight, 0.25)
+        torch.nn.init.constant_(self.proj.bias, 0.0)
+
+    def train_dataloader(self):
+        return StatefulDataLoader(_RangeDataset(), batch_size=1, num_workers=0)
+
+    def val_dataloader(self):
+        return torch.utils.data.DataLoader(_RangeDataset(size=2), batch_size=1, num_workers=0)
+
+    def configure_optimizers(self):
+        return torch.optim.SGD(self.parameters(), lr=0.01)
+
+    def validation_step(self, batch, batch_idx):
+        loss = self._loss(batch)
+        self.log("val_loss", loss)
+        return loss
+
+    def _step(self, batch: dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
+        if self.sleep_sec:
+            time.sleep(self.sleep_sec)
+        self.seen.append(
+            {
+                "sample_id": int(batch["sample_id"].item()),
+                "epoch": int(self.current_epoch),
+                "batch_idx": int(batch_idx),
+                "global_step": int(self.global_step),
+            }
+        )
+        return self._loss(batch)
+
+    def _loss(self, batch: dict[str, torch.Tensor]) -> torch.Tensor:
+        return F.mse_loss(self.proj(batch["x"].float()), batch["y"].float())
+
+
+class _BatchStepModel(_BaseParityModel):
+    def training_step(self, batch, batch_idx):
+        return self._step(batch, batch_idx)
+
+
+class _DataloaderIterStepModel(_BaseParityModel):
+    def training_step(self, dataloader_iter):
+        batch, batch_idx = read_batch(dataloader_iter, self)
+        return self._step(batch, batch_idx)
+
+
+class _StopBeforeNextBatchTimer(StatelessTimer):
+    def _check_time_remaining(self, trainer: pl.Trainer) -> None:
+        raise _TunerExitException()
+
+
+class _CountingIterator:
+    consumed = False
+
+    def __next__(self):
+        self.consumed = True
+        raise AssertionError("read_batch consumed a sample after the timer requested stop")
+
+
+class _PreemptedCallback:
+    preemption_enabled = True
+    interrupted = True
+
+
+def _make_checkpoint_callback(root: Path) -> ModelCheckpoint:
+    return ModelCheckpoint(
+        dirpath=str(root / "checkpoints"),
+        filename="{step}",
+        save_last=True,
+        save_top_k=-1,
+        every_n_epochs=1,
+    )
+
+
+def _make_trainer(root: Path, max_steps: int, callbacks: list | None = None) -> pl.Trainer:
+    trainer = pl.Trainer(
+        accelerator="cpu",
+        devices=1,
+        default_root_dir=str(root),
+        callbacks=callbacks or [],
+        max_steps=max_steps,
+        max_epochs=10,
+        limit_train_batches=5,
+        val_check_interval=5,
+        num_sanity_val_steps=0,
+        logger=False,
+        enable_checkpointing=bool(callbacks),
+        enable_model_summary=False,
+        enable_progress_bar=False,
+    )
+    configure_no_restart_validation_training_loop(trainer)
+    return trainer
+
+
+def _fit(
+    root: Path,
+    model_cls: Type[_BaseParityModel],
+    max_steps: int,
+    ckpt_path: str | None = None,
+    callbacks: list | None = None,
+    sleep_sec: float = 0.0,
+) -> tuple[list[dict[str, int]], pl.Trainer, _BaseParityModel]:
+    seen: list[dict[str, int]] = []
+    model = model_cls(seen=seen, sleep_sec=sleep_sec)
+    trainer = _make_trainer(root, max_steps=max_steps, callbacks=callbacks)
+    trainer.fit(model, ckpt_path=ckpt_path)
+    return seen, trainer, model
+
+
+def _fit_two_phase(root: Path, model_cls: Type[_BaseParityModel]) -> tuple[list[dict[str, int]], list[dict[str, int]]]:
+    first_callback = _make_checkpoint_callback(root / "first")
+    first, _, _ = _fit(root / "first", model_cls, max_steps=5, callbacks=[first_callback])
+    assert first_callback.last_model_path
+
+    second_callback = _make_checkpoint_callback(root / "second")
+    second, _, _ = _fit(
+        root / "second",
+        model_cls,
+        max_steps=10,
+        ckpt_path=first_callback.last_model_path,
+        callbacks=[second_callback],
+    )
+    return first, second
+
+
+def _project(records: list[dict[str, int]], key: str) -> list[int]:
+    return [record[key] for record in records]
+
+
+@pytest.mark.unit
+def test_uninterrupted_dataloader_iter_matches_batch_step(tmp_path):
+    batch_seen, _, _ = _fit(tmp_path / "batch", _BatchStepModel, max_steps=10)
+    iter_seen, _, _ = _fit(tmp_path / "iter", _DataloaderIterStepModel, max_steps=10)
+
+    expected = list(range(5)) + list(range(5))
+    assert _project(iter_seen, "sample_id") == _project(batch_seen, "sample_id") == expected
+    assert _project(iter_seen, "global_step") == _project(batch_seen, "global_step")
+    assert _project(iter_seen, "epoch") == _project(batch_seen, "epoch")
+
+
+@pytest.mark.unit
+def test_interrupted_resume_dataloader_iter_matches_batch_step(tmp_path):
+    batch_first, batch_second = _fit_two_phase(tmp_path / "batch", _BatchStepModel)
+    iter_first, iter_second = _fit_two_phase(tmp_path / "iter", _DataloaderIterStepModel)
+
+    assert _project(iter_first, "sample_id") == _project(batch_first, "sample_id") == list(range(5))
+    assert _project(iter_second, "sample_id") == _project(batch_second, "sample_id") == list(range(5, 10))
+    assert _project(iter_second, "global_step") == _project(batch_second, "global_step") == list(range(5, 10))
+    assert _project(iter_second, "epoch") == _project(batch_second, "epoch")
+
+
+@pytest.mark.unit
+def test_resume_boundary_does_not_replay_old_epoch_batch(tmp_path):
+    first_callback = _make_checkpoint_callback(tmp_path / "first")
+    first, _, _ = _fit(tmp_path / "first", _DataloaderIterStepModel, max_steps=5, callbacks=[first_callback])
+    assert _project(first, "sample_id") == list(range(5))
+
+    second_callback = _make_checkpoint_callback(tmp_path / "second")
+    second, _, _ = _fit(
+        tmp_path / "second",
+        _DataloaderIterStepModel,
+        max_steps=10,
+        ckpt_path=first_callback.last_model_path,
+        callbacks=[second_callback],
+    )
+
+    assert _project(second, "sample_id") == list(range(5, 10))
+    assert second[0] == {
+        "sample_id": 5,
+        "epoch": 1,
+        "batch_idx": 0,
+        "global_step": 5,
+    }
+
+
+@pytest.mark.unit
+def test_read_batch_checks_timer_before_consuming_next_sample():
+    iterator = _CountingIterator()
+    model = SimpleNamespace(trainer=SimpleNamespace(callbacks=[_StopBeforeNextBatchTimer(timedelta(seconds=1))]))
+
+    with pytest.raises(_TunerExitException):
+        read_batch(iterator, model)
+
+    assert not iterator.consumed
+
+
+@pytest.mark.unit
+def test_read_batch_checks_preemption_before_consuming_next_sample():
+    iterator = _CountingIterator()
+    trainer = SimpleNamespace(callbacks=[_PreemptedCallback()], checkpoint_callback=None)
+    model = SimpleNamespace(trainer=trainer)
+
+    with pytest.raises(_TunerExitException):
+        read_batch(iterator, model)
+
+    assert not iterator.consumed
+
+
+@pytest.mark.unit
+def test_read_batch_checks_lightning_sigterm_before_consuming_next_sample():
+    iterator = _CountingIterator()
+    trainer = SimpleNamespace(callbacks=[], checkpoint_callback=None, received_sigterm=True)
+    model = SimpleNamespace(trainer=trainer)
+
+    with pytest.raises(_TunerExitException):
+        read_batch(iterator, model)
+
+    assert not iterator.consumed
+
+
+@pytest.mark.unit
+def test_timer_checkpoint_resume_has_consistent_progress_and_no_sample_drift(tmp_path):
+    checkpoint_callback = _make_checkpoint_callback(tmp_path / "timer")
+    callbacks = [checkpoint_callback, StatelessTimer(duration=timedelta(seconds=0.15))]
+    first, _, _ = _fit(
+        tmp_path / "timer",
+        _DataloaderIterStepModel,
+        max_steps=50,
+        callbacks=callbacks,
+        sleep_sec=0.05,
+    )
+
+    assert 0 < len(first) < 50
+    assert checkpoint_callback.last_model_path
+    ckpt = torch.load(checkpoint_callback.last_model_path, map_location="cpu", weights_only=False)
+    batch_progress = ckpt["loops"]["fit_loop"]["epoch_loop.batch_progress"]
+    saved_step = int(ckpt["global_step"])
+
+    assert saved_step == len(first)
+    assert batch_progress["total"]["completed"] == saved_step
+    assert batch_progress["total"]["ready"] == saved_step
+    assert batch_progress["current"]["completed"] == batch_progress["current"]["ready"]
+
+    resumed_callback = _make_checkpoint_callback(tmp_path / "timer-resume")
+    resumed, _, _ = _fit(
+        tmp_path / "timer-resume",
+        _DataloaderIterStepModel,
+        max_steps=saved_step + 3,
+        ckpt_path=checkpoint_callback.last_model_path,
+        callbacks=[resumed_callback],
+    )
+
+    assert _project(first, "sample_id") == [idx % 5 for idx in range(saved_step)]
+    assert _project(resumed, "sample_id") == [(saved_step + idx) % 5 for idx in range(3)]

From 993c246900f3cbf7b9c5f98583d089811ac9986a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Tue, 9 Jun 2026 11:47:54 -0700
Subject: [PATCH 15/30] Support text-only data loading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 nemo/collections/common/data/lhotse/cutset.py |  18 ++
 .../common/data/lhotse/nemo_adapters.py       |  26 ++-
 .../common/data/lhotse/text_adapters.py       | 217 ++++++++++++++++++
 .../common/prompts/nemotron_nano_v3.py        |   6 +
 .../speechlm2/models/salm_automodel.py        |  28 ++-
 .../collections/speechlm2/parts/cp_helpers.py |  89 ++++++-
 .../speechlm2/parts/encoder_chunking.py       |   5 +-
 scripts/dataloading/build_indexes.py          |  20 +-
 .../speechlm2/test_salm_cp_helpers.py         | 108 ++++++++-
 9 files changed, 492 insertions(+), 25 deletions(-)

diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py
index 50a3d962caac..16b4be0020f1 100644
--- a/nemo/collections/common/data/lhotse/cutset.py
+++ b/nemo/collections/common/data/lhotse/cutset.py
@@ -51,6 +51,7 @@
     NeMoMultimodalConversationShareGPTJsonlAdapter,
     NeMoMultimodalConversationShareGPTWebdatasetAdapter,
     NeMoSFTJsonlAdapter,
+    NemotronTextConversationAdapter,
     TextTurn,
 )
 from nemo.collections.common.parts.preprocessing.manifest import get_full_path
@@ -397,6 +398,23 @@ def read_nemo_sft_jsonl(config: DictConfig) -> tuple[CutSet, bool]:
     return cuts, True
 
 
+@data_type_parser("nemotron_text_converation")
+def read_nemotron_text_converation(config: DictConfig) -> tuple[CutSet, bool]:
+    """Read Nemotron/Energon text-only conversation JSONL files or tar directories."""
+    cuts = CutSet(
+        NemotronTextConversationAdapter(
+            paths=config.paths,
+            shuffle_shards=config.shuffle,
+            shard_seed=config.shard_seed,
+            indexed=config.get("indexed", False),
+            indexes_root=config.get("indexes_root", None),
+        )
+    )
+    if not config.get("force_finite", False):
+        cuts = cuts.repeat(preserve_id=True)
+    return cuts, True
+
+
 @data_type_parser("multimodal_conversation")
 def read_multimodal_conversation_jsonl(config: DictConfig) -> tuple[CutSet, bool]:
     """Read paths to multimodal conversation JSONL files and create a CutSet."""
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index bb99084c9faa..b6d09c4fdc14 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -401,6 +401,7 @@ def __init__(
         indexes_root: str | Path | None = None,
     ) -> None:
         self.skip_missing_manifest_entries = skip_missing_manifest_entries
+        self._malformed_manifest_warning_keys: set[tuple[str, int]] = set()
         self.indexed = indexed
         self.indexes_root = indexes_root
         self.shard_id_to_manifest: dict[int, Iterable[dict]]
@@ -787,13 +788,28 @@ def _build_indexed_url_cut(self, data: dict, manifest_path: str, tar_path: str)
     def _decode_cut_at(self, idx: int) -> Cut | None:
         """Build the Cut for a global index in indexed mode (AIS or local).
 
-        Returns ``None`` if the audio member is missing and
-        ``skip_missing_manifest_entries`` is set, or if the entry has
-        ``_skipme=True`` / undecodable audio.
+        Returns ``None`` if the manifest entry/audio member is missing or
+        malformed and ``skip_missing_manifest_entries`` is set, or if the
+        entry has ``_skipme=True`` / undecodable audio.
         """
         sid, local_idx = self._resolve_global_idx(idx)
-        data = self._cuts_readers[sid][local_idx]
-        manifest_path = self._cuts_readers[sid].path
+        cuts_reader = self._cuts_readers[sid]
+        manifest_path = cuts_reader.path
+        try:
+            data = cuts_reader[local_idx]
+        except (json.JSONDecodeError, UnicodeDecodeError):
+            if self.skip_missing_manifest_entries:
+                warning_key = (str(manifest_path), sid)
+                if warning_key not in self._malformed_manifest_warning_keys:
+                    self._malformed_manifest_warning_keys.add(warning_key)
+                    logging.warning(
+                        "Skipping malformed manifest entries in indexed Lhotse dataloader: "
+                        f"{manifest_path=} {sid=} first_local_idx={local_idx} first_global_idx={idx}. "
+                        "Further malformed entries for this manifest/shard will be skipped without additional "
+                        "warnings."
+                    )
+                return None
+            raise
         tar_path = self.shard_id_to_tar_path[sid]
         if self.use_ais_get_batch:
             return self._build_indexed_url_cut(data, manifest_path, tar_path)
diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py
index 8a168d264687..d8f96bea8b0f 100644
--- a/nemo/collections/common/data/lhotse/text_adapters.py
+++ b/nemo/collections/common/data/lhotse/text_adapters.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
+import json
 import math
 import os
 import random
@@ -38,6 +39,7 @@
 from lhotse.indexing import IndexedJsonlReader
 
 from nemo.collections.common.data.lhotse.indexed_adapters import (
+    IndexedTarMemberReader,
     IndexedTarSampleReader,
     _split_json_audio_pair,
 )
@@ -502,6 +504,221 @@ def _iter_streaming(self) -> Iterator[NeMoSFTExample]:
                 yield NeMoSFTExample(data, language=self.language)
 
 
+def _normalize_nemotron_text_sender(sender: str, sample_id: str) -> str:
+    role = str(sender).lower()
+    if role in ("user", "human"):
+        return "user"
+    if role in ("assistant", "gpt", "model", "bot"):
+        return "assistant"
+    if role == "system":
+        return "system"
+    if role == "tool":
+        return "tool"
+    raise ValueError(f"Unsupported sender={sender!r} in Nemotron text conversation sample id={sample_id}")
+
+
+def _flatten_nemotron_text_fragments(fragments: list, sample_id: str) -> str:
+    values = []
+    for fragment in fragments:
+        if isinstance(fragment, str):
+            values.append(fragment)
+            continue
+        if not isinstance(fragment, dict):
+            raise ValueError(
+                f"Unsupported fragment type={type(fragment).__name__} in Nemotron text conversation sample id={sample_id}"
+            )
+        fragment_type = fragment.get("t")
+        if fragment_type not in (None, "text"):
+            raise ValueError(
+                f"Unsupported fragment t={fragment_type!r} in Nemotron text conversation sample id={sample_id}"
+            )
+        values.append(str(fragment.get("value", "")))
+    return "".join(values)
+
+
+def _transform_nemotron_text_conversation(data: dict, sample_id: str) -> "NeMoMultimodalConversation":
+    conversation = data.get("conversation")
+    if not isinstance(conversation, list):
+        raise ValueError(f"Nemotron text conversation sample id={sample_id} has no list-valued 'conversation' field")
+
+    turns = []
+    for turn in conversation:
+        if not isinstance(turn, dict):
+            raise ValueError(
+                f"Unsupported turn type={type(turn).__name__} in Nemotron text conversation sample id={sample_id}"
+            )
+        role = _normalize_nemotron_text_sender(turn.get("sender"), sample_id)
+        value = _flatten_nemotron_text_fragments(turn.get("fragments", []), sample_id)
+        turns.append(TextTurn(value=value, role=role))
+    return NeMoMultimodalConversation(
+        id=str(data.get("id") or sample_id),
+        turns=turns,
+        custom=data.get("custom"),
+    )
+
+
+@dataclass
+class NemotronTextConversationAdapter(IteratorNode):
+    """
+    Read Nemotron/Energon text-only conversation data.
+
+    Supported inputs are JSONL files and materialized tar directories whose JSON
+    rows contain ``conversation`` turns with ``sender`` and ``fragments`` fields.
+    """
+
+    paths: Union[Pathlike, list[Pathlike]]
+    shuffle_shards: bool = False
+    shard_seed: Union[int, Literal["trng", "randomized"]] = "trng"
+    indexed: bool = False
+    indexes_root: Optional[Pathlike] = None
+
+    def __post_init__(self):
+        paths = [self.paths] if isinstance(self.paths, (str, Path)) else list(self.paths)
+        self.paths = [str(p) for raw in paths for p in expand_sharded_filepaths(str(raw))]
+        self._readers: list = []
+        self._reader_kinds: list[str] = []
+        self._source_paths: list[str] = []
+        self._cum_lens: list[int] = []
+        self._iter_state = PartitionedIndexedIterator()
+        if self.indexed:
+            self._init_indexed()
+
+    @property
+    def is_checkpointable(self) -> bool:
+        return self.indexed
+
+    @property
+    def is_indexed(self) -> bool:
+        return self.indexed
+
+    @property
+    def has_constant_time_access(self) -> bool:
+        return self.indexed
+
+    def _init_indexed(self) -> None:
+        from lhotse.indexing import index_file_path
+
+        for p in self.paths:
+            path = Path(p)
+            if path.is_dir():
+                tar_paths = sorted(path.rglob("*.tar"))
+                if not tar_paths:
+                    raise FileNotFoundError(f"No .tar files found under Nemotron text conversation directory: {path}")
+                for tar_path in tar_paths:
+                    self._add_indexed_tar_reader(str(tar_path), index_file_path(str(tar_path), self.indexes_root))
+            elif path.suffix == ".tar":
+                self._add_indexed_tar_reader(p, index_file_path(p, self.indexes_root))
+            else:
+                self._readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
+                self._reader_kinds.append("jsonl")
+                self._source_paths.append(p)
+        cum = 0
+        self._cum_lens.append(cum)
+        for reader in self._readers:
+            cum += len(reader)
+            self._cum_lens.append(cum)
+
+    def _add_indexed_tar_reader(self, tar_path: str, idx_path: Pathlike) -> None:
+        self._readers.append(IndexedTarMemberReader(tar_path, idx_path=idx_path))
+        self._reader_kinds.append("tar")
+        self._source_paths.append(tar_path)
+
+    def __len__(self) -> int:
+        if not self.indexed:
+            raise TypeError("NemotronTextConversationAdapter has unknown length unless constructed with indexed=True.")
+        return self._cum_lens[-1] if self._cum_lens else 0
+
+    def _resolve(self, idx: int) -> tuple[int, int]:
+        """Map a global index (negatives wrap once) to ``(shard_idx, local_idx)``; raise IndexError otherwise."""
+        pos = idx + self._cum_lens[-1] if idx < 0 else idx
+        for shard_idx in range(len(self._readers)):
+            if 0 <= pos < self._cum_lens[shard_idx + 1]:
+                return shard_idx, pos - self._cum_lens[shard_idx]
+        raise IndexError(idx)
+
+    def _data_to_conversation(
+        self, data: dict, source_path: Union[str, Path], local_idx: int
+    ) -> "NeMoMultimodalConversation":
+        sample_id = f"{Path(source_path).stem}-{local_idx:012d}"
+        return _transform_nemotron_text_conversation(data, sample_id)
+
+    def _reader_item_to_conversation(self, shard_idx: int, local_idx: int) -> "NeMoMultimodalConversation":
+        item = self._readers[shard_idx][local_idx]
+        source_path = self._source_paths[shard_idx]
+        if self._reader_kinds[shard_idx] == "tar":
+            name, payload = item
+            if not name.endswith(".json"):
+                raise RuntimeError(
+                    f"Index {local_idx} in {source_path} points to non-JSON tar member {name!r}; "
+                    "Nemotron text conversation tar shards are expected to contain JSON samples."
+                )
+            return _transform_nemotron_text_conversation(json.loads(payload), Path(name).stem)
+        return self._data_to_conversation(item, source_path, local_idx)
+
+    def __getitem__(self, token):
+        if not self.indexed:
+            raise NotImplementedError("NemotronTextConversationAdapter only supports __getitem__ when indexed=True.")
+        idx = int(normalize_graph_token(token))
+        shard_idx, local_idx = self._resolve(idx)
+        conversation = self._reader_item_to_conversation(shard_idx, local_idx)
+        return attach_graph_origin(conversation, idx)
+
+    def state_dict(self) -> dict:
+        return self._iter_state.state_dict() if self.indexed else {}
+
+    def load_state_dict(self, sd: dict) -> None:
+        if not self.indexed:
+            return
+        self._iter_state.load_state_dict(sd)
+
+    def __iter__(self) -> Iterator["NeMoMultimodalConversation"]:
+        if self.indexed:
+            yield from self._iter_indexed()
+            return
+        yield from self._iter_streaming()
+
+    def _iter_indexed(self) -> Iterator["NeMoMultimodalConversation"]:
+        total = self._cum_lens[-1] if self._cum_lens else 0
+        for global_idx in self._iter_state.iterate(total):
+            shard_idx, local_idx = self._resolve(global_idx)
+            conversation = self._reader_item_to_conversation(shard_idx, local_idx)
+            attach_graph_origin(conversation, global_idx)
+            yield conversation
+
+    def _iter_streaming(self) -> Iterator["NeMoMultimodalConversation"]:
+        paths = list(self.paths)
+        if self.shuffle_shards:
+            random.Random(resolve_seed(self.shard_seed)).shuffle(paths)
+        for path in paths:
+            yield from self._iter_path(Path(path))
+
+    def _iter_path(self, path: Path) -> Iterator["NeMoMultimodalConversation"]:
+        if path.is_dir():
+            tar_paths = sorted(path.rglob("*.tar"))
+            if not tar_paths:
+                raise FileNotFoundError(f"No .tar files found under Nemotron text conversation directory: {path}")
+            for tar_path in tar_paths:
+                yield from self._iter_tar(tar_path)
+        elif path.suffix == ".tar":
+            yield from self._iter_tar(path)
+        else:
+            yield from self._iter_jsonl(path)
+
+    def _iter_jsonl(self, path: Path) -> Iterator["NeMoMultimodalConversation"]:
+        """Stream conversations from one JSONL file, building sample ids the same way as indexed mode."""
+        for idx, data in enumerate(load_jsonl(path)):
+            yield self._data_to_conversation(data, path, idx)
+
+    def _iter_tar(self, path: Path) -> Iterator["NeMoMultimodalConversation"]:
+        with tarfile.open(path, "r:*") as tar:
+            for info in tar:
+                if not info.isfile() or not info.name.endswith(".json"):
+                    continue
+                data = json.load(tar.extractfile(info))
+                sample_id = Path(info.name).stem
+                yield _transform_nemotron_text_conversation(data, sample_id)
+
+
 """
 NeMoMultimodalConversation: data types, file parser, default prompt formatting logic.
 """
diff --git a/nemo/collections/common/prompts/nemotron_nano_v3.py b/nemo/collections/common/prompts/nemotron_nano_v3.py
index 4840702e92ff..1162ba3dee4a 100644
--- a/nemo/collections/common/prompts/nemotron_nano_v3.py
+++ b/nemo/collections/common/prompts/nemotron_nano_v3.py
@@ -40,6 +40,12 @@ class NemotronNanoV3PromptFormatter(PromptFormatter):
                 "message": Modality.Text,
             },
         },
+        "tool": {
+            "template": f"{NANO_BOT}tool\n|message|{NANO_EOT}\n",
+            "slots": {
+                "message": Modality.Text,
+            },
+        },
         OUTPUT_ROLE: {
             "template": f"{NANO_BOT}assistant\n|message|{NANO_EOT}\n",
             "slots": {
diff --git a/nemo/collections/speechlm2/models/salm_automodel.py b/nemo/collections/speechlm2/models/salm_automodel.py
index 8add80740110..94080a2a5ebd 100644
--- a/nemo/collections/speechlm2/models/salm_automodel.py
+++ b/nemo/collections/speechlm2/models/salm_automodel.py
@@ -196,19 +196,27 @@ def prepare_inputs(self, batch: dict):
         * Take care of any necessary slicing to align the shapes of source audio,
             target audio, and target token ids.
         """
-        from nemo.collections.speechlm2.parts.cp_helpers import encode_audio_with_cp_distribution, get_cp_mesh
+        from nemo.collections.speechlm2.parts.cp_helpers import (
+            encode_audio_with_cp_distribution,
+            get_cp_mesh,
+            get_perception_fsdp_group,
+        )
 
-        cp_mesh, _, _ = get_cp_mesh(getattr(self, "_device_mesh", None))
+        device_mesh = getattr(self, "_device_mesh", None)
+        cp_mesh, _, _ = get_cp_mesh(device_mesh)
+        fsdp_sync_group = get_perception_fsdp_group(device_mesh)
 
         # Source audio encoding (distributed across CP ranks when CP is active).
         # Input audio: (B_aud, T_samples) → list of (L_i, H) embeddings.
-        audio_embs = encode_audio_with_cp_distribution(
+        audio_embs, dummy_audio_loss = encode_audio_with_cp_distribution(
             self.perception,
             batch["audios"],
             batch["audio_lens"],
             chunk_size_seconds=self.cfg.get("encoder_chunk_size_seconds", None),
             sampling_rate=self.sampling_rate,
             cp_mesh=cp_mesh,
+            fsdp_sync_group=fsdp_sync_group,
+            return_dummy_loss=True,
         )
         input_ids_to_embed = torch.where(batch["input_ids"] == self.audio_locator_tag_id, 0, batch["input_ids"])
         text_embs = self._embed_tokens(input_ids_to_embed)
@@ -219,15 +227,18 @@ def prepare_inputs(self, batch: dict):
         if self.cfg.get("packed_sequences", False):
             from nemo.collections.speechlm2.parts.packed_sequences import prepare_packed_llm_inputs
 
-            return prepare_packed_llm_inputs(
+            ans = prepare_packed_llm_inputs(
                 input_ids=batch["input_ids"],
                 text_embs=text_embs,
                 audio_embs=audio_embs,
                 target_ids=target_ids_full,
                 padding_id=self.text_pad_id,
                 placeholder_id=self.audio_locator_tag_id,
-                device_mesh=getattr(self, "_device_mesh", None),
+                device_mesh=device_mesh,
             )
+            if dummy_audio_loss is not None:
+                ans["dummy_audio_loss"] = dummy_audio_loss
+            return ans
 
         input_embs, target_ids, attention_mask = replace_placeholders_and_build_targets(
             input_ids=batch["input_ids"],
@@ -252,12 +263,15 @@ def prepare_inputs(self, batch: dict):
                 attention_mask = attention_mask[:, :-remainder]
                 target_ids = target_ids[:, :-remainder]
 
-        return {
+        ans = {
             "input_embeds": input_embs,
             "attention_mask": attention_mask,
             "target_ids": target_ids,
             "llm_kwargs": {},
         }
+        if dummy_audio_loss is not None:
+            ans["dummy_audio_loss"] = dummy_audio_loss
+        return ans
 
     def on_fit_start(self) -> None:
         """Configure the MoE aux-loss backward scaler to cancel FSDP's gradient
@@ -340,6 +354,8 @@ def _training_step_batch(self, batch: dict, batch_idx: int):
                 ignore_index=-100,
             )
             loss = loss_sum * dp_size / num_frames_global
+        if (dummy_audio_loss := inputs.get("dummy_audio_loss")) is not None:
+            loss = loss + dummy_audio_loss
 
         # Display the local per-token CE so logged values stay on the same scale as before
         # this fix. The gradient-carrying ``loss`` above is the globally-normalized quantity.
diff --git a/nemo/collections/speechlm2/parts/cp_helpers.py b/nemo/collections/speechlm2/parts/cp_helpers.py
index cae688030eb1..f278ff0923dc 100644
--- a/nemo/collections/speechlm2/parts/cp_helpers.py
+++ b/nemo/collections/speechlm2/parts/cp_helpers.py
@@ -34,7 +34,10 @@
 from torch import Tensor
 from torch.distributed.nn.functional import all_gather as differentiable_all_gather
 
-from nemo.collections.speechlm2.parts.encoder_chunking import encode_audio_with_optional_chunking
+from nemo.collections.speechlm2.parts.encoder_chunking import (
+    _get_min_chunk_size_samples,
+    encode_audio_with_optional_chunking,
+)
 
 
 def get_cp_mesh(device_mesh) -> tuple[Optional[object], int, int]:
@@ -49,6 +52,22 @@ def get_cp_mesh(device_mesh) -> tuple[Optional[object], int, int]:
     return cp_mesh, cp_mesh.size(), cp_rank
 
 
+def get_perception_fsdp_group(device_mesh):
+    """Return the process group used to FSDP-shard the perception module, if any."""
+    if device_mesh is None:
+        return None
+    dim_names = device_mesh.mesh_dim_names or ()
+    if "dp_replicate" in dim_names and "dp_shard_cp" in dim_names:
+        fsdp_mesh = device_mesh["dp_replicate", "dp_shard_cp"]
+    elif "dp_shard_cp" in dim_names:
+        fsdp_mesh = device_mesh["dp_shard_cp"]
+    else:
+        fsdp_mesh = device_mesh["dp"]
+    if fsdp_mesh.size() <= 1:
+        return None
+    return fsdp_mesh.get_group()
+
+
 def encode_audio_with_cp_distribution(
     perception,
     audios: Tensor,
@@ -57,11 +76,13 @@ def encode_audio_with_cp_distribution(
     chunk_size_seconds: Optional[float],
     sampling_rate: int,
     cp_mesh=None,
-) -> list[Tensor]:
+    fsdp_sync_group=None,
+    return_dummy_loss: bool = False,
+) -> list[Tensor] | tuple[list[Tensor], Tensor | None]:
     """Distribute the audio encoder forward across CP ranks.
 
     Falls back to :func:`encode_audio_with_optional_chunking` when ``cp_mesh is
-    None`` or there are no audios in the batch.
+    None``.
 
     With CP active, each rank encodes a contiguous slice of the audio batch
     (rank ``r`` gets ``audios[r*per_rank : (r+1)*per_rank]`` where
@@ -77,16 +98,40 @@ def encode_audio_with_cp_distribution(
     zero-padded to a globally-consistent ``max_L`` and ``all_gather``ed across
     the CP group. The full ordered list is reconstructed and dummies are
     dropped, so the return value is identical on every CP rank.
+
+    When ``fsdp_sync_group`` is provided and this rank has a text-only batch
+    while another rank in the perception FSDP group has audio, this function
+    runs a single dummy audio row through ``perception`` and returns a zero-valued
+    loss term (as the second tuple element, and only when ``return_dummy_loss=True``).
+    Adding that term to the training loss preserves the autograd edge so FSDP
+    forward/backward hooks fire on the text-only rank without affecting gradients numerically.
     """
     B_aud = int(audios.shape[0])
-    if cp_mesh is None or B_aud == 0:
-        return encode_audio_with_optional_chunking(
+    fsdp_group_has_audio = _fsdp_group_has_audio(B_aud, audios.device, fsdp_sync_group)
+    if B_aud == 0:
+        dummy_loss = (
+            _dummy_audio_loss_for_fsdp_sync(
+                perception,
+                audios,
+                audio_lens,
+                chunk_size_seconds=chunk_size_seconds,
+                sampling_rate=sampling_rate,
+            )
+            if fsdp_group_has_audio
+            else None
+        )
+        ans = []
+        return (ans, dummy_loss) if return_dummy_loss else ans
+
+    if cp_mesh is None:
+        ans = encode_audio_with_optional_chunking(
             perception,
             audios,
             audio_lens,
             chunk_size_seconds=chunk_size_seconds,
             sampling_rate=sampling_rate,
         )
+        return (ans, None) if return_dummy_loss else ans
 
     cp_size = cp_mesh.size()
     cp_group = cp_mesh.get_group()
@@ -144,4 +189,36 @@ def encode_audio_with_cp_distribution(
             L = int(gathered_lens[r][i].item())
             full_embs.append(gathered_stack[r][i, :L])
 
-    return full_embs
+    return (full_embs, None) if return_dummy_loss else full_embs
+
+
+def _fsdp_group_has_audio(B_aud: int, device: torch.device, fsdp_sync_group=None) -> bool:
+    if fsdp_sync_group is None or not (dist.is_available() and dist.is_initialized()):
+        return False
+    local_has_audio = torch.tensor(1 if B_aud > 0 else 0, dtype=torch.int32, device=device)
+    dist.all_reduce(local_has_audio, op=dist.ReduceOp.MAX, group=fsdp_sync_group)
+    return bool(int(local_has_audio.item()))
+
+
+def _dummy_audio_loss_for_fsdp_sync(
+    perception,
+    audios: Tensor,
+    audio_lens: Tensor,
+    *,
+    chunk_size_seconds: Optional[float],
+    sampling_rate: int,
+) -> Tensor | None:
+    # The preprocessor minimum alone can be too short after Conformer subsampling, leaving
+    # BatchNorm with a single value per channel, so use at least one second's worth of samples.
+    dummy_len = max(_get_min_chunk_size_samples(perception), int(sampling_rate))
+    dummy_audio = torch.zeros(1, dummy_len, dtype=audios.dtype, device=audios.device)
+    dummy_lens = torch.full((1,), dummy_len, dtype=audio_lens.dtype, device=audio_lens.device)
+    dummy_embs = encode_audio_with_optional_chunking(
+        perception,
+        dummy_audio,
+        dummy_lens,
+        chunk_size_seconds=chunk_size_seconds,
+        sampling_rate=sampling_rate,
+    )
+    dummy_loss = sum(emb.float().sum() for emb in dummy_embs)
+    return dummy_loss * 0.0
diff --git a/nemo/collections/speechlm2/parts/encoder_chunking.py b/nemo/collections/speechlm2/parts/encoder_chunking.py
index 79a1c9f5c02c..ff412758b797 100644
--- a/nemo/collections/speechlm2/parts/encoder_chunking.py
+++ b/nemo/collections/speechlm2/parts/encoder_chunking.py
@@ -48,8 +48,11 @@ def encode_audio_with_optional_chunking(
         embeddings are concatenated along the time axis to recover a single tensor per
         original audio row.
     """
+    if input_signal_length.numel() == 0:
+        return []
+
     chunk_size_samples = _get_chunk_size_samples(chunk_size_seconds, sampling_rate)
-    if chunk_size_samples is None or input_signal_length.numel() == 0:
+    if chunk_size_samples is None:
         audio_embs, audio_emb_lens = perception(input_signal=input_signal, input_signal_length=input_signal_length)
         return _unpad_audio_embeddings(audio_embs, audio_emb_lens)
 
diff --git a/scripts/dataloading/build_indexes.py b/scripts/dataloading/build_indexes.py
index d1eedd1a4269..ed2bb1c24812 100644
--- a/scripts/dataloading/build_indexes.py
+++ b/scripts/dataloading/build_indexes.py
@@ -179,12 +179,23 @@ def _discover_keys(entry, jobs: list[IndexJob], indexes_root: Optional[str]) ->
         for p in _expand_tars(entry.get("tarred_audio_filepaths")):
             jobs.append(IndexJob(p, NEMO_TAR, indexes_root))
     if (paths := entry.get("paths")) is not None:
-        for p in _expand_jsonl(paths):
-            jobs.append(IndexJob(p, JSONL, indexes_root))
+        _discover_paths(paths, jobs, indexes_root)
     if (sub := _resolve_input_cfg(entry.get("input_cfg"))) is not None:
         discover(sub, jobs, indexes_root)
 
 
+def _discover_paths(paths, jobs: list[IndexJob], indexes_root: Optional[str]) -> None:
+    for p in _expand_jsonl(paths):
+        path = Path(p)
+        if path.is_dir():
+            for tar_path in sorted(path.rglob("*.tar")):
+                jobs.append(IndexJob(str(tar_path), NEMO_TAR, indexes_root))
+        elif path.suffix == ".tar":
+            jobs.append(IndexJob(p, NEMO_TAR, indexes_root))
+        else:
+            jobs.append(IndexJob(p, JSONL, indexes_root))
+
+
 def discover(entry, jobs: list[IndexJob], indexes_root: Optional[str] = None) -> None:
     """Walk one entry of an ``input_cfg`` and append every required IndexJob."""
     if isinstance(entry, (list, ListConfig)):
@@ -241,9 +252,8 @@ def discover(entry, jobs: list[IndexJob], indexes_root: Optional[str] = None) ->
         _discover_shar(entry.get("shar_path"), jobs, indexes_root)
         return
 
-    if typ == "txt_jsonl":
-        for p in _expand_jsonl(entry.get("paths")):
-            jobs.append(IndexJob(p, JSONL, indexes_root))
+    if typ in ("txt_jsonl", "nemotron_text_converation"):
+        _discover_paths(entry.get("paths"), jobs, indexes_root)
         return
 
     # Unknown type — nothing to do.
diff --git a/tests/collections/speechlm2/test_salm_cp_helpers.py b/tests/collections/speechlm2/test_salm_cp_helpers.py
index c183055effa1..5ddcb9339896 100644
--- a/tests/collections/speechlm2/test_salm_cp_helpers.py
+++ b/tests/collections/speechlm2/test_salm_cp_helpers.py
@@ -133,11 +133,17 @@ class _TrainablePerceptionStub(torch.nn.Module):
     def __init__(self):
         super().__init__()
         self.scale = torch.nn.Parameter(torch.tensor(2.0))
+        self.num_calls = 0
+        self.last_input_signal_shape = None
+        self.last_input_signal_length = None
 
     def forward(self, *, input_signal, input_signal_length):
+        self.num_calls += 1
+        self.last_input_signal_shape = tuple(input_signal.shape)
+        self.last_input_signal_length = input_signal_length.detach().cpu().tolist()
         B = input_signal.shape[0]
-        embs = input_signal[:, :2].unsqueeze(-1) * self.scale
-        lens = torch.full((B,), 2, dtype=input_signal_length.dtype, device=input_signal_length.device)
+        embs = input_signal[:, : max(1, min(2, input_signal.shape[1]))].unsqueeze(-1) * self.scale
+        lens = torch.full((B,), embs.shape[1], dtype=input_signal_length.dtype, device=input_signal_length.device)
         return embs, lens
 
 
@@ -174,3 +180,101 @@ def fake_lens_all_gather(gathered_lens, local_lens, group):
     embs[0].sum().backward()
     assert perception.scale.grad is not None
     assert perception.scale.grad.item() == pytest.approx(3.0)
+
+
+def test_encode_audio_empty_rank_runs_dummy_when_fsdp_group_has_audio(monkeypatch):
+    perception = _TrainablePerceptionStub()
+    audios = torch.zeros(0, 1600, dtype=torch.float32)
+    audio_lens = torch.zeros(0, dtype=torch.long)
+    all_reduce_calls = []
+
+    def fake_all_reduce(tensor, op=None, group=None):
+        all_reduce_calls.append((int(tensor.item()), group))
+        tensor.fill_(1)
+
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_available", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_initialized", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.all_reduce", fake_all_reduce)
+
+    embs, dummy_audio_loss = encode_audio_with_cp_distribution(
+        perception,
+        audios,
+        audio_lens,
+        chunk_size_seconds=None,
+        sampling_rate=16000,
+        cp_mesh=None,
+        fsdp_sync_group="fake-fsdp-group",
+        return_dummy_loss=True,
+    )
+
+    assert embs == []
+    assert perception.num_calls == 1
+    assert perception.last_input_signal_shape == (1, 16000)
+    assert perception.last_input_signal_length == [16000]
+    assert all_reduce_calls == [(0, "fake-fsdp-group")]
+    assert dummy_audio_loss is not None
+    assert dummy_audio_loss.requires_grad
+    assert dummy_audio_loss.item() == pytest.approx(0.0)
+    dummy_audio_loss.backward()
+    assert perception.scale.grad is not None
+    assert perception.scale.grad.item() == pytest.approx(0.0)
+
+
+def test_encode_audio_empty_rank_skips_dummy_when_fsdp_group_has_no_audio(monkeypatch):
+    perception = _TrainablePerceptionStub()
+    audios = torch.zeros(0, 1600, dtype=torch.float32)
+    audio_lens = torch.zeros(0, dtype=torch.long)
+    all_reduce_calls = []
+
+    def fake_all_reduce(tensor, op=None, group=None):
+        all_reduce_calls.append((int(tensor.item()), group))
+
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_available", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_initialized", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.all_reduce", fake_all_reduce)
+
+    embs, dummy_audio_loss = encode_audio_with_cp_distribution(
+        perception,
+        audios,
+        audio_lens,
+        chunk_size_seconds=None,
+        sampling_rate=16000,
+        cp_mesh=None,
+        fsdp_sync_group="fake-fsdp-group",
+        return_dummy_loss=True,
+    )
+
+    assert embs == []
+    assert perception.num_calls == 0
+    assert all_reduce_calls == [(0, "fake-fsdp-group")]
+    assert dummy_audio_loss is None
+
+
+def test_encode_audio_nonempty_rank_participates_in_fsdp_audio_probe(monkeypatch):
+    perception = _TrainablePerceptionStub()
+    audios = torch.tensor([[1.0, 2.0, 0.0]])
+    audio_lens = torch.tensor([3], dtype=torch.long)
+    all_reduce_calls = []
+
+    def fake_all_reduce(tensor, op=None, group=None):
+        all_reduce_calls.append((int(tensor.item()), group))
+
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_available", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.is_initialized", lambda: True)
+    monkeypatch.setattr("nemo.collections.speechlm2.parts.cp_helpers.dist.all_reduce", fake_all_reduce)
+
+    embs, dummy_audio_loss = encode_audio_with_cp_distribution(
+        perception,
+        audios,
+        audio_lens,
+        chunk_size_seconds=None,
+        sampling_rate=16000,
+        cp_mesh=None,
+        fsdp_sync_group="fake-fsdp-group",
+        return_dummy_loss=True,
+    )
+
+    assert len(embs) == 1
+    assert perception.num_calls == 1
+    assert all_reduce_calls == [(1, "fake-fsdp-group")]
+    assert dummy_audio_loss is None

From 0f4a125e913d78278fd8c13af2d175ee28584f62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Thu, 11 Jun 2026 08:11:22 -0700
Subject: [PATCH 16/30] Fix ShareGPT multimodal resumable dataloading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 nemo/collections/common/data/lhotse/cutset.py |   1 +
 .../common/data/lhotse/text_adapters.py       | 380 +++++++++++++-----
 scripts/dataloading/build_indexes.py          |  43 +-
 .../test_lhotse_multimodal_dataloading.py     | 209 ++++++++++
 4 files changed, 529 insertions(+), 104 deletions(-)

diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py
index 16b4be0020f1..f3129a44245e 100644
--- a/nemo/collections/common/data/lhotse/cutset.py
+++ b/nemo/collections/common/data/lhotse/cutset.py
@@ -454,6 +454,7 @@ def read_share_gpt_as_conversation(config) -> tuple[CutSet, bool]:
             slice_length=config.get("slice_length"),
             indexed=config.get("indexed", False),
             indexes_root=config.get("indexes_root", None),
+            skip_missing_manifest_entries=config.get("skip_missing_manifest_entries", False),
         )
     )
     if not config.get("force_finite", False):
diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py
index d8f96bea8b0f..18350f516605 100644
--- a/nemo/collections/common/data/lhotse/text_adapters.py
+++ b/nemo/collections/common/data/lhotse/text_adapters.py
@@ -26,6 +26,7 @@
 import numpy as np
 import torch
 from lhotse import AudioSource, CutSet, Recording
+from lhotse.audio import AudioLoadingError
 from lhotse.custom import CustomFieldMixin
 from lhotse.cut import Cut
 from lhotse.dataset import AudioSamples
@@ -1386,63 +1387,6 @@ def _iter_jsonl(self):
         self.epoch += 1
 
 
-def _normalize_audio_placeholders(val: Union[str, list[str], None]) -> list[str]:
-    if val is None:
-        return ["<sound>", "<speech>"]
-    return [val] if isinstance(val, str) else list(val)
-
-
-def _transform_sharegpt(placeholders: list[str], data: dict, audio_path_fallback: str | None = None) -> list[dict]:
-    """Parse a ShareGPT dict into a flat list of ``{"type", "from", "value", ...}`` turn dicts."""
-    conversations = []
-    audio_path = data.get("sound") or data.get("ori_sound") or audio_path_fallback
-    for turn in data["conversations"]:
-        role = "user" if turn["from"].lower() in ("human", "user") else "assistant"
-        found = next((p for p in placeholders if p in turn["value"]), None)
-        if found:
-            parts = turn["value"].split(found)
-            if parts[0].strip():
-                conversations.append({"type": "text", "from": role.title(), "value": parts[0].strip()})
-            if not audio_path:
-                raise ValueError(
-                    f"Conversation turn contains audio placeholder '{found}' but no audio path "
-                    f"was found in 'sound', 'ori_sound' fields or fallback for sample id={data.get('id', '?')}"
-                )
-            conversations.append(
-                {
-                    "type": "audio",
-                    "from": role.title(),
-                    "value": audio_path,
-                    "duration": turn.get("duration", None),
-                    "offset": turn.get("offset", 0.0),
-                }
-            )
-            if len(parts) > 1 and parts[1].strip():
-                conversations.append({"type": "text", "from": role.title(), "value": parts[1].strip()})
-        else:
-            conversations.append({"type": "text", "from": role.title(), "value": turn["value"]})
-    return conversations
-
-
-def _create_sharegpt_turns(audio_locator_tag: str, conversations: list[dict], resolve_cut) -> list:
-    """Build ``TextTurn`` / ``AudioTurn`` objects.  *resolve_cut(turn_dict) -> Cut* supplies audio."""
-    turns = []
-    for t in conversations:
-        if t["type"] == "text":
-            turns.append(TextTurn(value=t["value"], role=t["from"].lower()))
-        else:
-            cut = resolve_cut(t)
-            turns.append(
-                AudioTurn(
-                    cut=cut,
-                    text=cut.supervisions[0].text if cut.supervisions else None,
-                    role=t["from"].lower(),
-                    audio_locator_tag=audio_locator_tag,
-                )
-            )
-    return turns
-
-
 @dataclass
 class NeMoMultimodalConversationShareGPTJsonlAdapter(IteratorNode):
     """
@@ -1479,6 +1423,7 @@ class NeMoMultimodalConversationShareGPTJsonlAdapter(IteratorNode):
     slice_length: int | None = None
     indexed: bool = False
     indexes_root: Optional[Pathlike] = None
+    skip_missing_manifest_entries: bool = False
 
     def __post_init__(self):
         from lhotse.indexing import index_file_path
@@ -1560,30 +1505,40 @@ def load_state_dict(self, sd: dict) -> None:
         self._iter_state.load_state_dict(sd)
         self.epoch = sd.get("epoch", 0)
 
-    def _build_one(self, data: dict, shard_idx: int) -> NeMoMultimodalConversation:
-        conversations = _transform_sharegpt(self.audio_placeholders, data)
-        if self._tar_readers:
-            tar_reader = self._tar_readers[shard_idx]
-            tar_path = self.tarred_audio_filepaths[shard_idx]
+    def _build_one(self, data: dict, shard_idx: int) -> NeMoMultimodalConversation | None:
+        try:
+            conversations = _ShareGPTConversationParser(self.audio_placeholders, data).transform()
+            if self._tar_readers:
+                tar_reader = self._tar_readers[shard_idx]
+                tar_path = self.tarred_audio_filepaths[shard_idx]
+                return NeMoMultimodalConversation(
+                    id=data.get("id", "missing-example-id"),
+                    turns=_ShareGPTConversationParser.create_turns(
+                        self.audio_locator_tag,
+                        conversations,
+                        lambda t: self._resolve_cut_from_indexed_tar(t, tar_reader, tar_path),
+                    ),
+                    token_equivalent_duration=self.token_equivalent_duration,
+                )
+            manifest_path = self._cuts_readers[shard_idx].path
             return NeMoMultimodalConversation(
                 id=data.get("id", "missing-example-id"),
-                turns=_create_sharegpt_turns(
+                turns=_ShareGPTConversationParser.create_turns(
                     self.audio_locator_tag,
                     conversations,
-                    lambda t: self._resolve_cut_from_indexed_tar(t, tar_reader, tar_path),
+                    lambda t, _p=manifest_path: self._resolve_cut_from_path(t, _p),
                 ),
                 token_equivalent_duration=self.token_equivalent_duration,
             )
-        manifest_path = self._cuts_readers[shard_idx].path
-        return NeMoMultimodalConversation(
-            id=data.get("id", "missing-example-id"),
-            turns=_create_sharegpt_turns(
-                self.audio_locator_tag,
-                conversations,
-                lambda t, _p=manifest_path: self._resolve_cut_from_path(t, _p),
-            ),
-            token_equivalent_duration=self.token_equivalent_duration,
-        )
+        except _SHAREGPT_AUDIO_LOADING_ERRORS as e:
+            if not self.skip_missing_manifest_entries:
+                raise
+            logging.warning(
+                "Skipping ShareGPT sample due to audio loading failure: "
+                f"sample_id={data.get('id', 'missing-example-id')!r} shard_idx={shard_idx} "
+                f"error={type(e).__name__}: {e}"
+            )
+            return None
 
     def _resolve_cut_from_indexed_tar(self, turn, tar_reader, tar_path):
         import io as _io
@@ -1592,17 +1547,23 @@ def _resolve_cut_from_indexed_tar(self, turn, tar_reader, tar_path):
         from lhotse import AudioSource as _AudioSource
         from lhotse import Recording as _Recording
 
-        audio_bytes = tar_reader.get(turn["value"])
+        audio_path = os.fspath(
+            _ShareGPTConversationParser.expect_one_audio_path(
+                turn["value"], sample_id=turn.get("id", "?"), context="audio turn value"
+            )
+        )
+        turn_for_id = {**turn, "value": audio_path}
+        audio_bytes = tar_reader.get(audio_path)
         meta = _sf.info(_io.BytesIO(audio_bytes))
         recording = _Recording(
-            id=turn["value"],
+            id=audio_path,
             sources=[_AudioSource(type="memory", channels=list(range(meta.channels)), source=audio_bytes)],
             sampling_rate=int(meta.samplerate),
             num_samples=meta.frames,
             duration=meta.duration,
         )
         cut = recording.to_cut().truncate(offset=turn.get("offset", 0.0), duration=turn.get("duration"))
-        return cut.with_id(self._make_cut_id(cut, turn))
+        return cut.with_id(self._make_cut_id(cut, turn_for_id))
 
     def __getitem__(self, token):
         if not self.indexed:
@@ -1613,6 +1574,10 @@ def __getitem__(self, token):
         shard_idx, local_idx = self._resolve(idx)
         data = self._cuts_readers[shard_idx][local_idx]
         convo = self._build_one(data, shard_idx)
+        if convo is None:
+            raise RuntimeError(
+                f"ShareGPT sample at global index {idx} is not decodable; cannot satisfy random-access __getitem__."
+            )
         return attach_graph_origin(convo, idx)
 
     def __iter__(self) -> Iterator[NeMoMultimodalConversation]:
@@ -1629,6 +1594,8 @@ def _iter_indexed_node(self) -> Iterator[NeMoMultimodalConversation]:
             shard_idx, local_idx = self._resolve(global_idx)
             data = self._cuts_readers[shard_idx][local_idx]
             convo = self._build_one(data, shard_idx)
+            if convo is None:
+                continue
             attach_graph_origin(convo, global_idx)
             yield convo
         self.epoch += 1
@@ -1644,14 +1611,22 @@ def _make_cut_id(self, cut, turn) -> str:
         return Path(turn['value']).stem
 
     def _resolve_cut_from_path(self, turn, manifest_path):
-        if is_valid_url(turn["value"]):
-            data = open_best(turn["value"], "rb").read()
-            cut = Recording.from_bytes(data, recording_id=turn["value"]).to_cut()
+        audio_path = os.fspath(
+            _ShareGPTConversationParser.expect_one_audio_path(
+                turn["value"], sample_id=turn.get("id", "?"), context="audio turn value"
+            )
+        )
+        turn_for_id = {**turn, "value": audio_path}
+        if is_valid_url(audio_path):
+            data = open_best(audio_path, "rb").read()
+            cut = Recording.from_bytes(data, recording_id=audio_path).to_cut()
         elif self.audio_root is not None:
-            cut = Recording.from_file(get_full_path(turn["value"], data_dir=self.audio_root)).to_cut()
+            cut = Recording.from_file(get_full_path(audio_path, data_dir=self.audio_root)).to_cut()
         else:
-            cut = Recording.from_file(get_full_path(turn["value"], manifest_path)).to_cut()
-        return cut.truncate(offset=turn["offset"], duration=turn["duration"]).with_id(self._make_cut_id(cut, turn))
+            cut = Recording.from_file(get_full_path(audio_path, manifest_path)).to_cut()
+        return cut.truncate(offset=turn["offset"], duration=turn["duration"]).with_id(
+            self._make_cut_id(cut, turn_for_id)
+        )
 
     def _iter_tar(self):
         # See NeMoMultimodalConversationJsonlAdapter._iter_tar for GetBatch-mode rationale.
@@ -1672,7 +1647,7 @@ def _iter_tar(self):
             )
             cntr = 0
             for idx, data in enumerate(jsonl):
-                conversations = _transform_sharegpt(self.audio_placeholders, data)
+                conversations = _ShareGPTConversationParser(self.audio_placeholders, data).transform()
                 audio_turns = [t for t in conversations if t["type"] == "audio"]
                 cuts = []
                 for turn in audio_turns:
@@ -1707,7 +1682,9 @@ def _iter_tar(self):
 
                 yield NeMoMultimodalConversation(
                     id=data.get("id", "missing-example-id"),
-                    turns=_create_sharegpt_turns(self.audio_locator_tag, conversations, lambda t: cuts.popleft()),
+                    turns=_ShareGPTConversationParser.create_turns(
+                        self.audio_locator_tag, conversations, lambda t: cuts.popleft()
+                    ),
                     token_equivalent_duration=self.token_equivalent_duration,
                 )
                 cntr += 1
@@ -1725,18 +1702,28 @@ def _iter_jsonl(self):
                 jsonl_iter = list(jsonl_iter)
                 rng.shuffle(jsonl_iter)
             for data in jsonl_iter:
-                conversations = _transform_sharegpt(self.audio_placeholders, data)
-                yield NeMoMultimodalConversation(
-                    id=data.get("id", "missing-example-id"),
-                    turns=_create_sharegpt_turns(
-                        self.audio_locator_tag,
-                        conversations,
-                        lambda t, _p=path: self._resolve_cut_from_path(t, _p),
-                    ),
-                    token_equivalent_duration=self.token_equivalent_duration,
-                )
+                try:
+                    conversations = _ShareGPTConversationParser(self.audio_placeholders, data).transform()
+                    yield NeMoMultimodalConversation(
+                        id=data.get("id", "missing-example-id"),
+                        turns=_ShareGPTConversationParser.create_turns(
+                            self.audio_locator_tag,
+                            conversations,
+                            lambda t, _p=path: self._resolve_cut_from_path(t, _p),
+                        ),
+                        token_equivalent_duration=self.token_equivalent_duration,
+                    )
+                except _SHAREGPT_AUDIO_LOADING_ERRORS as e:
+                    if not self.skip_missing_manifest_entries:
+                        raise
+                    logging.warning(
+                        "Skipping ShareGPT sample due to audio loading failure: "
+                        f"sample_id={data.get('id', 'missing-example-id')!r} manifest_path={path} "
+                        f"error={type(e).__name__}: {e}"
+                    )
         self.epoch += 1
 
+
 @dataclass
 class NeMoMultimodalConversationShareGPTWebdatasetAdapter(IteratorNode):
     """
@@ -1879,11 +1866,11 @@ def _get_rng(self) -> random.Random:
     def _yield_from_sample(self, json_data, audio_bytes, audio_name):
         sample_id = Path(audio_name).stem
         recording = Recording.from_bytes(audio_bytes, recording_id=sample_id)
-        conversations = _transform_sharegpt(self.audio_placeholders, json_data, audio_name)
+        conversations = _ShareGPTConversationParser(self.audio_placeholders, json_data, audio_name).transform()
         base_cut = recording.to_cut()
         return NeMoMultimodalConversation(
             id=json_data.get("id", sample_id),
-            turns=_create_sharegpt_turns(
+            turns=_ShareGPTConversationParser.create_turns(
                 self.audio_locator_tag,
                 conversations,
                 lambda t: base_cut.truncate(offset=t.get("offset", 0.0), duration=t.get("duration")),
@@ -1909,6 +1896,7 @@ def _iter_sequential(self):
                     yield self._yield_from_sample(json_data, audio_bytes, audio_name)
         self.epoch += 1
 
+
 class TarIterator:
     """
     Copy of lhotse.shar.readers.tar.TarIterator, modified to read both Lhotse-Shar style audio tar files
@@ -2019,3 +2007,201 @@ def _setup_writers(self):
             Path(self.output_dir).mkdir(exist_ok=True)
         self.manifest_writer = JsonlShardWriter(f"{self.output_dir}/manifest_{self.shard_idx}.jsonl", shard_size=None)
         self.tar_writer = AudioTarWriter(f"{self.output_dir}/audio_{self.shard_idx}.tar", shard_size=None)
+
+
+_SHAREGPT_AUDIO_LOADING_ERRORS = (AudioLoadingError, OSError)
+
+
+def _normalize_audio_placeholders(val: Union[str, list[str], None]) -> list[str]:
+    if val is None:
+        return ["<sound>", "<speech>"]
+    return [val] if isinstance(val, str) else list(val)
+
+
+class _ShareGPTConversationParser:
+    """Normalize ShareGPT multimodal records for the conversation adapters.
+
+    ShareGPT audio examples are intentionally loose: audio paths may be stored
+    in ``sound`` or ``ori_sound``, may be scalar or list-valued, and placement
+    in the text is expressed with placeholders such as ``<sound>``. This class
+    owns those conventions and emits the flat internal turn dictionaries shared
+    by the JSONL and WebDataset adapters.
+    """
+
+    def __init__(
+        self, placeholders: list[str], data: dict, audio_path_fallback: str | None = None
+    ) -> None:
+        self.placeholders = placeholders
+        self.data = data
+        self.sample_id = data.get("id", "?")
+        audio_path_value = data.get("sound") or data.get("ori_sound") or audio_path_fallback
+        self.audio_paths = self.normalize_audio_paths(
+            audio_path_value, sample_id=self.sample_id, field_name="sound"
+        )
+
+    def transform(self) -> list[dict]:
+        """Convert one raw ShareGPT sample into text/audio turn dictionaries.
+
+        User/human placeholders consume audio. Assistant turns are preserved as
+        text so literal tokens such as an HTML ``<audio>`` tag are not mistaken
+        for data references.
+        """
+        conversations = []
+        placeholder_count = self._placeholder_count()
+        if (
+            len(self.audio_paths) > 1
+            and placeholder_count > 1
+            and len(self.audio_paths) != placeholder_count
+        ):
+            raise ValueError(
+                f"ShareGPT sample id={self.sample_id} has {len(self.audio_paths)} audio paths but "
+                f"{placeholder_count} audio placeholders. Use one path for all placeholders, one path per "
+                f"placeholder, or a single placeholder for all paths."
+            )
+
+        audio_idx = 0
+        for turn in self.data["conversations"]:
+            role = self.role(turn)
+            remaining = turn["value"]
+            if not self.turn_can_consume_audio(turn):
+                conversations.append({"type": "text", "from": role.title(), "value": remaining.strip()})
+                continue
+
+            found_any = False
+            while True:
+                idx, found = self.find_next_audio_placeholder(remaining, self.placeholders)
+                if found is None:
+                    if remaining.strip() or not found_any:
+                        conversations.append({"type": "text", "from": role.title(), "value": remaining.strip()})
+                    break
+
+                found_any = True
+                prefix = remaining[:idx]
+                if prefix.strip():
+                    conversations.append({"type": "text", "from": role.title(), "value": prefix.strip()})
+                if not self.audio_paths:
+                    raise ValueError(
+                        f"Conversation turn contains audio placeholder '{found}' but no audio path was found in "
+                        f"'sound', 'ori_sound' fields or fallback for sample id={self.sample_id}"
+                    )
+
+                if len(self.audio_paths) > 1 and placeholder_count == 1:
+                    path_indexes = range(len(self.audio_paths))
+                elif len(self.audio_paths) > 1:
+                    path_indexes = [audio_idx]
+                    audio_idx += 1
+                else:
+                    path_indexes = [0]
+
+                for path_idx in path_indexes:
+                    audio_turn = {
+                        "type": "audio",
+                        "from": role.title(),
+                        "value": self.audio_paths[path_idx],
+                        "duration": self.audio_turn_field(turn, "duration", path_idx, self.sample_id),
+                        "offset": self.audio_turn_field(turn, "offset", path_idx, self.sample_id, default=0.0),
+                    }
+                    if "sampling_rate" in turn:
+                        audio_turn["sampling_rate"] = self.audio_turn_field(
+                            turn, "sampling_rate", path_idx, self.sample_id
+                        )
+                    conversations.append(audio_turn)
+                remaining = remaining[idx + len(found) :]
+        return conversations
+
+    def _placeholder_count(self) -> int:
+        return sum(
+            self.count_audio_placeholders(turn["value"], self.placeholders)
+            for turn in self.data["conversations"]
+            if self.turn_can_consume_audio(turn)
+        )
+
+    @staticmethod
+    def create_turns(audio_locator_tag: str, conversations: list[dict], resolve_cut) -> list:
+        """Build ``TextTurn`` / ``AudioTurn`` objects using ``resolve_cut(turn_dict)`` for audio."""
+        turns = []
+        for turn in conversations:
+            if turn["type"] == "text":
+                turns.append(TextTurn(value=turn["value"], role=turn["from"].lower()))
+            else:
+                cut = resolve_cut(turn)
+                turns.append(
+                    AudioTurn(
+                        cut=cut,
+                        text=cut.supervisions[0].text if cut.supervisions else None,
+                        role=turn["from"].lower(),
+                        audio_locator_tag=audio_locator_tag,
+                    )
+                )
+        return turns
+
+    @classmethod
+    def expect_one_audio_path(cls, value, sample_id: str, context: str) -> Pathlike:
+        paths = cls.normalize_audio_paths(value, sample_id=sample_id, field_name=context)
+        if len(paths) != 1:
+            raise ValueError(
+                f"ShareGPT sample id={sample_id} resolved one audio turn to {len(paths)} audio paths. "
+                f"Multiple paths must be expanded into separate audio turns before loading."
+            )
+        return paths[0]
+
+    @staticmethod
+    def normalize_audio_paths(value, sample_id: str, field_name: str) -> list[Pathlike]:
+        if value is None or value == "":
+            return []
+        if isinstance(value, (str, os.PathLike)):
+            return [value]
+        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
+            paths = list(value)
+            for idx, path in enumerate(paths):
+                if not isinstance(path, (str, os.PathLike)):
+                    raise ValueError(
+                        f"ShareGPT sample id={sample_id} has unsupported {field_name}[{idx}]={path!r}; "
+                        f"expected a string or os.PathLike audio path."
+                    )
+            return paths
+        raise ValueError(
+            f"ShareGPT sample id={sample_id} has unsupported {field_name}={value!r}; "
+            f"expected a string, os.PathLike, or a list of audio paths."
+        )
+
+    @staticmethod
+    def find_next_audio_placeholder(text: str, placeholders: list[str]) -> tuple[int, str] | tuple[None, None]:
+        """Return ``(index, placeholder)`` for the earliest placeholder in ``text``, else ``(None, None)``."""
+        # Empty placeholders are ignored: they match at index 0 with zero width and would stall scan loops.
+        matches = [(idx, p) for p in placeholders if p and (idx := text.find(p)) >= 0]
+        return min(matches, key=lambda item: item[0]) if matches else (None, None)
+
+    @classmethod
+    def count_audio_placeholders(cls, text: str, placeholders: list[str]) -> int:
+        count = 0
+        remaining = text
+        while True:
+            idx, placeholder = cls.find_next_audio_placeholder(remaining, placeholders)
+            if placeholder is None:
+                return count
+            count += 1
+            remaining = remaining[idx + len(placeholder) :]
+
+    @staticmethod
+    def role(turn: dict) -> str:
+        return "user" if turn["from"].lower() in ("human", "user") else "assistant"
+
+    @classmethod
+    def turn_can_consume_audio(cls, turn: dict) -> bool:
+        return cls.role(turn) == "user"
+
+    @staticmethod
+    def audio_turn_field(turn: dict, field_name: str, audio_idx: int, sample_id: str, default=None):
+        value = turn.get(field_name, default)
+        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
+            values = list(value)
+            if len(values) == 1:
+                return values[0]
+            if audio_idx < len(values):
+                return values[audio_idx]
+            raise ValueError(
+                f"ShareGPT sample id={sample_id} has {len(values)} values for turn field {field_name!r}, "
+                f"but audio path index {audio_idx} was requested."
+            )
+        return value
diff --git a/scripts/dataloading/build_indexes.py b/scripts/dataloading/build_indexes.py
index ed2bb1c24812..f09f6b40196d 100644
--- a/scripts/dataloading/build_indexes.py
+++ b/scripts/dataloading/build_indexes.py
@@ -49,6 +49,7 @@
     python scripts/dataloading/build_indexes.py --force --workers 16 path/to/input_cfg.yaml
 """
 
+import json
 import logging
 import sys
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
@@ -196,6 +197,39 @@ def _discover_paths(paths, jobs: list[IndexJob], indexes_root: Optional[str]) ->
             jobs.append(IndexJob(p, JSONL, indexes_root))
 
 
+def _discover_share_gpt_webdataset(data_dir, jobs: list[IndexJob], indexes_root: Optional[str]) -> None:
+    """
+    Match NeMoMultimodalConversationShareGPTWebdatasetAdapter shard discovery.
+
+    The adapter reads ``wids-meta.json`` when present; otherwise it recursively
+    scans ``data_dir`` for tar shards. Energon exports commonly place shards
+    under nested directories such as ``0/sharded_manifests/shard-0.tar``, so a
+    non-recursive glob silently misses every runtime-required tar index.
+    """
+    if data_dir is None:
+        return
+
+    for raw in _flatten_path_spec(data_dir):
+        root = Path(raw)
+        meta_path = root / "wids-meta.json"
+        if meta_path.is_file():
+            with open(meta_path, encoding="utf-8") as f:  # JSON is UTF-8; do not depend on the locale default
+                meta = json.load(f)
+            for shard in meta.get("shardlist", []):
+                url = shard.get("url") if isinstance(shard, dict) else None
+                if url:
+                    jobs.append(IndexJob(str(root / url), WDS_TAR, indexes_root))
+        elif root.is_dir():
+            for tar_path in sorted(root.rglob("*.tar")):
+                jobs.append(IndexJob(str(tar_path), WDS_TAR, indexes_root))
+
+        # Preserve the previous behavior for optional root-level sidecar
+        # manifests without recursively indexing unrelated metadata files.
+        if root.is_dir():
+            for jsonl_path in sorted(root.glob("*.jsonl")):
+                jobs.append(IndexJob(str(jsonl_path), JSONL, indexes_root))
+
+
 def discover(entry, jobs: list[IndexJob], indexes_root: Optional[str] = None) -> None:
     """Walk one entry of an ``input_cfg`` and append every required IndexJob."""
     if isinstance(entry, (list, ListConfig)):
@@ -231,13 +265,8 @@ def discover(entry, jobs: list[IndexJob], indexes_root: Optional[str] = None) ->
         return
 
     if typ == "share_gpt_webdataset":
-        # Layout: data_dir/shard-N.tar [+ optional shard-N.tar.idx, manifest jsonl].
-        data_dir = entry.get("data_dir")
-        if data_dir is None:
-            return
-        for ext, kind in ((".tar", WDS_TAR), (".jsonl", JSONL)):
-            for p in sorted(Path(data_dir).glob(f"*{ext}")):
-                jobs.append(IndexJob(str(p), kind, indexes_root))
+        # Layout: data_dir/wids-meta.json or recursive **/*.tar.
+        _discover_share_gpt_webdataset(entry.get("data_dir"), jobs, indexes_root)
         return
 
     if typ == "lhotse":
diff --git a/tests/collections/common/test_lhotse_multimodal_dataloading.py b/tests/collections/common/test_lhotse_multimodal_dataloading.py
index dd1cc7134d8d..07933c39fa1f 100644
--- a/tests/collections/common/test_lhotse_multimodal_dataloading.py
+++ b/tests/collections/common/test_lhotse_multimodal_dataloading.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 import os
 import random
 from itertools import islice
@@ -21,6 +22,7 @@
 import pytest
 import torch
 from lhotse import CutSet, SupervisionSegment, compute_num_samples
+from lhotse.audio import AudioLoadingError
 from lhotse.shar import JsonlShardWriter
 from lhotse.testing.dummies import dummy_cut, dummy_recording
 from omegaconf import OmegaConf
@@ -319,6 +321,213 @@ def test_multimodal_conversation_input_sharegpt(sharegpt_conversations_path):
     assert t.cut.load_audio().shape == (1, 39200)
 
 
+def test_multimodal_conversation_input_sharegpt_list_audio_paths(tmp_path):
+    manifest_path = tmp_path / "sharegpt_list_manifest.jsonl"
+    dummy_recording(0, 1.0, with_data=True).to_cut().save_audio(tmp_path / "clip_a.wav")
+    dummy_recording(1, 1.5, with_data=True).to_cut().save_audio(tmp_path / "clip_b.wav")
+    dummy_recording(2, 2.0, with_data=True).to_cut().save_audio(tmp_path / "clip_c.wav")
+    data = [
+        {
+            "id": "single_list_path",
+            "sound": ["clip_a.wav"],
+            "conversations": [
+                {"from": "human", "value": "Listen <sound>"},
+                {"from": "gpt", "value": "done"},
+            ],
+        },
+        {
+            "id": "multi_list_path",
+            "sound": ["clip_b.wav", "clip_c.wav"],
+            "conversations": [
+                {"from": "human", "value": "Compare <sound> now"},
+                {"from": "gpt", "value": "done"},
+            ],
+        },
+    ]
+    lhotse.serialization.save_to_jsonl(data, manifest_path)
+
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<sound>"],
+    )
+
+    single, multi = list(adapter)
+    single_audio = [t for t in single.turns if isinstance(t, AudioTurn)]
+    assert len(single_audio) == 1
+    assert single_audio[0].cut.duration == 1.0
+    assert single_audio[0].cut.load_audio().shape == (1, 16000)
+
+    assert [type(t) for t in multi.turns] == [TextTurn, AudioTurn, AudioTurn, TextTurn, TextTurn]
+    assert multi.turns[0].value == "Compare"
+    assert multi.turns[3].value == "now"
+    multi_audio = [t for t in multi.turns if isinstance(t, AudioTurn)]
+    assert [t.cut.duration for t in multi_audio] == [1.5, 2.0]
+
+
+def test_multimodal_conversation_input_sharegpt_nested_audio_path_list_raises(tmp_path):
+    manifest_path = tmp_path / "sharegpt_bad_list_manifest.jsonl"
+    lhotse.serialization.save_to_jsonl(
+        [
+            {
+                "id": "bad_nested_path",
+                "sound": [["clip_a.wav"]],
+                "conversations": [{"from": "human", "value": "Listen <sound>"}],
+            }
+        ],
+        manifest_path,
+    )
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<sound>"],
+    )
+
+    with pytest.raises(ValueError, match=r"unsupported sound\[0\]"):
+        list(adapter)
+
+
+def test_multimodal_conversation_input_sharegpt_ignores_assistant_literal_audio_tag(tmp_path):
+    manifest_path = tmp_path / "sharegpt_assistant_literal_audio_manifest.jsonl"
+    dummy_recording(0, 1.0, with_data=True).to_cut().save_audio(tmp_path / "clip_a.wav")
+    dummy_recording(1, 1.5, with_data=True).to_cut().save_audio(tmp_path / "clip_b.wav")
+    dummy_recording(2, 2.0, with_data=True).to_cut().save_audio(tmp_path / "clip_c.wav")
+    lhotse.serialization.save_to_jsonl(
+        [
+            {
+                "id": "assistant_literal_audio_tag",
+                "sound": ["clip_a.wav", "clip_b.wav", "clip_c.wav"],
+                "conversations": [
+                    {"from": "human", "value": "First prompt <sound>"},
+                    {"from": "gpt", "value": "Use an HTML <audio> tag in the page."},
+                    {"from": "human", "value": "Second prompt <sound>"},
+                    {"from": "gpt", "value": "Then wire audio.play() to a button."},
+                    {"from": "human", "value": "Third prompt <sound>"},
+                    {"from": "gpt", "value": "done"},
+                ],
+            }
+        ],
+        manifest_path,
+    )
+
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<audio>", "<sound>", "<speech>"],
+    )
+
+    (conversation,) = list(adapter)
+    audio_turns = [t for t in conversation.turns if isinstance(t, AudioTurn)]
+    assert [t.cut.duration for t in audio_turns] == [1.0, 1.5, 2.0]
+    assistant_texts = [t.value for t in conversation.turns if isinstance(t, TextTurn) and t.role == "assistant"]
+    assert "Use an HTML <audio> tag in the page." in assistant_texts
+
+
+def test_multimodal_conversation_input_sharegpt_user_audio_path_placeholder_mismatch_raises(tmp_path):
+    manifest_path = tmp_path / "sharegpt_user_mismatch_manifest.jsonl"
+    lhotse.serialization.save_to_jsonl(
+        [
+            {
+                "id": "bad_user_mismatch",
+                "sound": ["clip_a.wav", "clip_b.wav", "clip_c.wav"],
+                "conversations": [
+                    {"from": "human", "value": "A <sound> B <sound> C <sound> D <sound>"},
+                    {"from": "gpt", "value": "done"},
+                ],
+            }
+        ],
+        manifest_path,
+    )
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<sound>"],
+    )
+
+    with pytest.raises(ValueError, match="3 audio paths but 4 audio placeholders"):
+        list(adapter)
+
+
+def test_multimodal_conversation_input_sharegpt_missing_audio_path_raises(tmp_path):
+    manifest_path = tmp_path / "sharegpt_missing_audio_manifest.jsonl"
+    lhotse.serialization.save_to_jsonl(
+        [
+            {
+                "id": "missing_audio",
+                "sound": "missing.wav",
+                "conversations": [
+                    {"from": "human", "value": "Listen <sound>"},
+                    {"from": "gpt", "value": "done"},
+                ],
+            }
+        ],
+        manifest_path,
+    )
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<sound>"],
+    )
+
+    with pytest.raises(AudioLoadingError):
+        list(adapter)
+
+
+@pytest.mark.parametrize("indexed", [False, True])
+def test_multimodal_conversation_input_sharegpt_missing_audio_path_skips_when_enabled(
+    tmp_path, caplog, indexed
+):
+    manifest_path = tmp_path / "sharegpt_skip_missing_audio_manifest.jsonl"
+    dummy_recording(0, 1.0, with_data=True).to_cut().save_audio(tmp_path / "good_a.wav")
+    dummy_recording(1, 1.5, with_data=True).to_cut().save_audio(tmp_path / "good_b.wav")
+    lhotse.serialization.save_to_jsonl(
+        [
+            {
+                "id": "good_a",
+                "sound": "good_a.wav",
+                "conversations": [
+                    {"from": "human", "value": "Listen <sound>"},
+                    {"from": "gpt", "value": "done"},
+                ],
+            },
+            {
+                "id": "missing_audio",
+                "sound": "missing.wav",
+                "conversations": [
+                    {"from": "human", "value": "Listen <sound>"},
+                    {"from": "gpt", "value": "done"},
+                ],
+            },
+            {
+                "id": "good_b",
+                "sound": "good_b.wav",
+                "conversations": [
+                    {"from": "human", "value": "Listen <sound>"},
+                    {"from": "gpt", "value": "done"},
+                ],
+            },
+        ],
+        manifest_path,
+    )
+    if indexed:
+        create_jsonl_index(str(manifest_path))
+    adapter = NeMoMultimodalConversationShareGPTJsonlAdapter(
+        manifest_filepath=manifest_path,
+        audio_locator_tag="[audio]",
+        audio_placeholders=["<sound>"],
+        indexed=indexed,
+        skip_missing_manifest_entries=True,
+    )
+
+    with caplog.at_level(logging.WARNING):
+        conversations = list(adapter)
+
+    assert [c.id for c in conversations] == ["good_a", "good_b"]
+    assert "Skipping ShareGPT sample due to audio loading failure" in caplog.text
+    assert "missing_audio" in caplog.text
+    assert "missing.wav" in caplog.text
+
+
 @pytest.fixture
 def tokenizer(tmp_path_factory, multimodal_conversations_path):
     tmpdir = tmp_path_factory.mktemp("multi_convo_tokenizer")

From 2ba49f9c247bf145fc31098d0bfbccd2f1cc56e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Fri, 12 Jun 2026 09:38:52 -0700
Subject: [PATCH 17/30] Fix CodeQL review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../common/data/lhotse/nemo_adapters.py       |  27 +-
 .../common/data/lhotse/text_adapters.py       |  72 ++---
 nemo/utils/exp_manager.py                     |   2 +-
 .../_validate_dataloader/config_inject.py     |   2 +-
 .../_validate_dataloader/pre_validation.py    | 245 +++++++++++++-----
 scripts/dataloading/build_indexes.py          |  45 ++--
 scripts/dataloading/prefetch_indexes.py       |  21 +-
 .../common/test_lhotse_indexed_partition.py   |  22 +-
 8 files changed, 249 insertions(+), 187 deletions(-)

diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index b6d09c4fdc14..7a809d7ff387 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -20,6 +20,7 @@
 import re
 import tarfile
 from collections.abc import Mapping, Sequence
+from contextlib import closing
 from io import BytesIO
 from pathlib import Path
 from typing import Generator, Iterable, List, Literal
@@ -830,9 +831,7 @@ def __getitem__(self, token):
         idx = int(normalize_graph_token(token))
         cut = self._decode_cut_at(idx)
         if cut is None:
-            raise RuntimeError(
-                f"Cut at global index {idx} is not decodable; cannot satisfy random-access __getitem__."
-            )
+            raise IndexError(f"Cut at global index {idx} is not decodable; cannot satisfy random-access __getitem__.")
         return attach_graph_origin(cut, idx)
 
     def __len__(self) -> int:
@@ -1166,25 +1165,21 @@ def has_constant_time_access(self) -> bool:
 
     def _init_indexed(self) -> None:
         try:
-            parquet_file = pq.ParquetFile(self.path)
+            with closing(pq.ParquetFile(self.path)) as parquet_file:
+                offsets = [0]
+                for i in range(parquet_file.num_row_groups):
+                    offsets.append(offsets[-1] + parquet_file.metadata.row_group(i).num_rows)
+                self._row_group_offsets = offsets
+                self._num_row_groups = parquet_file.num_row_groups
+                self._total_rows = offsets[-1]
         except Exception as e:
             raise RuntimeError(f"Failed to open Parquet file: {self.path}") from e
-        offsets = [0]
-        for i in range(parquet_file.num_row_groups):
-            offsets.append(offsets[-1] + parquet_file.metadata.row_group(i).num_rows)
-        self._row_group_offsets = offsets
-        self._num_row_groups = parquet_file.num_row_groups
-        self._total_rows = offsets[-1]
-        del parquet_file  # close handle; reopened lazily in workers
 
     def _load_row_group(self, rg_idx: int) -> list[dict]:
         if self._cached_row_group_idx == rg_idx and self._cached_row_group is not None:
             return self._cached_row_group
-        parquet_file = pq.ParquetFile(self.path)
-        try:
+        with closing(pq.ParquetFile(self.path)) as parquet_file:
             df = parquet_file.read_row_group(rg_idx).to_pandas()
-        finally:
-            del parquet_file
         rows = df.to_dict("records")
         self._cached_row_group_idx = rg_idx
         self._cached_row_group = rows
@@ -1246,7 +1241,7 @@ def __getitem__(self, token):
         rows = self._load_row_group(rg_idx)
         cut = self._build_cut_from_row(rows[local_idx], fallback_idx=idx)
         if cut is None:
-            raise RuntimeError(f"Row {idx} in {self.path} is not decodable; cannot satisfy random-access __getitem__.")
+            raise IndexError(f"Row {idx} in {self.path} is not decodable; cannot satisfy random-access __getitem__.")
         return attach_graph_origin(cut, idx)
 
     def __len__(self) -> int:
diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py
index 18350f516605..1793c4a228bd 100644
--- a/nemo/collections/common/data/lhotse/text_adapters.py
+++ b/nemo/collections/common/data/lhotse/text_adapters.py
@@ -11,8 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import logging
 import json
+import logging
 import math
 import os
 import random
@@ -31,14 +31,12 @@
 from lhotse.cut import Cut
 from lhotse.dataset import AudioSamples
 from lhotse.dataset.dataloading import PartitionedIndexedIterator, resolve_seed
+from lhotse.indexing import IndexedJsonlReader
+from lhotse.lazy import IteratorNode, attach_graph_origin, normalize_graph_token
 from lhotse.serialization import load_jsonl, open_best
 from lhotse.shar import AudioTarWriter, JsonlShardWriter
 from lhotse.utils import Pathlike, compute_num_samples, is_valid_url
 
-from lhotse.lazy import IteratorNode, attach_graph_origin, normalize_graph_token
-
-from lhotse.indexing import IndexedJsonlReader
-
 from nemo.collections.common.data.lhotse.indexed_adapters import (
     IndexedTarMemberReader,
     IndexedTarSampleReader,
@@ -160,9 +158,7 @@ def __post_init__(self):
         self._cum_lens: list[int] = []
         self._iter_state = PartitionedIndexedIterator()
         if self.indexed:
-            from lhotse.indexing import IndexedJsonlReader
-
-            from lhotse.indexing import index_file_path
+            from lhotse.indexing import IndexedJsonlReader, index_file_path
 
             for p in self.paths:
                 self._readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
@@ -209,7 +205,7 @@ def __getitem__(self, token):
         shard_idx, local_idx = self._resolve(idx)
         ex = self._data_to_example(self._readers[shard_idx][local_idx])
         if ex is None:
-            raise RuntimeError(
+            raise IndexError(
                 f"Index {idx} in {self.paths[shard_idx]} has no '{self.text_field}' field; "
                 f"cannot satisfy random-access __getitem__."
             )
@@ -428,9 +424,7 @@ def __post_init__(self):
         self._cum_lens: list[int] = []
         self._iter_state = PartitionedIndexedIterator()
         if self.indexed:
-            from lhotse.indexing import IndexedJsonlReader
-
-            from lhotse.indexing import index_file_path
+            from lhotse.indexing import IndexedJsonlReader, index_file_path
 
             for p in self.paths:
                 self._readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
@@ -1045,23 +1039,17 @@ def has_constant_time_access(self) -> bool:
         return self.indexed
 
     def _init_indexed(self) -> None:
-        from lhotse.indexing import IndexedJsonlReader
-
-        from lhotse.indexing import index_file_path
+        from lhotse.indexing import IndexedJsonlReader, index_file_path
 
         if self.slice_length is not None:
-            raise ValueError(
-                "NeMoMultimodalConversationJsonlAdapter(indexed=True) does not support slice_length."
-            )
+            raise ValueError("NeMoMultimodalConversationJsonlAdapter(indexed=True) does not support slice_length.")
         for p in self.manifest_filepath:
             self._cuts_readers.append(IndexedJsonlReader(p, index_path=index_file_path(p, self.indexes_root)))
         if self.tarred_audio_filepaths is not None:
             from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
 
             for p in self.tarred_audio_filepaths:
-                self._tar_readers.append(
-                    IndexedTarMemberReader(p, idx_path=index_file_path(p, self.indexes_root))
-                )
+                self._tar_readers.append(IndexedTarMemberReader(p, idx_path=index_file_path(p, self.indexes_root)))
         cum = 0
         self._cum_lens.append(cum)
         for r in self._cuts_readers:
@@ -1108,11 +1096,9 @@ def __getitem__(self, token):
                 tar_path=self.tarred_audio_filepaths[shard_idx],
             )
         else:
-            convo = self._build_conversation_local(
-                data, manifest_path=self._cuts_readers[shard_idx].path
-            )
+            convo = self._build_conversation_local(data, manifest_path=self._cuts_readers[shard_idx].path)
         if convo is None:
-            raise RuntimeError(
+            raise IndexError(
                 f"Conversation at index {idx} (shard {shard_idx}, local {local_idx}) "
                 f"could not be built; cannot satisfy random-access __getitem__."
             )
@@ -1152,9 +1138,7 @@ def _build_conversation_local(self, data: dict, manifest_path: str) -> NeMoMulti
             custom=data.get("custom"),
         )
 
-    def _build_conversation_tarred(
-        self, data: dict, tar_reader, tar_path: str
-    ) -> NeMoMultimodalConversation | None:
+    def _build_conversation_tarred(self, data: dict, tar_reader, tar_path: str) -> NeMoMultimodalConversation | None:
         import io as _io
 
         import soundfile as _sf
@@ -1180,9 +1164,7 @@ def _build_conversation_tarred(
                 num_samples=meta.frames,
                 duration=meta.duration,
             )
-            cut = recording.to_cut().truncate(
-                offset=turn.get("offset", 0.0), duration=turn.get("duration")
-            )
+            cut = recording.to_cut().truncate(offset=turn.get("offset", 0.0), duration=turn.get("duration"))
             cut = cut.with_id(self._make_cut_id(cut, turn))
             cuts.append(cut)
         cuts = deque(cuts)
@@ -1233,9 +1215,7 @@ def _iter_indexed(self) -> Iterator[NeMoMultimodalConversation]:
                     tar_path=self.tarred_audio_filepaths[shard_idx],
                 )
             else:
-                convo = self._build_conversation_local(
-                    data, manifest_path=self._cuts_readers[shard_idx].path
-                )
+                convo = self._build_conversation_local(data, manifest_path=self._cuts_readers[shard_idx].path)
             if convo is None:
                 continue
             attach_graph_origin(convo, global_idx)
@@ -1457,9 +1437,7 @@ def has_constant_time_access(self) -> bool:
         return self.indexed
 
     def _init_indexed(self) -> None:
-        from lhotse.indexing import IndexedJsonlReader
-
-        from lhotse.indexing import index_file_path
+        from lhotse.indexing import IndexedJsonlReader, index_file_path
 
         if self.slice_length is not None:
             raise ValueError(
@@ -1471,9 +1449,7 @@ def _init_indexed(self) -> None:
             from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarMemberReader
 
             for p in self.tarred_audio_filepaths:
-                self._tar_readers.append(
-                    IndexedTarMemberReader(p, idx_path=index_file_path(p, self.indexes_root))
-                )
+                self._tar_readers.append(IndexedTarMemberReader(p, idx_path=index_file_path(p, self.indexes_root)))
         cum = 0
         self._cum_lens.append(cum)
         for r in self._cuts_readers:
@@ -1575,7 +1551,7 @@ def __getitem__(self, token):
         data = self._cuts_readers[shard_idx][local_idx]
         convo = self._build_one(data, shard_idx)
         if convo is None:
-            raise RuntimeError(
+            raise IndexError(
                 f"ShareGPT sample at global index {idx} is not decodable; cannot satisfy random-access __getitem__."
             )
         return attach_graph_origin(convo, idx)
@@ -2028,16 +2004,12 @@ class _ShareGPTConversationParser:
     by the JSONL and WebDataset adapters.
     """
 
-    def __init__(
-        self, placeholders: list[str], data: dict, audio_path_fallback: str | None = None
-    ) -> None:
+    def __init__(self, placeholders: list[str], data: dict, audio_path_fallback: str | None = None) -> None:
         self.placeholders = placeholders
         self.data = data
         self.sample_id = data.get("id", "?")
         audio_path_value = data.get("sound") or data.get("ori_sound") or audio_path_fallback
-        self.audio_paths = self.normalize_audio_paths(
-            audio_path_value, sample_id=self.sample_id, field_name="sound"
-        )
+        self.audio_paths = self.normalize_audio_paths(audio_path_value, sample_id=self.sample_id, field_name="sound")
 
     def transform(self) -> list[dict]:
         """Convert one raw ShareGPT sample into text/audio turn dictionaries.
@@ -2048,11 +2020,7 @@ def transform(self) -> list[dict]:
         """
         conversations = []
         placeholder_count = self._placeholder_count()
-        if (
-            len(self.audio_paths) > 1
-            and placeholder_count > 1
-            and len(self.audio_paths) != placeholder_count
-        ):
+        if len(self.audio_paths) > 1 and placeholder_count > 1 and len(self.audio_paths) != placeholder_count:
             raise ValueError(
                 f"ShareGPT sample id={self.sample_id} has {len(self.audio_paths)} audio paths but "
                 f"{placeholder_count} audio placeholders. Use one path for all placeholders, one path per "
diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py
index 0c23dca9fe7c..bf89e7aa3c11 100644
--- a/nemo/utils/exp_manager.py
+++ b/nemo/utils/exp_manager.py
@@ -1591,7 +1591,7 @@ def advance(self, data_fetcher) -> None:
             self._skip_resume_validation_once = True
             self.restarting = False
             return
-        return super().advance(data_fetcher)
+        super().advance(data_fetcher)
 
     def on_advance_end(self, data_fetcher) -> None:
         """Clear the one-shot restart-validation skip after normal epoch-loop bookkeeping."""
diff --git a/scripts/dataloading/_validate_dataloader/config_inject.py b/scripts/dataloading/_validate_dataloader/config_inject.py
index fc1b5f63a3ef..d58c14f5105f 100644
--- a/scripts/dataloading/_validate_dataloader/config_inject.py
+++ b/scripts/dataloading/_validate_dataloader/config_inject.py
@@ -17,7 +17,7 @@
 import logging
 from typing import Any
 
-from omegaconf import DictConfig, ListConfig, OmegaConf
+from omegaconf import DictConfig, ListConfig
 
 LOG = logging.getLogger(__name__)
 
diff --git a/scripts/dataloading/_validate_dataloader/pre_validation.py b/scripts/dataloading/_validate_dataloader/pre_validation.py
index af46b2d8d617..66ae9155dcda 100644
--- a/scripts/dataloading/_validate_dataloader/pre_validation.py
+++ b/scripts/dataloading/_validate_dataloader/pre_validation.py
@@ -26,7 +26,7 @@
 import json
 import logging
 import sys
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Callable, Iterable, Optional
 
@@ -59,8 +59,10 @@ class PreValidationReport:
 
     def to_dict(self):
         return {
-            "checks": {c.check_id: {"status": c.status, "severity": c.severity, "detail": c.detail, **c.extra}
-                       for c in self.checks},
+            "checks": {
+                c.check_id: {"status": c.status, "severity": c.severity, "detail": c.detail, **c.extra}
+                for c in self.checks
+            },
             "summary": self.summary,
         }
 
@@ -112,8 +114,14 @@ def _check_seed_int(cfg: DictConfig):
     if isinstance(seed, int):
         return PASS, f"seed={seed}", {}
     if seed in _NON_INT_SEED_VALUES:
-        return FAIL, (f"train_ds.seed is {seed!r}; must be an integer for reproducibility across "
-                      "launches and determinism re-runs."), {}
+        return (
+            FAIL,
+            (
+                f"train_ds.seed is {seed!r}; must be an integer for reproducibility across "
+                "launches and determinism re-runs."
+            ),
+            {},
+        )
     return FAIL, f"train_ds.seed={seed!r} (type={type(seed).__name__}); must be int", {}
 
 
@@ -121,15 +129,24 @@ def _check_shard_seed_int(cfg: DictConfig):
     shard_seed = cfg.get("shard_seed", None)
     if isinstance(shard_seed, int):
         return PASS, f"shard_seed={shard_seed}", {}
-    return FAIL, (f"train_ds.shard_seed={shard_seed!r}; must be an integer. "
-                  "LazyIteratorMultiplexer raises under multi-shard + 'randomized'."), {}
+    return (
+        FAIL,
+        (
+            f"train_ds.shard_seed={shard_seed!r}; must be an integer. "
+            "LazyIteratorMultiplexer raises under multi-shard + 'randomized'."
+        ),
+        {},
+    )
 
 
 def _check_stateful_on(cfg: DictConfig):
     if cfg.get("use_stateful_dataloader", False) is True:
         return PASS, "", {}
-    return FAIL, ("use_stateful_dataloader is not True; resumability validation requires the "
-                  "StatefulDataLoader path."), {}
+    return (
+        FAIL,
+        ("use_stateful_dataloader is not True; resumability validation requires the " "StatefulDataLoader path."),
+        {},
+    )
 
 
 def _check_indexed_implies_root(cfg: DictConfig):
@@ -138,9 +155,15 @@ def _check_indexed_implies_root(cfg: DictConfig):
     if not indexed:
         return SKIP, "train_ds.indexed != True; check not applicable", {}
     if indexes_root in (None, "", "null"):
-        return FAIL, ("train_ds.indexed=True but indexes_root is unset. Without indexes_root, "
-                      "LazyIndexedSharIterator falls back to looking next to (typically remote) "
-                      "data files."), {}
+        return (
+            FAIL,
+            (
+                "train_ds.indexed=True but indexes_root is unset. Without indexes_root, "
+                "LazyIndexedSharIterator falls back to looking next to (typically remote) "
+                "data files."
+            ),
+            {},
+        )
     return PASS, f"indexes_root={indexes_root}", {}
 
 
@@ -152,8 +175,14 @@ def _check_indexes_root_exists(cfg: DictConfig):
     if p.exists():
         return PASS, f"{indexes_root} exists", {}
     # Locally on a developer laptop the path is typically cluster-specific; downgrade to WARN.
-    return WARN, (f"indexes_root={indexes_root!r} does not exist on this host. "
-                  "Expected on cluster; downgraded to WARN locally."), {}
+    return (
+        WARN,
+        (
+            f"indexes_root={indexes_root!r} does not exist on this host. "
+            "Expected on cluster; downgraded to WARN locally."
+        ),
+        {},
+    )
 
 
 def _check_idx_files_present(cfg: DictConfig):
@@ -222,14 +251,20 @@ def _check_constant_time_leaves(cfg: DictConfig):
     severity_status = FAIL if stateful else WARN
     if non_indexable or streaming:
         n = len(non_indexable) + len(streaming)
-        detail = (f"{n} leaf source(s) lack constant-time access "
-                  f"({len(non_indexable)} non-indexable type, {len(streaming)} streaming-mode). "
-                  "Resume falls back to O(N) replay; with force_map_dataset=False they also leak "
-                  "across ranks.")
-        return severity_status, detail, {
-            "non_indexable": non_indexable[:5],
-            "streaming": streaming[:5],
-        }
+        detail = (
+            f"{n} leaf source(s) lack constant-time access "
+            f"({len(non_indexable)} non-indexable type, {len(streaming)} streaming-mode). "
+            "Resume falls back to O(N) replay; with force_map_dataset=False they also leak "
+            "across ranks."
+        )
+        return (
+            severity_status,
+            detail,
+            {
+                "non_indexable": non_indexable[:5],
+                "streaming": streaming[:5],
+            },
+        )
     return PASS, "all leaf sources admit constant-time access", {}
 
 
@@ -258,9 +293,15 @@ def _check_mux_seed_not_randomized(cfg: DictConfig):
     shard_seed = cfg.get("shard_seed")
     if isinstance(shard_seed, int):
         return PASS, f"shard_seed={shard_seed}", {}
-    return FAIL, (f"force_map_dataset=False but shard_seed={shard_seed!r}. "
-                  "LazyIteratorMultiplexer raises ValueError under multi-shard with "
-                  "shard_seed='randomized'."), {}
+    return (
+        FAIL,
+        (
+            f"force_map_dataset=False but shard_seed={shard_seed!r}. "
+            "LazyIteratorMultiplexer raises ValueError under multi-shard with "
+            "shard_seed='randomized'."
+        ),
+        {},
+    )
 
 
 def _check_slice_length_vs_indexed(cfg: DictConfig):
@@ -271,9 +312,14 @@ def _check_slice_length_vs_indexed(cfg: DictConfig):
         if leaf.get("slice_length") is not None:
             offenders.append({"type": leaf.get("type"), "corpus": leaf.get("corpus")})
     if offenders:
-        return FAIL, (f"{len(offenders)} source(s) set slice_length with indexed=True. "
-                      "Lhotse rejects: \"'slice_length' is not supported with indexed=True\"."), \
-               {"examples": offenders[:5]}
+        return (
+            FAIL,
+            (
+                f"{len(offenders)} source(s) set slice_length with indexed=True. "
+                "Lhotse rejects: \"'slice_length' is not supported with indexed=True\"."
+            ),
+            {"examples": offenders[:5]},
+        )
     return PASS, "", {}
 
 
@@ -285,9 +331,14 @@ def _check_cut_map_fns_vs_indexed(cfg: DictConfig):
         if leaf.get("cut_map_fns"):
             offenders.append({"type": leaf.get("type"), "corpus": leaf.get("corpus")})
     if offenders:
-        return FAIL, (f"{len(offenders)} source(s) set cut_map_fns with indexed=True. "
-                      "Lhotse rejects: \"'cut_map_fns' is not supported with indexed=True\"."),\
-               {"examples": offenders[:5]}
+        return (
+            FAIL,
+            (
+                f"{len(offenders)} source(s) set cut_map_fns with indexed=True. "
+                "Lhotse rejects: \"'cut_map_fns' is not supported with indexed=True\"."
+            ),
+            {"examples": offenders[:5]},
+        )
     return PASS, "", {}
 
 
@@ -315,9 +366,14 @@ def _check_bucketer_buffer(cfg: DictConfig):
         return WARN, f"num_buckets={n_buckets}, bucket_buffer_size={buffer_size}", {}
     ratio = buffer_size / max(n_buckets, 1)
     if ratio < 10:
-        return WARN, (f"bucket_buffer_size={buffer_size} is < 10×num_buckets ({n_buckets}). "
-                      "Low buffers can cause BucketsDontHaveEnoughData mid-run."), \
-               {"ratio": ratio}
+        return (
+            WARN,
+            (
+                f"bucket_buffer_size={buffer_size} is < 10×num_buckets ({n_buckets}). "
+                "Low buffers can cause BucketsDontHaveEnoughData mid-run."
+            ),
+            {"ratio": ratio},
+        )
     return PASS, f"bucket_buffer_size={buffer_size}, num_buckets={n_buckets}, ratio={ratio:.1f}", {}
 
 
@@ -338,12 +394,20 @@ def _check_multi_config_flags(cfg: DictConfig):
     sub_cfgs = cfg.get("input_cfg") or []
     if not isinstance(sub_cfgs, (list, ListConfig)):
         return WARN, "multi_config=True but input_cfg is not a list", {}
-    missing = [i for i, sc in enumerate(sub_cfgs)
-               if isinstance(sc, (dict, DictConfig)) and (
-                   sc.get("indexed") is None or sc.get("indexes_root") is None)]
+    missing = [
+        i
+        for i, sc in enumerate(sub_cfgs)
+        if isinstance(sc, (dict, DictConfig)) and (sc.get("indexed") is None or sc.get("indexes_root") is None)
+    ]
     if missing:
-        return FAIL, (f"multi_config=True; {len(missing)} sub-config(s) missing indexed/indexes_root "
-                      "and top-level doesn't supply both."), {"indices": missing[:5]}
+        return (
+            FAIL,
+            (
+                f"multi_config=True; {len(missing)} sub-config(s) missing indexed/indexes_root "
+                "and top-level doesn't supply both."
+            ),
+            {"indices": missing[:5]},
+        )
     return PASS, "every sub-config sets indexed and indexes_root", {}
 
 
@@ -363,8 +427,11 @@ def _check_text_fields(cfg: DictConfig):
             if tf is not None and tf not in valid:
                 suspicious.append({"corpus": leaf.get("corpus"), "value": tf})
     if suspicious:
-        return WARN, f"{len(suspicious)} unusual text_field value(s); verify against shard 0", \
-               {"examples": suspicious[:5], "known_valid": sorted(valid)}
+        return (
+            WARN,
+            f"{len(suspicious)} unusual text_field value(s); verify against shard 0",
+            {"examples": suspicious[:5], "known_valid": sorted(valid)},
+        )
     return PASS, "text_field values match known-valid set", {}
 
 
@@ -380,8 +447,11 @@ def _check_world_size_divides_workers(cfg: DictConfig):
         return SKIP, "no leaf-shard counts derivable from config", {}
     min_shards = min(c["shards"] for c in counts)
     if min_shards < 8:  # arbitrary "small enough to worry about" heuristic
-        return WARN, f"smallest source has only {min_shards} shards; verify (num_ranks × num_workers) ≤ this", \
-               {"counts": counts[:10]}
+        return (
+            WARN,
+            f"smallest source has only {min_shards} shards; verify (num_ranks × num_workers) ≤ this",
+            {"counts": counts[:10]},
+        )
     return PASS, f"smallest source has {min_shards} shards", {"counts": counts[:10]}
 
 
@@ -416,22 +486,39 @@ def _check_world_size_divides_workers(cfg: DictConfig):
 
 
 # Types that read indexable underlying data.
-_LEAF_TYPES = frozenset({
-    "lhotse_shar", "nemo", "nemo_tarred", "multimodal_conversation", "share_gpt",
-})
+_LEAF_TYPES = frozenset(
+    {
+        "lhotse_shar",
+        "nemo",
+        "nemo_tarred",
+        "multimodal_conversation",
+        "share_gpt",
+    }
+)
 
 # Types that don't admit constant-time access at all.
-_STREAMING_ONLY_TYPES = frozenset({
-    "txt", "txt_pair", "parquet", "multi_speaker_simulator",
-})
+_STREAMING_ONLY_TYPES = frozenset(
+    {
+        "txt",
+        "txt_pair",
+        "parquet",
+        "multi_speaker_simulator",
+    }
+)
 
 # Transparent passthrough types — recurse into input_cfg.
-_TRANSFORM_TYPES = frozenset({
-    "lhotse_as_conversation", "sqa_as_conversation", "s2s_as_conversation",
-    "s2s_duplex_overlap_as_s2s_duplex", "s2s_duplex_reverse_role",
-    "lhotse_magpietts_data_as_continuation", "nemo_tarred_to_duplex",
-    "group",
-})
+_TRANSFORM_TYPES = frozenset(
+    {
+        "lhotse_as_conversation",
+        "sqa_as_conversation",
+        "s2s_as_conversation",
+        "s2s_duplex_overlap_as_s2s_duplex",
+        "s2s_duplex_reverse_role",
+        "lhotse_magpietts_data_as_continuation",
+        "nemo_tarred_to_duplex",
+        "group",
+    }
+)
 
 
 def _iter_leaf_nodes(cfg: DictConfig) -> Iterable[DictConfig]:
@@ -505,7 +592,7 @@ def _collect_leaf_paths(cfg: DictConfig) -> list[str]:
 def _leaf_to_paths(leaf: DictConfig) -> list[str]:
     """Resolve the shar/manifest paths inside ``leaf`` into flat strings."""
     paths: list[str] = []
-    if (shar := leaf.get("shar_path")):
+    if shar := leaf.get("shar_path"):
         if isinstance(shar, (dict, DictConfig)):
             for key in ("cuts", "recording"):
                 v = shar.get(key)
@@ -513,11 +600,11 @@ def _leaf_to_paths(leaf: DictConfig) -> list[str]:
                     paths.append(v)
         elif isinstance(shar, str):
             paths.append(shar)
-    if (mfp := leaf.get("manifest_filepath")):
+    if mfp := leaf.get("manifest_filepath"):
         paths.extend(_flatten_str(mfp))
-    if (taf := leaf.get("tarred_audio_filepaths")):
+    if taf := leaf.get("tarred_audio_filepaths"):
         paths.extend(_flatten_str(taf))
-    if (cuts := leaf.get("cuts_path")):
+    if cuts := leaf.get("cuts_path"):
         paths.extend(_flatten_str(cuts))
     return paths
 
@@ -525,6 +612,7 @@ def _leaf_to_paths(leaf: DictConfig) -> list[str]:
 def _count_shards(leaf: DictConfig) -> Optional[int]:
     """Best-effort shard count from a leaf's ``_OP_N..M_CL_`` patterns."""
     import re
+
     paths = _leaf_to_paths(leaf)
     if not paths:
         return None
@@ -565,6 +653,7 @@ def _try_load_yaml(path: str) -> Optional[Any]:
 
 def _isfinite(x: float) -> bool:
     import math
+
     return math.isfinite(x)
 
 
@@ -574,19 +663,33 @@ def _isfinite(x: float) -> bool:
 
 
 @click.command(help=__doc__)
-@click.option("--config", "config_path", required=True, type=click.Path(exists=True),
-              help="Training YAML containing data.train_ds.")
-@click.option("--data-blend-dir", default=None,
-              help="Substituted into ${data_blend_dir} in the config (optional locally).")
-@click.option("--section", default="train_ds", show_default=True,
-              help="Which data.* section to validate.")
-@click.option("--output-dir", default=None, type=click.Path(),
-              help="Write pre_validation.json under this directory.")
-@click.option("--ignore-fail", multiple=True, default=(),
-              help="Repeatable: check IDs whose FAIL outcome should be downgraded to WARN.")
+@click.option(
+    "--config",
+    "config_path",
+    required=True,
+    type=click.Path(exists=True),
+    help="Training YAML containing data.train_ds.",
+)
+@click.option(
+    "--data-blend-dir", default=None, help="Substituted into ${data_blend_dir} in the config (optional locally)."
+)
+@click.option("--section", default="train_ds", show_default=True, help="Which data.* section to validate.")
+@click.option("--output-dir", default=None, type=click.Path(), help="Write pre_validation.json under this directory.")
+@click.option(
+    "--ignore-fail",
+    multiple=True,
+    default=(),
+    help="Repeatable: check IDs whose FAIL outcome should be downgraded to WARN.",
+)
 @click.option("-v", "--verbose", is_flag=True, default=False, help="Verbose logs.")
-def cli(config_path: str, data_blend_dir: Optional[str], section: str, output_dir: Optional[str],
-        ignore_fail: tuple, verbose: bool) -> None:
+def cli(
+    config_path: str,
+    data_blend_dir: Optional[str],
+    section: str,
+    output_dir: Optional[str],
+    ignore_fail: tuple,
+    verbose: bool,
+) -> None:
     logging.basicConfig(
         level=logging.DEBUG if verbose else logging.INFO,
         format="[%(asctime)s %(levelname)s] %(message)s",
diff --git a/scripts/dataloading/build_indexes.py b/scripts/dataloading/build_indexes.py
index f09f6b40196d..b46c3bc5342d 100644
--- a/scripts/dataloading/build_indexes.py
+++ b/scripts/dataloading/build_indexes.py
@@ -55,19 +55,15 @@
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Callable, Iterable, Iterator, Optional
+from typing import Optional
 
 import click
-from omegaconf import DictConfig, ListConfig, OmegaConf
-
 from lhotse.indexing import index_file_path
+from omegaconf import DictConfig, ListConfig, OmegaConf
 
-from nemo.collections.common.data.lhotse.indexed_adapters import (
-    create_tar_index as create_nemo_tar_index,
-)
+from nemo.collections.common.data.lhotse.indexed_adapters import create_tar_index as create_nemo_tar_index
 from nemo.collections.common.data.lhotse.nemo_adapters import expand_sharded_filepaths
 
-
 # --------------------------------------------------------------------------- #
 # Tar layout taxonomy.
 # --------------------------------------------------------------------------- #
@@ -146,15 +142,17 @@ def _resolve_input_cfg(val) -> ListConfig | None:
 # ``read_cutset_from_config(config)`` and accept *any* underlying source's keys
 # (``cuts_path``, ``shar_path``, ``manifest_filepath`` [+ ``tarred_audio_filepaths``],
 # nested ``input_cfg``, …). Treat them as transparent passthroughs.
-_TRANSFORM_TYPES = frozenset({
-    "lhotse_as_conversation",
-    "sqa_as_conversation",
-    "s2s_as_conversation",
-    "s2s_duplex_overlap_as_s2s_duplex",
-    "s2s_duplex_reverse_role",
-    "lhotse_magpietts_data_as_continuation",
-    "nemo_tarred_to_duplex",
-})
+_TRANSFORM_TYPES = frozenset(
+    {
+        "lhotse_as_conversation",
+        "sqa_as_conversation",
+        "s2s_as_conversation",
+        "s2s_duplex_overlap_as_s2s_duplex",
+        "s2s_duplex_reverse_role",
+        "lhotse_magpietts_data_as_continuation",
+        "nemo_tarred_to_duplex",
+    }
+)
 
 # Types that index nothing on their own.
 _NO_INDEX_TYPES = frozenset({"txt", "txt_pair", "parquet", "multi_speaker_simulator"})
@@ -333,7 +331,8 @@ def _discover_shar(shar_path, jobs: list[IndexJob], indexes_root: Optional[str])
 
 def _build_one(job: IndexJob) -> tuple[IndexJob, str]:
     """Run the right indexer for *job*. Returns (job, status)."""
-    from lhotse.indexing import create_jsonl_index, create_tar_index as create_wds_tar_index
+    from lhotse.indexing import create_jsonl_index
+    from lhotse.indexing import create_tar_index as create_wds_tar_index
 
     idx = job.idx_path()
     # Ensure the parent directory exists for mirrored layouts.
@@ -426,8 +425,7 @@ def main(
     todo = unique if force else [j for j in unique if not _is_indexed(j)]
     skipped = len(unique) - len(todo)
 
-    logging.info("Discovered %d files (%d already indexed, %d to build).",
-                 len(unique), skipped, len(todo))
+    logging.info("Discovered %d files (%d already indexed, %d to build).", len(unique), skipped, len(todo))
 
     if dry_run or not todo:
         for j in todo:
@@ -439,7 +437,7 @@ def main(
     # Failures are still logged inline; success only emits a periodic
     # "<built>/<total> processed" heartbeat (~every 5% of total or 5000 files,
     # whichever is smaller) plus a final summary.
-    failures: list[tuple[IndexJob, BaseException]] = []
+    failures: list[tuple[IndexJob, Exception]] = []
     total = len(todo)
     log_every = max(1, min(5000, total // 20))
     pool_cls = ProcessPoolExecutor if executor == "process" else ThreadPoolExecutor
@@ -451,14 +449,17 @@ def main(
             j = futures[fut]
             try:
                 _, _status = fut.result()
-            except BaseException as e:  # noqa: BLE001 — surface any failure
+            except Exception as e:  # surface worker failures but let interrupts/system exits propagate
                 failures.append((j, e))
                 logging.error("  [FAIL] %s %s: %s", j.kind, j.path, e)
                 continue
             if done % log_every == 0 or done == total:
                 logging.info(
                     "  built %d/%d (%.1f%%)  failures=%d",
-                    done, total, 100.0 * done / total, len(failures),
+                    done,
+                    total,
+                    100.0 * done / total,
+                    len(failures),
                 )
 
     if failures:
diff --git a/scripts/dataloading/prefetch_indexes.py b/scripts/dataloading/prefetch_indexes.py
index 311ebd032329..694b695b923d 100644
--- a/scripts/dataloading/prefetch_indexes.py
+++ b/scripts/dataloading/prefetch_indexes.py
@@ -51,13 +51,13 @@
 import shutil
 import sys
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from contextlib import suppress
 from pathlib import Path
 from typing import Optional
 
 import click
-from omegaconf import OmegaConf
-
 from lhotse.indexing import index_file_path
+from omegaconf import OmegaConf
 
 # Reuse the discovery + IndexJob machinery from build_indexes.py.
 sys.path.insert(0, str(Path(__file__).parent))
@@ -82,10 +82,8 @@ def _copy_idx(src: str, dst: str) -> None:
         Path(tmp).replace(dst)
     finally:
         # Clean up if rename never happened (exception path).
-        try:
+        with suppress(FileNotFoundError):
             Path(tmp).unlink()
-        except FileNotFoundError:
-            pass
 
 
 def _is_present(local_idx: str) -> bool:
@@ -159,7 +157,9 @@ def main(
     skipped = len(pairs) - len(todo)
     logging.info(
         "Discovered %d sidecars (%d already present locally, %d to copy).",
-        len(pairs), skipped, len(todo),
+        len(pairs),
+        skipped,
+        len(todo),
     )
 
     if dry_run or not todo:
@@ -170,7 +170,7 @@ def main(
     # Per-file success logging is suppressed (80k-400k sidecars would swamp
     # stdout); failures are still logged inline, success emits a periodic
     # progress heartbeat plus a final summary.
-    failures: list[tuple[str, str, BaseException]] = []
+    failures: list[tuple[str, str, Exception]] = []
     total = len(todo)
     log_every = max(1, min(5000, total // 20))
     with ThreadPoolExecutor(max_workers=max(1, workers)) as ex:
@@ -181,14 +181,17 @@ def main(
             s, d = futures[fut]
             try:
                 fut.result()
-            except BaseException as e:  # noqa: BLE001
+            except Exception as e:
                 failures.append((s, d, e))
                 logging.error("  [FAIL] %s  ->  %s: %s", s, d, e)
                 continue
             if done % log_every == 0 or done == total:
                 logging.info(
                     "  copied %d/%d (%.1f%%)  failures=%d",
-                    done, total, 100.0 * done / total, len(failures),
+                    done,
+                    total,
+                    100.0 * done / total,
+                    len(failures),
                 )
 
     if failures:
diff --git a/tests/collections/common/test_lhotse_indexed_partition.py b/tests/collections/common/test_lhotse_indexed_partition.py
index bb69393fcdf4..1198bd6dc1d8 100644
--- a/tests/collections/common/test_lhotse_indexed_partition.py
+++ b/tests/collections/common/test_lhotse_indexed_partition.py
@@ -36,7 +36,6 @@
 import pytest
 from lhotse import CutSet
 from lhotse.dataset.dataloading import LHOTSE_USE_WORKER_PARTITION
-from lhotse.serialization import save_to_jsonl
 from lhotse.testing.dummies import DummyManifest
 
 from nemo.collections.common.data.lhotse import nemo_adapters, text_adapters
@@ -72,8 +71,7 @@ def _collect_disjoint_per_rank(build_iter_for_rank, world_size: int) -> tuple[li
         # Disjointness against every prior rank.
         for prev in per_rank:
             assert set(prev).isdisjoint(ids), (
-                f"rank {rank} slice overlaps prior rank: "
-                f"{sorted(set(prev) & set(ids))}"
+                f"rank {rank} slice overlaps prior rank: " f"{sorted(set(prev) & set(ids))}"
             )
         per_rank.append(ids)
         union.update(ids)
@@ -97,8 +95,8 @@ def tmp_audio_root(tmp_path_factory) -> Path:
 def nemo_tarred_manifest(tmp_audio_root) -> tuple[Path, Path]:
     """20-utterance NeMo tarred manifest (single shard) as
     (manifest_filepath, tarred_audio_filepath)."""
-    from lhotse.shar.writers import TarWriter
     from lhotse.serialization import SequentialJsonlWriter
+    from lhotse.shar.writers import TarWriter
 
     cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
         tmp_audio_root, progress_bar=False
@@ -158,8 +156,8 @@ def build():
 @pytest.fixture
 def parquet_manifest(tmp_audio_root) -> Path:
     """20-row parquet file: id + audio_bytes + text."""
-    pa = pytest.importorskip("pyarrow")
-    pq = pytest.importorskip("pyarrow.parquet")
+    pytest.importorskip("pyarrow")
+    pytest.importorskip("pyarrow.parquet")
     import pandas as pd
 
     cuts = DummyManifest(CutSet, begin_id=0, end_id=N_CUTS, with_data=True).save_audios(
@@ -212,9 +210,7 @@ def text_jsonl(tmp_path) -> Path:
 @pytest.mark.parametrize("world_size", [1, 2, 4, 5])
 def test_lhotse_text_jsonl_adapter_indexed_partition(text_jsonl, world_size):
     def build():
-        it = text_adapters.LhotseTextJsonlAdapter(
-            paths=str(text_jsonl), language="en", indexed=True
-        )
+        it = text_adapters.LhotseTextJsonlAdapter(paths=str(text_jsonl), language="en", indexed=True)
         return [ex.text for ex in it]
 
     per_rank, union = _collect_disjoint_per_rank(build, world_size)
@@ -239,9 +235,7 @@ def sft_jsonl(tmp_path) -> Path:
 @pytest.mark.parametrize("world_size", [1, 2, 4, 5])
 def test_nemo_sft_jsonl_adapter_indexed_partition(sft_jsonl, world_size):
     def build():
-        it = text_adapters.NeMoSFTJsonlAdapter(
-            paths=str(sft_jsonl), language="en", indexed=True
-        )
+        it = text_adapters.NeMoSFTJsonlAdapter(paths=str(sft_jsonl), language="en", indexed=True)
         # NeMoSFTExample stores the raw dict in .data; key by "id".
         return [ex.data["id"] for ex in it]
 
@@ -291,9 +285,7 @@ def mm_conversation_jsonl(tmp_audio_root) -> Path:
 
 
 @pytest.mark.parametrize("world_size", [1, 2, 4, 5])
-def test_nemo_multimodal_conversation_jsonl_adapter_indexed_partition(
-    mm_conversation_jsonl, world_size
-):
+def test_nemo_multimodal_conversation_jsonl_adapter_indexed_partition(mm_conversation_jsonl, world_size):
     def build():
         it = text_adapters.NeMoMultimodalConversationJsonlAdapter(
             manifest_filepath=[str(mm_conversation_jsonl)],

From 64fae408357b167bd60a58208f5ccd4832cc0403 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Fri, 12 Jun 2026 09:48:43 -0700
Subject: [PATCH 18/30] Fix common test failures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 nemo/collections/common/callbacks/ema.py      |  2 +-
 .../common/data/lhotse/nemo_adapters.py       | 26 ++++++++++---------
 .../common/test_lhotse_dataloading.py         |  4 +--
 .../test_lhotse_multimodal_ais_get_batch.py   |  8 ++++--
 4 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/nemo/collections/common/callbacks/ema.py b/nemo/collections/common/callbacks/ema.py
index dee125be54ef..39afe7bf2445 100644
--- a/nemo/collections/common/callbacks/ema.py
+++ b/nemo/collections/common/callbacks/ema.py
@@ -135,7 +135,7 @@ def on_load_checkpoint(
                 return
             ema_path = ckpt_path.replace(ext, f'-EMA{ext}')
             if os.path.exists(ema_path):
-                ema_state_dict = torch.load(ema_path, map_location=torch.device('cpu'))
+                ema_state_dict = torch.load(ema_path, map_location=torch.device('cpu'), weights_only=False)
 
                 checkpoint['optimizer_states'] = ema_state_dict['optimizer_states']
                 del ema_state_dict
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index 7a809d7ff387..488d52b0e9e9 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -202,12 +202,14 @@ def __iter__(self) -> Generator[Cut, None, None]:
             yield cut
 
     def __getitem__(self, token):
-        if not self.indexed:
-            raise NotImplementedError("LazyNeMoIterator only supports __getitem__ when constructed with indexed=True.")
         token = normalize_graph_token(token)
+        if self.extra_fields:
+            raise NotImplementedError(
+                "LazyNeMoIterator does not support __getitem__ when extra_fields are configured."
+            )
         data = self.source[token]
         cut = self._build_cut_from_dict(data)
-        return attach_graph_origin(cut, token)
+        return attach_graph_origin(cut, token) if self.indexed else cut
 
     def __len__(self) -> int:
         return len(self.source)
@@ -1145,11 +1147,13 @@ def __init__(
         self.sampling_rate = sampling_rate
         self.indexed = indexed
         self._row_group_offsets: list[int] | None = None
+        self._num_row_groups: int | None = None
+        self._total_rows: int | None = None
         self._cached_row_group_idx: int | None = None
         self._cached_row_group: list[dict] | None = None
         self._iter_state = PartitionedIndexedIterator()
         if indexed:
-            self._init_indexed()
+            self._ensure_row_group_offsets()
 
     @property
     def is_checkpointable(self) -> bool:
@@ -1163,7 +1167,9 @@ def is_indexed(self) -> bool:
     def has_constant_time_access(self) -> bool:
         return self.indexed
 
-    def _init_indexed(self) -> None:
+    def _ensure_row_group_offsets(self) -> None:
+        if self._row_group_offsets is not None:
+            return
         try:
             with closing(pq.ParquetFile(self.path)) as parquet_file:
                 offsets = [0]
@@ -1228,10 +1234,7 @@ def _build_cut_from_row(self, row: dict, fallback_idx: int) -> Cut | None:
         return cut
 
     def __getitem__(self, token):
-        if not self.indexed:
-            raise NotImplementedError(
-                "LazyParquetIterator only supports __getitem__ when constructed with indexed=True."
-            )
+        self._ensure_row_group_offsets()
         idx = int(normalize_graph_token(token))
         if idx < 0:
             idx += self._total_rows
@@ -1245,9 +1248,8 @@ def __getitem__(self, token):
         return attach_graph_origin(cut, idx)
 
     def __len__(self) -> int:
-        if self.indexed:
-            return self._total_rows
-        raise TypeError("LazyParquetIterator has unknown length unless constructed with indexed=True.")
+        self._ensure_row_group_offsets()
+        return self._total_rows
 
     def state_dict(self) -> dict:
         if not self.indexed:
diff --git a/tests/collections/common/test_lhotse_dataloading.py b/tests/collections/common/test_lhotse_dataloading.py
index 73b94a3e7682..e1b45274d0de 100644
--- a/tests/collections/common/test_lhotse_dataloading.py
+++ b/tests/collections/common/test_lhotse_dataloading.py
@@ -1025,7 +1025,7 @@ def test_lazy_nemo_iterator_with_offset_field(tmp_path: Path):
     assert cut.supervisions[0].text == "irrelevant"
     audio = cut.load_audio()
     assert audio.shape == (1, 8000)
-    np.testing.assert_equal(audio[0], expected_audio[:8000])
+    np.testing.assert_allclose(audio[0], expected_audio[:8000], atol=5e-5)
 
     cut = cuts[1]
     assert isinstance(cut, lhotse.MonoCut)
@@ -1073,7 +1073,7 @@ def test_lazy_nemo_iterator_with_relative_paths(tmp_path: Path):
     assert cut.num_samples == 8000
     assert cut.supervisions[0].text == "irrelevant"
     assert audio.shape == (1, 8000)
-    np.testing.assert_equal(audio[0], expected_audio[:8000])
+    np.testing.assert_allclose(audio[0], expected_audio[:8000], atol=5e-5)
 
 
 def test_lhotse_cuts_resolve_relative_paths(tmp_path: Path):
diff --git a/tests/collections/common/test_lhotse_multimodal_ais_get_batch.py b/tests/collections/common/test_lhotse_multimodal_ais_get_batch.py
index b0c9c3569c37..ee3650283539 100644
--- a/tests/collections/common/test_lhotse_multimodal_ais_get_batch.py
+++ b/tests/collections/common/test_lhotse_multimodal_ais_get_batch.py
@@ -374,7 +374,9 @@ def test_salm_dataset_batch_loader_enabled(monkeypatch):
     with patch("nemo.collections.speechlm2.data.salm_dataset.AudioSamples") as audio_samples:
         ds = SALMDataset(tokenizer=_FakeTokenizer())
 
-    audio_samples.assert_called_once_with(fault_tolerant=True, use_batch_loader=True, mono_downmix=True)
+    audio_samples.assert_called_once_with(
+        fault_tolerant=True, use_batch_loader=True, ais_force_individual=False, mono_downmix=True
+    )
     assert ds.load_audio is audio_samples.return_value
 
 
@@ -386,5 +388,7 @@ def test_salm_dataset_batch_loader_disabled(monkeypatch):
     with patch("nemo.collections.speechlm2.data.salm_dataset.AudioSamples") as audio_samples:
         ds = SALMDataset(tokenizer=_FakeTokenizer())
 
-    audio_samples.assert_called_once_with(fault_tolerant=True, use_batch_loader=False, mono_downmix=True)
+    audio_samples.assert_called_once_with(
+        fault_tolerant=True, use_batch_loader=False, ais_force_individual=False, mono_downmix=True
+    )
     assert ds.load_audio is audio_samples.return_value

From 9f47392738e8d144df12f6aa572d9e3390070d41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Fri, 12 Jun 2026 11:17:16 -0700
Subject: [PATCH 19/30] Fix CI checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 nemo/collections/common/callbacks/ema.py      |   1 +
 .../collections/common/data/lhotse/_compat.py | 177 +++++++++++++++++
 nemo/collections/common/data/lhotse/cutset.py |  12 +-
 .../common/data/lhotse/dataloader.py          |   2 +-
 .../common/data/lhotse/indexed_adapters.py    |  12 +-
 .../common/data/lhotse/nemo_adapters.py       |  14 +-
 .../common/data/lhotse/text_adapters.py       |  16 +-
 nemo/utils/callbacks/preemption.py            |   4 +-
 .../_validate_dataloader/consolidate.py       | 178 +++++++++++-------
 scripts/dataloading/validate_dataloader.py    | 133 ++++++++-----
 .../test_lhotse_multimodal_dataloading.py     |  12 +-
 .../common/test_validate_dataloader.py        | 148 +++++++++------
 12 files changed, 500 insertions(+), 209 deletions(-)
 create mode 100644 nemo/collections/common/data/lhotse/_compat.py

diff --git a/nemo/collections/common/callbacks/ema.py b/nemo/collections/common/callbacks/ema.py
index 39afe7bf2445..7dbf08267ef5 100644
--- a/nemo/collections/common/callbacks/ema.py
+++ b/nemo/collections/common/callbacks/ema.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# pylint: disable=C0116
 import contextlib
 import copy
 import os
diff --git a/nemo/collections/common/data/lhotse/_compat.py b/nemo/collections/common/data/lhotse/_compat.py
new file mode 100644
index 000000000000..65839e37afad
--- /dev/null
+++ b/nemo/collections/common/data/lhotse/_compat.py
@@ -0,0 +1,177 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# pylint: disable=unused-import
+import os
+from collections.abc import Generator, Iterable
+from typing import Any
+
+import torch
+from torch import distributed as dist
+
+try:
+    from lhotse.dataset.dataloading import PartitionedIndexedIterator
+except ImportError:
+    LHOTSE_USE_WORKER_PARTITION = "LHOTSE_USE_WORKER_PARTITION"
+
+    def _get_world_size() -> int:  # fallback helper: number of distributed processes
+        if "WORLD_SIZE" in os.environ:  # the environment variable takes precedence
+            return int(os.environ["WORLD_SIZE"])
+        if dist.is_available() and dist.is_initialized():  # otherwise ask torch.distributed
+            return dist.get_world_size()
+        return 1  # neither source is available: report a single process
+
+    def _get_rank() -> int:  # fallback helper: rank of the current process
+        if "RANK" in os.environ:  # the environment variable takes precedence
+            return int(os.environ["RANK"])
+        if dist.is_available() and dist.is_initialized():  # otherwise ask torch.distributed
+            return dist.get_rank()
+        return 0  # neither source is available: report rank 0
+
+    def _get_worker_partition() -> tuple[int, int]:  # returns (shard_id, num_shards)
+        if os.environ.get(LHOTSE_USE_WORKER_PARTITION) != "1":  # partitioning is opt-in via this env var
+            return 0, 1  # disabled: one shard that covers everything
+        rank = _get_rank()
+        world_size = _get_world_size()
+        worker_info = torch.utils.data.get_worker_info()
+        if worker_info is None:  # no worker info reported: count this process as a single worker
+            worker_id, num_workers = 0, 1
+        else:
+            worker_id = worker_info.id
+            num_workers = max(worker_info.num_workers, 1)  # clamp so the shard count is never 0
+        return rank * num_workers + worker_id, world_size * num_workers  # flat index over rank x worker
+
+    class PartitionedIndexedIterator:  # local fallback used when the lhotse import above fails
+        def __init__(self, shuffle: bool = False, seed: int = 0) -> None:
+            self._shuffle = shuffle
+            self._seed = seed
+            self._position = 0  # number of shard-local indices handed out so far
+            self._shard_id: int | None = None
+            self._num_shards: int | None = None
+            self._restored = False  # set by load_state_dict(); consumed by the next iterate()
+            self._range = None  # LazyShuffledRange built by iterate() when shuffling, else None
+            self._pending_range_state = None  # range state from load_state_dict(), applied in iterate()
+
+        @property
+        def position(self) -> int:
+            return self._position
+
+        def iterate(self, total_len: int) -> Generator[int, None, None]:  # yields this shard's indices
+            shard_id, num_shards = _get_worker_partition()
+
+            if self._restored:  # resuming: continue from the saved position
+                self._restored = False  # one-shot: later passes start from 0 again
+                if self._num_shards is not None and (self._shard_id != shard_id or self._num_shards != num_shards):
+                    raise ValueError(
+                        f"PartitionedIndexedIterator topology mismatch on resume: "
+                        f"saved (shard_id={self._shard_id}, num_shards={self._num_shards}), "
+                        f"current (shard_id={shard_id}, num_shards={num_shards})."
+                    )
+                start = self._position
+            else:  # fresh pass
+                start = 0
+                self._position = 0
+
+            self._shard_id = shard_id
+            self._num_shards = num_shards
+
+            if self._shuffle:
+                from lhotse.indexing import LazyShuffledRange  # imported lazily, only when shuffling
+
+                self._range = LazyShuffledRange(total_len, seed=self._seed, shard_id=shard_id, num_shards=num_shards)
+                if self._pending_range_state is not None:  # restore the range state saved in a checkpoint
+                    self._range.load_state_dict(self._pending_range_state)
+                    self._pending_range_state = None
+                shard_len = len(self._range)
+            else:  # unshuffled: strided sharding, shard k takes k, k + num_shards, k + 2 * num_shards, ...
+                self._range = None  # shard_len below is ceil((total_len - shard_id) / num_shards), or 0
+                shard_len = (total_len - shard_id + num_shards - 1) // num_shards if total_len > shard_id else 0
+
+            for i in range(start, shard_len):
+                self._position = i + 1  # updated before yielding, so a saved state resumes after index i
+                yield self._range[i] if self._range is not None else shard_id + i * num_shards
+
+        def state_dict(self) -> dict:
+            sd = {
+                "position": self._position,
+                "shard_id": self._shard_id,
+                "num_shards": self._num_shards,
+            }
+            if self._range is not None:
+                sd["range"] = self._range.state_dict()
+            elif self._pending_range_state is not None:  # loaded but not yet iterated: pass the state through
+                sd["range"] = self._pending_range_state
+            return sd
+
+        def load_state_dict(self, sd: dict) -> None:
+            self._position = sd.get("position", 0)
+            self._shard_id = sd.get("shard_id")
+            self._num_shards = sd.get("num_shards")
+            if self._shuffle:  # the "range" entry is only read back when shuffling
+                self._pending_range_state = sd.get("range")
+                self._range = None  # rebuilt on the next iterate()
+            self._restored = True  # tells iterate() to resume instead of restarting
+
+
+try:
+    from lhotse.lazy import (
+        GraphOriginDict,
+        IteratorNode,
+        LazyIndexedManifestIterator,
+        attach_graph_origin,
+        normalize_graph_token,
+    )
+except ImportError:
+
+    class IteratorNode(Iterable):  # local fallback used when the lhotse.lazy import above fails
+        is_checkpointable = False  # class-level defaults for the three capability flags
+        is_indexed = False
+        has_constant_time_access = False
+
+        def state_dict(self) -> dict:  # the base node has no state to save
+            raise NotImplementedError(f"{type(self).__name__} is not checkpointable.")
+
+        def load_state_dict(self, sd: dict) -> None:  # ... and none to restore
+            raise NotImplementedError(f"{type(self).__name__} is not checkpointable.")
+
+        def iter_children(self):  # generator over whatever is stored in .source / .sources
+            if hasattr(self, "source"):  # single child, when the attribute exists
+                yield getattr(self, "source")
+            if hasattr(self, "sources"):  # then each element of .sources, when the attribute exists
+                yield from getattr(self, "sources")
+
+    class GraphOriginDict(dict):  # dict subclass with room for one extra attribute
+        __slots__ = ("_graph_origin",)  # same attribute name that attach_graph_origin() writes
+
+    def normalize_graph_token(token: Any) -> Any:
+        """Recursively convert list/tuple tokens into tuples; return any other token unchanged."""
+        # Lists and tuples are normalized identically, so a single isinstance check covers both.
+        if isinstance(token, (list, tuple)):
+            return tuple(normalize_graph_token(part) for part in token)
+        return token
+
+    def attach_graph_origin(item: Any, token: Any) -> Any:  # best-effort: tag item with its origin token
+        try:
+            object.__setattr__(item, "_graph_origin", token)  # bypasses any __setattr__ override on the class
+        except Exception:  # first attempt failed: retry through the regular attribute protocol
+            try:
+                setattr(item, "_graph_origin", token)
+            except Exception:
+                pass  # deliberate: the item cannot carry the tag, so it is returned untagged
+        return item  # always the same object that was passed in
+
+    class LazyIndexedManifestIterator(IteratorNode):  # placeholder: importable, but cannot be instantiated
+        def __init__(self, *args, **kwargs) -> None:  # raises unconditionally, whatever the arguments
+            raise ImportError(
+                "LazyIndexedManifestIterator requires a Lhotse version with indexed/resumable dataloading support."
+            )
diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py
index f3129a44245e..a992b81d7879 100644
--- a/nemo/collections/common/data/lhotse/cutset.py
+++ b/nemo/collections/common/data/lhotse/cutset.py
@@ -762,9 +762,7 @@ def read_lhotse_manifest(config) -> tuple[CutSet, bool]:
         from_file_kwargs = {"indexed": config.get("indexed", None)}
         if indexes_root is not None:
             from_file_kwargs["index_path"] = index_file_path(path, indexes_root)
-        cuts = CutSet.from_file(path, **from_file_kwargs).map(
-            partial(resolve_relative_paths, manifest_path=path)
-        )
+        cuts = CutSet.from_file(path, **from_file_kwargs).map(partial(resolve_relative_paths, manifest_path=path))
     return cuts, is_tarred
 
 
@@ -1541,9 +1539,7 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
                 cuts = cuts.repeat(preserve_id=True)
         else:
             cuts = CutSet(
-                LazyNeMoIterator(
-                    config.manifest_filepath, **notar_kwargs, **notar_kwargs_extra, **common_kwargs
-                )
+                LazyNeMoIterator(config.manifest_filepath, **notar_kwargs, **notar_kwargs_extra, **common_kwargs)
             )
     else:
         # Format option 1:
@@ -1582,9 +1578,7 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
                     **common_kwargs,
                 )
             else:
-                nemo_iter = LazyNeMoIterator(
-                    manifest_path, **notar_kwargs, **notar_kwargs_extra, **common_kwargs
-                )
+                nemo_iter = LazyNeMoIterator(manifest_path, **notar_kwargs, **notar_kwargs_extra, **common_kwargs)
             # Then, determine the weight or use one provided
             if isinstance(manifest_info, str) or len(manifest_info) == 1:
                 weight = len(nemo_iter)
diff --git a/nemo/collections/common/data/lhotse/dataloader.py b/nemo/collections/common/data/lhotse/dataloader.py
index e52d9a70d933..fc7b82439356 100644
--- a/nemo/collections/common/data/lhotse/dataloader.py
+++ b/nemo/collections/common/data/lhotse/dataloader.py
@@ -40,7 +40,7 @@
 from lhotse.dataset.dataloading import resolve_seed
 from lhotse.dataset.sampling.base import CutSampler, SamplingConstraint, TimeConstraint
 from lhotse.lazy import LazyFlattener
-from lhotse.utils import fastcopy, fix_random_seed
+from lhotse.utils import fix_random_seed
 from omegaconf import DictConfig, OmegaConf
 
 from nemo.collections.common.data.lhotse.cutset import (
diff --git a/nemo/collections/common/data/lhotse/indexed_adapters.py b/nemo/collections/common/data/lhotse/indexed_adapters.py
index 1e623d4ee765..989db03e0406 100644
--- a/nemo/collections/common/data/lhotse/indexed_adapters.py
+++ b/nemo/collections/common/data/lhotse/indexed_adapters.py
@@ -21,7 +21,6 @@
 
 import numpy as np
 
-from lhotse.indexing import read_index
 
 # Tar block size + the all-zeros block that marks end-of-archive in tar.
 _TAR_BLOCK_SIZE = 512
@@ -78,6 +77,8 @@ def _load_index(data_path: str, idx_path: Optional[str] = None):
     local and remote sources, so the on-disk format is identical — only the
     file-size cross-check is skipped.
     """
+    from lhotse.indexing import read_index
+
     if idx_path is None:
         idx_path = data_path + '.idx'
     offsets = read_index(idx_path)
@@ -284,10 +285,7 @@ def __getitem__(self, idx: int) -> tuple[str, bytes]:
         try:
             name, data = _read_tar_member(self._fh)
         except (EOFError, tarfile.TarError) as e:
-            raise type(e)(
-                f"{e} — reading sample {idx}/{self._len} at offset {offset} "
-                f"in {self.data_path}"
-            ) from e
+            raise type(e)(f"{e} — reading sample {idx}/{self._len} at offset {offset} " f"in {self.data_path}") from e
         return name, data
 
     def _build_name_index(self) -> dict[str, int]:
@@ -309,9 +307,7 @@ def _build_name_index(self) -> dict[str, int]:
                 header = self._fh.read(_TAR_BLOCK_SIZE)
                 if len(header) < _TAR_BLOCK_SIZE or header == _TAR_ZERO_BLOCK:
                     break
-                info = tarfile.TarInfo.frombuf(
-                    header, tarfile.ENCODING, "surrogateescape"
-                )
+                info = tarfile.TarInfo.frombuf(header, tarfile.ENCODING, "surrogateescape")
                 if info.type in (tarfile.REGTYPE, tarfile.AREGTYPE):
                     name_to_idx[info.name] = i
                     break
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index 488d52b0e9e9..6b5215afab07 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -36,19 +36,19 @@
 from lhotse import AudioSource, MonoCut, Recording, SupervisionSegment
 from lhotse.audio.backend import LibsndfileBackend
 from lhotse.cut import Cut
-from lhotse.dataset.dataloading import PartitionedIndexedIterator, resolve_seed
-from lhotse.lazy import (
+from lhotse.dataset.dataloading import resolve_seed
+from lhotse.lazy import LazyIteratorChain, LazyJsonlIterator
+from lhotse.serialization import open_best
+from lhotse.utils import compute_num_samples, ifnone
+
+from nemo.collections.common.data.lhotse._compat import (
     GraphOriginDict,
     IteratorNode,
     LazyIndexedManifestIterator,
-    LazyIteratorChain,
-    LazyJsonlIterator,
+    PartitionedIndexedIterator,
     attach_graph_origin,
     normalize_graph_token,
 )
-from lhotse.serialization import open_best
-from lhotse.utils import compute_num_samples, ifnone
-
 from nemo.collections.common.parts.preprocessing.manifest import get_full_path
 from nemo.utils import logging
 from nemo.utils.data_utils import is_datastore_path
diff --git a/nemo/collections/common/data/lhotse/text_adapters.py b/nemo/collections/common/data/lhotse/text_adapters.py
index 1793c4a228bd..bcff2069922b 100644
--- a/nemo/collections/common/data/lhotse/text_adapters.py
+++ b/nemo/collections/common/data/lhotse/text_adapters.py
@@ -30,13 +30,17 @@
 from lhotse.custom import CustomFieldMixin
 from lhotse.cut import Cut
 from lhotse.dataset import AudioSamples
-from lhotse.dataset.dataloading import PartitionedIndexedIterator, resolve_seed
-from lhotse.indexing import IndexedJsonlReader
-from lhotse.lazy import IteratorNode, attach_graph_origin, normalize_graph_token
+from lhotse.dataset.dataloading import resolve_seed
 from lhotse.serialization import load_jsonl, open_best
 from lhotse.shar import AudioTarWriter, JsonlShardWriter
 from lhotse.utils import Pathlike, compute_num_samples, is_valid_url
 
+from nemo.collections.common.data.lhotse._compat import (
+    IteratorNode,
+    PartitionedIndexedIterator,
+    attach_graph_origin,
+    normalize_graph_token,
+)
 from nemo.collections.common.data.lhotse.indexed_adapters import (
     IndexedTarMemberReader,
     IndexedTarSampleReader,
@@ -591,7 +595,7 @@ def has_constant_time_access(self) -> bool:
         return self.indexed
 
     def _init_indexed(self) -> None:
-        from lhotse.indexing import index_file_path
+        from lhotse.indexing import IndexedJsonlReader, index_file_path
 
         for p in self.paths:
             path = Path(p)
@@ -1406,8 +1410,6 @@ class NeMoMultimodalConversationShareGPTJsonlAdapter(IteratorNode):
     skip_missing_manifest_entries: bool = False
 
     def __post_init__(self):
-        from lhotse.indexing import index_file_path
-
         self.manifest_filepath = expand_sharded_filepaths(self.manifest_filepath)
         if self.tarred_audio_filepaths is not None:
             self.tarred_audio_filepaths = expand_sharded_filepaths(self.tarred_audio_filepaths)
@@ -1742,8 +1744,6 @@ class NeMoMultimodalConversationShareGPTWebdatasetAdapter(IteratorNode):
     def __post_init__(self):
         import json as _json
 
-        from lhotse.indexing import index_file_path
-
         meta_path = Path(self.data_dir) / "wids-meta.json"
         if meta_path.exists():
             with open(meta_path) as f:
diff --git a/nemo/utils/callbacks/preemption.py b/nemo/utils/callbacks/preemption.py
index e0a723d28bf6..4c4748821d63 100644
--- a/nemo/utils/callbacks/preemption.py
+++ b/nemo/utils/callbacks/preemption.py
@@ -76,9 +76,7 @@ def ignoring_handler(signum, frame):
             self.private_rank = torch.distributed.get_rank()
             if self.private_rank == 0:
                 signal.signal(self.sig, master_handler)
-                logging.info(
-                    f"PreemptionCallback enabled on rank 0 for signal {getattr(self.sig, 'name', self.sig)}"
-                )
+                logging.info(f"PreemptionCallback enabled on rank 0 for signal {getattr(self.sig, 'name', self.sig)}")
             else:
                 signal.signal(self.sig, ignoring_handler)
 
diff --git a/scripts/dataloading/_validate_dataloader/consolidate.py b/scripts/dataloading/_validate_dataloader/consolidate.py
index ee894da5444b..38a93077a05e 100644
--- a/scripts/dataloading/_validate_dataloader/consolidate.py
+++ b/scripts/dataloading/_validate_dataloader/consolidate.py
@@ -65,8 +65,9 @@ class ValidationReport:
 
     def to_dict(self):
         return {
-            "questions": {q.q_id: {"status": q.status, "tag": q.tag, "detail": q.detail, **q.extra}
-                          for q in self.questions},
+            "questions": {
+                q.q_id: {"status": q.status, "tag": q.tag, "detail": q.detail, **q.extra} for q in self.questions
+            },
             "throughput": self.throughput,
         }
 
@@ -88,11 +89,13 @@ def consolidate(output_dir: Path, *, checkpoint_at: int, num_determinism_runs: i
     questions.append(_q1_no_duplication(baseline))
     questions.append(_q2_no_skipping(baseline, output_dir / "groundtruth" / "cuts.jsonl"))
     questions.append(_q3_partition_correctness(baseline))
-    questions.append(_q4_exact_resume(
-        baseline,
-        _load_phase(output_dir / "resumed" / "run0"),
-        checkpoint_at=checkpoint_at,
-    ))
+    questions.append(
+        _q4_exact_resume(
+            baseline,
+            _load_phase(output_dir / "resumed" / "run0"),
+            checkpoint_at=checkpoint_at,
+        )
+    )
     if num_determinism_runs >= 2:
         run1 = _load_phase(output_dir / "baseline" / "run1")
         questions.append(_q5_determinism(baseline, run1))
@@ -129,13 +132,21 @@ def _q1_no_duplication(rows: list[dict]) -> QResult:
         else:
             dup_within_rank.append(cid)
     if dup_cross_rank:
-        return QResult("Q1", FAIL, tag="partition-rank-leak",
-                       detail=f"{len(dup_cross_rank)} cut.id(s) appeared on multiple ranks",
-                       extra={"examples": dup_cross_rank[:5]})
+        return QResult(
+            "Q1",
+            FAIL,
+            tag="partition-rank-leak",
+            detail=f"{len(dup_cross_rank)} cut.id(s) appeared on multiple ranks",
+            extra={"examples": dup_cross_rank[:5]},
+        )
     if dup_within_rank:
-        return QResult("Q1", FAIL, tag="partition-worker-leak",
-                       detail=f"{len(dup_within_rank)} cut.id(s) seen by multiple workers within one rank",
-                       extra={"examples": dup_within_rank[:5]})
+        return QResult(
+            "Q1",
+            FAIL,
+            tag="partition-worker-leak",
+            detail=f"{len(dup_within_rank)} cut.id(s) seen by multiple workers within one rank",
+            extra={"examples": dup_within_rank[:5]},
+        )
     return QResult("Q1", PASS, detail=f"{len(sightings)} distinct cuts, no duplicates")
 
 
@@ -156,16 +167,22 @@ def _q2_no_skipping(rows: list[dict], groundtruth_path: Path) -> QResult:
     missing = expected - yielded
     unexpected = yielded - expected
     if missing:
-        return QResult("Q2", FAIL, tag="skip",
-                       detail=f"{len(missing)} of {len(expected)} expected cut.id(s) never yielded",
-                       extra={"missing_examples": list(missing)[:5],
-                              "unexpected_count": len(unexpected)})
+        return QResult(
+            "Q2",
+            FAIL,
+            tag="skip",
+            detail=f"{len(missing)} of {len(expected)} expected cut.id(s) never yielded",
+            extra={"missing_examples": list(missing)[:5], "unexpected_count": len(unexpected)},
+        )
     if unexpected:
-        return QResult("Q2", FAIL, tag="id-collision",
-                       detail=f"{len(unexpected)} cut.id(s) yielded but not in ground truth",
-                       extra={"unexpected_examples": list(unexpected)[:5]})
-    return QResult("Q2", PASS,
-                   detail=f"yielded ({len(yielded)}) == ground truth ({len(expected)})")
+        return QResult(
+            "Q2",
+            FAIL,
+            tag="id-collision",
+            detail=f"{len(unexpected)} cut.id(s) yielded but not in ground truth",
+            extra={"unexpected_examples": list(unexpected)[:5]},
+        )
+    return QResult("Q2", PASS, detail=f"yielded ({len(yielded)}) == ground truth ({len(expected)})")
 
 
 def _q3_partition_correctness(rows: list[dict]) -> QResult:
@@ -187,11 +204,12 @@ def _q3_partition_correctness(rows: list[dict]) -> QResult:
     ratio = sum_distinct / max(len(grand_union), 1)
     tag = "partition-rank-leak"
     if ratio >= n_ranks - 0.5:
-        detail = (f"FULL BROADCAST: each cut.id appears on ~{ratio:.1f}/{n_ranks} ranks "
-                  f"(overlap={overlap})")
+        detail = f"FULL BROADCAST: each cut.id appears on ~{ratio:.1f}/{n_ranks} ranks " f"(overlap={overlap})"
     else:
-        detail = (f"PARTIAL OVERLAP: per-rank distinct sums to {sum_distinct} but |union|={len(grand_union)} "
-                  f"(overlap={overlap})")
+        detail = (
+            f"PARTIAL OVERLAP: per-rank distinct sums to {sum_distinct} but |union|={len(grand_union)} "
+            f"(overlap={overlap})"
+        )
     return QResult("Q3", FAIL, tag=tag, detail=detail)
 
 
@@ -224,31 +242,44 @@ def _q4_exact_resume(baseline: list[dict], resumed: list[dict], *, checkpoint_at
             continue
         overlap += 1
         if base_cuts != res_cuts:
-            divergences.append({
-                "rank": rank, "step": rstep, "baseline_step": base_step,
-                "only_in_baseline": list(base_cuts - res_cuts)[:3],
-                "only_in_resumed": list(res_cuts - base_cuts)[:3],
-            })
+            divergences.append(
+                {
+                    "rank": rank,
+                    "step": rstep,
+                    "baseline_step": base_step,
+                    "only_in_baseline": list(base_cuts - res_cuts)[:3],
+                    "only_in_resumed": list(res_cuts - base_cuts)[:3],
+                }
+            )
     # Cells in baseline-tail that the resumed run never reached.
-    for (rank, bstep) in base_by_key:
+    for rank, bstep in base_by_key:
         if bstep <= checkpoint_at:
             continue
         rstep = bstep - checkpoint_at - 1
         if (rank, rstep) not in res_by_key:
             extra_baseline_tail += 1
-    extras = {"overlap_cells": overlap, "extra_resumed_cells": extra_resumed,
-              "extra_baseline_tail_cells": extra_baseline_tail}
+    extras = {
+        "overlap_cells": overlap,
+        "extra_resumed_cells": extra_resumed,
+        "extra_baseline_tail_cells": extra_baseline_tail,
+    }
     if divergences:
-        return QResult("Q4", FAIL, tag="resume-rng-divergence",
-                       detail=f"{len(divergences)}/{overlap} overlapping cell(s) diverge after resume",
-                       extra={**extras, "examples": divergences[:5]})
+        return QResult(
+            "Q4",
+            FAIL,
+            tag="resume-rng-divergence",
+            detail=f"{len(divergences)}/{overlap} overlapping cell(s) diverge after resume",
+            extra={**extras, "examples": divergences[:5]},
+        )
     if overlap == 0:
-        return QResult("Q4", FAIL, tag="resume-length-mismatch",
-                       detail="zero overlap between resumed and baseline-tail windows",
-                       extra=extras)
-    return QResult("Q4", PASS,
-                   detail=f"{overlap} overlapping cell(s) match baseline tail bit-for-bit",
-                   extra=extras)
+        return QResult(
+            "Q4",
+            FAIL,
+            tag="resume-length-mismatch",
+            detail="zero overlap between resumed and baseline-tail windows",
+            extra=extras,
+        )
+    return QResult("Q4", PASS, detail=f"{overlap} overlapping cell(s) match baseline tail bit-for-bit", extra=extras)
 
 
 def _q5_determinism(run0: list[dict], run1: list[dict]) -> QResult:
@@ -260,20 +291,28 @@ def _q5_determinism(run0: list[dict], run1: list[dict]) -> QResult:
     if a.keys() != b.keys():
         only_a = list(a.keys() - b.keys())[:3]
         only_b = list(b.keys() - a.keys())[:3]
-        return QResult("Q5", FAIL, tag="non-determinism",
-                       detail="run0/run1 step coverage differs",
-                       extra={"only_in_run0": only_a, "only_in_run1": only_b})
+        return QResult(
+            "Q5",
+            FAIL,
+            tag="non-determinism",
+            detail="run0/run1 step coverage differs",
+            extra={"only_in_run0": only_a, "only_in_run1": only_b},
+        )
     divergences: list[dict] = []
     for k, va in a.items():
         vb = b[k]
         if va != vb:
-            divergences.append({"rank": k[0], "step": k[1],
-                                "only_run0": list(va - vb)[:3],
-                                "only_run1": list(vb - va)[:3]})
+            divergences.append(
+                {"rank": k[0], "step": k[1], "only_run0": list(va - vb)[:3], "only_run1": list(vb - va)[:3]}
+            )
     if divergences:
-        return QResult("Q5", FAIL, tag="non-determinism",
-                       detail=f"{len(divergences)} cell(s) differ between determinism runs",
-                       extra={"examples": divergences[:5]})
+        return QResult(
+            "Q5",
+            FAIL,
+            tag="non-determinism",
+            detail=f"{len(divergences)} cell(s) differ between determinism runs",
+            extra={"examples": divergences[:5]},
+        )
     return QResult("Q5", PASS, detail="run0 == run1 across all (rank, step) cells")
 
 
@@ -299,9 +338,7 @@ def _collect_throughput(run_dir: Path) -> dict:
         "p50_ms_median": p50,
         "p95_ms_max": p95,
         "batches_per_s_per_rank": (1000.0 / p50) if p50 else None,
-        "t_first_batch_ms_max": max(
-            (a.get("t_first_batch_ms") or 0) for a in aggregates
-        ) or None,
+        "t_first_batch_ms_max": max((a.get("t_first_batch_ms") or 0) for a in aggregates) or None,
     }
     if p50 and num_workers:
         out["t_gpu_min_for_overlap_ms"] = p50 / num_workers
@@ -333,12 +370,23 @@ def _load_phase(phase_dir: Path) -> list[dict]:
 
 
 @click.command(help=__doc__)
-@click.option("--output-dir", required=True, type=click.Path(exists=True),
-              help="Directory written by validate_dataloader.py.")
-@click.option("--checkpoint-at", type=int, default=0, show_default=True,
-              help="Step index at which the baseline saved state. Must match the baseline run.")
-@click.option("--num-determinism-runs", type=int, default=1, show_default=True,
-              help="If >= 2, compares baseline/run0 vs baseline/run1 for Q5.")
+@click.option(
+    "--output-dir", required=True, type=click.Path(exists=True), help="Directory written by validate_dataloader.py."
+)
+@click.option(
+    "--checkpoint-at",
+    type=int,
+    default=0,
+    show_default=True,
+    help="Step index at which the baseline saved state. Must match the baseline run.",
+)
+@click.option(
+    "--num-determinism-runs",
+    type=int,
+    default=1,
+    show_default=True,
+    help="If >= 2, compares baseline/run0 vs baseline/run1 for Q5.",
+)
 @click.option("-v", "--verbose", is_flag=True, default=False)
 def cli(output_dir: str, checkpoint_at: int, num_determinism_runs: int, verbose: bool) -> None:
     logging.basicConfig(
@@ -356,9 +404,11 @@ def cli(output_dir: str, checkpoint_at: int, num_determinism_runs: int, verbose:
         print(f"{marker}  {q.q_id}{tag}: {q.detail}")
     if report.throughput.get("available"):
         t = report.throughput
-        print(f"\nthroughput: p50={t['p50_ms_median']:.1f}ms p95={t['p95_ms_max']:.1f}ms "
-              f"=> {t['batches_per_s_per_rank']:.2f} batches/s/rank "
-              f"(num_workers={t['num_workers']}, T_gpu_min={t.get('t_gpu_min_for_overlap_ms', 0):.1f}ms)")
+        print(
+            f"\nthroughput: p50={t['p50_ms_median']:.1f}ms p95={t['p95_ms_max']:.1f}ms "
+            f"=> {t['batches_per_s_per_rank']:.2f} batches/s/rank "
+            f"(num_workers={t['num_workers']}, T_gpu_min={t.get('t_gpu_min_for_overlap_ms', 0):.1f}ms)"
+        )
     else:
         print("\nthroughput: <not collected>")
     (out_dir / "validation_report.json").write_text(json.dumps(report.to_dict(), indent=2))
diff --git a/scripts/dataloading/validate_dataloader.py b/scripts/dataloading/validate_dataloader.py
index 313b4c48ed30..4fdfaf14bcc6 100644
--- a/scripts/dataloading/validate_dataloader.py
+++ b/scripts/dataloading/validate_dataloader.py
@@ -67,31 +67,64 @@
 
 @click.command(help=__doc__)
 @click.option("--config", "config_path", required=True, type=click.Path(exists=True))
-@click.option("--data-blend-dir", default=None,
-              help="Substituted into ${data_blend_dir} in the config.")
+@click.option("--data-blend-dir", default=None, help="Substituted into ${data_blend_dir} in the config.")
 @click.option("--section", default="train_ds", show_default=True)
 @click.option("--output-dir", required=True, type=click.Path())
-@click.option("--phase", type=click.Choice([PHASE_BASELINE, PHASE_RESUMED, PHASE_GROUNDTRUTH]),
-              required=True)
-@click.option("--run-idx", type=int, default=0, show_default=True,
-              help="Which determinism re-run this is. Only used with --phase=baseline.")
-@click.option("--steps", type=int, default=200, show_default=True,
-              help="Batches to iterate. Ignored in groundtruth phase (iterates until exhaustion).")
-@click.option("--checkpoint-at", type=int, default=-1, show_default=True,
-              help="Step index at which to save state in baseline phase. -1 = don't save.")
-@click.option("--state-dir", default=None, type=click.Path(),
-              help="In --phase=resumed: directory containing state_rank_NNN.pt files.")
+@click.option("--phase", type=click.Choice([PHASE_BASELINE, PHASE_RESUMED, PHASE_GROUNDTRUTH]), required=True)
+@click.option(
+    "--run-idx",
+    type=int,
+    default=0,
+    show_default=True,
+    help="Which determinism re-run this is. Only used with --phase=baseline.",
+)
+@click.option(
+    "--steps",
+    type=int,
+    default=200,
+    show_default=True,
+    help="Batches to iterate. Ignored in groundtruth phase (iterates until exhaustion).",
+)
+@click.option(
+    "--checkpoint-at",
+    type=int,
+    default=-1,
+    show_default=True,
+    help="Step index at which to save state in baseline phase. -1 = don't save.",
+)
+@click.option(
+    "--state-dir",
+    default=None,
+    type=click.Path(),
+    help="In --phase=resumed: directory containing state_rank_NNN.pt files.",
+)
 @click.option("--force-finite/--no-force-finite", default=True, show_default=True)
 @click.option("--metadata-only/--no-metadata-only", default=True, show_default=True)
-@click.option("--num-workers-override", type=int, default=None,
-              help="Override config.{section}.num_workers.")
-@click.option("--mode", type=click.Choice(["fast", "full"]), default="fast", show_default=True,
-              help="fast: CutIdDataset (default). full: stub-only in v1, raises.")
+@click.option("--num-workers-override", type=int, default=None, help="Override config.{section}.num_workers.")
+@click.option(
+    "--mode",
+    type=click.Choice(["fast", "full"]),
+    default="fast",
+    show_default=True,
+    help="fast: CutIdDataset (default). full: stub-only in v1, raises.",
+)
 @click.option("-v", "--verbose", is_flag=True, default=False)
-def cli(config_path: str, data_blend_dir: Optional[str], section: str, output_dir: str,
-        phase: str, run_idx: int, steps: int, checkpoint_at: int,
-        state_dir: Optional[str], force_finite: bool, metadata_only: bool,
-        num_workers_override: Optional[int], mode: str, verbose: bool) -> None:
+def cli(
+    config_path: str,
+    data_blend_dir: Optional[str],
+    section: str,
+    output_dir: str,
+    phase: str,
+    run_idx: int,
+    steps: int,
+    checkpoint_at: int,
+    state_dir: Optional[str],
+    force_finite: bool,
+    metadata_only: bool,
+    num_workers_override: Optional[int],
+    mode: str,
+    verbose: bool,
+) -> None:
     if mode == "full":
         raise click.ClickException("--mode=full is not implemented in v1; use --mode=fast.")
 
@@ -109,9 +142,7 @@ def cli(config_path: str, data_blend_dir: Optional[str], section: str, output_di
     )
 
     if phase == PHASE_GROUNDTRUTH and world_size != 1:
-        raise click.ClickException(
-            f"--phase=groundtruth requires nproc-per-node=1 (got world_size={world_size})"
-        )
+        raise click.ClickException(f"--phase=groundtruth requires nproc-per-node=1 (got world_size={world_size})")
 
     cfg = OmegaConf.load(config_path)
     if data_blend_dir is not None:
@@ -155,8 +186,7 @@ def cli(config_path: str, data_blend_dir: Optional[str], section: str, output_di
     else:
         out_path = phase_dir / f"rank_{rank:03d}.jsonl"
 
-    LOG.info("phase=%s run_idx=%d steps=%d checkpoint_at=%d -> %s",
-             phase, run_idx, steps, checkpoint_at, out_path)
+    LOG.info("phase=%s run_idx=%d steps=%d checkpoint_at=%d -> %s", phase, run_idx, steps, checkpoint_at, out_path)
 
     t_total_samples: list[float] = []
     t_first_batch_ms: Optional[float] = None
@@ -186,9 +216,13 @@ def cli(config_path: str, data_blend_dir: Optional[str], section: str, output_di
             fout.write(json.dumps(row) + "\n")
 
             if step % 50 == 0:
-                LOG.info("step=%d cuts=%d t_total=%.1fms (first cut: %s)",
-                         step, len(cut_ids), t_total_ms,
-                         cut_ids[0] if cut_ids else "<empty>")
+                LOG.info(
+                    "step=%d cuts=%d t_total=%.1fms (first cut: %s)",
+                    step,
+                    len(cut_ids),
+                    t_total_ms,
+                    cut_ids[0] if cut_ids else "<empty>",
+                )
 
             if phase == PHASE_BASELINE and step == checkpoint_at:
                 state_path = phase_dir / f"state_rank_{rank:03d}.pt"
@@ -273,26 +307,41 @@ def _load_state(dataloader, *, state_dir: Optional[str], rank: int) -> None:
     dataloader.load_state_dict(state)
 
 
-def _write_throughput_summary(out_path: Path, *, t_total_samples: list[float],
-                              t_first_batch_ms: Optional[float], num_workers: int) -> None:
+def _write_throughput_summary(
+    out_path: Path, *, t_total_samples: list[float], t_first_batch_ms: Optional[float], num_workers: int
+) -> None:
     if not t_total_samples:
-        out_path.write_text(json.dumps({
-            "p50_ms": None, "p95_ms": None, "mean_ms": None, "count": 0,
-            "t_first_batch_ms": t_first_batch_ms, "num_workers": num_workers,
-        }, indent=2))
+        out_path.write_text(
+            json.dumps(
+                {
+                    "p50_ms": None,
+                    "p95_ms": None,
+                    "mean_ms": None,
+                    "count": 0,
+                    "t_first_batch_ms": t_first_batch_ms,
+                    "num_workers": num_workers,
+                },
+                indent=2,
+            )
+        )
         return
     samples = sorted(t_total_samples)
     p50 = statistics.median(samples)
     p95 = samples[int(0.95 * (len(samples) - 1))]
     mean = statistics.fmean(samples)
-    out_path.write_text(json.dumps({
-        "p50_ms": round(p50, 3),
-        "p95_ms": round(p95, 3),
-        "mean_ms": round(mean, 3),
-        "count": len(samples),
-        "t_first_batch_ms": round(t_first_batch_ms, 3) if t_first_batch_ms else None,
-        "num_workers": int(num_workers),
-    }, indent=2))
+    out_path.write_text(
+        json.dumps(
+            {
+                "p50_ms": round(p50, 3),
+                "p95_ms": round(p95, 3),
+                "mean_ms": round(mean, 3),
+                "count": len(samples),
+                "t_first_batch_ms": round(t_first_batch_ms, 3) if t_first_batch_ms else None,
+                "num_workers": int(num_workers),
+            },
+            indent=2,
+        )
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/collections/common/test_lhotse_multimodal_dataloading.py b/tests/collections/common/test_lhotse_multimodal_dataloading.py
index 07933c39fa1f..4d66fb57d7ae 100644
--- a/tests/collections/common/test_lhotse_multimodal_dataloading.py
+++ b/tests/collections/common/test_lhotse_multimodal_dataloading.py
@@ -23,17 +23,13 @@
 import torch
 from lhotse import CutSet, SupervisionSegment, compute_num_samples
 from lhotse.audio import AudioLoadingError
+from lhotse.indexing import create_jsonl_index
 from lhotse.shar import JsonlShardWriter
 from lhotse.testing.dummies import dummy_cut, dummy_recording
 from omegaconf import OmegaConf
 
 from nemo.collections.common.data.lhotse import get_lhotse_dataloader_from_config
-from lhotse.indexing import create_jsonl_index
-
-from nemo.collections.common.data.lhotse.indexed_adapters import (
-    IndexedTarSampleReader,
-    create_tar_index,
-)
+from nemo.collections.common.data.lhotse.indexed_adapters import IndexedTarSampleReader, create_tar_index
 from nemo.collections.common.data.lhotse.sampling import (
     DurationFilter,
     MultimodalFixedBucketBatchSizeConstraint2D,
@@ -474,9 +470,7 @@ def test_multimodal_conversation_input_sharegpt_missing_audio_path_raises(tmp_pa
 
 
 @pytest.mark.parametrize("indexed", [False, True])
-def test_multimodal_conversation_input_sharegpt_missing_audio_path_skips_when_enabled(
-    tmp_path, caplog, indexed
-):
+def test_multimodal_conversation_input_sharegpt_missing_audio_path_skips_when_enabled(tmp_path, caplog, indexed):
     manifest_path = tmp_path / "sharegpt_skip_missing_audio_manifest.jsonl"
     dummy_recording(0, 1.0, with_data=True).to_cut().save_audio(tmp_path / "good_a.wav")
     dummy_recording(1, 1.5, with_data=True).to_cut().save_audio(tmp_path / "good_b.wav")
diff --git a/tests/collections/common/test_validate_dataloader.py b/tests/collections/common/test_validate_dataloader.py
index b2912e5e8527..8f0c1687cce4 100644
--- a/tests/collections/common/test_validate_dataloader.py
+++ b/tests/collections/common/test_validate_dataloader.py
@@ -27,8 +27,9 @@
 REPO_ROOT = Path(__file__).resolve().parents[3]
 sys.path.insert(0, str(REPO_ROOT / "scripts" / "dataloading"))
 
-from _validate_dataloader import config_inject, consolidate as cons, pre_validation as pv  # noqa: E402
-
+from _validate_dataloader import config_inject  # noqa: E402
+from _validate_dataloader import consolidate as cons  # noqa: E402
+from _validate_dataloader import pre_validation as pv  # noqa: E402
 
 # --------------------------------------------------------------------------- #
 # config_inject
@@ -37,17 +38,20 @@
 
 @pytest.mark.unit
 def test_config_inject_top_level_and_nested():
-    cfg = OmegaConf.create({
-        "input_cfg": [
-            {"type": "lhotse_as_conversation",
-             "input_cfg": [
-                 {"type": "lhotse_shar", "weight": 1.0},
-                 {"type": "nemo_tarred", "weight": 0.5},
-             ]},
-            {"type": "group",
-             "input_cfg": [{"type": "lhotse_shar", "weight": 0.3}]},
-        ],
-    })
+    cfg = OmegaConf.create(
+        {
+            "input_cfg": [
+                {
+                    "type": "lhotse_as_conversation",
+                    "input_cfg": [
+                        {"type": "lhotse_shar", "weight": 1.0},
+                        {"type": "nemo_tarred", "weight": 0.5},
+                    ],
+                },
+                {"type": "group", "input_cfg": [{"type": "lhotse_shar", "weight": 0.3}]},
+            ],
+        }
+    )
     config_inject.inject_validator_flags(cfg, force_finite=True, metadata_only=True)
     assert cfg["force_finite"] is True
     assert cfg["metadata_only"] is True
@@ -73,28 +77,36 @@ def test_config_inject_preserves_existing_explicit_value():
 
 
 def _base_cfg():
-    return OmegaConf.create({
-        "seed": 42,
-        "shard_seed": 42,
-        "use_stateful_dataloader": True,
-        "indexed": True,
-        "indexes_root": "/tmp/idx_does_not_exist_locally",
-        "use_bucketing": True,
-        "num_buckets": 20,
-        "bucket_buffer_size": 20000,
-        "force_map_dataset": False,
-        "text_field": "answer",
-        "input_cfg": [
-            {"type": "lhotse_as_conversation",
-             "input_cfg": [
-                 {"type": "lhotse_shar", "weight": 1.0, "corpus": "ami"},
-                 {"type": "nemo_tarred", "weight": 0.13, "corpus": "librilight",
-                  "text_field": "answer",
-                  "manifest_filepath": "s3://x/manifest__OP_0..15_CL_.jsonl",
-                  "tarred_audio_filepaths": "s3://x/audio__OP_0..15_CL_.tar"},
-             ]},
-        ],
-    })
+    return OmegaConf.create(
+        {
+            "seed": 42,
+            "shard_seed": 42,
+            "use_stateful_dataloader": True,
+            "indexed": True,
+            "indexes_root": "/tmp/idx_does_not_exist_locally",
+            "use_bucketing": True,
+            "num_buckets": 20,
+            "bucket_buffer_size": 20000,
+            "force_map_dataset": False,
+            "text_field": "answer",
+            "input_cfg": [
+                {
+                    "type": "lhotse_as_conversation",
+                    "input_cfg": [
+                        {"type": "lhotse_shar", "weight": 1.0, "corpus": "ami"},
+                        {
+                            "type": "nemo_tarred",
+                            "weight": 0.13,
+                            "corpus": "librilight",
+                            "text_field": "answer",
+                            "manifest_filepath": "s3://x/manifest__OP_0..15_CL_.jsonl",
+                            "tarred_audio_filepaths": "s3://x/audio__OP_0..15_CL_.tar",
+                        },
+                    ],
+                },
+            ],
+        }
+    )
 
 
 @pytest.mark.unit
@@ -215,21 +227,36 @@ def _write_jsonl(path: Path, rows: list[dict]):
 
 
 def _row(rank, step, cut_ids, *, worker_id=0):
-    return {"step": step, "rank": rank, "world_size": 2, "worker_id": worker_id,
-            "cut_ids": cut_ids, "batch_size": len(cut_ids), "t_total_ms": 1.0,
-            "t_first_batch_ms": None}
+    return {
+        "step": step,
+        "rank": rank,
+        "world_size": 2,
+        "worker_id": worker_id,
+        "cut_ids": cut_ids,
+        "batch_size": len(cut_ids),
+        "t_total_ms": 1.0,
+        "t_first_batch_ms": None,
+    }
 
 
 @pytest.mark.unit
 def test_consolidate_q1_q3_pass(tmp_path):
     """Two ranks, disjoint cuts, no duplication."""
     base = tmp_path / "baseline" / "run0"
-    _write_jsonl(base / "rank_000.jsonl", [
-        _row(0, 0, ["a", "b"]), _row(0, 1, ["c"]),
-    ])
-    _write_jsonl(base / "rank_001.jsonl", [
-        _row(1, 0, ["d", "e"]), _row(1, 1, ["f"]),
-    ])
+    _write_jsonl(
+        base / "rank_000.jsonl",
+        [
+            _row(0, 0, ["a", "b"]),
+            _row(0, 1, ["c"]),
+        ],
+    )
+    _write_jsonl(
+        base / "rank_001.jsonl",
+        [
+            _row(1, 0, ["d", "e"]),
+            _row(1, 1, ["f"]),
+        ],
+    )
     report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
     q_by_id = {q.q_id: q for q in report.questions}
     assert q_by_id["Q1"].status == cons.PASS
@@ -273,8 +300,7 @@ def test_consolidate_q2_skip_without_groundtruth(tmp_path):
 def test_consolidate_q2_skip_detects_missing(tmp_path):
     base = tmp_path / "baseline" / "run0"
     _write_jsonl(base / "rank_000.jsonl", [_row(0, 0, ["a", "b"])])
-    _write_jsonl(tmp_path / "groundtruth" / "cuts.jsonl",
-                 [{"cut_ids": ["a", "b", "c"]}])
+    _write_jsonl(tmp_path / "groundtruth" / "cuts.jsonl", [{"cut_ids": ["a", "b", "c"]}])
     report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
     q2 = next(q for q in report.questions if q.q_id == "Q2")
     assert q2.status == cons.FAIL
@@ -287,13 +313,22 @@ def test_consolidate_q4_resume_match(tmp_path):
     resumed[0] should match baseline[checkpoint_at + 1]."""
     base = tmp_path / "baseline" / "run0"
     res = tmp_path / "resumed" / "run0"
-    _write_jsonl(base / "rank_000.jsonl", [
-        _row(0, 0, ["a"]), _row(0, 1, ["b"]), _row(0, 2, ["c"]),
-    ])
+    _write_jsonl(
+        base / "rank_000.jsonl",
+        [
+            _row(0, 0, ["a"]),
+            _row(0, 1, ["b"]),
+            _row(0, 2, ["c"]),
+        ],
+    )
     # checkpoint_at=0 -> resumed[0] == baseline[1] == ["b"], resumed[1] == baseline[2] == ["c"]
-    _write_jsonl(res / "rank_000.jsonl", [
-        _row(0, 0, ["b"]), _row(0, 1, ["c"]),
-    ])
+    _write_jsonl(
+        res / "rank_000.jsonl",
+        [
+            _row(0, 0, ["b"]),
+            _row(0, 1, ["c"]),
+        ],
+    )
     report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=1)
     q4 = next(q for q in report.questions if q.q_id == "Q4")
     assert q4.status == cons.PASS
@@ -315,8 +350,7 @@ def test_consolidate_q4_resume_diverges(tmp_path):
 @pytest.mark.unit
 def test_consolidate_q5_determinism_match(tmp_path):
     for run in ("run0", "run1"):
-        _write_jsonl(tmp_path / "baseline" / run / "rank_000.jsonl",
-                     [_row(0, 0, ["a"]), _row(0, 1, ["b"])])
+        _write_jsonl(tmp_path / "baseline" / run / "rank_000.jsonl", [_row(0, 0, ["a"]), _row(0, 1, ["b"])])
     report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=2)
     q5 = next(q for q in report.questions if q.q_id == "Q5")
     assert q5.status == cons.PASS
@@ -324,10 +358,8 @@ def test_consolidate_q5_determinism_match(tmp_path):
 
 @pytest.mark.unit
 def test_consolidate_q5_determinism_diverges(tmp_path):
-    _write_jsonl(tmp_path / "baseline" / "run0" / "rank_000.jsonl",
-                 [_row(0, 0, ["a"])])
-    _write_jsonl(tmp_path / "baseline" / "run1" / "rank_000.jsonl",
-                 [_row(0, 0, ["DIFFERENT"])])
+    _write_jsonl(tmp_path / "baseline" / "run0" / "rank_000.jsonl", [_row(0, 0, ["a"])])
+    _write_jsonl(tmp_path / "baseline" / "run1" / "rank_000.jsonl", [_row(0, 0, ["DIFFERENT"])])
     report = cons.consolidate(tmp_path, checkpoint_at=0, num_determinism_runs=2)
     q5 = next(q for q in report.questions if q.q_id == "Q5")
     assert q5.status == cons.FAIL

From c61b126633259cc6ffa8c5d223e1f3c88368d8a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Fri, 12 Jun 2026 11:21:33 -0700
Subject: [PATCH 20/30] Fix remaining callback lint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 nemo/utils/callbacks/training_stats.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nemo/utils/callbacks/training_stats.py b/nemo/utils/callbacks/training_stats.py
index f939942f7150..0ef07b59d891 100644
--- a/nemo/utils/callbacks/training_stats.py
+++ b/nemo/utils/callbacks/training_stats.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# pylint: disable=missing-function-docstring
 """
 Training-throughput metrics that are not specific to a single model.
 

From cb3d81d82b8af6bcda0c341cbc31d0c371de145b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Fri, 12 Jun 2026 11:42:34 -0700
Subject: [PATCH 21/30] Address CodeQL compatibility comments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../collections/common/data/lhotse/_compat.py | 34 +++++++++++++------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/nemo/collections/common/data/lhotse/_compat.py b/nemo/collections/common/data/lhotse/_compat.py
index 65839e37afad..ad6a14f50f79 100644
--- a/nemo/collections/common/data/lhotse/_compat.py
+++ b/nemo/collections/common/data/lhotse/_compat.py
@@ -19,9 +19,20 @@
 import torch
 from torch import distributed as dist
 
+__all__ = [
+    "GraphOriginDict",
+    "IteratorNode",
+    "LazyIndexedManifestIterator",
+    "PartitionedIndexedIterator",
+    "attach_graph_origin",
+    "normalize_graph_token",
+]
+
 try:
-    from lhotse.dataset.dataloading import PartitionedIndexedIterator
-except ImportError:
+    from lhotse.dataset import dataloading as _lhotse_dataloading
+
+    PartitionedIndexedIterator = _lhotse_dataloading.PartitionedIndexedIterator
+except (ImportError, AttributeError):
     LHOTSE_USE_WORKER_PARTITION = "LHOTSE_USE_WORKER_PARTITION"
 
     def _get_world_size() -> int:
@@ -124,14 +135,14 @@ def load_state_dict(self, sd: dict) -> None:
 
 
 try:
-    from lhotse.lazy import (
-        GraphOriginDict,
-        IteratorNode,
-        LazyIndexedManifestIterator,
-        attach_graph_origin,
-        normalize_graph_token,
-    )
-except ImportError:
+    from lhotse import lazy as _lhotse_lazy
+
+    GraphOriginDict = _lhotse_lazy.GraphOriginDict
+    IteratorNode = _lhotse_lazy.IteratorNode
+    LazyIndexedManifestIterator = _lhotse_lazy.LazyIndexedManifestIterator
+    attach_graph_origin = _lhotse_lazy.attach_graph_origin
+    normalize_graph_token = _lhotse_lazy.normalize_graph_token
+except (ImportError, AttributeError):
 
     class IteratorNode(Iterable):
         is_checkpointable = False
@@ -167,7 +178,8 @@ def attach_graph_origin(item: Any, token: Any) -> Any:
             try:
                 setattr(item, "_graph_origin", token)
             except Exception:
-                pass
+                # Immutable extension objects may not accept ad-hoc metadata.
+                return item
         return item
 
     class LazyIndexedManifestIterator(IteratorNode):

From ae0a13116adb31256379088b4cb83841370cb27e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Fri, 12 Jun 2026 11:52:22 -0700
Subject: [PATCH 22/30] Document Lhotse compatibility shims
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 nemo/collections/common/data/lhotse/_compat.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/nemo/collections/common/data/lhotse/_compat.py b/nemo/collections/common/data/lhotse/_compat.py
index ad6a14f50f79..12f5c3d78360 100644
--- a/nemo/collections/common/data/lhotse/_compat.py
+++ b/nemo/collections/common/data/lhotse/_compat.py
@@ -12,6 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # pylint: disable=unused-import
+"""Compatibility shims for optional Lhotse indexed/resumable dataloading APIs.
+
+This module keeps NeMo importable with released Lhotse versions that do not
+expose those APIs yet, and re-exports the real implementations whenever the
+installed Lhotse provides them.
+"""
 import os
 from collections.abc import Generator, Iterable
 from typing import Any

From 19a49451263c6a5aaa0ecff111c501dd747919c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Mon, 15 Jun 2026 07:56:52 -0700
Subject: [PATCH 23/30] Update the resumable dataloader migration skill
 description
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../migrate-to-resumable-dataloader/SKILL.md  | 356 ++++------
 .../references/aistore-vs-non-aistore.md      | 267 +++-----
 .../references/best-practices.md              | 263 +++-----
 .../references/conflict-matrix.md             |  54 +-
 .../references/failure-modes.md               | 607 ++++++------------
 .../references/option-reference.md            | 187 +++---
 .../templates/migration-report.md             | 175 +++--
 7 files changed, 668 insertions(+), 1241 deletions(-)

diff --git a/.claude/skills/migrate-to-resumable-dataloader/SKILL.md b/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
index 24caee384c49..0135f76ea036 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/SKILL.md
@@ -1,278 +1,166 @@
 ---
 name: migrate-to-resumable-dataloader
-description: This skill should be used when the user asks to "migrate to the resumable dataloader", "switch to indexed Lhotse", "adopt the indexed + resumable pipeline", "make my training resumable", "set up StatefulDataLoader for SALM", "use AIStore GetBatch", or "convert this YAML to the resumable path". Walks a NeMo training YAML (and optional launcher / blend / cluster info) through the indexed + resumable Lhotse migration; lints every interacting field, auto-patches the YAML and any blends, emits a migration report and a pre-flight checklist, and produces a one-shot `submit_build_indexes.py` invocation. Static analysis only; never runs jobs.
-argument-hint: '<config.yaml> [launcher.py] [blend.yaml] [--cluster=<name>]'
+description: This skill should be used when the user asks to "migrate to the resumable dataloader", "switch to indexed Lhotse", "adopt the indexed + resumable pipeline", "make my training resumable", "set up StatefulDataLoader for NeMo/Lhotse", "use AIStore GetBatch", or "convert this YAML to the resumable path". Walks a NeMo training YAML and optional launcher, data blend, and runtime context through the indexed + resumable Lhotse migration; lints interacting fields; auto-patches safe YAML changes; emits a migration report, pre-flight checklist, and index-build command. Static analysis only; never launches training.
+argument-hint: '<config.yaml> [launcher.py] [blend.yaml] [runtime-notes]'
 ---
 
-# Migrate a NeMo training YAML to the indexed + resumable Lhotse dataloader
-
-The repo's resumable path (replacing the streaming/replay loader with O(1)
-checkpoint-restore via `torchdata.StatefulDataLoader` + `.idx` sidecars) has
-~20 distinct ways to silently corrupt or hard-fail. This skill runs every
-one of those checks against a concrete YAML, auto-patches what it can, and
-emits a teaching-style migration report so the user understands every
-decision and the user-only steps to run before launching.
-
-**Map-style vs iterable-style for indexed sources.** The resumable path supports
-two dedup modes:
-
-1. **`force_map_dataset: true`** (default; safest) — sampler runs in the main
-   GPU process and over-samples `world_size` batches per step, discards
-   `world_size - 1`. Works for any source type. Costs `W×` redundant
-   sampler/manifest I/O per step.
-2. **`force_map_dataset: false`** (optimization for indexed-only configs at
-   high `world_size`) — sampler runs co-located with the dataset inside CPU
-   worker subprocesses; sample indices are partitioned across
-   `(DP rank × DataLoader worker)` via `LazyShuffledRange(shard_id, num_shards)`
-   so each shard yields a disjoint slice. Resolved at iteration time via the
-   `LHOTSE_USE_WORKER_PARTITION` env-var signal that `worker_init_fn` sets.
-   Eliminates the `W×` redundant work; near-`W×` step-time improvement at
-   scale. **Requires all sources to be indexed** (or use other dedup
-   mechanisms — see `references/failure-modes.md` §20-§23).
-
-## When to apply
-
-Trigger phrases listed in the frontmatter. Three common entry modes:
-
-1. **New migration**: user points at an experiment YAML and asks to migrate.
-   Walk every field, write patched YAML + report + pre-flight + build-indexes
-   command.
-2. **Sanity-check existing migration**: user says "audit this YAML, is it
-   resumable-correct?". Same workflow but emit only the report (no patched
-   files unless errors found).
-3. **AIStore-aware variant**: cluster has `AIS_ENDPOINT` and the blend has
-   `s3://` / `ais://` / `http(s)://` paths. Skill switches to the AIStore
-   workflow (sets `USE_AIS_GET_BATCH=true`, optionally
-   `USE_AIS_INDIVIDUAL_GETS=true`, requires `aistore` SDK in container).
+# Migrate a NeMo training YAML to indexed + resumable Lhotse
+
+Use this skill to port a NeMo training config from streaming/replay-style Lhotse
+loading to indexed access plus `torchdata.StatefulDataLoader` checkpoint/restore.
+The migration is fragile because YAML flags, launcher seed policy, index paths,
+storage backend, and resume topology all interact.
+
+## Core concepts
+
+- Indexed sources need `.idx` sidecars for random access into JSONL, tar, and
+  supported Shar-style data. Build these once per blend/source set.
+- `use_stateful_dataloader: true` lets Lightning checkpoint the dataloader
+  iterator state, but resuming is only reliable if seeds, worker counts, and
+  distributed topology are stable across resumable chunks.
+- Training configs must use `force_map_dataset: false` so indexed sources
+  partition across data-parallel ranks and workers without map-style sampler
+  overhead. Treat `force_map_dataset: true` for training as not launch-ready
+  unless the user explicitly approves a temporary exception; every source in the
+  training iteration graph must be indexed and partition-compatible before
+  launch.
+- Remote audio on AIStore/S3 generally needs `USE_AIS_GET_BATCH=true` so that
+  audio is fetched lazily at sample time instead of through tar readers
+  constructed eagerly for every shard.
 
 ## Inputs
 
 | input | required | source | purpose |
 |---|---|---|---|
-| Training YAML | yes | argument or `--config=` | every `data.train_ds` / `data.validation_ds` / `model` / `trainer` / `exp_manager` field that interacts with the resumable path |
-| Launcher script | no | argument or auto-detect (`train_and_eval.py`, `pretrain.sh`, raw `python salm_train.py …`, `torchrun …`) | grep for per-chunk seed rotation, missing prefetch preamble, etc. If absent, skill emits "launcher review SKIPPED — manual review required" with the things to check by hand |
-| Data-blend YAML | no | resolved from `data.train_ds.input_cfg` if it references `${data_blend_dir}/...` | walked for unindexable entries (`extra_fields`, `slice_length`, `.jsonl.gz`, `.tar.gz`, AMI Shar) |
-| Cluster name | no | `--cluster=<name>` or detected from `data_blend_dir` path | reads `cluster_configs/<cluster>.yaml` env_vars to detect AIS_ENDPOINT and pick the right code paths |
+| Training YAML | yes | argument or `--config=` | Inspect `data.train_ds`, `data.validation_ds`, `trainer`, `exp_manager`, and any model fields that affect resume. |
+| Launcher script | no | argument or auto-detect from project conventions | Check per-chunk seed policy, resume topology invariance, Python path setup, AIStore env vars, and optional index staging. |
+| Data-blend YAML | no | resolved from `data.train_ds.input_cfg` when possible | Check indexability: compressed paths, non-seekable paths, unsupported `extra_fields`, `slice_length`, and mixed indexed/non-indexed chains. |
+| Runtime context | no | argument, config file, or user-provided notes | Detect storage backend, AIStore endpoint availability, container constraints, and index mirror destination. |
 
 ## Outputs
 
-Every output lands in a fresh directory `migrate-resumable/<config-stem>/`
-in the repo root (so multiple migrations stay self-contained):
+Every output lands in `migrate-resumable/<config-stem>/` in the current repo:
 
 | output | purpose |
 |---|---|
-| `migration-report.md` | exhaustive walkthrough — every field touched, every option explained, every pitfall checked, severity-classified findings, links into MIGRATION_GUIDE.md and codebase |
-| `<config-stem>-resumable.yaml` | the patched config, ready to drop in. Preserves comments where possible; explicit `# NOTE:` block at every changed line citing the rationale |
-| `<blend-stem>-resumable.yaml` | patched blend (drops unindexable entries with rationale comments) — only emitted when blend was inspected |
-| `pre-flight-checklist.md` | manual steps before launch: build indexes, verify SDK, verify cluster fits the workflow, etc. |
-| `build-indexes-cmd.sh` | concrete one-shot shell command invoking `submit_build_indexes.py` (or a generic equivalent if the repo doesn't have it) |
+| `migration-report.md` | Findings, rationale, patched fields, and unresolved blockers. |
+| `<config-stem>-resumable.yaml` | Patched training config when safe automatic edits are possible. |
+| `<blend-stem>-resumable.yaml` | Patched blend, only when a blend was inspected and safe changes are possible. |
+| `pre-flight-checklist.md` | User-run steps before submitting training. |
+| `build-indexes-cmd.sh` | One-shot index-build command using the project wrapper when available, otherwise the generic NeMo/Lhotse index builder. |
 
 ## Workflow
 
 ### 1. Discover and parse inputs
 
-1. Resolve the training YAML path. Read it with OmegaConf.
-2. If `data.train_ds.input_cfg` references `${data_blend_dir}/<file>.yaml`,
-   try to resolve `data_blend_dir` from the YAML's top-level scalar. Locate
-   the blend on disk (try `data_blends/<cluster>/<file>.yaml` first, fall
-   back to glob across `data_blends/`).
-3. If a launcher path was given, read it as text. Otherwise inspect the
-   repo root for any of `train_and_eval.py` / `pretrain.sh` / `salm_train.sh`
-   and pick the most-likely match.
-4. If `--cluster=<name>` was passed, read `cluster_configs/<name>.yaml`.
-   Otherwise grep `data_blend_dir` for a known cluster path prefix
-   (`/lustre/fsw/portfolios/llmservice/users/...` → iad, `nrt`, `ord`).
-5. Detect AIStore presence: cluster `env_vars` contains `AIS_ENDPOINT=...`,
-   AND the blend has `s3://` or `ais://` or `http(s)://` paths in
-   `tarred_audio_filepaths` / `manifest_filepath` / `cuts_path`.
+1. Resolve the training YAML path and read it with OmegaConf or a
+   comment-preserving YAML parser.
+2. Resolve any referenced blend YAMLs from `data.*.input_cfg`. Prefer project
+   conventions when obvious, but fall back to paths relative to the config.
+3. If a launcher path is supplied, read it. Otherwise inspect likely project
+   launchers (`train.py`, `pretrain.py`, shell wrappers, or raw `torchrun` /
+   `python` commands) and pick the closest match.
+4. If runtime context is supplied, read it for container image, environment
+   variables, filesystem mounts, worker counts, and AIStore endpoint settings.
+5. Detect remote storage from source paths (`s3://`, `ais://`, `http(s)://`) and
+   local filesystem storage from ordinary absolute or relative paths.
 
 ### 2. Run lint pipeline
 
-Run every check in `references/option-reference.md`,
-`references/conflict-matrix.md`, and `references/failure-modes.md` against
-the YAML and (when present) the blend and launcher. Each check emits a
-finding entry:
+Run every relevant check in:
 
-```
-{
-  severity: fatal | error | warning | note,
-  field:    "data.train_ds.shard_seed",
-  current:  "randomized",
-  recommended: 42,
-  rationale: <one paragraph explaining the *why*>,
-  link:     "MIGRATION_GUIDE.md §… / file:line",
-}
-```
+- `references/option-reference.md`
+- `references/conflict-matrix.md`
+- `references/failure-modes.md`
+- `references/aistore-vs-non-aistore.md` when remote storage is present
+
+Each finding should include severity, field/path, current value, recommended
+value, and a short rationale.
 
 Severities:
-- **fatal** — auto-patch impossible (e.g. blend uses `extra_fields` on a
-  `nemo_tarred` entry). Skill exits non-zero with explanation; user must
-  pre-process the manifest offline.
-- **error** — auto-patch produces a working config (e.g.
-  `shard_seed: "randomized"` → fixed integer). Skill applies the patch.
-- **warning** — auto-patch optional or context-dependent; emitted as a
-  comment in the patched YAML and a section in the report.
-- **note** — informational; report-only.
-
-### 3. Emit patched YAML + blend
-
-Apply every `error`-severity patch. For each, leave a `# NOTE:` comment
-above the changed line citing the finding. Comment-preserving YAML round-trip
-uses `ruamel.yaml`; if that's not available, fall back to `omegaconf`
-serialization (loses comments, but the report still documents every change
-in detail).
-
-For the blend: drop every entry that fails the indexability checks. Each
-dropped entry leaves a `# DROPPED: <rationale>` block in its place. If the
-drop empties out the blend or removes a domain entirely, surface that in
-the report.
+
+- **fatal**: automatic patching is not possible; user must preprocess data or
+  change the source layout.
+- **error**: automatic patching is safe and should be applied.
+- **warning**: context-dependent; emit a report item and an optional YAML comment.
+- **note**: informational; no patch.
+
+### 3. Emit patched YAML and blend
+
+Apply safe `error`-severity patches. Preserve comments when possible with
+`ruamel.yaml`; otherwise serialize with OmegaConf/YAML and rely on the report for
+rationale. For blend edits, never silently drop data: leave an explicit report
+entry and comment for every excluded or rewritten source.
 
 ### 4. Generate `migration-report.md`
 
-Use `templates/migration-report.md`. Sections:
-
-1. **Summary** — one paragraph: AIStore vs non-AIStore, count of changes,
-   any fatal blockers.
-2. **Inputs** — paths to training YAML / launcher / blend / cluster config
-   that were inspected.
-3. **Findings table** — every finding with severity / field / current /
-   recommended / link.
-4. **Per-section walkthrough** — `data.train_ds`, `data.validation_ds`,
-   `exp_manager`, `trainer`, AIStore-specific section. For each, the table
-   from `references/option-reference.md` filtered to fields that exist in
-   this YAML, with current vs. recommended values inline.
-5. **Pitfalls / failure modes encountered** — link to
-   `references/failure-modes.md` for each that fired.
-6. **Conflict matrix** — link to `references/conflict-matrix.md` and call
-   out any conflicts found.
-7. **Best practices reminder** — copy of `references/best-practices.md`.
-8. **Verification recipe** — the bit-exact verification snippet from
-   `MIGRATION_GUIDE.md` §3, with the user's actual config path filled in.
+Use `templates/migration-report.md`. Include:
+
+1. Summary of storage workflow, counts by severity, and readiness.
+2. Inputs inspected.
+3. Findings table.
+4. Walkthrough for train data, validation data, trainer/exp manager, launcher,
+   and storage backend.
+5. Data-blend audit.
+6. Verification and pre-flight steps.
 
 ### 5. Generate `pre-flight-checklist.md`
 
-Use `templates/pre-flight-checklist.md`. Required steps:
-
-- Build indexes via `submit_build_indexes.py` (or generic equivalent);
-  print the exact command.
-- If AIStore in play: verify `aistore` SDK ≥ 1.17 in the container; verify
-  `AIS_ENDPOINT` is set; warn about the MOSS GetBatch issue for
-  multilingual / non-EN-replicated data and recommend
-  `USE_AIS_INDIVIDUAL_GETS=true` for those.
-- If launcher absent or only a stub: list the launcher items the user must
-  hand-check (single-seed across chain, prefetch preamble, num_workers /
-  world_size invariance).
-- Recommend running the bit-exact verification snippet from MIGRATION_GUIDE
-  §3 once before sweeping.
-- Recommend the 1-node single-chunk → 1-node multi-chunk → 4-node test
-  sequence.
+Use `templates/pre-flight-checklist.md` when present. Required steps:
+
+- Build `.idx` sidecars for every training/validation/test blend involved.
+- Verify `indexes_root` points at the same stable mirror used by the runtime, or
+  that explicit node-local index staging populates it before training starts.
+- If AIStore is in play: verify `aistore` SDK availability, `AIS_ENDPOINT`, and
+  whether `USE_AIS_GET_BATCH` or `USE_AIS_INDIVIDUAL_GETS` is required.
+- Verify one invariant seed across resumable chunks.
+- Verify `num_workers`, `world_size`, and relevant distributed topology do not
+  change across resume boundaries.
+- Recommend a small smoke ladder: single-node single chunk, single-node resume,
+  then full topology.
 
 ### 6. Generate `build-indexes-cmd.sh`
 
-Single executable shell file with the exact `submit_build_indexes.py`
-invocation, using:
-- `--cluster=<detected or user-supplied cluster>`
-- `--blend=<every blend referenced from the training YAML>` (training +
-  validation blends)
-- `--bypass-nvidia-hook` if cluster is NRT or any cluster whose `cpu_partition`
-  is documented to lack `nvidia-container-cli`
-- A comment block at the top with the rationale for each flag
-
-If the repo doesn't have `submit_build_indexes.py`, emit a generic
-equivalent that does:
+Prefer a project-provided wrapper when one is clearly present. Otherwise emit a
+generic command using:
+
 ```bash
 python <NeMo>/scripts/dataloading/build_indexes.py \
-    --indexes-root <local-mirror> \
-    --workers <effective> \
-    <blend>.yaml <validation-blend>.yaml
+    --indexes-root <shared-index-mirror> \
+    --workers <N> \
+    <blend>.yaml [<validation-blend>.yaml ...]
 ```
-plus a SLURM wrapper sketch and call out that `submit_build_indexes.py` in
-the speechlm-2026h1 repo is the canonical version.
+
+If running through a managed runtime or container wrapper, include comments for
+the required container image, mounts, environment variables, worker count, and
+any CPU/GPU container-hook workaround the project requires.
 
 ### 7. Print final summary to chat
 
-Short recap (under 10 lines): output dir, count of fatal/error/warning/note
-findings, link to migration report, the next single command the user
-should run (`bash migrate-resumable/<stem>/build-indexes-cmd.sh` then the
-launcher).
-
-## Knowledge base — references baked into this skill
-
-- **`references/option-reference.md`** — exhaustive field-by-field table
-  (every YAML key that interacts with the resumable path, required value,
-  rationale, see-also link). Read this for every finding.
-- **`references/failure-modes.md`** — 18 catalogued failure modes with log
-  signatures, tracebacks, and fixes. Plus an "Open investigation" section.
-- **`references/conflict-matrix.md`** — the option pairs that don't work
-  together and what to do about each.
-- **`references/best-practices.md`** — distilled checklist (priority-ordered).
-- **`references/aistore-vs-non-aistore.md`** — the two parallel workflows.
-
-- **`examples/iad-english-granary/`** — IAD English training (Granary 1.1,
-  lustre manifests, S3 tars, AIStore). Before/after pair.
-- **`examples/nrt-lustre-only/`** — NRT lustre-only training (no AIStore).
-  Before/after pair, includes the `--bypass-nvidia-hook` build-index
-  invocation.
-- **`examples/multilingual-mixed/`** — multilingual blend with mixed
-  S3/lustre. Demonstrates `USE_AIS_INDIVIDUAL_GETS=true` and the
-  AMI-Shar-drop pattern.
-
-- **`templates/migration-report.md`** — output template, fill-in-the-blank.
-- **`templates/pre-flight-checklist.md`** — output template.
-
-- **`scripts/analyze.py`** — the analysis engine. Reads YAML, runs every
-  lint check, emits findings + writes patched YAML. Pure static analysis;
-  no cluster calls.
+Keep the final chat response under 10 lines: output directory, finding counts,
+report path, and the next command the user should run.
+
+## Knowledge base
+
+- `references/option-reference.md`: field-by-field reference for YAML and
+  launcher settings.
+- `references/failure-modes.md`: known failure signatures, triggers, and fixes.
+- `references/conflict-matrix.md`: incompatible option pairs.
+- `references/best-practices.md`: priority-ordered checklist.
+- `references/aistore-vs-non-aistore.md`: storage workflow selection.
+- `templates/migration-report.md`: report template.
+- `templates/pre-flight-checklist.md`: checklist template, when present.
+- `scripts/analyze.py`: optional static-analysis helper, when present.
 
 ## Constraints
 
-- **Read MIGRATION_GUIDE.md** at `/Users/pzelasko/canary-dev/speechlm-2026h1/MIGRATION_GUIDE.md`
-  in full before running. The references in this skill cite specific
-  sections of that doc.
-- **Cross-check against the actual code** at:
-  - `lhotse_resumable/lhotse/serialization.py` (`open_best`, AIStore backend, MSC backend)
-  - `lhotse_resumable/lhotse/indexing.py` (`create_jsonl_index`, `create_tar_index`, `indexed_path_kind`, `IndexedJsonlReader`, `read_index`, `LazyShuffledRange` with `(shard_id, num_shards)` partition)
-  - `lhotse_resumable/lhotse/lazy.py` (`LazyIndexedManifestIterator.__iter__` defers `LazyShuffledRange` construction to resolve partition at iter time; `LazyIteratorChain._iter_globally_shuffled` partitions the combined range; `LazyIteratorMultiplexer.__iter__` rejects `seed='randomized'` under multi-shard partition)
-  - `lhotse_resumable/lhotse/dataset/dataloading.py` (`worker_init_fn` sets the `LHOTSE_USE_WORKER_PARTITION` signal; `get_worker_partition()` returns the trivial `(0, 1)` when that signal is absent — keeps map-style mode unaffected even under torchrun)
-  - `lhotse_resumable/lhotse/ais/batch_loader.py` (`AISBatchLoader`, `force_individual`, byte-range `shar_ptr` fallback, `_moss_attrs`)
-  - `lhotse_resumable/lhotse/dataset/input_strategies.py` (`AudioSamples`)
-  - `NeMo_resumable/nemo/collections/common/data/lhotse/indexed_adapters.py` (`IndexedTarMemberReader`, `_AISRangeReader`, `_CountingReader`, `_open_data_path`, `_load_index`)
-  - `lhotse_resumable/lhotse/indexing.py` — `index_file_path(data_path, indexes_root=None)` is the canonical `.idx` path resolver.
-  - `NeMo_resumable/nemo/collections/common/data/lhotse/dataloader.py` (`get_lhotse_sampler_from_config`, `get_lhotse_dataloader_from_config`, `force_map_dataset` handling, the auto-overwrite of `shard_seed`, `_maybe_init_main_process_for_iterable` for `num_workers=0` eager `worker_init_fn` call)
-  - `NeMo_resumable/nemo/collections/common/data/lhotse/nemo_adapters.py` (`LazyNeMoTarredIterator`, `_init_indexed`, `_iter_batch_for_ais_get_batch`, `USE_AIS_GET_BATCH` gate)
-  - `NeMo_resumable/scripts/dataloading/build_indexes.py` and `prefetch_indexes.py`
-  - `lhotse_resumable/test/test_partition.py` (49 tests pinning every partition edge case: map-style regression, empty/tiny manifests, composition with shuffler/mapper/filter/repeater, multiplexer state-dict roundtrip, chain topology mismatch, etc.)
-- **Cross-check against today's debug docs** at:
-  - `agent-debug-workspace/0909-summary.md`
-  - `agent-debug-workspace/0909-multiling-failures.md`
-  - `agent-debug-workspace/0909-longform-failures.md`
-  - `agent-debug-workspace/nano-v3-1node-resumable-tests.md`
-  These contain the freshest evidence-based knowledge. Cite line:file
-  pointers when emitting findings whose rationale traces back to them.
-- **Mention but do not duplicate** the existing `submit_build_indexes.py`
-  in the speechlm-2026h1 repo; this skill references it as the canonical
-  builder for that repo and provides a generic equivalent for users on
-  other repos.
-- **Don't write code that runs jobs on the cluster.** Static-analysis +
-  migration tool, not a job runner.
-- **Identify gaps clearly.** If something is unknown (e.g., why MOSS
-  GetBatch returns empty for multilingual data), say so explicitly in
-  `failure-modes.md` under "Open investigation" and surface that in the
-  report when relevant.
-
-## Non-goals
-
-- Do not run `submit_build_indexes.py` automatically; emit the command and
-  let the user invoke it.
-- Do not modify upstream code (NeMo_resumable / lhotse_resumable). The
-  skill works around upstream bugs via YAML / env-var settings.
-- Do not invent fields the user didn't ask about. If a value is ambiguous
-  (e.g. `seed` was unset and there's no default we can read), prompt with
-  one batched `AskUserQuestion`.
-
-## Style
-
-Match the tone of `hyperparam-sweep/SKILL.md` and `debug-cluster-run/SKILL.md`.
-Crisp, evidence-based, no fluff. Inline rationale at every decision. The
-skill is a teaching tool as well as an automated migrator — every patched
-line should land with a citation the user can verify.
+- Prefer static analysis. Do not launch training, build indexes, prefetch data, or
+  modify external runtime state unless the user explicitly asks.
+- Cross-check recommendations against the actual NeMo/Lhotse code in the user's
+  checkout when paths are available. Relevant areas are common Lhotse dataloader
+  config, indexed adapters, `lhotse.indexing`, AIStore batch loading, and NeMo
+  dataloader construction.
+- Treat project wrappers as optional conveniences, not as part of the generic
+  migration contract.
+- When evidence is missing, say so. Do not encode project-specific run history
+  or local experiment names as general guidance.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md b/.claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md
index 90371a02f2d1..54314e7364ee 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/aistore-vs-non-aistore.md
@@ -1,200 +1,79 @@
-# AIStore vs non-AIStore workflows
+# AIStore vs filesystem workflows
 
-The indexed + resumable Lhotse pipeline supports two storage backends for
-the audio tar files. Manifests can be on lustre regardless. The choice of
-backend changes which env vars / flags / container deps are required.
+Indexed + resumable Lhotse can read audio/tar sources from a local filesystem or
+from AIStore-compatible URLs. Manifests/cuts may be on disk in either workflow.
+Choose the workflow from source path schemes, not from where the process runs.
 
 ## Detection
 
-The skill picks the workflow based on the **blend's `tarred_audio_filepaths`
-scheme**, NOT the cluster name:
-
 | signal | workflow |
 |---|---|
-| `tarred_audio_filepaths: s3://...` or `ais://...` or `http(s)://...` | **AIStore** workflow |
-| `tarred_audio_filepaths: /lustre/...` (or any local-FS path) | **non-AIStore** workflow |
-| Both in the same blend | **AIStore** workflow (the local files are still loadable; AIS path is the strictly-larger superset) |
-
-Cluster `env_vars` containing `AIS_ENDPOINT=...` is a necessary but not
-sufficient signal — the blend may still be all-lustre, in which case
-`AIS_ENDPOINT` is unused.
-
-## Workflow A — AIStore (s3:// / ais:// audio)
-
-### Required setup
-
-- **`aistore` SDK installed** in the training container. Either pre-baked
-  or `pip install aistore` in the preamble (no version pin needed; the
-  lhotse_resumable code's `_moss_attrs` normalizer handles both
-  pre-/post-MossOut-rename SDKs).
-- **`AIS_ENDPOINT` exported** in cluster env_vars (and forwarded into the
-  container via `--container-env=AIS_ENDPOINT,...`). Optionally
-  `AIS_AUTHN_URL` and `AIS_AUTHN_TOKEN` for authenticated AIStore
-  deployments (MOSS GetBatch requires the token).
-- **`USE_AIS_GET_BATCH=true`** env var in the training step (set
-  automatically by `--enable-indexes-prefetch` via
-  `train_and_eval.py`). This short-circuits eager
-  `IndexedTarMemberReader` construction: the indexed tar readers would
-  otherwise instantiate one per shard at startup, which on a 41k-shard
-  blend means 41k AIS HTTP connections opened before training begins.
-  With `USE_AIS_GET_BATCH=true`, audio is fetched lazily at sample time
-  via AIStore's MOSS GetBatch — one batched HTTP call per minibatch.
-
-### Optional setup
-
-- **`USE_AIS_INDIVIDUAL_GETS=true`** (or
-  `--enable-ais-individual-gets`): bypass MOSS GetBatch entirely and
-  fetch each object via `Object.get_reader(archive_config=...).read_all()`.
-  Slower (one HTTP call per object instead of one per minibatch) but
-  works around MOSS-specific server-side issues — e.g. empty-content
-  returns for non-replicated multilingual data on iad AIS, which crashes
-  the GetBatch path's empty-content retry logic.
-
-### Required code paths
-
-| component | role |
-|---|---|
-| `lhotse.serialization.AIStoreIOBackend` | turns `s3://` / `ais://` into actual HTTP fetches via aistore SDK; gated on `AIS_ENDPOINT` env var presence |
-| `nemo.collections.common.data.lhotse.indexed_adapters._AISRangeReader` | seekable file-like wrapper that translates `seek()` + `read(n)` into AIS byte-range HTTP requests; used by the indexed tar member readers when `data_path` is a URL |
-| `nemo.collections.common.data.lhotse.indexed_adapters._open_data_path` | factory that returns either a regular `open(path, "rb")` for local paths or `_AISRangeReader` for URL paths |
-| `lhotse.ais.batch_loader.AISBatchLoader` | minibatch-time MOSS GetBatch client; aggregates all URLs from a CutSet into one request and demultiplexes the response back into manifests |
-| `lhotse.ais.batch_loader._moss_attrs` | normalizer for AIS SDK MossIn-vs-MossOut attribute differences (older `.bck` / `.provider` / `.obj_name` vs newer `.bucket_name` / `.bucket_provider` / `.object_name`); handles both transparently |
-
-### Index building
-
-- `submit_build_indexes.py` runs once per blend on a CPU SLURM job.
-- Build reads tar files via AIS (HTTP GET with byte-range), parses tar
-  headers, writes `.idx` sidecars to `<workspace>/indexes_mirror/`
-  (lustre) — mirroring the data files' s3 paths.
-- Successful index build proves the data IS on AIS. If indexing
-  succeeds for a path but training fails to fetch via MOSS GetBatch with
-  empty content, the data is replicated for individual GET but not for
-  MOSS — switch the run to `USE_AIS_INDIVIDUAL_GETS=true`.
-
-### Prefetch pipeline
-
-1. **Indexes mirror → local SSD**: `prefetch_indexes_to_ssd.sh` copies
-   `<workspace>/indexes_mirror/` to `/tmp/idx` on each node. Reads via
-   `lhotse.serialization.open_best`, so source can be lustre or remote.
-2. **Manifests** (optional): `prefetch_manifests_to_ssd.sh` pulls AIS
-   manifests to `/tmp/manifests/` and rewrites blend YAMLs to point at
-   the local copies. Only useful when `manifest_filepath` is `s3://`;
-   no-op if manifests are already on lustre.
-3. **HF cache** (optional): `cache_pretrained_to_ssd.sh` copies the
-   pretrained LLM/ASR weights from `$HF_HOME/hub/` to local SSD to avoid
-   N-rank concurrent reads from lustre at training start.
-
-All three preambles are now run **in parallel** by `train_and_eval.py`
-(each in a backgrounded subshell with PID capture; `wait` propagates any
-non-zero exit). Each prefetch script is flock-guarded, so only one rank
-per node does the actual work; the other 7 wait for the lock-holder to
-finish.
-
-### Container requirements
-
-- `aistore` Python SDK (any version ≥1.18; the `_moss_attrs` normalizer
-  handles MossIn↔MossOut renames in 1.19+).
-- `nvidia-container-cli` on every node the build/training runs on. Some
-  cpu partitions don't have it (NRT cpu partition is a known case);
-  workaround is the `--bypass-nvidia-hook` flag in
-  `submit_build_indexes.py`, which injects
-  `--export=ALL,NVIDIA_VISIBLE_DEVICES=void` so enroot's
-  `98-nvidia.sh` hook short-circuits.
-
-### Failure modes specific to AIStore
-
-See `references/failure-modes.md` §3 (`f.tell()` on non-seekable
-ObjectFileReader), §4 (`os.path.getsize` on URL paths), §5 (`open()`
-builtin on URL paths), §10 (`MossOut.bck` AttributeError), §16 (MOSS
-GetBatch returns empty content for non-replicated data).
-
-## Workflow B — non-AIStore (lustre-only)
-
-### Required setup
-
-- **All `tarred_audio_filepaths` resolve to local-FS paths** (typically
-  `/lustre/...`).
-- **`AIS_ENDPOINT` UNSET** in cluster env_vars — when present and the
-  blend has any URL paths, `AISBatchLoader` would otherwise be
-  instantiated and try to MOSS-fetch local-FS paths, causing confusing
-  errors. Comment out the env var or use a different cluster_config
-  variant.
-- **`USE_AIS_GET_BATCH=false`** (the default; `--enable-indexes-prefetch`
-  sets it to `true` so use a different launcher invocation, OR pass
-  `--no-enable-indexes-prefetch` if your launcher exposes that, OR call
-  `salm_train.py` directly without the env var set).
-
-### Required code paths
-
-| component | role |
-|---|---|
-| `lhotse.serialization.BuiltinIOBackend` | trivial `open(path, "rb")` for local files |
-| `nemo.collections.common.data.lhotse.indexed_adapters._open_data_path` | falls through to `open()` for paths that don't match `_URL_RE` |
-| `nemo.collections.common.data.lhotse.indexed_adapters.IndexedTarMemberReader` | regular seekable random access into local tars |
-| **NOT used**: `_AISRangeReader`, `AISBatchLoader`, `aistore` SDK, MOSS GetBatch, archpath-based archive member fetch |
-
-### Index building
-
-- Same `submit_build_indexes.py` invocation.
-- Build reads tar files via local `open(path, "rb")` (the
-  `_open_data_path` factory's local branch). No HTTP, no AIS.
-- Faster than the AIStore workflow per file (no network round-trip),
-  but lustre I/O can be the bottleneck with high worker counts.
-
-### Prefetch pipeline
-
-1. **Indexes mirror → local SSD**: same `prefetch_indexes_to_ssd.sh`,
-   but the source is the lustre mirror (no AIS to traverse).
-2. **Manifests prefetch**: not needed (manifests are already on
-   lustre).
-3. **HF cache**: same as AIStore workflow.
-
-### Container requirements
-
-- `aistore` SDK NOT required. Container can be slim.
-- `nvidia-container-cli` still required for the GPU portion (training
-  itself); for the CPU-only index build, the `--bypass-nvidia-hook`
-  flag still applies.
-
-### Failure modes specific to non-AIStore
-
-Mostly the local-FS-only failure modes of §1, §2, §6, §7, §8, §11-§15,
-§17 from `references/failure-modes.md`. The AIS-specific modes (§3-§5,
-§10, §16) don't fire.
-
-## Decision tree
-
-```
-                 [is `tarred_audio_filepaths` a URL?]
-                          /                 \
-                        no                  yes
-                        /                     \
-              [non-AIStore workflow]   [is AIS_ENDPOINT set?]
-                                          /          \
-                                         no          yes
-                                         /            \
-                              [ERROR: blend uses     [AIStore workflow]
-                               URLs but cluster                  \
-                               doesn't expose AIS]      [does MOSS GetBatch
-                                                         work for this data?]
-                                                              /         \
-                                                         yes              no
-                                                          /                \
-                                          [USE_AIS_GET_BATCH=true]   [USE_AIS_GET_BATCH=true
-                                          (default)                    USE_AIS_INDIVIDUAL_GETS=true]
-```
-
-## Common gotchas in mode-switching
-
-- **Same blend across clusters**: a blend with `s3://` paths only works
-  on clusters with `AIS_ENDPOINT` configured. Maintain per-cluster
-  blend variants (`data_blends/<cluster>/...`) when porting.
-- **Lustre mounts identical?** Don't assume — verify with `ls` on the
-  cluster login node before assuming a `/lustre/...` path resolves on a
-  new cluster. NRT and IAD have similar mount roots but disjoint data
-  trees.
-- **`indexes_root` is shared across both workflows**. The `.idx` file
-  format is identical (uint64 offsets + sentinel); the source-data
-  resolution is what differs. You can re-use a mirror across an AIS
-  → lustre migration as long as the blend's data file paths are
-  identical strings.
+| `tarred_audio_filepaths: s3://...`, `ais://...`, or `http(s)://...` | AIStore/remote workflow |
+| `tarred_audio_filepaths: /path/...` or relative filesystem path | filesystem workflow |
+| mixed local and remote paths | remote workflow, because it has the stricter requirements |
+
+`AIS_ENDPOINT` in the environment is necessary for AIStore access, but it is not
+sufficient evidence that the blend uses AIStore.
+
+## Remote AIStore workflow
+
+Required setup:
+
+- `aistore` SDK installed in the build/training container.
+- `AIS_ENDPOINT` exported into the process that reads remote sources.
+- `USE_AIS_GET_BATCH=true` when remote tar/audio should be fetched lazily by
+  minibatch instead of opening every shard eagerly.
+
+Optional setup:
+
+- `USE_AIS_INDIVIDUAL_GETS=true` to bypass the batch endpoint and fetch each
+  object individually. This is slower but useful when the batch endpoint is
+  unavailable or returns empty content for some objects.
+
+Index building:
+
+- The index builder reads remote tar files through byte-range-capable AIStore
+  paths and writes `.idx` sidecars to the configured index mirror.
+- A successful index build proves byte-range access worked for the indexed
+  source paths. It does not prove the batch endpoint will later serve every
+  object successfully.
+
+Runtime data access:
+
+1. Keep manifests/cuts on a local/shared filesystem when random access would be
+   inefficient from remote storage.
+2. Point `data.*.indexes_root` at a persistent index mirror by default.
+3. Use node-local index staging only when direct mirror reads are too slow or
+   metadata-heavy; make the YAML path match the staged destination.
+4. Use manifest prefetch only as a fallback for remote manifest paths that
+   cannot be cached persistently.
+
+## Filesystem-only workflow
+
+Required setup:
+
+- All audio/tar paths resolve through the local filesystem visible in the
+  container/process.
+- AIStore env vars are unset or ignored when no remote paths are present.
+- `USE_AIS_GET_BATCH=false` unless remote sources in a mixed blend require it.
+
+Index building:
+
+- The index builder reads local files directly.
+- Filesystem throughput and metadata behavior determine the best worker count.
+
+Runtime data access:
+
+1. Keep manifests/cuts on a local/shared filesystem.
+2. Point `data.*.indexes_root` at a persistent index mirror.
+3. Stage indexes to node-local SSD only when needed and only with matching YAML
+   paths.
+
+## Common gotchas
+
+- Do not infer workflow from runtime labels alone; inspect the source paths.
+- Verify filesystem mounts inside the runtime/container, not only in the host shell.
+- Reusing an index mirror requires identical source path strings and unchanged
+  source contents.
+- AIStore individual GETs and batch GETs can exercise different backend paths;
+  test the exact access mode used by training.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md b/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
index dbc25c40e6cb..6c206390080b 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/best-practices.md
@@ -1,184 +1,79 @@
-# Best practices — indexed + resumable Lhotse migration
-
-A short, prioritised checklist distilled from the failure-mode catalog and
-real-world adoption pain. Apply these before sweeping any new recipe.
-
-## Tier 1 — non-negotiable
-
-1. **Pin BOTH `seed` and `shard_seed` to fixed integers** when
-   `force_map_dataset: true` and `use_stateful_dataloader: true`. The
-   sampler RNG (`shard_seed`) is checkpointed into `meta.pt` and restored
-   verbatim on resume; if `shard_seed: "randomized"`, each new chunk derives
-   a fresh worker-PID-hashed seed at init that diverges from the saved
-   snapshot. NeMo's auto-overwrite (`dataloader.py`
-   `get_lhotse_sampler_from_config`) papers over this with a warning, but
-   pinning up front gives reviewers an obvious signal of intent.
-
-2. **Same seed across every chunk of a chain.** Lightning re-seeds
-   Python/torch/numpy global RNGs at chunk start using
-   `data.train_ds.seed`. If your launcher rotates this per chunk
-   (FIXED_SEEDS-style — what `train_and_eval.py` did historically), every
-   resume silently breaks bit-exactness in dropout / augmentation /
-   aux-loss permutations. The repo's `train_and_eval.py` now pins a single
-   seed when `--enable-indexes-prefetch` is set; for other launchers, do
-   it manually.
-
-3. **Match `num_workers` AND `world_size` between save and restore.**
-   `torchdata.StatefulDataLoader` enforces this as a hard contract. Any
-   mismatch raises immediately at load. Document the values in your
-   training script header so a re-submission can't accidentally drift.
-
-4. **Build the index mirror once per blend; reuse across experiments.**
-   `submit_build_indexes.py` skips already-indexed files (checks for
-   non-empty `.idx`), so re-runs are cheap. Pin a stable
-   `indexes_root: <workspace>/indexes_mirror` and don't move it.
-
-5. **Set `concurrent_bucketing: false` in `data.train_ds`.** Default is
-   `true`, which spawns a daemon producer thread inside
-   `DynamicBucketingSampler` that races the main thread on
-   `cuts_iter`. The main thread is the one `StatefulDataLoader`
-   checkpoints; the producer is invisible to the snapshot. After resume
-   the producer's pre-fetched cuts are lost and the per-step batch
-   composition silently diverges from the non-resumed run. The
-   throughput cost of the synchronous path is negligible at steady
-   state; the determinism gain is non-negotiable for resumable
-   training. See `failure-modes.md §19`.
-
-## Tier 2 — strongly recommended
-
-5. **Run the bit-exact verification from `MIGRATION_GUIDE.md` §3 before
-   sweeping.** ~10 sec, model-free: take 5 batches → `state_dict` →
-   take 5 more (ground-truth); fresh process loads `state_dict`, takes 5,
-   asserts equal. Catches sampler/bucketer state-dict bugs that schema
-   inspection of `meta.pt` (just confirming the keys exist) won't.
-
-5b. **Consider `force_map_dataset: false` for indexed-only configs at high
-   `world_size` (≥ 16-ish).** The default map-style path over-samples
-   `world_size` batches per step and discards `world_size - 1` — at 32 DP
-   ranks that's 32× redundant sampler/manifest I/O on the main GPU
-   process per step, and the 0909 profiling showed it nearly doubling
-   training step time. The iterable path co-locates the sampler with the
-   dataset inside CPU worker subprocesses and partitions sample indices
-   across `(DP rank × DataLoader worker)` via `LazyShuffledRange(shard_id,
-   num_shards)`. Single partition level, no double-counting; near-`W×`
-   step-time improvement at scale.
-
-   Preconditions before flipping:
-   - Every nested `input_cfg` source must be indexed (no plain
-     `LazyJsonlIterator` / `LazyManifestIterator` in the chain — see
-     `failure-modes.md §21`).
-   - Every `LazyIteratorMultiplexer.seed` is a fixed integer
-     (`shard_seed` typically pins this). `seed='randomized'` raises a
-     loud `ValueError` at iter time under multi-shard partition
-     (§22).
-   - `(world_size, num_workers)` invariant across the chain (§23).
-   - Validation still uses `force_map_dataset: true` (small, finite,
-     no perf benefit from partitioning).
-
-   The 0909 sweep flipped `0909-longform5pct.yaml` as a canary;
-   `force_map_dataset: true` remains the safe default for any config
-   that mixes indexed + non-indexed sources or that you haven't
-   profiled yet.
-
-6. **Pick exactly ONE checkpoint trigger** in
-   `exp_manager.checkpoint_callback_params` — `every_n_train_steps`,
-   `every_n_epochs`, OR `train_time_interval`. Lightning's
-   `ModelCheckpoint.__validate_init_configuration` raises
-   `MisconfigurationException` if more than one is set. External
-   preemption (cluster scheduler kills mid-chunk) doesn't go through
-   NeMo's `max_time_per_run`-based PreemptionCallback, so progress
-   between the last save and the kill is lost — pick whichever trigger
-   matches your chunk's reachable progress: `every_n_train_steps: 50`
-   if chunks reach only ~80-100 steps; `every_n_epochs: 1` if you
-   reliably get full 1000-step epochs; `train_time_interval: "00:30:00"`
-   if you prefer wall-clock semantics.
-
-7. **Test 1-node single-chunk first, then 1-node multi-chunk (resume),
-   then full N-node.** The 1-node smoke isolates dataloader/IO bugs from
-   distributed/EP issues. The multi-chunk-on-1-node test exercises the
-   resume path before scale changes. The repo's
-   `nano-v3-granary1p1-en-1node-resumable.yaml` is a working template.
-
-8. **For multilingual / non-English data on AIStore that fails MOSS
-   GetBatch with "empty content"**, switch to
-   `USE_AIS_INDIVIDUAL_GETS=true` (or
-   `train_and_eval.py --enable-ais-individual-gets`). Slower per batch
-   but bypasses the buggy MOSS path. See `references/aistore-vs-non-aistore.md`.
-
-9. **Keep `.idx` mirror on lustre, prefetch destination on local SSD.**
-   Building indexes writes to lustre (cluster-shared, persistent). The
-   training preamble copies the mirror to `/tmp/idx` on each node's local
-   SSD via `prefetch_indexes_to_ssd.sh` for fast mmap. Don't store the
-   mirror on `/tmp` — it would be lost between jobs.
-
-## Tier 3 — nice to have
-
-10. **Use `--bypass-nvidia-hook`** for clusters whose cpu partition
-    lacks `nvidia-container-cli` (e.g. NRT). The launcher injects
-    `--export=ALL,NVIDIA_VISIBLE_DEVICES=void` so enroot's
-    `98-nvidia.sh` short-circuits instead of failing the container start.
-
-11. **`--exclusive --cpus_per_task=96`** for the index build job. The
-    container's unsquashfs needs the full memory budget on first
-    extraction; without exclusive, the default per-CPU memory allocation
-    can OOM-kill the container before `build_indexes.py` even starts.
-
-12. **`--workers $((cpus - 1))`** for the index ProcessPool, leaving one
-    core for OS/scheduler. Indexing is I/O bound when manifests are on
-    s3, but the tar-header parse is GIL-heavy (so threads serialize) —
-    process pool is the right call. If you OOM, drop workers; with 96
-    workers chewing big s3 manifests we've seen `BrokenProcessPool` on
-    the very large all-asr blend.
-
-13. **Drop AMI from English Granary blends until uncompressed Shar
-    exists.** AMI's Lhotse Shar uses `.jsonl.gz` cuts which can't be
-    indexed; either re-export with `compress_jsonl=False`, or use the
-    `granary1p1-en-resumable.yaml` blend which omits AMI entirely.
-
-14. **Run preambles in parallel.** `train_and_eval.py` now backgrounds
-    each preamble (HF SSD cache / manifest prefetch / index prefetch)
-    with PID capture and `wait`-with-error-propagation. Each script's
-    flock guards cross-rank de-duplication, so backgrounding from each
-    rank is safe.
-
-## What NOT to do
-
-- **Don't skip the bit-exact verification** because "schema looks right".
-  Schema-only verification (presence of `_snapshot/_steps_since_snapshot/
-  _iterator_finished` in `meta.pt`) confirms the StatefulDataLoader is
-  being asked to checkpoint, NOT that the snapshot bytes are restored
-  correctly.
-
-- **Don't pin `aistore` SDK to an old version** "to avoid the MossOut
-  bug" — the lhotse code already handles both shapes via
-  `_moss_attrs`. Use the latest SDK; track future SDK churn with the
-  same defensive normalizer pattern.
-
-- **Don't combine `every_n_train_steps + every_n_epochs +
-  train_time_interval`** in one `checkpoint_callback_params`. Lightning
-  raises `MisconfigurationException` at startup. Pick one trigger.
-
-- **Don't enable `concurrent_bucketing=True` with custom samplers** that
-  spawn non-daemon threads. The built-in `DynamicBucketingSampler` is
-  correct (background thread is `daemon=True`); only matters if you
-  forked it.
-
-- **Don't move `indexes_root` between training and prefetch.** If the
-  YAML says `indexes_root: /tmp/idx` and the prefetch script writes to
-  `/tmp/idx2`, training silently can't find any index, falls back to
-  building on first access (slow).
-
-- **Don't flip to `force_map_dataset: false` without auditing every
-  source in the chain.** A single non-indexed source (plain
-  `LazyJsonlIterator`, `LazyManifestIterator`, compressed Shar that
-  silently fell back, etc.) yields its full content on every rank under
-  iterable mode — silent data duplication that won't show up until you
-  inspect cut-ID coverage across ranks. See `failure-modes.md §21`. When
-  in doubt, keep `force_map_dataset: true`; the over-sample-and-discard
-  dedup works regardless of source type.
-
-- **Don't set `LHOTSE_USE_WORKER_PARTITION` manually.** It's a signal
-  set by `worker_init_fn` to indicate iterable-mode partition is active.
-  Setting it from outside (e.g. in a launcher script or `.env` file)
-  while running map-style mode would re-introduce the under-sampling bug
-  fixed by §20.
+# Best practices - indexed + resumable Lhotse migration
+
+Prioritized checklist for migrating a NeMo config to indexed access and
+checkpointable dataloading.
+
+## Tier 1 - non-negotiable
+
+1. **Pin `seed` and `shard_seed` to fixed integers.** The sampler and model RNG
+   must resume from a stable state. Avoid `"randomized"` for resumable chains.
+
+2. **Use one seed across every chunk of a resumable chain.** Lightning reseeds
+   global RNGs at chunk startup. Rotating the seed breaks bit-exact resume even
+   when dataloader state restores correctly.
+
+3. **Keep `num_workers` and distributed topology invariant.** Changing worker
+   count, world size, or rank/worker assignment invalidates stateful dataloader
+   snapshots and iterable partition state.
+
+4. **Build `.idx` sidecars once per stable source path set.** Reuse a persistent
+   index mirror across experiments. Rebuild only when source contents or path
+   strings change.
+
+5. **Disable concurrent bucketing for resumable training.** Background producer
+   threads can advance iterators outside the checkpointed main-thread state.
+
+## Tier 2 - strongly recommended
+
+6. **Run a bit-exact dataloader resume check before sweeping.** Take a few
+   batches, save dataloader state, take a few more as ground truth, restore in a
+   fresh process, and compare the restored batches.
+
+7. **Enforce `force_map_dataset: false` for training.** Map-style training has
+   too much sampler/manifest overhead. Before launch, confirm every training
+   source is indexed, multiplexer seeds are fixed, and topology is stable; if a
+   source cannot be indexed, report it as a migration blocker instead of
+   silently keeping map-style training.
+
+8. **Use frequent checkpoint triggers.** External termination may not execute a
+   graceful preemption callback. Step- or time-based saves reduce lost progress.
+
+9. **Smoke test in stages.** Run single-node single-chunk, then single-node
+   multi-chunk resume, then the intended full topology.
+
+10. **Keep `.idx` files on a persistent filesystem by default.** Stage to
+    node-local SSD only when direct filesystem reads are proven problematic, and
+    ensure the YAML `indexes_root` matches the staged destination.
+
+11. **Use AIStore batch fetching deliberately.** For remote tar/audio sources,
+    `USE_AIS_GET_BATCH=true` avoids eager remote tar-reader construction. If the
+    batch endpoint fails for a dataset, use `USE_AIS_INDIVIDUAL_GETS=true` as a
+    slower fallback while investigating storage availability.
+
+## Tier 3 - operational hygiene
+
+12. **Tune index-build workers to memory and storage backend.** Many workers can
+    OOM on large manifests or remote tar headers. Reduce workers or split the
+    blend when needed.
+
+13. **Keep optional prefetch steps explicit.** Manifest prefetch, index staging,
+    and model-cache preambles should be visible in the launcher and documented in
+    the report.
+
+14. **Use CPU-safe container settings for CPU-only index builds.** Some container
+    runtimes expect GPU hooks by default; bypass or disable them when the index
+    build runs without GPU access.
+
+## What not to do
+
+- Do not trust `meta.pt` key presence alone as proof of bit-exact resume.
+- Do not combine Lightning's mutually exclusive checkpoint triggers (step, epoch, time interval).
+- Do not point `indexes_root` at a node-local path unless the launcher populates
+  it before every chunk.
+- Do not launch iterable training until every source in the chain has been
+  audited and made partition-compatible.
+- Do not use map-style training to bypass indexing blockers; mark the migration
+  not launch-ready unless the user explicitly approves a temporary exception
+  with the blocker and expected overhead.
+- Do not set `LHOTSE_USE_WORKER_PARTITION` manually; it is an internal signal set
+  by the dataloader worker initialization path.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md b/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
index d17f60d44cdd..f117201a0e16 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/conflict-matrix.md
@@ -1,37 +1,31 @@
-# Conflict matrix — option pairs that don't work together
+# Conflict matrix - indexed + resumable Lhotse
 
 Table format: `A | B | conflict | severity | resolution`.
 
 Severities:
-- **fatal** = auto-patch impossible; requires offline manifest pre-processing
-  or data ingestion. Skill exits non-zero with explanation.
-- **error** = auto-patchable.
-- **warning** = patchable but context-dependent; the skill emits a comment
-  in the patched YAML and a section in the report.
+
+- **fatal**: automatic patching is impossible; data must be preprocessed or the
+  launcher/storage setup must change.
+- **error**: automatic patching is usually safe.
+- **warning**: context-dependent; report clearly.
+- **note**: informational.
 
 | A | B | conflict | severity | resolution |
 |---|---|---|---|---|
-| `data.train_ds.indexed: true` | `extra_fields:` on a `nemo` / `nemo_tarred` / `multimodal_conversation` entry | `LazyNeMoTarredIterator(indexed=True)` raises `RuntimeError` (`nemo_adapters.py:485-487`). Graph-token random access has no stable index. | fatal | Pre-process the manifest offline to materialize the extra fields, drop the `extra_fields` key. |
-| `data.train_ds.indexed: true` | `slice_length:` on a `nemo` / `nemo_tarred` entry | Sliced cuts have no stable index — slicing rewrites the cut sequence. | fatal | Re-shard the audio offline to the target slice length, drop the `slice_length` key. |
-| `data.train_ds.indexed: true` | Lhotse Shar `cuts.*.jsonl.gz` (compressed cuts) | `lhotse/indexing.py:88-110` rejects compressed paths in `indexed_path_kind`. AMI's stock distribution hits this. | fatal | Drop the corpus from the blend, OR re-export the Shar with `compress_jsonl=False`, OR convert to `nemo_tarred` format. |
-| `data.train_ds.indexed: true` | `tarred_audio_filepaths: *.tar.gz` | Compressed tars can't be indexed. | fatal | Re-pack the tars uncompressed. |
-| `data.train_ds.indexed: true` | `pipe:cmd \| cmd2` paths | Pipe commands aren't seekable; `validate_indexed_access` raises `ValueError`. | fatal | Materialize the upstream of the pipe to a real file, then point at that. |
-| `data.train_ds.force_map_dataset: true` | `data.train_ds.force_iterable_dataset: true` | `dataloader.py:278-280` asserts these are mutually exclusive. | error | Keep only `force_map_dataset: true`. |
-| `data.train_ds.force_map_dataset: true` + `data.train_ds.use_stateful_dataloader: true` | `data.train_ds.shard_seed: "randomized"` | Map path doesn't need per-rank seed differentiation; `"randomized"` adds worker-PID-derived seeding that breaks across resume. NeMo's `dataloader.py:556-572` warns + auto-overwrites with `seed`. | error | Set `shard_seed: <int>` (typically equal to `seed`). |
-| `data.train_ds.use_stateful_dataloader: true` | per-chunk seed rotation in launcher | Silent corruption: model RNG (dropout, aux-loss, random-init) diverges across chunks even though sampler state restores correctly. | error | Pin a single seed across the entire chain. `train_and_eval.py:925-952` does this when `--enable-indexes-prefetch` is set. For arbitrary launchers, set the same seed in every chunk's command. |
-| `data.train_ds.use_stateful_dataloader: true` | `num_workers` change between save and restore | Hard error from `torchdata.StatefulDataLoader`. | error | Document `num_workers` in the YAML / launcher header; never change between chunks. |
-| `data.train_ds.use_stateful_dataloader: true` | `world_size` change between save and restore (`num_nodes * devices_per_node`) | Hard error from torchdata. | error | Restart from a converted HuggingFace checkpoint if you need to scale (no resume in that case). |
-| AIStore MOSS GetBatch (`USE_AIS_GET_BATCH=true`, `USE_AIS_INDIVIDUAL_GETS` unset) | non-EN-replicated multilingual data on `s3://FLEURS/...`, `s3://MCV/...`, etc. | MOSS returns 200 + empty body for missing objects. Triggers the empty-content retry path which then crashes (§10 / §16 in failure-modes.md). | warning | Set `USE_AIS_INDIVIDUAL_GETS=true` until the data is replicated to AIS, OR replicate the data, OR switch the blend to lustre tar paths if available. |
-| `data.validation_ds.force_finite: true` | training (`data.train_ds`) | `force_finite` caps the infinite-mux behavior that training requires. | error | `force_finite: true` is a validation-only flag; don't propagate it to `data.train_ds`. |
-| `exp_manager.checkpoint_callback_params.every_n_train_steps: null` | external preemption (`svc-hwinf-cs-sched`, NODE_FAIL, etc.) at < 1 epoch | No mid-epoch save; chunk progress is lost on every preemption. | warning | Add `every_n_train_steps: 50-250` (and/or `train_time_interval: "00:30:00"`). Lightning ORs the triggers. |
-| `exp_manager.max_time_per_run` ≥ SLURM walltime | SLURM SIGKILL during teardown | The internal preemption save never fires; teardown is killed mid-write. | error | Set `max_time_per_run` to `<SLURM walltime - 10min>` (e.g. `00:03:50:00` for a 4h walltime). |
-| `data.train_ds.indexes_root` | `prefetch_indexes_to_ssd.sh` destination | Mismatch → manifests fail to find their `.idx` neighbors at training time. | error | Keep both in sync. The prefetch script's default is `/tmp/idx`; the YAML's `indexes_root` must match. |
-| `submit_build_indexes.py` (no `--bypass-nvidia-hook`) | NRT cpu partition (lacks `nvidia-container-cli`) | enroot's `98-nvidia.sh` hook hard-fails container start. | error | Pass `--bypass-nvidia-hook` for any cluster whose cpu partition lacks `nvidia-container-cli`. |
-| Container `aistore` SDK < 1.17 | AIStore in play | `lhotse_resumable/lhotse/ais/batch_loader.py:75` requires `>=1.17.0`. | error | Pin `aistore>=1.17` in the build/training container preamble; `submit_build_indexes.py:227` does this. |
-| `data.train_ds.seed` | per-chunk seed rotation in launcher | Same as above — silent model-level divergence. | error | Pin `seed` in YAML AND in launcher; both must be invariant across the chain. |
-| `pretrained_llm` change | resume from a chain | `init_from_checkpoint` resharding issues; tokenizer mismatch. | warning | Don't change the LLM mid-chain. Start fresh if you need a different LLM (+ optionally `init_from_checkpoint: <previous_run.ckpt>` for transfer). |
-| `model.aux_loss_coeff > 0` | `model.activation_checkpointing_llm: true` | AC + MoE aux-loss recompute dtype flip (debug-cluster-run §6(16)). `CheckpointError: Recomputed values ... different metadata`. Orthogonal to resumable, but a frequent recipe pitfall. | error | Set `aux_loss_coeff: 0`, OR disable `activation_checkpointing_llm` (perception AC alone is fine). |
-| `data.train_ds.force_map_dataset: false` | `LazyIteratorMultiplexer(seed="randomized")` anywhere in the chain | Under iterable-mode partition all ranks must pick the same source at each multiplex step (else the global weighted distribution drifts). `LazyIteratorMultiplexer.__iter__` raises `ValueError` at iter time. | error | Use a fixed integer seed for every multiplexer in the chain (`shard_seed` is typically the one to pin). |
-| `data.train_ds.force_map_dataset: false` | non-indexed sources in the chain (plain `LazyJsonlIterator`, `LazyManifestIterator`, anything `is_indexed=False`) | Non-indexed sources don't partition — every rank reads them in full, silently duplicating data. The chain's `is_indexed` is `False` when any source is non-indexed, so `_iter_globally_shuffled` won't fire either. | warning | (a) Convert the non-indexed sources to indexed (rebuild with `submit_build_indexes.py`); OR (b) split into separate dataloaders; OR (c) revert to `force_map_dataset: true` for that config. |
-| `data.train_ds.force_map_dataset: false` | resume with different `(world_size, num_workers)` | `LazyShuffledRange.load_state_dict` validates the full topology including `shard_id` and `num_shards`. Mismatch raises `ValueError` loudly. Same contract as map-style StatefulDataLoader, but check fires at the iterator level. | error | Keep `(world_size, num_workers)` invariant across the chain. To scale, restart from a converted HuggingFace checkpoint (no resume). |
-| `data.train_ds.force_map_dataset: false` | `force_iterable_dataset: true` | Redundant but not an error — `dataloader.py:280` resolves to iterable in both cases; only `force_map_dataset=True` overrides. The `assert not (force_map_dataset and force_iterable_dataset)` still holds. | note | Pick one form. `force_map_dataset: false` is sufficient; setting both is just noise. |
+| `data.train_ds.indexed: true` | `extra_fields:` on indexed NeMo entries | Indexed adapters cannot preserve arbitrary runtime field rewrites. | fatal | Preprocess the manifest to materialize fields, then drop `extra_fields`. |
+| `data.train_ds.indexed: true` | `slice_length:` on indexed entries | Slicing changes cut/audio access and has no stable sidecar unless preprocessed. | fatal | Re-shard or preprocess offline, then drop `slice_length`. |
+| `data.train_ds.indexed: true` | compressed JSONL/Shar cuts or compressed tar paths | Compressed streams do not provide stable seekable offsets for sidecars. | fatal | Re-export uncompressed or materialize seekable sources. |
+| `data.train_ds.indexed: true` | `pipe:` paths | Pipes are not seekable. | fatal | Materialize upstream data to files or a seekable backend. |
+| `data.train_ds.force_map_dataset: true` | resumable training launch | Map-style training keeps too much sampler/manifest work on the main process. | error | Set `data.train_ds.force_map_dataset: false` after making every training source indexed and partition-compatible. |
+| `force_map_dataset: true` | `force_iterable_dataset: true` | Dataset mode selection is contradictory. | error | Keep one mode. For training, use `force_map_dataset: false`; for validation/test, keep map-style unless intentionally testing iterable behavior. |
+| `use_stateful_dataloader: true` | per-chunk seed rotation | Model-level RNG diverges across resumed chunks. | error | Pin one seed for the whole chain in YAML and launcher. |
+| `use_stateful_dataloader: true` | `num_workers` changes between chunks | Saved dataloader state is incompatible. | error | Keep worker count invariant or restart without dataloader state. |
+| `use_stateful_dataloader: true` | `world_size` / rank topology changes | Saved iterator and sampler state are topology-sensitive. | error | Keep topology invariant or restart without dataloader state. |
+| `force_map_dataset: false` | any non-indexed source in the chain | Non-indexed sources do not partition and are duplicated across ranks/workers. | fatal | Convert all sources to indexed access or split/remove the non-indexed source. Do not switch to map-style training to bypass this unless the user explicitly approves a temporary exception. |
+| `force_map_dataset: false` | multiplexer seed is `"randomized"` | Ranks/workers may pick different sources at the same multiplex step, so the global mixture weights drift. | error | Use a fixed integer seed for every multiplexer in the chain. |
+| `force_finite: true` | training dataset | Can cap infinite training mixtures unexpectedly. | error | Use finite mode for validation/test only unless intentionally bounded. |
+| Checkpoint cadence absent | external preemption / walltime kill | Chunk progress can be lost without mid-chunk saves. | warning | Add frequent step- or time-based checkpoints. |
+| Node-local `indexes_root` | no prefetch/staging before startup | `.idx` files are missing at runtime. | error | Point to a persistent mirror or stage indexes before every chunk. |
+| AIStore batch mode | objects unavailable through batch endpoint | Batch loader may return empty content or fail collation. | warning | Verify object availability, replicate data, or set `USE_AIS_INDIVIDUAL_GETS=true`. |
+| Container lacks AIStore SDK | AIStore source paths | Remote reads may fall back to the wrong backend or fail. | error | Install a compatible `aistore` SDK in build/training containers. |
+| CPU-only index build | GPU container hook requires GPU runtime | Container startup can fail before index build begins. | warning | Use CPU-safe container settings or bypass GPU hooks. |
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md b/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
index 849929f2881e..8927a805cce8 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/failure-modes.md
@@ -1,461 +1,278 @@
 # Failure-mode catalog
 
-Every failure mode observed during the speechlm-2026h1 migration to indexed
-+ resumable dataloading. Each entry: **signature** (what you grep for in
-logs), **trigger** (the YAML/launcher condition that produces it),
-**fix**, and **see-also** pointers.
+Failure signatures, triggers, and fixes for indexed + resumable Lhotse
+migrations. These are generic patterns; verify exact file names and line numbers
+against the user's checkout before citing them in a report.
 
-## §1 — `.jsonl.gz` AMI shar in blend
+## §1 - Compressed JSONL, Shar cuts, or tar paths
 
-**Signature**: index build fails with
-`ValueError: <ctx> requires uncompressed JSONL or tar data, but got a compressed path: <file>.jsonl.gz`
-from `lhotse/indexing.py:130-135`.
+**Signature**: index build raises a `ValueError` saying the source requires
+uncompressed JSONL or tar data but received a compressed path such as
+`*.jsonl.gz` or `*.tar.gz`.
 
-**Trigger**: blend YAML references AMI's stock distribution (Lhotse Shar
-with `cuts.*.jsonl.gz`).
+**Trigger**: an indexed source points at compressed cuts, manifests, or tar
+files. Sidecar offsets require stable byte positions in seekable files.
 
-**Fix**: drop AMI from the blend until an uncompressed Shar export (or a
-`nemo_tarred` re-export) is available. The repo's
-`data_blends/iad/granary1p1-en-resumable.yaml` does exactly this — see its
-header comment.
+**Fix**: re-export or materialize the source in an uncompressed seekable format.
+For Shar-style data, export cuts as plain JSONL when sidecar indexing is needed.
 
-## §2 — `extra_fields` / `slice_length` on `nemo_tarred` entry
+## §2 - `extra_fields` or `slice_length` on indexed NeMo entries
 
-**Signature**:
-`RuntimeError: LazyNeMoTarredIterator(indexed=True) does not support 'extra_fields' because <ctx>` from `nemo_adapters.py:485-487`,
-or
-`RuntimeError: LazyNeMoIterator(indexed=True) does not support 'extra_fields'` from `nemo_adapters.py:148-152`.
+**Signature**: an indexed NeMo iterator raises that `extra_fields` is not
+supported, or data order diverges after slicing.
 
-**Trigger**: blend entry has `extra_fields:` block (typically attaching
-text-iter / text-sample / graph-token features to a `nemo` /
-`nemo_tarred`) or `slice_length: N`.
+**Trigger**: the source applies runtime field injection or slicing while also
+requesting indexed access.
 
-**Fix**: pre-process the manifest offline to materialize the extra fields
-into the manifest, then drop the `extra_fields` key. For `slice_length`,
-re-shard the audio to the target slice and drop the key.
+**Fix**: preprocess the manifest offline so the indexed source already contains
+all required fields and shard/slice layout. Drop `extra_fields` and
+`slice_length` from the indexed YAML entry.
 
-## §3 — `f.tell()` on AIStore `ObjectFileReader`
+## §3 - Remote object reader is not seekable
 
-**Signature**: `io.UnsupportedOperation: seek` on first read of an
-`ais://` / `s3://` tar source.
+**Signature**: `io.UnsupportedOperation: seek` or `tell` on first read of a
+remote URL source.
 
-**Trigger**: AIStore SDK's `ObjectFileReader` doesn't implement
-`tell()` / `seek()`. The indexer uses `_CountingReader` to accumulate bytes
-manually; if your code path bypasses that, this fires.
+**Trigger**: the code path uses a backend reader that does not implement the
+seek/tell operations required by indexing.
 
-**Fix**: ensure the `aistore` SDK is installed in the container so lhotse
-routes via `AIStoreIOBackend`. The indexer's `create_jsonl_index` /
-`create_tar_index` accumulate bytes via `len(line)` and `_CountingReader`
-in `lhotse/indexing.py`. `submit_build_indexes.py:227` does the SDK install
-preamble.
+**Fix**: ensure the remote-storage SDK is installed and that Lhotse routes the
+path through the intended seekable/range-capable backend. For AIStore, verify
+`aistore` is installed and `AIS_ENDPOINT` is set.
 
-## §4 — `os.path.getsize(s3://…)`
+## §4 - Stdlib filesystem operations on URLs
 
-**Signature**: `FileNotFoundError: [Errno 2] No such file or directory: 's3://...'`
+**Signature**: `FileNotFoundError` from `open("s3://...")` or
+`os.path.getsize("s3://...")`.
 
-**Trigger**: legacy code path computing index file size from disk for an
-`s3://` URL.
+**Trigger**: a URL path reaches code that assumes local filesystem semantics.
 
-**Fix**: `IndexedJsonlReader._load_index` / `IndexedTarMemberReader._load_index`
-now read the size sentinel from the `.idx` file itself for URL paths.
-Confirmed at `NeMo_resumable/nemo/collections/common/data/lhotse/indexed_adapters.py:269-294`
-(uses `np.fromfile` with `<u8` dtype; final entry is the file-size
-sentinel).
+**Fix**: route URL paths through the storage-aware reader and load index metadata
+from the `.idx` file rather than local `os.path` calls.
 
-## §5 — `open(s3://…)` in tar member readers
+## §5 - Too many memory maps for large shard counts
 
-**Signature**: `FileNotFoundError: [Errno 2] No such file or directory: 's3://...'`
-on first audio fetch.
+**Signature**: `OSError: [Errno 12] Cannot allocate memory` or system
+`vm.max_map_count` exhaustion during startup.
 
-**Trigger**: `IndexedTarMemberReader` calling stdlib `open()` instead of
-the AIS-aware reader on a remote tar.
+**Trigger**: one memory map per `.idx` file across a very large number of shards.
 
-**Fix**: `_open_data_path` at `indexed_adapters.py:159-166` returns
-`_AISRangeReader(str(path))` for any path with a `://` scheme. The
-`_AISRangeReader` translates `seek + read` into AIStore HTTP range requests.
+**Fix**: load sidecars into resident arrays or otherwise reduce mmap count. The
+sidecars are usually small enough that resident arrays are acceptable.
 
-## §6 — `np.memmap` exhausts `vm.max_map_count`
+## §6 - Line-delimited JSON with `.json` extension rejected
 
-**Signature**: `OSError: [Errno 12] Cannot allocate memory` during
-training startup, with 80k+ shards.
+**Signature**: index validation rejects a line-delimited JSON manifest with a
+`.json` suffix.
 
-**Trigger**: legacy `np.memmap` per `.idx` file. With
-`vm.max_map_count = 65530` (Linux default), 80k shards × 1 mmap each
-exceeds the limit.
+**Trigger**: extension filtering assumes only `.jsonl` is valid, while some NeMo
+manifests use `.json` for one-record-per-line JSON.
 
-**Fix**: switched to `np.fromfile` (resident array). Indexes are tiny
-(KB-scale per shard), so the memory cost is negligible. Confirmed at
-`indexed_adapters.py:288-294` ("Use np.fromfile (resident memory) rather
-than np.memmap so that NeMo blends with 80k+ shards don't exhaust
-vm.max_map_count").
+**Fix**: accept both `.jsonl` and line-delimited `.json` when the contents are
+newline-separated records.
 
-## §7 — Validation manifest with `.json` extension
+## §7 - Process pool OOM during index build
 
-**Signature**: `ValueError: <ctx> path is not indexable: <file>.json`
-from `validate_indexed_access`.
+**Signature**: `concurrent.futures.process.BrokenProcessPool` after partial
+index-build progress.
 
-**Trigger**: NeMo convention to ship some manifests as `.json` (one JSON
-object per line) rather than `.jsonl`. The first version of
-`indexed_path_kind` rejected `.json`.
+**Trigger**: too many workers parse large manifests or tar headers concurrently,
+exceeding available process memory.
 
-**Fix**: `lhotse/indexing.py:99-107` now accepts both `.jsonl` and
-`.json` since the indexer only relies on newline-separated records.
+**Fix**: reduce worker count, split the blend/source list across multiple index
+runs, or increase available memory.
 
-## §8 — ProcessPool OOM during indexing
+## §8 - GPU container hook runs during CPU-only index build
 
-**Signature**: `concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly`
-in the build-indexes log, often after several minutes of forward progress.
+**Signature**: container startup fails before Python runs, often around a GPU
+runtime hook such as `nvidia-container-cli`.
 
-**Trigger**: 95 workers on a 96-cpu node + huge S3 manifests + Granary 1.1
-audio tars. The forks each load a manifest + tar header into RAM; with
-176 GiB total and 95 workers, peak per-worker RAM crosses ~1.8 GiB and
-the kernel OOM-killer fires.
+**Trigger**: a CPU-only index build uses a container/runtime setup that assumes
+GPU devices are present.
 
-**Fix**: drop to `--workers 48`, or split the blend across multiple
-build-indexes invocations. `submit_build_indexes.py:99-104` defaults
-`cpus_per_task=96`; the auto-effective worker count is `cpus_per_task - 1`
-= 95. Override with `--workers 48`.
+**Fix**: use CPU-safe container settings for index builds, or bypass/disable GPU
+hooks when the runtime has no GPU access.
 
-## §9 — Container `nvidia-container-cli` missing on cpu partition
+## §9 - AIStore SDK response shape changed
 
-**Signature**: enroot's `98-nvidia.sh` hook hard-fails container start;
-sbatch.log shows `nvidia-container-cli: command not found` or similar.
+**Signature**: an AttributeError on fields returned by the AIStore batch API,
+often in an error or empty-content path.
 
-**Trigger**: NRT cluster's `cpu` / `cpu_interactive` / `cpu_datamover`
-partitions lack `nvidia-container-cli`. IAD's cpu partition has it.
+**Trigger**: code assumes one SDK response schema while the installed SDK returns
+another.
 
-**Fix**: pass `--bypass-nvidia-hook` to `submit_build_indexes.py`
-(`:122-129, 240-245`). Sets `--export=ALL,NVIDIA_VISIBLE_DEVICES=void` on
-the sbatch line, which makes enroot's hook short-circuit.
+**Fix**: normalize SDK response attributes at the boundary and use that helper at
+all consumer sites. Avoid raw direct field access in error-handling code.
 
-## §10 — AIStore SDK `MossOut.bck` AttributeError
+## §10 - `shard_seed: "randomized"` with stateful dataloading
 
-**Signature**:
-`AttributeError: 'MossOut' object has no attribute 'bck'` in the empty-
-content retry path of `AISBatchLoader.__call__`. Cascade:
-`Error collating conversations: 'MossOut' object has no attribute 'bck'`,
-then `FallbackDataset received None`, then
-`TypeError: 'NoneType' object is not subscriptable` in
-`salm_automodel.training_step`, then DeepEP's `'unspecified launch failure'`.
+**Signature**: usually silent. Resume is not bit-exact even though the dataloader
+snapshot appears to restore.
 
-**Trigger**: `aistore>=1.20` (we're on 1.23.0) renames the MossIn-shaped
-`info.bck/.provider/.obj_name/.archpath` → MossOut-shaped
-`info.bucket_name/.bucket_provider/.obj_name/.archpath`. Triggered when
-the underlying object is missing on AIS and the SDK returns 200 + empty
-body, kicking the retry path that then crashes on attribute access.
+**Trigger**: randomized shard/sampler seed is re-derived at chunk startup while
+stateful sampler data is loaded from checkpoint.
 
-**Fix**: `_moss_attrs` normalizer at
-`lhotse_resumable/lhotse/ais/batch_loader.py:81` returns a 4-tuple
-`(bck, provider, obj_name, archpath)` for both shapes. Every consumer site
-must use it; raw `info.bck` references are bugs.
+**Fix**: pin `shard_seed` to a fixed integer, typically matching the top-level
+training seed.
 
-**See also**: `agent-debug-workspace/0909-multiling-failures.md` for the
-full causal chain (multilingual Granary 1.1 audio not on iad AIS → empty
-content → retry path crash).
+## §11 - Per-chunk seed rotation in launcher
 
-## §11 — `shard_seed: "randomized"` + `force_map_dataset: true` + `use_stateful_dataloader: true`
+**Signature**: silent model-level divergence across chunk boundaries. Data-order
+state may restore, but dropout, augmentation, and other model/global RNG draws do
+not match a continuous run.
 
-**Signature**: silent — no crash. Each fork re-derives a worker-PID-hashed
-seed at `worker_init_fn` time, but `StatefulDataLoader.load_state_dict`
-overrides the sampler state from the checkpoint. The mismatch produces
-non-bit-exact resume at the data level (within the saved snapshot
-window).
+**Trigger**: the launcher chooses a different seed for each resumable chunk.
 
-**Trigger**: `shard_seed: "randomized"` literal in YAML, paired with
-`force_map_dataset: true` + `use_stateful_dataloader: true`.
+**Fix**: use one invariant seed for the entire resumable chain. If the launcher
+computes seeds from run index, override that behavior for indexed + stateful
+runs.
 
-**Fix**: pin `shard_seed: <int>` (typically equal to `seed`).
-**NeMo's `dataloader.py:556-572` now warns + auto-overwrites** with the
-`seed` integer, so this is a safety net; explicit pinning in YAML keeps
-the rationale visible.
+## §12 - No mid-chunk checkpoint trigger
 
-## §12 — Per-chunk seed rotation in launcher
+**Signature**: only epoch-boundary checkpoints exist; progress after the last
+boundary is lost when a chunk is preempted or reaches walltime.
 
-**Signature**: silent (and worse than §11). On each chunk, Lightning
-calls `pl.seed_everything(run_seed)`, re-seeding Python/numpy/torch global
-RNG with a different value. Dropout, aux-loss, model random-init RNG draws
-diverge across chunks. The data-iteration level is correct (StatefulDataLoader
-wins the seed race for sampler state); the model level is not.
-
-**Trigger**: `train_and_eval.py`'s `FIXED_SEEDS[seed_offset+i]` rotation
-(pre-fix), or any launcher that picks a fresh seed per chunk
-(`seed = randint(...)`, `seed = run_idx`, etc.).
+**Trigger**: checkpoint config relies only on long epoch boundaries or sparse
+validation events.
 
-**Fix**: pin a single seed for the entire chain. `train_and_eval.py:925-952`
-now does this when `--enable-indexes-prefetch` is set:
-`invariant_seed = seed if seed is not None else FIXED_SEEDS[seed_offset]`,
-and all chunks use `invariant_seed`. For arbitrary launchers, grep for
-seed-per-chunk patterns and warn.
+**Fix**: add an appropriate step-based or time-based checkpoint trigger and keep
+resume-required checkpoints from being pruned prematurely.
 
-**See also**: `agent-debug-workspace/0909-longform-failures.md` Cause A
-(the original investigation).
+## §13 - Internal time guard does not catch external termination
 
-## §13 — `every_n_epochs: 1` only, no `every_n_train_steps`
-
-**Signature**: visible — only `step=N.ckpt` (where N is one
-`limit_train_batches`-aligned boundary) on disk after many hours of
-compute, with the rest of the chain producing no new checkpoints.
+**Signature**: the runtime sends SIGTERM/SIGKILL and no final checkpoint is
+written.
 
-**Trigger**: `checkpoint_callback_params.every_n_train_steps: null` AND
-`every_n_epochs: 1`. With `limit_train_batches: 1000`, 1 epoch = 1000
-steps. If chunks get preempted at ~1h before reaching the next 1000-step
-boundary, NO save happens (the preemption callback's `step=N-last.ckpt` is
-the only fallback).
-
-**Fix**: add `every_n_train_steps: 50-250` (and/or
-`train_time_interval: "00:30:00"`). Lightning ORs the triggers, so all
-three can coexist.
-
-**See also**: `agent-debug-workspace/0909-longform-failures.md` Cause B.
-
-## §14 — `max_time_per_run` doesn't fire on external SIGTERM
-
-**Signature**: SLURM SIGTERM kills the job; no extra `step=N-last.ckpt`
-written; chunk progresses through 75–150 steps but loses them all on
-restart.
-
-**Trigger**: any external preemption (`svc-hwinf-cs-sched`, NODE_FAIL, QOS
-preemption, manual cancel). NeMo's `PreemptionCallback` fires only on its
-own internal timer (`max_time_per_run`).
-
-**Fix**: doesn't fix the root issue; mitigated by frequent step/time-based
-saves (§13). Set `max_time_per_run` to `<SLURM walltime - 10min>` to keep
-the internal-timer save before SLURM SIGKILLs the teardown.
-
-## §15 — `num_workers` mismatch on resume
-
-**Signature**: torchdata `StatefulDataLoader` raises a hard error at
-`load_state_dict` time, complaining the snapshot has different
-`num_workers`.
-
-**Trigger**: chain config changes `num_workers` between chunks (e.g. saved
-under `num_workers: 4`, restored with `num_workers: 8`).
-
-**Fix**: keep `num_workers` invariant across the chain. Same rule for
-`world_size` (= `num_nodes * devices_per_node`).
-
-## §16 — AIS MOSS GetBatch returns empty content for non-replicated data
-
-**Signature**: `_inject_data_into_manifest` retries with empty content; on
-old SDK shape (`info.bck`) crashes with §10 AttributeError. With the §10
-patch in place: `Error collating conversations: <object>/<archpath> from
-bucket <provider>://<bck> returned empty content` → `FallbackDataset
-received None` → `TypeError: 'NoneType' object is not subscriptable`.
-
-**Trigger**: data path is `s3://FLEURS/tarred/<lang>/...`,
-`s3://MCV/MCV4/.../<lang>/...`, etc., and the cluster's AIS doesn't have
-that data replicated. Confirmed for non-EN multilingual on IAD AIS as of
-2026-05-09.
-
-**Fix (workaround)**: set `USE_AIS_INDIVIDUAL_GETS=true` to bypass MOSS
-GetBatch and use per-object `Object.get_reader(archive_config=...).read_all()`
-(slower but works). `lhotse_resumable/lhotse/ais/batch_loader.py` implements
-this via the `force_individual=True` ctor arg.
-
-**Fix (proper)**: replicate the missing data to AIS. Quick check from
-inside an iad container:
-```python
-from aistore import Client
-import os
-c = Client(os.environ["AIS_ENDPOINT"])
-for url in ["s3://FLEURS/tarred/bg/audio_0.tar"]:
-    try:
-        print(url, c.get_object_from_url(url).head_v2().size)
-    except Exception as e:
-        print(url, "MISSING:", e)
-```
-
-**See also**: `agent-debug-workspace/0909-multiling-failures.md`.
-
-### Open investigation
-
-**Why does MOSS GetBatch return empty content for non-EN multilingual
-data?** Most likely answer: data not replicated to AIS. But worth
-confirming via the head_v2 probe above before adopting
-`USE_AIS_INDIVIDUAL_GETS` as the permanent workaround. If the data IS on
-AIS but unreadable for some other reason, a different fix is needed (and
-the lhotse fallback would benefit from raising an explicit
-`AISBatchLoaderError: object missing on AIS` instead of letting the
-empty-content path lead to a `TypeError` 6 frames down).
-
-## §17 — `indexes_root` mismatch between training YAML and prefetch script
-
-**Signature**: training startup fails with
-`FileNotFoundError: <path>/<manifest>.idx` or, in the indexed adapter,
-`ValueError: ... .idx file not found ...` from `IndexedJsonlReader._load_index`.
-
-**Trigger**: `data.train_ds.indexes_root: /tmp/idx` in YAML but
-`prefetch_indexes_to_ssd.sh` writes to `/scratch/idx`, or vice versa.
-
-**Fix**: keep both in sync. In `submit_build_indexes.py` the mirror
-defaults to `<workspace>/indexes_mirror/`; the prefetch script then pulls
-onto each node's `/tmp/idx` (the default is `/tmp/idx` per
-`prefetch_indexes_to_ssd.sh`). The training YAML's `indexes_root` must
-match the prefetch destination.
-
-## §19 — `concurrent_bucketing: true` (default) breaks resume bit-exactness
-
-**Signature**: silent. Loss curves and per-sample order across resume
-boundaries diverge from a single-run reference; no exception fires.
-Spot-check by saving `state_dict` mid-run, restoring in a fresh
-process, and asserting batches 0..K are bit-identical (the
-`MIGRATION_GUIDE.md` §3 recipe). Without the fix, you'll see byte-level
-mismatches starting from the very first restored batch.
-
-**Trigger**: any resumable training run with `force_map_dataset: true`
-and `use_stateful_dataloader: true` but `concurrent_bucketing` left at
-its default `True`.
-
-**Cause**: `DynamicBucketingSampler` spawns a daemon producer thread
-(`lhotse_resumable/lhotse/dataset/sampling/dynamic_bucketing.py:924-944`)
-that pre-pulls cuts from `self.cuts_iter` into per-bucket queues. The
-main thread is the one `StatefulDataLoader` checkpoints; the producer
-operates concurrently. At `state_dict` time, the saved cursor reflects
-the main thread's position, NOT the producer's pre-fetched cuts. On
-resume the producer is gone; its pre-fetched cuts are lost. The
-bucketing decisions and per-step batch composition diverge from the
-non-resumed run. As a side effect the same config is also not
-bit-reproducible between two fresh runs (producer scheduling is
-OS-dependent).
-
-**Fix**: set `concurrent_bucketing: false` in `data.train_ds`. NeMo
-falls through to the synchronous `_collect_cuts_in_buckets` path
-(same file, `:954-965`) which advances the iterator only from the
-main thread. Slight throughput hit during bucket warm-up; negligible
-in steady state since the bucket buffer is normally well-stocked.
-
-**Cross-refs**: `option-reference.md` `data.train_ds.concurrent_bucketing`
-row; `best-practices.md` Tier 1.
-
----
-
-## §20 — Iterable mode (`force_map_dataset: false`) silent under-sampling when partition signal missing
-
-**Signature**: silent. Step time looks normal but training runs through far
-fewer data points than expected; loss curves are wrong (each rank ends up
-training on a sliver of its already-sliced shard). Inspect with: take a
-fresh process under torchrun (so RANK/WORLD_SIZE are set), construct a
-`LazyIndexedManifestIterator(...)` directly without going through NeMo's
-dataloader (or with `worker_init_fn` somehow not running), and assert
-`len(list(iter(it))) == n`. Pre-fix this returned `n / world_size`.
-
-**Trigger**: previously (before the env-var signal was added), `LazyIndexedManifestIterator.__iter__`
-called `get_worker_partition()` which read RANK/WORLD_SIZE directly. Under
-torchrun, those env vars are set in the main process even in map-style mode
-— so the iterator applied partition even though the sampler was about to
-over-sample-and-discard, causing 1/world_size² effective coverage per rank.
-
-**Fix**: `worker_init_fn` now sets `LHOTSE_USE_WORKER_PARTITION=1`, and
-`get_worker_partition()` returns the trivial `(0, 1)` partition when that
-flag is absent. Map-style mode never calls `worker_init_fn`, so the flag
-stays unset and partition is bypassed. For iterable mode the NeMo dataloader
-passes `worker_init_fn` to the DataLoader (workers `num_workers>0`) or calls
-it eagerly via `_maybe_init_main_process_for_iterable()` (`num_workers=0`).
-
-**Cross-refs**: `lhotse_resumable/lhotse/dataset/dataloading.py:22` (constant
-definition), `:82` (set in `worker_init_fn`), `:139-170` (`get_worker_partition`
-checks the flag, returns `(0, 1)` if unset);
-`lhotse_resumable/test/test_partition.py::test_map_style_path_yields_all_items_under_torchrun`
-pins the regression.
-
----
-
-## §21 — Iterable mode with non-indexed source in the chain → silent duplication
-
-**Signature**: silent. Each rank reads the non-indexed source(s) in full.
-Inspect with the bit-exact verification recipe (`MIGRATION_GUIDE.md §3`) on
-a config containing a mixed-indexed chain — items from the non-indexed
-source(s) show up on every rank.
-
-**Trigger**: `force_map_dataset: false` plus a `LazyIteratorChain` mixing
-`LazyIndexedManifestIterator` (indexed) with `LazyJsonlIterator` /
-`LazyManifestIterator` (non-indexed). The chain's `is_indexed` is `False`
-when any source is non-indexed, so the chain falls back to
-`_iter_sequential` which delegates to each source's `__iter__`. Indexed
-sources partition themselves; non-indexed ones don't.
-
-**Fix**: either (a) convert the non-indexed sources to indexed via
-`submit_build_indexes.py`; (b) split the non-indexed sources into a separate
-dataloader; (c) revert to `force_map_dataset: true` for this config — its
-over-sample-and-discard dedup works regardless of source type.
-
-**Cross-refs**: `lhotse_resumable/test/test_partition.py::test_chain_mixed_indexed_non_indexed_only_indexed_partitions`
-pins the documented behaviour.
-
----
-
-## §22 — Iterable mode + `LazyIteratorMultiplexer(seed="randomized")`
-
-**Signature**: loud `ValueError: LazyIteratorMultiplexer cannot use
-seed='randomized' under multi-shard (DP rank x DataLoader worker)
-iteration: each shard would draw a different RNG state and pick a different
-source at the same step, causing the global weighted source distribution to
-drift across ranks. Use a fixed integer seed.` from
-`lhotse_resumable/lhotse/lazy.py:960-970`.
-
-**Trigger**: `force_map_dataset: false` and a multiplexer somewhere in the
-iteration graph has `seed='randomized'` (or unset and inheriting the
-default randomized seed propagation).
-
-**Fix**: pin the multiplexer's `seed` (or the top-level `shard_seed` that
-flows in) to a fixed integer. Map-style mode is unaffected since partition
-collapses to `(0, 1)` and the assertion never fires.
-
-**Cross-refs**: `lhotse_resumable/test/test_partition.py::test_multiplexer_rejects_randomized_seed_under_multishard`
-and `test_multiplexer_allows_randomized_seed_single_shard`.
-
----
-
-## §23 — Iterable mode resume topology mismatch
-
-**Signature**: loud `ValueError: LazyShuffledRange state mismatch: expected
-n=…, seed=…, shard_id=…, num_shards=…; got … Resuming with a different
-DP/worker topology is not supported — drop dataloader state if the topology
-changed.` from `lhotse_resumable/lhotse/indexing.py:507-540`. For chains
-under global shuffle: `ValueError: LazyIteratorChain global-shuffle
-partition mismatch on resume: ...`.
+**Trigger**: external cancellation, node failure, preemption, or walltime signal
+bypasses the framework's graceful preemption callback.
 
-**Trigger**: a chunk saved with `(world_size=W1, num_workers=NW1)` is
-restored under `(world_size=W2, num_workers=NW2)` where
-`W1 * NW1 != W2 * NW2` or the rank/worker_id assignment differs. Common
-sources: launcher changed `--num-nodes` or `--num-workers` between chunks,
-or elastic-cluster behaviour silently re-shuffled ranks.
+**Fix**: leave a walltime buffer for graceful stops and rely on frequent
+mid-chunk checkpoints as the primary mitigation.
 
-**Fix**: keep `(world_size, num_workers)` invariant across the chain — same
-hard contract as map-style `StatefulDataLoader` (which raises analogously).
-If you must scale, restart from a converted HuggingFace checkpoint (no
-resume of dataloader state).
+## §14 - Worker or world-size mismatch on resume
 
-**Cross-refs**: `lhotse_resumable/test/test_partition.py::test_chain_globally_shuffled_topology_mismatch_on_resume`
-and `test_indexed_manifest_iterator_partition_resume_topology_mismatch_raises`.
+**Signature**: `StatefulDataLoader` or indexed iterator state raises a mismatch
+error during `load_state_dict`, or restored data order is invalid.
 
----
+**Trigger**: chunk restores with different `num_workers`, world size, or
+rank/worker topology than the chunk that saved the checkpoint.
 
-## §18 — `prefetch_indexes.py` PYTHONPATH
+**Fix**: keep topology invariant for a resumable chain. To change topology,
+restart from model weights without restoring dataloader state.
 
-**Signature**: `ImportError: cannot import name 'create_jsonl_index'` or
-`ModuleNotFoundError: No module named 'lhotse.indexing'` — the
-container's stock `lhotse` lacks the resumable extensions.
+## §15 - AIStore batch endpoint returns empty content
 
-**Trigger**: prefetch / build_indexes preamble doesn't prepend
-`lhotse_resumable/` and `NeMo_resumable/` to PYTHONPATH.
+**Signature**: batch collation receives empty content for one or more requested
+objects, often followed by a downstream `NoneType` or collation error.
 
-**Fix**: `submit_build_indexes.py:225` does
-`export PYTHONPATH={lhotse_remote}:{code_dir}:$PYTHONPATH` before invoking
-`build_indexes.py`. Arbitrary launchers must do the same.
+**Trigger**: object is not available through the batch endpoint, credentials are
+wrong, or batch and individual-object paths exercise different backend state.
 
----
+**Fix**: verify object availability through the exact access mode used by
+training. As a workaround, set `USE_AIS_INDIVIDUAL_GETS=true` and investigate
+backend replication/permission issues separately.
 
-## Cascading symptoms (NOT root causes)
+## §16 - `indexes_root` points at missing node-local storage
 
-Distributed failures cascade — one bad rank's exception triggers a NCCL
-timeout 30 min later that kills the rest. When the loud error is one of:
+**Signature**: `FileNotFoundError` or `.idx file not found` from an indexed
+reader at startup.
 
-- `EPException what(): 'unspecified launch failure'` at `deep_ep.cpp:155`
-- `DeepEP timeout check failed: rank=X, thread=Y, value=…`
-- `Watchdog caught collective operation timeout: WorkNCCL(...)`
+**Trigger**: YAML points at a node-local path such as `/tmp/idx`, but the launcher
+does not stage sidecars there before every chunk; or the staging destination does
+not match YAML.
 
-…look upstream for the Python traceback that fired first. The DeepEP /
-NCCL chatter is cascade. The 0909-multiling chains had this exact
-pattern: `TypeError: 'NoneType' object is not subscriptable` (origin) →
-DeepEP `'unspecified launch failure'` (cascade).
+**Fix**: use a persistent shared mirror by default. If staging to node-local SSD,
+ensure the preamble runs before training in every chunk and the YAML path matches
+that destination exactly.
+
+## §17 - Concurrent bucketing breaks bit-exact resume
+
+**Signature**: silent data-order divergence across resume boundaries.
+
+**Trigger**: a background bucketing producer advances the source iterator outside
+the checkpointed main-thread state.
+
+**Fix**: set `concurrent_bucketing: false` for resumable training so only the
+checkpointed path advances the iterator.
+
+## §18 - Iterable mode partitions when partition signal is missing or wrong
+
+**Signature**: silent under-sampling or over-partitioning under distributed
+environment variables.
+
+**Trigger**: indexed iterators read rank/world environment directly instead of
+using a dataloader-worker partition signal.
+
+**Fix**: ensure partitioning is activated only by the intended worker init path.
+Map-style mode should see the trivial `(0, 1)` partition.
+
+## §19 - Iterable mode with non-indexed source in the chain
+
+**Signature**: non-indexed sources appear on every rank/worker while indexed
+sources are partitioned.
+
+**Trigger**: `force_map_dataset: false` with a chain that mixes indexed and
+non-indexed iterators.
+
+**Fix**: convert every source in the iterable chain to indexed access, or split
+or remove the non-indexed sources before launching training. Do not switch to
+map-style training to bypass this unless the user explicitly approves a
+temporary exception with the expected overhead.
+
+## §20 - Iterable mode with randomized multiplexer seed
+
+**Signature**: loud `ValueError` from the multiplexer, or silent source-weight
+drift if no guard exists.
+
+**Trigger**: each shard draws a different multiplexer RNG state and chooses a
+different source at the same logical step.
+
+**Fix**: pin multiplexer seed, usually through the top-level `shard_seed`.
+
+## §21 - Iterable resume topology mismatch
+
+**Signature**: indexed range or chain state reports `shard_id` / `num_shards` /
+`world_size` mismatch on restore.
+
+**Trigger**: a checkpoint saved under one distributed-worker topology is restored
+under another.
+
+**Fix**: keep `(world_size, num_workers)` invariant. To scale differently,
+restart without dataloader state.
+
+## §22 - Training left in map-style mode
+
+**Signature**: long startup or step-time overhead from repeated sampler/manifest
+work, especially at larger world sizes.
+
+**Trigger**: migrated training YAML keeps `data.train_ds.force_map_dataset: true`
+instead of enforcing iterable partitioning.
+
+**Fix**: set `data.train_ds.force_map_dataset: false` and make every source in
+the training iteration graph indexed and partition-compatible. If a source cannot
+yet be indexed, mark the migration not launch-ready unless the user explicitly
+approves a temporary map-style exception with the specific blocker and expected
+overhead.
+
+## §23 - Build/prefetch tool imports stock Lhotse/NeMo
+
+**Signature**: `ModuleNotFoundError`, missing `lhotse.indexing`, or import errors
+for indexed/resumable symbols.
+
+**Trigger**: build-index or prefetch command does not place the modified NeMo and
+Lhotse checkouts before stock packages on `PYTHONPATH`.
+
+**Fix**: set `PYTHONPATH` or install the correct packages so helper scripts and
+training use the same indexed/resumable implementation.
+
+## §24 - Distributed backend errors hide an earlier Python exception
+
+**Signature**: NCCL/watchdog/collective timeout or launcher-level distributed
+failure appears after one rank already logged a Python traceback.
+
+**Trigger**: one rank fails during data loading or collation; other ranks block
+in distributed work until the backend times out.
+
+**Fix**: inspect logs before the distributed timeout and identify the first
+Python exception. Treat later backend chatter as a cascade unless it is the first
+error in time.
diff --git a/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md b/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
index 8620793dbf9e..aee3903a2b02 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/references/option-reference.md
@@ -1,111 +1,92 @@
-# Option reference — every YAML/launcher field that interacts with the resumable path
+# Option reference - indexed + resumable Lhotse migration
 
-Field-by-field exhaustive reference. Required values, rationale, source code
-pointer, see-also link to MIGRATION_GUIDE.md and (when relevant) to the
-0909-debug docs that motivated the field.
+Field-by-field reference for YAML and launcher settings that interact with
+indexed access, `StatefulDataLoader`, distributed topology, and storage backend.
+Line numbers in local code may drift; verify against the checkout in front of
+you when producing a report.
 
-## `data.train_ds` — required for the indexed + resumable path
+## `data.train_ds`
 
 | field | required value | purpose | see also |
 |---|---|---|---|
-| `indexed` | `true` | Routes every nested `input_cfg` source to its indexed adapter (`LazyNeMoTarredIterator(indexed=True)`, `IndexedJsonlReader`, etc.). Without this flag, the streaming/replay path is used. Defined in `LhotseDataLoadingConfig` (`NeMo_resumable/nemo/collections/common/data/lhotse/dataloader.py:261`). | MIGRATION_GUIDE.md "Step 2 — Flip two flags" |
-| `use_stateful_dataloader` | `true` | Swaps PyTorch `DataLoader` → `torchdata.StatefulDataLoader` so iterator state is checkpointed in `meta.pt` under `DataModule.train_dataloader` (3 keys: `_snapshot`, `_steps_since_snapshot`, `_iterator_finished`). Verified via `inspect_meta.py` against `step=2000.ckpt` / `step=3000.ckpt` / `step=N-last.ckpt` (see `agent-debug-workspace/nano-v3-1node-resumable-tests.md`). | `dataloader.py:272`, MIGRATION_GUIDE.md "Step 2" |
-| `force_map_dataset` | `true` (safe default) **OR** `false` (optimization for indexed-only configs at high `world_size`) | Two viable modes. **`true`**: sampler runs in the main GPU process; cross-rank dedup is over-sample-and-discard inside `DynamicBucketingSampler` (sampler generates `world_size` batches per step, picks `batches[rank]`, discards the rest). Works for any source type. Costs `W×` redundant sampler/manifest reads per step. **`false`**: sampler runs co-located with the dataset inside CPU worker subprocesses (`IterableDatasetWrapper`); sample indices are partitioned across `(DP rank × DataLoader worker)` via `LazyShuffledRange(shard_id, num_shards)`. Eliminates the `W×` redundant work — near-`W×` step-time improvement at scale. **Requires every source to be indexed** (lhotse-indexed JSONL, nemo_tarred with indexed mode, etc.); non-indexed sources mixed into the chain are NOT deduplicated and may be silently duplicated across ranks. The partition is gated by the `LHOTSE_USE_WORKER_PARTITION` env var that `worker_init_fn` sets (and `dataloader.py:_maybe_init_main_process_for_iterable` sets eagerly for the `num_workers=0` case). | `dataloader.py:247-279`, `lhotse_resumable/lhotse/indexing.py:396-571` (`LazyShuffledRange` with `(shard_id, num_shards)`; constructor L423, `state_dict` L497, `load_state_dict` L507 validates topology), `lhotse_resumable/lhotse/lazy.py:548+` (`LazyIndexedManifestIterator.__iter__` at L606), `failure-modes.md §20-§23` |
-| `indexes_root` | local SSD path (e.g. `/tmp/idx`) matching `prefetch_indexes.py` destination | Where the prefetched `.idx` mirror is read from at training time. Mirror tree preserves the data-file paths (`<indexes_root>/lustre/...` mirroring the blend's lustre paths). Resolved by `lhotse.indexing.index_file_path(data_path, indexes_root=...)` (canonical), at `lhotse_resumable/lhotse/indexing.py`. **Must match the prefetch script's destination**, otherwise manifests fail to find their `.idx` neighbors at training time. | MIGRATION_GUIDE.md "keep indexes on a separate fast disk" |
-| `seed` | a fixed integer, **invariant across chunks** | Controls Python/numpy/torch global RNG via `pl.seed_everything(seed)` at chunk start. **MUST NOT change on resume**, otherwise dropout / aux-loss / random-init diverge across chunks even though `StatefulDataLoader.load_state_dict` restores sampler state correctly. The 0909 longform chains (see `agent-debug-workspace/0909-longform-failures.md`) hit this exact silent-corruption bug because `train_and_eval.py` rotated `FIXED_SEEDS[seed_offset+i]` per chunk. Fixed in `train_and_eval.py:925-952` — when `--enable-indexes-prefetch` is set, all chunks use the same seed. | MIGRATION_GUIDE.md "Operational constraints" §1, `0909-longform-failures.md` Cause A |
-| `shard_seed` | a fixed integer (NOT `"randomized"`) under either `force_map_dataset` value | Sampler RNG for `DynamicBucketingSampler`. **Map path**: cross-rank dedup is by index slicing (`rank=global_rank, world_size=world_size` at `dataloader.py:680-681`); per-rank seed differentiation is unneeded, and `"randomized"` adds worker-PID-derived seeding that breaks across resume boundaries. NeMo's `dataloader.py:556-572` auto-overwrites `shard_seed: "randomized"` → `shard_seed: <seed>` with a warning when `force_map_dataset + use_stateful_dataloader` are both true. **Iterable path** (`force_map_dataset: false`): the multiplexer inside the sampler graph (`LazyIteratorMultiplexer`) requires all DP ranks to pick the same source at each multiplex step so the global weighted source distribution stays coherent. `seed='randomized'` would derive a different per-(rank, worker) seed and break this — `LazyIteratorMultiplexer.__iter__` (`lhotse_resumable/lhotse/lazy.py:960-970`) raises `ValueError` if `seed='randomized'` under multi-shard partition. Either mode: pin `shard_seed: <int>` explicitly in YAML. | `0909-summary.md` R2, `dataloader.py:543-572`, `failure-modes.md §22` |
-| `num_workers` | match between save and restore | `StatefulDataLoader` hard requirement: changing `num_workers` between save and restore raises a hard error from torchdata. Document the value in the YAML / launcher header. | MIGRATION_GUIDE.md "Operational constraints" §1 |
-| `concurrent_bucketing` | **`false`** when `force_map_dataset + use_stateful_dataloader` are both true | The default (`true`) spawns a `daemon=True` producer thread inside `DynamicBucketingSampler` (`lhotse_resumable/lhotse/dataset/sampling/dynamic_bucketing.py:924-944`) that pre-pulls cuts from the source iterator and fills per-bucket queues. The main thread (which `StatefulDataLoader` checkpoints) and the producer thread BOTH advance `self.cuts_iter`, so the cursor saved at `state_dict` time does NOT reflect the cuts the producer has already pre-fetched. On resume, the next-cut cursor is correct from the main thread's view but the producer's pre-fetched cuts are gone, so the bucketing/order across resume boundaries is nondeterministic. Also breaks single-run bit-exact reproducibility between two runs of the same config because the producer's scheduling is OS-thread-dependent. Set `concurrent_bucketing: false` in `data.train_ds` for any resumable run. | `lhotse_resumable/lhotse/dataset/sampling/dynamic_bucketing.py:924-944`, `failure-modes.md §<new>`, observed in `0909-multiling-*` (2026-05-11) |
-| `force_iterable_dataset` | unset (or `false`) | Mutually exclusive with `force_map_dataset: true`. `dataloader.py:278-280` asserts `not (force_map_dataset and force_iterable_dataset)`. | `dataloader.py:278-280` |
-| `force_finite` | unset / `false` (training only) | Setting this to `true` would cap the infinite-mux behavior that training requires. Only validation_ds needs `force_finite: true`. | MIGRATION_GUIDE.md "Operational constraints" §4 |
-| `extra_fields` (on any nested `nemo` / `nemo_tarred` / `multimodal_conversation`) | unset | `LazyNeMoTarredIterator(indexed=True)` raises `RuntimeError` if `extra_fields` is set (`nemo_adapters.py:485-487`: "LazyNeMoTarredIterator(indexed=True) does not support 'extra_fields'"). Same constraint on `LazyNeMoIterator` at `nemo_adapters.py:148-152`. Pre-process the manifest offline. | `nemo_adapters.py:148-152, 485-487` |
-| `slice_length` (on any nested `nemo` / `nemo_tarred`) | unset | Slicing rewrites cuts in a way that has no stable index. The dataloader still threads `slice_length` through (`dataloader.py:253-256`, `cutset.py:413, 436, 662, 678, 693, 717, 1506, 1551`), but the indexed reader does not honor it. Pre-process offline if needed. | MIGRATION_GUIDE.md "Prerequisites" §3 |
-| compressed `.jsonl.gz` / `.tar.gz` paths | reject | `lhotse.indexing.indexed_path_kind` returns `None` for any path matching `_is_compressed_path` (`lhotse_resumable/lhotse/indexing.py:88-110`); `validate_indexed_access` raises `ValueError("...requires uncompressed JSONL or tar...")`. Re-extract or re-export with `compress_jsonl=False` for Shar. | MIGRATION_GUIDE.md "Prerequisites" §1, `lhotse/indexing.py:88-110, 130-135` |
-| `pipe:` paths (`pipe:cmd \| cmd`) | reject | Pipe commands aren't seekable. `validate_indexed_access` raises `ValueError("...requires seekable data sources...")`. | `lhotse/indexing.py:126-128` |
-| `.json` extension on a JSONL manifest | accepted | NeMo ships many ASR/SLM manifests as `*.json` (one JSON object per line). `lhotse/indexing.py:99-107` accepts both `.json` and `.jsonl` since the indexer only relies on newline-separated records. (Pretty-printed multi-line JSON would produce a bogus index, but that's not a supported NeMo manifest layout.) | `lhotse/indexing.py:99-107` |
-
-## Iterable-mode partition — only when `force_map_dataset: false`
-
-These concerns only apply when you've opted into the iterable path for
-indexed sources. Skip this whole section if you've kept the default
-`force_map_dataset: true`.
-
-| concern | requirement | purpose | see also |
-|---|---|---|---|
-| `LHOTSE_USE_WORKER_PARTITION` env var | set automatically by `worker_init_fn`; never set manually | Signals to `get_worker_partition()` that worker-level partition is active. In iterable mode NeMo passes `worker_init_fn` to the DataLoader (workers `num_workers>0`) or calls it eagerly in `_maybe_init_main_process_for_iterable()` (`num_workers=0`). Map-style mode never calls `worker_init_fn`, so the signal stays unset and partition collapses to `(0, 1)` — this is what keeps the map-style path correct under torchrun (where RANK/WORLD_SIZE are already in env). | `lhotse_resumable/lhotse/dataset/dataloading.py:22` (constant), L82 (set in `worker_init_fn`), L139-170 (`get_worker_partition`), `failure-modes.md §20` |
-| All sources in the iteration graph are indexed | required | The partition is implemented in `LazyShuffledRange` and reaches `LazyIndexedManifestIterator` and `LazyIteratorChain._iter_globally_shuffled`. Non-indexed sources (plain `LazyJsonlIterator`, `LazyManifestIterator`) do NOT partition; they yield all items on every rank. If the chain mixes indexed + non-indexed sources, the non-indexed parts are duplicated across ranks — silently. Inspect every nested input_cfg entry to confirm it lands on an indexed adapter (`indexed: true` cascades, but compressed paths / `pipe:` paths / certain blends fall back to non-indexed). | `lhotse_resumable/lhotse/lazy.py` (`LazyShuffler` / `LazyMapper` / `LazyFilter` / `LazyRepeater` all delegate to the source's `__iter__`, so partition propagates), `failure-modes.md §21` |
-| `LazyIteratorMultiplexer.seed` | fixed integer (NOT `"randomized"`) | Under multi-shard partition, all ranks must pick the same source at each step (else the global weighted source distribution drifts across ranks). The multiplexer asserts this at iter time. Map-style mode is unaffected (partition is `(0, 1)` so the assertion never fires). | `lhotse_resumable/lhotse/lazy.py:960-970`, `failure-modes.md §22` |
-| Resume topology (DP rank × num_workers) | invariant between save and restore | `LazyShuffledRange.load_state_dict` validates `(n, seed, shard_id, num_shards)`; `LazyIteratorChain._iter_globally_shuffled` validates `(shard_id, num_shards)` against saved values. Topology mismatch raises a loud `ValueError`. The same hard contract as map-style `StatefulDataLoader` — but check fires earlier (at iterator level) and includes the worker dimension. | `lhotse_resumable/lhotse/indexing.py:497-540` (`LazyShuffledRange.state_dict` / `load_state_dict`), `failure-modes.md §23` |
-| `num_workers` | invariant; same as map-style | StatefulDataLoader contract. Additionally: `num_shards = world_size * num_workers`, so `num_workers` is part of the partition identity. Changing it would force a different shard assignment per rank. | MIGRATION_GUIDE.md "Operational constraints" §1 |
-| Mixed indexed/non-indexed chain | warn | Non-indexed sources in the chain are duplicated across ranks (see "All sources indexed" above). Either move them to a separate dataloader, convert them to indexed format, or revert to `force_map_dataset: true` for that config. | `failure-modes.md §21` |
-
-## `data.validation_ds` — finite map access required
-
-| field | required value | purpose | see also |
-|---|---|---|---|
-| `indexed` | `true` (inherited from train_ds OR per-validation set) | Same as `data.train_ds.indexed`. | MIGRATION_GUIDE.md Step 2 |
-| `force_map_dataset` | `true` | Map-style finite access. | MIGRATION_GUIDE.md Step 2 |
-| `force_finite` | `true` | **Caps the infinite-mux behavior that training uses**. Without this, validation loops forever (the multiplexer never raises StopIteration). MIGRATION_GUIDE.md "Operational constraints" §4 calls this out explicitly. | MIGRATION_GUIDE.md "Operational constraints" §4 |
-| `use_stateful_dataloader` | `false` (or `true`, doesn't matter) | Validation never resumes from mid-eval; eval is run-to-completion. Either value works. | — |
-| `indexes_root` | same path as train_ds | Same — must match the prefetch destination. | — |
-| `seed` / `shard_seed` | same fixed integers as train_ds (or any fixed value) | Determinism for eval. Doesn't need to be invariant across chunks the way training does. | — |
-
-## `exp_manager` — Lightning resume contract
-
-| field | required value | purpose | see also |
-|---|---|---|---|
-| `resume_if_exists` | `true` | Lightning auto-finds the latest `step=N-last.ckpt` and loads model + optimizer + dataloader state from DCP shards + `meta.pt`. Without this, every chunk starts from scratch. | MIGRATION_GUIDE.md "Lightning resume contract" |
-| `resume_ignore_no_checkpoint` | `true` | First chunk runs without prior ckpt; without this flag, the first run errors. | — |
-| `checkpoint_callback_params.every_n_train_steps` | small int (50–250 recommended) | Mid-chunk saves so external preemption (`svc-hwinf-cs-sched`, NODE_FAIL, etc.) doesn't waste 80–150 step progress. The 0909 longform chains (`0909-longform-failures.md` Cause B) accumulated **0** progress past `step=1000` because the only save trigger was `every_n_epochs: 1` and chunks averaged 75–150 steps after preemption. | `0909-longform-failures.md` Cause B |
-| `checkpoint_callback_params.train_time_interval` | `"00:30:00"` (suggested) | Belt-and-braces wall-clock save trigger. Lightning ORs the per-step and per-time triggers, so both can coexist. | best-practices.md §4 |
-| `checkpoint_callback_params.every_n_epochs` | `null` or `1` | If you keep `every_n_epochs: 1`, *also* set `every_n_train_steps`; do not rely on epochs alone. | — |
-| `checkpoint_callback_params.save_top_k` | `-1` (no pruning) | Prevents Lightning from deleting old checkpoints when `monitor` doesn't fire. With `every_n_train_steps + every_n_epochs` saves you want all of them on disk. | — |
-| `max_time_per_run` | `<SLURM walltime - 10min>` | NeMo's `PreemptionCallback` fires here, leaving a 10-minute buffer for the teardown tail. **Does NOT fire on external SIGTERM** (only on its own timer) — external cancels can still lose progress. Mitigated by frequent step/time-based saves. | debug-cluster-run §6(11) |
-
-## `trainer` — Lightning + parallelism
-
-| field | constraint | purpose | see also |
-|---|---|---|---|
-| `devices` / `num_nodes` | match between save and restore | StatefulDataLoader is sensitive to `world_size`; changing it between save and restore raises a hard error. To scale a chain mid-flight you must restart from a converted HuggingFace checkpoint (no resume). | MIGRATION_GUIDE.md "Operational constraints" §1 |
-| `max_steps` | unchanged across chain | Chain semantics: each chunk advances `global_step`; `max_steps` is the chain target. Don't reduce it mid-chain or Lightning will think training is finished. | — |
-| `limit_train_batches` | usually `1000` | Defines an "epoch". With `every_n_epochs: 1` this is also the only save trigger if `every_n_train_steps` is unset. See `every_n_train_steps` above. | — |
-
-## Launcher contract — `train_and_eval.py` and equivalents
-
-| concern | requirement | purpose | see also |
-|---|---|---|---|
-| Per-chunk seed | **invariant across all chunks of a chain** when `use_stateful_dataloader: true` | StatefulDataLoader contract: model RNG must be the same on resume so dropout/aux-loss/random-init are bit-exact across chunks. The 0909 longform chains hit this with `FIXED_SEEDS[0..9]` rotation. Fixed in `train_and_eval.py:925-952`: when `--enable-indexes-prefetch` is set, `seeds = [seed_or_default] * num_runs`. The skill should grep for any `FIXED_SEEDS[i]` / `seed = randint(...)` / `seed=run_idx` patterns in arbitrary launchers and warn. | `train_and_eval.py:925-952`, `0909-longform-failures.md` Cause A |
-| Indexes prefetch preamble | every chunk's container startup runs `prefetch_indexes.py` (or the equivalent rsync) onto each node's local SSD, populating `<indexes_root>` before `salm_train.py` starts | `train_and_eval.py:577-578` does this via `prefetch_indexes_to_ssd.sh`; if missing, training reads `.idx` files from lustre on every `__getitem__` call (slow; defeats the purpose). | `train_and_eval.py:577-578` |
-| `num_workers`, `world_size` | invariant across chain | Hard requirement of StatefulDataLoader (see above). Launcher should NOT change `--num-nodes` or `--num-workers` between chunks. | MIGRATION_GUIDE.md "Operational constraints" §1 |
-| `--bypass-nvidia-hook` for cpu partitions | required on clusters whose `cpu_partition` lacks `nvidia-container-cli` (e.g. NRT) | Without it, enroot's `98-nvidia.sh` hook hard-fails the container start on cpu partitions of those clusters. Sets `--export=ALL,NVIDIA_VISIBLE_DEVICES=void` on the sbatch line. Used by `submit_build_indexes.py:122-129, 240-245` and `train_and_eval.py`. | `submit_build_indexes.py:122-129` |
-| PYTHONPATH | must include both `lhotse_resumable/` and `NeMo_resumable/` | Without it, the in-container default `lhotse` / `nemo` are loaded and lack the resumable code. `submit_build_indexes.py:225` does this; arbitrary launchers must too. | `submit_build_indexes.py:225` |
-
-## AIStore env vars
-
-| env var | required when | purpose | see also |
-|---|---|---|---|
-| `USE_AIS_GET_BATCH` | training data is on `s3://`, `ais://`, or `http(s)://` AND the cluster has `AIS_ENDPOINT` | Skip eager `IndexedTarMemberReader` per shard; defer audio fetch to AIS at sample time via `AISBatchLoader`. Read at `nemo_adapters.py:459`. | aistore-vs-non-aistore.md |
-| `USE_AIS_INDIVIDUAL_GETS` | non-EN-replicated multilingual data on AIS, or any time MOSS GetBatch returns empty content | Routes through per-object `Object.get_reader(archive_config=...).read_all()` instead of MOSS GetBatch (the `force_individual` flag on `AISBatchLoader`). Slower but bypasses MOSS-side issues, and on `shar_ptr` sources falls back to per-object byte-range `get_reader` so non-gzipped lhotse-shar cuts work even when MOSS lacks byte-range support. `lhotse_resumable/lhotse/ais/batch_loader.py`. | failure-modes.md §16 |
-| `AIS_ENDPOINT` | always when AIStore in play | The AIS proxy URL. IAD: `http://asr.iad.oci.aistore.nvidia.com:51080`. Set in `cluster_configs/<cluster>.yaml` under `env_vars`. | `cluster_configs/iad.yaml:31` |
-| `aistore` SDK version | ≥ 1.17 | `lhotse_resumable/lhotse/ais/batch_loader.py:75` requires `aistore>=1.17.0`. As of 2026-05-10, latest is 1.23.0. The `_moss_attrs` normalizer at `batch_loader.py:81` handles both MossIn (≤1.18) and MossOut (≥1.20) attribute namings. | `lhotse_resumable/lhotse/ais/batch_loader.py:75-89` |
+| `indexed` | `true` | Routes supported sources through indexed adapters such as `IndexedJsonlReader` and indexed NeMo-tar readers. Without it, streaming/replay behavior remains active. | `nemo.collections.common.data.lhotse.dataloader`, `lhotse.indexing` |
+| `use_stateful_dataloader` | `true` | Uses `torchdata.StatefulDataLoader` so dataloader iterator state can be saved in Lightning checkpoints. | NeMo Lhotse dataloader config |
+| `force_map_dataset` | `false` for training | Enforces iterable partitioning across data-parallel ranks and workers. Map-style training has too much sampler/manifest overhead; if a source cannot yet be indexed, report the migration as not launch-ready unless the user explicitly approves a temporary exception. | failure-modes §§18-22, conflict-matrix |
+| `indexes_root` | stable filesystem mirror, or node-local path populated before startup | Tells indexed readers where to find `.idx` sidecars. Prefer a persistent shared mirror. Use `/tmp/idx` only when the launcher stages indexes there before training. | failure-modes §16 |
+| `seed` | fixed integer, invariant across chunks | Lightning reseeds Python/NumPy/Torch at chunk start. Rotating this across resumable chunks breaks model-level bit-exactness even when sampler state restores correctly. | failure-modes §11 |
+| `shard_seed` | fixed integer, not `"randomized"` | Controls sampler/multiplexer RNG. Randomized shard seeds can diverge across resume and are invalid for multi-shard iterable partitioning. | conflict-matrix |
+| `num_workers` | invariant between save and restore | `StatefulDataLoader` and iterable partition state depend on worker topology. | failure-modes §14, §21 |
+| `concurrent_bucketing` | `false` for resumable training | Background bucketing producers can advance source iterators outside the checkpointed main-thread state. | failure-modes §17 |
+| `force_iterable_dataset` | unset or compatible with `force_map_dataset: false` | Do not enable mutually exclusive dataset modes. The training target is iterable partitioning through `force_map_dataset: false`. | conflict-matrix |
+| `force_finite` | unset/false for training | Training usually needs infinite or epoch-controlled iteration; finite mode is normally for validation. | validation section |
+| `extra_fields` on indexed NeMo entries | unset | Indexed NeMo adapters cannot preserve arbitrary runtime field rewrites. Preprocess manifests instead. | failure-modes §2 |
+| `slice_length` on indexed entries | unset | Slicing rewrites cut/audio access and has no stable index unless preprocessed. | failure-modes §2 |
+| compressed `.jsonl.gz` / `.tar.gz` paths | reject for indexed sidecars | Indexing requires seekable uncompressed JSONL/tar inputs. Re-export or unpack first. | failure-modes §1 |
+| `pipe:` paths | reject | Pipe commands are not seekable. Materialize data first. | `lhotse.indexing` |
+
+## Training iterable partition (`force_map_dataset: false`)
+
+This is the required training mode for efficient indexed/resumable runs. Do not
+ship a migrated training config in map-style mode. If an indexing blocker
+prevents iterable partitioning, mark the migration not launch-ready unless the
+user explicitly approves a temporary exception.
+
+| concern | requirement | purpose |
+|---|---|---|
+| Worker partition signal | Set only by NeMo/Lhotse worker init path | Prevents map-style mode from accidentally partitioning under `torchrun` environment variables. |
+| All sources indexed | required | Non-indexed sources do not partition and will be duplicated across ranks/workers. |
+| Multiplexer seed | fixed integer | All shards must pick the same source at each multiplexing step to preserve global weighted distribution. |
+| Resume topology | invariant `(world_size, num_workers)` | Saved iterator state validates topology on restore. |
+
+## `data.validation_ds`
+
+| field | required value | purpose |
+|---|---|---|
+| `indexed` | `true` when validation sources need indexed access | Uses the same sidecar/index readers as training. |
+| `force_map_dataset` | `true` | Validation should be finite and deterministic; map-style access is simpler. |
+| `force_finite` | `true` | Prevents infinite validation loops when the training blend is infinite. |
+| `use_stateful_dataloader` | usually `false` | Validation is normally run to completion and not resumed mid-loop. |
+| `indexes_root` | same mirror as training unless intentionally separate | Validation readers need the same sidecars. |
+| `seed` / `shard_seed` | fixed integers | Keeps validation deterministic. |
+
+## Lightning / trainer settings
+
+| field | recommendation | purpose |
+|---|---|---|
+| `resume_if_exists` or equivalent | enabled for resumable chains | Ensures later chunks restore checkpointed model, optimizer, scheduler, and dataloader state. |
+| `resume_ignore_no_checkpoint` or equivalent | enabled for first chunk when supported | Allows chunk 1 to start without an existing checkpoint. |
+| Checkpoint cadence | frequent step- or time-based saves | External termination may bypass graceful preemption callbacks. Avoid losing an entire chunk. |
+| `save_top_k` / pruning policy | do not prune required resume checkpoints | Resume needs recent checkpoints and dataloader metadata. |
+| `max_time_per_run` / walltime guard | comfortably below runtime walltime | Internal graceful-stop callbacks need teardown time. |
+| `devices`, `num_nodes`, distributed topology | invariant across resume | Dataloader state is topology-sensitive. To scale differently, restart without dataloader state. |
+| `max_steps` | stable across chain | Later chunks continue global step accounting. |
+
+## Launcher contract
+
+| concern | requirement | purpose |
+|---|---|---|
+| Per-chunk seed | invariant for all chunks in a resumable chain | Prevents model-level RNG divergence across resumes. |
+| Index mirror availability | `.idx` sidecars exist before training starts | Indexed readers fail or fall back to slow behavior when sidecars are missing. |
+| Optional index staging | YAML `indexes_root` matches the staged destination | Node-local paths such as `/tmp/idx` must be populated in every chunk. |
+| `num_workers`, `world_size` | unchanged between save and restore | Required by stateful dataloading and iterable partitioning. |
+| Python path / package selection | loads the NeMo and Lhotse versions with indexed/resumable support | Avoids accidentally using stock packages without the required code. |
+| Container/runtime hooks | compatible with available CPU/GPU runtime | CPU-only index builds may need different container settings than GPU training. |
+
+## AIStore environment
+
+| env var | required when | purpose |
+|---|---|---|
+| `AIS_ENDPOINT` | any `s3://` / `ais://` source is read through AIStore | Points Lhotse/AIS clients at the proxy. |
+| `USE_AIS_GET_BATCH` | remote tar/audio sources should be fetched lazily by batch | Avoids eager tar-reader construction for every remote shard. |
+| `USE_AIS_INDIVIDUAL_GETS` | batch endpoint is unavailable or returns empty content | Falls back to per-object reads. Slower but useful for backend-specific failures. |
+| `aistore` SDK | an AIStore backend is used by the index-builder or training container | Required by Lhotse AIStore access paths. |
 
 ## Index building
 
-| concern | requirement | purpose | see also |
-|---|---|---|---|
-| Uncompressed sources only | `.jsonl` / `.tar` (NOT `.jsonl.gz` / `.tar.gz`); Shar `cuts.*.jsonl` not `cuts.*.jsonl.gz` | See `lhotse/indexing.py:88-110, 130-135`. AMI's stock distribution as `.jsonl.gz` Shar fails — drop AMI from the blend until an uncompressed export is available. | `data_blends/iad/granary1p1-en-resumable.yaml` header comment, MIGRATION_GUIDE.md "Prerequisites" §1 |
-| No `extra_fields` | every `nemo` / `nemo_tarred` / `multimodal_conversation` entry must omit `extra_fields` | `LazyNeMoTarredIterator(indexed=True)` raises explicitly. | `nemo_adapters.py:485-487` |
-| No `slice_length` | every `nemo` / `nemo_tarred` entry must omit `slice_length` | Sliced cuts have no stable index. | dataloader.py:253-256 |
-| Workers | 95 (on 96-cpu node) for 80k–400k files; 48 if OOM | Tar parsing is GIL-bound (process executor required). 96-cpu / 95-worker / `--exclusive` is the sweet spot. ProcessPool OOM signature: `concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly`. Drop to 48 workers if 95 OOMs. | failure-modes.md §8 |
-| Time | ~90 min for 80k files; ~2-3 h for 360k files | `submit_build_indexes.py:131` defaults `time_min=04:00:00`. | submit_build_indexes.py:131 |
-| Mirror destination | lustre under `<workspace>/indexes_mirror/` (writable, fast enough for prefetch source); NOT S3 | `prefetch_indexes.py` then pulls onto each node's local SSD at `/tmp/idx` (or whatever `indexes_root` resolves to). | submit_build_indexes.py:88-92, prefetch_indexes.py |
-| `aistore` SDK in builder container | required if any source is `s3://` / `ais://` | `submit_build_indexes.py:227` does `pip install --quiet --disable-pip-version-check aistore`. Without it, lhotse falls back to smart_open's AWS S3 client and fails with `io.UnsupportedOperation: seek`. Pin the SDK version range to match the lhotse code (`aistore>=1.17`). | submit_build_indexes.py:218-227 |
-| Reusability | once per blend; reuse across experiments | Already-indexed files are skipped; `--force` to rebuild. Re-runs are safe. | build_indexes.py:386 |
-
-## Cluster info
-
-| concern | requirement | purpose | see also |
-|---|---|---|---|
-| `cluster_configs/<cluster>.yaml` must exist | always | `submit_build_indexes.py` and `train_and_eval.py` read SSH creds, partition, container, env_vars from it. | TEMPLATE.yaml |
-| `nvidia-container-cli` on cpu partitions | NRT lacks it (cpu / cpu_interactive / cpu_datamover); IAD has it | If absent, use `--bypass-nvidia-hook` (sets `--export=ALL,NVIDIA_VISIBLE_DEVICES=void`). | submit_build_indexes.py:122-129 |
-| `AIS_ENDPOINT` env var | required when AIStore is the audio backend | Set in `env_vars:` block of cluster config. IAD has it; lustre-only clusters (typically NRT) won't. | cluster_configs/iad.yaml:31 |
+| concern | recommendation | purpose |
+|---|---|---|
+| Source format | uncompressed, seekable JSONL/tar or supported Shar cuts | Sidecar offsets must map to stable byte positions. |
+| Workers | tune for memory and storage backend | Large manifests/tars plus many workers can OOM. Reduce workers or split blends. |
+| Mirror destination | persistent shared filesystem when available | Reuse sidecars across runs and avoid per-launch rebuilds. |
+| Remote sources | verify credentials/backend before building | Indexing remote data exercises storage credentials and byte-range access. |
+| Reusability | build once per source path set | Existing sidecars can be reused while source contents and paths are unchanged. |
diff --git a/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md b/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md
index 650c49706c5d..e92d4e8d6d7a 100644
--- a/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md
+++ b/.claude/skills/migrate-to-resumable-dataloader/templates/migration-report.md
@@ -1,117 +1,98 @@
-# Migration report — `<config-stem>`
+# Migration report - `<config-stem>`
 
 - **Generated**: <YYYY-MM-DD HH:MM>
 - **Source YAML**: `<path/to/source-config.yaml>`
 - **Patched YAML**: `<path/to/source-config-resumable.yaml>`
 - **Source blend** (if inspected): `<path/to/blend.yaml>`
 - **Patched blend** (if emitted): `<path/to/blend-resumable.yaml>`
-- **Launcher** (if inspected): `<path/to/launcher.py>` (or "skipped — no launcher provided")
-- **Cluster**: `<cluster>` (AIStore: yes/no)
+- **Launcher** (if inspected): `<path/to/launcher>` (or "skipped - no launcher provided")
+- **Storage workflow**: <filesystem-only | AIStore/remote | mixed | unknown>
 
 ## Summary
 
-<One paragraph: what was changed, severity counts (e.g. "1 fatal, 4 errors,
-3 warnings, 2 notes"), whether the patched YAML is ready-to-launch or
-requires further user action.>
+<One paragraph: what changed, severity counts, whether the patched YAML is ready
+to launch, and what manual work remains.>
 
 ## Findings
 
 ### Fatal (must fix; auto-patching not possible)
 
-- _none_  — OR —
-- **`<field-path>`** (`<file>:<line>`): <one-paragraph explanation>
+- _none_ - OR -
+- **`<field-or-path>`** (`<file>:<line>`): <explanation>
   - **Current**: `<value>`
   - **Recommended**: `<value>` (or "manual rewrite")
-  - **Why fatal**: <reason auto-patch isn't possible>
-  - **References**: [option-reference §X], [failure-modes §Y]
+  - **Why fatal**: <reason auto-patch is unsafe or impossible>
+  - **References**: <reference file/section>
 
 ### Errors (auto-patched; review the diff)
 
 - **`data.train_ds.indexed`** (`<file>:<line>`): <description>
-  - **Was**: `false` → **now**: `true`
-  - **Why**: <one paragraph>
-  - **References**: [option-reference §train_ds.indexed]
+  - **Was**: `false` -> **now**: `true`
+  - **Why**: <rationale>
+  - **References**: <reference file/section>
 
-- _(more)_
+### Warnings (review manually)
 
-### Warnings (auto-patched OR commented inline; verify intent)
-
-- **`data.train_ds.shard_seed`** (`<file>:<line>`): <description>
-  - **Was**: `"randomized"` → **now**: `42`
-  - **Why**: NeMo `dataloader.py` would auto-overwrite at runtime with a
-    `WARNING` log; pinning at the YAML layer makes intent obvious to
-    reviewers and avoids the runtime warning.
-  - **References**: [conflict-matrix row 3], [failure-modes §11]
-
-- _(more)_
+- **`<field-or-path>`** (`<file>:<line>`): <description>
+  - **Current**: `<value>`
+  - **Recommended**: `<value>`
+  - **Why**: <rationale>
+  - **References**: <reference file/section>
 
-### Notes (informational; no patch)
+### Notes (informational)
 
-- **`data.validation_ds.use_stateful_dataloader`** (`<file>:<line>`):
-  Not strictly required for validation (eval doesn't checkpoint), but
-  setting it `false` matches the working 1-node smoke recipe. No change
-  needed.
+- **`<field-or-path>`** (`<file>:<line>`): <description>
 
-- _(more)_
+## Dedup Mode
 
-## Dedup mode
+<One paragraph: confirm training uses `force_map_dataset: false`; if not, mark
+the migration not launch-ready and list the blocker or explicit user exception.>
 
-<One paragraph: which `force_map_dataset` value this config uses and why.>
+- **Training target**: `force_map_dataset: false`. This enforces iterable
+  partitioning and avoids map-style sampler/manifest overhead.
+- **Validation/test target**: `force_map_dataset: true` unless intentionally
+  testing iterable behavior; finite deterministic validation is simpler in
+  map-style mode.
+- **Blocker/exception**: if training still uses `force_map_dataset: true`, mark
+  the migration not launch-ready unless the user explicitly approved an
+  exception; list the unindexed source or runtime blocker, expected overhead, and
+  work needed to move back to iterable training.
 
-- **`force_map_dataset: true`** (safe default; over-sample-and-discard
-  inside `DynamicBucketingSampler`) — works for any source type. Costs
-  `W×` redundant sampler/manifest I/O per step.
-- **`force_map_dataset: false`** (iterable + worker partition; suitable
-  for indexed-only configs at high `world_size`) — sample indices are
-  partitioned across `(DP rank × DataLoader worker)` via
-  `LazyShuffledRange(shard_id, num_shards)`. Near-`W×` step-time
-  improvement at scale. Audit required: every source must be indexed
-  (`failure-modes.md §21`), every `LazyIteratorMultiplexer.seed` must
-  be a fixed integer (§22), `(world_size, num_workers)` invariant
-  across the chain (§23).
+For training iterable mode, list:
 
-If `false` was selected, list:
 - Sources confirmed indexed: <list>
 - Multiplexer seeds confirmed integer: <list>
-- World-size / num-workers commitment: `<W>` × `<NW>` for the entire
-  chain.
+- World-size / num-workers commitment: `<W>` x `<NW>` for the full chain
 
-## Cross-cuts
+## Data Blend Audit
 
-### Data blend audit
+<List unindexable entries such as compressed manifests/tars, `pipe:` paths,
+unsupported `extra_fields`, `slice_length`, or mixed indexed/non-indexed chains.>
 
-<Drop in `references/failure-modes.md` §1 / §2 callouts: blend entries with
-`.jsonl.gz`, `.tar.gz`, `extra_fields`, `slice_length`. List the entries
-that were removed from the patched blend and the per-entry rationale.>
-
-| corpus | reason for exclusion | upstream fix |
+| entry | reason | upstream fix |
 |---|---|---|
-| ami | `cuts: *.jsonl.gz` (compressed Lhotse Shar) | re-export with `compress_jsonl=False` OR convert to `nemo_tarred` |
-| _(more)_ | _(more)_ | _(more)_ |
+| `<source>` | compressed cuts/manifests | re-export as uncompressed seekable files |
+| `<source>` | unsupported `extra_fields` | preprocess fields into the manifest |
 
-### Launcher review
+## Launcher Review
 
-<If launcher script provided: list grep findings. Otherwise: "skipped".>
+<If launcher was inspected, list findings. Otherwise write "skipped".>
 
-- **Per-chunk seed rotation**: <not detected | DETECTED at `<file>:<line>` —
-  the launcher pulls from a FIXED_SEEDS-like array; this MUST be pinned
-  to a single value when the resumable path is on. See
-  `failure-modes.md §12`. Manual fix required.>
-- **Prefetch preamble wired**: <yes / NO — `--enable-indexes-prefetch`
-  flag not set; manual addition needed. See `option-reference.md §launcher
-  flags`.>
-- **`--bypass-nvidia-hook`**: <not needed | needed for `<cluster>` cpu
-  partition — see `failure-modes.md §9`>
+- **Per-chunk seed rotation**: <not detected | detected at file:line; must pin one seed>
+- **Index access wired**: <persistent mirror | node-local staging | missing>
+- **AIStore batch audio fetch**: <needed and enabled | not needed | missing>
+- **Topology invariance**: <verified | not verifiable | violated>
+- **Python path/package selection**: <verified | not verifiable | missing>
 
-### AIStore vs lustre
+## Storage Workflow
 
-<One paragraph: which workflow this migration follows (per
-`aistore-vs-non-aistore.md` decision tree), and any cross-cluster
-caveats.>
+<One paragraph: filesystem-only vs AIStore/remote workflow, whether manifests and
+indexes are local/shared filesystem paths, and whether any prefetch/staging is
+required.>
 
-## Patched output diff
+## Patched Output Diff
 
-### `<config>.yaml` → `<config>-resumable.yaml`
+### `<config>.yaml` -> `<config>-resumable.yaml`
 
 ```diff
 -  data.train_ds:
@@ -121,48 +102,40 @@ caveats.>
 +  data.train_ds:
 +    indexed: true
 +    use_stateful_dataloader: true
-+    force_map_dataset: true
-+    indexes_root: /tmp/idx
-+    shard_seed: 42  # NOTE: pinned for StatefulDataLoader resume; see
-+                    # MIGRATION_GUIDE.md §"Operational constraints"
++    force_map_dataset: false
++    indexes_root: /shared/fs/.../indexes_mirror
++    shard_seed: 42
 ```
 
 _(full diff inline)_
 
-### `<blend>.yaml` → `<blend>-resumable.yaml`
+### `<blend>.yaml` -> `<blend>-resumable.yaml`
 
 ```diff
--  - corpus: ami
+-  - type: lhotse_shar
 -    shar_path:
--      cuts: s3://AMI/lhotse_shar/cuts._OP_*_CL_.jsonl.gz
--    type: lhotse_shar
--    weight: 0.2
-+  # AMI dropped — Lhotse Shar `cuts.*.jsonl.gz` cannot be indexed
-+  # (uncompressed sources only). Re-export with `compress_jsonl=False`
-+  # or convert to `nemo_tarred` to re-include.
+-      cuts: s3://bucket/path/cuts.0.jsonl.gz
++  # Source excluded: compressed Shar cuts cannot be indexed.
++  # Re-export with uncompressed cuts or convert to another seekable format.
 ```
 
 _(full diff inline)_
 
-## Pre-flight checklist
-
-See `pre-flight-checklist.md` next to this report. The TL;DR:
+## Pre-flight Checklist
 
 1. Build indexes via the generated `build-indexes-cmd.sh`.
-2. Run the `MIGRATION_GUIDE.md §3` bit-exact verification once on this
-   recipe.
-3. Confirm `aistore` SDK present in the container (AIStore workflow only).
-4. 1-node single-chunk → 1-node multi-chunk → full N-node smoke ladder.
-5. Submit the real run.
+2. Run a bit-exact dataloader resume check on the migrated config.
+3. Confirm storage SDKs and environment variables required by the selected
+   workflow.
+4. Confirm `indexes_root` exists and is populated from every node/container that
+   will train.
+5. Run single-node single-chunk, single-node resume, then full-topology smoke.
+6. Submit the real run.
 
 ## References
 
-- `MIGRATION_GUIDE.md` (repo root): canonical migration walkthrough.
-- `references/option-reference.md`: every YAML field, every flag, including
-  the iterable-mode partition concerns.
-- `references/conflict-matrix.md`: option pairs that conflict (includes
-  iterable-mode constraints: §20–§23).
-- `references/failure-modes.md`: 23-entry failure-mode catalog (§20–§23 cover iterable-mode partition concerns).
-- `references/best-practices.md`: prioritised checklist (tier 2 §5b covers
-  when to prefer `force_map_dataset: false`).
-- `references/aistore-vs-non-aistore.md`: workflow selection.
+- `references/option-reference.md`
+- `references/conflict-matrix.md`
+- `references/failure-modes.md`
+- `references/best-practices.md`
+- `references/aistore-vs-non-aistore.md`

From 75170c61b0243c60cdd70d77ff9805cfd33c23e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Mon, 15 Jun 2026 07:57:40 -0700
Subject: [PATCH 24/30] Script for analysis of resumable checkpoint dataset
 tree progress and fixes to bucket + shard expansion patterns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../common/data/lhotse/nemo_adapters.py       |  110 +-
 .../analyze_resumable_checkpoint.py           | 1008 +++++++++++++++++
 .../common/test_lhotse_indexed_partition.py   |   85 ++
 3 files changed, 1164 insertions(+), 39 deletions(-)
 create mode 100644 scripts/dataloading/analyze_resumable_checkpoint.py

diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index 6b5215afab07..91452307fc4f 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -23,7 +23,7 @@
 from contextlib import closing
 from io import BytesIO
 from pathlib import Path
-from typing import Generator, Iterable, List, Literal
+from typing import Generator, Iterable, List, Literal, Union
 
 try:
     import pyarrow.parquet as pq
@@ -57,6 +57,8 @@
 # suffixes. We use this pattern in both indexed and streaming code paths to
 # recover the actual tar member name (offsets share a single member).
 _OFFSET_PATTERN = re.compile(r'^(?P<stem>.+)(?P<sub>-sub\d+)(?P<ext>\.\w+)?$')
+ShardKey = Union[int, tuple[int, int]]
+
 
 
 class LazyNeMoIterator(IteratorNode):
@@ -404,10 +406,11 @@ def __init__(
         indexes_root: str | Path | None = None,
     ) -> None:
         self.skip_missing_manifest_entries = skip_missing_manifest_entries
-        self._malformed_manifest_warning_keys: set[tuple[str, int]] = set()
+        self._malformed_manifest_warning_keys: set[tuple[str, ShardKey]] = set()
         self.indexed = indexed
         self.indexes_root = indexes_root
-        self.shard_id_to_manifest: dict[int, Iterable[dict]]
+        self.shard_id_to_manifest: dict[ShardKey, Iterable[dict]]
+        self._shard_key_to_manifest_path: dict[ShardKey, str] = {}
         self.paths = expand_sharded_filepaths(manifest_path)
         if len(self.paths) == 1:
             if not indexed:
@@ -426,28 +429,19 @@ def __init__(
                 self.shard_id_to_manifest = groupby("shard_id", self.source)
         else:
             json_pattern = re.compile(r"manifest[^/]*_(\d+)[^/]*\.json")
-            shard_ids = []
-            for p in self.paths:
-                m = json_pattern.search(p)
-                assert m is not None, (
-                    f"Cannot determine shard_id from manifest input specified: "
-                    f"we searched with regex '{json_pattern.pattern}' in input '{p}'"
-                )
-                shard_ids.append(int(m.group(1)))
-            self.shard_id_to_manifest = {sid: LazyJsonlIterator(p) for sid, p in zip(shard_ids, self.paths)}
+            shard_keys, _ = _extract_unique_shard_keys(self.paths, json_pattern, path_kind="manifest")
+            self._shard_key_to_manifest_path = {key: path for key, path in zip(shard_keys, self.paths)}
+            self.shard_id_to_manifest = {
+                key: LazyJsonlIterator(path) for key, path in self._shard_key_to_manifest_path.items()
+            }
             self.source = LazyIteratorChain(*self.shard_id_to_manifest.values())
 
         self.tar_paths = expand_sharded_filepaths(tar_paths)
         tar_pattern = re.compile(r"audio[^/]*_(\d+)[^/]*\.tar")
-        shard_ids = []
-        for p in self.tar_paths:
-            m = tar_pattern.search(p)
-            assert m is not None, (
-                f"Cannot determine shard_id from tar input specifier: "
-                f"we searched with regex '{tar_pattern.pattern}' in input '{p}'"
-            )
-            shard_ids.append(int(m.group(1)))
-        self.shard_id_to_tar_path = dict(zip(shard_ids, self.tar_paths))
+        shard_keys, _ = _extract_unique_shard_keys(self.tar_paths, tar_pattern, path_kind="tar")
+        self.shard_id_to_tar_path: dict[ShardKey, str] = {
+            key: path for key, path in zip(shard_keys, self.tar_paths)
+        }
 
         self.shuffle_shards = shuffle_shards
         self.shard_seed = shard_seed
@@ -489,23 +483,21 @@ def _init_indexed(self) -> None:
         if self.slice_length is not None:
             raise ValueError("LazyNeMoTarredIterator(indexed=True) does not support 'slice_length'.")
 
-        # Order shards by their integer shard_id so that global indices are stable.
-        self._sorted_shard_ids = sorted(self.shard_id_to_tar_path.keys())
-        self._cuts_readers: dict[int, IndexedJsonlReader] = {}
+        # Order shards by stable shard key so global indices are reproducible.
+        # Multi-bucket NeMo specs may expand to paths such as
+        # bucket_1/audio_0.tar and bucket_2/audio_0.tar; the occurrence suffix in
+        # ShardKey prevents those duplicate numeric shard ids from overwriting.
+        self._sorted_shard_ids: list[ShardKey] = sorted(self.shard_id_to_tar_path.keys())
+        self._cuts_readers: dict[ShardKey, IndexedJsonlReader] = {}
         # In USE_AIS_GET_BATCH mode we never open the tar files locally — audio is
         # fetched lazily via URL/file AudioSource by AudioSamples (typically batched).
-        self._tar_readers: dict[int, IndexedTarMemberReader] = {}
+        self._tar_readers: dict[ShardKey, IndexedTarMemberReader] = {}
 
-        # Map shard_id → manifest path (single or multi-file).
+        # Map shard key → manifest path (single or multi-file).
         if len(self.paths) == 1:
             shard_id_to_manifest_path = {sid: self.paths[0] for sid in self._sorted_shard_ids}
         else:
-            json_pattern = re.compile(r"manifest[^/]*_(\d+)[^/]*\.json")
-            shard_id_to_manifest_path = {}
-            for p in self.paths:
-                m = json_pattern.search(p)
-                assert m is not None
-                shard_id_to_manifest_path[int(m.group(1))] = p
+            shard_id_to_manifest_path = self._shard_key_to_manifest_path
 
         cum = 0
         cum_lens = [0]
@@ -559,10 +551,10 @@ def to_shards(self) -> List["LazyNeMoTarredIterator"]:
 
     def _validate(self) -> None:
         if self.indexed:
-            # Indexed mode keys shards by the tar path's shard_id and pairs them with
-            # the jsonl manifest of the same numeric id (see ``_init_indexed``); the
-            # streaming-time shard_id consistency check below would otherwise reject
-            # single-file inputs when the jsonl groups by a different shard_id field.
+            # Indexed mode pairs tar and manifest paths by stable shard key in
+            # ``_init_indexed``. The streaming-time shard_id consistency check below
+            # would otherwise reject single-file inputs when the jsonl groups by a
+            # different shard_id field.
             validate_extra_fields(self.extra_fields)
             return
         shard_ids_tars = set(self.shard_id_to_tar_path)
@@ -580,7 +572,7 @@ def _get_seed(self) -> int:
         return resolve_seed(self.shard_seed) + self.epoch
 
     @property
-    def shard_ids(self) -> List[int]:
+    def shard_ids(self) -> List[ShardKey]:
         return sorted(self.shard_id_to_manifest.keys())
 
     def _iter_batch_for_ais_get_batch(
@@ -700,7 +692,7 @@ def _iter_sequential(
                         ) from e
 
     # ---------------------------------------------------------------------- indexed
-    def _resolve_global_idx(self, idx: int) -> tuple[int, int]:
+    def _resolve_global_idx(self, idx: int) -> tuple[ShardKey, int]:
         if idx < 0:
             idx += self._total_len
         if idx < 0 or idx >= self._total_len:
@@ -880,7 +872,7 @@ def __iter__(self) -> Generator[Cut, None, None]:
         # NeMo tarred manifests can have multiple JSONL entries pointing at the
         # same audio member with -subN audio_filepath suffixes (per-offset cuts).
         for sid in shard_ids:
-            manifest_path = self.paths[sid] if len(self.paths) > 1 else self.paths[0]
+            manifest_path = self._shard_key_to_manifest_path[sid] if len(self.paths) > 1 else self.paths[0]
 
             def basename(d: dict) -> str:
                 return (
@@ -1294,3 +1286,43 @@ def _iter_streaming(self) -> Generator[Cut, None, None]:
                 if cut is None:
                     continue
                 yield cut
+
+
+# ---------------------------------------------------------------------------
+# Private helpers
+# ---------------------------------------------------------------------------
+
+
+def _extract_unique_shard_keys(
+    paths: list[str], pattern: re.Pattern, *, path_kind: str
+) -> tuple[list[ShardKey], list[int]]:
+    """Extract shard ids while preserving duplicate ids from expanded paths.
+
+    NeMo tarred dataset specs may contain multiple independent path dimensions,
+    e.g. ``bucket_OP_1..8_CL_/audio__OP_0..127_CL_.tar``. After expansion,
+    every bucket contains numeric tar shard ids ``0..127``. Keying readers only
+    by that numeric id silently overwrites all but the last bucket, shrinking the
+    effective dataset and causing extreme oversampling of the remaining shards.
+
+    When numeric ids are unique, keep the historical ``int`` keys. When a
+    numeric id repeats, key each occurrence as ``(shard_id, occurrence)`` so
+    manifest and tar paths remain paired one-to-one across all expanded files.
+    The raw ids are returned for callers that need the original parsed values.
+    """
+    raw_ids = []
+    for path in paths:
+        match = pattern.search(path)
+        assert match is not None, (
+            f"Cannot determine shard_id from {path_kind} input specifier: "
+            f"we searched with regex '{pattern.pattern}' in input '{path}'"
+        )
+        raw_ids.append(int(match.group(1)))
+    if len(set(raw_ids)) == len(raw_ids):
+        return raw_ids, raw_ids
+    occurrences: dict[int, int] = {}
+    keys: list[ShardKey] = []
+    for shard_id in raw_ids:
+        occurrence = occurrences.get(shard_id, 0)
+        occurrences[shard_id] = occurrence + 1
+        keys.append((shard_id, occurrence))
+    return keys, raw_ids
diff --git a/scripts/dataloading/analyze_resumable_checkpoint.py b/scripts/dataloading/analyze_resumable_checkpoint.py
new file mode 100644
index 000000000000..716775d9d394
--- /dev/null
+++ b/scripts/dataloading/analyze_resumable_checkpoint.py
@@ -0,0 +1,1008 @@
+#!/usr/bin/env python
+"""Analyze resumable Lhotse dataloader progress stored in a checkpoint.
+
+This tool answers two operational questions for indexed/resumable
+training runs:
+
+* how far each leaf dataset in the blend has advanced, expressed as total
+  utilization (for example ``70%`` or ``1389%``), completed epochs, and current
+  in-progress epoch percentage;
+* how the observed consumed-item share compares with the desired blend weight,
+  which surfaces datasets that were over- or under-sampled by the checkpoint.
+
+Expected inputs
+---------------
+Use ``--checkpoint`` for a checkpoint file, a checkpoint directory, or an
+``eval-step-N`` directory. For FSDP/DCP checkpoints, the script first looks for
+metadata-only ``meta.pt`` files and expects NeMo's per-rank
+``train_dataloader_per_rank`` payload. Use ``--allow-full-ckpt-load`` only when
+metadata is unavailable and loading a non-meta checkpoint is acceptable.
+
+Pass ``--config`` when available so the script can resolve the training blend,
+recover desired blend weights, dataset names, and count indexed examples from
+``.idx`` sidecars. Use ``--indexes-root`` when the sidecars live in a mirrored
+index tree instead of next to the manifests/tars. ``--state-json`` is a debugging
+escape hatch for analyzing an already-extracted payload without importing torch.
+
+Outputs
+-------
+By default the script prints a Markdown table. ``--output-dir`` writes
+``summary.json``, ``summary.md``, and ``summary.csv``; the JSON also includes the
+raw leaf progress states and resolved dataset specs for follow-up debugging.
+
+When to use it
+--------------
+Run it after a resumable training checkpoint is produced, before continuing a
+suspicious chain, or during a dataloader postmortem when blend utilization looks
+wrong. It is read-only: it never modifies checkpoints, indexes, or configs.
+"""
+from __future__ import annotations
+
+import argparse
+import csv
+import datetime as dt
+import json
+import math
+import os
+import re
+import sys
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Iterable
+
+
+try:
+    import yaml
+except ImportError as exc:  # pragma: no cover - startup guard
+    raise SystemExit("PyYAML is required to parse training/blend configs.") from exc
+
+
+BRACE_RANGE_PATTERN = re.compile(r"\{(-?\d+)\.\.(-?\d+)(?:\.\.(-?\d+))?\}")
+EVAL_STEP_PATTERN = re.compile(r"eval-step-(\d+)$")
+STATEFUL_KEY = "train_dataloader_per_rank"
+POSITION_KEYS = {"position", "shard_id", "num_shards"}
+INDEX_DIR_CACHE: dict[str, dict[str, int] | None] = {}
+
+
+# ---------------------------------------------------------------------------
+# Public data model
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class DatasetSpec:
+    source_index: int
+    name: str
+    desired_weight: float | None = None
+    raw_weight: float | None = None
+    hours: float | None = None
+    kind: str | None = None
+    source_path: str | None = None
+    total_items: int | None = None
+    missing_index_paths: list[str] = field(default_factory=list)
+
+
+@dataclass
+class LeafProgress:
+    source_index: int
+    rank: int | None
+    worker: str | None
+    state_type: str
+    epoch: int
+    position: int
+    shard_id: int | None
+    num_shards: int | None
+    total_len: int | None
+    state_path: str
+
+
+@dataclass
+class SummaryRow:
+    source_index: int
+    dataset: str
+    state_type: str
+    desired_weight: float | None
+    observed_weight: float | None
+    drift_abs: float | None
+    drift_ratio: float | None
+    utilization_pct: float | None
+    completed_epochs: int | None
+    current_epoch_pct: float | None
+    consumed_items: int | None
+    total_items: int | None
+    partitions_seen: int
+    min_epoch: int | None
+    max_epoch: int | None
+    min_position: int | None
+    max_position: int | None
+    missing_total: bool
+    notes: str = ""
+
+
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+
+
+def collect_dataset_specs(
+    config: dict[str, Any] | None,
+    *,
+    config_path: Path | None,
+    indexes_root: str | None,
+    data_blend_dir: str | None,
+) -> list[DatasetSpec]:
+    """Resolve training blend leaves into ordered dataset specs.
+
+    The checkpoint records source progress by leaf order, not by dataset name.
+    This function walks ``data.train_ds.input_cfg`` with the same nested blend
+    references and temperature-normalized weights used by the recipe, then
+    counts examples from indexed sidecars when available.
+    """
+    if not config:
+        return []
+    train_ds = _get_path(config, "data.train_ds")
+    if not isinstance(train_ds, dict):
+        return []
+    if data_blend_dir is None:
+        raw_dir = config.get("data_blend_dir")
+        if isinstance(raw_dir, str):
+            data_blend_dir = raw_dir
+    current_dir = config_path.parent if config_path is not None else None
+    temps = _temperature_list(train_ds)
+    leaves: list[dict[str, Any]] = []
+
+    def recurse(
+        node: Any,
+        cumulative_weight: float,
+        level: int,
+        cur_dir: Path | None,
+        inherited: dict[str, Any],
+    ) -> None:
+        node, next_dir = _load_ref_if_yaml(node, data_blend_dir=data_blend_dir, current_dir=cur_dir)
+        if isinstance(node, dict):
+            merged = dict(inherited)
+            for key, value in node.items():
+                if key not in ("input_cfg", "weight"):
+                    merged.setdefault(key, value)
+            if "input_cfg" in node:
+                child = node["input_cfg"]
+                recurse(child, cumulative_weight, level, next_dir, merged)
+            else:
+                leaf = dict(merged)
+                leaf.update(node)
+                leaf["_desired_weight"] = cumulative_weight
+                leaf["_raw_weight"] = _safe_float(node.get("weight"))
+                leaves.append(leaf)
+            return
+        if isinstance(node, list):
+            weights = [_safe_float(item.get("weight")) if isinstance(item, dict) else None for item in node]
+            if all(w is not None for w in weights):
+                temperature = temps[level] if temps and level < len(temps) else 1.0
+                local_weights = _normalize_weight_vector([float(w) for w in weights], temperature=temperature)
+            else:
+                local_weights = [1.0 / len(node)] * len(node)
+            for item, local_weight in zip(node, local_weights):
+                item_weight = cumulative_weight * local_weight
+                recurse(item, item_weight, level + 1, next_dir, inherited)
+            return
+        if _looks_like_yaml_ref(node):
+            loaded, loaded_dir = _load_ref_if_yaml(node, data_blend_dir=data_blend_dir, current_dir=cur_dir)
+            recurse(loaded, cumulative_weight, level, loaded_dir, inherited)
+
+    recurse(train_ds.get("input_cfg"), 1.0, 0, current_dir, {})
+
+    specs: list[DatasetSpec] = []
+    for idx, leaf in enumerate(leaves):
+        path_groups = _source_path_groups_for_item(leaf)
+        paths = path_groups[0] if path_groups else []
+        total_items = None
+        missing: list[str] = []
+        for group in path_groups:
+            group_total, group_missing = _count_indexed_items(group, indexes_root)
+            if group_total is not None:
+                total_items = group_total
+                paths = group
+                missing = group_missing
+                break
+            missing.extend(group_missing[:20])
+        source_path = paths[0] if paths else None
+        specs.append(
+            DatasetSpec(
+                source_index=idx,
+                name=_dataset_name(leaf, source_path, idx),
+                desired_weight=_safe_float(leaf.get("_desired_weight")),
+                raw_weight=_safe_float(leaf.get("_raw_weight")),
+                hours=_safe_float(leaf.get("hours")),
+                kind=str(leaf.get("type")) if leaf.get("type") is not None else None,
+                source_path=source_path,
+                total_items=total_items,
+                missing_index_paths=missing[:20],
+            )
+        )
+    return specs
+
+def extract_progress(payload: Any) -> tuple[list[LeafProgress], list[str]]:
+    """Extract per-leaf dataloader progress from a loaded checkpoint payload.
+
+    The preferred layout is NeMo's ``train_dataloader_per_rank`` list, but the
+    scanner also handles raw nested ``sampler_state`` payloads for debugging and
+    compatibility with partially extracted state dumps.
+    """
+    notes: list[str] = []
+    progress: list[LeafProgress] = []
+    stateful_payloads = _find_stateful_payloads(payload)
+    if stateful_payloads:
+        notes.append(f"found {len(stateful_payloads)} {STATEFUL_KEY!r} payload(s)")
+        for state_path, per_rank in stateful_payloads:
+            for idx, entry in enumerate(per_rank):
+                if not isinstance(entry, dict):
+                    continue
+                rank = entry.get("dp_rank", idx)
+                rank = rank if isinstance(rank, int) else idx
+                inner_state = entry.get("state", entry)
+                for sampler_path, sampler_state in _find_sampler_states(
+                    inner_state, f"{state_path}[{idx}].state"
+                ):
+                    worker = _worker_from_path(sampler_path)
+                    progress.extend(
+                        _collect_leaves_from_sampler(
+                            sampler_state, rank=rank, worker=worker, path=sampler_path
+                        )
+                    )
+    else:
+        notes.append(f"no {STATEFUL_KEY!r} payload found; scanning for raw sampler_state entries")
+        for sampler_path, sampler_state in _find_sampler_states(payload):
+            worker = _worker_from_path(sampler_path)
+            progress.extend(_collect_leaves_from_sampler(sampler_state, rank=None, worker=worker, path=sampler_path))
+    progress, removed = _deduplicate_progress(progress)
+    if removed:
+        notes.append(f"deduplicated {removed} duplicate leaf progress state(s)")
+    return progress, notes
+
+def summarize(progress: list[LeafProgress], specs: list[DatasetSpec]) -> list[SummaryRow]:
+    """Build one report row per data source from leaf progress and config specs.
+
+    Leaves are grouped by ``source_index``. For every group the consumed item
+    count is summed over its leaves (or left ``None`` if any leaf's count is
+    unknown), then turned into a utilization percentage when a total is known
+    and into an observed blend weight relative to all sources with a known
+    count. Rows are returned in ascending ``source_index`` order.
+    """
+    specs_lookup = {spec.source_index: spec for spec in specs}
+    leaves_by_source: dict[int, list[LeafProgress]] = {}
+    for leaf in progress:
+        leaves_by_source.setdefault(leaf.source_index, []).append(leaf)
+
+    # First pass: resolve each source's total and consumed count, and the
+    # grand total used as the denominator for observed weights.
+    totals: dict[int, int | None] = {}
+    consumed_by_index: dict[int, int | None] = {}
+    grand_total = 0
+    for idx, members in leaves_by_source.items():
+        spec = specs_lookup.get(idx)
+        # Prefer a total recorded in the checkpoint; fall back to the config.
+        total_len = next((m.total_len for m in members if m.total_len is not None), None)
+        if total_len is None and spec is not None:
+            total_len = spec.total_items
+        totals[idx] = total_len
+        per_leaf = [_consumed_items(m, total_len) for m in members]
+        if any(value is None for value in per_leaf):
+            consumed = None
+        else:
+            consumed = sum(per_leaf)
+            grand_total += consumed
+        consumed_by_index[idx] = consumed
+
+    # Second pass: derive the per-row metrics.
+    rows: list[SummaryRow] = []
+    for idx in sorted(leaves_by_source):
+        members = leaves_by_source[idx]
+        spec = specs_lookup.get(idx)
+        total_len = totals[idx]
+        consumed = consumed_by_index[idx]
+        known = consumed is not None
+
+        utilization = (100.0 * consumed / total_len) if known and total_len else None
+        observed = (consumed / grand_total) if known and grand_total else None
+        desired = None if spec is None else spec.desired_weight
+
+        drift_abs = None
+        if observed is not None and desired is not None:
+            drift_abs = observed - desired
+        drift_ratio = None
+        if observed is not None and desired not in (None, 0):
+            drift_ratio = observed / desired
+
+        if utilization is None:
+            completed_epochs = None
+            current_epoch_pct = None
+        else:
+            completed_epochs = math.floor(utilization / 100.0)
+            current_epoch_pct = utilization - completed_epochs * 100.0
+
+        row_notes = []
+        if spec is None:
+            row_notes.append("no matching config source")
+        elif spec.total_items is None and total_len is None:
+            row_notes.append("missing total; provide --indexes-root or a config with indexed sidecars")
+        elif spec.missing_index_paths:
+            row_notes.append(f"{len(spec.missing_index_paths)} missing index path(s)")
+
+        epochs = [m.epoch for m in members]
+        positions = [m.position for m in members]
+        rows.append(
+            SummaryRow(
+                source_index=idx,
+                dataset=f"source-{idx}" if spec is None else spec.name,
+                state_type="/".join(sorted({m.state_type for m in members})),
+                desired_weight=desired,
+                observed_weight=observed,
+                drift_abs=drift_abs,
+                drift_ratio=drift_ratio,
+                utilization_pct=utilization,
+                completed_epochs=completed_epochs,
+                current_epoch_pct=current_epoch_pct,
+                consumed_items=consumed,
+                total_items=total_len,
+                partitions_seen=len(members),
+                min_epoch=min(epochs) if epochs else None,
+                max_epoch=max(epochs) if epochs else None,
+                min_position=min(positions) if positions else None,
+                max_position=max(positions) if positions else None,
+                missing_total=total_len is None,
+                notes="; ".join(row_notes),
+            )
+        )
+    return rows
+
+def load_checkpoint_payload(path: Path, *, allow_full_load: bool, max_full_load_mb: int) -> tuple[Any, Path]:
+    """Load the first candidate file under ``path`` that contains dataloader progress.
+
+    Candidates come from ``_checkpoint_metadata_candidates`` and are tried in
+    order. A file not named ``meta.pt`` is skipped only when ``allow_full_load``
+    is false *and* the file is larger than ``max_full_load_mb`` MiB; when
+    ``allow_full_load`` is true no size cap is applied.
+
+    Returns:
+        ``(payload, candidate)`` for the first file whose payload yields at
+        least one progress entry from ``extract_progress``.
+
+    Raises:
+        RuntimeError: if no candidate yields progress. The message carries the
+            last ten per-candidate errors.
+    """
+    errors = []
+    for candidate in _checkpoint_metadata_candidates(path):
+        if not candidate.is_file():
+            continue
+        if candidate.name != "meta.pt" and not allow_full_load:
+            # Size guard for non-meta files; small ones are still loaded.
+            size_mb = candidate.stat().st_size / (1024 * 1024)
+            if size_mb > max_full_load_mb:
+                errors.append(f"skipped large non-meta checkpoint {candidate} ({size_mb:.1f} MiB)")
+                continue
+        try:
+            payload = _torch_load(candidate)
+        except Exception as exc:  # pragma: no cover - depends on checkpoint format
+            # Best effort: record the failure and move on to the next candidate.
+            errors.append(f"{candidate}: {exc}")
+            continue
+        # Only the presence of progress matters here; extraction notes are dropped.
+        progress, _ = extract_progress(payload)
+        if progress:
+            return payload, candidate
+        errors.append(f"{candidate}: loaded but no dataloader progress state found")
+    detail = "\n".join(errors[-10:])
+    raise RuntimeError(f"Could not find dataloader state for {path}.\n{detail}")
+
+def load_config(path: Path | None, checkpoint: Path) -> tuple[dict[str, Any] | None, Path | None, list[str]]:
+    """Find and load the training config used to annotate the report.
+
+    When ``path`` is given it is the only candidate; otherwise candidates are
+    discovered near ``checkpoint``. The first candidate that parses to a dict
+    containing ``data.train_ds`` wins.
+
+    Returns:
+        ``(config, config_path, notes)``; the first two are ``None`` when no
+        usable config was found, and ``notes`` explains what was skipped.
+    """
+    notes: list[str] = []
+    if path is not None:
+        candidates = [path]
+    else:
+        candidates = _auto_config_candidates(checkpoint)
+
+    for candidate in candidates:
+        if candidate is None or not candidate.is_file():
+            continue
+        # JSON is selected by suffix; everything else is parsed as YAML.
+        loader = _load_json if candidate.suffix == ".json" else _load_yaml
+        try:
+            data = loader(candidate)
+        except Exception as exc:
+            notes.append(f"failed to load config candidate {candidate}: {exc}")
+            continue
+        if not isinstance(data, dict):
+            continue
+        if _get_path(data, "data.train_ds") is not None:
+            return data, candidate, notes
+        notes.append(f"skipped config candidate {candidate}: no data.train_ds")
+
+    notes.append(
+        "no config found; pass --config for desired weights and index totals"
+        if path is None
+        else f"config not found or invalid: {path}"
+    )
+    return None, None, notes
+
+def markdown_table(rows: list[SummaryRow]) -> str:
+    """Format summary rows as a pipe-delimited Markdown table.
+
+    The output always contains the header and separator lines, followed by
+    one line per row, and ends with a newline. Literal ``|`` characters in the
+    dataset name and notes are backslash-escaped.
+    """
+
+    def render(cells: list[str]) -> str:
+        return "| " + " | ".join(cells) + " |"
+
+    def escape(text: str) -> str:
+        return text.replace("|", "\\|")
+
+    headers = [
+        "idx",
+        "dataset",
+        "utilization",
+        "epochs",
+        "desired_w",
+        "observed_w",
+        "drift",
+        "items",
+        "total",
+        "parts",
+        "notes",
+    ]
+    lines = [render(headers), render(["---"] * len(headers))]
+    for row in rows:
+        if row.completed_epochs is None or row.current_epoch_pct is None:
+            epoch_text = ""
+        else:
+            epoch_text = f"{row.completed_epochs} + {row.current_epoch_pct:.2f}%"
+        items_text = str(row.consumed_items) if row.consumed_items is not None else ""
+        total_text = str(row.total_items) if row.total_items is not None else ""
+        lines.append(
+            render(
+                [
+                    str(row.source_index),
+                    escape(row.dataset),
+                    _fmt_pct(row.utilization_pct),
+                    epoch_text,
+                    _fmt_float(row.desired_weight),
+                    _fmt_float(row.observed_weight),
+                    _fmt_float(row.drift_abs),
+                    items_text,
+                    total_text,
+                    str(row.partitions_seen),
+                    escape(row.notes),
+                ]
+            )
+        )
+    return "\n".join(lines) + "\n"
+
+def write_outputs(summary: dict[str, Any], rows: list[SummaryRow], args: argparse.Namespace) -> None:
+    """Write the requested JSON, Markdown, and CSV artifacts.
+
+    Each artifact goes to its explicit ``--output-*`` path when given,
+    otherwise to ``summary.<ext>`` inside ``--output-dir`` when that is set,
+    otherwise it is not written. Parent directories are created as needed.
+
+    Args:
+        summary: JSON-serializable report; must contain ``checkpoint_input``
+            and ``generated_at``.
+        rows: Summary rows rendered into the Markdown table and CSV.
+        args: Parsed CLI namespace providing ``output_dir``, ``output_json``,
+            ``output_md`` and ``output_csv``.
+    """
+    output_dir = Path(args.output_dir) if args.output_dir else None
+    if output_dir is not None:
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    def target(explicit: str | None, default_name: str) -> Path | None:
+        # An explicit path wins; otherwise fall back to the output directory.
+        if explicit:
+            return Path(explicit)
+        return output_dir / default_name if output_dir is not None else None
+
+    json_path = target(args.output_json, "summary.json")
+    md_path = target(args.output_md, "summary.md")
+    csv_path = target(args.output_csv, "summary.csv")
+
+    if json_path is not None:
+        json_path.parent.mkdir(parents=True, exist_ok=True)
+        json_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+    if md_path is not None:
+        md_path.parent.mkdir(parents=True, exist_ok=True)
+        body = [
+            "# Resumable Dataloader Checkpoint Analysis",
+            "",
+            f"- checkpoint: `{summary['checkpoint_input']}`",
+            f"- metadata_loaded: `{summary.get('checkpoint_metadata_loaded')}`",
+            f"- config: `{summary.get('config_path') or ''}`",
+            f"- generated_at: `{summary['generated_at']}`",
+            "",
+            markdown_table(rows),
+        ]
+        md_path.write_text("\n".join(body), encoding="utf-8")
+    if csv_path is not None:
+        csv_path.parent.mkdir(parents=True, exist_ok=True)
+        # With no rows there is no dataclass to introspect, so a minimal
+        # single-column header is written instead.
+        fieldnames = list(asdict(rows[0]).keys()) if rows else ["source_index"]
+        with csv_path.open("w", encoding="utf-8", newline="") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            for row in rows:
+                writer.writerow(asdict(row))
+
+def parse_args() -> argparse.Namespace:
+    """Define the CLI and parse ``sys.argv`` into a namespace."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--checkpoint",
+        "--checkpoint-path",
+        dest="checkpoint",
+        help="Checkpoint file/dir or eval-step dir.",
+    )
+    # Plain string-valued options, registered in the order shown by --help.
+    string_options = (
+        ("--state-json", "JSON payload to analyze instead of loading a torch checkpoint."),
+        ("--config", "Training YAML/JSON for desired weights and source index totals."),
+        ("--data-blend-dir", "Override ${data_blend_dir} while resolving nested blend YAMLs."),
+        ("--indexes-root", "Root containing mirrored .idx sidecars, e.g. /tmp/idx."),
+        ("--output-dir", "Directory for summary.json/summary.md/summary.csv."),
+        ("--output-json", "Explicit JSON output path."),
+        ("--output-md", "Explicit Markdown output path."),
+        ("--output-csv", "Explicit CSV output path."),
+    )
+    for flag, help_text in string_options:
+        parser.add_argument(flag, help=help_text)
+    parser.add_argument("--allow-full-ckpt-load", action="store_true", help="Allow loading non-meta checkpoint files.")
+    parser.add_argument("--max-full-load-mb", type=int, default=512, help="Safety cap for non-meta checkpoint files.")
+    parser.add_argument("--print-table", action="store_true", help="Print Markdown table to stdout.")
+    return parser.parse_args()
+
+def main() -> int:
+    """Run the analysis end to end and return the process exit code.
+
+    Loads the payload (from ``--state-json`` if given, otherwise from the
+    checkpoint), extracts progress, loads the config and dataset specs,
+    builds the summary rows, writes the requested artifacts, and prints the
+    table when asked to or when no output target was given.
+    """
+    args = parse_args()
+    if not (args.checkpoint or args.state_json):
+        raise SystemExit("Pass --checkpoint or --state-json.")
+
+    state_json = Path(args.state_json).expanduser() if args.state_json else None
+    # Without --checkpoint the state JSON location anchors config discovery.
+    checkpoint = Path(args.checkpoint).expanduser() if args.checkpoint else state_json
+    loaded_path: Path | None = None
+    if state_json is not None:
+        payload = _load_json(state_json)
+        loaded_path = state_json
+    else:
+        payload, loaded_path = load_checkpoint_payload(
+            checkpoint,
+            allow_full_load=args.allow_full_ckpt_load,
+            max_full_load_mb=args.max_full_load_mb,
+        )
+
+    progress, notes = extract_progress(payload)
+    explicit_config = Path(args.config).expanduser() if args.config else None
+    config, loaded_config_path, config_notes = load_config(explicit_config, checkpoint)
+    notes.extend(config_notes)
+    specs = collect_dataset_specs(
+        config,
+        config_path=loaded_config_path,
+        indexes_root=args.indexes_root,
+        data_blend_dir=args.data_blend_dir,
+    )
+    checkpoint_source_count = len({leaf.source_index for leaf in progress})
+    if specs and len(specs) != checkpoint_source_count:
+        notes.append(
+            f"config source count ({len(specs)}) differs from checkpoint source count "
+            f"({checkpoint_source_count}); mapping is by source order only"
+        )
+
+    rows = summarize(progress, specs)
+    summary = {
+        "checkpoint_input": str(checkpoint),
+        "checkpoint_metadata_loaded": str(loaded_path) if loaded_path else None,
+        "config_path": str(loaded_config_path) if loaded_config_path else None,
+        "indexes_root": args.indexes_root,
+        "generated_at": dt.datetime.now(dt.timezone.utc).isoformat(),
+        "notes": notes,
+        "num_leaf_progress_states": len(progress),
+        "num_summary_rows": len(rows),
+        "rows": [asdict(row) for row in rows],
+        "leaf_progress": [asdict(leaf) for leaf in progress],
+        "dataset_specs": [asdict(spec) for spec in specs],
+    }
+    write_outputs(summary, rows, args)
+
+    any_file_target = args.output_dir or args.output_json or args.output_md or args.output_csv
+    if args.print_table or not any_file_target:
+        sys.stdout.write(markdown_table(rows))
+        if notes:
+            sys.stdout.write("\nNotes:\n")
+            for note in notes:
+                sys.stdout.write(f"- {note}\n")
+    return 0
+
+
+# ---------------------------------------------------------------------------
+# Private helpers
+# ---------------------------------------------------------------------------
+
+
+def _load_yaml(path: Path) -> Any:
+    """Parse the UTF-8 YAML file at ``path`` with the safe loader."""
+    text = path.read_text(encoding="utf-8")
+    return yaml.safe_load(text)
+
+def _load_json(path: Path) -> Any:
+    """Parse the UTF-8 JSON file at ``path``."""
+    text = path.read_text(encoding="utf-8")
+    return json.loads(text)
+
+def _safe_float(value: Any) -> float | None:
+    """Convert ``value`` to ``float``, or return ``None`` if it is not numeric.
+
+    ``None`` and booleans are rejected outright; anything else is passed to
+    ``float()`` and a ``TypeError``/``ValueError`` is mapped to ``None``.
+    """
+    if value is None or isinstance(value, bool):
+        return None
+    try:
+        return float(value)
+    except (TypeError, ValueError):
+        return None
+
+def _get_path(data: Any, dotted: str) -> Any:
+    """Follow a dot-separated key path through nested dicts.
+
+    Returns ``None`` as soon as a non-dict is reached before the path is
+    exhausted, or when a key is absent.
+    """
+    node = data
+    for key in dotted.split("."):
+        if isinstance(node, dict):
+            node = node.get(key)
+        else:
+            return None
+    return node
+
+def _normalize_weight_vector(weights: list[float], temperature: float = 1.0) -> list[float]:
+    """Raise each weight to ``temperature`` and normalize so they sum to one.
+
+    An empty input yields an empty list. When the scaled weights do not sum
+    to a positive number, a uniform distribution is returned instead.
+    """
+    count = len(weights)
+    if count == 0:
+        return []
+    powered = [weight**temperature for weight in weights]
+    denominator = sum(powered)
+    if denominator <= 0:
+        return [1.0 / count] * count
+    return [value / denominator for value in powered]
+
+def _strip_url_scheme(path: str) -> str:
+    """Return ``path`` as a relative path: no ``scheme://`` prefix, no leading ``/``.
+
+    Leading slashes are stripped in both cases. A URL such as
+    ``file:///data/a.tar`` would otherwise keep its leading ``/`` and, once
+    joined onto an index root with ``Path(root) / ...``, replace the root
+    instead of being placed beneath it.
+    """
+    match = re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://(.+)$", path)
+    remainder = match.group(1) if match else path
+    return remainder.lstrip("/")
+
+def _index_path_for(data_path: str, indexes_root: str | None) -> Path | None:
+    """Locate the ``.idx`` sidecar that belongs to ``data_path``.
+
+    With ``indexes_root`` the sidecar is looked up under that root at the
+    scheme-stripped data path. Without it the sidecar sits next to the data
+    file, which is only possible for non-URL paths; URLs yield ``None``.
+    """
+    if indexes_root:
+        return Path(indexes_root) / (_strip_url_scheme(data_path) + ".idx")
+    is_url = re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", data_path) is not None
+    if is_url:
+        return None
+    return Path(data_path + ".idx")
+
+def _indexed_file_size(idx_path: Path) -> int | None:
+    """Return the size in bytes of ``idx_path``, or ``None`` if it is absent.
+
+    The parent directory is scanned once and its ``{file name: size}`` map is
+    stored in ``INDEX_DIR_CACHE``, so sibling sidecars are answered from the
+    cache. A parent that does not exist, or is not a directory, is cached as
+    ``None`` and every file below it is reported as absent.
+    """
+    parent = str(idx_path.parent)
+    if parent not in INDEX_DIR_CACHE:
+        try:
+            # The context manager closes the directory handle even if an
+            # entry raises part-way through the scan.
+            with os.scandir(idx_path.parent) as scan:
+                sizes = {entry.name: entry.stat().st_size for entry in scan if entry.is_file()}
+        except (FileNotFoundError, NotADirectoryError):
+            sizes = None
+        INDEX_DIR_CACHE[parent] = sizes
+    entries = INDEX_DIR_CACHE[parent]
+    if entries is None:
+        return None
+    return entries.get(idx_path.name)
+
+def _fallback_brace_expand(path: str) -> list[str]:
+    """Expand numeric ``{start..end[..step]}`` ranges without ``braceexpand``.
+
+    Ranges are expanded left to right, recursively, so several ranges in one
+    path produce their cross product. A path without a range, or with a zero
+    step or a step pointing away from ``end``, is returned unchanged as a
+    single-element list.
+
+    Numbers are zero-padded only when one endpoint is itself written with a
+    leading zero (``{000..127}`` -> ``000 .. 127``). An unpadded range such
+    as ``{0..127}`` expands to ``0 .. 127``, which is what bash and the
+    ``braceexpand`` package produce.
+    """
+    match = BRACE_RANGE_PATTERN.search(path)
+    if not match:
+        return [path]
+    start_text, end_text, step_text = match.group(1), match.group(2), match.group(3)
+    start, end = int(start_text), int(end_text)
+    step = int(step_text) if step_text is not None else (1 if start <= end else -1)
+    if step == 0:
+        return [path]
+    if start < end and step < 0:
+        return [path]
+    if start > end and step > 0:
+        return [path]
+    start_digits = start_text.lstrip("-")
+    end_digits = end_text.lstrip("-")
+    # A lone "0" is not padding; only multi-digit endpoints starting with 0 are.
+    zero_padded = any(len(digits) > 1 and digits.startswith("0") for digits in (start_digits, end_digits))
+    width = max(len(start_digits), len(end_digits)) if zero_padded else 0
+    stop = end + (1 if step > 0 else -1)
+    prefix, suffix = path[: match.start()], path[match.end() :]
+    expanded = []
+    for idx in range(start, stop, step):
+        sign = "-" if idx < 0 else ""
+        repl = f"{sign}{str(abs(idx)).zfill(width)}"
+        expanded.extend(_fallback_brace_expand(prefix + repl + suffix))
+    return expanded
+
+def _expand_op_path(path: str) -> list[str]:
+    """Expand a sharded path pattern into its concrete paths.
+
+    Bracket aliases are first rewritten to curly braces, then the result is
+    expanded with ``braceexpand`` when that package is importable and with
+    the built-in numeric-range fallback otherwise.
+    """
+    # Match NeMo expand_sharded_filepaths(): _OP_/_CL_ are aliases for brace ranges.
+    alias_to_brace = {
+        "(": "{",
+        "[": "{",
+        "<": "{",
+        "_OP_": "{",
+        ")": "}",
+        "]": "}",
+        ">": "}",
+        "_CL_": "}",
+    }
+    sharded = path
+    for alias, brace in alias_to_brace.items():
+        sharded = sharded.replace(alias, brace)
+    try:
+        import braceexpand
+
+        return list(braceexpand.braceexpand(sharded, escape=False))
+    except ImportError:
+        return _fallback_brace_expand(sharded)
+
+def _flatten_path_values(value: Any) -> list[str]:
+    """Collect path strings out of a config value of varying shape.
+
+    * a string is a single path;
+    * a dict contributes the paths found in each of its values, recursively;
+    * a list/tuple contributes its string items, the first element of each
+      nested non-empty list/tuple whose first element is a string, and the
+      paths of each nested dict;
+    * anything else (including ``None``) contributes nothing.
+    """
+    if isinstance(value, str):
+        return [value]
+    if isinstance(value, dict):
+        return [found for item in value.values() for found in _flatten_path_values(item)]
+    if isinstance(value, (list, tuple)):
+        collected: list[str] = []
+        for item in value:
+            if isinstance(item, str):
+                collected.append(item)
+            elif isinstance(item, dict):
+                collected.extend(_flatten_path_values(item))
+            elif isinstance(item, (list, tuple)) and item and isinstance(item[0], str):
+                # Only the leading element of a nested sequence is a path.
+                collected.append(item[0])
+        return collected
+    return []
+
+def _count_indexed_items(paths: list[str], indexes_root: str | None) -> tuple[int | None, list[str]]:
+    """Sum the item counts recorded by the ``.idx`` sidecars of ``paths``.
+
+    Each path pattern is expanded to concrete shard paths. A sidecar of
+    ``size`` bytes contributes ``size // 8 - 1`` items (presumably 8-byte
+    offsets plus one trailing sentinel -- confirm against the index writer).
+    Sidecars that cannot be located, are absent, or have a size that is not a
+    positive multiple of 8 are listed as missing instead.
+
+    Returns:
+        ``(total, missing)`` where ``total`` is ``None`` if no sidecar could
+        be counted at all.
+    """
+    item_total = 0
+    counted_any = False
+    missing: list[str] = []
+    for pattern in paths:
+        for shard_path in _expand_op_path(pattern):
+            idx_path = _index_path_for(shard_path, indexes_root)
+            if idx_path is None:
+                missing.append(f"{shard_path}.idx")
+                continue
+            size = _indexed_file_size(idx_path)
+            if size is None or size < 8 or size % 8 != 0:
+                missing.append(str(idx_path))
+                continue
+            item_total += size // 8 - 1
+            counted_any = True
+    if not counted_any:
+        return None, missing
+    return item_total, missing
+
+def _resolve_ref(ref: str, *, data_blend_dir: str | None, current_dir: Path | None) -> Path:
+    """Turn a config file reference into a concrete path.
+
+    ``${data_blend_dir}`` is substituted when an override is supplied, then
+    environment variables are expanded. Absolute results are returned as is;
+    relative ones are anchored at ``current_dir`` if given, else at the
+    current working directory.
+    """
+    text = ref.replace("${data_blend_dir}", data_blend_dir) if data_blend_dir else ref
+    resolved = Path(os.path.expandvars(text))
+    if resolved.is_absolute():
+        return resolved
+    base = current_dir if current_dir is not None else Path.cwd()
+    return base / resolved
+
+def _looks_like_yaml_ref(value: Any) -> bool:
+    """Tell whether ``value`` is a string ending in ``.yaml`` or ``.yml``."""
+    if not isinstance(value, str):
+        return False
+    return value.endswith((".yaml", ".yml"))
+
+def _load_ref_if_yaml(value: Any, *, data_blend_dir: str | None, current_dir: Path | None) -> tuple[Any, Path | None]:
+    """Replace a YAML file reference by its parsed content.
+
+    Returns ``(content, directory)``. For a YAML reference ``directory`` is
+    the folder of the loaded file, so references nested inside it resolve
+    relative to that file. Any other value is passed through together with
+    the unchanged ``current_dir``.
+    """
+    if _looks_like_yaml_ref(value):
+        yaml_path = _resolve_ref(value, data_blend_dir=data_blend_dir, current_dir=current_dir)
+        return _load_yaml(yaml_path), yaml_path.parent
+    return value, current_dir
+
+def _source_path_groups_for_item(item: dict[str, Any]) -> list[list[str]]:
+    """List the path groups declared by a dataset config item, by priority.
+
+    One group is produced per path-bearing key that holds at least one path.
+    Manifest-style keys come first by default; when the item's ``type``
+    contains ``nemo_tarred`` and it declares tarred audio paths, the tar keys
+    are moved to the front.
+    """
+    manifest_keys = ["manifest_filepath", "cuts_path", "source_paths", "source_path", "shar_path"]
+    tar_keys = ["tarred_audio_filepaths", "tarred_audio_filepath"]
+
+    kind = str(item.get("type", ""))
+    has_tar_paths = bool(item.get("tarred_audio_filepaths") or item.get("tarred_audio_filepath"))
+    if "nemo_tarred" in kind and has_tar_paths:
+        ordered_keys = tar_keys + manifest_keys
+    else:
+        ordered_keys = manifest_keys + tar_keys
+
+    candidate_groups = (_flatten_path_values(item.get(key)) for key in ordered_keys)
+    return [group for group in candidate_groups if group]
+
+def _source_paths_for_item(item: dict[str, Any]) -> list[str]:
+    """Return the highest-priority path group of ``item`` (empty if none)."""
+    for group in _source_path_groups_for_item(item):
+        return group
+    return []
+
+def _dataset_name(item: dict[str, Any], source_path: str | None, fallback_index: int) -> str:
+    """Derive a display name for a dataset config item.
+
+    The scalar values of ``corpus``, ``language``, ``dataset``, ``name`` and
+    ``type`` are joined with ``/``. If none is set, the source path is used,
+    and failing that ``source-<fallback_index>``.
+    """
+    label_keys = ("corpus", "language", "dataset", "name", "type")
+    labels = [
+        str(item[key])
+        for key in label_keys
+        if item.get(key) is not None and not isinstance(item.get(key), (dict, list))
+    ]
+    if labels:
+        return "/".join(labels)
+    return source_path if source_path else f"source-{fallback_index}"
+
+def _temperature_list(train_ds: dict[str, Any]) -> list[float] | None:
+    """Read ``reweight_temperature`` from a train config as a list of floats.
+
+    A scalar number is repeated 16 times (presumably enough to cover every
+    nesting level of a blend -- confirm against the consumer). A list/tuple
+    is converted element-wise. A missing, boolean, or otherwise unsupported
+    value gives ``None``.
+    """
+    value = train_ds.get("reweight_temperature")
+    if isinstance(value, (list, tuple)):
+        return [float(entry) for entry in value]
+    is_scalar_number = isinstance(value, (int, float)) and not isinstance(value, bool)
+    if is_scalar_number:
+        return [float(value)] * 16
+    return None
+
+def _iter_children(obj: Any, path: str = "$") -> Iterable[tuple[str, Any]]:
+    """Yield ``(child_path, child)`` for the direct children of a dict or list.
+
+    Dict children are addressed as ``<path>.<key>`` and list children as
+    ``<path>[<index>]``. Any other object has no children.
+    """
+    if isinstance(obj, dict):
+        yield from ((f"{path}.{key}", value) for key, value in obj.items())
+    elif isinstance(obj, list):
+        yield from ((f"{path}[{position}]", value) for position, value in enumerate(obj))
+
+def _find_stateful_payloads(obj: Any, path: str = "$") -> list[tuple[str, list[Any]]]:
+    """Find every list stored under ``STATEFUL_KEY`` anywhere in ``obj``.
+
+    Dicts and lists are searched depth-first. Each hit is reported as
+    ``(path, list)``, parents before their descendants.
+    """
+    hits: list[tuple[str, list[Any]]] = []
+    if isinstance(obj, dict):
+        candidate = obj.get(STATEFUL_KEY)
+        if isinstance(candidate, list):
+            hits.append((f"{path}.{STATEFUL_KEY}", candidate))
+    # _iter_children yields nothing for scalars, so this covers dicts and lists.
+    for child_path, child in _iter_children(obj, path):
+        hits.extend(_find_stateful_payloads(child, child_path))
+    return hits
+
+def _find_sampler_states(obj: Any, path: str = "$") -> list[tuple[str, dict[str, Any]]]:
+    """Locate sampler state dicts anywhere inside ``obj``.
+
+    A dict counts as holding a sampler state when either
+
+    * its ``sampler_state`` value is a dict (reported at ``<path>.sampler_state``), or
+    * it has both ``cuts_state`` and ``diagnostics`` keys (the dict itself is
+      reported at ``path``).
+
+    Returns a list of ``(path, sampler_state_dict)`` pairs in depth-first order.
+    """
+    found: list[tuple[str, dict[str, Any]]] = []
+    if isinstance(obj, dict):
+        sampler_state = obj.get("sampler_state")
+        if isinstance(sampler_state, dict):
+            found.append((f"{path}.sampler_state", sampler_state))
+        elif "cuts_state" in obj and "diagnostics" in obj:
+            found.append((path, obj))
+        for child_path, child in _iter_children(obj, path):
+            # Identity check: the ``sampler_state`` value itself is never
+            # descended into, whether or not it was reported above.
+            if child is sampler_state:
+                continue
+            found.extend(_find_sampler_states(child, child_path))
+    elif isinstance(obj, list):
+        for child_path, child in _iter_children(obj, path):
+            found.extend(_find_sampler_states(child, child_path))
+    return found
+
+def _worker_from_path(path: str) -> str | None:
+    """Extract the digits of the first ``worker<N>``/``worker_<N>``/``worker-<N>`` in ``path``."""
+    hit = re.search(r"worker[_-]?(\d+)", path)
+    return hit.group(1) if hit else None
+
+def _state_total_len(state: dict[str, Any]) -> int | None:
+    """Return the dataset length recorded in a leaf state, if any.
+
+    ``state["range"]["n"]`` is preferred; otherwise the first of
+    ``total_len``, ``_total_len`` and ``n`` that holds a valid length is
+    used. A valid length is a non-negative ``int`` that is not a ``bool``.
+    Keys are checked for validity rather than truthiness, so a recorded
+    length of ``0`` is returned and not mistaken for a missing value.
+    """
+
+    def is_length(value: Any) -> bool:
+        return isinstance(value, int) and not isinstance(value, bool) and value >= 0
+
+    range_state = state.get("range")
+    if isinstance(range_state, dict) and is_length(range_state.get("n")):
+        return range_state["n"]
+    for key in ("total_len", "_total_len", "n"):
+        value = state.get(key)
+        if is_length(value):
+            return int(value)
+    return None
+
+def _leaf_from_state(
+    source_index: int,
+    rank: int | None,
+    worker: str | None,
+    path: str,
+    node_type: str,
+    state: dict[str, Any],
+) -> LeafProgress | None:
+    """Build a ``LeafProgress`` from ``state`` if it looks like a leaf position.
+
+    ``state`` qualifies when it has every key in ``POSITION_KEYS`` and an
+    integer ``position``; otherwise ``None`` is returned. A non-integer or
+    missing ``epoch`` is recorded as 0, and non-integer shard fields as
+    ``None``.
+    """
+    if not POSITION_KEYS.issubset(state.keys()):
+        return None
+    position = state.get("position")
+    if not isinstance(position, int):
+        return None
+
+    def int_or(value: Any, default: int | None) -> int | None:
+        return value if isinstance(value, int) else default
+
+    return LeafProgress(
+        source_index=source_index,
+        rank=rank,
+        worker=worker,
+        state_type=node_type,
+        epoch=int_or(state.get("epoch", 0), 0),
+        position=position,
+        shard_id=int_or(state.get("shard_id"), None),
+        num_shards=int_or(state.get("num_shards"), None),
+        total_len=_state_total_len(state),
+        state_path=path,
+    )
+
+def _collect_leaf_states(tree: Any, *, rank: int | None, worker: str | None, path: str = "$") -> list[LeafProgress]:
+    """Walk a nested iterator-state tree and collect its leaf positions.
+
+    A dict node is a leaf when its ``_state`` dict, or the node itself, is
+    accepted by ``_leaf_from_state``; traversal stops at a leaf. For a
+    non-leaf ``_state`` its ``source``/``sources`` entries are walked first,
+    then all children of the node.
+
+    Each dict/list object is visited at most once. The generic child walk
+    reaches ``_state.source(s)`` a second time, and without this guard every
+    nesting level would emit its leaves twice, which also shifts the
+    positional ``source_index`` of later leaves in multi-level blends.
+    Consequently an object aliased under two paths is reported once, under
+    the first path reached.
+
+    Returns:
+        Leaves in discovery order; ``source_index`` is the leaf's position in
+        that order.
+    """
+    leaves: list[LeafProgress] = []
+    visited: set[int] = set()
+
+    def walk(node: Any, node_path: str) -> None:
+        if not isinstance(node, (dict, list)):
+            return
+        if id(node) in visited:
+            return
+        visited.add(id(node))
+        if isinstance(node, list):
+            for child_path, child in _iter_children(node, node_path):
+                walk(child, child_path)
+            return
+        node_type = str(node.get("_type", "state"))
+        state = node.get("_state")
+        if isinstance(state, dict):
+            leaf = _leaf_from_state(len(leaves), rank, worker, f"{node_path}._state", node_type, state)
+            if leaf is not None:
+                leaves.append(leaf)
+                return
+            for key in ("source", "sources"):
+                if key in state:
+                    walk(state[key], f"{node_path}._state.{key}")
+        leaf = _leaf_from_state(len(leaves), rank, worker, node_path, node_type, node)
+        if leaf is not None:
+            leaves.append(leaf)
+            return
+        for child_path, child in _iter_children(node, node_path):
+            walk(child, child_path)
+
+    walk(tree, path)
+    return leaves
+
+def _collect_leaves_from_sampler(
+    sampler_state: dict[str, Any], *, rank: int | None, worker: str | None, path: str
+) -> list[LeafProgress]:
+    """Collect leaf progress from one sampler state, descending into nested samplers.
+
+    If the state has a ``samplers`` (or, failing that, ``bucket_samplers``)
+    list, each dict entry is processed recursively and the combined leaves are
+    returned. Otherwise, or when the nested samplers produced no leaves, the
+    leaves are collected from ``cuts_state``.
+
+    In both cases ``source_index`` is overwritten with the leaf's position in
+    the returned list.
+    """
+    leaves: list[LeafProgress] = []
+    nested = sampler_state.get("samplers") or sampler_state.get("bucket_samplers")
+    if isinstance(nested, list):
+        for idx, sub in enumerate(nested):
+            if isinstance(sub, dict):
+                leaves.extend(
+                    _collect_leaves_from_sampler(
+                        sub,
+                        rank=rank,
+                        worker=worker,
+                        # NOTE: the path segment says "samplers" even when the
+                        # list came from "bucket_samplers".
+                        path=f"{path}.samplers[{idx}]",
+                    )
+                )
+        if leaves:
+            # Re-number across all nested samplers; the per-sampler indices
+            # assigned by the recursive calls are discarded.
+            for idx, leaf in enumerate(leaves):
+                leaf.source_index = idx
+            return leaves
+    cuts_state = sampler_state.get("cuts_state")
+    if cuts_state is not None:
+        leaves = _collect_leaf_states(cuts_state, rank=rank, worker=worker, path=f"{path}.cuts_state")
+        for idx, leaf in enumerate(leaves):
+            leaf.source_index = idx
+    return leaves
+
+def _deduplicate_progress(progress: list[LeafProgress]) -> tuple[list[LeafProgress], int]:
+    """Drop repeated leaves, keeping the first occurrence of each.
+
+    Two leaves are the same when their rank, worker, state path, shard id and
+    shard count all match.
+
+    Returns:
+        ``(unique_leaves, removed_count)`` with the original order preserved.
+    """
+    first_seen: dict[tuple[Any, ...], LeafProgress] = {}
+    for leaf in progress:
+        identity = (leaf.rank, leaf.worker, leaf.state_path, leaf.shard_id, leaf.num_shards)
+        # setdefault keeps the earliest leaf for an identity.
+        first_seen.setdefault(identity, leaf)
+    unique = list(first_seen.values())
+    return unique, len(progress) - len(unique)
+
+def _shard_len(total_len: int, shard_id: int | None, num_shards: int | None) -> int | None:
+    """Count the items ``shard_id`` receives when ``total_len`` items are strided over ``num_shards``.
+
+    This is the number of indices ``shard_id, shard_id + num_shards, ...``
+    below ``total_len``. ``None`` is returned when the sharding is unknown or
+    ``num_shards`` is not positive.
+    """
+    sharding_unknown = shard_id is None or num_shards is None
+    if sharding_unknown or num_shards <= 0:
+        return None
+    remaining = total_len - shard_id
+    if remaining <= 0:
+        return 0
+    # Ceiling division of the items at or after this shard's first index.
+    return -(-remaining // num_shards)
+
+def _consumed_items(leaf: LeafProgress, total_len: int | None) -> int | None:
+    """Estimate how many items a leaf has consumed across all its epochs.
+
+    With a known per-shard length the result is
+    ``epoch * shard_len + position``. When the total (the leaf's own, else
+    ``total_len``) or the sharding is unknown, only a first-epoch leaf can be
+    answered, by its ``position``; later epochs give ``None``.
+    """
+    effective_total = total_len if leaf.total_len is None else leaf.total_len
+    first_epoch_only = leaf.position if leaf.epoch == 0 else None
+    if effective_total is None:
+        return first_epoch_only
+    per_shard = _shard_len(effective_total, leaf.shard_id, leaf.num_shards)
+    if per_shard is None:
+        return first_epoch_only
+    return leaf.epoch * per_shard + leaf.position
+
+def _eval_step_candidates(path: Path) -> list[Path]:
+    """Map an eval-step directory to the checkpoint paths it may correspond to.
+
+    When the directory name matches ``EVAL_STEP_PATTERN``, the captured step
+    is used to name candidates in the sibling ``checkpoints`` folder, trying
+    both ``step=<N>`` and ``step-<N>`` with and without a ``-last`` suffix.
+    A non-matching name gives an empty list.
+    """
+    match = EVAL_STEP_PATTERN.fullmatch(path.name)
+    if match is None:
+        return []
+    step = match.group(1)
+    ckpt_dir = path.parent / "checkpoints"
+    return [
+        ckpt_dir / f"step{separator}{step}{suffix}.ckpt"
+        for separator in ("=", "-")
+        for suffix in ("", "-last")
+    ]
+
+def _checkpoint_metadata_candidates(path: Path) -> list[Path]:
+    """List, in priority order, the files that may hold dataloader state for ``path``.
+
+    A non-directory ``path`` is its own only candidate. For a directory the
+    order is: eval-step checkpoint guesses, ``meta.pt`` and
+    ``checkpoint/meta.pt`` inside it, its ``*.ckpt`` entries (sorted), and
+    every ``meta.pt`` below it (sorted). A candidate that is itself a
+    directory is preceded by the two ``meta.pt`` locations inside it.
+    Duplicates are dropped, keeping the first occurrence.
+    """
+
+    def meta_locations(directory: Path) -> list[Path]:
+        return [directory / "meta.pt", directory / "checkpoint" / "meta.pt"]
+
+    if path.is_dir():
+        candidates = [
+            *_eval_step_candidates(path),
+            *meta_locations(path),
+            *sorted(path.glob("*.ckpt")),
+            *sorted(path.glob("**/meta.pt")),
+        ]
+    else:
+        candidates = [path]
+
+    ordered: dict[str, Path] = {}
+    for candidate in candidates:
+        probes = [*meta_locations(candidate), candidate] if candidate.is_dir() else [candidate]
+        for probe in probes:
+            ordered.setdefault(str(probe), probe)
+    return list(ordered.values())
+
+def _torch_load(path: Path) -> Any:
+    """Load a checkpoint file onto the CPU with ``torch.load``.
+
+    NOTE(review): ``weights_only=False`` lets ``torch.load`` unpickle arbitrary
+    Python objects, which can execute code embedded in the file. Only point
+    this tool at checkpoints from a trusted source.
+    """
+    # Imported here rather than at module level, so torch is only required
+    # when a checkpoint is actually loaded.
+    import torch
+
+    try:
+        return torch.load(path, map_location="cpu", weights_only=False)
+    except TypeError:
+        # Presumably for torch versions whose ``torch.load`` does not accept
+        # ``weights_only`` -- confirm the minimum supported version.
+        return torch.load(path, map_location="cpu")
+
+def _auto_config_candidates(checkpoint: Path) -> list[Path]:
+    """Guess where the training config of ``checkpoint`` may be stored.
+
+    The search roots are the checkpoint itself, its parent when it is a file,
+    and its parent and grandparent when it is a directory; roots that are
+    files are skipped. Each root is probed for the well-known config file
+    names, and duplicates are dropped keeping the first occurrence.
+    """
+    config_names = ("exp_config.yaml", "config.yaml", "hparams.yaml", "config.json")
+
+    roots = [checkpoint]
+    if checkpoint.is_file():
+        roots.append(checkpoint.parent)
+    if checkpoint.is_dir():
+        roots += [checkpoint.parent, checkpoint.parent.parent]
+
+    ordered: dict[str, Path] = {}
+    for root in roots:
+        if root.is_file():
+            continue
+        for name in config_names:
+            candidate = root / name
+            ordered.setdefault(str(candidate), candidate)
+    return list(ordered.values())
+
+def _fmt_pct(value: float | None) -> str:
+    """Format a percentage with two decimals; ``None`` becomes an empty cell."""
+    if value is None:
+        return ""
+    return f"{value:.2f}%"
+
+def _fmt_float(value: float | None) -> str:
+    """Format a number with up to six significant digits; ``None`` becomes an empty cell."""
+    if value is None:
+        return ""
+    return f"{value:.6g}"
+
+
+# Script entry point: run the CLI and use main()'s return value as the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/collections/common/test_lhotse_indexed_partition.py b/tests/collections/common/test_lhotse_indexed_partition.py
index 1198bd6dc1d8..25e2ab480275 100644
--- a/tests/collections/common/test_lhotse_indexed_partition.py
+++ b/tests/collections/common/test_lhotse_indexed_partition.py
@@ -148,6 +148,91 @@ def build():
     assert sum(len(r) for r in per_rank) == N_CUTS
 
 
+@pytest.fixture
+def nemo_tarred_duplicate_bucket_manifest(tmp_audio_root) -> tuple[list[Path], list[Path]]:
+    """Two bucket dirs that both contain manifest_0.jsonl/audios_0.tar.
+
+    Indexed LazyNeMoTarredIterator used to key both paths by numeric shard id 0,
+    silently overwriting the first bucket. The expected dataset size is 2*N_CUTS.
+    """
+    from lhotse.serialization import SequentialJsonlWriter
+    from lhotse.shar.writers import TarWriter
+
+    root = tmp_audio_root / "tarred_duplicate_buckets"
+    root.mkdir(exist_ok=True)
+    manifest_paths: list[Path] = []
+    tar_paths: list[Path] = []
+    for bucket_idx in range(2):
+        cuts = DummyManifest(
+            CutSet,
+            begin_id=bucket_idx * N_CUTS,  # presumably a half-open id range, so buckets get disjoint cut ids
+            end_id=(bucket_idx + 1) * N_CUTS,
+            with_data=True,
+        ).save_audios(tmp_audio_root / f"bucket_audio_{bucket_idx}", progress_bar=False)
+        bucket = root / f"bucket_{bucket_idx}"
+        bucket.mkdir(exist_ok=True)
+        manifest_path = bucket / "manifest_0.jsonl"  # same file name in every bucket directory
+        tar_path = bucket / "audios_0.tar"  # same file name in every bucket directory
+        with (
+            TarWriter(str(tar_path), shard_size=None) as tar_writer,
+            SequentialJsonlWriter(manifest_path) as mft_writer,
+        ):
+            for cut in cuts:
+                src = cut.recording.sources[0].source
+                name = Path(src).name
+                with open(src, "rb") as f:
+                    tar_writer.write(name, BytesIO(f.read()))  # tar member name matches audio_filepath below
+                mft_writer.write(
+                    {
+                        "audio_filepath": name,
+                        "text": "irrelevant",
+                        "duration": cut.duration,
+                        "lang": "en",
+                        "shard_id": 0,  # identical shard id in both buckets: the collision under test
+                        "cut_id": cut.id,  # lets the tests identify each cut after iteration
+                    }
+                )
+        manifest_paths.append(manifest_path)
+        tar_paths.append(tar_path)
+    return manifest_paths, tar_paths  # parallel lists: manifest_paths[i] pairs with tar_paths[i]
+
+
+@pytest.mark.parametrize("world_size", [1, 2, 4, 5])
+def test_lazy_nemo_tarred_iterator_indexed_preserves_duplicate_bucket_shard_ids(
+    nemo_tarred_duplicate_bucket_manifest, world_size
+):
+    manifest_paths, tar_paths = nemo_tarred_duplicate_bucket_manifest
+
+    def build():  # builds a fresh indexed iterator and returns the cut ids it yields
+        it = nemo_adapters.LazyNeMoTarredIterator(
+            manifest_path=[str(path) for path in manifest_paths],
+            tar_paths=[str(path) for path in tar_paths],
+            indexed=True,
+        )
+        assert len(it) == 2 * N_CUTS  # cuts from both buckets are counted
+        assert len(it.shard_id_to_tar_path) == 2  # one key per tar despite the shared shard id 0
+        return [cut.custom["cut_id"] for cut in it]
+
+    per_rank, union = _collect_disjoint_per_rank(build, world_size)
+    assert len(union) == 2 * N_CUTS, f"missing {2 * N_CUTS - len(union)} items at world_size={world_size}"
+    assert sum(len(r) for r in per_rank) == 2 * N_CUTS  # per-rank counts add up to the full dataset
+
+
+def test_lazy_nemo_tarred_iterator_streaming_preserves_duplicate_bucket_shard_ids(
+    nemo_tarred_duplicate_bucket_manifest,
+):
+    manifest_paths, tar_paths = nemo_tarred_duplicate_bucket_manifest
+    it = nemo_adapters.LazyNeMoTarredIterator(
+        manifest_path=[str(path) for path in manifest_paths],
+        tar_paths=[str(path) for path in tar_paths],
+        indexed=False,  # non-indexed counterpart of the indexed test on the same fixture
+    )
+
+    ids = [cut.custom["cut_id"] for cut in it]
+    assert len(ids) == 2 * N_CUTS  # cuts from both buckets are yielded
+    assert len(set(ids)) == 2 * N_CUTS  # and every cut id is distinct
+
+
 # ---------------------------------------------------------------------------
 # 2. LazyParquetIterator
 # ---------------------------------------------------------------------------

From eef6656dc41a3ae3ac09ad8c976af6a1f8f0bfb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Mon, 22 Jun 2026 13:46:07 -0700
Subject: [PATCH 25/30] Add ability to gracefully skip over malformed JSON
 lines
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 nemo/collections/common/data/lhotse/cutset.py |  5 +++-
 .../common/data/lhotse/nemo_adapters.py       | 24 ++++++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/common/data/lhotse/cutset.py b/nemo/collections/common/data/lhotse/cutset.py
index a992b81d7879..921dad55ea6e 100644
--- a/nemo/collections/common/data/lhotse/cutset.py
+++ b/nemo/collections/common/data/lhotse/cutset.py
@@ -1520,7 +1520,10 @@ def read_nemo_manifest(config) -> tuple[CutSet, bool]:
     # and other data statistics.
     metadata_only = config.get("metadata_only", False)
     force_finite = config.get("force_finite", False)
-    notar_kwargs = {"metadata_only": metadata_only}
+    notar_kwargs = {
+        "metadata_only": metadata_only,
+        "skip_missing_manifest_entries": config.get("skip_missing_manifest_entries", False),
+    }
     tar_kwargs_extra = {"indexed": indexed, **indexed_extra} if indexed else {}
     is_tarred = config.get("tarred_audio_filepaths") is not None
     if isinstance(config.manifest_filepath, (str, Path)):
diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index 91452307fc4f..6ee97124240f 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -60,6 +60,20 @@
 ShardKey = Union[int, tuple[int, int]]
 
 
+_MALFORMED_INDEXED_MANIFEST_WARNING_KEYS: set[tuple[str, str]] = set()
+
+
+def _warn_malformed_indexed_manifest_record(ex: BaseException, idx: int, path: str | Path) -> None:
+    key = (str(path), type(ex).__name__)  # dedupe key: (manifest path, exception class name)
+    if key in _MALFORMED_INDEXED_MANIFEST_WARNING_KEYS:
+        return  # this path/error-type pair has already been reported
+    _MALFORMED_INDEXED_MANIFEST_WARNING_KEYS.add(key)  # module-level set, so state persists across calls
+    logging.warning(
+        "Skipping malformed indexed NeMo manifest records; "
+        f"first occurrence path={path!r} idx={idx} error={type(ex).__name__}: {ex}. "
+        "Further records with the same path/error type are suppressed in this worker."
+    )
+
 
 class LazyNeMoIterator(IteratorNode):
     """
@@ -134,6 +148,7 @@ def __init__(
         extra_fields: list[dict[str, str]] | None = None,
         indexed: bool = False,
         indexes_root: str | Path | None = None,
+        skip_missing_manifest_entries: bool = False,
     ) -> None:
         self.path = path
         self.shuffle_shards = shuffle_shards
@@ -144,6 +159,7 @@ def __init__(
         self.extra_fields = extra_fields
         self.indexed = indexed
         self.indexes_root = indexes_root
+        self.skip_missing_manifest_entries = skip_missing_manifest_entries
         validate_extra_fields(self.extra_fields)
         paths = expand_sharded_filepaths(path)
 
@@ -158,7 +174,13 @@ def __init__(
 
             seed = resolve_seed(shard_seed) if shard_seed not in (None, "trng", "randomized") else 0
             indexed_sources = [
-                LazyIndexedManifestIterator(p, index_path=index_file_path(p, indexes_root), decode=GraphOriginDict)
+                LazyIndexedManifestIterator(
+                    p,
+                    index_path=index_file_path(p, indexes_root),
+                    decode=GraphOriginDict,
+                    skip_decode_errors=skip_missing_manifest_entries,
+                    decode_error_callback=_warn_malformed_indexed_manifest_record,
+                )
                 for p in paths
             ]
             if len(indexed_sources) == 1:

From 0e35bab4261b233159f37424718981a6ce3f29e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Mon, 22 Jun 2026 13:56:39 -0700
Subject: [PATCH 26/30] Fix PR CI lint checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../common/data/lhotse/nemo_adapters.py       |  4 +-
 .../analyze_resumable_checkpoint.py           | 70 ++++++++++++++++---
 2 files changed, 60 insertions(+), 14 deletions(-)

diff --git a/nemo/collections/common/data/lhotse/nemo_adapters.py b/nemo/collections/common/data/lhotse/nemo_adapters.py
index 6ee97124240f..a0d39c0017fd 100644
--- a/nemo/collections/common/data/lhotse/nemo_adapters.py
+++ b/nemo/collections/common/data/lhotse/nemo_adapters.py
@@ -461,9 +461,7 @@ def __init__(
         self.tar_paths = expand_sharded_filepaths(tar_paths)
         tar_pattern = re.compile(r"audio[^/]*_(\d+)[^/]*\.tar")
         shard_keys, _ = _extract_unique_shard_keys(self.tar_paths, tar_pattern, path_kind="tar")
-        self.shard_id_to_tar_path: dict[ShardKey, str] = {
-            key: path for key, path in zip(shard_keys, self.tar_paths)
-        }
+        self.shard_id_to_tar_path: dict[ShardKey, str] = {key: path for key, path in zip(shard_keys, self.tar_paths)}
 
         self.shuffle_shards = shuffle_shards
         self.shard_seed = shard_seed
diff --git a/scripts/dataloading/analyze_resumable_checkpoint.py b/scripts/dataloading/analyze_resumable_checkpoint.py
index 716775d9d394..fa566b5b6494 100644
--- a/scripts/dataloading/analyze_resumable_checkpoint.py
+++ b/scripts/dataloading/analyze_resumable_checkpoint.py
@@ -1,4 +1,17 @@
 #!/usr/bin/env python
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """Analyze resumable Lhotse dataloader progress stored in a checkpoint.
 
 This tool answers two operational questions for indexed/resumable
@@ -221,6 +234,7 @@ def recurse(
         )
     return specs
 
+
 def extract_progress(payload: Any) -> tuple[list[LeafProgress], list[str]]:
     """Extract per-leaf dataloader progress from a loaded checkpoint payload.
 
@@ -240,14 +254,10 @@ def extract_progress(payload: Any) -> tuple[list[LeafProgress], list[str]]:
                 rank = entry.get("dp_rank", idx)
                 rank = rank if isinstance(rank, int) else idx
                 inner_state = entry.get("state", entry)
-                for sampler_path, sampler_state in _find_sampler_states(
-                    inner_state, f"{state_path}[{idx}].state"
-                ):
+                for sampler_path, sampler_state in _find_sampler_states(inner_state, f"{state_path}[{idx}].state"):
                     worker = _worker_from_path(sampler_path)
                     progress.extend(
-                        _collect_leaves_from_sampler(
-                            sampler_state, rank=rank, worker=worker, path=sampler_path
-                        )
+                        _collect_leaves_from_sampler(sampler_state, rank=rank, worker=worker, path=sampler_path)
                     )
     else:
         notes.append(f"no {STATEFUL_KEY!r} payload found; scanning for raw sampler_state entries")
@@ -259,6 +269,7 @@ def extract_progress(payload: Any) -> tuple[list[LeafProgress], list[str]]:
         notes.append(f"deduplicated {removed} duplicate leaf progress state(s)")
     return progress, notes
 
+
 def summarize(progress: list[LeafProgress], specs: list[DatasetSpec]) -> list[SummaryRow]:
     """Combine checkpoint progress and dataset specs into report rows.
 
@@ -334,6 +345,7 @@ def summarize(progress: list[LeafProgress], specs: list[DatasetSpec]) -> list[Su
         )
     return rows
 
+
 def load_checkpoint_payload(path: Path, *, allow_full_load: bool, max_full_load_mb: int) -> tuple[Any, Path]:
     """Load the smallest checkpoint payload that contains dataloader state.
 
@@ -361,6 +373,7 @@ def load_checkpoint_payload(path: Path, *, allow_full_load: bool, max_full_load_
     detail = "\n".join(errors[-10:])
     raise RuntimeError(f"Could not find dataloader state for {path}.\n{detail}")
 
+
 def load_config(path: Path | None, checkpoint: Path) -> tuple[dict[str, Any] | None, Path | None, list[str]]:
     """Load an explicit or nearby training config used to annotate the report."""
     notes = []
@@ -386,6 +399,7 @@ def load_config(path: Path | None, checkpoint: Path) -> tuple[dict[str, Any] | N
         notes.append(f"config not found or invalid: {path}")
     return None, None, notes
 
+
 def markdown_table(rows: list[SummaryRow]) -> str:
     """Render summary rows as a compact Markdown table for logs/stdout."""
     headers = [
@@ -425,6 +439,7 @@ def markdown_table(rows: list[SummaryRow]) -> str:
         lines.append("| " + " | ".join(values) + " |")
     return "\n".join(lines) + "\n"
 
+
 def write_outputs(summary: dict[str, Any], rows: list[SummaryRow], args: argparse.Namespace) -> None:
     """Write requested JSON, Markdown, and CSV artifacts."""
     output_dir = Path(args.output_dir) if args.output_dir else None
@@ -457,6 +472,7 @@ def write_outputs(summary: dict[str, Any], rows: list[SummaryRow], args: argpars
             for row in rows:
                 writer.writerow(asdict(row))
 
+
 def parse_args() -> argparse.Namespace:
     """Parse CLI flags for local or cluster-submitted analysis runs."""
     parser = argparse.ArgumentParser(description=__doc__)
@@ -476,6 +492,7 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument("--print-table", action="store_true", help="Print Markdown table to stdout.")
     return parser.parse_args()
 
+
 def main() -> int:
     """CLI entrypoint for loading inputs, computing rows, and writing outputs."""
     args = parse_args()
@@ -542,10 +559,12 @@ def _load_yaml(path: Path) -> Any:
     with path.open("r", encoding="utf-8") as f:
         return yaml.safe_load(f)
 
+
 def _load_json(path: Path) -> Any:
     with path.open("r", encoding="utf-8") as f:
         return json.load(f)
 
+
 def _safe_float(value: Any) -> float | None:
     if isinstance(value, bool) or value is None:
         return None
@@ -556,6 +575,7 @@ def _safe_float(value: Any) -> float | None:
     except (TypeError, ValueError):
         return None
 
+
 def _get_path(data: Any, dotted: str) -> Any:
     cur = data
     for part in dotted.split("."):
@@ -564,6 +584,7 @@ def _get_path(data: Any, dotted: str) -> Any:
         cur = cur.get(part)
     return cur
 
+
 def _normalize_weight_vector(weights: list[float], temperature: float = 1.0) -> list[float]:
     if not weights:
         return []
@@ -573,10 +594,12 @@ def _normalize_weight_vector(weights: list[float], temperature: float = 1.0) ->
         return [1.0 / len(weights)] * len(weights)
     return [w / total for w in scaled]
 
+
 def _strip_url_scheme(path: str) -> str:
     match = re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://(.+)$", path)
     return match.group(1) if match else path.lstrip("/")
 
+
 def _index_path_for(data_path: str, indexes_root: str | None) -> Path | None:
     if not indexes_root:
         if re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*://", data_path):
@@ -584,16 +607,13 @@ def _index_path_for(data_path: str, indexes_root: str | None) -> Path | None:
         return Path(data_path + ".idx")
     return Path(indexes_root) / (_strip_url_scheme(data_path) + ".idx")
 
+
 def _indexed_file_size(idx_path: Path) -> int | None:
     parent = str(idx_path.parent)
     entries = INDEX_DIR_CACHE.get(parent)
     if entries is None and parent not in INDEX_DIR_CACHE:
         try:
-            entries = {
-                entry.name: entry.stat().st_size
-                for entry in os.scandir(idx_path.parent)
-                if entry.is_file()
-            }
+            entries = {entry.name: entry.stat().st_size for entry in os.scandir(idx_path.parent) if entry.is_file()}
         except FileNotFoundError:
             INDEX_DIR_CACHE[parent] = None
             return None
@@ -602,6 +622,7 @@ def _indexed_file_size(idx_path: Path) -> int | None:
         return None
     return entries.get(idx_path.name)
 
+
 def _fallback_brace_expand(path: str) -> list[str]:
     match = BRACE_RANGE_PATTERN.search(path)
     if not match:
@@ -624,6 +645,7 @@ def _fallback_brace_expand(path: str) -> list[str]:
         expanded.extend(_fallback_brace_expand(path[: match.start()] + repl + path[match.end() :]))
     return expanded
 
+
 def _expand_op_path(path: str) -> list[str]:
     # Match NeMo expand_sharded_filepaths(): _OP_/_CL_ are aliases for brace ranges.
     sharded = path
@@ -638,6 +660,7 @@ def _expand_op_path(path: str) -> list[str]:
     except ImportError:
         return _fallback_brace_expand(sharded)
 
+
 def _flatten_path_values(value: Any) -> list[str]:
     if value is None:
         return []
@@ -660,6 +683,7 @@ def _flatten_path_values(value: Any) -> list[str]:
         return out
     return []
 
+
 def _count_indexed_items(paths: list[str], indexes_root: str | None) -> tuple[int | None, list[str]]:
     total = 0
     missing: list[str] = []
@@ -681,6 +705,7 @@ def _count_indexed_items(paths: list[str], indexes_root: str | None) -> tuple[in
             any_count = True
     return (total if any_count else None), missing
 
+
 def _resolve_ref(ref: str, *, data_blend_dir: str | None, current_dir: Path | None) -> Path:
     text = ref
     if data_blend_dir:
@@ -693,15 +718,18 @@ def _resolve_ref(ref: str, *, data_blend_dir: str | None, current_dir: Path | No
         return current_dir / path
     return Path.cwd() / path
 
+
 def _looks_like_yaml_ref(value: Any) -> bool:
     return isinstance(value, str) and value.endswith((".yaml", ".yml"))
 
+
 def _load_ref_if_yaml(value: Any, *, data_blend_dir: str | None, current_dir: Path | None) -> tuple[Any, Path | None]:
     if not _looks_like_yaml_ref(value):
         return value, current_dir
     path = _resolve_ref(value, data_blend_dir=data_blend_dir, current_dir=current_dir)
     return _load_yaml(path), path.parent
 
+
 def _source_path_groups_for_item(item: dict[str, Any]) -> list[list[str]]:
     groups: list[list[str]] = []
     keys = [
@@ -730,10 +758,12 @@ def _source_path_groups_for_item(item: dict[str, Any]) -> list[list[str]]:
             groups.append(paths)
     return groups
 
+
 def _source_paths_for_item(item: dict[str, Any]) -> list[str]:
     groups = _source_path_groups_for_item(item)
     return groups[0] if groups else []
 
+
 def _dataset_name(item: dict[str, Any], source_path: str | None, fallback_index: int) -> str:
     pieces = []
     for key in ("corpus", "language", "dataset", "name", "type"):
@@ -746,6 +776,7 @@ def _dataset_name(item: dict[str, Any], source_path: str | None, fallback_index:
         return source_path
     return f"source-{fallback_index}"
 
+
 def _temperature_list(train_ds: dict[str, Any]) -> list[float] | None:
     value = train_ds.get("reweight_temperature")
     if value is None:
@@ -756,6 +787,7 @@ def _temperature_list(train_ds: dict[str, Any]) -> list[float] | None:
         return [float(v) for v in value]
     return None
 
+
 def _iter_children(obj: Any, path: str = "$") -> Iterable[tuple[str, Any]]:
     if isinstance(obj, dict):
         for key, value in obj.items():
@@ -764,6 +796,7 @@ def _iter_children(obj: Any, path: str = "$") -> Iterable[tuple[str, Any]]:
         for idx, value in enumerate(obj):
             yield f"{path}[{idx}]", value
 
+
 def _find_stateful_payloads(obj: Any, path: str = "$") -> list[tuple[str, list[Any]]]:
     found: list[tuple[str, list[Any]]] = []
     if isinstance(obj, dict):
@@ -777,6 +810,7 @@ def _find_stateful_payloads(obj: Any, path: str = "$") -> list[tuple[str, list[A
             found.extend(_find_stateful_payloads(child, child_path))
     return found
 
+
 def _find_sampler_states(obj: Any, path: str = "$") -> list[tuple[str, dict[str, Any]]]:
     found: list[tuple[str, dict[str, Any]]] = []
     if isinstance(obj, dict):
@@ -794,12 +828,14 @@ def _find_sampler_states(obj: Any, path: str = "$") -> list[tuple[str, dict[str,
             found.extend(_find_sampler_states(child, child_path))
     return found
 
+
 def _worker_from_path(path: str) -> str | None:
     match = re.search(r"worker[_-]?(\d+)", path)
     if match:
         return match.group(1)
     return None
 
+
 def _state_total_len(state: dict[str, Any]) -> int | None:
     range_state = state.get("range")
     if isinstance(range_state, dict):
@@ -809,6 +845,7 @@ def _state_total_len(state: dict[str, Any]) -> int | None:
     n = state.get("total_len") or state.get("_total_len") or state.get("n")
     return int(n) if isinstance(n, int) and n >= 0 else None
 
+
 def _leaf_from_state(
     source_index: int,
     rank: int | None,
@@ -840,6 +877,7 @@ def _leaf_from_state(
         state_path=path,
     )
 
+
 def _collect_leaf_states(tree: Any, *, rank: int | None, worker: str | None, path: str = "$") -> list[LeafProgress]:
     leaves: list[LeafProgress] = []
 
@@ -868,6 +906,7 @@ def walk(node: Any, node_path: str) -> None:
     walk(tree, path)
     return leaves
 
+
 def _collect_leaves_from_sampler(
     sampler_state: dict[str, Any], *, rank: int | None, worker: str | None, path: str
 ) -> list[LeafProgress]:
@@ -895,6 +934,7 @@ def _collect_leaves_from_sampler(
             leaf.source_index = idx
     return leaves
 
+
 def _deduplicate_progress(progress: list[LeafProgress]) -> tuple[list[LeafProgress], int]:
     deduped: list[LeafProgress] = []
     seen: set[tuple[Any, ...]] = set()
@@ -906,6 +946,7 @@ def _deduplicate_progress(progress: list[LeafProgress]) -> tuple[list[LeafProgre
         deduped.append(leaf)
     return deduped, len(progress) - len(deduped)
 
+
 def _shard_len(total_len: int, shard_id: int | None, num_shards: int | None) -> int | None:
     if shard_id is None or num_shards is None or num_shards <= 0:
         return None
@@ -913,6 +954,7 @@ def _shard_len(total_len: int, shard_id: int | None, num_shards: int | None) ->
         return 0
     return (total_len - shard_id + num_shards - 1) // num_shards
 
+
 def _consumed_items(leaf: LeafProgress, total_len: int | None) -> int | None:
     total = leaf.total_len if leaf.total_len is not None else total_len
     if total is None:
@@ -924,6 +966,7 @@ def _consumed_items(leaf: LeafProgress, total_len: int | None) -> int | None:
         return leaf.position if leaf.epoch == 0 else None
     return leaf.epoch * shard_len + leaf.position
 
+
 def _eval_step_candidates(path: Path) -> list[Path]:
     match = EVAL_STEP_PATTERN.fullmatch(path.name)
     if not match:
@@ -937,6 +980,7 @@ def _eval_step_candidates(path: Path) -> list[Path]:
         ckpt_dir / f"step-{step}-last.ckpt",
     ]
 
+
 def _checkpoint_metadata_candidates(path: Path) -> list[Path]:
     candidates: list[Path] = []
     if path.is_dir():
@@ -962,6 +1006,7 @@ def _checkpoint_metadata_candidates(path: Path) -> list[Path]:
             deduped.append(candidate)
     return deduped
 
+
 def _torch_load(path: Path) -> Any:
     import torch
 
@@ -970,6 +1015,7 @@ def _torch_load(path: Path) -> Any:
     except TypeError:
         return torch.load(path, map_location="cpu")
 
+
 def _auto_config_candidates(checkpoint: Path) -> list[Path]:
     candidates = []
     roots = [checkpoint]
@@ -997,9 +1043,11 @@ def _auto_config_candidates(checkpoint: Path) -> list[Path]:
             deduped.append(candidate)
     return deduped
 
+
 def _fmt_pct(value: float | None) -> str:
     return "" if value is None else f"{value:.2f}%"
 
+
 def _fmt_float(value: float | None) -> str:
     return "" if value is None else f"{value:.6g}"
 

From a6278c92ca11d6c9cf6e3206d402457dba026ef6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Mon, 22 Jun 2026 14:42:10 -0700
Subject: [PATCH 27/30] Pin Lhotse to 2.0.0a2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 pyproject.toml | 12 ++++++------
 uv.lock        | 18 +++++++++---------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ad7711a3d582..3e233a9e2816 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -115,7 +115,7 @@ asr-only = [
     "braceexpand",
     "einops",
     "kaldialign",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.1",
     "packaging",
     "sacrebleu",
@@ -138,7 +138,7 @@ tts = [
     "pyopenjtalk",
     "braceexpand",
     "kaldialign",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.1",
     "packaging",
     "sacrebleu",
@@ -161,7 +161,7 @@ tts = [
 
 audio = [
     "einops",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.0",
     "matplotlib",
     "pesq; (platform_machine != 'x86_64' or platform_system != 'Darwin')",
@@ -206,7 +206,7 @@ all = [
     "sentencepiece<1.0.0",
     "braceexpand",
     "kaldialign",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.1",
     "packaging",
     "sacrebleu",
@@ -288,7 +288,7 @@ asr = [
     "braceexpand",
     "einops",
     "kaldialign",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.1",
     "packaging",
     "sacrebleu",
@@ -317,7 +317,7 @@ speechlm2 = [
     "braceexpand",
     "einops",
     "kaldialign",
-    "lhotse>=1.33.0",
+    "lhotse==2.0.0a2",
     "librosa>=0.10.1",
     "packaging",
     "sacrebleu",
diff --git a/uv.lock b/uv.lock
index a6be21cea190..a5d7b5c0646a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -3002,7 +3002,7 @@ wheels = [
 
 [[package]]
 name = "lhotse"
-version = "1.33.0"
+version = "2.0.0a2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "audioread" },
@@ -3021,9 +3021,9 @@ dependencies = [
     { name = "torch", version = "2.12.0+cu132", source = { registry = "https://download.pytorch.org/whl/cu132" }, marker = "(sys_platform == 'linux' and extra != 'extra-12-nemo-toolkit-compiled-a100' and extra == 'extra-12-nemo-toolkit-cu13') or (sys_platform == 'linux' and extra != 'extra-12-nemo-toolkit-compiled' and extra == 'extra-12-nemo-toolkit-cu13') or (extra == 'extra-12-nemo-toolkit-cu12' and extra == 'extra-12-nemo-toolkit-cu13') or (extra == 'extra-12-nemo-toolkit-compiled' and extra == 'extra-12-nemo-toolkit-compiled-a100')" },
     { name = "tqdm" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/9d/5a/b606c87b0a50322200aafb0f0682e719890bf0f045152b53e161090a6e8f/lhotse-1.33.0.tar.gz", hash = "sha256:3e91fca8531fc4c1798d0a6de1b3c7ea6bf2e181df70e5985927a131761c67f5", size = 686482, upload-time = "2026-04-20T13:11:08.579Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ed/2c/8235b4820a0a22402f6ce1ed3b3ba9de2c50a82704f63ca2d63bb7ae65bb/lhotse-2.0.0a2.tar.gz", hash = "sha256:f0fe285179060f5bcd96a289a8c3238d623c4842450f3acbdbd81ed08086de28", size = 736543, upload-time = "2026-06-22T21:37:30.175Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ad/e2/fbcb65dfed851f28ea15eca62cf449bc0b36378b005e6bec720714a9fb19/lhotse-1.33.0-py3-none-any.whl", hash = "sha256:8697bc74a8f3101594fca5661c7318c30899f3fdb132a44c7e99e794be6ac061", size = 903925, upload-time = "2026-04-20T13:11:07.027Z" },
+    { url = "https://files.pythonhosted.org/packages/73/a1/7d5d04df1a815866681bd133d3653530de15b5a44f10108bdb3b60505c75/lhotse-2.0.0a2-py3-none-any.whl", hash = "sha256:a20d934727929c42715d8d3a7f0aa68b8f7f89509dc8104bed6108a508aac9cf", size = 959069, upload-time = "2026-06-22T21:37:28.51Z" },
 ]
 
 [[package]]
@@ -4579,12 +4579,12 @@ requires-dist = [
     { name = "kaldialign", marker = "extra == 'asr-only'" },
     { name = "kaldialign", marker = "extra == 'speechlm2'" },
     { name = "kaldialign", marker = "extra == 'tts'" },
-    { name = "lhotse", marker = "extra == 'all'", specifier = ">=1.33.0" },
-    { name = "lhotse", marker = "extra == 'asr'", specifier = ">=1.33.0" },
-    { name = "lhotse", marker = "extra == 'asr-only'", specifier = ">=1.33.0" },
-    { name = "lhotse", marker = "extra == 'audio'", specifier = ">=1.33.0" },
-    { name = "lhotse", marker = "extra == 'speechlm2'", specifier = ">=1.33.0" },
-    { name = "lhotse", marker = "extra == 'tts'", specifier = ">=1.33.0" },
+    { name = "lhotse", marker = "extra == 'all'", specifier = "==2.0.0a2" },
+    { name = "lhotse", marker = "extra == 'asr'", specifier = "==2.0.0a2" },
+    { name = "lhotse", marker = "extra == 'asr-only'", specifier = "==2.0.0a2" },
+    { name = "lhotse", marker = "extra == 'audio'", specifier = "==2.0.0a2" },
+    { name = "lhotse", marker = "extra == 'speechlm2'", specifier = "==2.0.0a2" },
+    { name = "lhotse", marker = "extra == 'tts'", specifier = "==2.0.0a2" },
     { name = "librosa", marker = "extra == 'all'" },
     { name = "librosa", marker = "extra == 'all'", specifier = ">=0.10.0" },
     { name = "librosa", marker = "extra == 'all'", specifier = ">=0.10.1" },

From 3c2ad5d5c6c3692e30a488d786f47dea18fc6a79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Mon, 22 Jun 2026 14:55:30 -0700
Subject: [PATCH 28/30] Fix speechlm2 dataset docs list formatting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 docs/source/speechlm2/datasets.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/speechlm2/datasets.rst b/docs/source/speechlm2/datasets.rst
index f408458b86c3..505d0c7ce5b1 100644
--- a/docs/source/speechlm2/datasets.rst
+++ b/docs/source/speechlm2/datasets.rst
@@ -292,6 +292,7 @@ The DataModule class in the speechlm2 collection manages dataset loading, prepar
     )
 
 The DataModule takes care of:
+
 1. Setting up proper data parallel ranks for dataloaders
 2. Instantiating the dataloaders with configuration from YAML
 3. Managing multiple datasets for validation/testing

From 26901b5b18626d862ff6e9dbba60599a9465a638 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Tue, 23 Jun 2026 04:09:07 -0700
Subject: [PATCH 29/30] Fix dataloader iterator resume after skipped validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 nemo/utils/exp_manager.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/nemo/utils/exp_manager.py b/nemo/utils/exp_manager.py
index bf89e7aa3c11..202d6cb8bd35 100644
--- a/nemo/utils/exp_manager.py
+++ b/nemo/utils/exp_manager.py
@@ -20,6 +20,7 @@
 import time
 import warnings
 from collections import defaultdict
+from copy import deepcopy
 from dataclasses import dataclass, field
 from datetime import timedelta
 from pathlib import Path
@@ -1564,9 +1565,23 @@ def _save_last_checkpoint_and_exit(trainer: lightning.pytorch.Trainer, reason: s
 
 def configure_no_restart_validation_training_loop(trainer: lightning.pytorch.Trainer) -> None:
     """configure_no_restart_validation_training_loop"""
-    if type(trainer.fit_loop.epoch_loop) != _TrainingEpochLoop:
+    if type(trainer.fit_loop.epoch_loop) is not _TrainingEpochLoop:
         warnings.warn("Detected custom epoch loop. Skipping no validation on restart support.", UserWarning)
         return
+
+    fit_loop = trainer.fit_loop
+    if not getattr(fit_loop, "_nemo_restart_loader_state_cache_installed", False):
+        original_load_combined_loader_states = fit_loop._load_combined_loader_states
+
+        def _load_combined_loader_states_with_cache() -> None:
+            states = getattr(fit_loop, "_combined_loader_states_to_load", None)
+            if getattr(fit_loop, "restarting", False) and states:  # cache only non-empty state on restart
+                fit_loop._nemo_restart_combined_loader_states = deepcopy(states)  # snapshot before delegating
+            original_load_combined_loader_states()  # NOTE(review): presumably consumes pending states; confirm
+
+        fit_loop._load_combined_loader_states = _load_combined_loader_states_with_cache
+        fit_loop._nemo_restart_loader_state_cache_installed = True
+
     # Pass trainer object to avoid trainer getting overwritten as None
     loop = SkipResumeTrainingValidationLoop(trainer, trainer.min_steps, trainer.max_steps)
     trainer.fit_loop.epoch_loop = loop
@@ -1588,11 +1603,23 @@ def advance(self, data_fetcher) -> None:
         """Skip restart validation without replaying an already-completed train batch."""
         if self.restarting and super()._should_check_val_fx(data_fetcher):
             logging.info("Skipping restart validation without replaying a completed training batch")
+            self._reload_unconsumed_restart_dataloader_state()
             self._skip_resume_validation_once = True
             self.restarting = False
             return
         super().advance(data_fetcher)
 
+    def _reload_unconsumed_restart_dataloader_state(self) -> None:
+        """Reapply the checkpoint dataloader cursor after skipping restart validation."""
+        fit_loop = self.trainer.fit_loop
+        states = getattr(fit_loop, "_nemo_restart_combined_loader_states", None)
+        combined_loader = getattr(fit_loop, "_combined_loader", None)
+        if not states or combined_loader is None or not hasattr(combined_loader, "_load_state_dicts"):
+            return  # no cached states, or no combined loader able to take them: nothing to reapply
+
+        combined_loader._load_state_dicts(deepcopy(states))  # load a copy, not the cached object itself
+        fit_loop._nemo_restart_combined_loader_states = None  # one-shot: clear the cache once applied
+
     def on_advance_end(self, data_fetcher) -> None:
         """Clear the one-shot restart-validation skip after normal epoch-loop bookkeeping."""
         try:

From a4638fab67952bf16b795b2a86a6747d82d17929 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20=C5=BBelasko?= <pzelasko@nvidia.com>
Date: Tue, 23 Jun 2026 09:27:00 -0700
Subject: [PATCH 30/30] Fix ASR Lhotse AIS batch loader tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Piotr Żelasko <pzelasko@nvidia.com>
---
 .../asr/data/audio_to_text_lhotse.py          | 49 +++++++++++------
 .../asr/data/audio_to_text_lhotse_prompted.py | 23 ++------
 .../asr/test_asr_lhotse_dataset.py            | 52 ++++++++++++++++---
 3 files changed, 82 insertions(+), 42 deletions(-)

diff --git a/nemo/collections/asr/data/audio_to_text_lhotse.py b/nemo/collections/asr/data/audio_to_text_lhotse.py
index 38ce7a0e8f9e..ec9b92045bb5 100644
--- a/nemo/collections/asr/data/audio_to_text_lhotse.py
+++ b/nemo/collections/asr/data/audio_to_text_lhotse.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import os
 from typing import Dict, Optional, Tuple
 
@@ -55,23 +56,10 @@ def __init__(self, tokenizer: TokenizerSpec, return_cuts: bool = False):
         self.use_ais_get_batch = os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true"
         self.ais_force_individual = os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true"
 
-        # Try to use use_batch_loader if available (Lhotse >= 1.32.0)
-        try:
-            self.load_audio = AudioSamples(
-                fault_tolerant=True,
-                use_batch_loader=self.use_ais_get_batch,
-                ais_force_individual=self.ais_force_individual,
-            )
-        except TypeError:
-            # Lhotse < 1.32.0 doesn't support use_batch_loader
-            if self.use_ais_get_batch:
-                import logging
-
-                logging.warning(
-                    "AIS batch loading requested but not supported by this Lhotse version. "
-                    "Please upgrade to Lhotse >= 1.32.0"
-                )
-            self.load_audio = AudioSamples(fault_tolerant=True)
+        self.load_audio = _make_audio_samples(
+            use_batch_loader=self.use_ais_get_batch,
+            ais_force_individual=self.ais_force_individual,
+        )
 
         self.return_cuts = return_cuts
 
@@ -92,3 +80,30 @@ def __getitem__(self, cuts) -> Tuple[torch.Tensor, ...]:
         if self.return_cuts:
             return audio, audio_lens, tokens, token_lens, cuts.drop_in_memory_data()
         return audio, audio_lens, tokens, token_lens
+
+
+def _make_audio_samples(use_batch_loader: bool, ais_force_individual: bool) -> AudioSamples:
+    """Build a fault-tolerant AudioSamples, retrying without AIS kwargs that the installed Lhotse rejects."""
+    kwargs = {
+        "fault_tolerant": True,
+        "use_batch_loader": use_batch_loader,
+        "ais_force_individual": ais_force_individual,
+    }
+    try:
+        return AudioSamples(**kwargs)
+    except TypeError as exc:
+        if "ais_force_individual" in str(exc):  # this kwarg was rejected: retry without it
+            kwargs.pop("ais_force_individual")
+            try:
+                return AudioSamples(**kwargs)
+            except TypeError as retry_exc:
+                exc = retry_exc  # judge the fallback below by the latest failure
+        if "use_batch_loader" not in str(exc):
+            raise exc  # unrelated TypeError: surface the latest failure, not the already-handled one
+
+        if use_batch_loader:
+            logging.warning(
+                "AIS batch loading requested but not supported by this Lhotse version. "
+                "Please upgrade to Lhotse >= 1.32.0"
+            )
+        return AudioSamples(fault_tolerant=True)
diff --git a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
index 2fa10edd8cf9..a4689fd177b4 100644
--- a/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
+++ b/nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
@@ -18,9 +18,9 @@
 import torch.utils.data
 from lhotse import CutSet
 from lhotse.cut import MixedCut
-from lhotse.dataset import AudioSamples
 from lhotse.dataset.collation import collate_vectors
 
+from nemo.collections.asr.data.audio_to_text_lhotse import _make_audio_samples
 from nemo.collections.common.data import apply_prompt_format_fn
 from nemo.collections.common.prompts import PromptFormatter
 from nemo.collections.common.tokenizers import TokenizerSpec
@@ -85,23 +85,10 @@ def __init__(
         self.use_ais_get_batch = os.environ.get("USE_AIS_GET_BATCH", "False").lower() == "true"
         self.ais_force_individual = os.environ.get("USE_AIS_INDIVIDUAL_GETS", "False").lower() == "true"
 
-        # Try to use use_batch_loader if available (Lhotse >= 1.32.0)
-        try:
-            self.load_audio = AudioSamples(
-                fault_tolerant=True,
-                use_batch_loader=self.use_ais_get_batch,
-                ais_force_individual=self.ais_force_individual,
-            )
-        except TypeError:
-            # Lhotse < 1.32.0 doesn't support use_batch_loader
-            if self.use_ais_get_batch:
-                import logging
-
-                logging.warning(
-                    "AIS batch loading requested but not supported by this Lhotse version. "
-                    "Please upgrade to Lhotse >= 1.32.0"
-                )
-            self.load_audio = AudioSamples(fault_tolerant=True)
+        self.load_audio = _make_audio_samples(
+            use_batch_loader=self.use_ais_get_batch,
+            ais_force_individual=self.ais_force_individual,
+        )
 
         self.padding_value = self.tokenizer.pad_id
         self.prompt = prompt
diff --git a/tests/collections/asr/test_asr_lhotse_dataset.py b/tests/collections/asr/test_asr_lhotse_dataset.py
index 618eb2d89c30..4d7edba2dc3f 100644
--- a/tests/collections/asr/test_asr_lhotse_dataset.py
+++ b/tests/collections/asr/test_asr_lhotse_dataset.py
@@ -105,29 +105,45 @@ def test_lhotse_asr_dataset_metadata(tokenizer):
 def test_lhotse_asr_dataset_ais_batch_loading_enabled(tokenizer, monkeypatch):
     """Test that USE_AIS_GET_BATCH=true passes use_batch_loader=True to AudioSamples."""
     monkeypatch.setenv("USE_AIS_GET_BATCH", "true")
+    monkeypatch.delenv("USE_AIS_INDIVIDUAL_GETS", raising=False)
 
     with patch.object(AudioSamples, "__init__", return_value=None) as mock_init:
         mock_init.side_effect = lambda *args, **kwargs: None
         try:
-            dataset = LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
+            LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
         except Exception:
             pass
         # Check that AudioSamples was called with use_batch_loader=True
-        mock_init.assert_called_with(fault_tolerant=True, use_batch_loader=True)
+        mock_init.assert_called_with(fault_tolerant=True, use_batch_loader=True, ais_force_individual=False)
+
+
+def test_lhotse_asr_dataset_ais_batch_loading_force_individual(tokenizer, monkeypatch):
+    """Test that USE_AIS_INDIVIDUAL_GETS=true is passed to AudioSamples."""
+    monkeypatch.setenv("USE_AIS_GET_BATCH", "true")
+    monkeypatch.setenv("USE_AIS_INDIVIDUAL_GETS", "true")
+
+    with patch.object(AudioSamples, "__init__", return_value=None) as mock_init:
+        mock_init.side_effect = lambda *args, **kwargs: None  # accept any kwargs and do nothing
+        try:
+            LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
+        except Exception:
+            pass  # NOTE(review): swallows any constructor error; only the AudioSamples call is checked
+        mock_init.assert_called_with(fault_tolerant=True, use_batch_loader=True, ais_force_individual=True)
 
 
 def test_lhotse_asr_dataset_ais_batch_loading_disabled(tokenizer, monkeypatch):
     """Test that without USE_AIS_GET_BATCH, use_batch_loader=False is passed to AudioSamples."""
     monkeypatch.delenv("USE_AIS_GET_BATCH", raising=False)
+    monkeypatch.delenv("USE_AIS_INDIVIDUAL_GETS", raising=False)
 
     with patch.object(AudioSamples, "__init__", return_value=None) as mock_init:
         mock_init.side_effect = lambda *args, **kwargs: None
         try:
-            dataset = LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
+            LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
         except Exception:
             pass
         # Check that AudioSamples was called with use_batch_loader=False
-        mock_init.assert_called_with(fault_tolerant=True, use_batch_loader=False)
+        mock_init.assert_called_with(fault_tolerant=True, use_batch_loader=False, ais_force_individual=False)
 
 
 def test_lhotse_asr_dataset_ais_batch_loading_fallback(tokenizer, monkeypatch):
@@ -145,8 +161,30 @@ def mock_init(self, *args, **kwargs):
         return original_init(self, *args, **kwargs)
 
     with patch.object(AudioSamples, "__init__", mock_init):
-        dataset = LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
+        LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
 
-    # First call should have use_batch_loader=True, second call should not
-    assert call_args[0] == {"fault_tolerant": True, "use_batch_loader": True}
+    # First call should use AIS batch options, second call should not.
+    assert call_args[0] == {"fault_tolerant": True, "use_batch_loader": True, "ais_force_individual": False}
     assert call_args[1] == {"fault_tolerant": True}
+
+
+def test_lhotse_asr_dataset_ais_force_individual_fallback(tokenizer, monkeypatch):
+    """Test fallback when Lhotse supports use_batch_loader but not ais_force_individual."""
+    monkeypatch.setenv("USE_AIS_GET_BATCH", "true")
+    monkeypatch.setenv("USE_AIS_INDIVIDUAL_GETS", "true")
+
+    call_args = []  # kwargs of every AudioSamples.__init__ call, in call order
+
+    original_init = AudioSamples.__init__
+
+    def mock_init(self, *args, **kwargs):
+        call_args.append(kwargs.copy())
+        if "ais_force_individual" in kwargs:  # emulate an AudioSamples that rejects this kwarg
+            raise TypeError("unexpected keyword argument 'ais_force_individual'")
+        return original_init(self, *args, **kwargs)
+
+    with patch.object(AudioSamples, "__init__", mock_init):
+        LhotseSpeechToTextBpeDataset(tokenizer=tokenizer)
+
+    assert call_args[0] == {"fault_tolerant": True, "use_batch_loader": True, "ais_force_individual": True}
+    assert call_args[1] == {"fault_tolerant": True, "use_batch_loader": True}  # retry drops only that kwarg