From 6caf4aa53594688420af40b231c38676577f8d60 Mon Sep 17 00:00:00 2001
From: Vadym Prokopov <vprokopov@sohosquared.com>
Date: Fri, 26 Jun 2026 11:30:14 -0500
Subject: [PATCH 1/2] fix(asr): clamp diarization cluster count to
 max_num_speakers

For short sessions, SpeakerClustering.forward_infer estimates the speaker
count via getEnhancedSpeakerCount(), which constructs NMESC with
max_num_speakers=emb.shape[0] (the number of embedding segments) instead of
the configured max_num_speakers. The resulting est_num_of_spk_enhanced is
then consumed in forward_unit_infer without re-applying the limit, so a
short audio file can be clustered into more speakers than max_num_speakers
allows.

Clamp n_clusters to max_num_speakers after the speaker count is selected.
This is a no-op for the oracle and standard NME estimation paths (both
already bounded by max_num_speakers) and fixes the over-counting that can
occur on the enhanced-count path.

Signed-off-by: Vadym Prokopov <vprokopov@sohosquared.com>
---
 nemo/collections/asr/parts/utils/offline_clustering.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nemo/collections/asr/parts/utils/offline_clustering.py b/nemo/collections/asr/parts/utils/offline_clustering.py
index 71291a665bcf..9a2de44dbed0 100644
--- a/nemo/collections/asr/parts/utils/offline_clustering.py
+++ b/nemo/collections/asr/parts/utils/offline_clustering.py
@@ -1248,6 +1248,8 @@ def forward_unit_infer(
         else:
             n_clusters = int(est_num_of_spk.item())
 
+        n_clusters = min(n_clusters, max_num_speakers)
+
         spectral_model = SpectralClustering(
             n_clusters=n_clusters, n_random_trials=kmeans_random_trials, cuda=self.cuda, device=self.device
         )

From b82819b701ba73b602bb076be2b0e6b615907543 Mon Sep 17 00:00:00 2001
From: Vadym Prokopov <vprokopov@sohosquared.com>
Date: Fri, 26 Jun 2026 11:53:44 -0500
Subject: [PATCH 2/2] test(asr): assert clustering caps clusters at
 max_num_speakers

getEnhancedSpeakerCount estimates the speaker count with
max_num_speakers=emb.shape[0], so for short sessions est_num_of_spk_enhanced
can exceed the requested max_num_speakers. Add a CPU unit test that calls
SpeakerClustering.forward_unit_infer with an enhanced count larger than
max_num_speakers and asserts the number of output clusters is capped at
max_num_speakers. Fails before the clamp fix (returns 8 clusters) and passes
after it (at most 2 or 3 clusters, matching the parametrized limit).

Signed-off-by: Vadym Prokopov <vprokopov@sohosquared.com>
---
 .../speaker_tasks/utils/test_diar_utils.py    | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/collections/speaker_tasks/utils/test_diar_utils.py b/tests/collections/speaker_tasks/utils/test_diar_utils.py
index 71ae2dc16d8e..3a4ec89b8e17 100644
--- a/tests/collections/speaker_tasks/utils/test_diar_utils.py
+++ b/tests/collections/speaker_tasks/utils/test_diar_utils.py
@@ -840,6 +840,42 @@ def test_offline_speaker_clustering_very_short_cpu(
         assert Y_out.shape[0] == mc[-1]
         assert all(permuted_Y == gt)
 
+    @pytest.mark.run_only_on('CPU')
+    @pytest.mark.unit
+    @pytest.mark.parametrize("max_num_speakers", [2, 3])
+    @pytest.mark.parametrize("est_num_of_spk_enhanced", [8])
+    @pytest.mark.parametrize("n_spks, spk_dur, seed", [(8, 1.0, 0)])
+    def test_offline_speaker_clustering_enhanced_count_respects_max_num_speakers_cpu(
+        self,
+        max_num_speakers,
+        est_num_of_spk_enhanced,
+        n_spks,
+        spk_dur,
+        seed,
+    ):
+        """Check that ``forward_unit_infer`` caps the cluster count at ``max_num_speakers``.
+
+        ``getEnhancedSpeakerCount`` estimates the count with ``max_num_speakers=emb.shape[0]``,
+        so for short sessions the enhanced estimate can exceed the requested limit. This test
+        passes an enhanced count above ``max_num_speakers`` and expects the limit to still hold.
+        """
+        emb, seg_ts, seg_counts, _mw, _spk_ts, _gt = generate_toy_data(
+            n_spks=n_spks, spk_dur=spk_dur, perturb_sigma=0.1, torch_seed=seed
+        )
+        scale_embs, _ = split_input_data(emb, seg_ts, seg_counts)
+        affinity = getCosAffinityMatrix(scale_embs[-1])
+        clusterer = SpeakerClustering(maj_vote_spk_count=False, min_samples_for_nmesc=0, cuda=False)
+        # Inject an enhanced estimate that is larger than the requested max_num_speakers.
+        enhanced = torch.tensor(est_num_of_spk_enhanced)
+        labels = clusterer.forward_unit_infer(
+            mat=affinity, oracle_num_speakers=-1, max_num_speakers=max_num_speakers, est_num_of_spk_enhanced=enhanced
+        )
+        # Every segment must receive exactly one label.
+        assert labels.shape[0] == affinity.shape[0]
+        # The number of distinct labels must stay within the limit despite the larger estimate.
+        distinct_speakers = set(labels.tolist())
+        assert len(distinct_speakers) <= max_num_speakers
+
     @pytest.mark.run_only_on('GPU')
     @pytest.mark.unit
     @pytest.mark.parametrize("spk_dur", [0.25, 0.5, 0.75, 1, 2, 4])