From 6caf4aa53594688420af40b231c38676577f8d60 Mon Sep 17 00:00:00 2001 From: Vadym Prokopov Date: Fri, 26 Jun 2026 11:30:14 -0500 Subject: [PATCH 1/2] fix(asr): clamp diarization cluster count to max_num_speakers For short sessions, SpeakerClustering.forward_infer estimates the speaker count via getEnhancedSpeakerCount(), which constructs NMESC with max_num_speakers=emb.shape[0] (the number of embedding segments) instead of the configured max_num_speakers. The resulting est_num_of_spk_enhanced is then consumed in forward_unit_infer without re-applying the limit, so a short audio file can be clustered into more speakers than max_num_speakers allows. Clamp n_clusters to max_num_speakers after the speaker count is selected. This is a no-op for the oracle and standard NME estimation paths (both already bounded by max_num_speakers) and fixes the over-counting that can occur on the enhanced-count path. Signed-off-by: Vadym Prokopov --- nemo/collections/asr/parts/utils/offline_clustering.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo/collections/asr/parts/utils/offline_clustering.py b/nemo/collections/asr/parts/utils/offline_clustering.py index 71291a665bcf..9a2de44dbed0 100644 --- a/nemo/collections/asr/parts/utils/offline_clustering.py +++ b/nemo/collections/asr/parts/utils/offline_clustering.py @@ -1248,6 +1248,8 @@ def forward_unit_infer( else: n_clusters = int(est_num_of_spk.item()) + n_clusters = min(n_clusters, max_num_speakers) + spectral_model = SpectralClustering( n_clusters=n_clusters, n_random_trials=kmeans_random_trials, cuda=self.cuda, device=self.device ) From b82819b701ba73b602bb076be2b0e6b615907543 Mon Sep 17 00:00:00 2001 From: Vadym Prokopov Date: Fri, 26 Jun 2026 11:53:44 -0500 Subject: [PATCH 2/2] test(asr): assert clustering caps clusters at max_num_speakers getEnhancedSpeakerCount estimates the speaker count with max_num_speakers=emb.shape[0], so for short sessions est_num_of_spk_enhanced can exceed the requested max_num_speakers. 
Add a CPU unit test that calls SpeakerClustering.forward_unit_infer with an enhanced count larger than max_num_speakers and asserts the number of output clusters is capped at max_num_speakers. Fails before the clamp fix (returns 8 clusters), passes after (at most 2 or 3 clusters, matching the parametrized max_num_speakers values). Signed-off-by: Vadym Prokopov --- .../speaker_tasks/utils/test_diar_utils.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/collections/speaker_tasks/utils/test_diar_utils.py b/tests/collections/speaker_tasks/utils/test_diar_utils.py index 71ae2dc16d8e..3a4ec89b8e17 100644 --- a/tests/collections/speaker_tasks/utils/test_diar_utils.py +++ b/tests/collections/speaker_tasks/utils/test_diar_utils.py @@ -840,6 +840,42 @@ def test_offline_speaker_clustering_very_short_cpu( assert Y_out.shape[0] == mc[-1] assert all(permuted_Y == gt) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + @pytest.mark.parametrize("max_num_speakers", [2, 3]) + @pytest.mark.parametrize("est_num_of_spk_enhanced", [8]) + @pytest.mark.parametrize("n_spks, spk_dur, seed", [(8, 1.0, 0)]) + def test_offline_speaker_clustering_enhanced_count_respects_max_num_speakers_cpu( + self, + max_num_speakers, + est_num_of_spk_enhanced, + n_spks, + spk_dur, + seed, + ): + """For short sessions the enhanced speaker count from ``getEnhancedSpeakerCount`` is + estimated with ``max_num_speakers=emb.shape[0]``, so it can exceed the requested + ``max_num_speakers``. ``forward_unit_infer`` must cap the final number of clusters at + ``max_num_speakers`` instead of honoring the enhanced estimate blindly. Here we feed an + enhanced count larger than ``max_num_speakers`` and assert the cap is respected. 
+ """ + em, ts, mc, mw, spk_ts, gt = generate_toy_data( + n_spks=n_spks, spk_dur=spk_dur, perturb_sigma=0.1, torch_seed=seed + ) + embs_in_scales, _ = split_input_data(em, ts, mc) + affinity_mat = getCosAffinityMatrix(embs_in_scales[-1]) + offline_speaker_clustering = SpeakerClustering(maj_vote_spk_count=False, min_samples_for_nmesc=0, cuda=False) + Y_out = offline_speaker_clustering.forward_unit_infer( + mat=affinity_mat, + oracle_num_speakers=-1, + max_num_speakers=max_num_speakers, + est_num_of_spk_enhanced=torch.tensor(est_num_of_spk_enhanced), + ) + # One label per segment ... + assert Y_out.shape[0] == affinity_mat.shape[0] + # ... and never more speakers than requested, even when the enhanced count exceeds it. + assert len(set(Y_out.tolist())) <= max_num_speakers + @pytest.mark.run_only_on('GPU') @pytest.mark.unit @pytest.mark.parametrize("spk_dur", [0.25, 0.5, 0.75, 1, 2, 4])