From 402b167765fe12853ae9ae63c5f69c283b1d1b56 Mon Sep 17 00:00:00 2001 From: Arshammik <79arsham@gmail.com> Date: Sat, 23 May 2026 13:25:07 -0400 Subject: [PATCH] fix(experimental.pp.hvg): scope per-batch clip threshold in Pearson residuals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_highly_variable_pearson_residuals` reassigned the function-scoped `clip` parameter from `None` to `sqrt(n_batch_0)` on the first batch iteration, which made the `if clip is None:` check False on every subsequent batch. The first batch's clip was therefore reused for every other batch, even though Lause-Berens-Kobak (2021) specifies a per-batch threshold of `sqrt(n_cells_in_batch)`. For an unbalanced split — e.g. two batches of 5000 and 200 cells — the small batch silently inherited `sqrt(5000) ≈ 70.7` instead of using its own `sqrt(200) ≈ 14.1`. Residuals in the small batch that should be clipped were not, inflating per-gene residual variance and shifting HVG ranking. The bug also made the result depend on alphabetical batch-label ordering (`np.unique` picks the first label), which is a reproducibility hazard. The fix introduces a loop-local `clip_batch` variable so the per-batch default is recomputed each iteration. The user-facing `clip` parameter is untouched. Two new tests in `tests/test_highly_variable_genes.py` cover the regression: one instruments `_calculate_res_sparse` to capture the per-batch clip value, the other verifies that swapping batch labels A<->B on identical data produces identical `residual_variances`. --- docs/release-notes/4138.fix.md | 5 + .../experimental/pp/_highly_variable_genes.py | 15 ++- tests/test_highly_variable_genes.py | 96 +++++++++++++++++++ 3 files changed, 112 insertions(+), 4 deletions(-) create mode 100644 docs/release-notes/4138.fix.md diff --git a/docs/release-notes/4138.fix.md b/docs/release-notes/4138.fix.md new file mode 100644 index 0000000000..07418ae220 --- /dev/null +++ b/docs/release-notes/4138.fix.md @@ -0,0 +1,5 @@ +Scope the per-batch clip threshold in {func}`scanpy.experimental.pp.highly_variable_genes` +(``flavor='pearson_residuals'``) so each batch uses its own ``sqrt(n_cells_in_batch)`` +per Lause-Berens-Kobak 2021, instead of silently reusing the first batch's value +for every subsequent batch. +{smaller}`A Mikaeili` diff --git a/src/scanpy/experimental/pp/_highly_variable_genes.py b/src/scanpy/experimental/pp/_highly_variable_genes.py index cb4016f311..1b975fa398 100644 --- a/src/scanpy/experimental/pp/_highly_variable_genes.py +++ b/src/scanpy/experimental/pp/_highly_variable_genes.py @@ -173,11 +173,18 @@ def _highly_variable_pearson_residuals( # noqa: PLR0912, PLR0915 adata_subset = adata_subset_prefilter[:, nonzero_genes] x_batch = _get_obs_rep(adata_subset, layer=layer) - # Prepare clipping + # Prepare clipping. Per Lause–Berens–Kobak 2021, the clip threshold is + # `sqrt(N)` where N is the number of cells used for the residual + # computation — so per-batch when `batch_key` is set. Use a loop-local + # variable so that the per-batch default is recomputed each iteration; + # reassigning the outer `clip` variable would freeze the first batch's + # value for every subsequent batch. if clip is None: n = x_batch.shape[0] - clip = np.sqrt(n) - if clip < 0: + clip_batch = np.sqrt(n) + else: + clip_batch = clip + if clip_batch < 0: msg = "Pearson residuals require `clip>=0` or `clip=None`." raise ValueError(msg) @@ -197,7 +204,7 @@ def _highly_variable_pearson_residuals( # noqa: PLR0912, PLR0915 sums_genes=sums_genes, sums_cells=sums_cells, sum_total=np.float64(sum_total), - clip=np.float64(clip), + clip=np.float64(clip_batch), theta=np.float64(theta), n_genes=x_batch.shape[1], n_cells=x_batch.shape[0], diff --git a/tests/test_highly_variable_genes.py b/tests/test_highly_variable_genes.py index 3f320b6d33..b4adccecc2 100644 --- a/tests/test_highly_variable_genes.py +++ b/tests/test_highly_variable_genes.py @@ -364,6 +364,102 @@ def test_pearson_residuals_batch( assert len(output_df) == n_genes +def test_pearson_residuals_batch_clip_scoped_per_batch(monkeypatch) -> None: + """Each batch must receive its own clip threshold. + + Per Lause-Berens-Kobak 2021 the clip is ``sqrt(n_cells_in_batch)``. Before + the fix, the function-scoped ``clip`` parameter was reassigned on the first + batch iteration and reused for every subsequent batch — so a small batch + silently inherited a large batch's clip value. This test instruments the + residual kernels and verifies that each batch receives its own. + """ + from scanpy.experimental.pp import _highly_variable_genes as hvg_module + + rng = np.random.default_rng(0) + n_big, n_small, n_genes = 5000, 200, 200 + counts = (rng.random((n_big + n_small, n_genes)) < 0.05).astype(np.int32) + counts *= rng.integers(1, 31, size=counts.shape, dtype=np.int32) + adata = AnnData(X=sps.csr_matrix(counts.astype(np.float32))) # noqa: TID251 + adata.obs["batch"] = pd.Categorical(["big"] * n_big + ["small"] * n_small) + + captured_clips: list[float] = [] + original_sparse = hvg_module._calculate_res_sparse + + def spy_sparse(*args, **kwargs): + captured_clips.append(float(kwargs.get("clip", -1))) + return original_sparse(*args, **kwargs) + + monkeypatch.setattr(hvg_module, "_calculate_res_sparse", spy_sparse) + + sc.experimental.pp.highly_variable_genes( + adata, + flavor="pearson_residuals", + n_top_genes=100, + batch_key="batch", + check_values=False, + ) + + assert len(captured_clips) >= 2, ( + f"expected >= 2 per-batch kernel calls, got {len(captured_clips)}" + ) + + # Per-batch correct clips: sqrt(5000) ~ 70.71 and sqrt(200) ~ 14.14. + distinct = {round(c, 1) for c in captured_clips} + expected_big = round(float(np.sqrt(n_big)), 1) + expected_small = round(float(np.sqrt(n_small)), 1) + assert distinct == {expected_big, expected_small}, ( + f"expected distinct clip values {{{expected_small}, {expected_big}}}; " + f"got {sorted(distinct)} from per-batch calls {captured_clips}. " + "Each batch's clip must be sqrt(n_cells_in_batch), not the first " + "batch's value carried over." + ) + + +def test_pearson_residuals_batch_order_invariant() -> None: + """HVG ranking must not depend on alphabetical batch-label order. + + Before the fix, the first batch's ``sqrt(n)`` clip leaked into all other + batches, so renaming "A"<->"B" silently changed which clip was applied to + which cells — a reproducibility hazard. + """ + rng = np.random.default_rng(0) + n_big, n_small, n_genes = 5000, 200, 200 + counts = (rng.random((n_big + n_small, n_genes)) < 0.05).astype(np.int32) + counts *= rng.integers(1, 31, size=counts.shape, dtype=np.int32) + x = sps.csr_matrix(counts.astype(np.float32)) # noqa: TID251 + + a1 = AnnData(X=x.copy()) + a1.obs["batch"] = pd.Categorical(["A"] * n_big + ["B"] * n_small) + sc.experimental.pp.highly_variable_genes( + a1, + flavor="pearson_residuals", + n_top_genes=100, + batch_key="batch", + check_values=False, + ) + + a2 = AnnData(X=x.copy()) + a2.obs["batch"] = pd.Categorical(["B"] * n_big + ["A"] * n_small) + sc.experimental.pp.highly_variable_genes( + a2, + flavor="pearson_residuals", + n_top_genes=100, + batch_key="batch", + check_values=False, + ) + + np.testing.assert_allclose( + a1.var["residual_variances"].to_numpy(), + a2.var["residual_variances"].to_numpy(), + atol=1e-6, + err_msg=( + "Swapping batch labels A<->B changed residual_variances on " + "identical data. The first batch's clip value is leaking into " + "later batches via the function-scoped `clip` variable." + ), + ) + + @pytest.mark.parametrize( ("flavor", "params", "ref_path"), [