From 2dd03ced541a5253a77d3c43226f578822dec37f Mon Sep 17 00:00:00 2001 From: Arshammik <79arsham@gmail.com> Date: Sat, 23 May 2026 13:49:52 -0400 Subject: [PATCH 1/4] hvg(pearson_residuals): scope per-batch clip threshold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_highly_variable_pearson_residuals` reassigned the function-scoped `clip` parameter inside the per-batch loop via `if clip is None: clip = cp.sqrt(n, ...)`. After the first batch, `clip is None` was False, so every subsequent batch reused the first batch's threshold instead of computing its own `sqrt(n_cells_in_batch)` per Lause-Berens-Kobak 2021. For a 5000/200 split with `clip=None`, the small batch silently inherited `sqrt(5000) ≈ 70.7` instead of using its own `sqrt(200) ≈ 14.1`. Residuals in the small batch that should be clipped were not, inflating their per-gene variance and shifting HVG ranking. The result also depended on which batch label was alphabetically first (since `np.unique` sorts), so renaming "A"↔"B" silently changed the HVG output. The fix introduces a loop-local `clip_batch` variable so the per-batch default is recomputed each iteration. The user-facing `clip` parameter is untouched. Two new tests in `tests/test_hvg.py`: - `test_pearson_residuals_batch_clip_scoped_per_batch` monkeypatches the underlying `_pr_cuda.csc_hvg_res` / `dense_hvg_res` bindings and asserts that two distinct clip values reach the kernel for a 5000/200 split. - `test_pearson_residuals_batch_order_invariant` swaps batch labels A↔B on identical data and asserts that `residual_variances` is unchanged. Both fail on the unpatched code and pass on the fix; the full 58-test pearson HVG subset of `tests/test_hvg.py` still passes. scanpy ports the same routine and has the same bug — see scverse/scanpy#4138. --- .../preprocessing/_hvg/_pearson_residuals.py | 13 ++- tests/test_hvg.py | 95 +++++++++++++++++++ 2 files changed, 105 insertions(+), 3 deletions(-) diff --git a/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py b/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py index 47c50e0f..dc530e4d 100644 --- a/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py +++ b/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py @@ -71,15 +71,22 @@ def _highly_variable_pearson_residuals( nnz_per_gene = cp.sum(X_batch != 0, axis=0).ravel() nonzero_genes = cp.array(nnz_per_gene >= 1) X_batch = X_batch[:, nonzero_genes] + # Per Lause-Berens-Kobak 2021, the clip threshold is `sqrt(N)` over the + # cells entering the residual computation — so per-batch when `batch_key` + # is set. We use a loop-local `clip_batch` so the default is recomputed + # each iteration; reassigning the outer `clip` parameter would freeze + # the first batch's value for every subsequent batch. if clip is None: n = X_batch.shape[0] - clip = cp.sqrt(n, dtype=dtype) - if clip < 0: + clip_batch = cp.sqrt(n, dtype=dtype) + else: + clip_batch = clip + if clip_batch < 0: raise ValueError("Pearson residuals require `clip>=0` or `clip=None`.") n_cells = X_batch.shape[0] n_genes = X_batch.shape[1] - clip_val = float(clip) + clip_val = float(clip_batch) inv_theta = 1.0 / theta residual_gene_var = cp.zeros(n_genes, dtype=dtype, order="C") stream = cp.cuda.get_current_stream().ptr diff --git a/tests/test_hvg.py b/tests/test_hvg.py index a8d24e55..f296a061 100644 --- a/tests/test_hvg.py +++ b/tests/test_hvg.py @@ -397,6 +397,101 @@ def test_highly_variable_genes_pearson_residuals_batch(n_top_genes, dtype): assert len(cudata.var) == n_genes +def test_pearson_residuals_batch_clip_scoped_per_batch(monkeypatch): + """The per-batch Pearson-residual clip threshold must be ``sqrt(n_cells_in_batch)`` + (Lause-Berens-Kobak 2021). Before the fix at + ``preprocessing/_hvg/_pearson_residuals.py:74-76`` the function-scoped + ``clip`` variable was reassigned in the first batch iteration and reused + for every subsequent batch — so a small batch silently inherited a large + batch's clip value. This test instruments the underlying CUDA bindings to + capture each batch's clip and asserts they differ. + """ + from rapids_singlecell._cuda import _pr_cuda as _pr + + rng = np.random.default_rng(0) + n_big, n_small, n_genes = 5000, 200, 200 + counts = (rng.random((n_big + n_small, n_genes)) < 0.05).astype(np.int32) + counts *= rng.integers(1, 31, size=counts.shape, dtype=np.int32) + cudata = AnnData(X=cpx.scipy.sparse.csr_matrix(csr_matrix(counts.astype(np.float32)))) + cudata.obs["batch"] = np.array(["big"] * n_big + ["small"] * n_small) + cudata.obs["batch"] = cudata.obs["batch"].astype("category") + + captured_clips = [] + original_csc = _pr.csc_hvg_res + original_dense = _pr.dense_hvg_res + + def spy_csc(*args, **kwargs): + captured_clips.append(float(kwargs.get("clip", -1))) + return original_csc(*args, **kwargs) + + def spy_dense(*args, **kwargs): + captured_clips.append(float(kwargs.get("clip", -1))) + return original_dense(*args, **kwargs) + + monkeypatch.setattr(_pr, "csc_hvg_res", spy_csc) + monkeypatch.setattr(_pr, "dense_hvg_res", spy_dense) + + rsc.pp.highly_variable_genes( + cudata, + flavor="pearson_residuals", + n_top_genes=100, + batch_key="batch", + check_values=False, + ) + + assert len(captured_clips) >= 2, ( + f"expected >= 2 per-batch kernel calls, got {len(captured_clips)}" + ) + distinct = {round(c, 1) for c in captured_clips} + expected_big = round(float(np.sqrt(n_big)), 1) + expected_small = round(float(np.sqrt(n_small)), 1) + assert distinct == {expected_big, expected_small}, ( + f"expected per-batch clips {{{expected_small}, {expected_big}}}; " + f"got {sorted(distinct)} across calls {captured_clips}. Each batch " + f"must receive sqrt(n_cells_in_batch), not the first batch's value." + ) + + +def test_pearson_residuals_batch_order_invariant(): + """Renaming batch labels A<->B on identical data must not change the HVG + output. Before the fix, the first batch (alphabetically) defined the clip + used for all subsequent batches, so the result depended silently on + ``np.unique`` sort order. + """ + rng = np.random.default_rng(0) + n_big, n_small, n_genes = 5000, 200, 200 + counts = (rng.random((n_big + n_small, n_genes)) < 0.05).astype(np.int32) + counts *= rng.integers(1, 31, size=counts.shape, dtype=np.int32) + X = csr_matrix(counts.astype(np.float32)) + + a1 = AnnData(X=cpx.scipy.sparse.csr_matrix(X.copy())) + a1.obs["batch"] = np.array(["A"] * n_big + ["B"] * n_small) + a1.obs["batch"] = a1.obs["batch"].astype("category") + rsc.pp.highly_variable_genes( + a1, flavor="pearson_residuals", n_top_genes=100, + batch_key="batch", check_values=False, + ) + + a2 = AnnData(X=cpx.scipy.sparse.csr_matrix(X.copy())) + a2.obs["batch"] = np.array(["B"] * n_big + ["A"] * n_small) + a2.obs["batch"] = a2.obs["batch"].astype("category") + rsc.pp.highly_variable_genes( + a2, flavor="pearson_residuals", n_top_genes=100, + batch_key="batch", check_values=False, + ) + + np.testing.assert_allclose( + a1.var["residual_variances"].to_numpy(), + a2.var["residual_variances"].to_numpy(), + atol=1e-5, + err_msg=( + "Swapping batch labels A<->B changed residual_variances on " + "identical data. The first batch's clip is leaking into later " + "batches via the function-scoped `clip` variable." + ), + ) + + @pytest.mark.parametrize("dtype", ["float32", "float64"]) @pytest.mark.parametrize("sparse", [True, False]) def test_poisson_gene_selection_compare_to_scvi(dtype, sparse): From 36a1ddd4504a1d376b01db6ec8aed2a10f22315e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 23 May 2026 18:47:09 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_hvg.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/test_hvg.py b/tests/test_hvg.py index f296a061..a57c7f55 100644 --- a/tests/test_hvg.py +++ b/tests/test_hvg.py @@ -412,7 +412,9 @@ def test_pearson_residuals_batch_clip_scoped_per_batch(monkeypatch): n_big, n_small, n_genes = 5000, 200, 200 counts = (rng.random((n_big + n_small, n_genes)) < 0.05).astype(np.int32) counts *= rng.integers(1, 31, size=counts.shape, dtype=np.int32) - cudata = AnnData(X=cpx.scipy.sparse.csr_matrix(csr_matrix(counts.astype(np.float32)))) + cudata = AnnData( + X=cpx.scipy.sparse.csr_matrix(csr_matrix(counts.astype(np.float32))) + ) cudata.obs["batch"] = np.array(["big"] * n_big + ["small"] * n_small) cudata.obs["batch"] = cudata.obs["batch"].astype("category") @@ -468,16 +470,22 @@ def test_pearson_residuals_batch_order_invariant(): a1.obs["batch"] = np.array(["A"] * n_big + ["B"] * n_small) a1.obs["batch"] = a1.obs["batch"].astype("category") rsc.pp.highly_variable_genes( - a1, flavor="pearson_residuals", n_top_genes=100, - batch_key="batch", check_values=False, + a1, + flavor="pearson_residuals", + n_top_genes=100, + batch_key="batch", + check_values=False, ) a2 = AnnData(X=cpx.scipy.sparse.csr_matrix(X.copy())) a2.obs["batch"] = np.array(["B"] * n_big + ["A"] * n_small) a2.obs["batch"] = a2.obs["batch"].astype("category") rsc.pp.highly_variable_genes( - a2, flavor="pearson_residuals", n_top_genes=100, - batch_key="batch", check_values=False, + a2, + flavor="pearson_residuals", + n_top_genes=100, + batch_key="batch", + check_values=False, ) np.testing.assert_allclose( From f841432bda9d716591e76524dffda1eaeac4e57d Mon Sep 17 00:00:00 2001 From: Arshammik <79arsham@gmail.com> Date: Tue, 26 May 2026 14:01:11 -0400 Subject: [PATCH 3/4] tests/hvg: slim per review Drop the whitebox test_pearson_residuals_batch_clip_scoped_per_batch (the order-invariant test already covers the observable behavior), the file:line reference docstring on the surviving test, and the inline comment above the clip_batch assignment. --- .../preprocessing/_hvg/_pearson_residuals.py | 5 -- tests/test_hvg.py | 68 +------------------ 2 files changed, 1 insertion(+), 72 deletions(-) diff --git a/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py b/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py index dc530e4d..79a83045 100644 --- a/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py +++ b/src/rapids_singlecell/preprocessing/_hvg/_pearson_residuals.py @@ -71,11 +71,6 @@ def _highly_variable_pearson_residuals( nnz_per_gene = cp.sum(X_batch != 0, axis=0).ravel() nonzero_genes = cp.array(nnz_per_gene >= 1) X_batch = X_batch[:, nonzero_genes] - # Per Lause-Berens-Kobak 2021, the clip threshold is `sqrt(N)` over the - # cells entering the residual computation — so per-batch when `batch_key` - # is set. We use a loop-local `clip_batch` so the default is recomputed - # each iteration; reassigning the outer `clip` parameter would freeze - # the first batch's value for every subsequent batch. if clip is None: n = X_batch.shape[0] clip_batch = cp.sqrt(n, dtype=dtype) diff --git a/tests/test_hvg.py b/tests/test_hvg.py index a57c7f55..7821b3fd 100644 --- a/tests/test_hvg.py +++ b/tests/test_hvg.py @@ -397,69 +397,8 @@ def test_highly_variable_genes_pearson_residuals_batch(n_top_genes, dtype): assert len(cudata.var) == n_genes -def test_pearson_residuals_batch_clip_scoped_per_batch(monkeypatch): - """The per-batch Pearson-residual clip threshold must be ``sqrt(n_cells_in_batch)`` - (Lause-Berens-Kobak 2021). Before the fix at - ``preprocessing/_hvg/_pearson_residuals.py:74-76`` the function-scoped - ``clip`` variable was reassigned in the first batch iteration and reused - for every subsequent batch — so a small batch silently inherited a large - batch's clip value. This test instruments the underlying CUDA bindings to - capture each batch's clip and asserts they differ. - """ - from rapids_singlecell._cuda import _pr_cuda as _pr - - rng = np.random.default_rng(0) - n_big, n_small, n_genes = 5000, 200, 200 - counts = (rng.random((n_big + n_small, n_genes)) < 0.05).astype(np.int32) - counts *= rng.integers(1, 31, size=counts.shape, dtype=np.int32) - cudata = AnnData( - X=cpx.scipy.sparse.csr_matrix(csr_matrix(counts.astype(np.float32))) - ) - cudata.obs["batch"] = np.array(["big"] * n_big + ["small"] * n_small) - cudata.obs["batch"] = cudata.obs["batch"].astype("category") - - captured_clips = [] - original_csc = _pr.csc_hvg_res - original_dense = _pr.dense_hvg_res - - def spy_csc(*args, **kwargs): - captured_clips.append(float(kwargs.get("clip", -1))) - return original_csc(*args, **kwargs) - - def spy_dense(*args, **kwargs): - captured_clips.append(float(kwargs.get("clip", -1))) - return original_dense(*args, **kwargs) - - monkeypatch.setattr(_pr, "csc_hvg_res", spy_csc) - monkeypatch.setattr(_pr, "dense_hvg_res", spy_dense) - - rsc.pp.highly_variable_genes( - cudata, - flavor="pearson_residuals", - n_top_genes=100, - batch_key="batch", - check_values=False, - ) - - assert len(captured_clips) >= 2, ( - f"expected >= 2 per-batch kernel calls, got {len(captured_clips)}" - ) - distinct = {round(c, 1) for c in captured_clips} - expected_big = round(float(np.sqrt(n_big)), 1) - expected_small = round(float(np.sqrt(n_small)), 1) - assert distinct == {expected_big, expected_small}, ( - f"expected per-batch clips {{{expected_small}, {expected_big}}}; " - f"got {sorted(distinct)} across calls {captured_clips}. Each batch " - f"must receive sqrt(n_cells_in_batch), not the first batch's value." - ) - - def test_pearson_residuals_batch_order_invariant(): - """Renaming batch labels A<->B on identical data must not change the HVG - output. Before the fix, the first batch (alphabetically) defined the clip - used for all subsequent batches, so the result depended silently on - ``np.unique`` sort order. - """ + """HVG ranking must not depend on alphabetical batch-label order.""" rng = np.random.default_rng(0) n_big, n_small, n_genes = 5000, 200, 200 counts = (rng.random((n_big + n_small, n_genes)) < 0.05).astype(np.int32) @@ -492,11 +431,6 @@ def test_pearson_residuals_batch_order_invariant(): a1.var["residual_variances"].to_numpy(), a2.var["residual_variances"].to_numpy(), atol=1e-5, - err_msg=( - "Swapping batch labels A<->B changed residual_variances on " - "identical data. The first batch's clip is leaking into later " - "batches via the function-scoped `clip` variable." - ), ) From 5b8d53709dfcb4a286c5a329dab087e0dcc2f9c9 Mon Sep 17 00:00:00 2001 From: Arshammik <79arsham@gmail.com> Date: Tue, 26 May 2026 22:45:39 -0400 Subject: [PATCH 4/4] docs: add release note for #674 --- docs/release-notes/0.15.2.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/release-notes/0.15.2.md b/docs/release-notes/0.15.2.md index 342f9e0b..1a934840 100644 --- a/docs/release-notes/0.15.2.md +++ b/docs/release-notes/0.15.2.md @@ -3,3 +3,7 @@ ```{rubric} Features ``` * Add pseudobulk based distance metrics to {class}`~rapids_singlecell.ptg.Distance`: ``euclidean``, ``root_mean_squared_error``, ``mse``, ``mean_absolute_error``, ``pearson_distance``, ``cosine_distance``, ``r2_distance``. Matches ``pertpy.tl.Distance`` {pr}`676` {smaller}`S Dicks` + +```{rubric} Bug fixes +``` +* Fixes per-batch clip threshold in `pp.highly_variable_genes(flavor="pearson_residuals", batch_key=...)`: each batch now uses its own `sqrt(n_cells_in_batch)` instead of silently reusing the first batch's value {pr}`674` {smaller}`A Mikaeili`