diff --git a/openavmkit/ratio_study.py b/openavmkit/ratio_study.py
index f65984d..77f26a0 100644
--- a/openavmkit/ratio_study.py
+++ b/openavmkit/ratio_study.py
@@ -419,6 +419,52 @@ def _clean_label(label: str) -> str:
return label
+def _compute_breakdown_edges(values, quantiles=0, bins_cfg=None, bin_labels=None):
+    """Compute ``pandas.cut`` bin edges and labels for a ratio-study breakdown.
+
+    - ``bins_cfg`` given (non-empty): those explicit edges are returned as a
+      list, unvalidated; ``bin_labels`` optionally overrides the
+      auto-generated "lo - hi" labels.
+    - otherwise, ``quantiles`` > 0: equal-count quantile edges computed from
+      ``values`` with ``np.nanquantile`` (NaN is ignored). The edge list is
+      seeded with 0 and a quantile is kept only when it is finite and
+      strictly greater than the previous edge, so tied quantiles (many rows
+      sharing one value) never produce non-increasing edges.
+
+    Args:
+        values: Numeric array-like (e.g. a pandas Series) to be binned.
+        quantiles: Number of equal-count bins; falsy or <= 0 disables it.
+        bins_cfg: Optional explicit bin edges; wins over ``quantiles``.
+        bin_labels: Optional labels overriding the defaults for ``bins_cfg``.
+
+    Returns:
+        ``(edges, labels)`` lists, or ``(None, None)`` if neither mode applies
+        or fewer than two edges result (e.g. all-NaN data, no value above 0).
+    """
+    if bins_cfg:
+        edges = list(bins_cfg)
+        if len(edges) < 2:
+            return None, None
+        if bin_labels is None:
+            bin_labels = [f"{lo:g} - {hi:g}" for lo, hi in zip(edges, edges[1:])]
+        return edges, list(bin_labels)
+
+    if not quantiles or quantiles <= 0:
+        return None, None
+    # Evaluate every cut point in one call rather than rescanning per quantile.
+    fractions = [q / quantiles for q in range(quantiles + 1)]
+    try:
+        cut_points = np.nanquantile(values, fractions)
+    except IndexError:
+        return None, None
+    edges, labels = [0], []
+    for fraction, qv in zip(fractions, cut_points):
+        if np.isfinite(qv) and qv > edges[-1]:
+            labels.append(f"{fraction * 100:3.0f}th %ile\n({edges[-1]:,.0f} - {qv:,.0f})")
+            edges.append(qv)
+    return (edges, labels) if len(edges) >= 2 else (None, None)
+
+
def _run_ratio_study_breakdowns(
settings: dict, model_group: str, df_sales: pd.DataFrame, confidence_interval=0.95, iterations=10000
) -> dict:
@@ -547,20 +593,13 @@ def _run_ratio_study_breakdowns(
quantiles = breakdown.get("quantiles", 0)
slice_size = breakdown.get("slice_size", 0)
df_sub = df.copy()
- if quantiles > 0:
- bins = [0]
- labels = []
- last_value = 0
- for q in range(quantiles + 1):
- try:
- quantile_value = np.quantile(df_sub[by], q / quantiles)
- except IndexError:
- continue
- percentile = f"{q / quantiles * 100:3.0f}th %ile
({last_value:,.0f} - {quantile_value:,.0f})"
- if quantile_value not in bins:
- bins.append(quantile_value)
- labels.append(percentile)
- last_value = quantile_value
+ bins, labels = _compute_breakdown_edges(
+ df_sub[by],
+ quantiles=quantiles,
+ bins_cfg=breakdown.get("bins"),
+ bin_labels=breakdown.get("bin_labels"),
+ )
+ if bins is not None:
df_sub["quantile"] = pd.cut(
df_sub[by],
bins=bins,
diff --git a/tests/test_ratio_study_binning.py b/tests/test_ratio_study_binning.py
new file mode 100644
index 0000000..3f24e29
--- /dev/null
+++ b/tests/test_ratio_study_binning.py
@@ -0,0 +1,50 @@
+import numpy as np
+import pandas as pd
+
+from openavmkit.ratio_study import _compute_breakdown_edges
+
+
+def _strictly_increasing(edges):  # True when every edge exceeds the one before it
+    return all(lo < hi for lo, hi in zip(edges, edges[1:]))
+
+
+def test_quantiles_handle_zero_inflated_column():
+    # 80 of the 100 rows are exactly 0 and the rest spread over 0.05..1.0, so
+    # most quantile cut points coincide at 0. The helper must collapse those
+    # ties into strictly increasing edges with exactly one label per bin.
+    zero_heavy = pd.Series([0.0] * 80 + list(np.linspace(0.05, 1.0, 20)))
+    bin_edges, bin_names = _compute_breakdown_edges(zero_heavy, quantiles=5)
+    assert bin_edges is not None
+    assert _strictly_increasing(bin_edges)
+    assert len(bin_names) == len(bin_edges) - 1
+
+
+def test_quantiles_ignore_nan():
+    # NaN rows are skipped, so every returned edge is a finite number.
+    with_gaps = pd.Series([1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0])
+    bin_edges, _ = _compute_breakdown_edges(with_gaps, quantiles=4)
+    assert bin_edges is not None and np.isfinite(bin_edges).all()
+    assert _strictly_increasing(bin_edges)
+
+
+def test_degenerate_columns_return_none():
+    # An all-zero column never rises above the seed edge 0, and an all-NaN
+    # column has no finite quantile, so neither yields a usable pair of edges.
+    for column in (pd.Series([0.0] * 50), pd.Series([np.nan] * 3)):
+        assert _compute_breakdown_edges(column, quantiles=5) == (None, None)
+
+
+def test_explicit_bins_with_labels():
+    # Explicit edges and labels are handed back exactly as configured.
+    cfg_edges = [0, 0.1, 0.25, 0.5, 0.75, 1.0]
+    cfg_labels = ["0-10%", "10-25%", "25-50%", "50-75%", "75-100%"]
+    got = _compute_breakdown_edges(
+        pd.Series([0.05, 0.3, 0.8]), bins_cfg=cfg_edges, bin_labels=cfg_labels
+    )
+    assert got == (cfg_edges, cfg_labels)
+
+
+def test_explicit_bins_autogenerate_labels():
+    # Without bin_labels, one "lo - hi" label is generated per adjacent edge pair.
+    bin_edges, bin_names = _compute_breakdown_edges(pd.Series([1, 2, 3]), bins_cfg=[0, 10, 20])
+    assert bin_edges == [0, 10, 20] and len(bin_names) == 2