diff --git a/openavmkit/ratio_study.py b/openavmkit/ratio_study.py
index f65984d..77f26a0 100644
--- a/openavmkit/ratio_study.py
+++ b/openavmkit/ratio_study.py
@@ -419,6 +419,70 @@ def _clean_label(label: str) -> str:
     return label
 
 
+def _compute_breakdown_edges(values, quantiles=0, bins_cfg=None, bin_labels=None):
+    """Compute monotonic bin edges + labels for a ratio-study breakdown.
+
+    Returns ``(edges, labels)`` for :func:`pandas.cut`, or ``(None, None)``
+    when the column cannot be binned: no finite values, a constant column
+    that yields fewer than two distinct edges, or no binning mode requested.
+
+    Two modes:
+
+    - ``bins_cfg`` given: explicit value edges (e.g. ``[0, 0.1, 0.25, 0.5,
+      0.75, 1.0]``); ``bin_labels`` optionally overrides the auto-generated
+      "lo - hi" labels.
+    - otherwise: ``quantiles`` equal-count bins computed from the finite
+      entries of ``values``.
+
+    Quantile edges start from a fixed lower edge of 0 and an edge is only
+    kept when it strictly increases, so heavily-skewed columns where many
+    rows share a value never trip pandas' "bins must increase
+    monotonically" error. Quantiles at or below 0 are never used as edges.
+
+    Raises:
+        ValueError: if ``bins_cfg`` is not strictly increasing, or
+            ``bin_labels`` does not hold exactly one label per bin.
+    """
+    if bins_cfg is not None and len(bins_cfg) > 0:
+        edges = list(bins_cfg)
+        if len(edges) < 2:
+            return None, None
+        # Reject bad config here with a clear message rather than deep in
+        # pd.cut; ``lo < hi`` is also False for NaN, so NaN edges are caught.
+        if not all(lo < hi for lo, hi in zip(edges, edges[1:])):
+            raise ValueError(f"breakdown bins must be strictly increasing, got {edges}")
+        if bin_labels is None:
+            labels = [f"{lo:g} - {hi:g}" for lo, hi in zip(edges, edges[1:])]
+        else:
+            labels = list(bin_labels)
+            if len(labels) != len(edges) - 1:
+                raise ValueError(
+                    f"breakdown with {len(edges)} bin edges needs {len(edges) - 1} "
+                    f"bin_labels, got {len(labels)}"
+                )
+        return edges, labels
+
+    if quantiles and quantiles > 0:
+        # Drop NaN/inf up front so an empty or all-NaN column exits early
+        # instead of emitting NumPy "All-NaN slice" warnings per quantile.
+        data = np.asarray(values, dtype=float)
+        data = data[np.isfinite(data)]
+        if data.size == 0:
+            return None, None
+
+        probs = [q / quantiles for q in range(quantiles + 1)]
+        edges = [0]
+        labels = []
+        for prob, qv in zip(probs, np.quantile(data, probs)):
+            # Repeated quantiles collapse into the preceding bin.
+            if qv > edges[-1]:
+                labels.append(f"{prob * 100:3.0f}th %ile\n({edges[-1]:,.0f} - {qv:,.0f})")
+                edges.append(qv)
+        return (edges, labels) if len(edges) >= 2 else (None, None)
+
+    return None, None
+
+
 def _run_ratio_study_breakdowns(
     settings: dict, model_group: str, df_sales: pd.DataFrame, confidence_interval=0.95, iterations=10000
 ) -> dict:
@@ -547,20 +611,13 @@ def _run_ratio_study_breakdowns(
             quantiles = breakdown.get("quantiles", 0)
             slice_size = breakdown.get("slice_size", 0)
             df_sub = df.copy()
-            if quantiles > 0:
-                bins = [0]
-                labels = []
-                last_value = 0
-                for q in range(quantiles + 1):
-                    try:
-                        quantile_value = np.quantile(df_sub[by], q / quantiles)
-                    except IndexError:
-                        continue
-                    percentile = f"{q / quantiles * 100:3.0f}th %ile\n({last_value:,.0f} - {quantile_value:,.0f})"
-                    if quantile_value not in bins:
-                        bins.append(quantile_value)
-                        labels.append(percentile)
-                        last_value = quantile_value
+            bins, labels = _compute_breakdown_edges(
+                df_sub[by],
+                quantiles=quantiles,
+                bins_cfg=breakdown.get("bins"),
+                bin_labels=breakdown.get("bin_labels"),
+            )
+            if bins is not None:
                 df_sub["quantile"] = pd.cut(
                     df_sub[by],
                     bins=bins,
diff --git a/tests/test_ratio_study_binning.py b/tests/test_ratio_study_binning.py
new file mode 100644
index 0000000..3f24e29
--- /dev/null
+++ b/tests/test_ratio_study_binning.py
@@ -0,0 +1,69 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from openavmkit.ratio_study import _compute_breakdown_edges
+
+
+def _strictly_increasing(edges):
+    return all(edges[i] < edges[i + 1] for i in range(len(edges) - 1))
+
+
+def test_quantiles_handle_zero_inflated_column():
+    # 80% of rows share value 0, then a spread. Equal-count quantile edges
+    # collapse onto 0; the previous np.quantile + "not in bins" logic produced
+    # non-monotonic edges and pd.cut raised "bins must increase monotonically".
+    values = pd.Series([0.0] * 80 + list(np.linspace(0.05, 1.0, 20)))
+    edges, labels = _compute_breakdown_edges(values, quantiles=5)
+    assert edges is not None
+    assert _strictly_increasing(edges)
+    assert len(labels) == len(edges) - 1
+
+
+def test_quantiles_ignore_nan():
+    values = pd.Series([1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0])
+    edges, labels = _compute_breakdown_edges(values, quantiles=4)
+    assert edges is not None
+    assert all(np.isfinite(e) for e in edges)
+    assert _strictly_increasing(edges)
+
+
+def test_degenerate_columns_return_none():
+    # constant column -> no distinct edges -> caller skips the breakdown
+    assert _compute_breakdown_edges(pd.Series([0.0] * 50), quantiles=5) == (None, None)
+    # all-NaN column -> same
+    assert _compute_breakdown_edges(pd.Series([np.nan, np.nan, np.nan]), quantiles=5) == (None, None)
+
+
+def test_explicit_bins_with_labels():
+    edges, labels = _compute_breakdown_edges(
+        pd.Series([0.05, 0.3, 0.8]),
+        bins_cfg=[0, 0.1, 0.25, 0.5, 0.75, 1.0],
+        bin_labels=["0-10%", "10-25%", "25-50%", "50-75%", "75-100%"],
+    )
+    assert edges == [0, 0.1, 0.25, 0.5, 0.75, 1.0]
+    assert labels == ["0-10%", "10-25%", "25-50%", "50-75%", "75-100%"]
+
+
+def test_explicit_bins_autogenerate_labels():
+    edges, labels = _compute_breakdown_edges(pd.Series([1, 2, 3]), bins_cfg=[0, 10, 20])
+    assert edges == [0, 10, 20]
+    assert len(labels) == 2
+
+
+def test_explicit_bins_accept_numpy_array():
+    edges, labels = _compute_breakdown_edges(pd.Series([1, 2, 3]), bins_cfg=np.array([0, 10, 20]))
+    assert edges == [0, 10, 20]
+    assert len(labels) == 2
+
+
+def test_explicit_bins_must_increase():
+    with pytest.raises(ValueError):
+        _compute_breakdown_edges(pd.Series([1, 2, 3]), bins_cfg=[0, 10, 5])
+
+
+def test_explicit_bin_labels_must_match_bin_count():
+    with pytest.raises(ValueError):
+        _compute_breakdown_edges(
+            pd.Series([1, 2, 3]), bins_cfg=[0, 10, 20], bin_labels=["only one"]
+        )