Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 53 additions & 14 deletions openavmkit/ratio_study.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,52 @@ def _clean_label(label: str) -> str:
return label


def _compute_breakdown_edges(values, quantiles=0, bins_cfg=None, bin_labels=None):
"""Compute monotonic bin edges + labels for a ratio-study breakdown.

Returns ``(edges, labels)`` for :func:`pandas.cut`, or ``(None, None)``
when the column is degenerate (fewer than two distinct edges, e.g. an
all-NaN or constant column) and so cannot be binned.

Two modes:

- ``bins_cfg`` given: explicit value edges (e.g. ``[0, 0.1, 0.25, 0.5,
0.75, 1.0]``); ``bin_labels`` optionally overrides the auto-generated
"lo - hi" labels.
- otherwise: ``quantiles`` equal-count bins computed from ``values``.

Robust to NaN (uses ``np.nanquantile``) and to heavily-skewed columns
where many rows share a value: an edge is only kept when it strictly
increases, so the result never trips pandas' "bins must increase
monotonically" error.
"""
if bins_cfg:
edges = list(bins_cfg)
if len(edges) < 2:
return None, None
if bin_labels is None:
bin_labels = [f"{edges[i]:g} - {edges[i + 1]:g}" for i in range(len(edges) - 1)]
return edges, list(bin_labels)

if quantiles and quantiles > 0:
edges = [0]
labels = []
last = 0
for q in range(quantiles + 1):
try:
qv = np.nanquantile(values, q / quantiles)
except IndexError:
continue
label = f"{q / quantiles * 100:3.0f}th %ile<br>({last:,.0f} - {qv:,.0f})"
if np.isfinite(qv) and qv > edges[-1]:
edges.append(qv)
labels.append(label)
last = qv
return (edges, labels) if len(edges) >= 2 else (None, None)

return None, None


def _run_ratio_study_breakdowns(
settings: dict, model_group: str, df_sales: pd.DataFrame, confidence_interval=0.95, iterations=10000
) -> dict:
Expand Down Expand Up @@ -547,20 +593,13 @@ def _run_ratio_study_breakdowns(
quantiles = breakdown.get("quantiles", 0)
slice_size = breakdown.get("slice_size", 0)
df_sub = df.copy()
if quantiles > 0:
bins = [0]
labels = []
last_value = 0
for q in range(quantiles + 1):
try:
quantile_value = np.quantile(df_sub[by], q / quantiles)
except IndexError:
continue
percentile = f"{q / quantiles * 100:3.0f}th %ile<br>({last_value:,.0f} - {quantile_value:,.0f})"
if quantile_value not in bins:
bins.append(quantile_value)
labels.append(percentile)
last_value = quantile_value
bins, labels = _compute_breakdown_edges(
df_sub[by],
quantiles=quantiles,
bins_cfg=breakdown.get("bins"),
bin_labels=breakdown.get("bin_labels"),
)
if bins is not None:
df_sub["quantile"] = pd.cut(
df_sub[by],
bins=bins,
Expand Down
50 changes: 50 additions & 0 deletions tests/test_ratio_study_binning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import numpy as np
import pandas as pd

from openavmkit.ratio_study import _compute_breakdown_edges


def _strictly_increasing(edges):
return all(edges[i] < edges[i + 1] for i in range(len(edges) - 1))


def test_quantiles_handle_zero_inflated_column():
    """Quantile edges stay strictly increasing on a zero-inflated column.

    80% of the rows share the value 0 and the rest are spread out, so the
    equal-count quantile edges collapse onto 0. The previous np.quantile +
    "not in bins" logic produced non-monotonic edges here and pd.cut raised
    "bins must increase monotonically".
    """
    spread = list(np.linspace(0.05, 1.0, 20))
    zero_inflated = pd.Series([0.0] * 80 + spread)

    result = _compute_breakdown_edges(zero_inflated, quantiles=5)
    edges, labels = result

    assert edges is not None
    assert _strictly_increasing(edges)
    # One label per bin, i.e. one fewer than the number of edges.
    assert len(labels) == len(edges) - 1


def test_quantiles_ignore_nan():
    """NaN entries in the column must not leak into the computed edges."""
    column_with_gaps = pd.Series([1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0])

    edges, _labels = _compute_breakdown_edges(column_with_gaps, quantiles=4)

    assert edges is not None
    for edge in edges:
        assert np.isfinite(edge)
    assert _strictly_increasing(edges)


def test_degenerate_columns_return_none():
    """Columns that cannot be binned yield ``(None, None)``."""
    nothing = (None, None)

    # Constant column -> no distinct edges -> caller skips the breakdown.
    constant = pd.Series([0.0] * 50)
    assert _compute_breakdown_edges(constant, quantiles=5) == nothing

    # All-NaN column -> same outcome.
    all_nan = pd.Series([np.nan, np.nan, np.nan])
    assert _compute_breakdown_edges(all_nan, quantiles=5) == nothing


def test_explicit_bins_with_labels():
    """Explicit edges and caller-supplied labels are returned unchanged."""
    expected_edges = [0, 0.1, 0.25, 0.5, 0.75, 1.0]
    expected_labels = ["0-10%", "10-25%", "25-50%", "50-75%", "75-100%"]
    column = pd.Series([0.05, 0.3, 0.8])

    # Pass copies so the expectations cannot be affected by the call.
    edges, labels = _compute_breakdown_edges(
        column,
        bins_cfg=list(expected_edges),
        bin_labels=list(expected_labels),
    )

    assert edges == expected_edges
    assert labels == expected_labels


def test_explicit_bins_autogenerate_labels():
    """Without ``bin_labels``, one label is generated per explicit bin."""
    column = pd.Series([1, 2, 3])

    result = _compute_breakdown_edges(column, bins_cfg=[0, 10, 20])
    edges, labels = result

    assert edges == [0, 10, 20]
    assert len(labels) == 2
Loading