From 8c16c00ec069d0092b7c482ca8031ebf48d9d673 Mon Sep 17 00:00:00 2001
From: Russell Richie <drussellmrichie@github.com>
Date: Sat, 6 Jun 2026 15:20:20 -0400
Subject: [PATCH] Detect orphaned multi-parcel (bulk) deeds in sales scrutiny

A bulk deed (one `key_sale` recorded against many parcels) carries a single
consideration covering the whole bundle, which gets stamped onto each parcel,
so the per-parcel sale price is not a usable arm's-length signal.

The existing duplicate-based heuristics (repeated deed+date, repeated
date+price) only catch bulk deeds while their sibling rows are still present
in the sales table. But `process_data` de-duplicates sales to one row per
parcel and filters out invalid sales, so a deed can be reduced to a single
"orphan" row whose siblings are gone -- it still carries the inflated bundle
price, but there is no duplicate left for those heuristics to match against.
In a real county dataset, scrutiny silently passed ~370 such bulk-deed sales
into the vacant-land training set, which inflated the vacant median
price-per-sqft ~4x.

Fix:
- `process_data`: compute deed->parcel multiplicity (`sale_parcel_count`)
  right after the sales merge, before the valid-sale filter and de-dup, so the
  signal survives the thinning.
- `flag_bulk_deeds()` helper + a `flag_bulk_deed` heuristic in `run_heuristics`:
  prefers `sale_parcel_count` (catches orphans), falls back to counting parcels
  per deed in the current table (catches bulk deeds whose rows survive).
- Gated on column presence -> no-op for datasets that have neither
  `sale_parcel_count` nor both `key` and `key_sale`. Honors the existing
  `drop` flag (flag-only when `drop=False`).
- Unit tests for the helper (precomputed, fallback, NA, and no-op paths).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 openavmkit/data.py                 | 13 ++++++
 openavmkit/sales_scrutiny_study.py | 50 +++++++++++++++++++++
 tests/test_sales_scrutiny.py       | 72 ++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+)
 create mode 100644 tests/test_sales_scrutiny.py

diff --git a/openavmkit/data.py b/openavmkit/data.py
index 5fbcb528..99769bce 100644
--- a/openavmkit/data.py
+++ b/openavmkit/data.py
@@ -634,6 +634,19 @@ def process_data(
         dataframes, merge_sales, settings, required_key="key_sale"
     )
 
+    # Record multi-parcel ("bulk") deed multiplicity BEFORE the sales table is
+    # filtered to valid sales and de-duplicated to one row per parcel below. A
+    # single deed (``key_sale``) recorded against many parcels (``key``) stamps the
+    # full bundle consideration onto each parcel; once the sibling rows are dropped,
+    # that signal is invisible to the duplicate-based scrutiny heuristics (a lone
+    # surviving "orphan" row has nothing left to be a duplicate of). Persisting the
+    # count here lets run_heuristics flag such sales even when only one row of the
+    # deed survives. See flag_bulk_deeds() in sales_scrutiny_study.py.
+    if "key_sale" in df_sales.columns and "key" in df_sales.columns:
+        df_sales["sale_parcel_count"] = (
+            df_sales.groupby("key_sale")["key"].transform("nunique").astype("Int64")
+        )
+
     if "valid_sale" not in df_sales:
         raise ValueError("The 'valid_sale' column is required in the sales data. If you don't have anything to go on, you can just create that column and fill it with an assumption (i.e. all are valid), but ideally you should look for some kind of validation criteria for your sales.")
     if "vacant_sale" not in df_sales:
diff --git a/openavmkit/sales_scrutiny_study.py b/openavmkit/sales_scrutiny_study.py
index f5f68a46..29830ef0 100644
--- a/openavmkit/sales_scrutiny_study.py
+++ b/openavmkit/sales_scrutiny_study.py
@@ -563,6 +563,45 @@ def drop_manual_exclusions(
     return sup
 
 
+def flag_bulk_deeds(df_sales: pd.DataFrame) -> pd.Series:
+    """Return a boolean mask marking multi-parcel ("bulk") deed sales.
+
+    When one deed (``key_sale``) is recorded against several parcels (``key``), its
+    single consideration covers the whole bundle, so the price stamped on each
+    parcel is not a usable arm's-length signal and is best kept out of modeling.
+
+    The duplicate-based heuristics in :func:`run_heuristics` only catch a bulk deed
+    while its sibling rows are still in the table. Once sales are filtered and
+    de-duplicated to one row per parcel, a deed can survive as a lone "orphan" row
+    that still carries the bundle price, so the first available source is used:
+
+    1. ``sale_parcel_count`` -- parcels per deed, recorded at ingestion by
+       :func:`openavmkit.data.process_data` before the table is thinned. Missing
+       counts are treated as 1 (not bulk).
+    2. ``key_sale`` and ``key`` -- distinct parcels per deed counted in the current
+       table; this only catches bulk deeds whose sibling rows survive.
+
+    Parameters
+    ----------
+    df_sales : pd.DataFrame
+        Sales table. No column is strictly required: when neither source above is
+        available, no sale is flagged.
+
+    Returns
+    -------
+    pd.Series
+        Boolean Series, indexed like ``df_sales``, True for bulk-deed sales.
+    """
+    columns = df_sales.columns
+    if "sale_parcel_count" in columns:
+        parcel_counts = df_sales["sale_parcel_count"].fillna(1).astype("int64")
+    elif "key_sale" in columns and "key" in columns:
+        parcel_counts = df_sales.groupby("key_sale")["key"].transform("nunique")
+    else:
+        return pd.Series(False, index=df_sales.index)
+    return parcel_counts > 1
+
+
 def run_heuristics(
     sup: SalesUniversePair, settings: dict, drop: bool = True, verbose: bool = False
 ) -> SalesUniversePair:
@@ -640,10 +679,20 @@ def run_heuristics(
     )
     df_sales.loc[idx_false_vacant, "flag_false_vacant"] = True
 
+    # 4 -- Flag multi-parcel ("bulk") deeds. Robust to the one-row-per-parcel
+    # de-duplication that happens during ingestion: uses the sale_parcel_count
+    # column computed in process_data when present (catches "orphan" bulk-deed
+    # sales whose sibling rows were dropped), else falls back to counting parcels
+    # per deed in the current table. See flag_bulk_deeds().
+    idx_bulk_deed = flag_bulk_deeds(df_sales)
+    if idx_bulk_deed.any():
+        df_sales.loc[idx_bulk_deed, "flag_bulk_deed"] = True
+
     files = {
         "flag_dupe_deed_date": "duplicated_deeds_and_dates",
         "flag_dupe_date_price": "duplicated_dates_and_prices",
         "flag_false_vacant": "classified_vacant_but_bldg_older_than_sale_year",
+        "flag_bulk_deed": "multi_parcel_bulk_deeds",
     }
 
     locations = get_locations(settings, df_sales)
@@ -711,6 +760,7 @@ def run_heuristics(
                         "flag_dupe_deed_date": "Repeated\ndeed & sale date",
                         "flag_dupe_date_price": "Repeated\nsale date & price",
                         "flag_false_vacant": "Bldg older\nthan sale year",
+                        "flag_bulk_deed": "Multi-parcel\nbulk deed",
                     }
                 )
 
diff --git a/tests/test_sales_scrutiny.py b/tests/test_sales_scrutiny.py
new file mode 100644
index 00000000..43952559
--- /dev/null
+++ b/tests/test_sales_scrutiny.py
@@ -0,0 +1,72 @@
+"""Tests for sales-scrutiny heuristics."""
+
+import pandas as pd
+
+from openavmkit.sales_scrutiny_study import flag_bulk_deeds
+
+
+def test_flag_bulk_deeds_precomputed_count_catches_orphan():
+    """An orphan row of a multi-parcel deed is flagged via ``sale_parcel_count``.
+
+    The count is recorded at ingestion, before the sales table is thinned to one
+    row per parcel, so it exposes a bulk deed whose sibling rows are gone -- the
+    case the duplicate-based heuristics miss."""
+    sales = pd.DataFrame(
+        [
+            ("A", "deed1", 15),  # orphan: deed1 covered 15 parcels, siblings dropped
+            ("B", "deed2", 1),  # ordinary single-parcel sale
+            ("C", "deed3", 2),  # 2-parcel bundle
+        ],
+        columns=["key", "key_sale", "sale_parcel_count"],
+    )
+    result = flag_bulk_deeds(sales)
+    assert result.tolist() == [True, False, True]
+
+
+def test_flag_bulk_deeds_fallback_counts_parcels_per_deed():
+    """Fallback path: no ``sale_parcel_count`` column is available.
+
+    Distinct parcels are then counted per deed within the table itself, which
+    still catches bulk deeds whose sibling rows survive.
+    """
+    sales = pd.DataFrame(
+        # deed1 is recorded against three parcels; deed2 is a single-parcel sale.
+        {"key": list("ABCD"), "key_sale": ["deed1"] * 3 + ["deed2"]}
+    )
+    result = flag_bulk_deeds(sales)
+    assert result.tolist() == [True, True, True, False]
+
+
+def test_flag_bulk_deeds_precomputed_takes_precedence_over_fallback():
+    """``sale_parcel_count`` wins when the table's own per-deed counts disagree,
+    e.g. when every sibling row was dropped and each deed appears only once."""
+    sales = pd.DataFrame(
+        [
+            ("A", "deed1", 8),  # one row in the table, but the deed spanned 8 parcels
+            ("B", "deed2", 1),
+        ],
+        columns=["key", "key_sale", "sale_parcel_count"],
+    )
+    result = flag_bulk_deeds(sales)
+    assert result.tolist() == [True, False]
+
+
+def test_flag_bulk_deeds_no_usable_columns_returns_all_false():
+    """Lacking both ``sale_parcel_count`` and ``key``, no sale can be flagged."""
+    sales = pd.DataFrame({"key_sale": ["deed1", "deed2"]})
+    result = flag_bulk_deeds(sales)
+    assert result.tolist() == [False] * len(sales)
+    assert result.index.tolist() == sales.index.tolist()
+
+
+def test_flag_bulk_deeds_handles_na_count():
+    """A missing ``sale_parcel_count`` (deed never seen at ingestion) is treated
+    as a count of 1, so that row is not flagged."""
+    sales = pd.DataFrame(
+        [("A", "deed1"), ("B", "deed2")],
+        columns=["key", "key_sale"],
+    )
+    # Nullable integer dtype, as produced at ingestion: deed1's count is unknown.
+    sales["sale_parcel_count"] = pd.array([pd.NA, 3], dtype="Int64")
+    result = flag_bulk_deeds(sales)
+    assert result.tolist() == [False, True]