Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions openavmkit/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,19 @@ def process_data(
dataframes, merge_sales, settings, required_key="key_sale"
)

# Record multi-parcel ("bulk") deed multiplicity BEFORE the sales table is
# filtered to valid sales and de-duplicated to one row per parcel below. A
# single deed (``key_sale``) recorded against many parcels (``key``) stamps the
# full bundle consideration onto each parcel; once the sibling rows are dropped,
# that signal is invisible to the duplicate-based scrutiny heuristics (a lone
# surviving "orphan" row has nothing left to be a duplicate of). Persisting the
# count here lets run_heuristics flag such sales even when only one row of the
# deed survives. See flag_bulk_deeds() in sales_scrutiny_study.py.
if "key_sale" in df_sales.columns and "key" in df_sales.columns:
df_sales["sale_parcel_count"] = (
df_sales.groupby("key_sale")["key"].transform("nunique").astype("Int64")
)

if "valid_sale" not in df_sales:
raise ValueError("The 'valid_sale' column is required in the sales data. If you don't have anything to go on, you can just create that column and fill it with an assumption (i.e. all are valid), but ideally you should look for some kind of validation criteria for your sales.")
if "vacant_sale" not in df_sales:
Expand Down
50 changes: 50 additions & 0 deletions openavmkit/sales_scrutiny_study.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,45 @@ def drop_manual_exclusions(
return sup


def flag_bulk_deeds(df_sales: pd.DataFrame) -> pd.Series:
    """Flag multi-parcel ("bulk") deed sales.

    A single deed (``key_sale``) recorded against multiple parcels (``key``) carries
    one consideration that covers the whole bundle, so the per-parcel sale price is
    not a usable arm's-length signal. These are best dropped from modeling.

    Duplicate-based heuristics can only catch bulk deeds whose sibling rows are
    still present in the sales table. Once a deed has been reduced to a single
    "orphan" row, there is nothing left in the table to match against, even though
    the surviving row still carries the bundle price.

    To stay robust to that thinning, this helper prefers a precomputed
    ``sale_parcel_count`` column when one is present. For rows where that count is
    missing or not numeric, and whenever the column is absent altogether, it falls
    back to counting distinct parcels per deed in the current table, which still
    catches bulk deeds whose sibling rows survive.

    Parameters
    ----------
    df_sales : pd.DataFrame
        Sales table. ``sale_parcel_count`` (preferred) and/or the pair
        ``key_sale`` + ``key`` enable detection; if none of these are usable,
        nothing is flagged.

    Returns
    -------
    pd.Series
        Boolean Series, indexed like ``df_sales``, True for bulk-deed sales.
    """
    has_keys = "key_sale" in df_sales.columns and "key" in df_sales.columns

    if "sale_parcel_count" in df_sales.columns:
        # Coerce rather than cast: counts may arrive as nullable ints, floats or
        # strings; anything unparseable becomes NaN instead of raising.
        counts = pd.to_numeric(
            df_sales["sale_parcel_count"], errors="coerce"
        ).astype("float64")
        is_bulk = counts.gt(1)  # NaN compares False
        is_missing = counts.isna()
        if not (has_keys and is_missing.any()):
            return is_bulk
        # Only rows lacking a usable precomputed count consult the in-table
        # count; a valid precomputed count always takes precedence.
        in_table_bulk = (
            df_sales.groupby("key_sale")["key"].transform("nunique").gt(1)
        )
        return is_bulk | (is_missing & in_table_bulk)

    if has_keys:
        return df_sales.groupby("key_sale")["key"].transform("nunique").gt(1)

    # Nothing to infer from: flag no rows, but keep the caller's index.
    return pd.Series(False, index=df_sales.index)


def run_heuristics(
sup: SalesUniversePair, settings: dict, drop: bool = True, verbose: bool = False
) -> SalesUniversePair:
Expand Down Expand Up @@ -640,10 +679,20 @@ def run_heuristics(
)
df_sales.loc[idx_false_vacant, "flag_false_vacant"] = True

# 4 -- Flag multi-parcel ("bulk") deeds. Robust to the one-row-per-parcel
# de-duplication that happens during ingestion: uses the sale_parcel_count
# column computed in process_data when present (catches "orphan" bulk-deed
# sales whose sibling rows were dropped), else falls back to counting parcels
# per deed in the current table. See flag_bulk_deeds().
idx_bulk_deed = flag_bulk_deeds(df_sales)
if idx_bulk_deed.any():
df_sales.loc[idx_bulk_deed, "flag_bulk_deed"] = True

files = {
"flag_dupe_deed_date": "duplicated_deeds_and_dates",
"flag_dupe_date_price": "duplicated_dates_and_prices",
"flag_false_vacant": "classified_vacant_but_bldg_older_than_sale_year",
"flag_bulk_deed": "multi_parcel_bulk_deeds",
}

locations = get_locations(settings, df_sales)
Expand Down Expand Up @@ -711,6 +760,7 @@ def run_heuristics(
"flag_dupe_deed_date": "Repeated\ndeed & sale date",
"flag_dupe_date_price": "Repeated\nsale date & price",
"flag_false_vacant": "Bldg older\nthan sale year",
"flag_bulk_deed": "Multi-parcel\nbulk deed",
}
)

Expand Down
72 changes: 72 additions & 0 deletions tests/test_sales_scrutiny.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""Tests for sales-scrutiny heuristics."""

import pandas as pd

from openavmkit.sales_scrutiny_study import flag_bulk_deeds


def test_flag_bulk_deeds_precomputed_count_catches_orphan():
    """A lone "orphan" row of a multi-parcel deed is flagged via its stored count.

    Every deed appears exactly once in the table, so only the precomputed
    ``sale_parcel_count`` column can reveal which rows belong to a bundle.
    """
    parcel_keys = ["A", "B", "C"]
    deed_keys = ["deed1", "deed2", "deed3"]
    # deed1: 15-parcel bundle reduced to one surviving row;
    # deed2: ordinary single-parcel sale; deed3: 2-parcel bundle.
    parcel_counts = [15, 1, 2]
    sales = pd.DataFrame(
        {
            "key": parcel_keys,
            "key_sale": deed_keys,
            "sale_parcel_count": parcel_counts,
        }
    )

    expected = [True, False, True]
    assert flag_bulk_deeds(sales).tolist() == expected


def test_flag_bulk_deeds_fallback_counts_parcels_per_deed():
    """With no precomputed column, distinct parcels per deed are counted in-table.

    This path catches bulk deeds whose sibling rows are still present.
    """
    # deed1 is recorded against three parcels; deed2 against a single parcel.
    sales = pd.DataFrame(
        {
            "key": ["A", "B", "C", "D"],
            "key_sale": ["deed1", "deed1", "deed1", "deed2"],
        }
    )

    expected = [True, True, True, False]
    assert flag_bulk_deeds(sales).tolist() == expected


def test_flag_bulk_deeds_precomputed_takes_precedence_over_fallback():
    """The stored count is trusted over what the table itself shows.

    Each deed occurs once in the table (as if sibling rows were dropped), yet
    the precomputed count says deed1 spanned 8 parcels, so deed1 is flagged.
    """
    deed_keys = ["deed1", "deed2"]  # one row per deed in the table
    stored_counts = [8, 1]  # deed1 spanned 8 parcels per the stored count
    sales = pd.DataFrame(
        {
            "key": ["A", "B"],
            "key_sale": deed_keys,
            "sale_parcel_count": stored_counts,
        }
    )

    expected = [True, False]
    assert flag_bulk_deeds(sales).tolist() == expected


def test_flag_bulk_deeds_no_usable_columns_returns_all_false():
    """Neither ``sale_parcel_count`` nor ``key`` is present, so no row is flagged."""
    sales = pd.DataFrame({"key_sale": ["deed1", "deed2"]})

    result = flag_bulk_deeds(sales)

    # All-False mask that still lines up with the input's index.
    assert result.tolist() == [False, False]
    assert list(result.index) == list(sales.index)


def test_flag_bulk_deeds_handles_na_count():
    """A missing ``sale_parcel_count`` on a single-row deed does not flag the row."""
    # Nullable integer column: deed1 has no recorded count, deed2 spans 3 parcels.
    stored_counts = pd.array([pd.NA, 3], dtype="Int64")
    sales = pd.DataFrame(
        {
            "key": ["A", "B"],
            "key_sale": ["deed1", "deed2"],
            "sale_parcel_count": stored_counts,
        }
    )

    expected = [False, True]
    assert flag_bulk_deeds(sales).tolist() == expected
Loading