def flag_bulk_deeds(df_sales: pd.DataFrame) -> pd.Series:
    """Flag multi-parcel ("bulk") deed sales.

    A single deed (``key_sale``) recorded against multiple parcels (``key``)
    carries one consideration that covers the whole bundle, so the per-parcel
    sale price is not a usable arm's-length signal.

    Detection prefers the ``sale_parcel_count`` column, which
    :func:`openavmkit.data.process_data` computes per ``key_sale`` before the
    sales table is filtered and de-duplicated. That keeps a lone surviving
    "orphan" row of a bulk deed detectable after its sibling rows are gone.

    Rows whose ``sale_parcel_count`` is missing or not numeric fall back to the
    number of distinct ``key`` values sharing their ``key_sale`` in the current
    table (when both columns exist). When ``sale_parcel_count`` is absent
    altogether, that in-table count is used for every row. A count that is
    still unknown after both sources is treated as 1 (not bulk).

    Parameters
    ----------
    df_sales : pd.DataFrame
        Sales table. ``sale_parcel_count`` (preferred) and/or the pair
        ``key_sale`` + ``key`` enable detection.

    Returns
    -------
    pd.Series
        Boolean Series, indexed like ``df_sales``, True for bulk-deed sales.
        All False when none of the columns needed for detection are present.
    """
    has_deed_keys = "key_sale" in df_sales.columns and "key" in df_sales.columns

    if "sale_parcel_count" in df_sales.columns:
        # Coerce instead of casting: a count column that is text or object
        # dtype (e.g. "3.0", "n/a", None) would make a hard astype("int64")
        # raise. Unparseable values become NaN, i.e. "unknown".
        counts = pd.to_numeric(
            df_sales["sale_parcel_count"], errors="coerce"
        ).astype("float64")

        unknown = counts.isna()
        if has_deed_keys and unknown.any():
            # Only pay for the groupby when a row actually lacks a count.
            in_table = df_sales.groupby("key_sale")["key"].transform("nunique")
            # Fill positionally (ndarray) so no index alignment is involved.
            counts = counts.mask(unknown, in_table.to_numpy(dtype="float64"))

        # Anything still unknown is assumed to be a single-parcel sale.
        return counts.fillna(1).gt(1)

    if has_deed_keys:
        counts = df_sales.groupby("key_sale")["key"].transform("nunique")
        # NaN counts (rows with a missing key_sale) compare False here.
        return counts.gt(1)

    return pd.Series(False, index=df_sales.index)
def test_flag_bulk_deeds_precomputed_count_catches_orphan():
    """An orphan row of a multi-parcel deed is flagged via ``sale_parcel_count``.

    The table holds one row per deed, so only the precomputed count can reveal
    that a deed covered more than one parcel.
    """
    # deed1 covered 15 parcels but only one row survived; deed2 is a normal
    # single-parcel sale; deed3 is a 2-parcel bundle.
    parcels_per_deed = {"deed1": 15, "deed2": 1, "deed3": 2}
    sales = pd.DataFrame(
        {
            "key": ["A", "B", "C"],
            "key_sale": list(parcels_per_deed),
            "sale_parcel_count": list(parcels_per_deed.values()),
        }
    )

    assert flag_bulk_deeds(sales).tolist() == [True, False, True]


def test_flag_bulk_deeds_fallback_counts_parcels_per_deed():
    """Without ``sale_parcel_count``, distinct parcels per deed are counted."""
    # deed1 is recorded against three parcels; deed2 against one.
    deeds = ["deed1"] * 3 + ["deed2"]
    sales = pd.DataFrame({"key": ["A", "B", "C", "D"], "key_sale": deeds})

    assert flag_bulk_deeds(sales).tolist() == [True, True, True, False]


def test_flag_bulk_deeds_precomputed_takes_precedence_over_fallback():
    """The precomputed count is used even though each deed appears only once."""
    sales = pd.DataFrame(
        {
            "key": ["A", "B"],
            "key_sale": ["deed1", "deed2"],
            # The table shows one row per deed, yet deed1 spanned 8 parcels.
            "sale_parcel_count": [8, 1],
        }
    )

    assert flag_bulk_deeds(sales).tolist() == [True, False]


def test_flag_bulk_deeds_no_usable_columns_returns_all_false():
    """With neither ``sale_parcel_count`` nor ``key`` nothing is flagged."""
    sales = pd.DataFrame({"key_sale": ["deed1", "deed2"]})

    result = flag_bulk_deeds(sales)

    assert result.tolist() == [False, False]
    assert list(result.index) == list(sales.index)


def test_flag_bulk_deeds_handles_na_count():
    """A missing ``sale_parcel_count`` on a single-row deed is not flagged."""
    nullable_counts = pd.array([pd.NA, 3], dtype="Int64")
    sales = pd.DataFrame(
        {
            "key": ["A", "B"],
            "key_sale": ["deed1", "deed2"],
            "sale_parcel_count": nullable_counts,
        }
    )

    assert flag_bulk_deeds(sales).tolist() == [False, True]