From eedab0843405af8689e482d5d849e86d9923bfd1 Mon Sep 17 00:00:00 2001 From: Jake Beierle Date: Tue, 31 Mar 2026 17:34:55 -0400 Subject: [PATCH 1/6] Fix latency feature extraction --- support_code/behavior_summaries.py | 16 ++- tests/support_code/__init__.py | 0 tests/support_code/test_behavior_summaries.py | 132 ++++++++++++++++++ 3 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 tests/support_code/__init__.py create mode 100644 tests/support_code/test_behavior_summaries.py diff --git a/support_code/behavior_summaries.py b/support_code/behavior_summaries.py index a24a26f..4b452b9 100644 --- a/support_code/behavior_summaries.py +++ b/support_code/behavior_summaries.py @@ -131,6 +131,18 @@ def aggregate_data_by_bin_size( grouped = data.groupby("MouseID") filtered_data = pd.concat([group.iloc[:bin_size] for _, group in grouped]) + # Extract latency values before summing. agg with a positional lambda + # preserves NaN (unlike first()/last() which skip NaN), returns a Series + # indexed by MouseID for correct alignment with aggregated. + latency_first_col = f"{behavior}_latency_to_first_prediction" + latency_last_col = f"{behavior}_latency_to_last_prediction" + latency_first = filtered_data.groupby("MouseID")[latency_first_col].agg( + lambda s: s.iloc[0] + ) + latency_last = filtered_data.groupby("MouseID")[latency_last_col].agg( + lambda s: s.iloc[-1] + ) + # Aggregate numeric columns by summing them numeric_cols = filtered_data.select_dtypes(include=["number"]).columns aggregated = filtered_data.groupby("MouseID")[numeric_cols].sum() @@ -181,10 +193,10 @@ def aggregate_data_by_bin_size( # TODO: var and std need to be aggregated across bins. # This is non-trivial because of the partial bouts and their associated weights. 
aggregated[f"bin_first_{bin_size * 5}.{behavior}_latency_first_prediction"] = ( - aggregated[f"{behavior}_latency_to_first_prediction"].head(1) + latency_first ) aggregated[f"bin_last_{bin_size * 5}.{behavior}_latency_last_prediction"] = ( - aggregated[f"{behavior}_latency_to_last_prediction"].tail(1) + latency_last ) # Reset index to make MouseID a regular column diff --git a/tests/support_code/__init__.py b/tests/support_code/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/support_code/test_behavior_summaries.py b/tests/support_code/test_behavior_summaries.py new file mode 100644 index 0000000..ef5f594 --- /dev/null +++ b/tests/support_code/test_behavior_summaries.py @@ -0,0 +1,132 @@ +"""Unit tests for support_code/behavior_summaries.py.""" + +import math +import sys +from pathlib import Path + +import pandas as pd +import pytest + +# behavior_summaries.py lives in support_code/, which is not a package. +# Add it to sys.path so we can import it directly. +sys.path.insert(0, str(Path(__file__).parents[2] / "support_code")) + +import behavior_summaries # noqa: E402 + + +BEHAVIOR = "Jumping" + + +def _make_filtered_data( + latency_first_values: list, + latency_last_values: list, + mouse_id: str = "mouse_A", +) -> pd.DataFrame: + """Build a minimal per-bin DataFrame matching the shape expected by aggregate_data_by_bin_size.""" + n = len(latency_first_values) + return pd.DataFrame( + { + "MouseID": [mouse_id] * n, + f"{BEHAVIOR}_latency_to_first_prediction": latency_first_values, + f"{BEHAVIOR}_latency_to_last_prediction": latency_last_values, + f"{BEHAVIOR}_time_behavior": [100.0] * n, + f"{BEHAVIOR}_time_not_behavior": [200.0] * n, + f"{BEHAVIOR}_behavior_dist": [50.0] * n, + f"{BEHAVIOR}_behavior_dist_threshold": [10.0] * n, + f"{BEHAVIOR}_behavior_dist_seg": [5.0] * n, + f"{BEHAVIOR}_bout_behavior": [2] * n, + f"{BEHAVIOR}_avg_bout_duration": [1.5] * n, + f"{BEHAVIOR}__stats_sample_count": [2] * n, + f"{BEHAVIOR}_bout_duration_std": 
[0.1] * n, + f"{BEHAVIOR}_bout_duration_var": [0.01] * n, + } + ) + + +class TestLatencyFirstPrediction: + def test_returns_first_bin_value_when_present(self): + """latency_first should be the first bin's value, not a cumulative sum.""" + data = _make_filtered_data( + latency_first_values=[2506.0, 9412.0, 18082.0, float("nan")], + latency_last_values=[4900.0, 11000.0, 19000.0, float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + col = f"bin_first_20.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(2506.0) + + def test_returns_nan_when_first_bin_has_no_behavior(self): + """latency_first should be NaN when the first bin has no behavior, not a later bin's value.""" + data = _make_filtered_data( + latency_first_values=[float("nan"), 5000.0, 12000.0, float("nan")], + latency_last_values=[float("nan"), 8000.0, 15000.0, float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + col = f"bin_first_20.{BEHAVIOR}_latency_first_prediction" + assert math.isnan(result[col].iloc[0]) + + def test_single_bin_returns_that_bins_value(self): + data = _make_filtered_data( + latency_first_values=[2506.0], + latency_last_values=[4900.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + col = f"bin_first_5.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(2506.0) + + +class TestLatencyLastPrediction: + def test_returns_last_bin_value_when_present(self): + """latency_last should be the last bin's value, not a cumulative sum.""" + data = _make_filtered_data( + latency_first_values=[2506.0, 9412.0, 18082.0, 38222.0], + latency_last_values=[4900.0, 11000.0, 19000.0, 45000.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + col = f"bin_last_20.{BEHAVIOR}_latency_last_prediction" + assert result[col].iloc[0] == 
pytest.approx(45000.0) + + def test_returns_nan_when_last_bin_has_no_behavior(self): + """latency_last should be NaN when the last bin has no behavior, not a previous bin's value.""" + data = _make_filtered_data( + latency_first_values=[float("nan"), 5000.0, 12000.0, float("nan")], + latency_last_values=[float("nan"), 8000.0, 15000.0, float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + col = f"bin_last_20.{BEHAVIOR}_latency_last_prediction" + assert math.isnan(result[col].iloc[0]) + + def test_single_bin_returns_that_bins_value(self): + data = _make_filtered_data( + latency_first_values=[2506.0], + latency_last_values=[4900.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + col = f"bin_last_5.{BEHAVIOR}_latency_last_prediction" + assert result[col].iloc[0] == pytest.approx(4900.0) + + +class TestMultiMouseAlignment: + def test_each_mouse_gets_its_own_first_latency(self): + """With multiple mice, each should receive their own first-bin latency value.""" + mouse_a = _make_filtered_data( + latency_first_values=[2506.0, 9412.0], + latency_last_values=[4900.0, 11000.0], + mouse_id="mouse_A", + ) + mouse_b = _make_filtered_data( + latency_first_values=[float("nan"), 5000.0], + latency_last_values=[float("nan"), 8000.0], + mouse_id="mouse_B", + ) + data = pd.concat([mouse_a, mouse_b], ignore_index=True) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + result = result.set_index("MouseID") + + first_col = f"bin_first_10.{BEHAVIOR}_latency_first_prediction" + last_col = f"bin_last_10.{BEHAVIOR}_latency_last_prediction" + + assert result.loc["mouse_A", first_col] == pytest.approx(2506.0) + assert math.isnan(result.loc["mouse_B", first_col]) + + assert result.loc["mouse_A", last_col] == pytest.approx(11000.0) + assert result.loc["mouse_B", last_col] == pytest.approx(8000.0) From 
f158cce80b5fb6ac2c42a6add000d5891087fea7 Mon Sep 17 00:00:00 2001 From: Jake Beierle Date: Wed, 1 Apr 2026 08:00:10 -0400 Subject: [PATCH 2/6] Fix cumulative sum bug --- support_code/behavior_summaries.py | 24 ++++-- tests/support_code/test_behavior_summaries.py | 86 +++++++++++++++++++ 2 files changed, 102 insertions(+), 8 deletions(-) diff --git a/support_code/behavior_summaries.py b/support_code/behavior_summaries.py index 4b452b9..5458067 100644 --- a/support_code/behavior_summaries.py +++ b/support_code/behavior_summaries.py @@ -143,6 +143,17 @@ def aggregate_data_by_bin_size( lambda s: s.iloc[-1] ) + # Extract last-bin avg_bout_duration and sample_count before summing. + # Same pattern as latency: last bin's value per MouseID, NaN-preserving. + avg_bout_dur_col = f"{behavior}_avg_bout_duration" + sample_count_col = f"{behavior}__stats_sample_count" + last_bin_avg_bout_dur = filtered_data.groupby("MouseID")[avg_bout_dur_col].agg( + lambda s: s.iloc[-1] + ) + last_bin_sample_count = filtered_data.groupby("MouseID")[sample_count_col].agg( + lambda s: s.iloc[-1] + ) + # Aggregate numeric columns by summing them numeric_cols = filtered_data.select_dtypes(include=["number"]).columns aggregated = filtered_data.groupby("MouseID")[numeric_cols].sum() @@ -182,14 +193,11 @@ def aggregate_data_by_bin_size( behavior_bout_col ] - # Additional stats - if np.sum(aggregated[f"{behavior}__stats_sample_count"]) == 0: - aggregated[f"bin_avg_{bin_size * 5}.{behavior}_avg_bout_length"] = np.nan - else: - aggregated[f"bin_avg_{bin_size * 5}.{behavior}_avg_bout_length"] = np.average( - aggregated[f"{behavior}_avg_bout_duration"], - weights=aggregated[f"{behavior}__stats_sample_count"], - ) + # Average bout length: use the last bin's value directly. Where that bin + # had no behavior (sample_count == 0), force NaN as a safety guard. 
+ aggregated[f"bin_avg_{bin_size * 5}.{behavior}_avg_bout_length"] = ( + last_bin_avg_bout_dur.where(last_bin_sample_count > 0, other=np.nan) + ) # TODO: var and std need to be aggregated across bins. # This is non-trivial because of the partial bouts and their associated weights. aggregated[f"bin_first_{bin_size * 5}.{behavior}_latency_first_prediction"] = ( diff --git a/tests/support_code/test_behavior_summaries.py b/tests/support_code/test_behavior_summaries.py index ef5f594..7b3a9eb 100644 --- a/tests/support_code/test_behavior_summaries.py +++ b/tests/support_code/test_behavior_summaries.py @@ -105,6 +105,72 @@ def test_single_bin_returns_that_bins_value(self): assert result[col].iloc[0] == pytest.approx(4900.0) +def _make_per_bin_data( + avg_bout_durations: list, + stats_sample_counts: list, + mouse_id: str = "mouse_A", +) -> pd.DataFrame: + """Build a per-bin DataFrame with varying avg_bout_duration and sample counts.""" + n = len(avg_bout_durations) + return pd.DataFrame( + { + "MouseID": [mouse_id] * n, + f"{BEHAVIOR}_latency_to_first_prediction": [100.0] * n, + f"{BEHAVIOR}_latency_to_last_prediction": [200.0] * n, + f"{BEHAVIOR}_time_behavior": [100.0] * n, + f"{BEHAVIOR}_time_not_behavior": [200.0] * n, + f"{BEHAVIOR}_behavior_dist": [50.0] * n, + f"{BEHAVIOR}_behavior_dist_threshold": [10.0] * n, + f"{BEHAVIOR}_behavior_dist_seg": [5.0] * n, + f"{BEHAVIOR}_bout_behavior": [2] * n, + f"{BEHAVIOR}_avg_bout_duration": avg_bout_durations, + f"{BEHAVIOR}__stats_sample_count": stats_sample_counts, + f"{BEHAVIOR}_bout_duration_std": [0.1] * n, + f"{BEHAVIOR}_bout_duration_var": [0.01] * n, + } + ) + + +class TestAvgBoutLength: + def test_single_bin_returns_that_bins_value(self): + data = _make_per_bin_data( + avg_bout_durations=[18.8], + stats_sample_counts=[5], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + col = f"bin_avg_5.{BEHAVIOR}_avg_bout_length" + assert result[col].iloc[0] == pytest.approx(18.8) + 
+ def test_multi_bin_returns_last_bin_value_not_sum(self): + """avg_bout_length should be the last bin's value, not a cumulative sum.""" + data = _make_per_bin_data( + avg_bout_durations=[10.0, 20.0, 30.0], + stats_sample_counts=[5, 3, 4], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=3, behavior=BEHAVIOR) + col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" + # Should be 30.0 (last bin), NOT 60.0 (sum of 10+20+30) + assert result[col].iloc[0] == pytest.approx(30.0) + + def test_returns_nan_when_last_bin_has_no_behavior(self): + data = _make_per_bin_data( + avg_bout_durations=[18.0, 0.0], + stats_sample_counts=[4, 0], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" + assert math.isnan(result[col].iloc[0]) + + def test_returns_nan_when_all_bins_have_no_behavior(self): + data = _make_per_bin_data( + avg_bout_durations=[0.0, 0.0], + stats_sample_counts=[0, 0], + ) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" + assert math.isnan(result[col].iloc[0]) + + class TestMultiMouseAlignment: def test_each_mouse_gets_its_own_first_latency(self): """With multiple mice, each should receive their own first-bin latency value.""" @@ -130,3 +196,23 @@ def test_each_mouse_gets_its_own_first_latency(self): assert result.loc["mouse_A", last_col] == pytest.approx(11000.0) assert result.loc["mouse_B", last_col] == pytest.approx(8000.0) + + def test_each_mouse_gets_its_own_avg_bout_length(self): + """Each mouse should get its own last-bin avg_bout_length, not a shared scalar.""" + mouse_a = _make_per_bin_data( + avg_bout_durations=[10.0, 20.0], + stats_sample_counts=[3, 5], + mouse_id="mouse_A", + ) + mouse_b = _make_per_bin_data( + avg_bout_durations=[7.0, 0.0], + stats_sample_counts=[2, 0], + mouse_id="mouse_B", + ) + data = pd.concat([mouse_a, mouse_b], 
ignore_index=True) + result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + result = result.set_index("MouseID") + + col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" + assert result.loc["mouse_A", col] == pytest.approx(20.0) + assert math.isnan(result.loc["mouse_B", col]) From aad32e1a0bc18138e29bbae351b3510f0ea7f971 Mon Sep 17 00:00:00 2001 From: Alexander Berger Date: Wed, 1 Apr 2026 16:21:30 -0400 Subject: [PATCH 3/6] Fix formatting/linting --- tests/support_code/__init__.py | 1 + tests/support_code/test_behavior_summaries.py | 64 +++++++++++++++---- 2 files changed, 51 insertions(+), 14 deletions(-) diff --git a/tests/support_code/__init__.py b/tests/support_code/__init__.py index e69de29..fb5b506 100644 --- a/tests/support_code/__init__.py +++ b/tests/support_code/__init__.py @@ -0,0 +1 @@ +"""Tests for support code modules.""" diff --git a/tests/support_code/test_behavior_summaries.py b/tests/support_code/test_behavior_summaries.py index 7b3a9eb..bf00a38 100644 --- a/tests/support_code/test_behavior_summaries.py +++ b/tests/support_code/test_behavior_summaries.py @@ -11,8 +11,7 @@ # Add it to sys.path so we can import it directly. 
sys.path.insert(0, str(Path(__file__).parents[2] / "support_code")) -import behavior_summaries # noqa: E402 - +import behavior_summaries BEHAVIOR = "Jumping" @@ -44,13 +43,17 @@ def _make_filtered_data( class TestLatencyFirstPrediction: + """Tests for latency_first_prediction aggregation.""" + def test_returns_first_bin_value_when_present(self): """latency_first should be the first bin's value, not a cumulative sum.""" data = _make_filtered_data( latency_first_values=[2506.0, 9412.0, 18082.0, float("nan")], latency_last_values=[4900.0, 11000.0, 19000.0, float("nan")], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=4, behavior=BEHAVIOR + ) col = f"bin_first_20.{BEHAVIOR}_latency_first_prediction" assert result[col].iloc[0] == pytest.approx(2506.0) @@ -60,28 +63,37 @@ def test_returns_nan_when_first_bin_has_no_behavior(self): latency_first_values=[float("nan"), 5000.0, 12000.0, float("nan")], latency_last_values=[float("nan"), 8000.0, 15000.0, float("nan")], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=4, behavior=BEHAVIOR + ) col = f"bin_first_20.{BEHAVIOR}_latency_first_prediction" assert math.isnan(result[col].iloc[0]) def test_single_bin_returns_that_bins_value(self): + """Single bin should return that bin's latency_first value.""" data = _make_filtered_data( latency_first_values=[2506.0], latency_last_values=[4900.0], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR + ) col = f"bin_first_5.{BEHAVIOR}_latency_first_prediction" assert result[col].iloc[0] == pytest.approx(2506.0) class TestLatencyLastPrediction: + """Tests for latency_last_prediction aggregation.""" + def 
test_returns_last_bin_value_when_present(self): """latency_last should be the last bin's value, not a cumulative sum.""" data = _make_filtered_data( latency_first_values=[2506.0, 9412.0, 18082.0, 38222.0], latency_last_values=[4900.0, 11000.0, 19000.0, 45000.0], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=4, behavior=BEHAVIOR + ) col = f"bin_last_20.{BEHAVIOR}_latency_last_prediction" assert result[col].iloc[0] == pytest.approx(45000.0) @@ -91,16 +103,21 @@ def test_returns_nan_when_last_bin_has_no_behavior(self): latency_first_values=[float("nan"), 5000.0, 12000.0, float("nan")], latency_last_values=[float("nan"), 8000.0, 15000.0, float("nan")], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=4, behavior=BEHAVIOR + ) col = f"bin_last_20.{BEHAVIOR}_latency_last_prediction" assert math.isnan(result[col].iloc[0]) def test_single_bin_returns_that_bins_value(self): + """Single bin should return that bin's latency_last value.""" data = _make_filtered_data( latency_first_values=[2506.0], latency_last_values=[4900.0], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR + ) col = f"bin_last_5.{BEHAVIOR}_latency_last_prediction" assert result[col].iloc[0] == pytest.approx(4900.0) @@ -132,12 +149,17 @@ def _make_per_bin_data( class TestAvgBoutLength: + """Tests for avg_bout_length aggregation.""" + def test_single_bin_returns_that_bins_value(self): + """Single bin should return that bin's avg_bout_duration.""" data = _make_per_bin_data( avg_bout_durations=[18.8], stats_sample_counts=[5], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + result = 
behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR + ) col = f"bin_avg_5.{BEHAVIOR}_avg_bout_length" assert result[col].iloc[0] == pytest.approx(18.8) @@ -147,31 +169,41 @@ def test_multi_bin_returns_last_bin_value_not_sum(self): avg_bout_durations=[10.0, 20.0, 30.0], stats_sample_counts=[5, 3, 4], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=3, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR + ) col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" # Should be 30.0 (last bin), NOT 60.0 (sum of 10+20+30) assert result[col].iloc[0] == pytest.approx(30.0) def test_returns_nan_when_last_bin_has_no_behavior(self): + """Should return NaN when the last bin has no behavior.""" data = _make_per_bin_data( avg_bout_durations=[18.0, 0.0], stats_sample_counts=[4, 0], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR + ) col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" assert math.isnan(result[col].iloc[0]) def test_returns_nan_when_all_bins_have_no_behavior(self): + """Should return NaN when all bins have no behavior.""" data = _make_per_bin_data( avg_bout_durations=[0.0, 0.0], stats_sample_counts=[0, 0], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR + ) col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" assert math.isnan(result[col].iloc[0]) class TestMultiMouseAlignment: + """Tests for multi-mouse alignment in aggregation.""" + def test_each_mouse_gets_its_own_first_latency(self): """With multiple mice, each should receive their own first-bin latency value.""" mouse_a = _make_filtered_data( @@ -185,7 +217,9 @@ def test_each_mouse_gets_its_own_first_latency(self): 
mouse_id="mouse_B", ) data = pd.concat([mouse_a, mouse_b], ignore_index=True) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR + ) result = result.set_index("MouseID") first_col = f"bin_first_10.{BEHAVIOR}_latency_first_prediction" @@ -210,7 +244,9 @@ def test_each_mouse_gets_its_own_avg_bout_length(self): mouse_id="mouse_B", ) data = pd.concat([mouse_a, mouse_b], ignore_index=True) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR + ) result = result.set_index("MouseID") col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" From dcb55288f49d14ded0663cc5a974fde0a423c2d9 Mon Sep 17 00:00:00 2001 From: Jake Beierle Date: Mon, 6 Apr 2026 11:42:29 -0400 Subject: [PATCH 4/6] Fix first latency bug --- nextflow/modules/jabs_classifiers.nf | 4 +- nextflow/workflows/feature_generation.nf | 10 +- support_code/behavior_summaries.py | 60 ++-- tests/support_code/test_behavior_summaries.py | 307 +++++++++++------- 4 files changed, 245 insertions(+), 136 deletions(-) diff --git a/nextflow/modules/jabs_classifiers.nf b/nextflow/modules/jabs_classifiers.nf index d53520b..ae24b85 100644 --- a/nextflow/modules/jabs_classifiers.nf +++ b/nextflow/modules/jabs_classifiers.nf @@ -176,14 +176,14 @@ process BEHAVIOR_TABLE_TO_FEATURES { label "r_jabs_table_convert" input: - tuple path(in_summary_table), val(bin_size) + tuple path(in_summary_table), val(bin_size), val(prev_bin_size) output: path("${in_summary_table.baseName}_features_${bin_size}.csv"), emit: features script: """ - python3 ${params.support_code_dir}/behavior_summaries.py -f ${in_summary_table} -b ${bin_size} -o "${in_summary_table.baseName}_features_${bin_size}.csv" + python3 ${params.support_code_dir}/behavior_summaries.py -f ${in_summary_table} -b 
${bin_size} -p ${prev_bin_size} -o "${in_summary_table.baseName}_features_${bin_size}.csv" """ } diff --git a/nextflow/workflows/feature_generation.nf b/nextflow/workflows/feature_generation.nf index c8eb1c6..eb0a2b1 100644 --- a/nextflow/workflows/feature_generation.nf +++ b/nextflow/workflows/feature_generation.nf @@ -131,12 +131,20 @@ workflow SINGLE_MOUSE_V6_FEATURES { .collect() merged_bout_tables = AGGREGATE_BOUT_TABLES(all_bout_tables).merged_bout_tables + // Compute incremental bin pairs: [bin_size, prev_bin_size] + // Each bin_size is paired with the previous feature_bin so that latency + // features describe only the incremental time window. + sorted_bins = params.feature_bins.sort() + bin_pairs = sorted_bins.withIndex().collect { bin_size, idx -> + [bin_size, idx == 0 ? 0 : sorted_bins[idx - 1]] + } + // Combine table data into feature file all_summary_tables = heuristic_tables .concat(classifier_tables) .map { bout_table, summary_table -> summary_table } .flatten() - .combine(params.feature_bins) + .combine(Channel.fromList(bin_pairs)) individual_behavior_features = BEHAVIOR_TABLE_TO_FEATURES(all_summary_tables) // Features are named columns (wide) split across multiple files // Transform them into long format so that we can row-concat without sorting diff --git a/support_code/behavior_summaries.py b/support_code/behavior_summaries.py index 5458067..04d418f 100644 --- a/support_code/behavior_summaries.py +++ b/support_code/behavior_summaries.py @@ -34,6 +34,13 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "-o", "--output", type=str, required=True, help="output file name" ) + parser.add_argument( + "-p", + "--prev_bin_size", + type=int, + default=0, + help="previous bin size (rows to skip for incremental latency features)", + ) return parser.parse_args() @@ -115,7 +122,7 @@ def get_columns_to_exclude(behavior: str) -> list: def aggregate_data_by_bin_size( - data: pd.DataFrame, bin_size: int, behavior: str + data: pd.DataFrame, bin_size: 
int, behavior: str, prev_bin_size: int = 0 ) -> pd.DataFrame: """Aggregate data by bin size. @@ -123,6 +130,9 @@ def aggregate_data_by_bin_size( data: Preprocessed dataframe. bin_size: Number of bins to aggregate. behavior: Behavior name. + prev_bin_size: Previous bin size; rows before this index are excluded + from incremental features (latency). Sum features and avg_bout_length + remain cumulative from bin 0. Returns: pd.DataFrame: Aggregated dataframe. @@ -131,28 +141,34 @@ def aggregate_data_by_bin_size( grouped = data.groupby("MouseID") filtered_data = pd.concat([group.iloc[:bin_size] for _, group in grouped]) - # Extract latency values before summing. agg with a positional lambda - # preserves NaN (unlike first()/last() which skip NaN), returns a Series - # indexed by MouseID for correct alignment with aggregated. + # Incremental slice: only the "new" bins for latency features. + # E.g., with feature_bins=[1,4], bin_size=4, prev_bin_size=1: + # filtered_data has bins 0-3 (0-20min), incremental has bins 1-3 (5-20min) + incremental_data = pd.concat([ + group.iloc[prev_bin_size:bin_size] + for _, group in filtered_data.groupby("MouseID") + ]) + + # Latency: first()/last() skip NaN within the incremental window. + # For a single-bin window, returns that bin's value or NaN. + # For a multi-bin window, returns first/last non-NaN, or NaN if all are NaN. latency_first_col = f"{behavior}_latency_to_first_prediction" latency_last_col = f"{behavior}_latency_to_last_prediction" - latency_first = filtered_data.groupby("MouseID")[latency_first_col].agg( - lambda s: s.iloc[0] - ) - latency_last = filtered_data.groupby("MouseID")[latency_last_col].agg( - lambda s: s.iloc[-1] - ) + latency_first = incremental_data.groupby("MouseID")[latency_first_col].first() + latency_last = incremental_data.groupby("MouseID")[latency_last_col].last() - # Extract last-bin avg_bout_duration and sample_count before summing. 
- # Same pattern as latency: last bin's value per MouseID, NaN-preserving. + # Avg bout length: cumulative weighted average across ALL bins (0 to bin_size), + # matching the semantics of sum features. avg_bout_dur_col = f"{behavior}_avg_bout_duration" sample_count_col = f"{behavior}__stats_sample_count" - last_bin_avg_bout_dur = filtered_data.groupby("MouseID")[avg_bout_dur_col].agg( - lambda s: s.iloc[-1] - ) - last_bin_sample_count = filtered_data.groupby("MouseID")[sample_count_col].agg( - lambda s: s.iloc[-1] - ) + + def _weighted_avg_bout(group): + total_count = group[sample_count_col].sum() + if total_count == 0: + return np.nan + return np.average(group[avg_bout_dur_col], weights=group[sample_count_col]) + + avg_bout_length = filtered_data.groupby("MouseID").apply(_weighted_avg_bout) # Aggregate numeric columns by summing them numeric_cols = filtered_data.select_dtypes(include=["number"]).columns @@ -193,11 +209,7 @@ def aggregate_data_by_bin_size( behavior_bout_col ] - # Average bout length: use the last bin's value directly. Where that bin - # had no behavior (sample_count == 0), force NaN as a safety guard. - aggregated[f"bin_avg_{bin_size * 5}.{behavior}_avg_bout_length"] = ( - last_bin_avg_bout_dur.where(last_bin_sample_count > 0, other=np.nan) - ) + aggregated[f"bin_avg_{bin_size * 5}.{behavior}_avg_bout_length"] = avg_bout_length # TODO: var and std need to be aggregated across bins. # This is non-trivial because of the partial bouts and their associated weights. 
aggregated[f"bin_first_{bin_size * 5}.{behavior}_latency_first_prediction"] = ( @@ -227,7 +239,7 @@ def main(): # Aggregate data by bin size aggregated_data = aggregate_data_by_bin_size( - processed_data, args.bin_size, behavior + processed_data, args.bin_size, behavior, args.prev_bin_size ) # Drop excluded columns diff --git a/tests/support_code/test_behavior_summaries.py b/tests/support_code/test_behavior_summaries.py index 7b3a9eb..cb0db14 100644 --- a/tests/support_code/test_behavior_summaries.py +++ b/tests/support_code/test_behavior_summaries.py @@ -4,6 +4,7 @@ import sys from pathlib import Path +import numpy as np import pandas as pd import pytest @@ -17,13 +18,19 @@ BEHAVIOR = "Jumping" -def _make_filtered_data( +def _make_bin_data( latency_first_values: list, latency_last_values: list, + avg_bout_durations: list | None = None, + stats_sample_counts: list | None = None, mouse_id: str = "mouse_A", ) -> pd.DataFrame: - """Build a minimal per-bin DataFrame matching the shape expected by aggregate_data_by_bin_size.""" + """Build a per-bin DataFrame matching the shape expected by aggregate_data_by_bin_size.""" n = len(latency_first_values) + if avg_bout_durations is None: + avg_bout_durations = [1.5] * n + if stats_sample_counts is None: + stats_sample_counts = [2] * n return pd.DataFrame( { "MouseID": [mouse_id] * n, @@ -35,8 +42,8 @@ def _make_filtered_data( f"{BEHAVIOR}_behavior_dist_threshold": [10.0] * n, f"{BEHAVIOR}_behavior_dist_seg": [5.0] * n, f"{BEHAVIOR}_bout_behavior": [2] * n, - f"{BEHAVIOR}_avg_bout_duration": [1.5] * n, - f"{BEHAVIOR}__stats_sample_count": [2] * n, + f"{BEHAVIOR}_avg_bout_duration": avg_bout_durations, + f"{BEHAVIOR}__stats_sample_count": stats_sample_counts, f"{BEHAVIOR}_bout_duration_std": [0.1] * n, f"{BEHAVIOR}_bout_duration_var": [0.01] * n, } @@ -44,175 +51,257 @@ def _make_filtered_data( class TestLatencyFirstPrediction: - def test_returns_first_bin_value_when_present(self): - """latency_first should be the first bin's 
value, not a cumulative sum.""" - data = _make_filtered_data( - latency_first_values=[2506.0, 9412.0, 18082.0, float("nan")], - latency_last_values=[4900.0, 11000.0, 19000.0, float("nan")], - ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) - col = f"bin_first_20.{BEHAVIOR}_latency_first_prediction" - assert result[col].iloc[0] == pytest.approx(2506.0) + """Tests for bin_first_XX.latency_first_prediction (incremental semantics).""" - def test_returns_nan_when_first_bin_has_no_behavior(self): - """latency_first should be NaN when the first bin has no behavior, not a later bin's value.""" - data = _make_filtered_data( - latency_first_values=[float("nan"), 5000.0, 12000.0, float("nan")], - latency_last_values=[float("nan"), 8000.0, 15000.0, float("nan")], - ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) - col = f"bin_first_20.{BEHAVIOR}_latency_first_prediction" - assert math.isnan(result[col].iloc[0]) - - def test_single_bin_returns_that_bins_value(self): - data = _make_filtered_data( + def test_single_bin_returns_value(self): + """bin_size=1, prev_bin_size=0: returns first bin's value.""" + data = _make_bin_data( latency_first_values=[2506.0], latency_last_values=[4900.0], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR, prev_bin_size=0 + ) col = f"bin_first_5.{BEHAVIOR}_latency_first_prediction" assert result[col].iloc[0] == pytest.approx(2506.0) + def test_single_bin_nan_returns_nan(self): + data = _make_bin_data( + latency_first_values=[float("nan")], + latency_last_values=[float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR, prev_bin_size=0 + ) + col = f"bin_first_5.{BEHAVIOR}_latency_first_prediction" + assert math.isnan(result[col].iloc[0]) + + def 
test_consecutive_bins_returns_incremental_value(self): + """bin_size=2, prev_bin_size=1: should return bin 1's value (5-10min), not bin 0's.""" + data = _make_bin_data( + latency_first_values=[2506.0, 9412.0], + latency_last_values=[4900.0, 11000.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_first_10.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(9412.0) -class TestLatencyLastPrediction: - def test_returns_last_bin_value_when_present(self): - """latency_last should be the last bin's value, not a cumulative sum.""" - data = _make_filtered_data( - latency_first_values=[2506.0, 9412.0, 18082.0, 38222.0], - latency_last_values=[4900.0, 11000.0, 19000.0, 45000.0], - ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) - col = f"bin_last_20.{BEHAVIOR}_latency_last_prediction" - assert result[col].iloc[0] == pytest.approx(45000.0) - - def test_returns_nan_when_last_bin_has_no_behavior(self): - """latency_last should be NaN when the last bin has no behavior, not a previous bin's value.""" - data = _make_filtered_data( - latency_first_values=[float("nan"), 5000.0, 12000.0, float("nan")], - latency_last_values=[float("nan"), 8000.0, 15000.0, float("nan")], - ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=4, behavior=BEHAVIOR) - col = f"bin_last_20.{BEHAVIOR}_latency_last_prediction" + def test_non_consecutive_bins_returns_first_in_range(self): + """bin_size=3, prev_bin_size=1: incremental window is bins 1-2 (5-15min). 
+ Should return first non-NaN in that range.""" + data = _make_bin_data( + latency_first_values=[2506.0, 9412.0, 18082.0], + latency_last_values=[4900.0, 11000.0, 19000.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_first_15.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(9412.0) + + def test_non_consecutive_skips_nan_in_range(self): + """bin_size=3, prev_bin_size=1: bins 1-2, bin 1 is NaN → returns bin 2's value.""" + data = _make_bin_data( + latency_first_values=[2506.0, float("nan"), 18082.0], + latency_last_values=[4900.0, float("nan"), 19000.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_first_15.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(18082.0) + + def test_incremental_all_nan_returns_nan(self): + """bin_size=3, prev_bin_size=1: bins 1-2 both NaN → NaN.""" + data = _make_bin_data( + latency_first_values=[2506.0, float("nan"), float("nan")], + latency_last_values=[4900.0, float("nan"), float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_first_15.{BEHAVIOR}_latency_first_prediction" assert math.isnan(result[col].iloc[0]) - def test_single_bin_returns_that_bins_value(self): - data = _make_filtered_data( + def test_prev_bin_zero_returns_first_non_nan(self): + """bin_size=3, prev_bin_size=0: full window 0-15min.""" + data = _make_bin_data( + latency_first_values=[float("nan"), 9412.0, 18082.0], + latency_last_values=[float("nan"), 11000.0, 19000.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=0 + ) + col = f"bin_first_15.{BEHAVIOR}_latency_first_prediction" + assert result[col].iloc[0] == pytest.approx(9412.0) + + +class 
TestLatencyLastPrediction: + """Tests for bin_last_XX.latency_last_prediction (incremental semantics).""" + + def test_single_bin_returns_value(self): + data = _make_bin_data( latency_first_values=[2506.0], latency_last_values=[4900.0], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR, prev_bin_size=0 + ) col = f"bin_last_5.{BEHAVIOR}_latency_last_prediction" assert result[col].iloc[0] == pytest.approx(4900.0) + def test_consecutive_bins_returns_incremental_value(self): + """bin_size=2, prev_bin_size=1: returns bin 1's last prediction.""" + data = _make_bin_data( + latency_first_values=[2506.0, 9412.0], + latency_last_values=[4900.0, 14863.0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_last_10.{BEHAVIOR}_latency_last_prediction" + assert result[col].iloc[0] == pytest.approx(14863.0) -def _make_per_bin_data( - avg_bout_durations: list, - stats_sample_counts: list, - mouse_id: str = "mouse_A", -) -> pd.DataFrame: - """Build a per-bin DataFrame with varying avg_bout_duration and sample counts.""" - n = len(avg_bout_durations) - return pd.DataFrame( - { - "MouseID": [mouse_id] * n, - f"{BEHAVIOR}_latency_to_first_prediction": [100.0] * n, - f"{BEHAVIOR}_latency_to_last_prediction": [200.0] * n, - f"{BEHAVIOR}_time_behavior": [100.0] * n, - f"{BEHAVIOR}_time_not_behavior": [200.0] * n, - f"{BEHAVIOR}_behavior_dist": [50.0] * n, - f"{BEHAVIOR}_behavior_dist_threshold": [10.0] * n, - f"{BEHAVIOR}_behavior_dist_seg": [5.0] * n, - f"{BEHAVIOR}_bout_behavior": [2] * n, - f"{BEHAVIOR}_avg_bout_duration": avg_bout_durations, - f"{BEHAVIOR}__stats_sample_count": stats_sample_counts, - f"{BEHAVIOR}_bout_duration_std": [0.1] * n, - f"{BEHAVIOR}_bout_duration_var": [0.01] * n, - } - ) + def test_non_consecutive_returns_last_non_nan(self): + 
"""bin_size=3, prev_bin_size=1: bins 1-2, returns last non-NaN.""" + data = _make_bin_data( + latency_first_values=[2506.0, 9412.0, float("nan")], + latency_last_values=[4900.0, 11000.0, float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_last_15.{BEHAVIOR}_latency_last_prediction" + assert result[col].iloc[0] == pytest.approx(11000.0) + + def test_incremental_all_nan_returns_nan(self): + data = _make_bin_data( + latency_first_values=[2506.0, float("nan"), float("nan")], + latency_last_values=[4900.0, float("nan"), float("nan")], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 + ) + col = f"bin_last_15.{BEHAVIOR}_latency_last_prediction" + assert math.isnan(result[col].iloc[0]) class TestAvgBoutLength: - def test_single_bin_returns_that_bins_value(self): - data = _make_per_bin_data( + """Tests for avg_bout_length (cumulative weighted average over all bins 0..bin_size).""" + + def test_single_bin_returns_value(self): + data = _make_bin_data( + latency_first_values=[100.0], + latency_last_values=[200.0], avg_bout_durations=[18.8], stats_sample_counts=[5], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=1, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=1, behavior=BEHAVIOR + ) col = f"bin_avg_5.{BEHAVIOR}_avg_bout_length" assert result[col].iloc[0] == pytest.approx(18.8) - def test_multi_bin_returns_last_bin_value_not_sum(self): - """avg_bout_length should be the last bin's value, not a cumulative sum.""" - data = _make_per_bin_data( + def test_cumulative_weighted_average(self): + """avg_bout_length is cumulative: weighted avg across all bins 0..bin_size.""" + data = _make_bin_data( + latency_first_values=[100.0, 200.0, 300.0], + latency_last_values=[150.0, 250.0, 350.0], avg_bout_durations=[10.0, 20.0, 30.0], 
stats_sample_counts=[5, 3, 4], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=3, behavior=BEHAVIOR) - col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" - # Should be 30.0 (last bin), NOT 60.0 (sum of 10+20+30) - assert result[col].iloc[0] == pytest.approx(30.0) - - def test_returns_nan_when_last_bin_has_no_behavior(self): - data = _make_per_bin_data( - avg_bout_durations=[18.0, 0.0], - stats_sample_counts=[4, 0], + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR, prev_bin_size=1 ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) - col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" - assert math.isnan(result[col].iloc[0]) + col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" + # Weighted avg = (10*5 + 20*3 + 30*4) / (5+3+4) = 230/12 ≈ 19.1667 + expected = np.average([10.0, 20.0, 30.0], weights=[5, 3, 4]) + assert result[col].iloc[0] == pytest.approx(expected) - def test_returns_nan_when_all_bins_have_no_behavior(self): - data = _make_per_bin_data( + def test_returns_nan_when_all_bins_no_behavior(self): + data = _make_bin_data( + latency_first_values=[float("nan"), float("nan")], + latency_last_values=[float("nan"), float("nan")], avg_bout_durations=[0.0, 0.0], stats_sample_counts=[0, 0], ) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR + ) col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" assert math.isnan(result[col].iloc[0]) + def test_skips_bins_with_no_behavior_in_weighted_avg(self): + """Bins with sample_count=0 have zero weight and don't affect the average.""" + data = _make_bin_data( + latency_first_values=[100.0, float("nan"), 300.0], + latency_last_values=[150.0, float("nan"), 350.0], + avg_bout_durations=[10.0, 0.0, 30.0], + stats_sample_counts=[5, 0, 4], + ) + result = 
behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR + ) + col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" + # Weighted avg = (10*5 + 30*4) / (5+4) = 170/9 ≈ 18.889 + expected = np.average([10.0, 30.0], weights=[5, 4]) + assert result[col].iloc[0] == pytest.approx(expected) + class TestMultiMouseAlignment: - def test_each_mouse_gets_its_own_first_latency(self): - """With multiple mice, each should receive their own first-bin latency value.""" - mouse_a = _make_filtered_data( + def test_each_mouse_gets_own_latency(self): + """Each mouse gets its own incremental latency values.""" + mouse_a = _make_bin_data( latency_first_values=[2506.0, 9412.0], latency_last_values=[4900.0, 11000.0], mouse_id="mouse_A", ) - mouse_b = _make_filtered_data( - latency_first_values=[float("nan"), 5000.0], - latency_last_values=[float("nan"), 8000.0], + mouse_b = _make_bin_data( + latency_first_values=[3000.0, float("nan")], + latency_last_values=[5000.0, float("nan")], mouse_id="mouse_B", ) data = pd.concat([mouse_a, mouse_b], ignore_index=True) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR, prev_bin_size=1 + ) result = result.set_index("MouseID") first_col = f"bin_first_10.{BEHAVIOR}_latency_first_prediction" last_col = f"bin_last_10.{BEHAVIOR}_latency_last_prediction" - assert result.loc["mouse_A", first_col] == pytest.approx(2506.0) + assert result.loc["mouse_A", first_col] == pytest.approx(9412.0) assert math.isnan(result.loc["mouse_B", first_col]) assert result.loc["mouse_A", last_col] == pytest.approx(11000.0) - assert result.loc["mouse_B", last_col] == pytest.approx(8000.0) + assert math.isnan(result.loc["mouse_B", last_col]) - def test_each_mouse_gets_its_own_avg_bout_length(self): - """Each mouse should get its own last-bin avg_bout_length, not a shared scalar.""" - mouse_a = _make_per_bin_data( + def 
test_each_mouse_gets_own_avg_bout_length(self): + """Each mouse gets its own cumulative weighted avg_bout_length.""" + mouse_a = _make_bin_data( + latency_first_values=[100.0, 200.0], + latency_last_values=[150.0, 250.0], avg_bout_durations=[10.0, 20.0], stats_sample_counts=[3, 5], mouse_id="mouse_A", ) - mouse_b = _make_per_bin_data( + mouse_b = _make_bin_data( + latency_first_values=[300.0, 400.0], + latency_last_values=[350.0, 450.0], avg_bout_durations=[7.0, 0.0], stats_sample_counts=[2, 0], mouse_id="mouse_B", ) data = pd.concat([mouse_a, mouse_b], ignore_index=True) - result = behavior_summaries.aggregate_data_by_bin_size(data, bin_size=2, behavior=BEHAVIOR) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=2, behavior=BEHAVIOR + ) result = result.set_index("MouseID") col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" - assert result.loc["mouse_A", col] == pytest.approx(20.0) - assert math.isnan(result.loc["mouse_B", col]) + # Mouse A: (10*3 + 20*5) / (3+5) = 130/8 = 16.25 + assert result.loc["mouse_A", col] == pytest.approx(16.25) + # Mouse B: only bin 0 has behavior → 7.0 + assert result.loc["mouse_B", col] == pytest.approx(7.0) From c979183e88593878186fc3a489727db12fbac42e Mon Sep 17 00:00:00 2001 From: Jake Beierle Date: Mon, 6 Apr 2026 15:16:58 -0400 Subject: [PATCH 5/6] Fix average bout, includes all bouts from vid start --- support_code/behavior_summaries.py | 9 ++++-- tests/support_code/test_behavior_summaries.py | 32 +++++++++++++++++++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/support_code/behavior_summaries.py b/support_code/behavior_summaries.py index 04d418f..7bdeff2 100644 --- a/support_code/behavior_summaries.py +++ b/support_code/behavior_summaries.py @@ -163,10 +163,13 @@ def aggregate_data_by_bin_size( sample_count_col = f"{behavior}__stats_sample_count" def _weighted_avg_bout(group): - total_count = group[sample_count_col].sum() - if total_count == 0: + mask = group[sample_count_col] > 0 + if not 
mask.any(): return np.nan - return np.average(group[avg_bout_dur_col], weights=group[sample_count_col]) + return np.average( + group.loc[mask, avg_bout_dur_col], + weights=group.loc[mask, sample_count_col], + ) avg_bout_length = filtered_data.groupby("MouseID").apply(_weighted_avg_bout) diff --git a/tests/support_code/test_behavior_summaries.py b/tests/support_code/test_behavior_summaries.py index ad36657..4b3fdcd 100644 --- a/tests/support_code/test_behavior_summaries.py +++ b/tests/support_code/test_behavior_summaries.py @@ -248,6 +248,38 @@ def test_returns_nan_when_all_bins_have_no_behavior(self): col = f"bin_avg_10.{BEHAVIOR}_avg_bout_length" assert math.isnan(result[col].iloc[0]) + def test_first_bin_no_behavior_uses_later_bins(self): + """No behavior in first bin (NaN duration, 0 count) — later bins drive the average.""" + data = _make_bin_data( + latency_first_values=[float("nan"), 100.0, 200.0], + latency_last_values=[float("nan"), 150.0, 250.0], + avg_bout_durations=[float("nan"), 10.0, 20.0], + stats_sample_counts=[0, 5, 3], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR + ) + col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" + # Bins 1 and 2 contribute: (10*5 + 20*3) / (5+3) = 110/8 = 13.75 + expected = np.average([10.0, 20.0], weights=[5, 3]) + assert result[col].iloc[0] == pytest.approx(expected) + + def test_nan_duration_with_zero_weight_is_ignored(self): + """NaN avg_bout_duration with sample_count=0 should not poison the weighted average.""" + data = _make_bin_data( + latency_first_values=[100.0, 200.0, float("nan")], + latency_last_values=[150.0, 250.0, float("nan")], + avg_bout_durations=[10.0, 20.0, float("nan")], + stats_sample_counts=[5, 3, 0], + ) + result = behavior_summaries.aggregate_data_by_bin_size( + data, bin_size=3, behavior=BEHAVIOR + ) + col = f"bin_avg_15.{BEHAVIOR}_avg_bout_length" + # Only bins 0 and 1 contribute: (10*5 + 20*3) / (5+3) = 110/8 = 13.75 + expected = 
np.average([10.0, 20.0], weights=[5, 3]) + assert result[col].iloc[0] == pytest.approx(expected) + def test_skips_bins_with_no_behavior_in_weighted_avg(self): """Bins with sample_count=0 have zero weight and don't affect the average.""" data = _make_bin_data( From bc3a0cf6b110453658496b184f7747a9361df79f Mon Sep 17 00:00:00 2001 From: Jacob Beierle Date: Tue, 7 Apr 2026 10:13:19 -0400 Subject: [PATCH 6/6] Fix formating and linting errors --- support_code/behavior_summaries.py | 10 ++++++---- tests/support_code/test_behavior_summaries.py | 7 ++++++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/support_code/behavior_summaries.py b/support_code/behavior_summaries.py index 7bdeff2..0d9c06a 100644 --- a/support_code/behavior_summaries.py +++ b/support_code/behavior_summaries.py @@ -144,10 +144,12 @@ def aggregate_data_by_bin_size( # Incremental slice: only the "new" bins for latency features. # E.g., with feature_bins=[1,4], bin_size=4, prev_bin_size=1: # filtered_data has bins 0-3 (0-20min), incremental has bins 1-3 (5-20min) - incremental_data = pd.concat([ - group.iloc[prev_bin_size:bin_size] - for _, group in filtered_data.groupby("MouseID") - ]) + incremental_data = pd.concat( + [ + group.iloc[prev_bin_size:bin_size] + for _, group in filtered_data.groupby("MouseID") + ] + ) # Latency: first()/last() skip NaN within the incremental window. # For a single-bin window, returns that bin's value or NaN. 
diff --git a/tests/support_code/test_behavior_summaries.py b/tests/support_code/test_behavior_summaries.py index 4b3fdcd..6c51542 100644 --- a/tests/support_code/test_behavior_summaries.py +++ b/tests/support_code/test_behavior_summaries.py @@ -65,6 +65,7 @@ def test_single_bin_returns_value(self): assert result[col].iloc[0] == pytest.approx(2506.0) def test_single_bin_nan_returns_nan(self): + """NaN input returns NaN for single bin.""" data = _make_bin_data( latency_first_values=[float("nan")], latency_last_values=[float("nan")], @@ -89,7 +90,9 @@ def test_consecutive_bins_returns_incremental_value(self): def test_non_consecutive_bins_returns_first_in_range(self): """bin_size=3, prev_bin_size=1: incremental window is bins 1-2 (5-15min). - Should return first non-NaN in that range.""" + + Should return first non-NaN in that range. + """ data = _make_bin_data( latency_first_values=[2506.0, 9412.0, 18082.0], latency_last_values=[4900.0, 11000.0, 19000.0], @@ -141,6 +144,7 @@ class TestLatencyLastPrediction: """Tests for bin_last_XX.latency_last_prediction (incremental semantics).""" def test_single_bin_returns_value(self): + """Single bin returns the last prediction value.""" data = _make_bin_data( latency_first_values=[2506.0], latency_last_values=[4900.0], @@ -176,6 +180,7 @@ def test_non_consecutive_returns_last_non_nan(self): assert result[col].iloc[0] == pytest.approx(11000.0) def test_incremental_all_nan_returns_nan(self): + """All NaN in incremental range returns NaN.""" data = _make_bin_data( latency_first_values=[2506.0, float("nan"), float("nan")], latency_last_values=[4900.0, float("nan"), float("nan")],