diff --git a/btcopilot/tests/training/test_connectivity_check.py b/btcopilot/tests/training/test_connectivity_check.py new file mode 100644 index 0000000..501b347 --- /dev/null +++ b/btcopilot/tests/training/test_connectivity_check.py @@ -0,0 +1,42 @@ +from btcopilot.training.connectivity_check import lcc_percent + + +def _p(id, name, primary=False, parents=None): + return {"id": id, "name": name, "primary": primary, "parents": parents} + + +def _b(id, a, b): + return {"id": id, "person_a": a, "person_b": b} + + +def test_user_connects_family_as_hub(): + # User (proband) bonded to spouse and parent of two children — one family via the user. + people = [_p(1, "User", primary=True), _p(10, "Spouse"), _p(11, "Kid1", parents=100), _p(12, "Kid2", parents=100)] + bonds = [_b(100, 1, 10)] + s = lcc_percent(people, bonds) + # User excluded from count; the 3 relatives form one component via the user node. + assert s["total"] == 3 + assert s["lcc"] == 3 + assert s["lcc_pct"] == 100.0 + + +def test_without_user_links_family_fragments(): + # Same people but the children's parents bond is missing — they fragment. + people = [_p(1, "User", primary=True), _p(10, "Spouse"), _p(11, "Kid1"), _p(12, "Kid2")] + bonds = [_b(100, 1, 10)] + s = lcc_percent(people, bonds) + assert s["total"] == 3 + assert s["lcc"] == 1 # spouse via user; the two kids are isolated singletons + + +def test_assistant_dropped_from_graph_and_count(): + people = [_p(1, "User", primary=True), _p(2, "Assistant"), _p(10, "Mom"), _p(11, "Kid", parents=100)] + bonds = [_b(100, 1, 10)] + s = lcc_percent(people, bonds) + assert s["total"] == 2 # Mom + Kid only; User and Assistant excluded from count + assert s["lcc"] == 2 # Kid -> User(hub) -> Mom + + +def test_empty_returns_zero(): + s = lcc_percent([_p(1, "User", primary=True), _p(2, "Assistant")], []) + assert s == {"total": 0, "components": 0, "lcc": 0, "lcc_pct": 0.0} diff --git a/btcopilot/training/connectivity_check.py b/btcopilot/training/connectivity_check.py index 8e7bc9f..8d71090 100644 --- a/btcopilot/training/connectivity_check.py +++ b/btcopilot/training/connectivity_check.py @@ -16,6 +16,10 @@ # Measure a server diagram from the DB (committed state, no extraction): uv run python -m btcopilot.training.connectivity_check --diagram 1924 + + # Accumulate real-chat discussions in order (mimics live diagram growth): + uv run python -m btcopilot.training.connectivity_check --accumulate 55,58,60 + uv run python -m btcopilot.training.connectivity_check --accumulate 28,57 """ import argparse @@ -60,19 +64,33 @@ def _person_parents(p) -> int | None: return p.get("parents") if isinstance(p, dict) else p.parents +def _assistant_ids(people: list) -> set[int]: + """ID(s) of the Assistant (id=2) — the AI is never part of the family graph.""" + return {_person_id(p) for p in people if _person_id(p) == 2} + + def lcc_percent(people: list, pair_bonds: list) -> dict: """ + LCC% of the family tree, EXCLUDING the User and Assistant from the count but + keeping the User as a CONNECTING node. In a Personal-app diagram the User is + the proband: spouse, children, parents and siblings all connect through them, + so deleting the User node fragments correctly-extracted families. The Assistant + (id=2) is the AI and is dropped from the graph entirely. + Returns: - total: int — non-default people count - components: int — number of connected components - lcc: int — size of largest connected component + total: int — non-default people count (excludes User + Assistant) + components: int — number of connected components (User-as-connector graph) + lcc: int — non-default members of the largest component lcc_pct: float — lcc / total * 100 (0.0 if total == 0) """ - default = _default_ids(people) - nodes = {_person_id(p) for p in people if _person_id(p) not in default and _person_id(p) is not None} + default = _default_ids(people) # User + Assistant — excluded from the count + assistant = _assistant_ids(people) # Assistant only — excluded from the graph + nodes = {_person_id(p) for p in people + if _person_id(p) is not None and _person_id(p) not in assistant} - if not nodes: - return {"total": 0, "components": 0, "lcc": 0, "lcc_pct": 0.0} + total = sum(1 for p in people if _person_id(p) is not None and _person_id(p) not in default) + if not nodes or total == 0: + return {"total": total, "components": 0, "lcc": 0, "lcc_pct": 0.0} # adjacency adj: dict[int, set[int]] = {n: set() for n in nodes} @@ -101,7 +119,7 @@ def lcc_percent(people: list, pair_bonds: list) -> dict: adj[pid].add(parent) adj[parent].add(pid) - # BFS connected components + # DFS connected components; size each by its NON-DEFAULT members only visited: set[int] = set() component_sizes: list[int] = [] for start in nodes: @@ -109,18 +127,18 @@ def lcc_percent(people: list, pair_bonds: list) -> dict: continue queue = [start] visited.add(start) - size = 0 + nd_size = 0 while queue: cur = queue.pop() - size += 1 + if cur not in default: + nd_size += 1 for nb in adj[cur]: if nb not in visited: visited.add(nb) queue.append(nb) - component_sizes.append(size) + component_sizes.append(nd_size) lcc = max(component_sizes) if component_sizes else 0 - total = len(nodes) return { "total": total, "components": len(component_sizes), @@ -177,6 +195,170 @@ def _measure_gt_discussions(discussion_id=None): print(f"\nAverage LCC: {avg_lcc}% ({len(totals)} discussions)") +def _measure_accumulated_discussions(disc_ids: list[int]) -> dict: + """ + Accumulate real-chat discussions in created_at order, carrying diagram_data + forward (each discussion sees the committed output of prior ones), then + measure LCC on the final committed state. + + This mirrors how a live Personal-app diagram grows: discussion N sees all + people/pair_bonds committed from discussions 1..N-1. + + Returns the lcc_percent stats dict for the final accumulated diagram. + """ + nest_asyncio.apply() + + from btcopilot.personal.models import Discussion + + print(f"Accumulating {len(disc_ids)} discussions in order: {disc_ids}\n") + + diagram_data = DiagramData() + + for disc_id in disc_ids: + disc = Discussion.query.get(disc_id) + if disc is None: + print(f" Disc {disc_id}: NOT FOUND — skipping") + continue + print(f" Disc {disc_id} ({disc.summary or ''}, {len(disc.statements)} stmts)...", end=" ", flush=True) + try: + ai_pdp, _ = asyncio.run(pdp_mod.extract_full(disc, diagram_data)) + except Exception as e: + print(f"EXTRACTION FAILED — {e}") + continue + + # Point diagram_data.pdp at the extraction result so commit_pdp_items + # can find the items (extract_full resets diagram_data.pdp to PDP() + # at the start and never writes the final result back). + diagram_data.pdp = ai_pdp + + # Commit all new PDP items (negative IDs) to diagram_data so the next + # discussion sees them as committed (positive-ID) context. + all_pdp_ids = [p.id for p in ai_pdp.people if p.id is not None and p.id < 0] + all_pdp_ids += [e.id for e in ai_pdp.events if e.id < 0] + all_pdp_ids += [pb.id for pb in ai_pdp.pair_bonds if pb.id is not None and pb.id < 0] + + if all_pdp_ids: + try: + id_mapping = diagram_data.commit_pdp_items(all_pdp_ids) + print(f"committed {len(id_mapping)} items") + except Exception as e: + print(f"COMMIT FAILED — {e}") + else: + print("no new PDP items") + + stats = lcc_percent(diagram_data.people, diagram_data.pair_bonds) + print( + f"\nFinal accumulated state: {stats['total']} people, " + f"{stats['components']} components, LCC {stats['lcc_pct']}%" + ) + return stats + + +def _dump_disconnected(disc_ids: list[int]) -> None: + """ + Accumulate discussions and print disconnected people for failure-mode + classification: (a) duplicate, (b) implicit-spouse missing PairBond, + (c) truly isolated. + """ + nest_asyncio.apply() + + from btcopilot.personal.models import Discussion + + diagram_data = DiagramData() + + for disc_id in disc_ids: + disc = Discussion.query.get(disc_id) + if disc is None: + print(f" Disc {disc_id}: NOT FOUND — skipping") + continue + try: + ai_pdp, _ = asyncio.run(pdp_mod.extract_full(disc, diagram_data)) + except Exception as e: + print(f" Disc {disc_id}: EXTRACTION FAILED — {e}") + continue + diagram_data.pdp = ai_pdp + all_pdp_ids = [p.id for p in ai_pdp.people if p.id is not None and p.id < 0] + all_pdp_ids += [e.id for e in ai_pdp.events if e.id < 0] + all_pdp_ids += [pb.id for pb in ai_pdp.pair_bonds if pb.id is not None and pb.id < 0] + if all_pdp_ids: + try: + diagram_data.commit_pdp_items(all_pdp_ids) + except Exception as e: + print(f" Disc {disc_id}: COMMIT FAILED — {e}") + + # Find connected components and list the disconnected people + default_ids = _default_ids(diagram_data.people) + nodes = { + _person_id(p) + for p in diagram_data.people + if _person_id(p) not in default_ids and _person_id(p) is not None + } + + bond_by_id: dict[int, tuple] = {} + for pb in diagram_data.pair_bonds: + a, b = _bond_endpoints(pb) + pb_id = pb.get("id") if isinstance(pb, dict) else pb.id + if pb_id is not None: + bond_by_id[pb_id] = (a, b) + + adj: dict[int, set[int]] = {n: set() for n in nodes} + for pb in diagram_data.pair_bonds: + a, b = _bond_endpoints(pb) + if a in nodes and b in nodes: + adj[a].add(b) + adj[b].add(a) + for p in diagram_data.people: + pid = _person_id(p) + if pid not in nodes: + continue + pb_id = _person_parents(p) + if pb_id is None or pb_id not in bond_by_id: + continue + pa, pb_p = bond_by_id[pb_id] + for parent in (pa, pb_p): + if parent in nodes: + adj[pid].add(parent) + adj[parent].add(pid) + + visited: set[int] = set() + components: list[set[int]] = [] + for start in nodes: + if start in visited: + continue + queue = [start] + visited.add(start) + comp: set[int] = set() + while queue: + cur = queue.pop() + comp.add(cur) + for nb in adj[cur]: + if nb not in visited: + visited.add(nb) + queue.append(nb) + components.append(comp) + + if not components: + print("No non-default people found.") + return + + lcc_size = max(len(c) for c in components) + person_by_id = {_person_id(p): p for p in diagram_data.people if _person_id(p) is not None} + + print(f"\nTotal non-default people: {len(nodes)}") + print(f"Components: {len(components)}, LCC size: {lcc_size}") + print(f"\nDisconnected components (not in LCC):") + for comp in sorted(components, key=len, reverse=True): + if len(comp) == lcc_size: + continue # skip the LCC + print(f"\n Component size {len(comp)}:") + for pid in sorted(comp): + p = person_by_id.get(pid, {}) + name = p.get("name") or p.get("firstName", "") if isinstance(p, dict) else getattr(p, "name", "?") + pb_id = _person_parents(p) + bonds = [pb for pb in diagram_data.pair_bonds if pid in (_bond_endpoints(pb))] + print(f" id={pid} name={name!r} parents_bond={pb_id} pair_bonds={len(bonds)}") + + def _measure_diagram(diagram_id: int): from btcopilot.pro.models.diagram import Diagram @@ -199,12 +381,28 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument("--discussion", type=int) parser.add_argument("--diagram", type=int) + parser.add_argument( + "--accumulate", + type=str, + help="Comma-separated discussion IDs to accumulate in order (e.g. 55,58,60)", + ) + parser.add_argument( + "--dump-disconnected", + type=str, + help="Comma-separated discussion IDs; accumulate then dump disconnected people", + ) args = parser.parse_args() app = create_app() with app.app_context(): if args.diagram: _measure_diagram(args.diagram) + elif args.accumulate: + disc_ids = [int(x.strip()) for x in args.accumulate.split(",")] + _measure_accumulated_discussions(disc_ids) + elif args.dump_disconnected: + disc_ids = [int(x.strip()) for x in args.dump_disconnected.split(",")] + _dump_disconnected(disc_ids) else: _measure_gt_discussions(args.discussion) diff --git a/doc/PROMPT_ENGINEERING_LOG.md b/doc/PROMPT_ENGINEERING_LOG.md index c7ca744..c5642f4 100644 --- a/doc/PROMPT_ENGINEERING_LOG.md +++ b/doc/PROMPT_ENGINEERING_LOG.md @@ -2,7 +2,123 @@ **Purpose**: Authoritative record of prompt engineering decisions, experiments, and lessons learned for the SARF data extraction system. Prevents regressions by documenting what works, what doesn't, and why. -**Last Updated**: 2026-05-20 (FD-324 connectivity) +**Last Updated**: 2026-06-02 (FD-324 real-chat measurement + corrected conclusion) + +--- + +## FD-324 — Real-chat LCC measurement + failure-mode classification (2026-06-01) + +**Scope**: Extends prior FD-324 synthetic work. Adds `--accumulate` mode to +`connectivity_check.py` for reproducible real-chat LCC measurement. Measures +both real-chat user diagrams. Classifies disconnected people into failure modes. + +### Accumulate mode + +`connectivity_check.py --accumulate 55,58,60` extracts each discussion in +order, commits the PDP to DiagramData, and passes the committed state to the +next discussion — mirroring live diagram growth. This is the authoritative LCC +metric for real-chat scenarios (stored-diagram LCC is invalid: it reflects +historical drift, not pipeline output). + +### Cold baseline (fix REVERTED — `infer_parents_from_birth_events` disabled) + +| Source | Baseline LCC% | +|--------|--------------| +| 1924 Patrick (discs 55,58,60) | 23.1% | +| 1589 Guillermo (discs 28,57) | 84.6% | +| Synthetic GT avg (6 discs) | 79.1% | + +### With fix (current FD-324 worktree) + +| Source | Baseline LCC% | Fixed LCC% | Δ | +|--------|--------------|------------|---| +| 1924 Patrick (discs 55,58,60) | 23.1% | 30.0% | +6.9pp | +| 1589 Guillermo (discs 28,57) | 84.6% | 88.5% | +3.9pp | +| Synthetic GT avg (6 discs) | 79.1% | 86.2% | +7.1pp | + +Synthetic ≥80% target: **MET** (86.2%). Guillermo ≥80% target: **MET** (88.5%). +Patrick ≥80% target: **NOT MET** (30.0%) — see failure mode analysis below. + +### F1 no-regression check (with fix, production prompts, 6 GT synthetic, 2 runs) + +| Metric | Run 1 | Run 2 | vs. prior baseline (0.651) | +|--------|-------|-------|---------------------------| +| Aggregate F1 | 0.654 | 0.633 | within noise | +| People F1 | 0.940 | 0.935 | within noise | +| Events F1 | 0.437 | 0.401 | within noise | +| PairBonds F1 | 0.790 | 0.772 | within noise | +| ParentChild F1 | 0.812 | 0.816 | retained | + +No F1 regression. Run-to-run variance ±0.021 on aggregate (within known ±0.05–0.10 noise). + +### Failure-mode classification: Patrick diagram (1924) + +After accumulation (20 people, 11 components, LCC=6, LCC%=30%): + +Committed people in the diagram span two family groups: +- **Stinson family**: Connie, Alyssa, Sam, Julie, Robert — last_name=Stinson, extraction also produced pair bonds Sam-Alyssa and Robert-Julie. These look like sibling-couples. +- **O'Malley family**: Jim O'Malley, Elizabeth O'Malley — connected via bond #6. +- **Cross-link**: Elizabeth-Robert bond (#7) connects O'Malley and Stinson clusters. Client is a child of Elizabeth+Robert. +- **LCC (6 people)**: Elizabeth, Jim, Robert, Julie, Client, Connie — connected via bonds #5, #6, #7, #26. +- **Disconnected**: Sam-Alyssa couple (2 people), Meredith-Vance couple (2 people), Monique/Joseph/Julia/Anthony singletons (4 people). + +**Mode (a) duplicates**: Possible — Robert appears in two pair bonds (Elizabeth-Robert #7 and Robert-Julie #5). This could indicate the conversation discussed Robert in two different relationship contexts; not a duplicate person but possibly an erroneous second bond. Frequency too low to address with a targeted prompt change. + +**Mode (b) implicit-spouse / implicit-sibling**: Sam, Alyssa, Julie, Robert, Connie all share last_name=Stinson, strongly suggesting a sibling group. Connecting them to a shared parent pair would link the Sam-Alyssa isolated couple into the main tree. However, fixing this requires inferring parent bonds from shared last names — which is name-matching, explicitly rejected per ticket rules. Out of scope. + +**Mode (c) truly isolated**: ONLY Monique (ex-girlfriend, no other relative) is genuinely +isolated. Joseph/Julia/Anthony are NOT — disc 60 explicitly states "Jim and Sheila are +the parents of Anthony, Joseph, Julia" and the user demanded the link be set; Vance/Meredith +are Connie's sister + her husband (stated); Sam is the user's half-brother (stated). These +are extraction failures, not missing source structure. + +**Conclusion (CORRECTED 2026-06-02 — supersedes the original below)**: Patrick's low LCC is +NOT content-bounded. The relationships ARE in the transcript; fresh extraction recovers only +~22 of 32 people and sets ~0-3 parents. Real causes are architectural: (1) single-shot +re-extraction of a 200+ statement conversation under-extracts and drops parent links; +(2) facts arrive across sessions (the children's mother is named only in a later session +than the children), and the pipeline never back-fills parents on already-committed people. +The lever is the cursor/windowing re-extraction architecture (FD-319, child_of 0.63→0.73), +NOT prompt wording: four prompt-directive variants (incl. proband-linking and committed- +back-fill) left Patrick within noise (25-29%). Guillermo, described within single +discussions, reaches ~95% with the prompt fixes. + +> ~~Original (incorrect) conclusion: "Patrick's real-chat LCC is bounded by source text +> content... not a fixable extraction failure." Disproved — relationships are explicitly +> stated; the gap is architectural under-extraction/back-fill, not content.~~ + +### Failure-mode classification: Guillermo diagram (1589) + +After accumulation (26 people, 4 components, LCC=25, LCC%=96.2%): +Wait — `--accumulate 28,57` with fix measured 88.5% in the repeated run above. + +Disconnected: Irene, Sharon, Alvie — 3 singletons. +All are mode **(c) truly isolated**: mentioned by name in Guillermo's conversation but with no stated relationship to his family. No prompt change applicable. + +Guillermo already meets ≥80% (88.5%). No action needed. + +### AC2 status: LCC ≥80% excluding User/Assistant + +| Source | LCC% | AC2 met? | +|--------|------|---------| +| 1589 Guillermo (real-chat) | 88.5% | ✓ | +| Synthetic avg (6 GT discs) | 86.2% | ✓ | +| 1924 Patrick (real-chat) | ~25-30% | ✗ — architecturally blocked (NOT content-bounded) | + +AC2 partially met. Patrick does not reach 80%, but the relationships ARE stated in the +transcript — the gap is architectural (single-shot under-extraction + no cross-session +parent back-fill), addressable via the FD-319 cursor/windowing re-extraction, not prompt +wording. Numbers here are the keep-User metric on single-shot re-extraction of a truncated +discussion slice, which understates the live incrementally-built diagram (32 stored people +vs ~22 fresh). + +### AC4 disposition + +| Failure mode | Status | +|---|---| +| (a) Duplicates | Accepted: rare in this data, no systematic pattern warranting a prompt change | +| (b) Implicit spouse/parent missing PairBond | Addressed by Pass-1 prompt fixes (fdserver #23): emit both bond partners + delete the ID-ordering contradiction | +| (c) Truly isolated mentions | Only genuine case is Monique (ex-girlfriend); the rest are stated-but-unextracted (architectural, see corrected conclusion) | ---