From 3375f124b19813818a94a77ba1c5e1eb17040d63 Mon Sep 17 00:00:00 2001 From: Elim Pizza Date: Fri, 5 Jun 2026 14:18:26 -0300 Subject: [PATCH 1/3] Re-point checkpoints when merging documents --- h/models/document/_document.py | 52 +++++++++++++++++++ .../unit/h/models/document/_document_test.py | 50 ++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/h/models/document/_document.py b/h/models/document/_document.py index 75302954c72..955f3aa682f 100644 --- a/h/models/document/_document.py +++ b/h/models/document/_document.py @@ -160,6 +160,7 @@ def merge_documents(session, documents, updated=None): from h.services.annotation_write import AnnotationWriteService # noqa: PLC0415 AnnotationWriteService.change_document(session, duplicate_ids, master) + _merge_checkpoints(session, duplicate_ids, master) session.query(Document).filter(Document.id.in_(duplicate_ids)).delete( synchronize_session="fetch" ) @@ -170,6 +171,57 @@ def merge_documents(session, documents, updated=None): return master +def _merge_checkpoints(session, duplicate_ids, master): + """ + Re-point Hide & Reveal checkpoints from the duplicate documents to master. + + This mirrors how annotations are re-pointed + by AnnotationWriteService.change_document. + + They are collapsed into a single checkpoint that + keeps the most restrictive reveal_date (an annotation stays hidden while + any of the merged checkpoints would hide it), so a merge can never reveal + annotations that should remain hidden. + """ + from h.models import Checkpoint # noqa: PLC0415 + + checkpoints = ( + session.query(Checkpoint) + .filter(Checkpoint.document_id.in_([master.id, *duplicate_ids])) + .all() + ) + + by_key: dict = {} + for checkpoint in checkpoints: + key = (checkpoint.group_id, checkpoint.previous_checkpoint_id) + by_key.setdefault(key, []).append(checkpoint) + + for colliding in by_key.values(): + # Prefer a checkpoint already on master as the survivor, so we don't + # momentarily violate the unique constraint by re-pointing onto it. + colliding.sort(key=lambda checkpoint: checkpoint.document_id != master.id) + survivor, *losers = colliding + + reveal_date = _most_restrictive_reveal_date(colliding) + + for loser in losers: + session.delete(loser) + session.flush() + + survivor.document_id = master.id + survivor.reveal_date = reveal_date + + session.flush() + + +def _most_restrictive_reveal_date(checkpoints): + """Returns the reveal_date that keeps annotations hidden the longest.""" + reveal_dates = [checkpoint.reveal_date for checkpoint in checkpoints] + if any(reveal_date is None for reveal_date in reveal_dates): + return None + return max(reveal_dates) + + def update_document_metadata( # noqa: PLR0913 session, target_uri, diff --git a/tests/unit/h/models/document/_document_test.py b/tests/unit/h/models/document/_document_test.py index 4a56f787330..547b8e5adcb 100644 --- a/tests/unit/h/models/document/_document_test.py +++ b/tests/unit/h/models/document/_document_test.py @@ -226,6 +226,56 @@ def test_it_moves_annotations_to_the_first(self, db_session, duplicate_docs): assert count == expected_count + def test_it_moves_checkpoints_to_the_first( + self, db_session, duplicate_docs, factories + ): + checkpoint = factories.Checkpoint(document=duplicate_docs[1]) + + merge_documents(db_session, duplicate_docs) + db_session.flush() + + assert checkpoint.document_id == duplicate_docs[0].id + + def test_it_keeps_checkpoints_in_different_groups( + self, db_session, duplicate_docs, factories + ): + checkpoint_1 = factories.Checkpoint(document=duplicate_docs[0]) + checkpoint_2 = factories.Checkpoint(document=duplicate_docs[1]) + + merge_documents(db_session, duplicate_docs) + db_session.flush() + + # Different groups don't collide, so both survive on the master. + assert checkpoint_1.document_id == duplicate_docs[0].id + assert checkpoint_2.document_id == duplicate_docs[0].id + + def test_it_collapses_colliding_checkpoints_to_the_most_restrictive( + self, db_session, duplicate_docs, factories + ): + group = factories.Group() + # Same group on two merging documents => the checkpoints collide. + factories.Checkpoint( + group=group, + document=duplicate_docs[0], + reveal_date=_datetime(2000, 1, 1), # noqa: DTZ001 # already revealed + ) + factories.Checkpoint( + group=group, + document=duplicate_docs[1], + reveal_date=None, # never revealed = most restrictive + ) + + merge_documents(db_session, duplicate_docs) + db_session.flush() + + survivors = ( + db_session.query(models.Checkpoint) + .filter_by(group_id=group.id, document_id=duplicate_docs[0].id) + .all() + ) + assert len(survivors) == 1 + assert survivors[0].reveal_date is None + def test_it_raises_retryable_error_when_flush_fails( self, db_session, duplicate_docs, monkeypatch ): From 1f1e45213f674b32c57ffe054b32dbf1ad846854 Mon Sep 17 00:00:00 2001 From: Elim Pizza Date: Fri, 5 Jun 2026 14:22:22 -0300 Subject: [PATCH 2/3] Fix lint --- h/models/document/_document.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/h/models/document/_document.py b/h/models/document/_document.py index 955f3aa682f..5694754a813 100644 --- a/h/models/document/_document.py +++ b/h/models/document/_document.py @@ -215,7 +215,7 @@ def _merge_checkpoints(session, duplicate_ids, master): def _most_restrictive_reveal_date(checkpoints): - """Returns the reveal_date that keeps annotations hidden the longest.""" + """Return the reveal_date that keeps annotations hidden the longest.""" reveal_dates = [checkpoint.reveal_date for checkpoint in checkpoints] if any(reveal_date is None for reveal_date in reveal_dates): return None From 18362fbab41d95bfa9b54c38700e9120972ae00b Mon Sep 17 00:00:00 2001 From: Elim Pizza Date: Fri, 5 Jun 2026 14:59:21 -0300 Subject: [PATCH 3/3] Test coverage --- .../unit/h/models/document/_document_test.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/unit/h/models/document/_document_test.py b/tests/unit/h/models/document/_document_test.py index 547b8e5adcb..9e110e6e9bb 100644 --- a/tests/unit/h/models/document/_document_test.py +++ b/tests/unit/h/models/document/_document_test.py @@ -276,6 +276,32 @@ def test_it_collapses_colliding_checkpoints_to_the_most_restrictive( assert len(survivors) == 1 assert survivors[0].reveal_date is None + def test_it_collapses_colliding_checkpoints_to_the_latest_reveal_date( + self, db_session, duplicate_docs, factories + ): + group = factories.Group() + factories.Checkpoint( + group=group, + document=duplicate_docs[0], + reveal_date=_datetime(2000, 1, 1), # noqa: DTZ001 + ) + factories.Checkpoint( + group=group, + document=duplicate_docs[1], + reveal_date=_datetime(2030, 1, 1), # noqa: DTZ001 # hides for longest + ) + + merge_documents(db_session, duplicate_docs) + db_session.flush() + + survivors = ( + db_session.query(models.Checkpoint) + .filter_by(group_id=group.id, document_id=duplicate_docs[0].id) + .all() + ) + assert len(survivors) == 1 + assert survivors[0].reveal_date == _datetime(2030, 1, 1) # noqa: DTZ001 + def test_it_raises_retryable_error_when_flush_fails( self, db_session, duplicate_docs, monkeypatch ):