Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions h/models/document/_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ def merge_documents(session, documents, updated=None):
from h.services.annotation_write import AnnotationWriteService # noqa: PLC0415

AnnotationWriteService.change_document(session, duplicate_ids, master)
_merge_checkpoints(session, duplicate_ids, master)
session.query(Document).filter(Document.id.in_(duplicate_ids)).delete(
synchronize_session="fetch"
)
Expand All @@ -170,6 +171,57 @@ def merge_documents(session, documents, updated=None):
return master


def _merge_checkpoints(session, duplicate_ids, master):
    """
    Re-point Hide & Reveal checkpoints from the duplicate documents to master.

    This mirrors how annotations are re-pointed
    by AnnotationWriteService.change_document.

    Every checkpoint attached to master or to one of the duplicates is
    grouped by ``(group_id, previous_checkpoint_id)``. Checkpoints sharing a
    key would collide once they all point at master, so each such group is
    collapsed into a single checkpoint that keeps the most restrictive
    reveal_date (an annotation stays hidden while any of the merged
    checkpoints would hide it), so a merge can never reveal annotations that
    should remain hidden.

    :param session: the DB session used to query, delete and flush
    :param duplicate_ids: IDs of the documents being merged into ``master``
    :param master: the document that all checkpoints end up pointing at
    """
    from h.models import Checkpoint  # noqa: PLC0415

    checkpoints = (
        session.query(Checkpoint)
        .filter(Checkpoint.document_id.in_([master.id, *duplicate_ids]))
        .all()
    )

    by_key: dict[tuple, list] = {}
    for checkpoint in checkpoints:
        key = (checkpoint.group_id, checkpoint.previous_checkpoint_id)
        by_key.setdefault(key, []).append(checkpoint)

    for colliding in by_key.values():
        # Prefer a checkpoint already on master as the survivor, so we don't
        # momentarily violate the unique constraint by re-pointing onto it.
        # (False sorts before True and list.sort is stable.)
        colliding.sort(key=lambda checkpoint: checkpoint.document_id != master.id)
        survivor, *losers = colliding

        # Computed over the whole group, survivor included, before any of
        # the losers are deleted.
        reveal_date = _most_restrictive_reveal_date(colliding)

        if losers:
            # NOTE(review): other checkpoints may reference a loser through
            # previous_checkpoint_id; confirm the foreign key / cascade
            # behaviour keeps such chains valid once the loser is deleted.
            for loser in losers:
                session.delete(loser)
            # Send the deletes to the DB before the survivor is re-pointed.
            # Groups without losers have nothing to delete, so they skip
            # this round-trip and rely on the final flush below.
            session.flush()

        survivor.document_id = master.id
        survivor.reveal_date = reveal_date

    # Always flush so the re-pointed checkpoints are written before the
    # caller continues with the merge.
    session.flush()


def _most_restrictive_reveal_date(checkpoints):
    """
    Pick the reveal_date that hides annotations for the longest time.

    A reveal_date of ``None`` wins over any concrete date; otherwise the
    latest date among the given checkpoints is returned.
    """
    dates = [item.reveal_date for item in checkpoints]

    for date in dates:
        if date is None:
            return None

    return max(dates)


def update_document_metadata( # noqa: PLR0913
session,
target_uri,
Expand Down
76 changes: 76 additions & 0 deletions tests/unit/h/models/document/_document_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,82 @@ def test_it_moves_annotations_to_the_first(self, db_session, duplicate_docs):

assert count == expected_count

def test_it_moves_checkpoints_to_the_first(
    self, db_session, duplicate_docs, factories
):
    """A checkpoint on the second document is re-pointed to the first."""
    first, second = duplicate_docs[0], duplicate_docs[1]
    moved = factories.Checkpoint(document=second)

    merge_documents(db_session, duplicate_docs)
    db_session.flush()

    assert moved.document_id == first.id

def test_it_keeps_checkpoints_in_different_groups(
    self, db_session, duplicate_docs, factories
):
    """Checkpoints of different groups both end up on the first document."""
    # Create the groups explicitly so the "different groups" premise does
    # not depend on whatever group the Checkpoint factory picks by default.
    checkpoint_1 = factories.Checkpoint(
        group=factories.Group(), document=duplicate_docs[0]
    )
    checkpoint_2 = factories.Checkpoint(
        group=factories.Group(), document=duplicate_docs[1]
    )

    merge_documents(db_session, duplicate_docs)
    db_session.flush()

    # Different groups don't collide, so both survive on the master.
    assert checkpoint_1.document_id == duplicate_docs[0].id
    assert checkpoint_2.document_id == duplicate_docs[0].id

def test_it_collapses_colliding_checkpoints_to_the_most_restrictive(
    self, db_session, duplicate_docs, factories
):
    """Two same-group checkpoints merge into one with reveal_date None."""
    shared_group = factories.Group()
    first, second = duplicate_docs[0], duplicate_docs[1]
    already_revealed = _datetime(2000, 1, 1)  # noqa: DTZ001

    # Same group on two merging documents => the checkpoints collide.
    factories.Checkpoint(
        group=shared_group,
        document=first,
        reveal_date=already_revealed,
    )
    # reveal_date=None means never revealed, i.e. the most restrictive.
    factories.Checkpoint(
        group=shared_group,
        document=second,
        reveal_date=None,
    )

    merge_documents(db_session, duplicate_docs)
    db_session.flush()

    remaining = (
        db_session.query(models.Checkpoint)
        .filter_by(group_id=shared_group.id, document_id=first.id)
        .all()
    )
    assert len(remaining) == 1
    assert remaining[0].reveal_date is None

def test_it_collapses_colliding_checkpoints_to_the_latest_reveal_date(
    self, db_session, duplicate_docs, factories
):
    """Two same-group checkpoints merge into one with the later date."""
    shared_group = factories.Group()
    first, second = duplicate_docs[0], duplicate_docs[1]
    earlier = _datetime(2000, 1, 1)  # noqa: DTZ001
    later = _datetime(2030, 1, 1)  # noqa: DTZ001  # hides for longest

    factories.Checkpoint(group=shared_group, document=first, reveal_date=earlier)
    factories.Checkpoint(group=shared_group, document=second, reveal_date=later)

    merge_documents(db_session, duplicate_docs)
    db_session.flush()

    remaining = (
        db_session.query(models.Checkpoint)
        .filter_by(group_id=shared_group.id, document_id=first.id)
        .all()
    )
    assert len(remaining) == 1
    assert remaining[0].reveal_date == later

def test_it_raises_retryable_error_when_flush_fails(
self, db_session, duplicate_docs, monkeypatch
):
Expand Down
Loading