Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 42 additions & 12 deletions website/admin/keyword_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,29 +39,59 @@ def queryset(self, request, queryset):
return queryset


def _parent_fk_field_name(through_model):
"""Name of the FK on a keywords-through model that points to the *owning*
object (Publication/Talk/…) — i.e. the relation that isn't the Keyword side.
"""
for field in through_model._meta.get_fields():
if field.many_to_one and field.related_model is not Keyword:
return field.name
raise RuntimeError(f"No owning FK found on {through_model.__name__}")


@transaction.atomic
def merge_keywords_into_target(target, sources):
"""Reassign every reference from ``sources`` onto ``target``, then delete the
sources. Returns the number of source keywords removed.

For each source keyword and each model that references keywords, every
tagged object gains the target via ``obj.keywords.add(target)`` — idempotent,
so an object already tagged with the target ends up with a single row, not a
duplicate. Deleting the source then drops its own M2M rows, so no explicit
``remove()`` is needed. ``target`` is skipped if present in ``sources``.
Works at the M2M *through-table* level: each existing ``(object, source)``
row is repointed to ``(object, target)`` rather than calling
``obj.keywords.add(target)``. This is deliberate — the deployed
``website_*_keywords`` tables carry a legacy NOT-NULL ``sort_value`` column
(the field used to be a SortedManyToManyField), so *inserting* a fresh row
via .add() violates that constraint, whereas *updating* an existing row
preserves its sort_value. When the object is already tagged with the target,
the source row is deleted instead, so no duplicate ``(object, keyword)`` pair
is created (this also covers an object tagged with two different sources at
once). ``target`` is ignored if present in ``sources``.

Runs in a transaction: a failure mid-merge rolls back rather than leaving the
taxonomy half-merged.
"""
removed = 0
sources = [s for s in sources if s.pk != target.pk]

for model in KEYWORD_USERS:
through = model.keywords.through
owner_id_field = f"{_parent_fk_field_name(through)}_id"
# Snapshot the source rows first; we mutate/delete as we go.
for row in list(through.objects.filter(keyword__in=sources)):
owner_id = getattr(row, owner_id_field)
target_already_present = (
through.objects
.filter(keyword=target, **{owner_id_field: owner_id})
.exclude(pk=row.pk)
.exists()
)
if target_already_present:
row.delete()
else:
row.keyword = target
row.save(update_fields=['keyword'])

removed = len(sources)
for source in sources:
if source.pk == target.pk:
continue
for accessor in KEYWORD_REVERSE_ACCESSORS:
for obj in getattr(source, accessor).all():
obj.keywords.add(target)
# Any stray through rows are CASCADE-dropped with the keyword.
source.delete()
removed += 1
return removed


Expand Down
49 changes: 49 additions & 0 deletions website/tests/test_keyword_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from django.contrib.auth import get_user_model
from django.contrib.messages.storage.fallback import FallbackStorage
from django.db import connection
from django.test import RequestFactory
from django.urls import reverse

Expand Down Expand Up @@ -76,6 +77,54 @@ def test_object_already_tagged_with_target_has_no_duplicate(self):
[target.pk])
self.assertFalse(Keyword.objects.filter(pk=source.pk).exists())

def test_object_tagged_with_two_sources_at_once(self):
target = Keyword.objects.create(keyword="Design")
source_a = Keyword.objects.create(keyword="design")
source_b = Keyword.objects.create(keyword="DESIGN")
pub = self.make_publication(title="Double tagged")
pub.keywords.add(source_a, source_b) # tagged with BOTH sources, not target

merge_keywords_into_target(target, [source_a, source_b])

# Both sources collapse to a single target row, not a duplicate pair.
self.assertEqual(list(pub.keywords.values_list('pk', flat=True)),
[target.pk])
self.assertEqual(Keyword.objects.filter(
pk__in=[source_a.pk, source_b.pk]).count(), 0)

def test_merge_survives_legacy_sort_value_column(self):
"""Reproduces the -test/prod failure: the deployed keywords through table
carries a leftover NOT-NULL ``sort_value`` column (the field used to be a
SortedManyToManyField). Inserting a new through row via .add() violates
the constraint; the merge must repoint existing rows instead.

settings_test builds the schema from the current (plain M2M) models, so
the column isn't there by default — we add it here to mimic the deployed
drift, then assert the merge still succeeds.
"""
target = Keyword.objects.create(keyword="Design Process")
source = Keyword.objects.create(keyword="design process")
pub = self.make_publication(title="Legacy schema paper")
pub.keywords.add(source) # inserted before the column exists — fine

table = Publication.keywords.through._meta.db_table
with connection.cursor() as cursor:
# The .add() above left deferred FK trigger events pending, which
# block ALTER TABLE; force them to resolve now so we can add the
# column inside this test transaction.
cursor.execute('SET CONSTRAINTS ALL IMMEDIATE')
cursor.execute(f'ALTER TABLE {table} ADD COLUMN sort_value integer')
cursor.execute(f'UPDATE {table} SET sort_value = 0')
cursor.execute(
f'ALTER TABLE {table} ALTER COLUMN sort_value SET NOT NULL')

# Must not raise IntegrityError on the NOT-NULL sort_value column.
merge_keywords_into_target(target, [source])

self.assertEqual(list(pub.keywords.values_list('pk', flat=True)),
[target.pk])
self.assertFalse(Keyword.objects.filter(pk=source.pk).exists())

def test_target_in_sources_is_skipped(self):
target = Keyword.objects.create(keyword="Robotics")
source = Keyword.objects.create(keyword="robotics")
Expand Down
Loading