diff --git a/website/admin/keyword_admin.py b/website/admin/keyword_admin.py index d2a3365f..c4d3f7bb 100644 --- a/website/admin/keyword_admin.py +++ b/website/admin/keyword_admin.py @@ -39,29 +39,59 @@ def queryset(self, request, queryset): return queryset +def _parent_fk_field_name(through_model): + """Name of the FK on a keywords-through model that points to the *owning* + object (Publication/Talk/…) — i.e. the relation that isn't the Keyword side. + """ + for field in through_model._meta.get_fields(): + if field.many_to_one and field.related_model is not Keyword: + return field.name + raise RuntimeError(f"No owning FK found on {through_model.__name__}") + + @transaction.atomic def merge_keywords_into_target(target, sources): """Reassign every reference from ``sources`` onto ``target``, then delete the sources. Returns the number of source keywords removed. - For each source keyword and each model that references keywords, every - tagged object gains the target via ``obj.keywords.add(target)`` — idempotent, - so an object already tagged with the target ends up with a single row, not a - duplicate. Deleting the source then drops its own M2M rows, so no explicit - ``remove()`` is needed. ``target`` is skipped if present in ``sources``. + Works at the M2M *through-table* level: each existing ``(object, source)`` + row is repointed to ``(object, target)`` rather than calling + ``obj.keywords.add(target)``. This is deliberate — the deployed + ``website_*_keywords`` tables carry a legacy NOT-NULL ``sort_value`` column + (the field used to be a SortedManyToManyField), so *inserting* a fresh row + via .add() violates that constraint, whereas *updating* an existing row + preserves its sort_value. When the object is already tagged with the target, + the source row is deleted instead, so no duplicate ``(object, keyword)`` pair + is created (this also covers an object tagged with two different sources at + once). ``target`` is ignored if present in ``sources``. Runs in a transaction: a failure mid-merge rolls back rather than leaving the taxonomy half-merged. """ - removed = 0 + sources = [s for s in sources if s.pk != target.pk] + + for model in KEYWORD_USERS: + through = model.keywords.through + owner_id_field = f"{_parent_fk_field_name(through)}_id" + # Snapshot the source rows first; we mutate/delete as we go. + for row in list(through.objects.filter(keyword__in=sources)): + owner_id = getattr(row, owner_id_field) + target_already_present = ( + through.objects + .filter(keyword=target, **{owner_id_field: owner_id}) + .exclude(pk=row.pk) + .exists() + ) + if target_already_present: + row.delete() + else: + row.keyword = target + row.save(update_fields=['keyword']) + + removed = len(sources) for source in sources: - if source.pk == target.pk: - continue - for accessor in KEYWORD_REVERSE_ACCESSORS: - for obj in getattr(source, accessor).all(): - obj.keywords.add(target) + # Any stray through rows are CASCADE-dropped with the keyword. source.delete() - removed += 1 return removed diff --git a/website/tests/test_keyword_merge.py b/website/tests/test_keyword_merge.py index dac7026e..9bcea8b1 100644 --- a/website/tests/test_keyword_merge.py +++ b/website/tests/test_keyword_merge.py @@ -16,6 +16,7 @@ from django.contrib.auth import get_user_model from django.contrib.messages.storage.fallback import FallbackStorage +from django.db import connection from django.test import RequestFactory from django.urls import reverse @@ -76,6 +77,54 @@ def test_object_already_tagged_with_target_has_no_duplicate(self): [target.pk]) self.assertFalse(Keyword.objects.filter(pk=source.pk).exists()) + def test_object_tagged_with_two_sources_at_once(self): + target = Keyword.objects.create(keyword="Design") + source_a = Keyword.objects.create(keyword="design") + source_b = Keyword.objects.create(keyword="DESIGN") + pub = self.make_publication(title="Double tagged") + pub.keywords.add(source_a, source_b) # tagged with BOTH sources, not target + + merge_keywords_into_target(target, [source_a, source_b]) + + # Both sources collapse to a single target row, not a duplicate pair. + self.assertEqual(list(pub.keywords.values_list('pk', flat=True)), + [target.pk]) + self.assertEqual(Keyword.objects.filter( + pk__in=[source_a.pk, source_b.pk]).count(), 0) + + def test_merge_survives_legacy_sort_value_column(self): + """Reproduces the -test/prod failure: the deployed keywords through table + carries a leftover NOT-NULL ``sort_value`` column (the field used to be a + SortedManyToManyField). Inserting a new through row via .add() violates + the constraint; the merge must repoint existing rows instead. + + settings_test builds the schema from the current (plain M2M) models, so + the column isn't there by default — we add it here to mimic the deployed + drift, then assert the merge still succeeds. + """ + target = Keyword.objects.create(keyword="Design Process") + source = Keyword.objects.create(keyword="design process") + pub = self.make_publication(title="Legacy schema paper") + pub.keywords.add(source) # inserted before the column exists — fine + + table = Publication.keywords.through._meta.db_table + with connection.cursor() as cursor: + # The .add() above left deferred FK trigger events pending, which + # block ALTER TABLE; force them to resolve now so we can add the + # column inside this test transaction. + cursor.execute('SET CONSTRAINTS ALL IMMEDIATE') + cursor.execute(f'ALTER TABLE {table} ADD COLUMN sort_value integer') + cursor.execute(f'UPDATE {table} SET sort_value = 0') + cursor.execute( + f'ALTER TABLE {table} ALTER COLUMN sort_value SET NOT NULL') + + # Must not raise IntegrityError on the NOT-NULL sort_value column. + merge_keywords_into_target(target, [source]) + + self.assertEqual(list(pub.keywords.values_list('pk', flat=True)), + [target.pk]) + self.assertFalse(Keyword.objects.filter(pk=source.pk).exists()) + def test_target_in_sources_is_skipped(self): target = Keyword.objects.create(keyword="Robotics") source = Keyword.objects.create(keyword="robotics")