makeabilitylab · jonfroehlich · Jun 21, 2026 · Jun 21, 2026
diff --git a/website/admin/keyword_admin.py b/website/admin/keyword_admin.py
@@ -39,29 +39,59 @@ def queryset(self, request, queryset):
         return queryset
 
 
+def _parent_fk_field_name(through_model):
+    """Name of the FK on a keywords-through model that points to the *owning*
+    object (Publication/Talk/…) — i.e. the relation that isn't the Keyword side.
+    """
+    for field in through_model._meta.get_fields():
+        if field.many_to_one and field.related_model is not Keyword:
+            return field.name
+    raise RuntimeError(f"No owning FK found on {through_model.__name__}")
+
+
 @transaction.atomic
 def merge_keywords_into_target(target, sources):
     """Reassign every reference from ``sources`` onto ``target``, then delete the
     sources. Returns the number of source keywords removed.
 
-    For each source keyword and each model that references keywords, every
-    tagged object gains the target via ``obj.keywords.add(target)`` — idempotent,
-    so an object already tagged with the target ends up with a single row, not a
-    duplicate. Deleting the source then drops its own M2M rows, so no explicit
-    ``remove()`` is needed. ``target`` is skipped if present in ``sources``.
+    Works at the M2M *through-table* level: each existing ``(object, source)``
+    row is repointed to ``(object, target)`` rather than calling
+    ``obj.keywords.add(target)``. This is deliberate — the deployed
+    ``website_*_keywords`` tables carry a legacy NOT-NULL ``sort_value`` column
+    (the field used to be a SortedManyToManyField), so *inserting* a fresh row
+    via .add() violates that constraint, whereas *updating* an existing row
+    preserves its sort_value. When the object is already tagged with the target,
+    the source row is deleted instead, so no duplicate ``(object, keyword)`` pair
+    is created (this also covers an object tagged with two different sources at
+    once). ``target`` is ignored if present in ``sources``.
 
     Runs in a transaction: a failure mid-merge rolls back rather than leaving the
     taxonomy half-merged.
     """
-    removed = 0
+    sources = [s for s in sources if s.pk != target.pk]
+
+    for model in KEYWORD_USERS:
+        through = model.keywords.through
+        owner_id_field = f"{_parent_fk_field_name(through)}_id"
+        # Snapshot the source rows first; we mutate/delete as we go.
+        for row in list(through.objects.filter(keyword__in=sources)):
+            owner_id = getattr(row, owner_id_field)
+            target_already_present = (
+                through.objects
+                .filter(keyword=target, **{owner_id_field: owner_id})
+                .exclude(pk=row.pk)
+                .exists()
+            )
+            if target_already_present:
+                row.delete()
+            else:
+                row.keyword = target
+                row.save(update_fields=['keyword'])
+
+    removed = len(sources)
     for source in sources:
-        if source.pk == target.pk:
-            continue
-        for accessor in KEYWORD_REVERSE_ACCESSORS:
-            for obj in getattr(source, accessor).all():
-                obj.keywords.add(target)
+        # Any stray through rows are CASCADE-dropped with the keyword.
         source.delete()
-        removed += 1
     return removed
 
 

diff --git a/website/tests/test_keyword_merge.py b/website/tests/test_keyword_merge.py
@@ -16,6 +16,7 @@
 
 from django.contrib.auth import get_user_model
 from django.contrib.messages.storage.fallback import FallbackStorage
+from django.db import connection
 from django.test import RequestFactory
 from django.urls import reverse
 
@@ -76,6 +77,54 @@ def test_object_already_tagged_with_target_has_no_duplicate(self):
                          [target.pk])
         self.assertFalse(Keyword.objects.filter(pk=source.pk).exists())
 
+    def test_object_tagged_with_two_sources_at_once(self):
+        target = Keyword.objects.create(keyword="Design")
+        source_a = Keyword.objects.create(keyword="design")
+        source_b = Keyword.objects.create(keyword="DESIGN")
+        pub = self.make_publication(title="Double tagged")
+        pub.keywords.add(source_a, source_b)  # tagged with BOTH sources, not target
+
+        merge_keywords_into_target(target, [source_a, source_b])
+
+        # Both sources collapse to a single target row, not a duplicate pair.
+        self.assertEqual(list(pub.keywords.values_list('pk', flat=True)),
+                         [target.pk])
+        self.assertEqual(Keyword.objects.filter(
+            pk__in=[source_a.pk, source_b.pk]).count(), 0)
+
+    def test_merge_survives_legacy_sort_value_column(self):
+        """Reproduces the -test/prod failure: the deployed keywords through table
+        carries a leftover NOT-NULL ``sort_value`` column (the field used to be a
+        SortedManyToManyField). Inserting a new through row via .add() violates
+        the constraint; the merge must repoint existing rows instead.
+
+        settings_test builds the schema from the current (plain M2M) models, so
+        the column isn't there by default — we add it here to mimic the deployed
+        drift, then assert the merge still succeeds.
+        """
+        target = Keyword.objects.create(keyword="Design Process")
+        source = Keyword.objects.create(keyword="design process")
+        pub = self.make_publication(title="Legacy schema paper")
+        pub.keywords.add(source)  # inserted before the column exists — fine
+
+        table = Publication.keywords.through._meta.db_table
+        with connection.cursor() as cursor:
+            # The .add() above left deferred FK trigger events pending, which
+            # block ALTER TABLE; force them to resolve now so we can add the
+            # column inside this test transaction.
+            cursor.execute('SET CONSTRAINTS ALL IMMEDIATE')
+            cursor.execute(f'ALTER TABLE {table} ADD COLUMN sort_value integer')
+            cursor.execute(f'UPDATE {table} SET sort_value = 0')
+            cursor.execute(
+                f'ALTER TABLE {table} ALTER COLUMN sort_value SET NOT NULL')
+
+        # Must not raise IntegrityError on the NOT-NULL sort_value column.
+        merge_keywords_into_target(target, [source])
+
+        self.assertEqual(list(pub.keywords.values_list('pk', flat=True)),
+                         [target.pk])
+        self.assertFalse(Keyword.objects.filter(pk=source.pk).exists())
+
     def test_target_in_sources_is_skipped(self):
         target = Keyword.objects.create(keyword="Robotics")
         source = Keyword.objects.create(keyword="robotics")