From 13cc7c7cbe505b62b02aa751a6e9d68be7a6f5fa Mon Sep 17 00:00:00 2001
From: Jon Froehlich <jonf@cs.uw.edu>
Date: Sat, 20 Jun 2026 17:16:14 -0700
Subject: [PATCH] fix(keyword): merge by repointing through-rows, not .add()
 (#1352)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The keyword-merge action 500'd on -test/prod with "null value in column
sort_value violates not-null constraint". The deployed website_*_keywords
through tables carry a leftover NOT-NULL sort_value column (keywords used to be
a SortedManyToManyField), so obj.keywords.add(target) — which INSERTs a fresh
through row — fails the constraint. settings_test builds the schema from the
current plain-M2M models, so the column isn't present locally and the original
tests couldn't catch it.

Fix: merge at the through-table level by repointing each existing
(object, source) row to (object, target) — an UPDATE that preserves the row's
sort_value — instead of inserting a new row. If the object already has the
target (including when it's tagged with two sources at once), the source row is
deleted so no duplicate (object, keyword) pair is created.

Adds a regression test that reproduces the deployed drift by adding a NOT-NULL
sort_value column to the through table before merging, plus a two-sources-on-
one-object dedup test.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 website/admin/keyword_admin.py      | 54 ++++++++++++++++++++++-------
 website/tests/test_keyword_merge.py | 49 ++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 12 deletions(-)

diff --git a/website/admin/keyword_admin.py b/website/admin/keyword_admin.py
index d2a3365f..c4d3f7bb 100644
--- a/website/admin/keyword_admin.py
+++ b/website/admin/keyword_admin.py
@@ -39,29 +39,59 @@ def queryset(self, request, queryset):
         return queryset
 
 
+def _parent_fk_field_name(through_model):
+    """Name of the FK on a keywords-through model that points to the *owning*
+    object (Publication/Talk/…) — i.e. the relation that isn't the Keyword side.
+    """
+    for field in through_model._meta.get_fields():
+        if field.many_to_one and field.related_model is not Keyword:
+            return field.name
+    raise RuntimeError(f"No owning FK found on {through_model.__name__}")
+
+
 @transaction.atomic
 def merge_keywords_into_target(target, sources):
     """Reassign every reference from ``sources`` onto ``target``, then delete the
     sources. Returns the number of source keywords removed.
 
-    For each source keyword and each model that references keywords, every
-    tagged object gains the target via ``obj.keywords.add(target)`` — idempotent,
-    so an object already tagged with the target ends up with a single row, not a
-    duplicate. Deleting the source then drops its own M2M rows, so no explicit
-    ``remove()`` is needed. ``target`` is skipped if present in ``sources``.
+    Works at the M2M *through-table* level: each existing ``(object, source)``
+    row is repointed to ``(object, target)`` rather than calling
+    ``obj.keywords.add(target)``. This is deliberate — the deployed
+    ``website_*_keywords`` tables carry a legacy NOT-NULL ``sort_value`` column
+    (the field used to be a SortedManyToManyField), so *inserting* a fresh row
+    via .add() violates that constraint, whereas *updating* an existing row
+    preserves its sort_value. When the object is already tagged with the target,
+    the source row is deleted instead, so no duplicate ``(object, keyword)`` pair
+    is created (this also covers an object tagged with two different sources at
+    once). ``target`` is ignored if present in ``sources``.
 
     Runs in a transaction: a failure mid-merge rolls back rather than leaving the
     taxonomy half-merged.
     """
-    removed = 0
+    sources = [s for s in sources if s.pk != target.pk]
+
+    for model in KEYWORD_USERS:
+        through = model.keywords.through
+        owner_id_field = f"{_parent_fk_field_name(through)}_id"
+        # Snapshot the source rows first; we mutate/delete as we go.
+        for row in list(through.objects.filter(keyword__in=sources)):
+            owner_id = getattr(row, owner_id_field)
+            target_already_present = (
+                through.objects
+                .filter(keyword=target, **{owner_id_field: owner_id})
+                .exclude(pk=row.pk)
+                .exists()
+            )
+            if target_already_present:
+                row.delete()
+            else:
+                row.keyword = target
+                row.save(update_fields=['keyword'])
+
+    removed = len(sources)
     for source in sources:
-        if source.pk == target.pk:
-            continue
-        for accessor in KEYWORD_REVERSE_ACCESSORS:
-            for obj in getattr(source, accessor).all():
-                obj.keywords.add(target)
+        # Any stray through rows are CASCADE-dropped with the keyword.
         source.delete()
-        removed += 1
     return removed
 
 
diff --git a/website/tests/test_keyword_merge.py b/website/tests/test_keyword_merge.py
index dac7026e..9bcea8b1 100644
--- a/website/tests/test_keyword_merge.py
+++ b/website/tests/test_keyword_merge.py
@@ -16,6 +16,7 @@
 
 from django.contrib.auth import get_user_model
 from django.contrib.messages.storage.fallback import FallbackStorage
+from django.db import connection
 from django.test import RequestFactory
 from django.urls import reverse
 
@@ -76,6 +77,54 @@ def test_object_already_tagged_with_target_has_no_duplicate(self):
                          [target.pk])
         self.assertFalse(Keyword.objects.filter(pk=source.pk).exists())
 
+    def test_object_tagged_with_two_sources_at_once(self):
+        target = Keyword.objects.create(keyword="Design")
+        source_a = Keyword.objects.create(keyword="design")
+        source_b = Keyword.objects.create(keyword="DESIGN")
+        pub = self.make_publication(title="Double tagged")
+        pub.keywords.add(source_a, source_b)  # tagged with BOTH sources, not target
+
+        merge_keywords_into_target(target, [source_a, source_b])
+
+        # Both sources collapse to a single target row, not a duplicate pair.
+        self.assertEqual(list(pub.keywords.values_list('pk', flat=True)),
+                         [target.pk])
+        self.assertEqual(Keyword.objects.filter(
+            pk__in=[source_a.pk, source_b.pk]).count(), 0)
+
+    def test_merge_survives_legacy_sort_value_column(self):
+        """Reproduces the -test/prod failure: the deployed keywords through table
+        carries a leftover NOT-NULL ``sort_value`` column (the field used to be a
+        SortedManyToManyField). Inserting a new through row via .add() violates
+        the constraint; the merge must repoint existing rows instead.
+
+        settings_test builds the schema from the current (plain M2M) models, so
+        the column isn't there by default — we add it here to mimic the deployed
+        drift, then assert the merge still succeeds.
+        """
+        target = Keyword.objects.create(keyword="Design Process")
+        source = Keyword.objects.create(keyword="design process")
+        pub = self.make_publication(title="Legacy schema paper")
+        pub.keywords.add(source)  # inserted before the column exists — fine
+
+        table = Publication.keywords.through._meta.db_table
+        with connection.cursor() as cursor:
+            # The .add() above left deferred FK trigger events pending, which
+            # block ALTER TABLE; force them to resolve now so we can add the
+            # column inside this test transaction.
+            cursor.execute('SET CONSTRAINTS ALL IMMEDIATE')
+            cursor.execute(f'ALTER TABLE {table} ADD COLUMN sort_value integer')
+            cursor.execute(f'UPDATE {table} SET sort_value = 0')
+            cursor.execute(
+                f'ALTER TABLE {table} ALTER COLUMN sort_value SET NOT NULL')
+
+        # Must not raise IntegrityError on the NOT-NULL sort_value column.
+        merge_keywords_into_target(target, [source])
+
+        self.assertEqual(list(pub.keywords.values_list('pk', flat=True)),
+                         [target.pk])
+        self.assertFalse(Keyword.objects.filter(pk=source.pk).exists())
+
     def test_target_in_sources_is_skipped(self):
         target = Keyword.objects.create(keyword="Robotics")
         source = Keyword.objects.create(keyword="robotics")