diff --git a/website/admin/data_health/checks/__init__.py b/website/admin/data_health/checks/__init__.py
index ca356579..557a265c 100644
--- a/website/admin/data_health/checks/__init__.py
+++ b/website/admin/data_health/checks/__init__.py
@@ -12,4 +12,5 @@
project_leadership,
position_integrity,
news_health,
+ duplicate_keywords,
)
diff --git a/website/admin/data_health/checks/duplicate_keywords.py b/website/admin/data_health/checks/duplicate_keywords.py
new file mode 100644
index 00000000..f24e2696
--- /dev/null
+++ b/website/admin/data_health/checks/duplicate_keywords.py
@@ -0,0 +1,98 @@
+"""
+Data-health check: keywords that collapse to the same normalized text.
+
+Keywords are free-text with no uniqueness constraint, so case/whitespace
+variants coexist (e.g. ``Speech`` / ``speech`` / ``Speech ``) and fragment the
+public keyword pages. This check *finds* those near-duplicate clusters; the
+``Merge selected keywords`` action on the Keyword admin changelist (#1352) is
+the tool to *fix* them. Strictly read-only.
+
+One row is emitted per keyword in any cluster of two or more keywords that
+share a normalized key (``.strip().casefold()``), with per-model usage counts
+to support the merge decision (which variant to keep as the target).
+"""
+
+from collections import defaultdict
+from urllib.parse import urlencode
+
+from django.urls import reverse
+
+from website.admin.data_health.registry import HealthCheck, register_check
+from website.models import (Keyword, Publication, Talk, Poster, Grant,
+ Project, ProjectUmbrella)
+
+# Reverse accessor on Keyword for each model that holds a `keywords` M2M.
+# (Video is not an Artifact, so it has no keywords.) Keep this in lockstep with
+# KEYWORD_USERS in website/admin/keyword_admin.py.
+KEYWORD_REVERSE_ACCESSORS = (
+ ('publication_count', 'publication_set'),
+ ('talk_count', 'talk_set'),
+ ('poster_count', 'poster_set'),
+ ('grant_count', 'grant_set'),
+ ('project_count', 'project_set'),
+ ('project_umbrella_count', 'projectumbrella_set'),
+)
+
+
+def normalize_keyword(text):
+ """Cluster key for near-duplicate detection: trim + case-fold.
+
+ casefold() is the aggressive, Unicode-aware lower() — so ``Speech``,
+ ``speech``, and ``Speech `` all collapse to the same key.
+ """
+ return (text or '').strip().casefold()
+
+
+@register_check
+class DuplicateKeywordsCheck(HealthCheck):
+ slug = 'duplicate-keywords'
+ title = 'Duplicate keywords (same normalized text)'
+ description = (
+ 'Keywords that collapse to the same text after trimming whitespace and '
+ 'folding case (e.g. "Speech" / "speech" / "Speech ") — candidates for '
+ 'the "Merge selected keywords" action on the Keyword admin (#1352).'
+ )
+ group = 'Artifacts'
+ columns = [
+ 'cluster_key', 'id', 'keyword',
+ 'publication_count', 'talk_count', 'poster_count', 'grant_count',
+ 'project_count', 'project_umbrella_count', 'total_uses',
+ ]
+
+ def get_rows(self):
+ # Group every keyword by its normalized key.
+ clusters = defaultdict(list)
+ for kw in Keyword.objects.all():
+ clusters[normalize_keyword(kw.keyword)].append(kw)
+
+ rows = []
+ for key, members in clusters.items():
+ if len(members) < 2:
+ continue # only multi-keyword clusters are near-duplicates
+ for kw in members:
+ rows.append(self._row(key, kw))
+
+ # Stable, scannable ordering: cluster together, then by id.
+ rows.sort(key=lambda r: (r['cluster_key'], r['id']))
+ return rows
+
+ def row_link(self, row):
+ """Deep-link each row to the Keyword changelist pre-filtered to its
+ cluster, so the editor lands on exactly the variants to select and run
+ the "Merge selected keywords" action on — wiring this finder to its
+ fixer. The admin search is icontains on ``keyword``, so the folded
+ cluster key matches every casing variant in the cluster.
+ """
+ url = reverse('admin:website_keyword_changelist')
+ return ('Merge in admin →', f"{url}?{urlencode({'q': row['cluster_key']})}")
+
+ def _row(self, key, kw):
+ counts = {col: getattr(kw, accessor).count()
+ for col, accessor in KEYWORD_REVERSE_ACCESSORS}
+ return {
+ 'cluster_key': key,
+ 'id': kw.pk,
+ 'keyword': kw.keyword,
+ **counts,
+ 'total_uses': sum(counts.values()),
+ }
diff --git a/website/admin/data_health/registry.py b/website/admin/data_health/registry.py
index 9719231c..a681dd42 100644
--- a/website/admin/data_health/registry.py
+++ b/website/admin/data_health/registry.py
@@ -39,6 +39,14 @@ def get_rows(self):
"""Return a list of row dicts (read-only). Override in subclasses."""
raise NotImplementedError
+ def row_link(self, row):
+ """Optional: a ``(label, url)`` pair to act on ``row`` from the detail
+ page, or ``None``. Lets a check wire its read-only findings to a fixer
+ elsewhere in the admin (e.g. the keyword-merge changelist). Default: no
+ link. Only affects the on-screen table — the CSV export is unchanged.
+ """
+ return None
+
def count(self):
"""Number of flagged rows. Override for a cheaper count if available."""
return len(self.get_rows())
diff --git a/website/admin/data_health/views.py b/website/admin/data_health/views.py
index 86641d08..d512928f 100644
--- a/website/admin/data_health/views.py
+++ b/website/admin/data_health/views.py
@@ -53,13 +53,19 @@ def detail(request, check_slug):
if check is None:
raise Http404("Unknown data-health check.")
# Flatten dict rows into cell lists aligned to the column order so the
- # template can render them without a dynamic dict-lookup filter.
+ # template can render them without a dynamic dict-lookup filter. Each row
+ # also carries an optional (label, url) action link the check may provide.
columns = check.columns
- rows = [[row.get(col, '') for col in columns] for row in check.get_rows()]
+ rows = [
+ {'cells': [row.get(col, '') for col in columns],
+ 'link': check.row_link(row)}
+ for row in check.get_rows()
+ ]
context = _admin_context(request, title=check.title)
context['check'] = check
context['columns'] = columns
context['rows'] = rows
+ context['has_links'] = any(row['link'] for row in rows)
return render(request, 'admin/data_health/detail.html', context)
diff --git a/website/admin/keyword_admin.py b/website/admin/keyword_admin.py
index 09f41df3..d2a3365f 100644
--- a/website/admin/keyword_admin.py
+++ b/website/admin/keyword_admin.py
@@ -1,4 +1,7 @@
-from django.contrib import admin
+from django.contrib import admin, messages
+from django.contrib.admin.helpers import ACTION_CHECKBOX_NAME
+from django.db import transaction
+from django.shortcuts import render
from website.models import (Keyword, Publication, Talk, Poster, Grant,
Project, ProjectUmbrella)
from website.admin.utils import related_count_subquery
@@ -10,6 +13,13 @@
# Artifact, so it has no keywords.)
KEYWORD_USERS = (Publication, Talk, Poster, Grant, Project, ProjectUmbrella)
+# Reverse accessor on Keyword for each model in KEYWORD_USERS — the relations a
+# merge must walk to reattach the target keyword before deleting the sources.
+KEYWORD_REVERSE_ACCESSORS = (
+ 'publication_set', 'talk_set', 'poster_set',
+ 'grant_set', 'project_set', 'projectumbrella_set',
+)
+
class KeywordUsageFilter(admin.SimpleListFilter):
"""Filter keywords by whether anything references them — the core
@@ -29,6 +39,32 @@ def queryset(self, request, queryset):
return queryset
+@transaction.atomic
+def merge_keywords_into_target(target, sources):
+ """Reassign every reference from ``sources`` onto ``target``, then delete the
+ sources. Returns the number of source keywords removed.
+
+ For each source keyword and each model that references keywords, every
+ tagged object gains the target via ``obj.keywords.add(target)`` — idempotent,
+ so an object already tagged with the target ends up with a single row, not a
+ duplicate. Deleting the source then drops its own M2M rows, so no explicit
+ ``remove()`` is needed. ``target`` is skipped if present in ``sources``.
+
+ Runs in a transaction: a failure mid-merge rolls back rather than leaving the
+ taxonomy half-merged.
+ """
+ removed = 0
+ for source in sources:
+ if source.pk == target.pk:
+ continue
+ for accessor in KEYWORD_REVERSE_ACCESSORS:
+ for obj in getattr(source, accessor).all():
+ obj.keywords.add(target)
+ source.delete()
+ removed += 1
+ return removed
+
+
@admin.register(Keyword, site=ml_admin_site)
class KeywordAdmin(admin.ModelAdmin):
list_display = ['keyword', 'project_count', 'publication_count', 'total_usage']
@@ -38,6 +74,69 @@ class KeywordAdmin(admin.ModelAdmin):
search_fields = ['keyword']
ordering = ['keyword']
list_filter = (KeywordUsageFilter,)
+ actions = ['merge_keywords']
+
+ @admin.action(description='Merge selected keywords (dedup taxonomy)')
+ def merge_keywords(self, request, queryset):
+ """Merge two or more keywords into one chosen target (#1352).
+
+ Two-step Django action: the first click shows an intermediate page to
+ pick which selected keyword to keep as the target; confirming reassigns
+ every reference onto it (across all six keyword-holding models) and
+ deletes the others. Destructive, so it always routes through the
+ confirmation page — it never merges on the first click.
+ """
+ selected = list(queryset.order_by('keyword'))
+ if len(selected) < 2:
+ self.message_user(
+ request,
+ "Select two or more keywords to merge.",
+ level=messages.WARNING,
+ )
+ return None
+
+ if request.POST.get('confirm_merge'):
+ target = next((k for k in selected
+ if str(k.pk) == request.POST.get('target')), None)
+ if target is None:
+ self.message_user(
+ request,
+ "Pick which keyword to keep as the merge target.",
+ level=messages.WARNING,
+ )
+ else:
+ sources = [k for k in selected if k.pk != target.pk]
+ source_names = ', '.join(f'"{k.keyword}"' for k in sources)
+ removed = merge_keywords_into_target(target, sources)
+ self.message_user(
+ request,
+ f'Merged {removed} keyword(s) ({source_names}) into '
+ f'"{target.keyword}".',
+ level=messages.SUCCESS,
+ )
+ return None # back to the changelist
+
+ # First click (or a target wasn't chosen): show the confirmation page,
+ # annotating each candidate with its total usage to inform the choice.
+ candidates = [
+ {'keyword': k, 'total_uses': sum(
+ getattr(k, accessor).count()
+ for accessor in KEYWORD_REVERSE_ACCESSORS)}
+ for k in selected
+ ]
+ context = {
+ **self.admin_site.each_context(request),
+ 'title': 'Merge keywords',
+ 'candidates': candidates,
+ 'selected_ids': [str(k.pk) for k in selected],
+ 'action_checkbox_name': ACTION_CHECKBOX_NAME,
+ 'opts': self.model._meta,
+ }
+ return render(
+ request,
+ 'admin/website/keyword/merge_keywords.html',
+ context,
+ )
def change_view(self, request, object_id, form_url='', extra_context=None):
"""Add projects and publications to the context. We then use this extra data in
diff --git a/website/models/keyword.py b/website/models/keyword.py
index 55827d27..0fe3e156 100644
--- a/website/models/keyword.py
+++ b/website/models/keyword.py
@@ -1,8 +1,26 @@
+import re
+
from django.db import models
class Keyword(models.Model):
keyword = models.CharField(max_length=255)
+ def save(self, *args, **kwargs):
+ """Normalize whitespace before saving so casual variants can't create
+ near-duplicate tags (#1352). Trims the ends and collapses internal runs
+ of whitespace to a single space, so "Speech ", " Speech", and
+ "Speech recognition" can't coexist with their clean forms.
+
+ This is the partial, no-migration ward (layer 1): it catches every
+ creation path, including the inline "add keyword" widget on Publication/
+ Project forms. Case-insensitive uniqueness (e.g. blocking "Speech" vs
+ "speech") is a separate DB constraint deferred until existing dupes are
+ merged — casing is intentionally preserved here (VR, HCI, iOS).
+ """
+ if self.keyword:
+ self.keyword = re.sub(r'\s+', ' ', self.keyword).strip()
+ super().save(*args, **kwargs)
+
def get_project_count(self):
"""Returns the number of projects that use keyword"""
return self.project_set.count()
diff --git a/website/templates/admin/data_health/detail.html b/website/templates/admin/data_health/detail.html
index b5ce0c78..34907ff8 100644
--- a/website/templates/admin/data_health/detail.html
+++ b/website/templates/admin/data_health/detail.html
@@ -23,12 +23,16 @@
{% for col in columns %}
{{ col }} {% endfor %}
+ {% if has_links %}{% translate 'Action' %} {% endif %}
+ {% blocktranslate trimmed %} + Choose which keyword to keep. Every reference to the other + selected keywords (across publications, talks, posters, grants, projects, + and project umbrellas) will be reassigned to it, and the others will be + permanently deleted. This cannot be undone. + {% endblocktranslate %} +
+ + +{% endblock %} diff --git a/website/tests/test_keyword_merge.py b/website/tests/test_keyword_merge.py new file mode 100644 index 00000000..dac7026e --- /dev/null +++ b/website/tests/test_keyword_merge.py @@ -0,0 +1,206 @@ +""" +Tests for the destructive "Merge selected keywords" admin action and the +duplicate-keywords data-health check (#1352). + +Coverage: + * merge reassigns references across all six keyword-holding models, then + deletes the merged-away keywords; + * an object already tagged with the target gains no duplicate M2M row; + * a keyword passed as both source and target is skipped (no self-delete); + * the admin action is a no-op when fewer than two keywords are selected; + * the data-health check clusters case/whitespace variants and ignores + singletons. +""" + +from datetime import date + +from django.contrib.auth import get_user_model +from django.contrib.messages.storage.fallback import FallbackStorage +from django.test import RequestFactory +from django.urls import reverse + +from website.models import (Keyword, Publication, Talk, Poster, Grant, + Project, ProjectUmbrella, Sponsor) +from website.admin.admin_site import ml_admin_site +from website.admin.keyword_admin import KeywordAdmin, merge_keywords_into_target +from website.admin.data_health.checks.duplicate_keywords import DuplicateKeywordsCheck +from website.tests.base import DatabaseTestCase + + +class KeywordMergeHelperTests(DatabaseTestCase): + """Exercise the reusable merge function directly (no request plumbing).""" + + def _tag_one_object_per_model(self, keyword): + """Create one object of each of the six keyword-holding models, tag each + with ``keyword``, and return them so callers can assert reassignment.""" + sponsor = Sponsor.objects.create(name="NSF") + objects = { + 'publication': self.make_publication(title="P"), + 'talk': self.make_talk(title="T"), + 'poster': Poster.objects.create(title="Po", date=date(2024, 1, 1), + forum_name="CHI"), + 'project': self.make_project(name="Proj"), + 'project_umbrella': ProjectUmbrella.objects.create( + name="Umb", short_name="umb"), + 'grant': Grant.objects.create(title="G", date=date(2024, 1, 1), + forum_name="NSF", sponsor=sponsor), + } + for obj in objects.values(): + obj.keywords.add(keyword) + return objects + + def test_merge_reassigns_all_six_relations_and_deletes_source(self): + target = Keyword.objects.create(keyword="Speech") + source = Keyword.objects.create(keyword="speech") + objects = self._tag_one_object_per_model(source) + + removed = merge_keywords_into_target(target, [source]) + + self.assertEqual(removed, 1) + self.assertFalse(Keyword.objects.filter(pk=source.pk).exists()) + for name, obj in objects.items(): + keyword_ids = list(obj.keywords.values_list('pk', flat=True)) + self.assertEqual(keyword_ids, [target.pk], + f"{name} should now reference only the target keyword") + + def test_object_already_tagged_with_target_has_no_duplicate(self): + target = Keyword.objects.create(keyword="HCI") + source = Keyword.objects.create(keyword="hci") + pub = self.make_publication(title="Already tagged") + pub.keywords.add(target, source) # tagged with BOTH + + merge_keywords_into_target(target, [source]) + + # add() is idempotent, so the target appears exactly once — not twice. + self.assertEqual(list(pub.keywords.values_list('pk', flat=True)), + [target.pk]) + self.assertFalse(Keyword.objects.filter(pk=source.pk).exists()) + + def test_target_in_sources_is_skipped(self): + target = Keyword.objects.create(keyword="Robotics") + source = Keyword.objects.create(keyword="robotics") + + # Passing the target itself among the sources must not delete it. + removed = merge_keywords_into_target(target, [target, source]) + + self.assertEqual(removed, 1) + self.assertTrue(Keyword.objects.filter(pk=target.pk).exists()) + self.assertFalse(Keyword.objects.filter(pk=source.pk).exists()) + + +class KeywordMergeActionTests(DatabaseTestCase): + """Exercise the admin action wrapper (selection guards + confirm path).""" + + def setUp(self): + super().setUp() + self.admin = KeywordAdmin(Keyword, ml_admin_site) + self.factory = RequestFactory() + + def _request(self, data=None): + request = self.factory.post('/admin/website/keyword/', data or {}) + request.user = None + # Action handlers call message_user(), which needs a message store. + setattr(request, 'session', {}) + setattr(request, '_messages', FallbackStorage(request)) + return request + + def test_single_selection_is_a_noop(self): + only = Keyword.objects.create(keyword="solo") + request = self._request() + + result = self.admin.merge_keywords(request, Keyword.objects.filter(pk=only.pk)) + + self.assertIsNone(result) + self.assertTrue(Keyword.objects.filter(pk=only.pk).exists()) + + def test_confirm_post_merges_and_deletes(self): + target = Keyword.objects.create(keyword="Speech") + source = Keyword.objects.create(keyword="speech") + pub = self.make_publication(title="Confirmed merge") + pub.keywords.add(source) + + request = self._request({ + 'confirm_merge': 'yes', + 'target': str(target.pk), + }) + result = self.admin.merge_keywords( + request, Keyword.objects.filter(pk__in=[target.pk, source.pk])) + + self.assertIsNone(result) # returns to changelist + self.assertFalse(Keyword.objects.filter(pk=source.pk).exists()) + self.assertEqual(list(pub.keywords.values_list('pk', flat=True)), + [target.pk]) + + +class KeywordNormalizationTests(DatabaseTestCase): + """Layer-1 ward: Keyword.save() normalizes whitespace (#1352).""" + + def test_save_trims_and_collapses_whitespace(self): + kw = Keyword.objects.create(keyword=" Speech recognition ") + kw.refresh_from_db() + self.assertEqual(kw.keyword, "Speech recognition") + + def test_casing_is_preserved(self): + # Whitespace is normalized, but casing is intentionally left alone so + # acronyms like VR / HCI / iOS keep their intended form. + kw = Keyword.objects.create(keyword=" HCI ") + kw.refresh_from_db() + self.assertEqual(kw.keyword, "HCI") + + +class DuplicateKeywordsCheckTests(DatabaseTestCase): + """The finder must cluster case/whitespace variants and skip singletons.""" + + def test_clusters_case_variants_and_ignores_unique_keywords(self): + # Whitespace variants are now prevented at save (layer 1), so the + # finder's remaining job is case variants, which still coexist until the + # layer-2 case-insensitive constraint lands. + Keyword.objects.create(keyword="Speech") + Keyword.objects.create(keyword="speech") + Keyword.objects.create(keyword="Robotics") # unique → not flagged + + rows = DuplicateKeywordsCheck().get_rows() + + flagged = {r['keyword'] for r in rows} + self.assertEqual(flagged, {"Speech", "speech"}) + # Both share one normalized cluster key. + self.assertEqual({r['cluster_key'] for r in rows}, {"speech"}) + + def test_total_uses_counts_references(self): + kw_a = Keyword.objects.create(keyword="VR") + Keyword.objects.create(keyword="vr") # variant so the cluster qualifies + pub = self.make_publication(title="VR paper") + pub.keywords.add(kw_a) + + rows = {r['id']: r for r in DuplicateKeywordsCheck().get_rows()} + + self.assertEqual(rows[kw_a.pk]['publication_count'], 1) + self.assertEqual(rows[kw_a.pk]['total_uses'], 1) + + def test_row_link_points_to_filtered_keyword_changelist(self): + kw = Keyword.objects.create(keyword="Speech") + Keyword.objects.create(keyword="speech") # makes the cluster qualify + + rows = {r['id']: r for r in DuplicateKeywordsCheck().get_rows()} + label, url = DuplicateKeywordsCheck().row_link(rows[kw.pk]) + + self.assertIn('Merge', label) + # Deep-links to the Keyword changelist filtered to the folded cluster key. + self.assertIn('/admin/website/keyword/', url) + self.assertIn('q=speech', url) + + def test_detail_page_renders_deep_link(self): + # End-to-end: the data-health detail page renders the per-row "Merge in + # admin" deep link (exercises the template's optional-link branch). + Keyword.objects.create(keyword="Speech") + Keyword.objects.create(keyword="speech") + superuser = get_user_model().objects.create_superuser( + username="dh_admin", email="a@b.co", password="pw") + self.client.force_login(superuser) + + resp = self.client.get( + reverse("admin:data_health_detail", args=["duplicate-keywords"])) + + self.assertEqual(resp.status_code, 200) + self.assertContains(resp, "/admin/website/keyword/?q=speech") + self.assertContains(resp, "Merge in admin")