makeabilitylab · jonfroehlich · Jun 21, 2026 · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026
diff --git a/website/admin/data_health/checks/__init__.py b/website/admin/data_health/checks/__init__.py
@@ -12,4 +12,5 @@
     project_leadership,
     position_integrity,
     news_health,
+    duplicate_keywords,
 )
diff --git a/website/admin/data_health/checks/duplicate_keywords.py b/website/admin/data_health/checks/duplicate_keywords.py
@@ -0,0 +1,98 @@
+"""
+Data-health check: keywords that collapse to the same normalized text.
+
+Keywords are free-text with no uniqueness constraint, so case/whitespace
+variants coexist (e.g. ``Speech`` / ``speech`` / ``Speech ``) and fragment the
+public keyword pages. This check *finds* those near-duplicate clusters; the
+``Merge selected keywords`` action on the Keyword admin changelist (#1352) is
+the tool to *fix* them. Strictly read-only.
+
+One row is emitted per keyword in any cluster of two or more keywords that
+share a normalized key (``.strip().casefold()``), with per-model usage counts
+to support the merge decision (which variant to keep as the target).
+"""
+
+from collections import defaultdict
+from urllib.parse import urlencode
+
+from django.urls import reverse
+
+from website.admin.data_health.registry import HealthCheck, register_check
+from website.models import (Keyword, Publication, Talk, Poster, Grant,
+                            Project, ProjectUmbrella)
+
+# Reverse accessor on Keyword for each model that holds a `keywords` M2M.
+# (Video is not an Artifact, so it has no keywords.) Keep this in lockstep with
+# KEYWORD_USERS in website/admin/keyword_admin.py.
+KEYWORD_REVERSE_ACCESSORS = (
+    ('publication_count', 'publication_set'),
+    ('talk_count', 'talk_set'),
+    ('poster_count', 'poster_set'),
+    ('grant_count', 'grant_set'),
+    ('project_count', 'project_set'),
+    ('project_umbrella_count', 'projectumbrella_set'),
+)
+
+
+def normalize_keyword(text):
+    """Cluster key for near-duplicate detection: trim + case-fold.
+
+    casefold() is the aggressive, Unicode-aware lower() — so ``Speech``,
+    ``speech``, and ``Speech `` all collapse to the same key.
+    """
+    return (text or '').strip().casefold()
+
+
+@register_check
+class DuplicateKeywordsCheck(HealthCheck):
+    slug = 'duplicate-keywords'
+    title = 'Duplicate keywords (same normalized text)'
+    description = (
+        'Keywords that collapse to the same text after trimming whitespace and '
+        'folding case (e.g. "Speech" / "speech" / "Speech ") — candidates for '
+        'the "Merge selected keywords" action on the Keyword admin (#1352).'
+    )
+    group = 'Artifacts'
+    columns = [
+        'cluster_key', 'id', 'keyword',
+        'publication_count', 'talk_count', 'poster_count', 'grant_count',
+        'project_count', 'project_umbrella_count', 'total_uses',
+    ]
+
+    def get_rows(self):
+        # Group every keyword by its normalized key.
+        clusters = defaultdict(list)
+        for kw in Keyword.objects.all():
+            clusters[normalize_keyword(kw.keyword)].append(kw)
+
+        rows = []
+        for key, members in clusters.items():
+            if len(members) < 2:
+                continue  # only multi-keyword clusters are near-duplicates
+            for kw in members:
+                rows.append(self._row(key, kw))
+
+        # Stable, scannable ordering: cluster together, then by id.
+        rows.sort(key=lambda r: (r['cluster_key'], r['id']))
+        return rows
+
+    def row_link(self, row):
+        """Deep-link each row to the Keyword changelist pre-filtered to its
+        cluster, so the editor lands on exactly the variants to select and run
+        the "Merge selected keywords" action on — wiring this finder to its
+        fixer. The admin search is icontains on ``keyword``, so the folded
+        cluster key matches every casing variant in the cluster.
+        """
+        url = reverse('admin:website_keyword_changelist')
+        return ('Merge in admin →', f"{url}?{urlencode({'q': row['cluster_key']})}")
+
+    def _row(self, key, kw):
+        counts = {col: getattr(kw, accessor).count()
+                  for col, accessor in KEYWORD_REVERSE_ACCESSORS}
+        return {
+            'cluster_key': key,
+            'id': kw.pk,
+            'keyword': kw.keyword,
+            **counts,
+            'total_uses': sum(counts.values()),
+        }
diff --git a/website/admin/data_health/registry.py b/website/admin/data_health/registry.py
@@ -39,6 +39,14 @@ def get_rows(self):
         """Return a list of row dicts (read-only). Override in subclasses."""
         raise NotImplementedError
 
+    def row_link(self, row):
+        """Optional: a ``(label, url)`` pair to act on ``row`` from the detail
+        page, or ``None``. Lets a check wire its read-only findings to a fixer
+        elsewhere in the admin (e.g. the keyword-merge changelist). Default: no
+        link. Only affects the on-screen table — the CSV export is unchanged.
+        """
+        return None
+
     def count(self):
         """Number of flagged rows. Override for a cheaper count if available."""
         return len(self.get_rows())

diff --git a/website/admin/data_health/views.py b/website/admin/data_health/views.py
@@ -53,13 +53,19 @@ def detail(request, check_slug):
     if check is None:
         raise Http404("Unknown data-health check.")
     # Flatten dict rows into cell lists aligned to the column order so the
-    # template can render them without a dynamic dict-lookup filter.
+    # template can render them without a dynamic dict-lookup filter. Each row
+    # also carries an optional (label, url) action link the check may provide.
     columns = check.columns
-    rows = [[row.get(col, '') for col in columns] for row in check.get_rows()]
+    rows = [
+        {'cells': [row.get(col, '') for col in columns],
+         'link': check.row_link(row)}
+        for row in check.get_rows()
+    ]
     context = _admin_context(request, title=check.title)
     context['check'] = check
     context['columns'] = columns
     context['rows'] = rows
+    context['has_links'] = any(row['link'] for row in rows)
     return render(request, 'admin/data_health/detail.html', context)
 
 

diff --git a/website/admin/keyword_admin.py b/website/admin/keyword_admin.py
@@ -1,4 +1,7 @@
-from django.contrib import admin
+from django.contrib import admin, messages
+from django.contrib.admin.helpers import ACTION_CHECKBOX_NAME
+from django.db import transaction
+from django.shortcuts import render
 from website.models import (Keyword, Publication, Talk, Poster, Grant,
                             Project, ProjectUmbrella)
 from website.admin.utils import related_count_subquery
@@ -10,6 +13,13 @@
 # Artifact, so it has no keywords.)
 KEYWORD_USERS = (Publication, Talk, Poster, Grant, Project, ProjectUmbrella)
 
+# Reverse accessor on Keyword for each model in KEYWORD_USERS — the relations a
+# merge must walk to reattach the target keyword before deleting the sources.
+KEYWORD_REVERSE_ACCESSORS = (
+    'publication_set', 'talk_set', 'poster_set',
+    'grant_set', 'project_set', 'projectumbrella_set',
+)
+
 
 class KeywordUsageFilter(admin.SimpleListFilter):
     """Filter keywords by whether anything references them — the core
@@ -29,6 +39,32 @@ def queryset(self, request, queryset):
         return queryset
 
 
+@transaction.atomic
+def merge_keywords_into_target(target, sources):
+    """Reassign every reference from ``sources`` onto ``target``, then delete the
+    sources. Returns the number of source keywords removed.
+
+    For each source keyword and each model that references keywords, every
+    tagged object gains the target via ``obj.keywords.add(target)`` — idempotent,
+    so an object already tagged with the target ends up with a single row, not a
+    duplicate. Deleting the source then drops its own M2M rows, so no explicit
+    ``remove()`` is needed. ``target`` is skipped if present in ``sources``.
+
+    Runs in a transaction: a failure mid-merge rolls back rather than leaving the
+    taxonomy half-merged.
+    """
+    removed = 0
+    for source in sources:
+        if source.pk == target.pk:
+            continue
+        for accessor in KEYWORD_REVERSE_ACCESSORS:
+            for obj in getattr(source, accessor).all():
+                obj.keywords.add(target)
+        source.delete()
+        removed += 1
+    return removed
+
+
 @admin.register(Keyword, site=ml_admin_site)
 class KeywordAdmin(admin.ModelAdmin):
     list_display = ['keyword', 'project_count', 'publication_count', 'total_usage']
@@ -38,6 +74,69 @@ class KeywordAdmin(admin.ModelAdmin):
     search_fields = ['keyword']
     ordering = ['keyword']
     list_filter = (KeywordUsageFilter,)
+    actions = ['merge_keywords']
+
+    @admin.action(description='Merge selected keywords (dedup taxonomy)')
+    def merge_keywords(self, request, queryset):
+        """Merge two or more keywords into one chosen target (#1352).
+
+        Two-step Django action: the first click shows an intermediate page to
+        pick which selected keyword to keep as the target; confirming reassigns
+        every reference onto it (across all six keyword-holding models) and
+        deletes the others. Destructive, so it always routes through the
+        confirmation page — it never merges on the first click.
+        """
+        selected = list(queryset.order_by('keyword'))
+        if len(selected) < 2:
+            self.message_user(
+                request,
+                "Select two or more keywords to merge.",
+                level=messages.WARNING,
+            )
+            return None
+
+        if request.POST.get('confirm_merge'):
+            target = next((k for k in selected
+                           if str(k.pk) == request.POST.get('target')), None)
+            if target is None:
+                self.message_user(
+                    request,
+                    "Pick which keyword to keep as the merge target.",
+                    level=messages.WARNING,
+                )
+            else:
+                sources = [k for k in selected if k.pk != target.pk]
+                source_names = ', '.join(f'"{k.keyword}"' for k in sources)
+                removed = merge_keywords_into_target(target, sources)
+                self.message_user(
+                    request,
+                    f'Merged {removed} keyword(s) ({source_names}) into '
+                    f'"{target.keyword}".',
+                    level=messages.SUCCESS,
+                )
+                return None  # back to the changelist
+
+        # First click (or a target wasn't chosen): show the confirmation page,
+        # annotating each candidate with its total usage to inform the choice.
+        candidates = [
+            {'keyword': k, 'total_uses': sum(
+                getattr(k, accessor).count()
+                for accessor in KEYWORD_REVERSE_ACCESSORS)}
+            for k in selected
+        ]
+        context = {
+            **self.admin_site.each_context(request),
+            'title': 'Merge keywords',
+            'candidates': candidates,
+            'selected_ids': [str(k.pk) for k in selected],
+            'action_checkbox_name': ACTION_CHECKBOX_NAME,
+            'opts': self.model._meta,
+        }
+        return render(
+            request,
+            'admin/website/keyword/merge_keywords.html',
+            context,
+        )
 
     def change_view(self, request, object_id, form_url='', extra_context=None):
         """Add projects and publications to the context. We then use this extra data in

diff --git a/website/models/keyword.py b/website/models/keyword.py
@@ -1,8 +1,26 @@
+import re
+
 from django.db import models
 
 class Keyword(models.Model):
     keyword = models.CharField(max_length=255)
 
+    def save(self, *args, **kwargs):
+        """Normalize whitespace before saving so casual variants can't create
+        near-duplicate tags (#1352). Trims the ends and collapses internal runs
+        of whitespace to a single space, so "Speech ", " Speech", and
+        "Speech  recognition" can't coexist with their clean forms.
+
+        This is the partial, no-migration ward (layer 1): it catches every
+        creation path, including the inline "add keyword" widget on Publication/
+        Project forms. Case-insensitive uniqueness (e.g. blocking "Speech" vs
+        "speech") is a separate DB constraint deferred until existing dupes are
+        merged — casing is intentionally preserved here (VR, HCI, iOS).
+        """
+        if self.keyword:
+            self.keyword = re.sub(r'\s+', ' ', self.keyword).strip()
+        super().save(*args, **kwargs)
+
     def get_project_count(self):
         """Returns the number of projects that use keyword"""
         return self.project_set.count()

diff --git a/website/templates/admin/data_health/detail.html b/website/templates/admin/data_health/detail.html
@@ -23,12 +23,16 @@
         <thead>
           <tr>
             {% for col in columns %}<th scope="col">{{ col }}</th>{% endfor %}
+            {% if has_links %}<th scope="col">{% translate 'Action' %}</th>{% endif %}
           </tr>
         </thead>
         <tbody>
           {% for row in rows %}
             <tr class="{% cycle 'row1' 'row2' %}">
-              {% for cell in row %}<td>{{ cell }}</td>{% endfor %}
+              {% for cell in row.cells %}<td>{{ cell }}</td>{% endfor %}
+              {% if has_links %}
+                <td>{% if row.link %}<a class="button" href="{{ row.link.1 }}">{{ row.link.0 }}</a>{% endif %}</td>
+              {% endif %}
             </tr>
           {% endfor %}
         </tbody>

diff --git a/website/templates/admin/website/keyword/merge_keywords.html b/website/templates/admin/website/keyword/merge_keywords.html
@@ -0,0 +1,59 @@
+{% extends "admin/base_site.html" %}
+{% load i18n %}
+
+{% block breadcrumbs %}
+  <div class="breadcrumbs">
+    <a href="{% url 'admin:index' %}">{% translate 'Home' %}</a>
+    &rsaquo; <a href="{% url 'admin:website_keyword_changelist' %}">{% translate 'Keywords' %}</a>
+    &rsaquo; {% translate 'Merge keywords' %}
+  </div>
+{% endblock %}
+
+{% block content %}
+  <p>
+    {% blocktranslate trimmed %}
+      Choose which keyword to <strong>keep</strong>. Every reference to the other
+      selected keywords (across publications, talks, posters, grants, projects,
+      and project umbrellas) will be reassigned to it, and the others will be
+      <strong>permanently deleted</strong>. This cannot be undone.
+    {% endblocktranslate %}
+  </p>
+
+  <form method="post">
+    {% csrf_token %}
+    <input type="hidden" name="action" value="merge_keywords">
+    <input type="hidden" name="confirm_merge" value="yes">
+    {% for id in selected_ids %}
+      <input type="hidden" name="{{ action_checkbox_name }}" value="{{ id }}">
+    {% endfor %}
+
+    <fieldset class="module aligned">
+      <table id="result_list">
+        <thead>
+          <tr>
+            <th scope="col">{% translate 'Keep as target' %}</th>
+            <th scope="col">{% translate 'Keyword' %}</th>
+            <th scope="col">{% translate 'Total uses' %}</th>
+          </tr>
+        </thead>
+        <tbody>
+          {% for candidate in candidates %}
+            <tr class="{% cycle 'row1' 'row2' %}">
+              <td>
+                <input type="radio" name="target" id="target_{{ candidate.keyword.pk }}"
+                       value="{{ candidate.keyword.pk }}"{% if forloop.first %} checked{% endif %}>
+              </td>
+              <td><label for="target_{{ candidate.keyword.pk }}">{{ candidate.keyword.keyword }}</label></td>
+              <td>{{ candidate.total_uses }}</td>
+            </tr>
+          {% endfor %}
+        </tbody>
+      </table>
+    </fieldset>
+
+    <div class="submit-row">
+      <input type="submit" class="default" value="{% translate 'Merge keywords' %}">
+      <a href="{% url 'admin:website_keyword_changelist' %}" class="button cancel-link">{% translate 'Cancel' %}</a>
+    </div>
+  </form>
+{% endblock %}