diff --git a/website/admin/data_health/checks/__init__.py b/website/admin/data_health/checks/__init__.py index ca356579..557a265c 100644 --- a/website/admin/data_health/checks/__init__.py +++ b/website/admin/data_health/checks/__init__.py @@ -12,4 +12,5 @@ project_leadership, position_integrity, news_health, + duplicate_keywords, ) diff --git a/website/admin/data_health/checks/duplicate_keywords.py b/website/admin/data_health/checks/duplicate_keywords.py new file mode 100644 index 00000000..f24e2696 --- /dev/null +++ b/website/admin/data_health/checks/duplicate_keywords.py @@ -0,0 +1,98 @@ +""" +Data-health check: keywords that collapse to the same normalized text. + +Keywords are free-text with no uniqueness constraint, so case/whitespace +variants coexist (e.g. ``Speech`` / ``speech`` / ``Speech ``) and fragment the +public keyword pages. This check *finds* those near-duplicate clusters; the +``Merge selected keywords`` action on the Keyword admin changelist (#1352) is +the tool to *fix* them. Strictly read-only. + +One row is emitted per keyword in any cluster of two or more keywords that +share a normalized key (``.strip().casefold()``), with per-model usage counts +to support the merge decision (which variant to keep as the target). +""" + +from collections import defaultdict +from urllib.parse import urlencode + +from django.urls import reverse + +from website.admin.data_health.registry import HealthCheck, register_check +from website.models import (Keyword, Publication, Talk, Poster, Grant, + Project, ProjectUmbrella) + +# Reverse accessor on Keyword for each model that holds a `keywords` M2M. +# (Video is not an Artifact, so it has no keywords.) Keep this in lockstep with +# KEYWORD_USERS in website/admin/keyword_admin.py. +KEYWORD_REVERSE_ACCESSORS = ( + ('publication_count', 'publication_set'), + ('talk_count', 'talk_set'), + ('poster_count', 'poster_set'), + ('grant_count', 'grant_set'), + ('project_count', 'project_set'), + ('project_umbrella_count', 'projectumbrella_set'), +) + + +def normalize_keyword(text): + """Cluster key for near-duplicate detection: trim + case-fold. + + casefold() is the aggressive, Unicode-aware lower() — so ``Speech``, + ``speech``, and ``Speech `` all collapse to the same key. + """ + return (text or '').strip().casefold() + + +@register_check +class DuplicateKeywordsCheck(HealthCheck): + slug = 'duplicate-keywords' + title = 'Duplicate keywords (same normalized text)' + description = ( + 'Keywords that collapse to the same text after trimming whitespace and ' + 'folding case (e.g. "Speech" / "speech" / "Speech ") — candidates for ' + 'the "Merge selected keywords" action on the Keyword admin (#1352).' + ) + group = 'Artifacts' + columns = [ + 'cluster_key', 'id', 'keyword', + 'publication_count', 'talk_count', 'poster_count', 'grant_count', + 'project_count', 'project_umbrella_count', 'total_uses', + ] + + def get_rows(self): + # Group every keyword by its normalized key. + clusters = defaultdict(list) + for kw in Keyword.objects.all(): + clusters[normalize_keyword(kw.keyword)].append(kw) + + rows = [] + for key, members in clusters.items(): + if len(members) < 2: + continue # only multi-keyword clusters are near-duplicates + for kw in members: + rows.append(self._row(key, kw)) + + # Stable, scannable ordering: cluster together, then by id. + rows.sort(key=lambda r: (r['cluster_key'], r['id'])) + return rows + + def row_link(self, row): + """Deep-link each row to the Keyword changelist pre-filtered to its + cluster, so the editor lands on exactly the variants to select and run + the "Merge selected keywords" action on — wiring this finder to its + fixer. The admin search is icontains on ``keyword``, so the folded + cluster key matches every casing variant in the cluster. + """ + url = reverse('admin:website_keyword_changelist') + return ('Merge in admin →', f"{url}?{urlencode({'q': row['cluster_key']})}") + + def _row(self, key, kw): + counts = {col: getattr(kw, accessor).count() + for col, accessor in KEYWORD_REVERSE_ACCESSORS} + return { + 'cluster_key': key, + 'id': kw.pk, + 'keyword': kw.keyword, + **counts, + 'total_uses': sum(counts.values()), + } diff --git a/website/admin/data_health/registry.py b/website/admin/data_health/registry.py index 9719231c..a681dd42 100644 --- a/website/admin/data_health/registry.py +++ b/website/admin/data_health/registry.py @@ -39,6 +39,14 @@ def get_rows(self): """Return a list of row dicts (read-only). Override in subclasses.""" raise NotImplementedError + def row_link(self, row): + """Optional: a ``(label, url)`` pair to act on ``row`` from the detail + page, or ``None``. Lets a check wire its read-only findings to a fixer + elsewhere in the admin (e.g. the keyword-merge changelist). Default: no + link. Only affects the on-screen table — the CSV export is unchanged. + """ + return None + def count(self): """Number of flagged rows. Override for a cheaper count if available.""" return len(self.get_rows()) diff --git a/website/admin/data_health/views.py b/website/admin/data_health/views.py index 86641d08..d512928f 100644 --- a/website/admin/data_health/views.py +++ b/website/admin/data_health/views.py @@ -53,13 +53,19 @@ def detail(request, check_slug): if check is None: raise Http404("Unknown data-health check.") # Flatten dict rows into cell lists aligned to the column order so the - # template can render them without a dynamic dict-lookup filter. + # template can render them without a dynamic dict-lookup filter. Each row + # also carries an optional (label, url) action link the check may provide. columns = check.columns - rows = [[row.get(col, '') for col in columns] for row in check.get_rows()] + rows = [ + {'cells': [row.get(col, '') for col in columns], + 'link': check.row_link(row)} + for row in check.get_rows() + ] context = _admin_context(request, title=check.title) context['check'] = check context['columns'] = columns context['rows'] = rows + context['has_links'] = any(row['link'] for row in rows) return render(request, 'admin/data_health/detail.html', context) diff --git a/website/admin/keyword_admin.py b/website/admin/keyword_admin.py index 09f41df3..d2a3365f 100644 --- a/website/admin/keyword_admin.py +++ b/website/admin/keyword_admin.py @@ -1,4 +1,7 @@ -from django.contrib import admin +from django.contrib import admin, messages +from django.contrib.admin.helpers import ACTION_CHECKBOX_NAME +from django.db import transaction +from django.shortcuts import render from website.models import (Keyword, Publication, Talk, Poster, Grant, Project, ProjectUmbrella) from website.admin.utils import related_count_subquery @@ -10,6 +13,13 @@ # Artifact, so it has no keywords.) KEYWORD_USERS = (Publication, Talk, Poster, Grant, Project, ProjectUmbrella) +# Reverse accessor on Keyword for each model in KEYWORD_USERS — the relations a +# merge must walk to reattach the target keyword before deleting the sources. +KEYWORD_REVERSE_ACCESSORS = ( + 'publication_set', 'talk_set', 'poster_set', + 'grant_set', 'project_set', 'projectumbrella_set', +) + class KeywordUsageFilter(admin.SimpleListFilter): """Filter keywords by whether anything references them — the core @@ -29,6 +39,32 @@ def queryset(self, request, queryset): return queryset +@transaction.atomic +def merge_keywords_into_target(target, sources): + """Reassign every reference from ``sources`` onto ``target``, then delete the + sources. Returns the number of source keywords removed. + + For each source keyword and each model that references keywords, every + tagged object gains the target via ``obj.keywords.add(target)`` — idempotent, + so an object already tagged with the target ends up with a single row, not a + duplicate. Deleting the source then drops its own M2M rows, so no explicit + ``remove()`` is needed. ``target`` is skipped if present in ``sources``. + + Runs in a transaction: a failure mid-merge rolls back rather than leaving the + taxonomy half-merged. + """ + removed = 0 + for source in sources: + if source.pk == target.pk: + continue + for accessor in KEYWORD_REVERSE_ACCESSORS: + for obj in getattr(source, accessor).all(): + obj.keywords.add(target) + source.delete() + removed += 1 + return removed + + @admin.register(Keyword, site=ml_admin_site) class KeywordAdmin(admin.ModelAdmin): list_display = ['keyword', 'project_count', 'publication_count', 'total_usage'] @@ -38,6 +74,69 @@ class KeywordAdmin(admin.ModelAdmin): search_fields = ['keyword'] ordering = ['keyword'] list_filter = (KeywordUsageFilter,) + actions = ['merge_keywords'] + + @admin.action(description='Merge selected keywords (dedup taxonomy)') + def merge_keywords(self, request, queryset): + """Merge two or more keywords into one chosen target (#1352). + + Two-step Django action: the first click shows an intermediate page to + pick which selected keyword to keep as the target; confirming reassigns + every reference onto it (across all six keyword-holding models) and + deletes the others. Destructive, so it always routes through the + confirmation page — it never merges on the first click. + """ + selected = list(queryset.order_by('keyword')) + if len(selected) < 2: + self.message_user( + request, + "Select two or more keywords to merge.", + level=messages.WARNING, + ) + return None + + if request.POST.get('confirm_merge'): + target = next((k for k in selected + if str(k.pk) == request.POST.get('target')), None) + if target is None: + self.message_user( + request, + "Pick which keyword to keep as the merge target.", + level=messages.WARNING, + ) + else: + sources = [k for k in selected if k.pk != target.pk] + source_names = ', '.join(f'"{k.keyword}"' for k in sources) + removed = merge_keywords_into_target(target, sources) + self.message_user( + request, + f'Merged {removed} keyword(s) ({source_names}) into ' + f'"{target.keyword}".', + level=messages.SUCCESS, + ) + return None # back to the changelist + + # First click (or a target wasn't chosen): show the confirmation page, + # annotating each candidate with its total usage to inform the choice. + candidates = [ + {'keyword': k, 'total_uses': sum( + getattr(k, accessor).count() + for accessor in KEYWORD_REVERSE_ACCESSORS)} + for k in selected + ] + context = { + **self.admin_site.each_context(request), + 'title': 'Merge keywords', + 'candidates': candidates, + 'selected_ids': [str(k.pk) for k in selected], + 'action_checkbox_name': ACTION_CHECKBOX_NAME, + 'opts': self.model._meta, + } + return render( + request, + 'admin/website/keyword/merge_keywords.html', + context, + ) def change_view(self, request, object_id, form_url='', extra_context=None): """Add projects and publications to the context. We then use this extra data in diff --git a/website/models/keyword.py b/website/models/keyword.py index 55827d27..0fe3e156 100644 --- a/website/models/keyword.py +++ b/website/models/keyword.py @@ -1,8 +1,26 @@ +import re + from django.db import models class Keyword(models.Model): keyword = models.CharField(max_length=255) + def save(self, *args, **kwargs): + """Normalize whitespace before saving so casual variants can't create + near-duplicate tags (#1352). Trims the ends and collapses internal runs + of whitespace to a single space, so "Speech ", " Speech", and + "Speech recognition" can't coexist with their clean forms. + + This is the partial, no-migration ward (layer 1): it catches every + creation path, including the inline "add keyword" widget on Publication/ + Project forms. Case-insensitive uniqueness (e.g. blocking "Speech" vs + "speech") is a separate DB constraint deferred until existing dupes are + merged — casing is intentionally preserved here (VR, HCI, iOS). + """ + if self.keyword: + self.keyword = re.sub(r'\s+', ' ', self.keyword).strip() + super().save(*args, **kwargs) + def get_project_count(self): """Returns the number of projects that use keyword""" return self.project_set.count() diff --git a/website/templates/admin/data_health/detail.html b/website/templates/admin/data_health/detail.html index b5ce0c78..34907ff8 100644 --- a/website/templates/admin/data_health/detail.html +++ b/website/templates/admin/data_health/detail.html @@ -23,12 +23,16 @@ {% for col in columns %}{{ col }}{% endfor %} + {% if has_links %}{% translate 'Action' %}{% endif %} {% for row in rows %} - {% for cell in row %}{{ cell }}{% endfor %} + {% for cell in row.cells %}{{ cell }}{% endfor %} + {% if has_links %} + {% if row.link %}{{ row.link.0 }}{% endif %} + {% endif %} {% endfor %} diff --git a/website/templates/admin/website/keyword/merge_keywords.html b/website/templates/admin/website/keyword/merge_keywords.html new file mode 100644 index 00000000..948fba48 --- /dev/null +++ b/website/templates/admin/website/keyword/merge_keywords.html @@ -0,0 +1,59 @@ +{% extends "admin/base_site.html" %} +{% load i18n %} + +{% block breadcrumbs %} + +{% endblock %} + +{% block content %} +

+ {% blocktranslate trimmed %} + Choose which keyword to keep. Every reference to the other + selected keywords (across publications, talks, posters, grants, projects, + and project umbrellas) will be reassigned to it, and the others will be + permanently deleted. This cannot be undone. + {% endblocktranslate %} +

+ +
+ {% csrf_token %} + + + {% for id in selected_ids %} + + {% endfor %} + +
+ + + + + + + + + + {% for candidate in candidates %} + + + + + + {% endfor %} + +
{% translate 'Keep as target' %}{% translate 'Keyword' %}{% translate 'Total uses' %}
+ + {{ candidate.total_uses }}
+
+ +
+ + {% translate 'Cancel' %} +
+
+{% endblock %} diff --git a/website/tests/test_keyword_merge.py b/website/tests/test_keyword_merge.py new file mode 100644 index 00000000..dac7026e --- /dev/null +++ b/website/tests/test_keyword_merge.py @@ -0,0 +1,206 @@ +""" +Tests for the destructive "Merge selected keywords" admin action and the +duplicate-keywords data-health check (#1352). + +Coverage: + * merge reassigns references across all six keyword-holding models, then + deletes the merged-away keywords; + * an object already tagged with the target gains no duplicate M2M row; + * a keyword passed as both source and target is skipped (no self-delete); + * the admin action is a no-op when fewer than two keywords are selected; + * the data-health check clusters case/whitespace variants and ignores + singletons. +""" + +from datetime import date + +from django.contrib.auth import get_user_model +from django.contrib.messages.storage.fallback import FallbackStorage +from django.test import RequestFactory +from django.urls import reverse + +from website.models import (Keyword, Publication, Talk, Poster, Grant, + Project, ProjectUmbrella, Sponsor) +from website.admin.admin_site import ml_admin_site +from website.admin.keyword_admin import KeywordAdmin, merge_keywords_into_target +from website.admin.data_health.checks.duplicate_keywords import DuplicateKeywordsCheck +from website.tests.base import DatabaseTestCase + + +class KeywordMergeHelperTests(DatabaseTestCase): + """Exercise the reusable merge function directly (no request plumbing).""" + + def _tag_one_object_per_model(self, keyword): + """Create one object of each of the six keyword-holding models, tag each + with ``keyword``, and return them so callers can assert reassignment.""" + sponsor = Sponsor.objects.create(name="NSF") + objects = { + 'publication': self.make_publication(title="P"), + 'talk': self.make_talk(title="T"), + 'poster': Poster.objects.create(title="Po", date=date(2024, 1, 1), + forum_name="CHI"), + 'project': self.make_project(name="Proj"), + 'project_umbrella': ProjectUmbrella.objects.create( + name="Umb", short_name="umb"), + 'grant': Grant.objects.create(title="G", date=date(2024, 1, 1), + forum_name="NSF", sponsor=sponsor), + } + for obj in objects.values(): + obj.keywords.add(keyword) + return objects + + def test_merge_reassigns_all_six_relations_and_deletes_source(self): + target = Keyword.objects.create(keyword="Speech") + source = Keyword.objects.create(keyword="speech") + objects = self._tag_one_object_per_model(source) + + removed = merge_keywords_into_target(target, [source]) + + self.assertEqual(removed, 1) + self.assertFalse(Keyword.objects.filter(pk=source.pk).exists()) + for name, obj in objects.items(): + keyword_ids = list(obj.keywords.values_list('pk', flat=True)) + self.assertEqual(keyword_ids, [target.pk], + f"{name} should now reference only the target keyword") + + def test_object_already_tagged_with_target_has_no_duplicate(self): + target = Keyword.objects.create(keyword="HCI") + source = Keyword.objects.create(keyword="hci") + pub = self.make_publication(title="Already tagged") + pub.keywords.add(target, source) # tagged with BOTH + + merge_keywords_into_target(target, [source]) + + # add() is idempotent, so the target appears exactly once — not twice. + self.assertEqual(list(pub.keywords.values_list('pk', flat=True)), + [target.pk]) + self.assertFalse(Keyword.objects.filter(pk=source.pk).exists()) + + def test_target_in_sources_is_skipped(self): + target = Keyword.objects.create(keyword="Robotics") + source = Keyword.objects.create(keyword="robotics") + + # Passing the target itself among the sources must not delete it. + removed = merge_keywords_into_target(target, [target, source]) + + self.assertEqual(removed, 1) + self.assertTrue(Keyword.objects.filter(pk=target.pk).exists()) + self.assertFalse(Keyword.objects.filter(pk=source.pk).exists()) + + +class KeywordMergeActionTests(DatabaseTestCase): + """Exercise the admin action wrapper (selection guards + confirm path).""" + + def setUp(self): + super().setUp() + self.admin = KeywordAdmin(Keyword, ml_admin_site) + self.factory = RequestFactory() + + def _request(self, data=None): + request = self.factory.post('/admin/website/keyword/', data or {}) + request.user = None + # Action handlers call message_user(), which needs a message store. + setattr(request, 'session', {}) + setattr(request, '_messages', FallbackStorage(request)) + return request + + def test_single_selection_is_a_noop(self): + only = Keyword.objects.create(keyword="solo") + request = self._request() + + result = self.admin.merge_keywords(request, Keyword.objects.filter(pk=only.pk)) + + self.assertIsNone(result) + self.assertTrue(Keyword.objects.filter(pk=only.pk).exists()) + + def test_confirm_post_merges_and_deletes(self): + target = Keyword.objects.create(keyword="Speech") + source = Keyword.objects.create(keyword="speech") + pub = self.make_publication(title="Confirmed merge") + pub.keywords.add(source) + + request = self._request({ + 'confirm_merge': 'yes', + 'target': str(target.pk), + }) + result = self.admin.merge_keywords( + request, Keyword.objects.filter(pk__in=[target.pk, source.pk])) + + self.assertIsNone(result) # returns to changelist + self.assertFalse(Keyword.objects.filter(pk=source.pk).exists()) + self.assertEqual(list(pub.keywords.values_list('pk', flat=True)), + [target.pk]) + + +class KeywordNormalizationTests(DatabaseTestCase): + """Layer-1 ward: Keyword.save() normalizes whitespace (#1352).""" + + def test_save_trims_and_collapses_whitespace(self): + kw = Keyword.objects.create(keyword=" Speech recognition ") + kw.refresh_from_db() + self.assertEqual(kw.keyword, "Speech recognition") + + def test_casing_is_preserved(self): + # Whitespace is normalized, but casing is intentionally left alone so + # acronyms like VR / HCI / iOS keep their intended form. + kw = Keyword.objects.create(keyword=" HCI ") + kw.refresh_from_db() + self.assertEqual(kw.keyword, "HCI") + + +class DuplicateKeywordsCheckTests(DatabaseTestCase): + """The finder must cluster case/whitespace variants and skip singletons.""" + + def test_clusters_case_variants_and_ignores_unique_keywords(self): + # Whitespace variants are now prevented at save (layer 1), so the + # finder's remaining job is case variants, which still coexist until the + # layer-2 case-insensitive constraint lands. + Keyword.objects.create(keyword="Speech") + Keyword.objects.create(keyword="speech") + Keyword.objects.create(keyword="Robotics") # unique → not flagged + + rows = DuplicateKeywordsCheck().get_rows() + + flagged = {r['keyword'] for r in rows} + self.assertEqual(flagged, {"Speech", "speech"}) + # Both share one normalized cluster key. + self.assertEqual({r['cluster_key'] for r in rows}, {"speech"}) + + def test_total_uses_counts_references(self): + kw_a = Keyword.objects.create(keyword="VR") + Keyword.objects.create(keyword="vr") # variant so the cluster qualifies + pub = self.make_publication(title="VR paper") + pub.keywords.add(kw_a) + + rows = {r['id']: r for r in DuplicateKeywordsCheck().get_rows()} + + self.assertEqual(rows[kw_a.pk]['publication_count'], 1) + self.assertEqual(rows[kw_a.pk]['total_uses'], 1) + + def test_row_link_points_to_filtered_keyword_changelist(self): + kw = Keyword.objects.create(keyword="Speech") + Keyword.objects.create(keyword="speech") # makes the cluster qualify + + rows = {r['id']: r for r in DuplicateKeywordsCheck().get_rows()} + label, url = DuplicateKeywordsCheck().row_link(rows[kw.pk]) + + self.assertIn('Merge', label) + # Deep-links to the Keyword changelist filtered to the folded cluster key. + self.assertIn('/admin/website/keyword/', url) + self.assertIn('q=speech', url) + + def test_detail_page_renders_deep_link(self): + # End-to-end: the data-health detail page renders the per-row "Merge in + # admin" deep link (exercises the template's optional-link branch). + Keyword.objects.create(keyword="Speech") + Keyword.objects.create(keyword="speech") + superuser = get_user_model().objects.create_superuser( + username="dh_admin", email="a@b.co", password="pw") + self.client.force_login(superuser) + + resp = self.client.get( + reverse("admin:data_health_detail", args=["duplicate-keywords"])) + + self.assertEqual(resp.status_code, 200) + self.assertContains(resp, "/admin/website/keyword/?q=speech") + self.assertContains(resp, "Merge in admin")