Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions website/admin/data_health/checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@
project_leadership,
position_integrity,
news_health,
duplicate_keywords,
)
98 changes: 98 additions & 0 deletions website/admin/data_health/checks/duplicate_keywords.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
"""
Data-health check: keywords that collapse to the same normalized text.

Keywords are free-text with no uniqueness constraint, so case/whitespace
variants coexist (e.g. ``Speech`` / ``speech`` / ``Speech ``) and fragment the
public keyword pages. This check *finds* those near-duplicate clusters; the
``Merge selected keywords`` action on the Keyword admin changelist (#1352) is
the tool to *fix* them. Strictly read-only.

One row is emitted per keyword in any cluster of two or more keywords that
share a normalized key (``.strip().casefold()``), with per-model usage counts
to support the merge decision (which variant to keep as the target).
"""

from collections import defaultdict
from urllib.parse import urlencode

from django.urls import reverse

from website.admin.data_health.registry import HealthCheck, register_check
from website.models import (Keyword, Publication, Talk, Poster, Grant,
Project, ProjectUmbrella)

# Reverse accessor on Keyword for each model that holds a `keywords` M2M.
# (Video is not an Artifact, so it has no keywords.) Keep this in lockstep with
# KEYWORD_USERS in website/admin/keyword_admin.py.
KEYWORD_REVERSE_ACCESSORS = (
('publication_count', 'publication_set'),
('talk_count', 'talk_set'),
('poster_count', 'poster_set'),
('grant_count', 'grant_set'),
('project_count', 'project_set'),
('project_umbrella_count', 'projectumbrella_set'),
)


def normalize_keyword(text):
"""Cluster key for near-duplicate detection: trim + case-fold.

casefold() is the aggressive, Unicode-aware lower() — so ``Speech``,
``speech``, and ``Speech `` all collapse to the same key.
"""
return (text or '').strip().casefold()


@register_check
class DuplicateKeywordsCheck(HealthCheck):
slug = 'duplicate-keywords'
title = 'Duplicate keywords (same normalized text)'
description = (
'Keywords that collapse to the same text after trimming whitespace and '
'folding case (e.g. "Speech" / "speech" / "Speech ") — candidates for '
'the "Merge selected keywords" action on the Keyword admin (#1352).'
)
group = 'Artifacts'
columns = [
'cluster_key', 'id', 'keyword',
'publication_count', 'talk_count', 'poster_count', 'grant_count',
'project_count', 'project_umbrella_count', 'total_uses',
]

def get_rows(self):
# Group every keyword by its normalized key.
clusters = defaultdict(list)
for kw in Keyword.objects.all():
clusters[normalize_keyword(kw.keyword)].append(kw)

rows = []
for key, members in clusters.items():
if len(members) < 2:
continue # only multi-keyword clusters are near-duplicates
for kw in members:
rows.append(self._row(key, kw))

# Stable, scannable ordering: cluster together, then by id.
rows.sort(key=lambda r: (r['cluster_key'], r['id']))
return rows

def row_link(self, row):
"""Deep-link each row to the Keyword changelist pre-filtered to its
cluster, so the editor lands on exactly the variants to select and run
the "Merge selected keywords" action on — wiring this finder to its
fixer. The admin search is icontains on ``keyword``, so the folded
cluster key matches every casing variant in the cluster.
"""
url = reverse('admin:website_keyword_changelist')
return ('Merge in admin →', f"{url}?{urlencode({'q': row['cluster_key']})}")

def _row(self, key, kw):
counts = {col: getattr(kw, accessor).count()
for col, accessor in KEYWORD_REVERSE_ACCESSORS}
return {
'cluster_key': key,
'id': kw.pk,
'keyword': kw.keyword,
**counts,
'total_uses': sum(counts.values()),
}
8 changes: 8 additions & 0 deletions website/admin/data_health/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,14 @@ def get_rows(self):
"""Return a list of row dicts (read-only). Override in subclasses."""
raise NotImplementedError

def row_link(self, row):
"""Optional: a ``(label, url)`` pair to act on ``row`` from the detail
page, or ``None``. Lets a check wire its read-only findings to a fixer
elsewhere in the admin (e.g. the keyword-merge changelist). Default: no
link. Only affects the on-screen table — the CSV export is unchanged.
"""
return None

def count(self):
"""Number of flagged rows. Override for a cheaper count if available."""
return len(self.get_rows())
Expand Down
10 changes: 8 additions & 2 deletions website/admin/data_health/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,19 @@ def detail(request, check_slug):
if check is None:
raise Http404("Unknown data-health check.")
# Flatten dict rows into cell lists aligned to the column order so the
# template can render them without a dynamic dict-lookup filter.
# template can render them without a dynamic dict-lookup filter. Each row
# also carries an optional (label, url) action link the check may provide.
columns = check.columns
rows = [[row.get(col, '') for col in columns] for row in check.get_rows()]
rows = [
{'cells': [row.get(col, '') for col in columns],
'link': check.row_link(row)}
for row in check.get_rows()
]
context = _admin_context(request, title=check.title)
context['check'] = check
context['columns'] = columns
context['rows'] = rows
context['has_links'] = any(row['link'] for row in rows)
return render(request, 'admin/data_health/detail.html', context)


Expand Down
101 changes: 100 additions & 1 deletion website/admin/keyword_admin.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from django.contrib import admin
from django.contrib import admin, messages
from django.contrib.admin.helpers import ACTION_CHECKBOX_NAME
from django.db import transaction
from django.shortcuts import render
from website.models import (Keyword, Publication, Talk, Poster, Grant,
Project, ProjectUmbrella)
from website.admin.utils import related_count_subquery
Expand All @@ -10,6 +13,13 @@
# Artifact, so it has no keywords.)
KEYWORD_USERS = (Publication, Talk, Poster, Grant, Project, ProjectUmbrella)

# Reverse accessor on Keyword for each model in KEYWORD_USERS — the relations a
# merge must walk to reattach the target keyword before deleting the sources.
KEYWORD_REVERSE_ACCESSORS = (
'publication_set', 'talk_set', 'poster_set',
'grant_set', 'project_set', 'projectumbrella_set',
)


class KeywordUsageFilter(admin.SimpleListFilter):
"""Filter keywords by whether anything references them — the core
Expand All @@ -29,6 +39,32 @@ def queryset(self, request, queryset):
return queryset


@transaction.atomic
def merge_keywords_into_target(target, sources):
"""Reassign every reference from ``sources`` onto ``target``, then delete the
sources. Returns the number of source keywords removed.

For each source keyword and each model that references keywords, every
tagged object gains the target via ``obj.keywords.add(target)`` — idempotent,
so an object already tagged with the target ends up with a single row, not a
duplicate. Deleting the source then drops its own M2M rows, so no explicit
``remove()`` is needed. ``target`` is skipped if present in ``sources``.

Runs in a transaction: a failure mid-merge rolls back rather than leaving the
taxonomy half-merged.
"""
removed = 0
for source in sources:
if source.pk == target.pk:
continue
for accessor in KEYWORD_REVERSE_ACCESSORS:
for obj in getattr(source, accessor).all():
obj.keywords.add(target)
source.delete()
removed += 1
return removed


@admin.register(Keyword, site=ml_admin_site)
class KeywordAdmin(admin.ModelAdmin):
list_display = ['keyword', 'project_count', 'publication_count', 'total_usage']
Expand All @@ -38,6 +74,69 @@ class KeywordAdmin(admin.ModelAdmin):
search_fields = ['keyword']
ordering = ['keyword']
list_filter = (KeywordUsageFilter,)
actions = ['merge_keywords']

@admin.action(description='Merge selected keywords (dedup taxonomy)')
def merge_keywords(self, request, queryset):
"""Merge two or more keywords into one chosen target (#1352).

Two-step Django action: the first click shows an intermediate page to
pick which selected keyword to keep as the target; confirming reassigns
every reference onto it (across all six keyword-holding models) and
deletes the others. Destructive, so it always routes through the
confirmation page — it never merges on the first click.
"""
selected = list(queryset.order_by('keyword'))
if len(selected) < 2:
self.message_user(
request,
"Select two or more keywords to merge.",
level=messages.WARNING,
)
return None

if request.POST.get('confirm_merge'):
target = next((k for k in selected
if str(k.pk) == request.POST.get('target')), None)
if target is None:
self.message_user(
request,
"Pick which keyword to keep as the merge target.",
level=messages.WARNING,
)
else:
sources = [k for k in selected if k.pk != target.pk]
source_names = ', '.join(f'"{k.keyword}"' for k in sources)
removed = merge_keywords_into_target(target, sources)
self.message_user(
request,
f'Merged {removed} keyword(s) ({source_names}) into '
f'"{target.keyword}".',
level=messages.SUCCESS,
)
return None # back to the changelist

# First click (or a target wasn't chosen): show the confirmation page,
# annotating each candidate with its total usage to inform the choice.
candidates = [
{'keyword': k, 'total_uses': sum(
getattr(k, accessor).count()
for accessor in KEYWORD_REVERSE_ACCESSORS)}
for k in selected
]
context = {
**self.admin_site.each_context(request),
'title': 'Merge keywords',
'candidates': candidates,
'selected_ids': [str(k.pk) for k in selected],
'action_checkbox_name': ACTION_CHECKBOX_NAME,
'opts': self.model._meta,
}
return render(
request,
'admin/website/keyword/merge_keywords.html',
context,
)

def change_view(self, request, object_id, form_url='', extra_context=None):
"""Add projects and publications to the context. We then use this extra data in
Expand Down
18 changes: 18 additions & 0 deletions website/models/keyword.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,26 @@
import re

from django.db import models

class Keyword(models.Model):
keyword = models.CharField(max_length=255)

def save(self, *args, **kwargs):
"""Normalize whitespace before saving so casual variants can't create
near-duplicate tags (#1352). Trims the ends and collapses internal runs
of whitespace to a single space, so "Speech ", " Speech", and
"Speech recognition" can't coexist with their clean forms.

This is the partial, no-migration ward (layer 1): it catches every
creation path, including the inline "add keyword" widget on Publication/
Project forms. Case-insensitive uniqueness (e.g. blocking "Speech" vs
"speech") is a separate DB constraint deferred until existing dupes are
merged — casing is intentionally preserved here (VR, HCI, iOS).
"""
if self.keyword:
self.keyword = re.sub(r'\s+', ' ', self.keyword).strip()
super().save(*args, **kwargs)

def get_project_count(self):
"""Returns the number of projects that use keyword"""
return self.project_set.count()
Expand Down
6 changes: 5 additions & 1 deletion website/templates/admin/data_health/detail.html
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,16 @@
<thead>
<tr>
{% for col in columns %}<th scope="col">{{ col }}</th>{% endfor %}
{% if has_links %}<th scope="col">{% translate 'Action' %}</th>{% endif %}
</tr>
</thead>
<tbody>
{% for row in rows %}
<tr class="{% cycle 'row1' 'row2' %}">
{% for cell in row %}<td>{{ cell }}</td>{% endfor %}
{% for cell in row.cells %}<td>{{ cell }}</td>{% endfor %}
{% if has_links %}
<td>{% if row.link %}<a class="button" href="{{ row.link.1 }}">{{ row.link.0 }}</a>{% endif %}</td>
{% endif %}
</tr>
{% endfor %}
</tbody>
Expand Down
59 changes: 59 additions & 0 deletions website/templates/admin/website/keyword/merge_keywords.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{% extends "admin/base_site.html" %}
{% load i18n %}

{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% translate 'Home' %}</a>
&rsaquo; <a href="{% url 'admin:website_keyword_changelist' %}">{% translate 'Keywords' %}</a>
&rsaquo; {% translate 'Merge keywords' %}
</div>
{% endblock %}

{% block content %}
<p>
{% blocktranslate trimmed %}
Choose which keyword to <strong>keep</strong>. Every reference to the other
selected keywords (across publications, talks, posters, grants, projects,
and project umbrellas) will be reassigned to it, and the others will be
<strong>permanently deleted</strong>. This cannot be undone.
{% endblocktranslate %}
</p>

<form method="post">
{% csrf_token %}
<input type="hidden" name="action" value="merge_keywords">
<input type="hidden" name="confirm_merge" value="yes">
{% for id in selected_ids %}
<input type="hidden" name="{{ action_checkbox_name }}" value="{{ id }}">
{% endfor %}

<fieldset class="module aligned">
<table id="result_list">
<thead>
<tr>
<th scope="col">{% translate 'Keep as target' %}</th>
<th scope="col">{% translate 'Keyword' %}</th>
<th scope="col">{% translate 'Total uses' %}</th>
</tr>
</thead>
<tbody>
{% for candidate in candidates %}
<tr class="{% cycle 'row1' 'row2' %}">
<td>
<input type="radio" name="target" id="target_{{ candidate.keyword.pk }}"
value="{{ candidate.keyword.pk }}"{% if forloop.first %} checked{% endif %}>
</td>
<td><label for="target_{{ candidate.keyword.pk }}">{{ candidate.keyword.keyword }}</label></td>
<td>{{ candidate.total_uses }}</td>
</tr>
{% endfor %}
</tbody>
</table>
</fieldset>

<div class="submit-row">
<input type="submit" class="default" value="{% translate 'Merge keywords' %}">
<a href="{% url 'admin:website_keyword_changelist' %}" class="button cancel-link">{% translate 'Cancel' %}</a>
</div>
</form>
{% endblock %}
Loading
Loading