From d13aeccf8cd1056f1d98388e78e5ecdc4de56cea Mon Sep 17 00:00:00 2001 From: "David G. Johnston" Date: Mon, 28 Apr 2025 11:45:13 -0700 Subject: [PATCH 1/2] Modernize setup and development runtime environment Thanks to Jelte for most of this. I basically just moved the SQL code to a migration and stood up a dev environment via make. The downgrading of parentid to null-able was required though I do not know why. I added a couple of views to make interactive "select *" doable. Added an upsert to ensure that a dev setup has a list. I reworked the text search configurations from what was in schema.sql. I couldn't figure out how to make the existing code work and for my development needs it wasn't important. --- .gitignore | 6 + Makefile | 10 ++ README.md | 77 +++++++++ dev_requirements.txt | 2 + django/archives/example_settings_local.py | 19 +++ .../migrations/0005_sync_with_loader.py | 131 ++++++++++++++ .../migrations/0006_alter_message_parentid.py | 18 ++ django/archives/mailarchives/models.py | 2 +- django/run_dev.py | 22 +++ django/uwsgi_dev.ini | 10 ++ loader/archives.ini.sample | 16 +- loader/lib/parser.py | 2 +- loader/sql/schema.sql | 160 ------------------ requirements.txt | 7 + 14 files changed, 312 insertions(+), 170 deletions(-) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 README.md create mode 100644 dev_requirements.txt create mode 100644 django/archives/example_settings_local.py create mode 100644 django/archives/mailarchives/migrations/0005_sync_with_loader.py create mode 100644 django/archives/mailarchives/migrations/0006_alter_message_parentid.py create mode 100755 django/run_dev.py create mode 100644 django/uwsgi_dev.ini delete mode 100644 loader/sql/schema.sql create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ae1b7d1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +# Ignore files in the Python virtual environment +/env/ + +# A convenient place to store 
downloaded mbox files from production +# to facilitate testing, and automation. +/mboxes/ diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..08cc20d --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +# A handy target to reset the development environment back to a clean slate +# and run the development server. +# XXX: For now just use the single mbox file that was previously downloaded. +# Additional work in this area, for testing use cases, is needed. +dev-rebuild-and-run: + dropdb --if-exists archives + createdb archives + django/manage.py migrate + loader/load_message.py --list pgsql-hackers --mbox mboxes/pgsql-hackers.202504 >/dev/null + cd ./django && ./run_dev.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..f5ae54a --- /dev/null +++ b/README.md @@ -0,0 +1,77 @@ +# PG archives + +This application manages PostgreSQL mailing list archives. However, the search +feature is implemented in pgweb. + +## The Application + +This is a Django 4.2 application backed by PostgreSQL and running on Python 3.x. + +## Getting Started + +### Ubuntu instructions + +First, prepare your development environment by installing python3, postgresql-server-dev-X.Y, formail and libtidy (use `--no-install-recommends` to avoid installing postfix): + +```bash +sudo apt install python3 postgresql-server-dev-14 procmail libtidy5deb1 --no-install-recommends +``` + +Next, configure your local environment with virtualenv and install local dependencies. + +```bash +python3 -m venv env +source env/bin/activate +pip install -r dev_requirements.txt +``` + +Create a database for the application: + +```bash +createdb archives +cd django +./manage.py migrate +# Creates pgsql-hackers list with ID 1 if open. +``` + +Create config for the loader scripts: + +```bash +cp loader/archives.ini.sample loader/archives.ini +``` + +Load some emails from the actual PostgreSQL archives by downloading an mbox +file from and running the +following command. 
NOTE: it's totally fine if some of the emails fail to +load. + +```bash +loader/load_message.py --list pgsql-hackers --mbox /path/to/downloaded/mbox/file +``` + +Then go to the `django` directory; that's where the actual web application is. + +```bash +cd django +``` + +Create a local settings file (feel free to edit it): + +```bash +cp archives/example_settings_local.py archives/settings_local.py +``` + +Finally, you're ready to start the web application: + +```bash +./run_dev.py +``` + +Or, download the April 2025 mbox file from the PostgreSQL archives and save it as `mboxes/pgsql-hackers.202504` in the repository root. +Then run: +```bash +make dev-rebuild-and-run +``` + +Then open http://127.0.0.1:8001/ to view your local mailing +list archives. diff --git a/dev_requirements.txt b/dev_requirements.txt new file mode 100644 index 0000000..3b878a3 --- /dev/null +++ b/dev_requirements.txt @@ -0,0 +1,2 @@ +-r requirements.txt +uwsgi diff --git a/django/archives/example_settings_local.py b/django/archives/example_settings_local.py new file mode 100644 index 0000000..4657b62 --- /dev/null +++ b/django/archives/example_settings_local.py @@ -0,0 +1,19 @@ +# Enable more debugging information +DEBUG = True +# Prevent logging from trying to send emails to postgresql.org admins. +# Use the default Django logging settings instead. 
+LOGGING = None + +DATABASES = { + "default": { + "ENGINE": "django.db.backends.postgresql_psycopg2", + "NAME": "archives", + "USER": "postgres", + "PASSWORD": "postgres", + "HOST": "0.0.0.0", + } +} + +# Allow API access to all clients +PUBLIC_ARCHIVES = True +ALLOWED_HOSTS = ["*"] diff --git a/django/archives/mailarchives/migrations/0005_sync_with_loader.py b/django/archives/mailarchives/migrations/0005_sync_with_loader.py new file mode 100644 index 0000000..c12bb52 --- /dev/null +++ b/django/archives/mailarchives/migrations/0005_sync_with_loader.py @@ -0,0 +1,131 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.18 on 2019-06-19 19:02 +from __future__ import unicode_literals + +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('mailarchives', '0004_resend_rate_limit'), + ] + + operations = [ + migrations.RunSQL( + """ +ALTER TABLE messages ADD COLUMN rawtxt bytea NOT NULL; +ALTER TABLE messages ADD COLUMN fti tsvector NOT NULL; +ALTER TABLE attachments ADD COLUMN attachment bytea NOT NULL; +CREATE TABLE loaderrors( + id SERIAL NOT NULL PRIMARY KEY, + listid int NOT NULL, + dat timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP, + msgid text NOT NULL, + srctype text NOT NULL, + src text NOT NULL, + err text NOT NULL +); +CREATE SEQUENCE threadid_seq; +CREATE TABLE list_months( + listid int NOT NULL REFERENCES lists(listid), + year int NOT NULL, + month int NOT NULL, + CONSTRAINT list_months_pk PRIMARY KEY (listid, year, month) +); +CREATE TABLE list_threads( + threadid int NOT NULL, /* comes from threadid_seq */ + listid int NOT NULL REFERENCES lists(listid), + CONSTRAINT pg_list_threads PRIMARY KEY (threadid, listid) +); +CREATE INDEX list_threads_listid_idx ON list_threads(listid); +CREATE TABLE unresolved_messages( + message int NOT NULL REFERENCES messages, + priority int NOT NULL, + msgid text NOT NULL, + CONSTRAINT 
unresolved_messages_pkey PRIMARY KEY (message, priority) +); +CREATE UNIQUE INDEX idx_unresolved_msgid_message ON unresolved_messages(msgid, message); + +/* A couple of convenience views that exclude the content fields. */ +CREATE VIEW messages_meta AS + SELECT + id, + parentid, + threadid, + _from, + _to, + cc, + subject, + date, + has_attachment, + hiddenstatus, + messageid + FROM messages; + +CREATE VIEW attachments_meta AS + SELECT + id, + message, + filename, + contenttype + FROM attachments; + +INSERT INTO listgroups (groupid, groupname, sortkey) VALUES (1, 'Developer lists', 1) + ON CONFLICT (groupid) DO NOTHING; + +INSERT INTO lists (listid, listname, shortdesc, description, active, groupid, subscriber_access) + VALUES (1, 'pgsql-hackers', 'pgsql-hackers', -- implicit concatentation below + 'The PostgreSQL developers team lives here. ' + 'Discussion of current development issues, problems and bugs, and proposed new features. ' + 'If your question cannot be answered by people in the other lists, ' + 'and it is likely that only a developer will know the answer, you may re-post your question in this list. 
' + 'You must try elsewhere first!', True, 1, True) + ON CONFLICT (listid) DO NOTHING; + +CREATE TEXT SEARCH CONFIGURATION pg (COPY=pg_catalog.english); + +/* +CREATE TEXT SEARCH DICTIONARY english_ispell ( + TEMPLATE = ispell, + DictFile = english, + AffFile = english, + StopWords = english +); +CREATE TEXT SEARCH DICTIONARY pg_dict ( + TEMPLATE = synonym, + SYNONYMS = pg_dict +); +CREATE TEXT SEARCH DICTIONARY pg_stop ( + TEMPLATE = simple, + StopWords = pg_dict +); +*/ +ALTER TEXT SEARCH CONFIGURATION pg + ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, + word, hword, hword_part + WITH english_stem; + +ALTER TEXT SEARCH CONFIGURATION pg + DROP MAPPING FOR email, url, url_path, sfloat, float; + +CREATE FUNCTION messages_fti_trigger_func() RETURNS trigger AS $$ +BEGIN + NEW.fti = setweight(to_tsvector('public.pg', coalesce(new.subject, '')), 'A') || + setweight(to_tsvector('public.pg', coalesce(new.bodytxt, '')), 'D'); + RETURN NEW; +END +$$ LANGUAGE 'plpgsql'; + +CREATE TRIGGER messages_fti_trigger + BEFORE INSERT OR UPDATE OF subject, bodytxt ON messages + FOR EACH ROW EXECUTE PROCEDURE messages_fti_trigger_func(); + +CREATE INDEX messages_fti_idx ON messages USING gin(fti); + + """, + ), + + ] diff --git a/django/archives/mailarchives/migrations/0006_alter_message_parentid.py b/django/archives/mailarchives/migrations/0006_alter_message_parentid.py new file mode 100644 index 0000000..16d56bc --- /dev/null +++ b/django/archives/mailarchives/migrations/0006_alter_message_parentid.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.20 on 2025-04-28 18:37 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('mailarchives', '0005_sync_with_loader'), + ] + + operations = [ + migrations.AlterField( + model_name='message', + name='parentid', + field=models.IntegerField(blank=True, null=True), + ), + ] diff --git a/django/archives/mailarchives/models.py b/django/archives/mailarchives/models.py index 
44c4469..4f75acc 100644 --- a/django/archives/mailarchives/models.py +++ b/django/archives/mailarchives/models.py @@ -25,7 +25,7 @@ class Message(models.Model): messageid = models.TextField(null=False) bodytxt = models.TextField(null=False) # rawtxt is a bytea field, which django doesn't support (easily) - parentid = models.IntegerField(null=False, blank=False) + parentid = models.IntegerField(null=True, blank=True) has_attachment = models.BooleanField(null=False, default=False) hiddenstatus = models.IntegerField(null=True) # fti is a tsvector field, which django doesn't support (easily) diff --git a/django/run_dev.py b/django/run_dev.py new file mode 100755 index 0000000..268cd83 --- /dev/null +++ b/django/run_dev.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +from importlib.machinery import PathFinder +import subprocess +import sys + +django_path = PathFinder().find_spec("django").submodule_search_locations[0] + +django_admin_path = django_path + "/contrib/admin/static/admin" + +if len(sys.argv) > 1: + ini_file = sys.argv[1] +else: + ini_file = "uwsgi_dev.ini" + +subprocess.run( + [ + "uwsgi", + "--static-map", + f"/static/admin={django_path}/contrib/admin/static/admin", + ini_file, + ] +) diff --git a/django/uwsgi_dev.ini b/django/uwsgi_dev.ini new file mode 100644 index 0000000..9ab26b2 --- /dev/null +++ b/django/uwsgi_dev.ini @@ -0,0 +1,10 @@ +[uwsgi] +threads=1 +env=DJANGO_SETTINGS_MODULE=archives.settings +module=archives.wsgi:application +py-autoreload=1 +touch-reload = archives/settings.py +touch-reload = archives/settings_local.py +touch-reload = uwsgi_dev.ini +http=127.0.0.1:8001 +static-map=/media-archives=media diff --git a/loader/archives.ini.sample b/loader/archives.ini.sample index a146f69..1c3c6e2 100644 --- a/loader/archives.ini.sample +++ b/loader/archives.ini.sample @@ -2,16 +2,16 @@ connstr=dbname=archives [varnish] -purgeurl=https://wrigleys.postgresql.org/api/varnish/purge/ +#purgeurl=https://wrigleys.postgresql.org/api/varnish/purge/ [smtp] 
-server=localhost:9911 -heloname=localhost -resender=noreply@example.com +#server=localhost:9911 +#heloname=localhost +#resender=noreply@example.com [pglister] # synchronize subscribers between pgarchives and pglister -subscribers=0 -root=/path/to/pglister -myname=pgarchives -apikey=CHANGEME +#subscribers=0 +#root=/path/to/pglister +#myname=pgarchives +#apikey=CHANGEME diff --git a/loader/lib/parser.py b/loader/lib/parser.py index 027ed53..e81b193 100644 --- a/loader/lib/parser.py +++ b/loader/lib/parser.py @@ -62,7 +62,7 @@ def _extract_date(d): self.date = lowdate # Else we're going to go with what we found self.bodytxt = self.get_body() - self.attachments = [] + self.attachments = [] # (filename, contenttype, payload) self.get_attachments() if len(self.attachments) > 0: log.status("Found %s attachments" % len(self.attachments)) diff --git a/loader/sql/schema.sql b/loader/sql/schema.sql deleted file mode 100644 index be735d9..0000000 --- a/loader/sql/schema.sql +++ /dev/null @@ -1,160 +0,0 @@ -\set ON_ERROR_STOP on - -BEGIN; - -CREATE TABLE messages ( - id SERIAL NOT NULL PRIMARY KEY, - parentid int REFERENCES messages, - threadid int NOT NULL, - _from text NOT NULL, - _to text NOT NULL, - cc text NOT NULL, - subject text NOT NULL, - date timestamptz NOT NULL, - loaddate timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP, - has_attachment boolean NOT NULL, - hiddenstatus int NULL, - messageid text NOT NULL, - bodytxt text NOT NULL, - rawtxt bytea NOT NULL, - fti tsvector NOT NULL -); -CREATE INDEX idx_messages_threadid ON messages(threadid); -CREATE UNIQUE INDEX idx_messages_msgid ON messages(messageid); -CREATE INDEX idx_messages_date ON messages(date); -CREATE INDEX idx_messages_parentid ON messages(parentid); - -CREATE TABLE message_hide_reasons ( - message int NOT NULL PRIMARY KEY REFERENCES messages, - dt timestamptz, - reason text, - by text -); - -CREATE SEQUENCE threadid_seq; - -CREATE TABLE unresolved_messages( - message int NOT NULL REFERENCES messages, - 
priority int NOT NULL, - msgid text NOT NULL, - CONSTRAINT unresolved_messages_pkey PRIMARY KEY (message, priority) -); - -CREATE UNIQUE INDEX idx_unresolved_msgid_message ON unresolved_messages(msgid, message); - -CREATE TABLE listgroups( - groupid int NOT NULL PRIMARY KEY, - groupname text NOT NULL UNIQUE, - sortkey int NOT NULL -); - -CREATE TABLE lists( - listid int NOT NULL PRIMARY KEY, - listname text NOT NULL UNIQUE, - shortdesc text NOT NULL, - description text NOT NULL, - active boolean NOT NULL, - groupid int NOT NULL REFERENCES listgroups(groupid) -); - -CREATE TABLE list_months( - listid int NOT NULL REFERENCES lists(listid), - year int NOT NULL, - month int NOT NULL, - CONSTRAINT list_months_pk PRIMARY KEY (listid, year, month) -); - -CREATE TABLE list_threads( - threadid int NOT NULL, /* comes from threadid_seq */ - listid int NOT NULL REFERENCES lists(listid), - CONSTRAINT pg_list_threads PRIMARY KEY (threadid, listid) -); -CREATE INDEX list_threads_listid_idx ON list_threads(listid); - -CREATE TABLE attachments( - id serial not null primary key, - message int not null references messages(id), - filename text not null, - contenttype text not null, - attachment bytea not null -); -CREATE INDEX idx_attachments_msg ON attachments(message); - -CREATE TABLE loaderrors( - id SERIAL NOT NULL PRIMARY KEY, - listid int NOT NULL, - dat timestamptz NOT NULL DEFAULT CURRENT_TIMESTAMP, - msgid text NOT NULL, - srctype text NOT NULL, - src text NOT NULL, - err text NOT NULL -); - -/* textsearch configs */ -CREATE TEXT SEARCH CONFIGURATION pg (PARSER=tsparser); - -CREATE TEXT SEARCH DICTIONARY english_ispell ( - TEMPLATE = ispell, - DictFile = en_us, - AffFile = en_us, - StopWords = english -); -CREATE TEXT SEARCH DICTIONARY pg_dict ( - TEMPLATE = synonym, - SYNONYMS = pg_dict -); -CREATE TEXT SEARCH DICTIONARY pg_stop ( - TEMPLATE = simple, - StopWords = pg_dict -); -ALTER TEXT SEARCH CONFIGURATION pg - ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, - 
word, hword, hword_part - WITH pg_stop, pg_dict, english_ispell, english_stem; -ALTER TEXT SEARCH CONFIGURATION pg - DROP MAPPING FOR email, url, url_path, sfloat, float; - -CREATE FUNCTION messages_fti_trigger_func() RETURNS trigger AS $$ -BEGIN - NEW.fti = setweight(to_tsvector('public.pg', coalesce(new.subject, '')), 'A') || - setweight(to_tsvector('public.pg', coalesce(new.bodytxt, '')), 'D'); - RETURN NEW; -END -$$ LANGUAGE 'plpgsql'; - -CREATE TRIGGER messages_fti_trigger - BEFORE INSERT OR UPDATE OF subject, bodytxt ON messages - FOR EACH ROW EXECUTE PROCEDURE messages_fti_trigger_func(); -CREATE INDEX messages_fti_idx ON messages USING gin(fti); - -CREATE TABLE legacymap( - listid int not null, - year int not null, - month int not null, - msgnum int not null, - msgid text not null, -CONSTRAINT legacymap_pk PRIMARY KEY (listid, year, month, msgnum) -); - -/* Simple API for hiding messages */ -CREATE OR REPLACE FUNCTION hide_message(msgid_txt text, reason_code integer, user_txt text, reason_txt text) - RETURNS integer AS -$BODY$ -DECLARE - returned_id integer; -BEGIN - UPDATE messages SET hiddenstatus = reason_code WHERE messageid = msgid_txt RETURNING id INTO returned_id; - - IF NOT FOUND THEN - RAISE EXCEPTION 'The specified message (%) could not be found.', msgid_txt; - END IF; - - INSERT INTO message_hide_reasons (message, dt, reason, by) VALUES (returned_id, now(), reason_txt, user_txt); - - RETURN returned_id; -END; -$BODY$ - LANGUAGE plpgsql VOLATILE - COST 100; - -\echo Dont forget to commit! diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0064172 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +django>=4.2,<4.3 +psycopg2 +requests +pycryptodome +pycryptodomex +python-dateutil +pytidylib From cb8ff168e486a0423c8a24185d3e6927931ff74d Mon Sep 17 00:00:00 2001 From: "David G. 
Johnston" Date: Mon, 28 Apr 2025 13:12:20 -0700 Subject: [PATCH 2/2] Implement Advanced Searching Features and CFApp Integration Note: None of this is fully fleshed out. Mainly because it isn't my primary focus at the moment; I just needed to do enough to allow this to interoperate with CFApp. This patch removes all reliance on pgweb to search and view the archive. It should be fairly simple to incorporate the relevant changes into pgweb if desired but ideally this application should provide both the archive and the user interface. The present reliance on pgweb to generate string html and write it out using {{html | safe}} is largely abolished in favor of proper html element usage - divs and spans with classes. Note in particular the abstract_found and abstract_after reworking. The added __str__ overrides on lists and groups allow the drop-down selection list to show the proper labels. The related work in views.py is basically to continue to handle both the pgweb/simple search interface as well as this new advanced search interface. The search implementation has been factored out and the new search view added for local search - leaving archive-search for pgweb. A while back I made a feature request to limit search results to a single message per thread. I've implemented an initial version of that here. The advanced search page re-implemented here adds a checkbox to enable this mode; and the results show both the thread id and ranking within the thread, whether "oneperthread" is checked or not. This patch also introduces a new function named is_patch. This uses the same exact criteria that CFBot uses. The thinking here is that knowing which messages and threads contain patches is useful context to use while searching or within search results. The immediate use for this information is the newly added threads view. Presently coded for demonstration purposes only, this view presents the most recent threads that contain patches. 
With this information it is possible, and demonstrated, to actually push all of the needed information from pgarchives to pgcommitfest to create a new patch, with reasonable defaults. For the moment it is created using CFBot as the creator but, given our single sign-on authentication, filtering the thread results on user and creating the patch with them as author should be doable. Content-Type now gets passed around in the attachment metadata. --- django/archives/example_settings_local.py | 2 + django/archives/mailarchives/api.py | 171 ++++++++++- .../migrations/0007_add_is_patch_function.py | 24 ++ django/archives/mailarchives/models.py | 6 + .../templates/advancedsearch.html | 81 ++++++ .../mailarchives/templates/threads.html | 117 ++++++++ django/archives/mailarchives/views.py | 266 ++++++++++++++---- django/archives/urls.py | 5 + 8 files changed, 609 insertions(+), 63 deletions(-) create mode 100644 django/archives/mailarchives/migrations/0007_add_is_patch_function.py create mode 100644 django/archives/mailarchives/templates/advancedsearch.html create mode 100644 django/archives/mailarchives/templates/threads.html diff --git a/django/archives/example_settings_local.py b/django/archives/example_settings_local.py index 4657b62..6ff29fe 100644 --- a/django/archives/example_settings_local.py +++ b/django/archives/example_settings_local.py @@ -17,3 +17,5 @@ # Allow API access to all clients PUBLIC_ARCHIVES = True ALLOWED_HOSTS = ["*"] + +PGWEB_ADDRESS = 'http://localhost:8001' diff --git a/django/archives/mailarchives/api.py b/django/archives/mailarchives/api.py index a6b2536..135fc0c 100644 --- a/django/archives/mailarchives/api.py +++ b/django/archives/mailarchives/api.py @@ -1,13 +1,15 @@ from django.http import HttpResponse, HttpResponseForbidden from django.shortcuts import get_object_or_404 from django.conf import settings +from django.db import connection import ipaddress from .views import cache from .models import Message, List import json - +import requests 
+from django.http import JsonResponse def is_host_allowed(request): for ip_range in settings.API_CLIENTS: @@ -117,9 +119,174 @@ def thread(request, msgid): 'date': m.date.isoformat(), 'from': m.mailfrom, 'subj': m.subject, - 'atts': [{'id': a.id, 'name': a.filename} for a in m.attachment_set.all()], + 'atts': [{'id': a.id, 'name': a.filename, 'is_patch': a.is_patch, 'content_type': a.contenttype} + for a in m.attachment_set.extra(select={'is_patch': 'attachments.is_patch'}).all()], } for m in mlist], resp) if settings.PUBLIC_ARCHIVES: resp['xkey'] = 'pgat_{0}'.format(msg.threadid) return resp + +def threads_with_patches(request): + if not settings.PUBLIC_ARCHIVES: + return HttpResponseForbidden('No API access on private archives for now') + + with connection.cursor() as cursor: + cursor.execute("""-- Find threads with patches + select * + from ( + select distinct on (threadid) + pm.threadid, + pm.id, + pm._from, + pm.subject, + pm.messageid, + ma.patch_count, + tm.subject AS thread_subject, + pm.date AS patch_date, + tm.date AS thread_date, + tm.messageid AS thread_messageid + from messages AS pm --patch message + -- threadid is a shared value but not a foreign key to anything + -- in particular, it is not a self-join of messages + join lateral ( + select * + from messages as im + where im.threadid = pm.threadid + order by im.date asc + limit 1 + ) AS tm on true --thread message is first known message + join lateral ( + select count(*) as patch_count + from attachments + where pm.id = attachments.message and is_patch(attachments) + ) as ma on true + where pm.has_attachment and ma.patch_count > 0 and pm.hiddenstatus is null + order by pm.threadid, pm.date desc + ) as threads_with_patches + order by patch_date DESC + limit 10; + """) + rows = cursor.fetchall() + + # Convert the SQL result into thread_list + thread_list = [ + { + "thread_id": str(row[0]), + "message_id": row[1], + "file_count": row[5], + "file_version": None, + "commit_sha": None, + "patch_id": None, 
+ "subject_line": row[3], + "thread_subject": row[6], + "sender": row[2], + "id": row[1], + "patch_date": row[7].strftime('%Y-%m-%d %H:%M:%S') if row[7] else None, + "thread_date": row[8].strftime('%Y-%m-%d %H:%M:%S') if row[8] else None, + "message_code": row[4], + "thread_code": row[9] + } + for row in rows + ] + + resp = HttpResponse(content_type='application/json') + json.dump(thread_list, resp) + + return resp + +def get_patch_data_as_json(threadid, messageid): + with connection.cursor() as cursor: + cursor.execute("""-- Find threads with patches + select + pm.threadid, + pm.id, + tm.messageid as thread_messageid, + mrm.mostrecent_messageid, + pm.messageid as patch_messageid, + ma.fileset, + pm._from as patch_from_author, + tm.date as thread_messagedate, + mrm.mostrecent_messagedate, + pm.date as patch_messagedate, + tm.subject as thread_subject_line, + mrm.most_recent_subject_line, + mrm.most_recent_from_author, + tm._from as thread_from_author + from messages AS pm --patch message + join lateral ( + select * + from messages as im + where im.threadid = pm.threadid + order by im.date asc + limit 1 + ) AS tm on true --thread message is first known message + join lateral ( + select + id as mostrecent_id, + messageid as mostrecent_messageid, + date as mostrecent_messagedate, + subject as most_recent_subject_line, + _from as most_recent_from_author + from messages + where threadid = pm.threadid + order by date desc limit 1 + ) as mrm on true + join lateral ( + select jsonb_agg( + jsonb_build_object( + 'attachment_id', a.id, + 'filename', a.filename, + 'content_type', a.contenttype, + 'is_patch', is_patch(a) + ) order by a.filename) as fileset + from attachments as a + where pm.id = a.message + ) as ma on true + where pm.id = %s; + """, + (messageid,)) + row = cursor.fetchone() + + # Convert the SQL result into patch_data + patch_data = { + "thread_id": row[0], + "message_id": row[1], + "thread_message_id": row[2], + "most_recent_message_id": row[3], + 
"patch_message_id": row[4], + "patch_from_author": row[6], + "fileset": json.loads(row[5]) if row[5] else [], + "thread_message_date": row[7].isoformat() if row[7] else None, + "most_recent_message_date": row[8].isoformat() if row[8] else None, + "patch_message_date": row[9].isoformat() if row[9] else None, + "thread_subject_line": row[10], + "most_recent_subject_line": row[11], + "most_recent_from_author": row[12], + "thread_from_author": row[13], + } + + return json.dumps(patch_data) + +def create_cfapp_patch(request): + if not settings.PUBLIC_ARCHIVES: + return HttpResponseForbidden('No API access on private archives for now') + + if request.method != 'POST': + return JsonResponse({'error': 'Invalid request method'}, status=405) + + body_string = request.body.decode("utf-8") + body_json = json.loads(body_string) + + try: + # Forward the request body to the external service + response = requests.post( + 'http://localhost:8007/api/test/cfapp/create_patch', + headers={'Content-Type': 'application/json'}, + data=get_patch_data_as_json(body_json["thread_id"], body_json["message_id"]), + ) + + # Return the response from the external service + return JsonResponse(response.json(), status=response.status_code) + except requests.RequestException as e: + return JsonResponse({'error': f'Failed to proxy request: {str(e)}'}, status=500) diff --git a/django/archives/mailarchives/migrations/0007_add_is_patch_function.py b/django/archives/mailarchives/migrations/0007_add_is_patch_function.py new file mode 100644 index 0000000..1420926 --- /dev/null +++ b/django/archives/mailarchives/migrations/0007_add_is_patch_function.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11.18 on 2019-06-19 19:02 +from __future__ import unicode_literals + +from django.db import migrations + +class Migration(migrations.Migration): + + dependencies = [ + ('mailarchives', '0006_alter_message_parentid'), + ] + + operations = [ + migrations.RunSQL( + """ +CREATE FUNCTION 
is_patch(att attachments) RETURNS boolean LANGUAGE sql IMMUTABLE STRICT + RETURN (att).filename ~ '\.(diff|diff\.gz|patch|patch\.gz|tar\.gz|tgz|tar\.bz2|zip)$'; +; + """, + reverse_sql=""" +DROP FUNCTION is_patch(att attachments); + """, + ), + ] diff --git a/django/archives/mailarchives/models.py b/django/archives/mailarchives/models.py index 4f75acc..c0ff6ea 100644 --- a/django/archives/mailarchives/models.py +++ b/django/archives/mailarchives/models.py @@ -74,6 +74,9 @@ class ListGroup(models.Model): groupname = models.CharField(max_length=200, null=False, blank=False) sortkey = models.IntegerField(null=False) + def __str__(self): + return self.groupname + class Meta: db_table = 'listgroups' @@ -93,6 +96,9 @@ def maybe_shortdesc(self): return self.shortdesc return self.listname + def __str__(self): + return self.listname + class Meta: db_table = 'lists' diff --git a/django/archives/mailarchives/templates/advancedsearch.html b/django/archives/mailarchives/templates/advancedsearch.html new file mode 100644 index 0000000..7f9d9b1 --- /dev/null +++ b/django/archives/mailarchives/templates/advancedsearch.html @@ -0,0 +1,81 @@ +{%extends "page.html"%} +{%block title%}PostgreSQL Mailing Lists Search{%endblock%} + +{%block contents%} +

PostgreSQL Mailing Lists Search

+ +
+
+
+ +
+ + + + +
+
+
Examples 2025-04 (early)
+
CANqtF-pgb87qQr94rMeWKsAa2JGBw9Ygo_wH2bzvVZpi4Mnaig@mail.gmail.com
+
+
+ + +
+
+ + +
+
+ + +
+
+ +
+
+
+
+ +{%if search_error %} +
{{search_error}}
+{%elif query == '' %} +
Click the magnifying glass on the query box to search.
+{%else%} + + {%if hitcount == 0 %} +

Your search for {{query}} returned no hits.

+ {%else%} +

Results {{firsthit}}-{{lasthit}} of {%if hitcount == 1000%}more than 1000{%else%}{{hitcount}}{%endif%}.

+ {%if pagelinks %}Result pages: {{pagelinks|safe}}

{%endif%} + {%for hit in hits %} +
+
{{forloop.counter0|add:firsthit}}. {{hit.subject}} [{{hit.rank|floatformat:2}}]
+
From {{hit.author}} on {{hit.date}}.
+
Thread# {{hit.threadid}} Rank: {{hit.thread_rank}}
+
{{hit.abstract_found}} {{hit.abstract_after}}
+ +
+ {%endfor%} + {%if pagelinks %}Result pages: {{pagelinks|safe}}

{%endif%} + {%endif%} +{%endif%} + +{%endblock%} diff --git a/django/archives/mailarchives/templates/threads.html b/django/archives/mailarchives/templates/threads.html new file mode 100644 index 0000000..9def39c --- /dev/null +++ b/django/archives/mailarchives/templates/threads.html @@ -0,0 +1,117 @@ +{%extends "page.html"%} +{%block title%}PostgreSQL Mailing List Archives{%endblock%} +{%load pgfilters%} +{%block contents%} +

Thread Viewer

+ + + + + + + + + + + + + + + + + + + + + + +
Subject LineSenderThread SubjectFile CountFile VersionCommit SHAPatch IDPatch DateThread DateIDAction
+
+
+
+{%endblock%}
diff --git a/django/archives/mailarchives/views.py b/django/archives/mailarchives/views.py
index 69172bd..647f561 100644
--- a/django/archives/mailarchives/views.py
+++ b/django/archives/mailarchives/views.py
@@ -706,67 +706,40 @@ def resend_complete(request, messageid):
     })
 
 
-@csrf_exempt
-def search(request):
-    if not settings.PUBLIC_ARCHIVES:
-        # We don't support searching of non-public archives at all at this point.
-        # XXX: room for future improvement
-        return HttpResponseForbidden('Not public archives')
 
-    # Only certain hosts are allowed to call the search API
-    allowed = False
-    for ip_range in settings.SEARCH_CLIENTS:
-        if ipaddress.ip_address(request.META['REMOTE_ADDR']) in ipaddress.ip_network(ip_range):
-            allowed = True
-            break
-    if not allowed:
-        return HttpResponseForbidden('Invalid host')
+def perform_search(query, datecode, sortcode, oneperthread=False, listid=None, listnames=None, streamer=None):
+    """Search the archives for messages matching query.
+
+    With a streamer, the result is written to it as JSON and True is
+    returned (False, with an empty JSON list, when query is empty).
+    Without one, a list of result dicts is returned. A query containing
+    '@' that matches exactly one row yields only a 'messageidmatch' entry.
+    """
+    if not query and not streamer:
+        return []
 
-    curs = connection.cursor()
+    if not query and streamer:
+        json.dump([], streamer)  # keep the response body valid JSON
+        return False
 
-    # Perform a search of the archives and return a JSON document.
-    # Expects the following (optional) POST parameters:
-    # q = query to search for
-    # ln = comma separate list of listnames to search in
-    # d = number of days back to search for, or -1 (or not specified)
-    #     to search the full archives
-    # s = sort results by ['r'=rank, 'd'=date, 'i'=inverse date]
-    if not request.method == 'POST':
-        raise Http404('I only respond to POST')
+    if listid and listnames:
+        raise Exception("Cannot specify both listid and listname")
 
-    if 'q' not in request.POST:
-        raise Http404('No search query specified')
-    query = request.POST['q']
+    curs = connection.cursor()
 
-    if 'ln' in request.POST:
+    lists = None
+    if listnames:
         try:
             curs.execute("SELECT listid FROM lists WHERE listname=ANY(%(names)s)", {
-                'names': request.POST['ln'].split(','),
+                'names': listnames.split(','),
             })
             lists = [x for x, in curs.fetchall()]
         except Exception:
             # If failing to parse list of lists, just search all
             lists = None
-    else:
-        lists = None
 
-    if 'd' in request.POST:
-        days = int(request.POST['d'])
-        if days < 1 or days > 365:
-            firstdate = None
-        else:
-            firstdate = datetime.now() - timedelta(days=days)
-    else:
-        firstdate = None
-
-    if 's' in request.POST:
-        list_sort = request.POST['s']
-        if list_sort not in ('d', 'r', 'i'):
-            list_stort = 'r'
-    else:
-        list_sort = 'r'
-
-    # Ok, we have all we need to do the search
+    if listid:
+        lists = [listid]
 
     if query.find('@') > 0:
         cleaned_id = query.strip().removeprefix('<').removesuffix('>')
@@ -778,15 +751,59 @@ def search(request):
         })
         a = curs.fetchall()
         if len(a) == 1:
-            # Yup, this was a messageid
-            resp = HttpResponse(content_type='application/json')
-
-            json.dump({'messageidmatch': 1}, resp)
-            return resp
+            if streamer:
+                json.dump({'messageidmatch': 1}, streamer)
+                return True  # response is complete; do not also run the search
+            return [{'messageidmatch': cleaned_id}]
         # If not found, fall through to a regular search
 
+    firstdate = None
+    if datecode:
+        days = int(datecode)
+        if days >= 1 and days <= 365:
+            firstdate = datetime.now() - timedelta(days=days)
+
+    list_sort = 'i'
+    if sortcode:
+        if sortcode in ('d', 'r', 'i'):
+            list_sort = sortcode
+
     curs.execute("SET gin_fuzzy_search_limit=10000")
-    qstr = "SELECT messageid, date, subject, _from, ts_rank_cd(fti, plainto_tsquery('public.pg', %(q)s)), ts_headline(bodytxt, plainto_tsquery('public.pg', %(q)s),'StartSel=\"[[[[[[\",StopSel=\"]]]]]]\"') FROM messages m WHERE fti @@ plainto_tsquery('public.pg', %(q)s)"
+
+    qstr = """-- Search for messages matching query --
+SELECT * FROM (
+SELECT
+  *,
+"""
+    qstr += " row_number() over (partition by threadid order by "
+    if list_sort == 'r':
+        qstr += "ts_rank_cd DESC"
+    elif list_sort == 'd':
+        qstr += "date DESC"
+    else:
+        qstr += "date ASC"
+    qstr += ") AS thread_rank"
+
+    qstr +="""
+FROM
+(
+  SELECT
+    messageid,
+    threadid,
+    date,
+    subject,
+    _from,
+    ts_rank_cd(fti, plainto_tsquery('public.pg', %(q)s)),
+    ts_headline(
+      bodytxt,
+      plainto_tsquery('public.pg', %(q)s),
+      'StartSel=\"[[[[[[\",
+       StopSel=\"]]]]]]\"'
+    )
+  FROM messages m
+  WHERE fti @@ plainto_tsquery('public.pg', %(q)s)
+"""
+
     params = {
         'q': query,
     }
@@ -796,18 +813,22 @@ def search(request):
     if firstdate:
         qstr += " AND m.date > %(date)s"
         params['date'] = firstdate
+
+    qstr += ") AS finding ) AS ranking"
+
+    if oneperthread:
+        qstr += " WHERE thread_rank = 1"
+
     if list_sort == 'r':
-        qstr += " ORDER BY ts_rank_cd(fti, plainto_tsquery(%(q)s)) DESC LIMIT 1000"
+        qstr += " ORDER BY ts_rank_cd DESC LIMIT 1000"
     elif list_sort == 'd':
         qstr += " ORDER BY date DESC LIMIT 1000"
     else:
         qstr += " ORDER BY date ASC LIMIT 1000"
 
     curs.execute(qstr, params)
-
-    resp = HttpResponse(content_type='application/json')
-
-    json.dump([
+    if streamer:
+        json.dump([
             {
                 'm': messageid,
                 'd': date.isoformat(),
@@ -815,10 +836,141 @@ def search(request):
                 'f': mailfrom,
                 'r': rank,
                 'a': abstract.replace("[[[[[[", "").replace("]]]]]]", ""),
-            } for messageid, date, subject, mailfrom, rank, abstract in curs.fetchall()],
-        resp)
+            } for messageid, threadid, date, subject, mailfrom, rank, abstract, thread_rank in curs.fetchall()],
+            streamer)
+        return True
+    else:
+        return [
+            {
+                'm': messageid,
+                't': threadid,
+                'tr': thread_rank,
+                'd': date.isoformat(),
+                's': subject,
+                'f': mailfrom,
+                'r': rank,
+                'a': abstract.replace("[[[[[[", "").replace("]]]]]]", ""),
+                'a_found': abstract[abstract.find("[[[[[[") + 6:abstract.find("]]]]]]")],
+                'a_after': abstract.replace(abstract[abstract.find("[[[[[["):abstract.find("]]]]]]") + 6], "").replace("[[[[[[", "").replace("]]]]]]", ""),
+            } for messageid, threadid, date, subject, mailfrom, rank, abstract, thread_rank in curs.fetchall()]
+
+
+def advanced_search(request):
+    """
+    'pagelinks': " ".join(
+        generate_pagelinks(pagenum,
+                           (totalhits - 1) // hitsperpage + 1,
+                           querystr)),
+    """
+    queryval = request.GET.get('q', None)
+    sortval = request.GET.get('s', 'r')  # matches the 'Rank' default marked selected in sortoptions
+    dateval = request.GET.get('d', '-1')
+    oneperthread = request.GET.get('r', '0')
+
+    listid = 1
+
+    hits = perform_search(queryval, dateval, sortval, oneperthread=='1', listid=listid)
+
+    totalhits = len(hits)
+
+    if totalhits == 1:
+        # might be a messageid match
+        if 'messageidmatch' in hits[0]:
+            return HttpResponseRedirect('/message-id/%s' % hits[0]['messageidmatch'])
+
+    firsthit = 1
+    hitsperpage = 20
+
+    sortoptions = (
+        {'val': 'r', 'text': 'Rank', 'selected': request.GET.get('s', '') not in ('d', 'i')},
+        {'val': 'd', 'text': 'Date', 'selected': request.GET.get('s', '') == 'd'},
+        {'val': 'i', 'text': 'Reverse date', 'selected': request.GET.get('s', '') == 'i'},
+    )
+
+    dateoptions = (
+        {'val': -1, 'text': 'anytime'},
+        {'val': 1, 'text': 'within last day'},
+        {'val': 7, 'text': 'within last week'},
+        {'val': 31, 'text': 'within last month'},
+        {'val': 186, 'text': 'within last 6 months'},
+        {'val': 365, 'text': 'within last year'},
+    )
+
+    (groups, listgroupid) = get_all_groups_and_lists(request)
+    return render_nav(NavContext(request, all_groups=groups), 'advancedsearch.html', {
+        'groups': [{'groupname': g['groupname'], 'lists': g['lists']} for g in groups],
+        'hitcount': totalhits,
+        'firsthit': firsthit,
+        'lasthit': min(totalhits, firsthit + hitsperpage - 1),
+        'query': request.GET['q'] if 'q' in request.GET else '',
+        'archives_root': '/', #settings.ARCHIVES_FRONT_ADDRESS,
+        'pagelinks': '',
+        'hits': [{
+            'date': h['d'],
+            'subject': h['s'],
+            'author': h['f'],
+            'messageid': h['m'],
+            'threadid': h['t'],
+            'thread_rank': h['tr'],
+            'abstract': h['a'],
+            'abstract_found': h['a_found'],
+            'abstract_after': h['a_after'],
+            'rank': h['r'],
+        } for h in hits[firsthit - 1:firsthit + hitsperpage - 1]],
+        'sortoptions': sortoptions,
+        'lists': List.objects.all().order_by("group__sortkey"),
+        'listid': listid,
+        'dates': dateoptions,
+        'dateval': dateval,
+        'oneperthread': oneperthread,
+    })
+
+@csrf_exempt
+def search(request):
+    if not settings.PUBLIC_ARCHIVES:
+        # We don't support searching of non-public archives at all at this point.
+        # XXX: room for future improvement
+        return HttpResponseForbidden('Not public archives')
+
+    # Only certain hosts are allowed to call the search API
+    allowed = False
+    for ip_range in settings.SEARCH_CLIENTS:
+        if ipaddress.ip_address(request.META['REMOTE_ADDR']) in ipaddress.ip_network(ip_range):
+            allowed = True
+            break
+    if not allowed:
+        return HttpResponseForbidden('Invalid host')
+
+    # Perform a search of the archives and return a JSON document.
+    # Expects the following (optional) POST parameters:
+    # q = query to search for
+    # ln = comma separate list of listnames to search in
+    # d = number of days back to search for, or -1 (or not specified)
+    #     to search the full archives
+    # s = sort results by ['r'=rank, 'd'=date, 'i'=inverse date]
+    if not request.method == 'POST':
+        raise Http404('I only respond to POST')
+
+    if 'q' not in request.POST:
+        raise Http404('No search query specified')
+    query = request.POST['q']
+    ln = request.POST['ln'] if 'ln' in request.POST else None
+
+    dateval = request.POST.get('d', '-1')
+    sortval = request.POST.get('s', 'r')  # rank stays the default when 's' is omitted, as before the refactor
+
+    resp = HttpResponse(content_type='application/json')
+    perform_search(query, dateval, sortval, listnames=ln, streamer=resp)
     return resp
 
+def threads(request):
+    return render(
+        request,
+        'threads.html',
+        {
+            'request': request,
+        })
+
 
 @cache(seconds=10)
 def web_sync_timestamp(request):
diff --git a/django/archives/urls.py b/django/archives/urls.py
index 993c9f9..e10fb00 100644
--- a/django/archives/urls.py
+++ b/django/archives/urls.py
@@ -9,6 +9,10 @@ import archives.mailarchives.api
 
 
 urlpatterns = [
+    re_path(r'^threads/', archives.mailarchives.views.threads),
+    re_path(r'^api/threads_with_patches/', archives.mailarchives.api.threads_with_patches),
+    re_path(r'^api/create_cfapp_patch$', archives.mailarchives.api.create_cfapp_patch),
+
     # Examples:
     # re_path(r'^$', 'archives.views.home', name='home),
     # re_path(r'^archives/', include('archives.foo.urls')),
@@ -44,6 +48,7 @@
 
     # Search
     re_path(r'^archives-search/', archives.mailarchives.views.search),
+    re_path(r'^search/$', archives.mailarchives.views.advanced_search),
 
     # Date etc indexes
     re_path(r'^list/([\w-]+)/$', archives.mailarchives.views.monthlist),