diff --git a/database/init.sql b/database/init.sql index 3810773..54bf038 100644 --- a/database/init.sql +++ b/database/init.sql @@ -33,6 +33,14 @@ CREATE TABLE cml_stats ( max_rsl REAL, mean_rsl REAL, stddev_rsl REAL, + completeness_percent_6h REAL, + total_records_6h BIGINT, + valid_records_6h BIGINT, + mean_rsl_6h REAL, + stddev_rsl_6h REAL, + completeness_percent_1h REAL, + mean_rsl_1h REAL, + stddev_rsl_1h REAL, last_rsl REAL, last_update TIMESTAMPTZ DEFAULT NOW(), PRIMARY KEY (cml_id, user_id) @@ -100,6 +108,69 @@ BEGIN END; $$ LANGUAGE plpgsql; +-- update_cml_stats_windowed(target_cml_id, target_user_id) +-- +-- Computes 6-hour and 1-hour windowed statistics in a single table scan using +-- FILTER clauses. TimescaleDB chunk exclusion prunes all chunks older than 6 hours +-- at the storage level, so the query only touches the current uncompressed chunk +-- regardless of total dataset size. +CREATE OR REPLACE FUNCTION update_cml_stats_windowed( + target_cml_id TEXT, + target_user_id TEXT DEFAULT 'demo_openmrg' +) RETURNS VOID AS $$ +DECLARE + now_ts TIMESTAMPTZ := NOW(); +BEGIN + INSERT INTO cml_stats ( + cml_id, user_id, + completeness_percent_6h, total_records_6h, valid_records_6h, + mean_rsl_6h, stddev_rsl_6h, + completeness_percent_1h, mean_rsl_1h, stddev_rsl_1h, + last_rsl, last_update + ) + SELECT + target_cml_id, target_user_id, + -- 6h window + ROUND( + 100.0 * COUNT(rsl) FILTER (WHERE time >= now_ts - INTERVAL '6 hours') + / NULLIF(COUNT(*) FILTER (WHERE time >= now_ts - INTERVAL '6 hours'), 0), + 2), + COUNT(*) FILTER (WHERE time >= now_ts - INTERVAL '6 hours'), + COUNT(rsl) FILTER (WHERE time >= now_ts - INTERVAL '6 hours'), + ROUND(AVG(rsl) FILTER (WHERE time >= now_ts - INTERVAL '6 hours')::numeric, 2), + ROUND(STDDEV(rsl) FILTER (WHERE time >= now_ts - INTERVAL '6 hours')::numeric, 2), + -- 1h window + ROUND( + 100.0 * COUNT(rsl) FILTER (WHERE time >= now_ts - INTERVAL '1 hour') + / NULLIF(COUNT(*) FILTER (WHERE time >= now_ts - INTERVAL '1 hour'), 0), + 2), + ROUND(AVG(rsl) FILTER (WHERE time >= now_ts - INTERVAL '1 hour')::numeric, 2), + ROUND(STDDEV(rsl) FILTER (WHERE time >= now_ts - INTERVAL '1 hour')::numeric, 2), + -- last_rsl: unconstrained so we get the true last RSL even if the CML + -- has been quiet for more than 6 hours + (SELECT rsl FROM cml_data + WHERE cml_id = target_cml_id + AND user_id = target_user_id + ORDER BY time DESC LIMIT 1), + now_ts + FROM cml_data + WHERE cml_id = target_cml_id + AND user_id = target_user_id + AND time >= now_ts - INTERVAL '6 hours' + ON CONFLICT (cml_id, user_id) DO UPDATE SET + completeness_percent_6h = EXCLUDED.completeness_percent_6h, + total_records_6h = EXCLUDED.total_records_6h, + valid_records_6h = EXCLUDED.valid_records_6h, + mean_rsl_6h = EXCLUDED.mean_rsl_6h, + stddev_rsl_6h = EXCLUDED.stddev_rsl_6h, + completeness_percent_1h = EXCLUDED.completeness_percent_1h, + mean_rsl_1h = EXCLUDED.mean_rsl_1h, + stddev_rsl_1h = EXCLUDED.stddev_rsl_1h, + last_rsl = EXCLUDED.last_rsl, + last_update = EXCLUDED.last_update; +END; +$$ LANGUAGE plpgsql; + SELECT create_hypertable('cml_data', 'time'); -- Per-user lookup indexes. @@ -211,8 +282,10 @@ GRANT SELECT ON cml_data TO webserver_role; GRANT SELECT ON cml_metadata TO webserver_role; GRANT SELECT ON cml_stats TO webserver_role; --- Parser calls update_cml_stats() to upsert per-CML statistics. +-- Parser calls update_cml_stats() to upsert per-CML lifetime statistics. GRANT EXECUTE ON FUNCTION update_cml_stats(TEXT, TEXT) TO demo_openmrg, demo_orange_cameroun; +-- Parser calls update_cml_stats_windowed() from the background stats timer. +GRANT EXECUTE ON FUNCTION update_cml_stats_windowed(TEXT, TEXT) TO demo_openmrg, demo_orange_cameroun; -- Row-Level Security on cml_metadata and cml_stats. -- cml_data is excluded: TimescaleDB does not allow RLS on compressed diff --git a/database/migrations/009_add_windowed_stats.sql b/database/migrations/009_add_windowed_stats.sql new file mode 100644 index 0000000..837b897 --- /dev/null +++ b/database/migrations/009_add_windowed_stats.sql @@ -0,0 +1,95 @@ +-- Migration 009: add windowed cml_stats columns and update_cml_stats_windowed function +-- +-- Replaces the expensive full-history refresh (update_cml_stats, called for every CML +-- every 60 seconds) with a cheap windowed refresh that only touches the current +-- uncompressed TimescaleDB chunk (~6 hours of data), reducing stats refresh time from +-- 20-30 s to < 1 s and Postgres CPU from ~100% to < 5%. +-- +-- New columns on cml_stats: 6-hour and 1-hour windowed completeness, record counts, +-- mean RSL, and stddev RSL. +-- New function: update_cml_stats_windowed(target_cml_id, target_user_id) +-- +-- The parser must be updated alongside this migration: +-- - parser/db_writer.py: replace refresh_stats() call with refresh_windowed_stats() +-- - parser/db_writer.py: wire _update_stats_for_cmls() into write_rawdata() so +-- lifetime columns (total_records, min_rsl, max_rsl, ...) stay current. +-- +-- Apply with: +-- docker compose exec -T database psql -U myuser -d mydatabase \ +-- < database/migrations/009_add_windowed_stats.sql + +ALTER TABLE cml_stats + ADD COLUMN IF NOT EXISTS completeness_percent_6h REAL, + ADD COLUMN IF NOT EXISTS total_records_6h BIGINT, + ADD COLUMN IF NOT EXISTS valid_records_6h BIGINT, + ADD COLUMN IF NOT EXISTS mean_rsl_6h REAL, + ADD COLUMN IF NOT EXISTS stddev_rsl_6h REAL, + ADD COLUMN IF NOT EXISTS completeness_percent_1h REAL, + ADD COLUMN IF NOT EXISTS mean_rsl_1h REAL, + ADD COLUMN IF NOT EXISTS stddev_rsl_1h REAL; + +-- update_cml_stats_windowed(target_cml_id, target_user_id) +-- +-- Computes 6-hour and 1-hour windowed statistics in a single table scan using +-- FILTER clauses. TimescaleDB chunk exclusion prunes all chunks older than 6 hours +-- at the storage level, so the query only touches the current uncompressed chunk +-- regardless of total dataset size. +CREATE OR REPLACE FUNCTION update_cml_stats_windowed( + target_cml_id TEXT, + target_user_id TEXT DEFAULT 'demo_openmrg' +) RETURNS VOID AS $$ +DECLARE + now_ts TIMESTAMPTZ := NOW(); +BEGIN + INSERT INTO cml_stats ( + cml_id, user_id, + completeness_percent_6h, total_records_6h, valid_records_6h, + mean_rsl_6h, stddev_rsl_6h, + completeness_percent_1h, mean_rsl_1h, stddev_rsl_1h, + last_rsl, last_update + ) + SELECT + target_cml_id, target_user_id, + -- 6h window + ROUND( + 100.0 * COUNT(rsl) FILTER (WHERE time >= now_ts - INTERVAL '6 hours') + / NULLIF(COUNT(*) FILTER (WHERE time >= now_ts - INTERVAL '6 hours'), 0), + 2), + COUNT(*) FILTER (WHERE time >= now_ts - INTERVAL '6 hours'), + COUNT(rsl) FILTER (WHERE time >= now_ts - INTERVAL '6 hours'), + ROUND(AVG(rsl) FILTER (WHERE time >= now_ts - INTERVAL '6 hours')::numeric, 2), + ROUND(STDDEV(rsl) FILTER (WHERE time >= now_ts - INTERVAL '6 hours')::numeric, 2), + -- 1h window + ROUND( + 100.0 * COUNT(rsl) FILTER (WHERE time >= now_ts - INTERVAL '1 hour') + / NULLIF(COUNT(*) FILTER (WHERE time >= now_ts - INTERVAL '1 hour'), 0), + 2), + ROUND(AVG(rsl) FILTER (WHERE time >= now_ts - INTERVAL '1 hour')::numeric, 2), + ROUND(STDDEV(rsl) FILTER (WHERE time >= now_ts - INTERVAL '1 hour')::numeric, 2), + -- last_rsl: unconstrained so we get the true last RSL even if the CML + -- has been quiet for more than 6 hours + (SELECT rsl FROM cml_data + WHERE cml_id = target_cml_id + AND user_id = target_user_id + ORDER BY time DESC LIMIT 1), + now_ts + FROM cml_data + WHERE cml_id = target_cml_id + AND user_id = target_user_id + AND time >= now_ts - INTERVAL '6 hours' + ON CONFLICT (cml_id, user_id) DO UPDATE SET + completeness_percent_6h = EXCLUDED.completeness_percent_6h, + total_records_6h = EXCLUDED.total_records_6h, + valid_records_6h = EXCLUDED.valid_records_6h, + mean_rsl_6h = EXCLUDED.mean_rsl_6h, + stddev_rsl_6h = EXCLUDED.stddev_rsl_6h, + completeness_percent_1h = EXCLUDED.completeness_percent_1h, + mean_rsl_1h = EXCLUDED.mean_rsl_1h, + stddev_rsl_1h = EXCLUDED.stddev_rsl_1h, + last_rsl = EXCLUDED.last_rsl, + last_update = EXCLUDED.last_update; +END; +$$ LANGUAGE plpgsql; + +GRANT EXECUTE ON FUNCTION update_cml_stats_windowed(TEXT, TEXT) + TO demo_openmrg, demo_orange_cameroun; diff --git a/parser/db_writer.py b/parser/db_writer.py index de69fb8..49b4e41 100644 --- a/parser/db_writer.py +++ b/parser/db_writer.py @@ -304,12 +304,16 @@ def write_rawdata(self, df) -> int: ) ) - # Commit immediately after insert; stats are updated separately + # Update lifetime stats for CMLs in this batch (same transaction as the insert) + cml_ids = df_subset["cml_id"].unique().tolist() + self._update_stats_for_cmls(cml_ids) + + # Single commit covers both the data insert and the stats update try: if self.conn: self.conn.commit() except Exception: - logger.exception("Failed to commit raw data") + logger.exception("Failed to commit raw data and stats") raise return rows_written @@ -383,3 +387,25 @@ def refresh_stats(self) -> None: finally: if cur and not cur.closed: cur.close() + + def refresh_windowed_stats(self) -> None: + """Recalculate windowed (6h/1h) cml_stats for all known CMLs. + Cheap: only touches the most-recent uncompressed TimescaleDB chunk.""" + cur = self.conn.cursor() + try: + cur.execute( + "SELECT update_cml_stats_windowed(cml_id::text, %s) " + "FROM (SELECT DISTINCT cml_id FROM cml_metadata WHERE user_id = %s) t", + (self.user_id, self.user_id), + ) + self.conn.commit() + logger.info("Refreshed windowed cml_stats for all CMLs") + except Exception: + try: + self.conn.rollback() + except Exception: + pass + logger.exception("Failed to refresh windowed cml_stats") + finally: + if cur and not cur.closed: + cur.close() diff --git a/parser/main.py b/parser/main.py index 2bff93e..81a4498 100644 --- a/parser/main.py +++ b/parser/main.py @@ -125,14 +125,14 @@ def stats_loop(): # Run immediately on startup so Grafana has fresh stats without # waiting a full interval after the backlog is processed. try: - stats_db.refresh_stats() + stats_db.refresh_windowed_stats() except Exception: - logger.exception("Stats thread: initial refresh_stats failed") + logger.exception("Stats thread: initial refresh_windowed_stats failed") while not stop_event.wait(Config.STATS_REFRESH_INTERVAL): try: - stats_db.refresh_stats() + stats_db.refresh_windowed_stats() except Exception: - logger.exception("Stats thread: refresh_stats failed") + logger.exception("Stats thread: refresh_windowed_stats failed") stats_db.close() stats_thread = threading.Thread(target=stats_loop, daemon=True, name="stats-refresh") diff --git a/parser/tests/test_db_writer.py b/parser/tests/test_db_writer.py index 67e52b0..bf04abc 100644 --- a/parser/tests/test_db_writer.py +++ b/parser/tests/test_db_writer.py @@ -319,6 +319,33 @@ def test__update_stats_for_cmls_rollback_on_error(mock_connection): mock_connection.rollback.assert_called() +def test_refresh_windowed_stats_commits_on_success(mock_connection): + """refresh_windowed_stats calls update_cml_stats_windowed and commits.""" + writer = DBWriter("postgresql://test", user_id="demo_openmrg") + writer.conn = mock_connection + + writer.refresh_windowed_stats() + + cur = mock_connection.cursor.return_value + cur.execute.assert_called_once() + sql = cur.execute.call_args.args[0] + assert "update_cml_stats_windowed" in sql + mock_connection.commit.assert_called_once() + + +def test_refresh_windowed_stats_rollback_on_error(mock_connection): + """refresh_windowed_stats rolls back and swallows the exception on DB error.""" + writer = DBWriter("postgresql://test", user_id="demo_openmrg") + writer.conn = mock_connection + + mock_connection.cursor.return_value.execute.side_effect = Exception("DB error") + + writer.refresh_windowed_stats() # must not raise + + mock_connection.rollback.assert_called_once() + mock_connection.commit.assert_not_called() + + # --------------------------------------------------------------------------- # log_file_event # --------------------------------------------------------------------------- diff --git a/parser/tests/test_main.py b/parser/tests/test_main.py index 3d3b992..f5a2842 100644 --- a/parser/tests/test_main.py +++ b/parser/tests/test_main.py @@ -121,6 +121,91 @@ def test_metadata_exception_is_swallowed( mock_batch.assert_called_once() +def _run_stats_loop(tmp_path, mock_event, *, configure_db=None): + """Run main(), then invoke the captured stats_loop closure under the same patches. + + Calling stats_loop() inside the with-block is critical: if called after + the block exits, DBWriter and Config are unpatched, connect() fails against + a real DB, and the retry loop hangs forever. + + Returns the MagicMock instance used as every DBWriter in the test. + """ + captured = {} + + class CapturingThread(threading.Thread): + def __init__(self, *args, target=None, name=None, **kwargs): + super().__init__(*args, target=target, name=name, **kwargs) + if name == "stats-refresh": + captured["stats_loop"] = target + + def start(self): + pass # don't spawn real threads in unit tests + + mock_db = MagicMock() + + with patch("parser.main.threading.Thread", CapturingThread), \ + patch("parser.main.threading.Event", return_value=mock_event), \ + patch("parser.main.FileManager"), \ + patch("parser.main.FileWatcher"), \ + patch("parser.main.DBWriter", return_value=mock_db), \ + patch("parser.main.Config.PARSER_ENABLED", True), \ + patch("parser.main.Config.PROCESS_EXISTING_ON_STARTUP", False), \ + patch("parser.main.Config.DATABASE_URL", "postgresql://test"), \ + patch("parser.main.Config.USER_ID", "test_user"), \ + patch("parser.main.Config.STATS_REFRESH_INTERVAL", 60), \ + patch("parser.main.Config.INCOMING_DIR", tmp_path), \ + patch("parser.main.Config.ARCHIVED_DIR", tmp_path), \ + patch("parser.main.Config.QUARANTINE_DIR", tmp_path), \ + patch("parser.main.time.sleep", side_effect=KeyboardInterrupt): + try: + main() + except (KeyboardInterrupt, SystemExit): + pass + if configure_db is not None: + configure_db(mock_db) + if "stats_loop" in captured: + captured["stats_loop"]() + + return mock_db + + +def test_stats_loop_calls_refresh_windowed_stats_on_startup(tmp_path): + """stats_loop calls refresh_windowed_stats once before entering the timer loop.""" + mock_event = MagicMock() + mock_event.is_set.return_value = False # connect loop: enter → connect succeeds → break + mock_event.wait.return_value = True # timer wait → loop exits immediately + + mock_db = _run_stats_loop(tmp_path, mock_event) + + mock_db.refresh_windowed_stats.assert_called_once() + mock_db.close.assert_called() # called by stats_loop (possibly also by main cleanup) + + +def test_stats_loop_initial_refresh_error_is_swallowed(tmp_path): + """stats_loop swallows exceptions raised by the startup refresh_windowed_stats call.""" + mock_event = MagicMock() + mock_event.is_set.return_value = False + mock_event.wait.return_value = True + + def configure(db): + db.refresh_windowed_stats.side_effect = Exception("stats error") + + _run_stats_loop(tmp_path, mock_event, configure_db=configure) # must not raise + + +def test_stats_loop_calls_refresh_windowed_stats_in_timer_loop(tmp_path): + """stats_loop calls refresh_windowed_stats again on each timer tick.""" + mock_event = MagicMock() + mock_event.is_set.return_value = False + # First timer wait → False (enter loop body), second → True (exit) + mock_event.wait.side_effect = [False, True] + + mock_db = _run_stats_loop(tmp_path, mock_event) + + # startup call + 1 timer-loop call = 2 total + assert mock_db.refresh_windowed_stats.call_count == 2 + + def test_stats_loop_creates_dbwriter_with_config_user_id(tmp_path): """stats_loop must pass user_id=Config.USER_ID to DBWriter. diff --git a/scripts/generate_config.py b/scripts/generate_config.py index ca7a6de..a0acf41 100644 --- a/scripts/generate_config.py +++ b/scripts/generate_config.py @@ -280,6 +280,7 @@ def generate_users_json(users: list[dict], existing_json: dict) -> dict: GRANT SELECT ON cml_data_secure TO {user_id}; GRANT SELECT ON cml_data_1h_secure TO {user_id}; GRANT EXECUTE ON FUNCTION update_cml_stats(TEXT, TEXT) TO {user_id}; +GRANT EXECUTE ON FUNCTION update_cml_stats_windowed(TEXT, TEXT) TO {user_id}; -- file_processing_log: parser INSERTs a row for every processed file; -- webserver_role only needs SELECT. diff --git a/webserver/main.py b/webserver/main.py index 750a3c4..5792b4b 100644 --- a/webserver/main.py +++ b/webserver/main.py @@ -528,45 +528,33 @@ def api_cml_stats(): cur.execute( """ SELECT - cs.cml_id::text, - cs.total_records, - cs.valid_records, - cs.null_records, - cs.completeness_percent, - cs.min_rsl, - cs.max_rsl, - cs.mean_rsl, - cs.stddev_rsl, - cs.last_rsl, - ROUND(STDDEV(cd.rsl)::numeric, 2) as stddev_last_60min - FROM cml_stats cs - LEFT JOIN ( - SELECT cml_id, rsl - FROM cml_data_secure - WHERE time >= (SELECT MAX(bucket) FROM cml_data_1h_secure) - INTERVAL '60 minutes' - ) cd ON cs.cml_id = cd.cml_id - GROUP BY cs.cml_id, cs.total_records, cs.valid_records, cs.null_records, - cs.completeness_percent, cs.min_rsl, cs.max_rsl, cs.mean_rsl, - cs.stddev_rsl, cs.last_rsl - ORDER BY cs.cml_id - """ + cml_id::text, + completeness_percent_6h, + total_records_6h, + valid_records_6h, + mean_rsl_6h, + stddev_rsl_6h, + completeness_percent_1h, + stddev_rsl_1h, + last_rsl + FROM cml_stats + ORDER BY cml_id + """ ) data = cur.fetchall() cur.close() stats = [ { - "cml_id": str(row[0]), - "total_records": int(row[1]), - "valid_records": int(row[2]), - "null_records": int(row[3]), - "completeness_percent": safe_float(row[4]), - "min_rsl": safe_float(row[5]), - "max_rsl": safe_float(row[6]), - "mean_rsl": safe_float(row[7]), - "stddev_rsl": safe_float(row[8]), - "last_rsl": safe_float(row[9]), - "stddev_last_60min": safe_float(row[10]), + "cml_id": str(row[0]), + "completeness_percent": safe_float(row[1]), # 6h window + "total_records": int(row[2] or 0), + "valid_records": int(row[3] or 0), + "mean_rsl": safe_float(row[4]), + "stddev_rsl": safe_float(row[5]), + "completeness_percent_1h": safe_float(row[6]), + "stddev_last_60min": safe_float(row[7]), # pre-computed 1h stddev + "last_rsl": safe_float(row[8]), } for row in data ] @@ -613,8 +601,8 @@ def get_archive_statistics(user_id: str): with user_db_scope(user_id) as conn: cur = conn.cursor() - # Row count via secure view - cur.execute("SELECT COUNT(*) FROM cml_data_secure") + # Row count from precomputed cml_stats (RLS enforced via user_db_scope) + cur.execute("SELECT COALESCE(SUM(total_records), 0) FROM cml_stats") stats["total_records"] = cur.fetchone()[0] # CML count (RLS enforced) diff --git a/webserver/templates/realtime.html b/webserver/templates/realtime.html index 86f2ad1..4cbcf33 100644 --- a/webserver/templates/realtime.html +++ b/webserver/templates/realtime.html @@ -168,13 +168,13 @@ function buildPopupText(cmlId, stats) { var popupText = 'CML ' + cmlId + '
'; if (stats) { - popupText += 'Completeness: ' + stats.completeness_percent + '%
'; - popupText += 'Valid records: ' + stats.valid_records + '/' + stats.total_records + '
'; + popupText += 'Completeness (6h): ' + stats.completeness_percent + '%
'; + popupText += 'Valid records (6h): ' + stats.valid_records + '/' + stats.total_records + '
'; if (stats.last_rsl !== null) { popupText += 'Last RSL: ' + stats.last_rsl + ' dBm
'; } if (stats.stddev_last_60min !== null) { - popupText += 'Std Dev (60min): ' + stats.stddev_last_60min + '
'; + popupText += 'Std Dev (1h): ' + stats.stddev_last_60min + '
'; } } return popupText; @@ -226,9 +226,9 @@ var select = L.DomUtil.create('select', '', container); select.id = 'coloringOption'; select.innerHTML = ` - + - + `; // Prevent map interactions when using the select diff --git a/webserver/tests/test_api_cml_stats.py b/webserver/tests/test_api_cml_stats.py index f90834e..6c903d4 100644 --- a/webserver/tests/test_api_cml_stats.py +++ b/webserver/tests/test_api_cml_stats.py @@ -24,21 +24,19 @@ def test_api_cml_stats_returns_cached_stats(monkeypatch): mock_cursor = Mock() mock_conn.cursor.return_value = mock_cursor - # Row fields: cml_id, total_records, valid_records, null_records, completeness_percent, - # min_rsl, max_rsl, mean_rsl, stddev_rsl, last_rsl, stddev_last_60min + # Row fields: cml_id, completeness_percent_6h, total_records_6h, valid_records_6h, + # mean_rsl_6h, stddev_rsl_6h, completeness_percent_1h, stddev_rsl_1h, last_rsl mock_cursor.fetchall.return_value = [ ( "10001", - 10, - 9, - 1, - 90.0, - -60.0, - -40.0, - -50.0, - 3.0, - -45.0, - 1.5, + 94.2, # completeness_percent_6h + 2160, # total_records_6h + 2031, # valid_records_6h + -50.0, # mean_rsl_6h + 3.0, # stddev_rsl_6h + 90.0, # completeness_percent_1h + 1.3, # stddev_rsl_1h + -45.0, # last_rsl ) ] @@ -67,5 +65,7 @@ def mock_user_db_scope(user_id): assert len(data) == 1 row = data[0] assert row["cml_id"] == "10001" - assert row["completeness_percent"] == 90.0 + assert row["completeness_percent"] == 94.2 + assert row["completeness_percent_1h"] == 90.0 + assert row["stddev_last_60min"] == 1.3 assert row["last_rsl"] == -45.0 diff --git a/webserver/tests/test_api_routes.py b/webserver/tests/test_api_routes.py index 8799bc4..40485b0 100644 --- a/webserver/tests/test_api_routes.py +++ b/webserver/tests/test_api_routes.py @@ -87,6 +87,32 @@ def test_api_data_time_range_no_data(auth_client): assert data["latest"] is None +def test_get_archive_statistics_reads_total_records_from_cml_stats(auth_client): + """get_archive_statistics must use SUM(total_records) from cml_stats, not COUNT(*). + + Regression guard for the query that replaced: + SELECT COUNT(*) FROM cml_data_secure + with: + SELECT COALESCE(SUM(total_records), 0) FROM cml_stats + """ + client, cursor = auth_client + # fetchone called 3 times: total_records, cml_count, date_range + cursor.fetchone.side_effect = [(99000,), (12,), (None,)] + + result = wm.get_archive_statistics("demo_openmrg") + + assert result["total_records"] == 99000 + assert result["cml_count"] == 12 + + executed_sql = [c.args[0] for c in cursor.execute.call_args_list] + assert any( + "SUM(total_records)" in sql and "cml_stats" in sql for sql in executed_sql + ), "expected COALESCE(SUM(total_records), 0) FROM cml_stats" + assert not any("COUNT(*)" in sql and "cml_data" in sql for sql in executed_sql), ( + "COUNT(*) FROM cml_data must not be used (full hypertable scan regression)" + ) + + def test_overview_reads_total_records_from_cml_stats(monkeypatch, auth_client): """overview() must query cml_stats for total_records, not scan cml_data_secure.