diff --git a/Lite.Tests/AlertBadgeAckTests.cs b/Lite.Tests/AlertBadgeAckTests.cs
new file mode 100644
index 00000000..7f3ee90d
--- /dev/null
+++ b/Lite.Tests/AlertBadgeAckTests.cs
@@ -0,0 +1,90 @@
+using System;
+using PerformanceMonitorLite.Services;
+using Xunit;
+
+namespace PerformanceMonitorLite.Tests;
+
+///
+/// Guards the Lite server-tab badge acknowledgement logic for the low-disk (#754) and
+/// failed-Agent-job (#749) conditions added in #1128. Those conditions are plain booleans with
+/// no event timestamp, so the timestamp-based ack clear in
+/// never fires for them. The review fix added
+/// — the false->true
+/// transition hook MainWindow calls so a freshly-breaching disk or a brand-new failed job
+/// re-lights an acknowledged badge, matching the Dashboard's re-show behaviour.
+///
+/// Each test uses a unique server id so the shared (CWD-relative) alert_state.json the service
+/// persists to cannot leak state between tests; assertions read in-memory state, so they hold
+/// even when the best-effort file write is unavailable.
+///
+public class AlertBadgeAckTests
+{
+ private static string NewServerId() => "badge-test-" + Guid.NewGuid().ToString("N");
+
+ [Fact]
+ public void StandingLowDisk_AfterAck_StaysSuppressed_UntilNewConditionClearsIt()
+ {
+ var svc = new AlertStateService();
+ var server = NewServerId();
+
+ /* A standing low-disk breach lights the badge... */
+ Assert.True(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: true, hasFailedJob: false, latestEventTimeUtc: null));
+
+ /* ...the user dismisses it: the badge stays suppressed even though the breach is still
+ standing — with no event timestamp, UpdateAlertCounts can never auto-clear the ack. */
+ svc.AcknowledgeAlert(server);
+ Assert.False(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: true, hasFailedJob: false, latestEventTimeUtc: null));
+
+ /* A fresh false->true transition (re-breach / new job) clears the ack and re-lights. */
+ var fired = false;
+ svc.SuppressionStateChanged += (_, _) => fired = true;
+ svc.ClearAcknowledgementForNewCondition(server);
+ Assert.True(fired);
+ Assert.True(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: true, hasFailedJob: false, latestEventTimeUtc: null));
+ }
+
+ [Fact]
+ public void FailedJob_AfterAck_StaysSuppressed_UntilNewConditionClearsIt()
+ {
+ var svc = new AlertStateService();
+ var server = NewServerId();
+
+ Assert.True(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: false, hasFailedJob: true, latestEventTimeUtc: null));
+
+ svc.AcknowledgeAlert(server);
+ Assert.False(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: false, hasFailedJob: true, latestEventTimeUtc: null));
+
+ svc.ClearAcknowledgementForNewCondition(server);
+ Assert.True(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: false, hasFailedJob: true, latestEventTimeUtc: null));
+ }
+
+ [Fact]
+ public void ClearAcknowledgementForNewCondition_WhenNothingAcknowledged_DoesNotFireEvent()
+ {
+ var svc = new AlertStateService();
+ var server = NewServerId();
+
+ var fired = false;
+ svc.SuppressionStateChanged += (_, _) => fired = true;
+
+ /* No prior ack for this server -> nothing to clear -> no event, no save churn. */
+ svc.ClearAcknowledgementForNewCondition(server);
+ Assert.False(fired);
+
+ /* And a standing condition still lights the badge (it was never suppressed). */
+ Assert.True(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: true, hasFailedJob: false, latestEventTimeUtc: null));
+ }
+
+ [Fact]
+ public void SilencedServer_NewCondition_StaysSuppressed()
+ {
+ /* ClearAcknowledgementForNewCondition only clears the *ack*; a fully-silenced server must
+ stay dark even when a new disk/job condition appears. */
+ var svc = new AlertStateService();
+ var server = NewServerId();
+
+ svc.SilenceServer(server);
+ svc.ClearAcknowledgementForNewCondition(server);
+ Assert.False(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: true, hasFailedJob: true, latestEventTimeUtc: null));
+ }
+}
diff --git a/Lite/MainWindow.xaml.cs b/Lite/MainWindow.xaml.cs
index 952f734e..09ef711a 100644
--- a/Lite/MainWindow.xaml.cs
+++ b/Lite/MainWindow.xaml.cs
@@ -1027,6 +1027,12 @@ private void CloseServerTab(string serverId)
/* Clean up alert state for this server */
_alertStateService.RemoveServerState(serverId);
+ /* #1128 review fix: drop the per-server badge state so a stale low-disk / failed-job flag
+ doesn't flash on reopen, and the dicts don't grow with tab churn. */
+ _badgeLowDisk.Remove(serverId);
+ _badgeFailedJob.Remove(serverId);
+ _lastBadgeCounts.Remove(serverId);
+
// Show empty state if no tabs open
if (_openServerTabs.Count == 0)
{
@@ -1542,6 +1548,15 @@ private async void CheckPerformanceAlerts(ServerSummaryItem summary)
failed-job conditions (#754/#749); null when the server isn't in the list. */
var badgeServer = _serverManager.GetAllServers().FirstOrDefault(s =>
RemoteCollectorService.GetDeterministicHashCode(RemoteCollectorService.GetServerNameForStorage(s)).ToString() == key);
+
+ /* #1128 review fix: snapshot the prior badge flags, recompute them as locals through the
+ sweep, and write them ONCE at the end — so a disabled feature / offline server clears a
+ stale badge (not just the active branch), and a false->true transition clears the ack. */
+ bool prevBadgeLowDisk = badgeServer != null && _badgeLowDisk.TryGetValue(badgeServer.Id, out var _pBadgeLd) && _pBadgeLd;
+ bool prevBadgeFailedJob = badgeServer != null && _badgeFailedJob.TryGetValue(badgeServer.Id, out var _pBadgeFj) && _pBadgeFj;
+ bool curBadgeLowDisk = false;
+ bool curBadgeFailedJob = false;
+
var alertCooldown = TimeSpan.FromMinutes(App.AlertCooldownMinutes);
/* Skip popup/email alerts if user has acknowledged or silenced this server */
@@ -1965,12 +1980,9 @@ await _emailAlertService.TrySendAlertEmailAsync(
var volumes = await Task.Run(() => _dataService.GetVolumeFreeSpaceAsync(summary.ServerId));
var breached = GetBreachedVolumes(volumes);
- /* Drive the server tab badge — a breached volume is a standing condition (#754). */
- if (badgeServer != null)
- {
- _badgeLowDisk[badgeServer.Id] = breached.Count > 0;
- RefreshServerBadgeExtras(badgeServer.Id);
- }
+ /* Drive the server tab badge — a breached volume is a standing condition (#754).
+ Recorded as a local; the flags are written once at the end of the sweep (#1128 review). */
+ curBadgeLowDisk = breached.Count > 0;
if (breached.Count > 0)
{
@@ -2137,12 +2149,9 @@ dedups so the same failure never re-fires. */
off the UI thread; MFA serialization / throttle / retry handled inside). */
var failedJobs = await _collectorService.GetRecentlyFailedJobsAsync(server, App.AlertFailedJobLookbackMinutes);
- /* Drive the server tab badge — a failure in the lookback window (#749). */
- if (badgeServer != null)
- {
- _badgeFailedJob[badgeServer.Id] = failedJobs.Count > 0;
- RefreshServerBadgeExtras(badgeServer.Id);
- }
+ /* Drive the server tab badge — a failure in the lookback window (#749). Recorded
+ as a local; written once at the end of the sweep (#1128 review). */
+ curBadgeFailedJob = failedJobs.Count > 0;
if (failedJobs.Count > 0)
{
@@ -2195,6 +2204,21 @@ await _emailAlertService.TrySendAlertEmailAsync(
AppLogger.Error("Alerts", $"Failed to check failed jobs for {summary.DisplayName}: {ex.Message}");
}
}
+
+ /* #1128 review fix: write the badge's low-disk / failed-job flags ONCE per sweep from the
+ values computed above. Doing it here (not only inside the feature-enabled / online / msdb
+ branches) means a disabled feature or an offline server clears a previously-lit badge
+ instead of leaving it stale. A false->true transition is a genuinely new condition, so it
+ clears any acknowledgement — matching the Dashboard, whose IsWorseThanBaseline re-shows on
+ a new disk/job condition. RefreshServerBadgeExtras re-renders once (no-op without a tab). */
+ if (badgeServer != null)
+ {
+ _badgeLowDisk[badgeServer.Id] = curBadgeLowDisk;
+ _badgeFailedJob[badgeServer.Id] = curBadgeFailedJob;
+ if ((curBadgeLowDisk && !prevBadgeLowDisk) || (curBadgeFailedJob && !prevBadgeFailedJob))
+ _alertStateService.ClearAcknowledgementForNewCondition(badgeServer.Id);
+ RefreshServerBadgeExtras(badgeServer.Id);
+ }
}
private static string TruncateText(string text, int maxLength = 300)
diff --git a/Lite/Services/AlertStateService.cs b/Lite/Services/AlertStateService.cs
index cc468303..e54c2bbf 100644
--- a/Lite/Services/AlertStateService.cs
+++ b/Lite/Services/AlertStateService.cs
@@ -96,6 +96,24 @@ public void AcknowledgeAlert(string serverId)
SuppressionStateChanged?.Invoke(this, EventArgs.Empty);
}
+ ///
+ /// Clears a server's acknowledgement when a genuinely new badge condition (a fresh low-disk
+ /// breach or failed Agent job) appears. Those booleans have no event timestamp of their own,
+ /// so the timestamp-based clear in never fires for them; the
+ /// caller detects the false->true transition and calls this so the badge re-lights —
+ /// matching the Dashboard's re-show on a new disk/job condition (#1128 review).
+ ///
+ public void ClearAcknowledgementForNewCondition(string serverId)
+ {
+ bool changed;
+ lock (_lock)
+ {
+ changed = _acknowledgedAlerts.Remove(serverId);
+ if (changed) Save();
+ }
+ if (changed) SuppressionStateChanged?.Invoke(this, EventArgs.Empty);
+ }
+
///
/// Silences a server entirely (no badges until unsilenced). Persisted across restarts.
///
diff --git a/install/25_process_deadlock_xml.sql b/install/25_process_deadlock_xml.sql
index 23808747..ba2ac40f 100644
--- a/install/25_process_deadlock_xml.sql
+++ b/install/25_process_deadlock_xml.sql
@@ -132,7 +132,7 @@ BEGIN
BEGIN
SELECT
@start_date = MIN(dx.event_time),
- @end_date = DATEADD(SECOND, 1, MAX(dx.event_time))
+ @end_date = MAX(dx.event_time)
FROM collect.deadlock_xml AS dx
WHERE dx.is_processed = 0
AND dx.event_time IS NOT NULL
@@ -153,7 +153,11 @@ BEGIN
*/
SELECT
@start_date_local = DATEADD(MINUTE, @utc_offset_minutes, @start_date),
- @end_date_local = DATEADD(MINUTE, @utc_offset_minutes, @end_date);
+ /* +1s on the PARSER's local upper bound (not on @end_date itself) so sp_BlitzLock
+ includes events at exactly @end_date, while the mark below uses the un-padded UTC
+ @end_date — so a deadlock inserted concurrently in that extra second is left for the
+ next run rather than marked unparsed. Mirrors process_blocked_process_xml. */
+ @end_date_local = DATEADD(SECOND, 1, DATEADD(MINUTE, @utc_offset_minutes, @end_date));
IF @debug = 1
BEGIN
@@ -244,10 +248,11 @@ BEGIN
here: the XACT_STATE() = -1 check and the CATCH block both roll back
without marking, so a real parse failure still retries next run. Raw
XML is retained (is_processed = 1, not deleted); data-retention
- handles cleanup. The +1s pad on @end_date above guarantees the parse
- window covers every unprocessed event, so we never mark a row
- sp_BlitzLock did not get to see. event_time is UTC, matching
- @start_date / @end_date.
+ handles cleanup. The +1s pad on @end_date_local (the parser's local bound,
+ set above) guarantees sp_BlitzLock sees every event up to and including
+ @end_date, while this mark uses the un-padded UTC @end_date so a row inserted
+ concurrently after @end_date is left for the next run. event_time is UTC,
+ matching @start_date / @end_date.
*/
IF @rows_parsed = 0 AND @debug = 1
BEGIN
@@ -285,18 +290,22 @@ BEGIN
VALUES
(
N'process_deadlock_xml',
- CASE WHEN @rows_available = 0 THEN N'SUCCESS'
- WHEN @rows_parsed > 0 THEN N'SUCCESS'
- ELSE N'NO_RESULTS'
- END,
+ /*
+ A clean parse run is SUCCESS even when sp_BlitzLock produced 0 parsed
+ deadlocks: the events were processed and marked (above), they simply held
+ no reconstructable deadlock graph (un-parseable-by-design). Genuine failures
+ take the CATCH path and log ERROR. This ends the perpetual NO_RESULTS this
+ collector used to emit and mirrors process_blocked_process_xml exactly
+ (previously this proc still logged NO_RESULTS + "left unprocessed for retry"
+ after it had already marked the rows processed — a false signal).
+ Tradeoff: a silent sp_BlitzLock failure that returns 0 rows with no error and
+ a committable transaction is indistinguishable from "nothing to parse", so
+ those events are marked processed — an accepted cost of ending the retry loop.
+ */
+ N'SUCCESS',
@rows_available,
DATEDIFF(MILLISECOND, @start_time, SYSDATETIME()),
- CASE WHEN @rows_available > 0 AND @rows_parsed = 0
- THEN N'sp_BlitzLock returned 0 parsed results for '
- + CAST(@rows_available AS nvarchar(20))
- + N' XML events - rows left unprocessed for retry'
- ELSE NULL
- END
+ NULL
);
IF @debug = 1
diff --git a/install/53_collect_server_properties.sql b/install/53_collect_server_properties.sql
index 901da4ea..d81f42e8 100644
--- a/install/53_collect_server_properties.sql
+++ b/install/53_collect_server_properties.sql
@@ -319,6 +319,13 @@ BEGIN
CONCAT
(
CONVERT(nvarchar(128), SERVERPROPERTY(N'Edition')), N'|',
+ /* Include the Azure tier inputs that drive the normalized edition /
+ service_objective columns. SERVERPROPERTY('Edition') is the constant
+ 'SQL Azure' on Azure SQL DB, so without these a pure tier/SLO change with
+ no vCore/memory delta would not change the hash and the collector would
+ SKIP, never recording the new tier. NULL (on-prem) concats as empty. */
+ CONVERT(nvarchar(128), DATABASEPROPERTYEX(DB_NAME(), N'Edition')), N'|',
+ CONVERT(nvarchar(128), DATABASEPROPERTYEX(DB_NAME(), N'ServiceObjective')), N'|',
CONVERT(nvarchar(128), SERVERPROPERTY(N'ProductVersion')), N'|',
CONVERT(nvarchar(128), SERVERPROPERTY(N'ProductLevel')), N'|',
@engine_edition, N'|',