Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions Lite.Tests/AlertBadgeAckTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
using System;
using PerformanceMonitorLite.Services;
using Xunit;

namespace PerformanceMonitorLite.Tests;

/// <summary>
/// Guards the Lite server-tab badge acknowledgement logic for the low-disk (#754) and
/// failed-Agent-job (#749) conditions added in #1128. Those conditions are plain booleans with
/// no event timestamp, so the timestamp-based ack clear in
/// <see cref="AlertStateService.UpdateAlertCounts"/> never fires for them. The review fix added
/// <see cref="AlertStateService.ClearAcknowledgementForNewCondition"/> — the false-&gt;true
/// transition hook MainWindow calls so a freshly-breaching disk or a brand-new failed job
/// re-lights an acknowledged badge, matching the Dashboard's re-show behaviour.
///
/// Each test uses a unique server id so the shared (CWD-relative) alert_state.json the service
/// persists to cannot leak state between tests; assertions read in-memory state, so they hold
/// even when the best-effort file write is unavailable.
/// </summary>
public class AlertBadgeAckTests
{
private static string NewServerId() => "badge-test-" + Guid.NewGuid().ToString("N");

[Fact]
public void StandingLowDisk_AfterAck_StaysSuppressed_UntilNewConditionClearsIt()
{
var svc = new AlertStateService();
var server = NewServerId();

/* A standing low-disk breach lights the badge... */
Assert.True(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: true, hasFailedJob: false, latestEventTimeUtc: null));

/* ...the user dismisses it: the badge stays suppressed even though the breach is still
standing — with no event timestamp, UpdateAlertCounts can never auto-clear the ack. */
svc.AcknowledgeAlert(server);
Assert.False(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: true, hasFailedJob: false, latestEventTimeUtc: null));

/* A fresh false->true transition (re-breach / new job) clears the ack and re-lights. */
var fired = false;
svc.SuppressionStateChanged += (_, _) => fired = true;
svc.ClearAcknowledgementForNewCondition(server);
Assert.True(fired);
Assert.True(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: true, hasFailedJob: false, latestEventTimeUtc: null));
}

[Fact]
public void FailedJob_AfterAck_StaysSuppressed_UntilNewConditionClearsIt()
{
var svc = new AlertStateService();
var server = NewServerId();

Assert.True(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: false, hasFailedJob: true, latestEventTimeUtc: null));

svc.AcknowledgeAlert(server);
Assert.False(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: false, hasFailedJob: true, latestEventTimeUtc: null));

svc.ClearAcknowledgementForNewCondition(server);
Assert.True(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: false, hasFailedJob: true, latestEventTimeUtc: null));
}

[Fact]
public void ClearAcknowledgementForNewCondition_WhenNothingAcknowledged_DoesNotFireEvent()
{
var svc = new AlertStateService();
var server = NewServerId();

var fired = false;
svc.SuppressionStateChanged += (_, _) => fired = true;

/* No prior ack for this server -> nothing to clear -> no event, no save churn. */
svc.ClearAcknowledgementForNewCondition(server);
Assert.False(fired);

/* And a standing condition still lights the badge (it was never suppressed). */
Assert.True(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: true, hasFailedJob: false, latestEventTimeUtc: null));
}

[Fact]
public void SilencedServer_NewCondition_StaysSuppressed()
{
/* ClearAcknowledgementForNewCondition only clears the *ack*; a fully-silenced server must
stay dark even when a new disk/job condition appears. */
var svc = new AlertStateService();
var server = NewServerId();

svc.SilenceServer(server);
svc.ClearAcknowledgementForNewCondition(server);
Assert.False(svc.UpdateAlertCounts(server, 0, 0, hasLowDisk: true, hasFailedJob: true, latestEventTimeUtc: null));
}
}
48 changes: 36 additions & 12 deletions Lite/MainWindow.xaml.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1027,6 +1027,12 @@ private void CloseServerTab(string serverId)
/* Clean up alert state for this server */
_alertStateService.RemoveServerState(serverId);

/* #1128 review fix: drop the per-server badge state so a stale low-disk / failed-job flag
doesn't flash on reopen, and the dicts don't grow with tab churn. */
_badgeLowDisk.Remove(serverId);
_badgeFailedJob.Remove(serverId);
_lastBadgeCounts.Remove(serverId);

// Show empty state if no tabs open
if (_openServerTabs.Count == 0)
{
Expand Down Expand Up @@ -1542,6 +1548,15 @@ private async void CheckPerformanceAlerts(ServerSummaryItem summary)
failed-job conditions (#754/#749); null when the server isn't in the list. */
var badgeServer = _serverManager.GetAllServers().FirstOrDefault(s =>
RemoteCollectorService.GetDeterministicHashCode(RemoteCollectorService.GetServerNameForStorage(s)).ToString() == key);

/* #1128 review fix: snapshot the prior badge flags, recompute them as locals through the
sweep, and write them ONCE at the end — so a disabled feature / offline server clears a
stale badge (not just the active branch), and a false->true transition clears the ack. */
bool prevBadgeLowDisk = badgeServer != null && _badgeLowDisk.TryGetValue(badgeServer.Id, out var _pBadgeLd) && _pBadgeLd;
bool prevBadgeFailedJob = badgeServer != null && _badgeFailedJob.TryGetValue(badgeServer.Id, out var _pBadgeFj) && _pBadgeFj;
bool curBadgeLowDisk = false;
bool curBadgeFailedJob = false;

var alertCooldown = TimeSpan.FromMinutes(App.AlertCooldownMinutes);

/* Skip popup/email alerts if user has acknowledged or silenced this server */
Expand Down Expand Up @@ -1965,12 +1980,9 @@ await _emailAlertService.TrySendAlertEmailAsync(
var volumes = await Task.Run(() => _dataService.GetVolumeFreeSpaceAsync(summary.ServerId));
var breached = GetBreachedVolumes(volumes);

/* Drive the server tab badge — a breached volume is a standing condition (#754). */
if (badgeServer != null)
{
_badgeLowDisk[badgeServer.Id] = breached.Count > 0;
RefreshServerBadgeExtras(badgeServer.Id);
}
/* Drive the server tab badge — a breached volume is a standing condition (#754).
Recorded as a local; the flags are written once at the end of the sweep (#1128 review). */
curBadgeLowDisk = breached.Count > 0;

if (breached.Count > 0)
{
Expand Down Expand Up @@ -2137,12 +2149,9 @@ dedups so the same failure never re-fires. */
off the UI thread; MFA serialization / throttle / retry handled inside). */
var failedJobs = await _collectorService.GetRecentlyFailedJobsAsync(server, App.AlertFailedJobLookbackMinutes);

/* Drive the server tab badge — a failure in the lookback window (#749). */
if (badgeServer != null)
{
_badgeFailedJob[badgeServer.Id] = failedJobs.Count > 0;
RefreshServerBadgeExtras(badgeServer.Id);
}
/* Drive the server tab badge — a failure in the lookback window (#749). Recorded
as a local; written once at the end of the sweep (#1128 review). */
curBadgeFailedJob = failedJobs.Count > 0;

if (failedJobs.Count > 0)
{
Expand Down Expand Up @@ -2195,6 +2204,21 @@ await _emailAlertService.TrySendAlertEmailAsync(
AppLogger.Error("Alerts", $"Failed to check failed jobs for {summary.DisplayName}: {ex.Message}");
}
}

/* #1128 review fix: write the badge's low-disk / failed-job flags ONCE per sweep from the
values computed above. Doing it here (not only inside the feature-enabled / online / msdb
branches) means a disabled feature or an offline server clears a previously-lit badge
instead of leaving it stale. A false->true transition is a genuinely new condition, so it
clears any acknowledgement — matching the Dashboard, whose IsWorseThanBaseline re-shows on
a new disk/job condition. RefreshServerBadgeExtras re-renders once (no-op without a tab). */
if (badgeServer != null)
{
_badgeLowDisk[badgeServer.Id] = curBadgeLowDisk;
_badgeFailedJob[badgeServer.Id] = curBadgeFailedJob;
if ((curBadgeLowDisk && !prevBadgeLowDisk) || (curBadgeFailedJob && !prevBadgeFailedJob))
_alertStateService.ClearAcknowledgementForNewCondition(badgeServer.Id);
RefreshServerBadgeExtras(badgeServer.Id);
}
}

private static string TruncateText(string text, int maxLength = 300)
Expand Down
18 changes: 18 additions & 0 deletions Lite/Services/AlertStateService.cs
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,24 @@ public void AcknowledgeAlert(string serverId)
SuppressionStateChanged?.Invoke(this, EventArgs.Empty);
}

/// <summary>
/// Clears a server's acknowledgement when a genuinely new badge condition (a fresh low-disk
/// breach or failed Agent job) appears. Those booleans have no event timestamp of their own,
/// so the timestamp-based clear in <see cref="UpdateAlertCounts"/> never fires for them; the
/// caller detects the false-&gt;true transition and calls this so the badge re-lights —
/// matching the Dashboard's re-show on a new disk/job condition (#1128 review).
/// </summary>
public void ClearAcknowledgementForNewCondition(string serverId)
{
bool changed;
lock (_lock)
{
changed = _acknowledgedAlerts.Remove(serverId);
if (changed) Save();
}
if (changed) SuppressionStateChanged?.Invoke(this, EventArgs.Empty);
}

/// <summary>
/// Silences a server entirely (no badges until unsilenced). Persisted across restarts.
/// </summary>
Expand Down
41 changes: 25 additions & 16 deletions install/25_process_deadlock_xml.sql
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ BEGIN
BEGIN
SELECT
@start_date = MIN(dx.event_time),
@end_date = DATEADD(SECOND, 1, MAX(dx.event_time))
@end_date = MAX(dx.event_time)
FROM collect.deadlock_xml AS dx
WHERE dx.is_processed = 0
AND dx.event_time IS NOT NULL
Expand All @@ -153,7 +153,11 @@ BEGIN
*/
SELECT
@start_date_local = DATEADD(MINUTE, @utc_offset_minutes, @start_date),
@end_date_local = DATEADD(MINUTE, @utc_offset_minutes, @end_date);
/* +1s on the PARSER's local upper bound (not on @end_date itself) so sp_BlitzLock
includes events at exactly @end_date, while the mark below uses the un-padded UTC
@end_date — so a deadlock inserted concurrently in that extra second is left for the
next run rather than marked unparsed. Mirrors process_blocked_process_xml. */
@end_date_local = DATEADD(SECOND, 1, DATEADD(MINUTE, @utc_offset_minutes, @end_date));

IF @debug = 1
BEGIN
Expand Down Expand Up @@ -244,10 +248,11 @@ BEGIN
here: the XACT_STATE() = -1 check and the CATCH block both roll back
without marking, so a real parse failure still retries next run. Raw
XML is retained (is_processed = 1, not deleted); data-retention
handles cleanup. The +1s pad on @end_date above guarantees the parse
window covers every unprocessed event, so we never mark a row
sp_BlitzLock did not get to see. event_time is UTC, matching
@start_date / @end_date.
handles cleanup. The +1s pad on @end_date_local (the parser's local bound,
set above) guarantees sp_BlitzLock sees every event up to and including
@end_date, while this mark uses the un-padded UTC @end_date so a row inserted
concurrently after @end_date is left for the next run. event_time is UTC,
matching @start_date / @end_date.
*/
IF @rows_parsed = 0 AND @debug = 1
BEGIN
Expand Down Expand Up @@ -285,18 +290,22 @@ BEGIN
VALUES
(
N'process_deadlock_xml',
CASE WHEN @rows_available = 0 THEN N'SUCCESS'
WHEN @rows_parsed > 0 THEN N'SUCCESS'
ELSE N'NO_RESULTS'
END,
/*
A clean parse run is SUCCESS even when sp_BlitzLock produced 0 parsed
deadlocks: the events were processed and marked (above), they simply held
no reconstructable deadlock graph (un-parseable-by-design). Genuine failures
take the CATCH path and log ERROR. This ends the perpetual NO_RESULTS this
collector used to emit and mirrors process_blocked_process_xml exactly
(previously this proc still logged NO_RESULTS + "left unprocessed for retry"
after it had already marked the rows processed — a false signal).
Tradeoff: a silent sp_BlitzLock failure that returns 0 rows with no error and
a committable transaction is indistinguishable from "nothing to parse", so
those events are marked processed — an accepted cost of ending the retry loop.
*/
N'SUCCESS',
@rows_available,
DATEDIFF(MILLISECOND, @start_time, SYSDATETIME()),
CASE WHEN @rows_available > 0 AND @rows_parsed = 0
THEN N'sp_BlitzLock returned 0 parsed results for '
+ CAST(@rows_available AS nvarchar(20))
+ N' XML events - rows left unprocessed for retry'
ELSE NULL
END
NULL
);

IF @debug = 1
Expand Down
7 changes: 7 additions & 0 deletions install/53_collect_server_properties.sql
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,13 @@ BEGIN
CONCAT
(
CONVERT(nvarchar(128), SERVERPROPERTY(N'Edition')), N'|',
/* Include the Azure tier inputs that drive the normalized edition /
service_objective columns. SERVERPROPERTY('Edition') is the constant
'SQL Azure' on Azure SQL DB, so without these a pure tier/SLO change with
no vCore/memory delta would not change the hash and the collector would
SKIP, never recording the new tier. NULL (on-prem) concats as empty. */
CONVERT(nvarchar(128), DATABASEPROPERTYEX(DB_NAME(), N'Edition')), N'|',
CONVERT(nvarchar(128), DATABASEPROPERTYEX(DB_NAME(), N'ServiceObjective')), N'|',
CONVERT(nvarchar(128), SERVERPROPERTY(N'ProductVersion')), N'|',
CONVERT(nvarchar(128), SERVERPROPERTY(N'ProductLevel')), N'|',
@engine_edition, N'|',
Expand Down
Loading