From 9682dbfbd4d69b260b5672fb889a4cf907d8957f Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Mon, 8 Jun 2026 03:02:34 -0400 Subject: [PATCH] WS5: LPIM / IFI / memory dumps as advise-only server-health recommendations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surface three standing server-health gaps as advise-only recommendation cards on the existing recommendations path (no Apply — these need OS / service-account / investigation work, not a setting flip): - CONFIG_IFI_DISABLED — Instant File Initialization off - CONFIG_LPIM_DISABLED — Lock Pages in Memory off - SERVER_MEMORY_DUMPS — sys.dm_server_memory_dumps count > 0 Vertical slice across the Dashboard SQL collector and both apps' C#, mirroring the WS3 config-advisory pattern exactly. Schema (collect.server_properties): three new nullable columns lock_pages_in_memory bit, instant_file_initialization_enabled bit, memory_dump_count integer — added to the CREATE TABLE in install/02 (fresh installs) AND via an idempotent guarded ALTER in upgrades/2.11.0-to-2.12.0/04_add_server_health_columns.sql (existing installs). Collector (install/53_collect_server_properties.sql): extends the existing collect.server_properties_collector (dispatched by the one Collection job's master collector — no new job/schedule/proc). Reads LPIM from sys.dm_os_sys_info.sql_memory_model, IFI from sys.dm_server_services.instant_file_initialization_enabled (DMV + column guarded, dynamic SQL so older builds compile), dumps from sys.dm_server_memory_dumps (DMV guarded). Each is defensive (engine-edition guard + TRY/CATCH) and left NULL when unavailable so the collector never fails; all three are folded into the row_hash so a change re-collects. Facts (advise-only, mirrors WS3): - FactScorer.ScoreConfigFact: each key scores a 0.4 advisory base only when bad (IFI/LPIM Value==0, dumps Value>0), 0 otherwise. - InferenceEngine.ConfigAdvisoryRootKeys: each roots a standalone card below the 0.5 incident threshold (surfaces on a quiet, healthy server). - FactAdvice: one advice block per key (headline + investigation + copy-paste remediation guidance; no generated Apply T-SQL). Collectors emit the facts with shared noise-control gating (Dashboard SqlServerFactCollector + Lite DuckDbFactCollector + Lite RemoteCollectorService.ServerProperties + DuckDB schema v27 migration): - IFI off: always advisory (low-noise, universally good advice). - LPIM off: only on non-Express editions with >= 32 GB RAM (don't flag tiny instances). [THRESHOLD FLAGGED FOR TUNING] - Memory dumps: advisory whenever count > 0. Tests (Dashboard.Tests, shared-library home for the WS3 analogues): 14 new cases — scoring (bad -> 0.4, fine -> 0), rooting (each roots standalone below 0.5), and advice presence. Dashboard.Tests 420 -> 434, Lite.Tests 362 (unchanged). Both apps build clean. NOT verified live (no run against a live SQL Server): the actual DMV reads and the upgrade ALTER are untested on a real instance — reviewer to run the install/upgrade test. Co-Authored-By: Claude Opus 4.8 (1M context) --- Dashboard.Tests/FactScorerTests.cs | 48 +++++++ Dashboard.Tests/InferenceEngineTests.cs | 41 ++++++ Dashboard/Analysis/SqlServerFactCollector.cs | 79 ++++++++++- Lite/Analysis/DuckDbFactCollector.cs | 76 +++++++++- Lite/Database/DuckDbInitializer.cs | 18 ++- Lite/Database/Schema.cs | 5 +- ...RemoteCollectorService.ServerProperties.cs | 134 ++++++++++++++++++ PerformanceMonitor.Analysis/FactAdvice.cs | 30 ++++ PerformanceMonitor.Analysis/FactScorer.cs | 27 +++- .../InferenceEngine.cs | 6 + install/02_create_tables.sql | 3 + install/53_collect_server_properties.sql | 111 ++++++++++++++- .../04_add_server_health_columns.sql | 96 +++++++++++++ upgrades/2.11.0-to-2.12.0/upgrade.txt | 1 + 14 files changed, 666 insertions(+), 9 deletions(-) create mode 100644 upgrades/2.11.0-to-2.12.0/04_add_server_health_columns.sql diff --git a/Dashboard.Tests/FactScorerTests.cs b/Dashboard.Tests/FactScorerTests.cs index 52d8b576..d018ed43 100644 --- a/Dashboard.Tests/FactScorerTests.cs +++ b/Dashboard.Tests/FactScorerTests.cs @@ -238,4 +238,52 @@ public void BuildNarrowMemoryFact_NullInputs_ReturnsNull() Assert.Null(FactRemediation.BuildNarrowMemoryFact(1, null, 24000)); Assert.Null(FactRemediation.BuildNarrowMemoryFact(1, 28672, null)); } + + /* ── WS5: server-health advisory facts (LPIM / IFI / memory dumps) ── */ + + // Each WS5 server-health fact scores its 0.4 advisory base ONLY when the value is bad, and 0 + // otherwise — mirroring the WS3 server-config keys. The collectors gate emission (Express / + // small-RAM / dumps>0) so a fact that would score 0 is normally never emitted, but the scorer + // is still independently bad-only so it can be unit-tested in isolation. + [Theory] + // IFI: disabled (Value 0) is bad; enabled (Value 1) is fine. + [InlineData("CONFIG_IFI_DISABLED", 0, 0.4)] + [InlineData("CONFIG_IFI_DISABLED", 1, 0.0)] + // LPIM: disabled (Value 0) is bad; enabled (Value 1) is fine. + [InlineData("CONFIG_LPIM_DISABLED", 0, 0.4)] + [InlineData("CONFIG_LPIM_DISABLED", 1, 0.0)] + // Memory dumps: any count > 0 is bad; 0 is fine. + [InlineData("SERVER_MEMORY_DUMPS", 1, 0.4)] + [InlineData("SERVER_MEMORY_DUMPS", 7, 0.4)] + [InlineData("SERVER_MEMORY_DUMPS", 0, 0.0)] + public void ServerHealthFact_ScoresBadValueOnly(string key, long value, double expected) + { + var facts = new List + { + new() { Source = "config", Key = key, Value = value } + }; + + var scorer = new FactScorer(); + scorer.ScoreAll(facts); + + Assert.Equal(expected, facts[0].BaseSeverity, precision: 4); + } + + // Advice exists for each WS5 server-health root key (dead-fact guard — a fact that roots but + // renders no advice is the P1 dead-fact bug class). + [Theory] + [InlineData("CONFIG_IFI_DISABLED")] + [InlineData("CONFIG_LPIM_DISABLED")] + [InlineData("SERVER_MEMORY_DUMPS")] + public void ServerHealthKeys_HaveAdviceBlocks(string key) + { + var advice = FactAdvice.GetForFactKey(key); + + Assert.NotNull(advice); + Assert.False(string.IsNullOrWhiteSpace(advice!.Headline)); + Assert.False(string.IsNullOrWhiteSpace(advice.Investigation)); + Assert.False(string.IsNullOrWhiteSpace(advice.Remediation)); + // Advise-only: no generated Apply T-SQL is attached to the bare advice block. + Assert.Null(advice.RemediationTsql); + } } diff --git a/Dashboard.Tests/InferenceEngineTests.cs b/Dashboard.Tests/InferenceEngineTests.cs index 37081570..6cd6ac79 100644 --- a/Dashboard.Tests/InferenceEngineTests.cs +++ b/Dashboard.Tests/InferenceEngineTests.cs @@ -141,4 +141,45 @@ public void ServerConfigFacts_AllFour_RootFourDistinctFindings() Assert.Contains(stories, s => s.RootFactKey == "CONFIG_MAX_MEMORY_MB"); Assert.Contains(stories, s => s.RootFactKey == "CONFIG_MIN_MAX_MEMORY_NARROW"); } + + // WS5: each server-health advisory fact at its 0.4 advisory base roots a standalone + // recommendation, below the 0.5 incident threshold — because each is a config-advisory root + // key. One finding per fact (they are leaves with no edges between them). Mirrors the WS3 + // ServerConfigFact_RootsStandalone_BelowMinimumSeverity test. + [Theory] + [InlineData("CONFIG_IFI_DISABLED")] + [InlineData("CONFIG_LPIM_DISABLED")] + [InlineData("SERVER_MEMORY_DUMPS")] + public void ServerHealthFact_RootsStandalone_BelowMinimumSeverity(string key) + { + var engine = new InferenceEngine(new RelationshipGraph()); + var facts = new List + { + new() { Key = key, Source = "config", Value = 0, Severity = 0.4 } + }; + + var stories = engine.BuildStories(facts); + + Assert.Contains(stories, s => s.RootFactKey == key); + } + + // All three server-health facts together root three distinct standalone findings (no edges + // between them, so none consumes another). + [Fact] + public void ServerHealthFacts_AllThree_RootThreeDistinctFindings() + { + var engine = new InferenceEngine(new RelationshipGraph()); + var facts = new List + { + new() { Key = "CONFIG_IFI_DISABLED", Source = "config", Value = 0, Severity = 0.4 }, + new() { Key = "CONFIG_LPIM_DISABLED", Source = "config", Value = 0, Severity = 0.4 }, + new() { Key = "SERVER_MEMORY_DUMPS", Source = "config", Value = 3, Severity = 0.4 }, + }; + + var stories = engine.BuildStories(facts); + + Assert.Contains(stories, s => s.RootFactKey == "CONFIG_IFI_DISABLED"); + Assert.Contains(stories, s => s.RootFactKey == "CONFIG_LPIM_DISABLED"); + Assert.Contains(stories, s => s.RootFactKey == "SERVER_MEMORY_DUMPS"); + } } diff --git a/Dashboard/Analysis/SqlServerFactCollector.cs b/Dashboard/Analysis/SqlServerFactCollector.cs index 3715d5ba..cc2a3f86 100644 --- a/Dashboard/Analysis/SqlServerFactCollector.cs +++ b/Dashboard/Analysis/SqlServerFactCollector.cs @@ -1985,7 +1985,10 @@ SELECT TOP 1 cores_per_socket, is_hadr_enabled, edition, - product_version + product_version, + lock_pages_in_memory, + instant_file_initialization_enabled, + memory_dump_count FROM collect.server_properties ORDER BY collection_time DESC"; @@ -1998,6 +2001,10 @@ FROM collect.server_properties var socketCount = reader.IsDBNull(3) ? 0 : Convert.ToInt32(reader.GetValue(3)); var coresPerSocket = reader.IsDBNull(4) ? 0 : Convert.ToInt32(reader.GetValue(4)); var hadrEnabled = !reader.IsDBNull(5) && Convert.ToBoolean(reader.GetValue(5)); + var edition = reader.IsDBNull(6) ? string.Empty : reader.GetString(6); + bool? lpim = reader.IsDBNull(8) ? (bool?)null : Convert.ToBoolean(reader.GetValue(8)); + bool? ifi = reader.IsDBNull(9) ? (bool?)null : Convert.ToBoolean(reader.GetValue(9)); + int? dumpCount = reader.IsDBNull(10) ? (int?)null : Convert.ToInt32(reader.GetValue(10)); if (cpuCount == 0) return; @@ -2017,6 +2024,11 @@ FROM collect.server_properties ["hadr_enabled"] = hadrEnabled ? 1 : 0 } }); + + // WS5 server-health advisories (advise-only). Gating lives here so a fact that would + // score 0 is simply never emitted (noise control); the scorer then scores the emitted + // fact's Value. Shared with Lite — keep the rules identical (see DuckDbFactCollector). + EmitServerHealthFacts(context, facts, edition, physicalMemMb, lpim, ifi, dumpCount); } catch (Exception ex) { @@ -2024,6 +2036,71 @@ FROM collect.server_properties } } + // RAM floor below which LPIM-off is not worth flagging — on a small buffer pool the OS paging + // SQL out is not the practical risk it is on a large dedicated host. Shared rule with Lite. + private const long LpimAdvisoryMinPhysicalMemoryMb = 32 * 1024; + + /// + /// Emits the WS5 advise-only server-health facts (IFI off / LPIM off / memory dumps) from the + /// latest server_properties values, applying the noise-control gating both apps share: + /// • IFI: emit whenever the value is known (Value = enabled bit) — universally good advice. + /// • LPIM: emit only on non-Express editions with meaningful RAM (Value = enabled bit) — so a + /// tiny instance never flags. When LPIM is ON the emitted Value scores 0 (harmless). + /// • Dumps: emit whenever the count is known (Value = count) — the scorer flags count > 0. + /// + private static void EmitServerHealthFacts( + AnalysisContext context, List facts, string edition, long physicalMemMb, + bool? lockPagesInMemory, bool? instantFileInit, int? memoryDumpCount) + { + var isExpress = edition.Contains("Express", StringComparison.OrdinalIgnoreCase); + + if (instantFileInit.HasValue) + { + facts.Add(new Fact + { + Source = "config", + Key = "CONFIG_IFI_DISABLED", + Value = instantFileInit.Value ? 1 : 0, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["instant_file_initialization_enabled"] = instantFileInit.Value ? 1 : 0 + } + }); + } + + if (lockPagesInMemory.HasValue && !isExpress && physicalMemMb >= LpimAdvisoryMinPhysicalMemoryMb) + { + facts.Add(new Fact + { + Source = "config", + Key = "CONFIG_LPIM_DISABLED", + Value = lockPagesInMemory.Value ? 1 : 0, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["lock_pages_in_memory"] = lockPagesInMemory.Value ? 1 : 0, + ["physical_memory_mb"] = physicalMemMb + } + }); + } + + if (memoryDumpCount.HasValue) + { + facts.Add(new Fact + { + Source = "config", + Key = "SERVER_MEMORY_DUMPS", + Value = memoryDumpCount.Value, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["memory_dump_count"] = memoryDumpCount.Value + } + }); + } + } + /// /// Collects disk space facts from database_size_stats: volume free space, file sizes. /// diff --git a/Lite/Analysis/DuckDbFactCollector.cs b/Lite/Analysis/DuckDbFactCollector.cs index 974bb154..39271b5c 100644 --- a/Lite/Analysis/DuckDbFactCollector.cs +++ b/Lite/Analysis/DuckDbFactCollector.cs @@ -1844,7 +1844,8 @@ private async Task CollectServerPropertiesFactsAsync(AnalysisContext context, Li using var cmd = connection.CreateCommand(); cmd.CommandText = @" SELECT COALESCE(vcore_count, cpu_count) AS cpu_count, hyperthread_ratio, physical_memory_mb, - socket_count, cores_per_socket, is_hadr_enabled, edition, product_version + socket_count, cores_per_socket, is_hadr_enabled, edition, product_version, + lock_pages_in_memory, instant_file_initialization_enabled, memory_dump_count FROM server_properties WHERE server_id = $1 ORDER BY collection_time DESC @@ -1861,6 +1862,10 @@ ORDER BY collection_time DESC var socketCount = reader.IsDBNull(3) ? 0 : Convert.ToInt32(reader.GetValue(3)); var coresPerSocket = reader.IsDBNull(4) ? 0 : Convert.ToInt32(reader.GetValue(4)); var hadrEnabled = !reader.IsDBNull(5) && Convert.ToBoolean(reader.GetValue(5)); + var edition = reader.IsDBNull(6) ? string.Empty : reader.GetString(6); + bool? lpim = reader.IsDBNull(8) ? (bool?)null : Convert.ToBoolean(reader.GetValue(8)); + bool? ifi = reader.IsDBNull(9) ? (bool?)null : Convert.ToBoolean(reader.GetValue(9)); + int? dumpCount = reader.IsDBNull(10) ? (int?)null : Convert.ToInt32(reader.GetValue(10)); if (cpuCount == 0) return; @@ -1880,10 +1885,79 @@ ORDER BY collection_time DESC ["hadr_enabled"] = hadrEnabled ? 1 : 0 } }); + + // WS5 server-health advisories (advise-only). Gating mirrors the Dashboard collector so + // both apps agree on what is worth flagging; a fact that would score 0 is simply never + // emitted (noise control). + EmitServerHealthFacts(context, facts, edition, physicalMemMb, lpim, ifi, dumpCount); } catch { /* Table may not exist or have no data */ } } + // RAM floor below which LPIM-off is not worth flagging — shared rule with the Dashboard + // SqlServerFactCollector (small buffer pools do not suffer from OS paging the way large ones do). + private const long LpimAdvisoryMinPhysicalMemoryMb = 32 * 1024; + + /// + /// Emits the WS5 advise-only server-health facts (IFI off / LPIM off / memory dumps) from the + /// latest server_properties values, applying the same noise-control gating as the Dashboard: + /// • IFI: emit whenever the value is known (Value = enabled bit) — universally good advice. + /// • LPIM: emit only on non-Express editions with meaningful RAM (Value = enabled bit). + /// • Dumps: emit whenever the count is known (Value = count) — the scorer flags count > 0. + /// + private static void EmitServerHealthFacts( + AnalysisContext context, List facts, string edition, long physicalMemMb, + bool? lockPagesInMemory, bool? instantFileInit, int? memoryDumpCount) + { + var isExpress = edition.Contains("Express", StringComparison.OrdinalIgnoreCase); + + if (instantFileInit.HasValue) + { + facts.Add(new Fact + { + Source = "config", + Key = "CONFIG_IFI_DISABLED", + Value = instantFileInit.Value ? 1 : 0, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["instant_file_initialization_enabled"] = instantFileInit.Value ? 1 : 0 + } + }); + } + + if (lockPagesInMemory.HasValue && !isExpress && physicalMemMb >= LpimAdvisoryMinPhysicalMemoryMb) + { + facts.Add(new Fact + { + Source = "config", + Key = "CONFIG_LPIM_DISABLED", + Value = lockPagesInMemory.Value ? 1 : 0, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["lock_pages_in_memory"] = lockPagesInMemory.Value ? 1 : 0, + ["physical_memory_mb"] = physicalMemMb + } + }); + } + + if (memoryDumpCount.HasValue) + { + facts.Add(new Fact + { + Source = "config", + Key = "SERVER_MEMORY_DUMPS", + Value = memoryDumpCount.Value, + ServerId = context.ServerId, + Metadata = new Dictionary + { + ["memory_dump_count"] = memoryDumpCount.Value + } + }); + } + } + /// /// Collects disk space facts from database_size_stats: volume free space, file sizes. /// diff --git a/Lite/Database/DuckDbInitializer.cs b/Lite/Database/DuckDbInitializer.cs index 3cacfb7e..4da99867 100644 --- a/Lite/Database/DuckDbInitializer.cs +++ b/Lite/Database/DuckDbInitializer.cs @@ -97,7 +97,7 @@ public void Dispose() /// /// Current schema version. Increment this when schema changes require table rebuilds. /// - internal const int CurrentSchemaVersion = 26; + internal const int CurrentSchemaVersion = 27; private readonly string _archivePath; @@ -716,6 +716,22 @@ New tables only — no existing table changes needed. Tables created by _logger?.LogWarning("Migration to v26 encountered an error (non-fatal): {Error}", ex.Message); } } + + if (fromVersion < 27) + { + _logger?.LogInformation("Running migration to v27: adding server-health columns (LPIM/IFI/memory dumps) to server_properties"); + try + { + await ExecuteNonQueryAsync(connection, "ALTER TABLE server_properties ADD COLUMN IF NOT EXISTS lock_pages_in_memory BOOLEAN"); + await ExecuteNonQueryAsync(connection, "ALTER TABLE server_properties ADD COLUMN IF NOT EXISTS instant_file_initialization_enabled BOOLEAN"); + await ExecuteNonQueryAsync(connection, "ALTER TABLE server_properties ADD COLUMN IF NOT EXISTS memory_dump_count INTEGER"); + } + catch (Exception ex) + { + _logger?.LogError(ex, "Migration to v27 failed"); + throw; + } + } } /// diff --git a/Lite/Database/Schema.cs b/Lite/Database/Schema.cs index 1dace30b..c9cf969c 100644 --- a/Lite/Database/Schema.cs +++ b/Lite/Database/Schema.cs @@ -658,7 +658,10 @@ CREATE TABLE IF NOT EXISTS server_properties ( is_clustered BOOLEAN, enterprise_features VARCHAR, service_objective VARCHAR, - vcore_count INTEGER + vcore_count INTEGER, + lock_pages_in_memory BOOLEAN, + instant_file_initialization_enabled BOOLEAN, + memory_dump_count INTEGER )"; public const string CreateServerPropertiesIndex = @" diff --git a/Lite/Services/RemoteCollectorService.ServerProperties.cs b/Lite/Services/RemoteCollectorService.ServerProperties.cs index 7b29d65b..a367f788 100644 --- a/Lite/Services/RemoteCollectorService.ServerProperties.cs +++ b/Lite/Services/RemoteCollectorService.ServerProperties.cs @@ -23,6 +23,11 @@ public partial class RemoteCollectorService /// Collects server edition, version, CPU/memory hardware metadata for /// license audit and FinOps cost attribution. On-load only collector. /// + // RAM floor below which LPIM-off is not worth flagging (shared rule with the Dashboard + // SqlServerFactCollector). On a small buffer pool the OS paging SQL out is not the practical + // risk it is on a large dedicated host. + private const long LpimAdvisoryMinPhysicalMemoryMb = 32 * 1024; + private async Task CollectServerPropertiesAsync(ServerConnection server, CancellationToken cancellationToken) { var serverStatus = _serverManager.GetConnectionStatus(server.Id); @@ -105,6 +110,12 @@ count from the service_objective string (e.g. "HS_Gen5_14" → 14). */ vcoreCount = ParseVcoreFromServiceObjective(serviceObjective); } + /* WS5 server-health values (LPIM / IFI / memory dumps). Read best-effort on the same + connection — version-gated and Azure-unavailable DMVs are guarded inside the query + and any failure leaves the values NULL rather than failing the collector. */ + var (lockPagesInMemory, instantFileInit, memoryDumpCount) = + await ReadServerHealthAsync(sqlConnection, isAzureSqlDb, cancellationToken); + sqlSw.Stop(); var duckSw = Stopwatch.StartNew(); @@ -135,6 +146,9 @@ count from the service_objective string (e.g. "HS_Gen5_14" → 14). */ .AppendValue((string?)null) // enterprise_features — not collected in Lite (requires cross-database cursor) .AppendValue(serviceObjective) .AppendValue(vcoreCount) + .AppendValue(lockPagesInMemory) + .AppendValue(instantFileInit) + .AppendValue(memoryDumpCount) .EndRow(); rowsCollected++; } @@ -154,6 +168,126 @@ count from the service_objective string (e.g. "HS_Gen5_14" → 14). */ return rowsCollected; } + /// + /// Reads the WS5 server-health values — Lock Pages in Memory, Instant File Initialization, + /// and the SQL Server memory-dump count — best-effort from a single guarded batch. Each value + /// is left NULL when its source is unavailable (Azure SQL DB, older builds, or a missing + /// column/DMV), and any failure is swallowed so this never fails the server-properties + /// collection. Mirrors the defensive logic in install/53_collect_server_properties.sql. + /// + private async Task<(bool? lpim, bool? ifi, int? dumpCount)> ReadServerHealthAsync( + SqlConnection sqlConnection, bool isAzureSqlDb, CancellationToken cancellationToken) + { + if (isAzureSqlDb) + { + /* None of these sources are meaningful/available on Azure SQL DB. */ + return (null, null, null); + } + + /* LPIM: sql_memory_model_desc (LOCK_PAGES / LARGE_PAGES => on; CONVENTIONAL => off). + IFI: sys.dm_server_services.instant_file_initialization_enabled ('Y'/'N'), guarded on + DMV + column existence and read via dynamic SQL so older builds still compile. + Dumps: COUNT from sys.dm_server_memory_dumps, guarded on DMV existence. + Each branch is wrapped in TRY/CATCH inside the batch so one missing source does not + prevent reading the others. */ + const string query = @" +SET TRANSACTION ISOLATION LEVEL READ UNCOMMITTED; + +DECLARE + @lpim bit = NULL, + @ifi bit = NULL, + @dumps integer = NULL; + +BEGIN TRY + SELECT + @lpim = + CASE + WHEN osi.sql_memory_model IN (2, 3) + THEN CONVERT(bit, 1) + ELSE CONVERT(bit, 0) + END + FROM sys.dm_os_sys_info AS osi; +END TRY +BEGIN CATCH + SET @lpim = NULL; +END CATCH; + +IF OBJECT_ID(N'sys.dm_server_services', N'V') IS NOT NULL +AND EXISTS +( + SELECT + 1/0 + FROM sys.system_columns AS sc + WHERE sc.object_id = OBJECT_ID(N'sys.dm_server_services') + AND sc.name = N'instant_file_initialization_enabled' +) +BEGIN + BEGIN TRY + DECLARE + @ifi_sql nvarchar(max) = + N' + SELECT TOP (1) + @ifi_out = + CASE + WHEN ss.instant_file_initialization_enabled = N''Y'' + THEN CONVERT(bit, 1) + WHEN ss.instant_file_initialization_enabled = N''N'' + THEN CONVERT(bit, 0) + ELSE NULL + END + FROM sys.dm_server_services AS ss + WHERE ss.servicename LIKE N''SQL Server (%'';'; + + EXECUTE sys.sp_executesql + @ifi_sql, + N'@ifi_out bit OUTPUT', + @ifi_out = @ifi OUTPUT; + END TRY + BEGIN CATCH + SET @ifi = NULL; + END CATCH; +END; + +IF OBJECT_ID(N'sys.dm_server_memory_dumps', N'V') IS NOT NULL +BEGIN + BEGIN TRY + SELECT + @dumps = COUNT_BIG(*) + FROM sys.dm_server_memory_dumps AS smd; + END TRY + BEGIN CATCH + SET @dumps = NULL; + END CATCH; +END; + +SELECT + lock_pages_in_memory = @lpim, + instant_file_initialization_enabled = @ifi, + memory_dump_count = @dumps;"; + + try + { + using var command = new SqlCommand(query, sqlConnection); + command.CommandTimeout = CommandTimeoutSeconds; + + using var reader = await command.ExecuteReaderAsync(cancellationToken); + if (!await reader.ReadAsync(cancellationToken)) + { + return (null, null, null); + } + + bool? lpim = reader.IsDBNull(0) ? (bool?)null : reader.GetBoolean(0); + bool? ifi = reader.IsDBNull(1) ? (bool?)null : reader.GetBoolean(1); + int? dumpCount = reader.IsDBNull(2) ? (int?)null : reader.GetInt32(2); + return (lpim, ifi, dumpCount); + } + catch (Exception ex) + { + _logger?.LogDebug(ex, "ReadServerHealthAsync failed; leaving server-health values NULL"); + return (null, null, null); + } + } + /// /// Parses the vCore count from an Azure SQL DB service objective string. /// vCore tiers follow the pattern {Tier}_{Gen}_{VcoreCount} diff --git a/PerformanceMonitor.Analysis/FactAdvice.cs b/PerformanceMonitor.Analysis/FactAdvice.cs index c5d0ec1e..44ccb131 100644 --- a/PerformanceMonitor.Analysis/FactAdvice.cs +++ b/PerformanceMonitor.Analysis/FactAdvice.cs @@ -469,6 +469,36 @@ private static Dictionary BuildAdviceTable() Remediation: "Lower min server memory well below max so SQL can grow into the cap under load but release memory back to the OS when idle (min is best left at the default 0, or a modest floor only if you have a specific reason). This is NOT auto-applied — how low to set it is a workload judgement. Copy the statement, choose your floor, and run `sp_configure 'min server memory (MB)', ` + RECONFIGURE."); + // ───────────────────────────────────────────────────────────────── + // Server health (WS5) — advise-only: LPIM, IFI, memory dumps. + // No Apply (these need OS / service-account / investigation work); + // advice prose + copy-paste guidance only. + // ───────────────────────────────────────────────────────────────── + + t["CONFIG_IFI_DISABLED"] = new AdviceBlock( + Headline: + "Instant File Initialization is OFF — data file growth and restores zero-fill the whole allocation", + Investigation: + "Without Instant File Initialization (IFI) SQL Server writes zeros across every newly-allocated data-file region before it can be used — so every data-file autogrowth, every CREATE DATABASE, every RESTORE, and every TempDB re-creation at startup stalls for as long as it takes to zero the new space (minutes, on large files). The finding's metadata carries the service account that needs the right. Confirm the current state with `SELECT servicename, instant_file_initialization_enabled FROM sys.dm_server_services;` (SQL 2016 SP1+). IFI applies to DATA files only — log files are always zeroed regardless, which is why fixed-MB log autogrowth still matters.", + Remediation: + "Grant the SQL Server service account the Windows 'Perform volume maintenance tasks' user right (SeManageVolumePrivilege): secpol.msc → Local Policies → User Rights Assignment → Perform volume maintenance tasks → add the service account, then restart the SQL Server service for it to take effect. The SQL Server 2016+ setup wizard offers this as a checkbox; on Linux IFI is effectively always on (no action). This is an OS-level change, not a T-SQL one, so there is nothing to Apply from here — but it is one of the highest-value, lowest-risk changes you can make to file-growth and restore times."); + + t["CONFIG_LPIM_DISABLED"] = new AdviceBlock( + Headline: + "Lock Pages in Memory is OFF — the OS can page out SQL Server's buffer pool under memory pressure", + Investigation: + "Lock Pages in Memory (LPIM) prevents Windows from trimming SQL Server's working set — paging the buffer pool out to disk — when the OS comes under memory pressure. Without it, a memory-hungry neighbour process (or a runaway on the box) can force SQL's data cache out to the page file, and you see a sudden, unexplained plunge in Page Life Expectancy with a spike in hard faults that no SQL workload explains. This finding only fires on non-Express editions with meaningful RAM, where the buffer pool is large enough for paging to hurt. Read the current memory model with `SELECT sql_memory_model_desc FROM sys.dm_os_sys_info;` — CONVENTIONAL means LPIM is off; LOCK_PAGES or LARGE_PAGES means it is in effect.", + Remediation: + "LPIM is a judgement call, not an automatic win, which is why there is nothing to Apply from here. Grant it only alongside a correctly-configured 'max server memory (MB)' cap — LPIM with an uncapped max can starve the OS itself. To enable: grant the SQL Server service account the Windows 'Lock pages in memory' user right (secpol.msc → Local Policies → User Rights Assignment → Lock pages in memory), then restart the SQL Server service. It is most valuable on dedicated database hosts with large buffer pools; on a shared or memory-constrained box, fix 'max server memory' first and weigh LPIM second."); + + t["SERVER_MEMORY_DUMPS"] = new AdviceBlock( + Headline: + "SQL Server has written one or more memory dumps — the engine hit a condition it considered worth dumping", + Investigation: + "Every row in `sys.dm_server_memory_dumps` is a point where SQL Server detected something abnormal — an access violation, a non-yielding scheduler, a latch time-out, a failed assertion, or a stack corruption — and wrote a dump for diagnosis. The finding's metadata carries the dump count. List them with `SELECT filename, creation_time, size_in_bytes FROM sys.dm_server_memory_dumps ORDER BY creation_time DESC;` and correlate the creation_time against the SQL Server ERRORLOG (`EXEC sys.xp_readerrorlog 0, 1, N'dump';`) — the log entry around each dump names the failure type. A single old dump from a since-patched build is usually historical; recent or repeating dumps are a live reliability signal.", + Remediation: + "Dumps mean investigate, not a setting to flip — so there is nothing to Apply. First, get current on Cumulative Updates: a large share of dump-producing bugs are already fixed in later builds, and 'apply the latest CU and re-evaluate' resolves many cases outright. If dumps continue on a current build, match the ERRORLOG failure type to a known issue or open a case with Microsoft and attach the dump files — they are the artifact support needs. Watch the volume holding the dump directory: repeated large dumps can themselves fill the disk. Do not delete the dump files until you (or support) have read them."); + // ───────────────────────────────────────────────────────────────── // Jobs / disk / bad actors // ───────────────────────────────────────────────────────────────── diff --git a/PerformanceMonitor.Analysis/FactScorer.cs b/PerformanceMonitor.Analysis/FactScorer.cs index 1ad39d70..29de7eb5 100644 --- a/PerformanceMonitor.Analysis/FactScorer.cs +++ b/PerformanceMonitor.Analysis/FactScorer.cs @@ -234,9 +234,12 @@ private static double ScorePerfmonFact(Fact fact) /// /// Scores config-source advisory facts. FILE_AUTOGROWTH_PERCENT (WS3) is a base-0.3 - /// advisory; the four server-level config keys (WS3) score 0.4 ONLY when the value is bad, - /// and 0 otherwise — so audit_config still sees every CONFIG_* fact (it reads the raw value), - /// but only a BAD one roots a recommendation card (via InferenceEngine.ConfigAdvisoryRootKeys). + /// advisory; the four server-level config keys (WS3) and the three server-health keys (WS5: + /// CONFIG_IFI_DISABLED / CONFIG_LPIM_DISABLED / SERVER_MEMORY_DUMPS) score 0.4 ONLY when the + /// value is bad, and 0 otherwise — so audit_config still sees every CONFIG_* fact (it reads the + /// raw value), but only a BAD one roots a recommendation card (via + /// InferenceEngine.ConfigAdvisoryRootKeys). The WS5 keys are advise-only: there is no Apply, + /// only advice prose and copy-paste guidance. /// Edition is NOT needed to decide "bad" — only later for the recommended MAXDOP value. /// Every other "config"-source fact (SERVER_* / DATABASE_TOTAL_SIZE_MB / SERVER_HARDWARE / /// CONFIG_MAX_WORKER_THREADS / CONFIG_MIN_MEMORY_MB) is a leaf/amplifier with no base severity @@ -268,6 +271,24 @@ private static double ScoreConfigFact(Fact fact) case "CONFIG_MIN_MAX_MEMORY_NARROW": return 0.4; + // WS5 server-health advisories (advise-only — no Apply). Each carries the bad/good + // signal in Value so it scores its 0.4 advisory base only when bad and 0 otherwise; + // the noise-control gating (Express / small-RAM for LPIM, dumps>0, IFI-known) lives in + // the collectors so a fact that would score 0 is simply never emitted. + + // IFI off (Value == 0) is universally good advice — always advisory when known. + case "CONFIG_IFI_DISABLED": + return fact.Value == 0 ? 0.4 : 0.0; + + // LPIM off (Value == 0). The collector only emits this when it plausibly matters + // (not Express, meaningful RAM), so reaching the scorer with Value 0 is already a flag. + case "CONFIG_LPIM_DISABLED": + return fact.Value == 0 ? 0.4 : 0.0; + + // A memory dump always warrants a look — advisory when the count is > 0. + case "SERVER_MEMORY_DUMPS": + return fact.Value > 0 ? 0.4 : 0.0; + default: return 0.0; } diff --git a/PerformanceMonitor.Analysis/InferenceEngine.cs b/PerformanceMonitor.Analysis/InferenceEngine.cs index 437ce050..baa10dcf 100644 --- a/PerformanceMonitor.Analysis/InferenceEngine.cs +++ b/PerformanceMonitor.Analysis/InferenceEngine.cs @@ -45,6 +45,12 @@ public class InferenceEngine "CONFIG_CTFP", "CONFIG_MAX_MEMORY_MB", "CONFIG_MIN_MAX_MEMORY_NARROW", + // WS5: server-health advisories (advise-only). Each scores its 0.4 advisory base only when + // bad (FactScorer) and roots its own standalone card here, bypassing the 0.5 incident + // threshold — a standing server-health gap should surface on a quiet, healthy server. + "CONFIG_IFI_DISABLED", + "CONFIG_LPIM_DISABLED", + "SERVER_MEMORY_DUMPS", }; private readonly RelationshipGraph _graph; diff --git a/install/02_create_tables.sql b/install/02_create_tables.sql index ee016039..4e56f859 100644 --- a/install/02_create_tables.sql +++ b/install/02_create_tables.sql @@ -1501,6 +1501,9 @@ BEGIN is_clustered bit NULL, enterprise_features nvarchar(max) NULL, service_objective sysname NULL, + lock_pages_in_memory bit NULL, + instant_file_initialization_enabled bit NULL, + memory_dump_count integer NULL, row_hash binary(32) NULL, CONSTRAINT PK_server_properties diff --git a/install/53_collect_server_properties.sql b/install/53_collect_server_properties.sql index 9f56f3af..499da32d 100644 --- a/install/53_collect_server_properties.sql +++ b/install/53_collect_server_properties.sql @@ -54,7 +54,10 @@ BEGIN @error_message nvarchar(4000), @engine_edition integer = CONVERT(integer, SERVERPROPERTY(N'EngineEdition')), - @major_version integer; + @major_version integer, + @lock_pages_in_memory bit = NULL, + @instant_file_initialization_enabled bit = NULL, + @memory_dump_count integer = NULL; /* Parse major version for feature gating @@ -206,6 +209,101 @@ BEGIN DROP TABLE #sku_features; END; + /* + Server-health properties (WS5): Lock Pages in Memory, Instant File + Initialization, and SQL Server memory dump count. Each is captured + defensively and left NULL where the source is unavailable (older + builds, Azure SQL DB) so the collector never fails on a missing + DMV/column. + */ + + /* + LPIM: sys.dm_os_sys_info.sql_memory_model. 1 = CONVENTIONAL (off), + 2 = LOCK_PAGES, 3 = LARGE_PAGES (both mean LPIM is in effect). + sql_memory_model exists on all supported on-prem versions; it is not + meaningful on Azure SQL DB (engine edition 5), where it is left NULL. + */ + IF @engine_edition <> 5 + BEGIN + BEGIN TRY + SELECT + @lock_pages_in_memory = + CASE + WHEN osi.sql_memory_model IN (2, 3) + THEN CONVERT(bit, 1) + ELSE CONVERT(bit, 0) + END + FROM sys.dm_os_sys_info AS osi; + END TRY + BEGIN CATCH + SET @lock_pages_in_memory = NULL; + END CATCH; + END; + + /* + IFI: sys.dm_server_services.instant_file_initialization_enabled + ('Y'/'N'). The DMV is on-prem only and the column was added in later + builds (SQL 2016 SP1 / 2017+), so guard on both DMV and column + existence and read via dynamic SQL so older builds still compile. + servicename is matched with LIKE N'SQL Server (%' to cover the default + instance and named instances. + */ + IF @engine_edition <> 5 + AND OBJECT_ID(N'sys.dm_server_services', N'V') IS NOT NULL + AND EXISTS + ( + SELECT + 1/0 + FROM sys.system_columns AS sc + WHERE sc.object_id = OBJECT_ID(N'sys.dm_server_services') + AND sc.name = N'instant_file_initialization_enabled' + ) + BEGIN + BEGIN TRY + DECLARE + @ifi_sql nvarchar(max) = + N' + SELECT TOP (1) + @ifi_out = + CASE + WHEN ss.instant_file_initialization_enabled = N''Y'' + THEN CONVERT(bit, 1) + WHEN ss.instant_file_initialization_enabled = N''N'' + THEN CONVERT(bit, 0) + ELSE NULL + END + FROM sys.dm_server_services AS ss + WHERE ss.servicename LIKE N''SQL Server (%'';'; + + EXECUTE sys.sp_executesql + @ifi_sql, + N'@ifi_out bit OUTPUT', + @ifi_out = @instant_file_initialization_enabled OUTPUT; + END TRY + BEGIN CATCH + SET @instant_file_initialization_enabled = NULL; + END CATCH; + END; + + /* + Memory dumps: COUNT(*) from sys.dm_server_memory_dumps. On-prem only + (not exposed on Azure SQL DB) and unavailable on some + editions/versions, so guard on DMV existence and TRY/CATCH; a missing + DMV leaves the count NULL rather than zero. + */ + IF @engine_edition <> 5 + AND OBJECT_ID(N'sys.dm_server_memory_dumps', N'V') IS NOT NULL + BEGIN + BEGIN TRY + SELECT + @memory_dump_count = COUNT_BIG(*) + FROM sys.dm_server_memory_dumps AS smd; + END TRY + BEGIN CATCH + SET @memory_dump_count = NULL; + END CATCH; + END; + /* Deduplication: check if anything changed since last collection */ @@ -226,7 +324,10 @@ BEGIN @engine_edition, N'|', (SELECT osi.cpu_count FROM sys.dm_os_sys_info AS osi), N'|', (SELECT osi.physical_memory_kb FROM sys.dm_os_sys_info AS osi), N'|', - ISNULL(@enterprise_features, N'') + ISNULL(@enterprise_features, N''), N'|', + ISNULL(CONVERT(nvarchar(10), @lock_pages_in_memory), N'NULL'), N'|', + ISNULL(CONVERT(nvarchar(10), @instant_file_initialization_enabled), N'NULL'), N'|', + ISNULL(CONVERT(nvarchar(11), @memory_dump_count), N'NULL') ) ); @@ -286,6 +387,9 @@ BEGIN is_clustered, enterprise_features, service_objective, + lock_pages_in_memory, + instant_file_initialization_enabled, + memory_dump_count, row_hash ) SELECT @@ -318,6 +422,9 @@ BEGIN THEN CONVERT(sysname, DATABASEPROPERTYEX(DB_NAME(), N'ServiceObjective')) ELSE NULL END, + lock_pages_in_memory = @lock_pages_in_memory, + instant_file_initialization_enabled = @instant_file_initialization_enabled, + memory_dump_count = @memory_dump_count, row_hash = @current_hash FROM sys.dm_os_sys_info AS osi OPTION(RECOMPILE); diff --git a/upgrades/2.11.0-to-2.12.0/04_add_server_health_columns.sql b/upgrades/2.11.0-to-2.12.0/04_add_server_health_columns.sql new file mode 100644 index 00000000..db644078 --- /dev/null +++ b/upgrades/2.11.0-to-2.12.0/04_add_server_health_columns.sql @@ -0,0 +1,96 @@ +/* +Copyright 2026 Darling Data, LLC +https://www.erikdarling.com/ + +Upgrade from 2.11.0 to 2.12.0 +WS5: surface Lock Pages in Memory (LPIM), Instant File Initialization (IFI), +and SQL Server memory dumps as advise-only server-health recommendations. + +Adds three nullable columns to collect.server_properties so the +server_properties_collector can persist these values for the analysis path +(CONFIG_LPIM_DISABLED / CONFIG_IFI_DISABLED / SERVER_MEMORY_DUMPS facts). + +Idempotent and partial-failure safe: each column gets its own guarded ALTER, +so a re-run (or a re-run after an interrupted run) converges to the target +shape. Matches the IF NOT EXISTS (sys.columns) ... ALTER ADD style used in +install/02_create_tables.sql. +*/ + +SET ANSI_NULLS ON; +SET ANSI_PADDING ON; +SET ANSI_WARNINGS ON; +SET ARITHABORT ON; +SET CONCAT_NULL_YIELDS_NULL ON; +SET QUOTED_IDENTIFIER ON; +SET NUMERIC_ROUNDABORT OFF; +SET IMPLICIT_TRANSACTIONS OFF; +SET STATISTICS TIME, IO OFF; +GO + +USE PerformanceMonitor; +GO + +IF OBJECT_ID(N'collect.server_properties', N'U') IS NULL +BEGIN + PRINT 'collect.server_properties does not exist - no action taken'; +END; +ELSE +BEGIN + IF NOT EXISTS + ( + SELECT + 1/0 + FROM sys.columns AS c + WHERE c.object_id = OBJECT_ID(N'collect.server_properties') + AND c.name = N'lock_pages_in_memory' + ) + BEGIN + ALTER TABLE collect.server_properties + ADD lock_pages_in_memory bit NULL; + + PRINT 'Added lock_pages_in_memory to collect.server_properties'; + END; + ELSE + BEGIN + PRINT 'collect.server_properties.lock_pages_in_memory already exists - no action taken'; + END; + + IF NOT EXISTS + ( + SELECT + 1/0 + FROM sys.columns AS c + WHERE c.object_id = OBJECT_ID(N'collect.server_properties') + AND c.name = N'instant_file_initialization_enabled' + ) + BEGIN + ALTER TABLE collect.server_properties + ADD instant_file_initialization_enabled bit NULL; + + PRINT 'Added instant_file_initialization_enabled to collect.server_properties'; + END; + ELSE + BEGIN + PRINT 'collect.server_properties.instant_file_initialization_enabled already exists - no action taken'; + END; + + IF NOT EXISTS + ( + SELECT + 1/0 + FROM sys.columns AS c + WHERE c.object_id = OBJECT_ID(N'collect.server_properties') + AND c.name = N'memory_dump_count' + ) + BEGIN + ALTER TABLE collect.server_properties + ADD memory_dump_count integer NULL; + + PRINT 'Added memory_dump_count to collect.server_properties'; + END; + ELSE + BEGIN + PRINT 'collect.server_properties.memory_dump_count already exists - no action taken'; + END; +END; +GO diff --git a/upgrades/2.11.0-to-2.12.0/upgrade.txt b/upgrades/2.11.0-to-2.12.0/upgrade.txt index a2543b93..c25a3ef4 100644 --- a/upgrades/2.11.0-to-2.12.0/upgrade.txt +++ b/upgrades/2.11.0-to-2.12.0/upgrade.txt @@ -1,3 +1,4 @@ 01_extend_blocked_process_report_columns.sql 02_make_other_process_cpu_nullable.sql 03_add_transaction_mutex_ignored_wait.sql +04_add_server_health_columns.sql