From dc0e6a90bd373f50b8d862f260fdd25a98effa3f Mon Sep 17 00:00:00 2001 From: Cliff Schomburg Date: Thu, 25 Jun 2026 14:34:07 -0700 Subject: [PATCH 1/8] fix(grafanactl): reconcile stale Grafana datasource URLs during modify The modify datasource reconcile command only managed Azure Monitor Workspace integrations (resource IDs) but never checked the actual datasource URLs in Grafana. When an AMW Prometheus query endpoint hostname changes, datasource URLs become stale and dashboards fail with DNS resolution errors. After integration reconciliation, the command now lists Grafana datasources, compares each Managed_Prometheus_* URL against the current AMW PrometheusQueryEndpoint, and updates any that differ. Fixes: ARO-27914 Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/grafanactl/cmd/modify/cmd.go | 100 +++++++++++++++++--- tools/grafanactl/cmd/modify/options.go | 8 ++ tools/grafanactl/internal/grafana/client.go | 10 ++ 3 files changed, 103 insertions(+), 15 deletions(-) diff --git a/tools/grafanactl/cmd/modify/cmd.go b/tools/grafanactl/cmd/modify/cmd.go index 3583bd36..80b2ab25 100644 --- a/tools/grafanactl/cmd/modify/cmd.go +++ b/tools/grafanactl/cmd/modify/cmd.go @@ -84,15 +84,10 @@ func (opts *RawAddDatasourceOptions) Run(ctx context.Context) error { return completed.Run(ctx) } -func (o *CompletedAddDatasourceOptions) getMatchingWorkspaceIDs(ctx context.Context, logger logr.Logger) (set.Set[string], error) { +func getMatchingWorkspaceIDs(workspaces []armmonitor.AzureMonitorWorkspaceResource, logger logr.Logger) set.Set[string] { validWorkspaceIDs := set.New[string]() - monitorWorkspaces, err := o.MonitorWorkspaceClient.GetAllMonitorWorkspaces(ctx) - if err != nil { - return nil, fmt.Errorf("failed to list Azure Monitor Workspaces: %w", err) - } - - for _, workspace := range monitorWorkspaces { + for _, workspace := range workspaces { if workspace.Properties == nil || workspace.Properties.ProvisioningState == nil || workspace.ID == nil { continue } @@ -102,7 
+97,74 @@ func (o *CompletedAddDatasourceOptions) getMatchingWorkspaceIDs(ctx context.Cont } } - return validWorkspaceIDs, nil + return validWorkspaceIDs +} + +func getWorkspaceEndpoints(workspaces []armmonitor.AzureMonitorWorkspaceResource, logger logr.Logger) map[string]string { + endpoints := make(map[string]string) + + for _, workspace := range workspaces { + if workspace.Name == nil || workspace.Properties == nil || + workspace.Properties.ProvisioningState == nil || + workspace.Properties.Metrics == nil || + workspace.Properties.Metrics.PrometheusQueryEndpoint == nil { + continue + } + if *workspace.Properties.ProvisioningState == armmonitor.ProvisioningStateSucceeded { + name := strings.ToLower(*workspace.Name) + endpoints[name] = *workspace.Properties.Metrics.PrometheusQueryEndpoint + logger.Info("Found workspace endpoint", "workspace-name", *workspace.Name, "endpoint", endpoints[name]) + } + } + + return endpoints +} + +func (o *CompletedAddDatasourceOptions) reconcileDatasourceURLs(ctx context.Context, logger logr.Logger, workspaceEndpoints map[string]string) error { + datasources, err := o.GrafanaClient.ListDataSources(ctx) + if err != nil { + return fmt.Errorf("failed to list Grafana datasources: %w", err) + } + + for _, ds := range datasources { + if ds.Type != "prometheus" { + continue + } + + workspaceName := strings.TrimPrefix(ds.Name, "Managed_Prometheus_") + if workspaceName == ds.Name { + continue + } + + expectedEndpoint, ok := workspaceEndpoints[strings.ToLower(workspaceName)] + if !ok { + logger.Info("No matching workspace found for datasource, skipping URL check", "datasource-name", ds.Name) + continue + } + + if ds.URL == expectedEndpoint { + logger.Info("Datasource URL is current", "datasource-name", ds.Name, "url", ds.URL) + continue + } + + logger.Info("Datasource URL is stale, updating", + "datasource-name", ds.Name, + "current-url", ds.URL, + "expected-url", expectedEndpoint) + + if o.DryRun { + continue + } + + ds.URL = expectedEndpoint 
+ if err := o.GrafanaClient.UpdateDataSource(ctx, ds); err != nil { + return fmt.Errorf("failed to update datasource %q URL: %w", ds.Name, err) + } + + logger.Info("Updated datasource URL", "datasource-name", ds.Name, "new-url", expectedEndpoint) + } + + return nil } func (o *CompletedAddDatasourceOptions) Run(ctx context.Context) error { @@ -115,11 +177,13 @@ func (o *CompletedAddDatasourceOptions) Run(ctx context.Context) error { return fmt.Errorf("failed to get Grafana instance: %w", err) } - validWorkspaceIDs, err := o.getMatchingWorkspaceIDs(ctx, logger) + monitorWorkspaces, err := o.MonitorWorkspaceClient.GetAllMonitorWorkspaces(ctx) if err != nil { - return fmt.Errorf("failed to get valid workspace IDs: %w", err) + return fmt.Errorf("failed to list Azure Monitor Workspaces: %w", err) } + validWorkspaceIDs := getMatchingWorkspaceIDs(monitorWorkspaces, logger) + integrationList := set.New[string]() for _, integration := range grafana.Properties.GrafanaIntegrations.AzureMonitorWorkspaceIntegrations { if integration.AzureMonitorWorkspaceResourceID == nil { @@ -142,14 +206,20 @@ func (o *CompletedAddDatasourceOptions) Run(ctx context.Context) error { if o.DryRun { logger.Info("Dry run - would reconcile Azure Monitor Workspace integrations", "total-integrations", integrationList.Len()) - return nil + } else { + logger.Info("Reconciling Azure Monitor Workspace integrations", "total-integrations", integrationList.Len()) + + err = o.ManagedGrafanaClient.UpdateGrafanaIntegrations(ctx, o.ResourceGroup, o.GrafanaName, integrationList.UnsortedList()) + if err != nil { + return fmt.Errorf("failed to update Grafana integrations: %w", err) + } } - logger.Info("Reconciling Azure Monitor Workspace integrations", "total-integrations", integrationList.Len()) + workspaceEndpoints := getWorkspaceEndpoints(monitorWorkspaces, logger) - err = o.ManagedGrafanaClient.UpdateGrafanaIntegrations(ctx, o.ResourceGroup, o.GrafanaName, integrationList.UnsortedList()) - if err != nil { - 
return fmt.Errorf("failed to update Grafana integrations: %w", err) + logger.Info("Reconciling datasource URLs") + if err := o.reconcileDatasourceURLs(ctx, logger, workspaceEndpoints); err != nil { + return fmt.Errorf("failed to reconcile datasource URLs: %w", err) } return nil diff --git a/tools/grafanactl/cmd/modify/options.go b/tools/grafanactl/cmd/modify/options.go index 28cbd3d5..47eec6ee 100644 --- a/tools/grafanactl/cmd/modify/options.go +++ b/tools/grafanactl/cmd/modify/options.go @@ -23,6 +23,7 @@ import ( "github.com/Azure/ARO-Tools/tools/cmdutils" "github.com/Azure/ARO-Tools/tools/grafanactl/cmd/base" "github.com/Azure/ARO-Tools/tools/grafanactl/internal/azure" + "github.com/Azure/ARO-Tools/tools/grafanactl/internal/grafana" ) // RawAddDatasourceOptions represents the initial, unvalidated configuration for add datasource operations. @@ -48,6 +49,7 @@ type ValidatedAddDatasourceOptions struct { // for add datasource operations. type CompletedAddDatasourceOptions struct { *validatedAddDatasourceOptions + GrafanaClient *grafana.Client MonitorWorkspaceClient *azure.MonitorWorkspaceClient ManagedGrafanaClient *azure.ManagedGrafanaClient } @@ -107,6 +109,11 @@ func (o *ValidatedAddDatasourceOptions) Complete(ctx context.Context) (*Complete return nil, fmt.Errorf("failed to create managed Grafana client: %w", err) } + grafanaClient, err := grafana.NewClient(ctx, cred, managedGrafanaClient, o.SubscriptionID, o.ResourceGroup, o.GrafanaName) + if err != nil { + return nil, fmt.Errorf("failed to create Grafana client: %w", err) + } + monitorWorkspaceClient, err := azure.NewMonitorWorkspaceClient(o.SubscriptionID, cred, clientOpts) if err != nil { return nil, fmt.Errorf("failed to create monitor workspace client: %w", err) @@ -114,6 +121,7 @@ func (o *ValidatedAddDatasourceOptions) Complete(ctx context.Context) (*Complete return &CompletedAddDatasourceOptions{ validatedAddDatasourceOptions: o.validatedAddDatasourceOptions, + GrafanaClient: grafanaClient, 
MonitorWorkspaceClient: monitorWorkspaceClient, ManagedGrafanaClient: managedGrafanaClient, }, nil diff --git a/tools/grafanactl/internal/grafana/client.go b/tools/grafanactl/internal/grafana/client.go index 417d23f5..1f1683f0 100644 --- a/tools/grafanactl/internal/grafana/client.go +++ b/tools/grafanactl/internal/grafana/client.go @@ -108,6 +108,16 @@ func (c *Client) DeleteDataSource(ctx context.Context, dataSourceName string) er return nil } +// UpdateDataSource updates a datasource in the Grafana instance. +func (c *Client) UpdateDataSource(ctx context.Context, ds sdk.Datasource) error { + _, err := c.grafanaClient.UpdateDatasource(ctx, ds) + if err != nil { + return fmt.Errorf("failed to update datasource: %w", err) + } + + return nil +} + // ListFolders returns all folders in the Grafana instance. func (c *Client) ListFolders(ctx context.Context) ([]sdk.Folder, error) { folders, err := c.grafanaClient.GetAllFolders(ctx) From 0fb370bc8d58d5d9d19e0bc33b9516661322d4c3 Mon Sep 17 00:00:00 2001 From: Cliff Schomburg Date: Thu, 25 Jun 2026 21:02:05 -0700 Subject: [PATCH 2/8] fix(grafanactl): clarify dry-run vs live log messages for datasource URL updates Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/grafanactl/cmd/modify/cmd.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/grafanactl/cmd/modify/cmd.go b/tools/grafanactl/cmd/modify/cmd.go index 80b2ab25..c0501b09 100644 --- a/tools/grafanactl/cmd/modify/cmd.go +++ b/tools/grafanactl/cmd/modify/cmd.go @@ -147,21 +147,23 @@ func (o *CompletedAddDatasourceOptions) reconcileDatasourceURLs(ctx context.Cont continue } - logger.Info("Datasource URL is stale, updating", - "datasource-name", ds.Name, - "current-url", ds.URL, - "expected-url", expectedEndpoint) - if o.DryRun { + logger.Info("Dry run - would update stale datasource URL", + "datasource-name", ds.Name, + "current-url", ds.URL, + "expected-url", expectedEndpoint) continue } + logger.Info("Updating stale 
datasource URL", + "datasource-name", ds.Name, + "current-url", ds.URL, + "expected-url", expectedEndpoint) + ds.URL = expectedEndpoint if err := o.GrafanaClient.UpdateDataSource(ctx, ds); err != nil { return fmt.Errorf("failed to update datasource %q URL: %w", ds.Name, err) } - - logger.Info("Updated datasource URL", "datasource-name", ds.Name, "new-url", expectedEndpoint) } return nil From 24084fc5e974b517fe56d583f1cf8a705882672d Mon Sep 17 00:00:00 2001 From: Cliff Schomburg Date: Fri, 26 Jun 2026 09:25:36 -0700 Subject: [PATCH 3/8] fix(grafanactl): reconcile stale URLs and delete orphaned Grafana datasources The modify datasource reconcile command only managed Azure Monitor Workspace integrations (resource IDs) but never checked the actual datasource URLs or removed orphaned datasources in Grafana. After integration reconciliation, the command now: - Updates datasource URLs when the AMW PrometheusQueryEndpoint has changed (fixes DNS resolution errors from stale hostnames) - Deletes orphaned Managed_Prometheus_* datasources whose workspaces no longer exist Both operations respect --dry-run. This consolidates the cleanup previously handled by the separate clean fixup-datasources command into the pipeline-integrated reconcile step. 
Fixes: ARO-27914 Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/grafanactl/cmd/modify/cmd.go | 116 +++++++++++++++++--- tools/grafanactl/cmd/modify/options.go | 8 ++ tools/grafanactl/internal/grafana/client.go | 10 ++ 3 files changed, 119 insertions(+), 15 deletions(-) diff --git a/tools/grafanactl/cmd/modify/cmd.go b/tools/grafanactl/cmd/modify/cmd.go index 3583bd36..1b3c663f 100644 --- a/tools/grafanactl/cmd/modify/cmd.go +++ b/tools/grafanactl/cmd/modify/cmd.go @@ -16,6 +16,7 @@ package modify import ( "context" + "errors" "fmt" "strings" @@ -84,15 +85,10 @@ func (opts *RawAddDatasourceOptions) Run(ctx context.Context) error { return completed.Run(ctx) } -func (o *CompletedAddDatasourceOptions) getMatchingWorkspaceIDs(ctx context.Context, logger logr.Logger) (set.Set[string], error) { +func getMatchingWorkspaceIDs(workspaces []armmonitor.AzureMonitorWorkspaceResource, logger logr.Logger) set.Set[string] { validWorkspaceIDs := set.New[string]() - monitorWorkspaces, err := o.MonitorWorkspaceClient.GetAllMonitorWorkspaces(ctx) - if err != nil { - return nil, fmt.Errorf("failed to list Azure Monitor Workspaces: %w", err) - } - - for _, workspace := range monitorWorkspaces { + for _, workspace := range workspaces { if workspace.Properties == nil || workspace.Properties.ProvisioningState == nil || workspace.ID == nil { continue } @@ -102,7 +98,89 @@ func (o *CompletedAddDatasourceOptions) getMatchingWorkspaceIDs(ctx context.Cont } } - return validWorkspaceIDs, nil + return validWorkspaceIDs +} + +func getWorkspaceEndpoints(workspaces []armmonitor.AzureMonitorWorkspaceResource, logger logr.Logger) map[string]string { + endpoints := make(map[string]string) + + for _, workspace := range workspaces { + if workspace.Name == nil || workspace.Properties == nil || + workspace.Properties.ProvisioningState == nil || + workspace.Properties.Metrics == nil || + workspace.Properties.Metrics.PrometheusQueryEndpoint == nil { + continue + } + if 
*workspace.Properties.ProvisioningState == armmonitor.ProvisioningStateSucceeded { + name := strings.ToLower(*workspace.Name) + endpoints[name] = *workspace.Properties.Metrics.PrometheusQueryEndpoint + logger.Info("Found workspace endpoint", "workspace-name", *workspace.Name, "endpoint", endpoints[name]) + } + } + + return endpoints +} + +func (o *CompletedAddDatasourceOptions) reconcileDatasources(ctx context.Context, logger logr.Logger, workspaceEndpoints map[string]string) error { + datasources, err := o.GrafanaClient.ListDataSources(ctx) + if err != nil { + return fmt.Errorf("failed to list Grafana datasources: %w", err) + } + + var deleteErrors error + for _, ds := range datasources { + if ds.Type != "prometheus" { + continue + } + + workspaceName := strings.TrimPrefix(ds.Name, "Managed_Prometheus_") + if workspaceName == ds.Name { + continue + } + + expectedEndpoint, ok := workspaceEndpoints[strings.ToLower(workspaceName)] + if !ok { + if o.DryRun { + logger.Info("Dry run - would delete orphaned datasource", "datasource-name", ds.Name) + continue + } + + logger.Info("Deleting orphaned datasource", "datasource-name", ds.Name) + if err := o.GrafanaClient.DeleteDataSource(ctx, ds.Name); err != nil { + deleteErrors = errors.Join(deleteErrors, fmt.Errorf("failed to delete datasource %q: %w", ds.Name, err)) + } + continue + } + + if ds.URL == expectedEndpoint { + logger.Info("Datasource URL is current", "datasource-name", ds.Name, "url", ds.URL) + continue + } + + if o.DryRun { + logger.Info("Dry run - would update stale datasource URL", + "datasource-name", ds.Name, + "current-url", ds.URL, + "expected-url", expectedEndpoint) + continue + } + + logger.Info("Updating stale datasource URL", + "datasource-name", ds.Name, + "current-url", ds.URL, + "expected-url", expectedEndpoint) + + ds.URL = expectedEndpoint + if err := o.GrafanaClient.UpdateDataSource(ctx, ds); err != nil { + return fmt.Errorf("failed to update datasource %q URL: %w", ds.Name, err) + } + } + + if 
deleteErrors != nil { + return fmt.Errorf("failed to delete orphaned datasources: %w", deleteErrors) + } + + return nil } func (o *CompletedAddDatasourceOptions) Run(ctx context.Context) error { @@ -115,11 +193,13 @@ func (o *CompletedAddDatasourceOptions) Run(ctx context.Context) error { return fmt.Errorf("failed to get Grafana instance: %w", err) } - validWorkspaceIDs, err := o.getMatchingWorkspaceIDs(ctx, logger) + monitorWorkspaces, err := o.MonitorWorkspaceClient.GetAllMonitorWorkspaces(ctx) if err != nil { - return fmt.Errorf("failed to get valid workspace IDs: %w", err) + return fmt.Errorf("failed to list Azure Monitor Workspaces: %w", err) } + validWorkspaceIDs := getMatchingWorkspaceIDs(monitorWorkspaces, logger) + integrationList := set.New[string]() for _, integration := range grafana.Properties.GrafanaIntegrations.AzureMonitorWorkspaceIntegrations { if integration.AzureMonitorWorkspaceResourceID == nil { @@ -142,14 +222,20 @@ func (o *CompletedAddDatasourceOptions) Run(ctx context.Context) error { if o.DryRun { logger.Info("Dry run - would reconcile Azure Monitor Workspace integrations", "total-integrations", integrationList.Len()) - return nil + } else { + logger.Info("Reconciling Azure Monitor Workspace integrations", "total-integrations", integrationList.Len()) + + err = o.ManagedGrafanaClient.UpdateGrafanaIntegrations(ctx, o.ResourceGroup, o.GrafanaName, integrationList.UnsortedList()) + if err != nil { + return fmt.Errorf("failed to update Grafana integrations: %w", err) + } } - logger.Info("Reconciling Azure Monitor Workspace integrations", "total-integrations", integrationList.Len()) + workspaceEndpoints := getWorkspaceEndpoints(monitorWorkspaces, logger) - err = o.ManagedGrafanaClient.UpdateGrafanaIntegrations(ctx, o.ResourceGroup, o.GrafanaName, integrationList.UnsortedList()) - if err != nil { - return fmt.Errorf("failed to update Grafana integrations: %w", err) + logger.Info("Reconciling datasources") + if err := o.reconcileDatasources(ctx, 
logger, workspaceEndpoints); err != nil { + return fmt.Errorf("failed to reconcile datasources: %w", err) } return nil diff --git a/tools/grafanactl/cmd/modify/options.go b/tools/grafanactl/cmd/modify/options.go index 28cbd3d5..47eec6ee 100644 --- a/tools/grafanactl/cmd/modify/options.go +++ b/tools/grafanactl/cmd/modify/options.go @@ -23,6 +23,7 @@ import ( "github.com/Azure/ARO-Tools/tools/cmdutils" "github.com/Azure/ARO-Tools/tools/grafanactl/cmd/base" "github.com/Azure/ARO-Tools/tools/grafanactl/internal/azure" + "github.com/Azure/ARO-Tools/tools/grafanactl/internal/grafana" ) // RawAddDatasourceOptions represents the initial, unvalidated configuration for add datasource operations. @@ -48,6 +49,7 @@ type ValidatedAddDatasourceOptions struct { // for add datasource operations. type CompletedAddDatasourceOptions struct { *validatedAddDatasourceOptions + GrafanaClient *grafana.Client MonitorWorkspaceClient *azure.MonitorWorkspaceClient ManagedGrafanaClient *azure.ManagedGrafanaClient } @@ -107,6 +109,11 @@ func (o *ValidatedAddDatasourceOptions) Complete(ctx context.Context) (*Complete return nil, fmt.Errorf("failed to create managed Grafana client: %w", err) } + grafanaClient, err := grafana.NewClient(ctx, cred, managedGrafanaClient, o.SubscriptionID, o.ResourceGroup, o.GrafanaName) + if err != nil { + return nil, fmt.Errorf("failed to create Grafana client: %w", err) + } + monitorWorkspaceClient, err := azure.NewMonitorWorkspaceClient(o.SubscriptionID, cred, clientOpts) if err != nil { return nil, fmt.Errorf("failed to create monitor workspace client: %w", err) @@ -114,6 +121,7 @@ func (o *ValidatedAddDatasourceOptions) Complete(ctx context.Context) (*Complete return &CompletedAddDatasourceOptions{ validatedAddDatasourceOptions: o.validatedAddDatasourceOptions, + GrafanaClient: grafanaClient, MonitorWorkspaceClient: monitorWorkspaceClient, ManagedGrafanaClient: managedGrafanaClient, }, nil diff --git a/tools/grafanactl/internal/grafana/client.go 
b/tools/grafanactl/internal/grafana/client.go index 417d23f5..1f1683f0 100644 --- a/tools/grafanactl/internal/grafana/client.go +++ b/tools/grafanactl/internal/grafana/client.go @@ -108,6 +108,16 @@ func (c *Client) DeleteDataSource(ctx context.Context, dataSourceName string) er return nil } +// UpdateDataSource updates a datasource in the Grafana instance. +func (c *Client) UpdateDataSource(ctx context.Context, ds sdk.Datasource) error { + _, err := c.grafanaClient.UpdateDatasource(ctx, ds) + if err != nil { + return fmt.Errorf("failed to update datasource: %w", err) + } + + return nil +} + // ListFolders returns all folders in the Grafana instance. func (c *Client) ListFolders(ctx context.Context) ([]sdk.Folder, error) { folders, err := c.grafanaClient.GetAllFolders(ctx) From 7ff0c4a64c4a55803a3cf92b8944cd875fb4d6bf Mon Sep 17 00:00:00 2001 From: Cliff Schomburg Date: Fri, 26 Jun 2026 09:47:12 -0700 Subject: [PATCH 4/8] fix(grafanactl): use workspace existence for orphan detection, not endpoint presence A workspace that exists but has no PrometheusQueryEndpoint yet would be kept as an integration (causing Grafana to maintain its datasource) but treated as orphaned by datasource reconciliation (causing deletion). This created a delete-recreate loop. Now orphan detection checks whether a workspace with a matching name exists in the Succeeded provisioning state, regardless of whether it has an endpoint, and skips only the URL comparison for workspaces that do not have an endpoint yet. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/grafanactl/cmd/modify/cmd.go | 31 ++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/tools/grafanactl/cmd/modify/cmd.go b/tools/grafanactl/cmd/modify/cmd.go index 1b3c663f..9a608cc3 100644 --- a/tools/grafanactl/cmd/modify/cmd.go +++ b/tools/grafanactl/cmd/modify/cmd.go @@ -101,6 +101,21 @@ func getMatchingWorkspaceIDs(workspaces []armmonitor.AzureMonitorWorkspaceResour return validWorkspaceIDs } +func getActiveWorkspaceNames(workspaces []armmonitor.AzureMonitorWorkspaceResource) set.Set[string] { + names := set.New[string]() + + for _, workspace := range workspaces { + if workspace.Name == nil || workspace.Properties == nil || workspace.Properties.ProvisioningState == nil { + continue + } + if *workspace.Properties.ProvisioningState == armmonitor.ProvisioningStateSucceeded { + names.Insert(strings.ToLower(*workspace.Name)) + } + } + + return names +} + func getWorkspaceEndpoints(workspaces []armmonitor.AzureMonitorWorkspaceResource, logger logr.Logger) map[string]string { endpoints := make(map[string]string) @@ -121,7 +136,7 @@ func getWorkspaceEndpoints(workspaces []armmonitor.AzureMonitorWorkspaceResource return endpoints } -func (o *CompletedAddDatasourceOptions) reconcileDatasources(ctx context.Context, logger logr.Logger, workspaceEndpoints map[string]string) error { +func (o *CompletedAddDatasourceOptions) reconcileDatasources(ctx context.Context, logger logr.Logger, activeWorkspaceNames set.Set[string], workspaceEndpoints map[string]string) error { datasources, err := o.GrafanaClient.ListDataSources(ctx) if err != nil { return fmt.Errorf("failed to list Grafana datasources: %w", err) @@ -138,8 +153,9 @@ func (o *CompletedAddDatasourceOptions) reconcileDatasources(ctx context.Context continue } - expectedEndpoint, ok := workspaceEndpoints[strings.ToLower(workspaceName)] - if !ok { + lowerName := strings.ToLower(workspaceName) + + if 
!activeWorkspaceNames.Has(lowerName) { if o.DryRun { logger.Info("Dry run - would delete orphaned datasource", "datasource-name", ds.Name) continue @@ -152,6 +168,12 @@ func (o *CompletedAddDatasourceOptions) reconcileDatasources(ctx context.Context continue } + expectedEndpoint, ok := workspaceEndpoints[lowerName] + if !ok { + logger.Info("Workspace exists but has no Prometheus endpoint yet, skipping", "datasource-name", ds.Name) + continue + } + if ds.URL == expectedEndpoint { logger.Info("Datasource URL is current", "datasource-name", ds.Name, "url", ds.URL) continue @@ -231,10 +253,11 @@ func (o *CompletedAddDatasourceOptions) Run(ctx context.Context) error { } } + activeWorkspaceNames := getActiveWorkspaceNames(monitorWorkspaces) workspaceEndpoints := getWorkspaceEndpoints(monitorWorkspaces, logger) logger.Info("Reconciling datasources") - if err := o.reconcileDatasources(ctx, logger, workspaceEndpoints); err != nil { + if err := o.reconcileDatasources(ctx, logger, activeWorkspaceNames, workspaceEndpoints); err != nil { return fmt.Errorf("failed to reconcile datasources: %w", err) } From 766c047d4c419fc5c01cf2e5367c11d3c6b49bc4 Mon Sep 17 00:00:00 2001 From: Cliff Schomburg Date: Fri, 26 Jun 2026 11:59:50 -0700 Subject: [PATCH 5/8] fix(grafanactl): treat transitional provisioning states as active Workspaces can be in transitional states like Creating or Updating intermittently. Only exclude workspaces in terminal failure states (Failed, Canceled) from integration and orphan detection, so that workspaces being updated are not temporarily removed and recreated. getWorkspaceEndpoints still requires Succeeded since transitional workspaces may not have a valid PrometheusQueryEndpoint yet. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/grafanactl/cmd/modify/cmd.go | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tools/grafanactl/cmd/modify/cmd.go b/tools/grafanactl/cmd/modify/cmd.go index 9a608cc3..85be67ad 100644 --- a/tools/grafanactl/cmd/modify/cmd.go +++ b/tools/grafanactl/cmd/modify/cmd.go @@ -85,6 +85,15 @@ func (opts *RawAddDatasourceOptions) Run(ctx context.Context) error { return completed.Run(ctx) } +func isTerminalFailureState(state armmonitor.ProvisioningState) bool { + switch state { + case armmonitor.ProvisioningStateFailed, armmonitor.ProvisioningStateCanceled: + return true + default: + return false + } +} + func getMatchingWorkspaceIDs(workspaces []armmonitor.AzureMonitorWorkspaceResource, logger logr.Logger) set.Set[string] { validWorkspaceIDs := set.New[string]() @@ -92,10 +101,13 @@ func getMatchingWorkspaceIDs(workspaces []armmonitor.AzureMonitorWorkspaceResour if workspace.Properties == nil || workspace.Properties.ProvisioningState == nil || workspace.ID == nil { continue } - if *workspace.Properties.ProvisioningState == armmonitor.ProvisioningStateSucceeded { - logger.Info("Found", "workspace-id", *workspace.ID, "provisioning-state", *workspace.Properties.ProvisioningState) - validWorkspaceIDs.Insert(strings.ToLower(*workspace.ID)) + state := *workspace.Properties.ProvisioningState + if isTerminalFailureState(state) { + logger.Info("Skipping workspace in terminal failure state", "workspace-id", *workspace.ID, "provisioning-state", state) + continue } + logger.Info("Found", "workspace-id", *workspace.ID, "provisioning-state", state) + validWorkspaceIDs.Insert(strings.ToLower(*workspace.ID)) } return validWorkspaceIDs @@ -108,7 +120,7 @@ func getActiveWorkspaceNames(workspaces []armmonitor.AzureMonitorWorkspaceResour if workspace.Name == nil || workspace.Properties == nil || workspace.Properties.ProvisioningState == nil { continue } - if *workspace.Properties.ProvisioningState == 
armmonitor.ProvisioningStateSucceeded { + if !isTerminalFailureState(*workspace.Properties.ProvisioningState) { names.Insert(strings.ToLower(*workspace.Name)) } } From 986da23b98cce41dcee76672bce4877a4fa1dbb1 Mon Sep 17 00:00:00 2001 From: Cliff Schomburg Date: Fri, 26 Jun 2026 12:40:21 -0700 Subject: [PATCH 6/8] fix(grafanactl): improve error handling in datasource reconciliation - Include datasource name and ID in UpdateDataSource error messages for easier troubleshooting - Collect all reconciliation errors (both deletes and updates) using errors.Join instead of returning on the first update failure, which would silently drop accumulated delete errors Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/grafanactl/cmd/modify/cmd.go | 10 +++++----- tools/grafanactl/internal/grafana/client.go | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/grafanactl/cmd/modify/cmd.go b/tools/grafanactl/cmd/modify/cmd.go index 85be67ad..9d01c9c2 100644 --- a/tools/grafanactl/cmd/modify/cmd.go +++ b/tools/grafanactl/cmd/modify/cmd.go @@ -154,7 +154,7 @@ func (o *CompletedAddDatasourceOptions) reconcileDatasources(ctx context.Context return fmt.Errorf("failed to list Grafana datasources: %w", err) } - var deleteErrors error + var reconcileErrors error for _, ds := range datasources { if ds.Type != "prometheus" { continue @@ -175,7 +175,7 @@ func (o *CompletedAddDatasourceOptions) reconcileDatasources(ctx context.Context logger.Info("Deleting orphaned datasource", "datasource-name", ds.Name) if err := o.GrafanaClient.DeleteDataSource(ctx, ds.Name); err != nil { - deleteErrors = errors.Join(deleteErrors, fmt.Errorf("failed to delete datasource %q: %w", ds.Name, err)) + reconcileErrors = errors.Join(reconcileErrors, fmt.Errorf("failed to delete datasource %q: %w", ds.Name, err)) } continue } @@ -206,12 +206,12 @@ func (o *CompletedAddDatasourceOptions) reconcileDatasources(ctx context.Context ds.URL = expectedEndpoint if err := 
o.GrafanaClient.UpdateDataSource(ctx, ds); err != nil { - return fmt.Errorf("failed to update datasource %q URL: %w", ds.Name, err) + reconcileErrors = errors.Join(reconcileErrors, fmt.Errorf("failed to update datasource %q URL: %w", ds.Name, err)) } } - if deleteErrors != nil { - return fmt.Errorf("failed to delete orphaned datasources: %w", deleteErrors) + if reconcileErrors != nil { + return fmt.Errorf("failed to reconcile datasources: %w", reconcileErrors) } return nil diff --git a/tools/grafanactl/internal/grafana/client.go b/tools/grafanactl/internal/grafana/client.go index 1f1683f0..dac992eb 100644 --- a/tools/grafanactl/internal/grafana/client.go +++ b/tools/grafanactl/internal/grafana/client.go @@ -112,7 +112,7 @@ func (c *Client) DeleteDataSource(ctx context.Context, dataSourceName string) er func (c *Client) UpdateDataSource(ctx context.Context, ds sdk.Datasource) error { _, err := c.grafanaClient.UpdateDatasource(ctx, ds) if err != nil { - return fmt.Errorf("failed to update datasource: %w", err) + return fmt.Errorf("failed to update datasource %q (ID %d): %w", ds.Name, ds.ID, err) } return nil From 528fc72c66605135d42999a151a907f21459c662 Mon Sep 17 00:00:00 2001 From: Cliff Schomburg Date: Fri, 26 Jun 2026 12:45:10 -0700 Subject: [PATCH 7/8] fix(grafanactl): use endpoint from workspaces in transitional states A workspace transitioning through Updating still has its PrometheusQueryEndpoint from the previous Succeeded state. Use isTerminalFailureState instead of requiring Succeeded, and rely on the existing nil guard for workspaces that genuinely lack an endpoint. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/grafanactl/cmd/modify/cmd.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/grafanactl/cmd/modify/cmd.go b/tools/grafanactl/cmd/modify/cmd.go index 9d01c9c2..7c597db2 100644 --- a/tools/grafanactl/cmd/modify/cmd.go +++ b/tools/grafanactl/cmd/modify/cmd.go @@ -138,11 +138,12 @@ func getWorkspaceEndpoints(workspaces []armmonitor.AzureMonitorWorkspaceResource workspace.Properties.Metrics.PrometheusQueryEndpoint == nil { continue } - if *workspace.Properties.ProvisioningState == armmonitor.ProvisioningStateSucceeded { - name := strings.ToLower(*workspace.Name) - endpoints[name] = *workspace.Properties.Metrics.PrometheusQueryEndpoint - logger.Info("Found workspace endpoint", "workspace-name", *workspace.Name, "endpoint", endpoints[name]) + if isTerminalFailureState(*workspace.Properties.ProvisioningState) { + continue } + name := strings.ToLower(*workspace.Name) + endpoints[name] = *workspace.Properties.Metrics.PrometheusQueryEndpoint + logger.Info("Found workspace endpoint", "workspace-name", *workspace.Name, "endpoint", endpoints[name]) } return endpoints From 3b325248632baf668f6a4a8d634f1294471eafd0 Mon Sep 17 00:00:00 2001 From: Cliff Schomburg Date: Fri, 26 Jun 2026 12:54:09 -0700 Subject: [PATCH 8/8] fix(grafanactl): only delete datasources for workspaces that no longer exist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Orphan detection now includes all workspaces regardless of provisioning state. A workspace in Failed or Canceled state still exists as an Azure resource, so its datasource should be preserved — it may help identify broken clusters through the Grafana UI, and the pipeline will restore everything when the workspace is fixed. Datasources are only deleted when the workspace is truly gone (not returned by the API at all). Provisioning state filtering remains only for integration reconciliation and endpoint URL updates. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/grafanactl/cmd/modify/cmd.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/grafanactl/cmd/modify/cmd.go b/tools/grafanactl/cmd/modify/cmd.go index 7c597db2..5b952f9a 100644 --- a/tools/grafanactl/cmd/modify/cmd.go +++ b/tools/grafanactl/cmd/modify/cmd.go @@ -117,12 +117,10 @@ func getActiveWorkspaceNames(workspaces []armmonitor.AzureMonitorWorkspaceResour names := set.New[string]() for _, workspace := range workspaces { - if workspace.Name == nil || workspace.Properties == nil || workspace.Properties.ProvisioningState == nil { + if workspace.Name == nil { continue } - if !isTerminalFailureState(*workspace.Properties.ProvisioningState) { - names.Insert(strings.ToLower(*workspace.Name)) - } + names.Insert(strings.ToLower(*workspace.Name)) } return names