From 2425135ce75d5a58bcd17866284dc881afb9bb12 Mon Sep 17 00:00:00 2001 From: nocturnalastro Date: Tue, 31 Mar 2026 17:19:20 +0100 Subject: [PATCH] Add webhook service endpoint readiness check before creating default PtpOperatorConfig The operator could fail to create the default PtpOperatorConfig on startup due to a race condition: the local webhook server is ready but the Kubernetes Service endpoints are not yet populated, causing the API server's validating webhook call to fail with "no endpoints available". Without retry, the default config is never created and the linuxptp-daemon DaemonSet is never spawned. Extend waitForWebhookServer to also verify the webhook-service is reachable via cluster DNS (using a TCP dial) before proceeding to create the default config. This ensures the endpoint controller has had time to populate the Service endpoints. Use goroutine-local err variables to avoid a data race with mgr.Start, and check ctx.Done() so the goroutine terminates cleanly on manager shutdown. Generated-by: Cursor --- main.go | 71 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 23 deletions(-) diff --git a/main.go b/main.go index 6e1548232..8cf156f3e 100644 --- a/main.go +++ b/main.go @@ -21,6 +21,7 @@ import ( "crypto/tls" "flag" "fmt" + "net" "net/http" "os" "strings" @@ -269,17 +270,14 @@ func main() { } go func() { - // Wait until the webhook server is ready. setupLog.Info("waiting for validating webhook to be ready") - err = waitForWebhookServer(checker) - if err != nil { + if err := waitForWebhookServer(ctx, checker); err != nil { setupLog.Error(err, "unable to create default PtpOperatorConfig due to webhook not ready") - } else { - // create default before the webhook are setup - err = createDefaultOperatorConfig(ctrl.GetConfigOrDie()) - if err != nil { - setupLog.Error(err, "unable to create default PtpOperatorConfig") - } + return + } + + if err := createDefaultOperatorConfig(ctx, restConfig); err != nil { + setupLog.Error(err, "unable to create default PtpOperatorConfig") } }() setupLog.Info("starting manager") @@ -290,7 +288,7 @@ func main() { } -func createDefaultOperatorConfig(cfg *rest.Config) error { +func createDefaultOperatorConfig(ctx context.Context, cfg *rest.Config) error { logger := setupLog.WithName("createDefaultOperatorConfig") c, err := client.New(cfg, client.Options{Scheme: scheme}) if err != nil { @@ -301,7 +299,7 @@ func createDefaultOperatorConfig(cfg *rest.Config) error { DaemonNodeSelector: map[string]string{}, }, } - err = c.Get(context.TODO(), types.NamespacedName{ + err = c.Get(ctx, types.NamespacedName{ Name: names.DefaultOperatorConfigName, Namespace: names.Namespace}, config) if err != nil { @@ -309,7 +307,7 @@ func createDefaultOperatorConfig(cfg *rest.Config) error { logger.Info("Create default OperatorConfig") config.Namespace = names.Namespace config.Name = names.DefaultOperatorConfigName - err = c.Create(context.TODO(), config) + err = c.Create(ctx, config) if err != nil { return err } @@ -361,28 +359,55 @@ func fetchTLSConfig(cfg *rest.Config) (configv1.TLSProfileSpec, configv1.TLSAdhe return profileSpec, adherencePolicy, nil } -// waitForWebhookServer waits until the webhook server is ready. -func waitForWebhookServer(checker func(req *http.Request) error) error { +// waitForWebhookServer waits until the local webhook server is listening and +// the webhook-service is reachable via the cluster DNS. The latter is necessary +// because the Kubernetes endpoint controller populates the Service endpoints +// asynchronously; without this check, the API server may reject webhook calls +// with "no endpoints available". +func waitForWebhookServer(ctx context.Context, checker func(req *http.Request) error) error { const ( - timeout = 30 * time.Second // Adjust timeout as needed - pollingFreq = 1 * time.Second // Polling frequency + timeout = 60 * time.Second + pollingFreq = 1 * time.Second + dialTimeout = 2 * time.Second ) start := time.Now() + webhookServiceAddr := fmt.Sprintf("webhook-service.%s.svc:%d", names.Namespace, 443) - // Create an HTTP request to check the readiness of the webhook server. - req, err := http.NewRequest("GET", "https://localhost:9443/healthz", nil) + req, err := http.NewRequestWithContext(ctx, "GET", "https://localhost:9443/healthz", nil) if err != nil { return err } - // Poll the checker function until it returns nil (indicating success) - // or until the timeout is reached. for { if err = checker(req); err == nil { - return nil - } else if time.Since(start) > timeout { + break + } + if time.Since(start) > timeout { return fmt.Errorf("timeout waiting for webhook server to start") } - time.Sleep(pollingFreq) // Poll every second + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(pollingFreq): + } + } + + setupLog.Info("webhook server started, waiting for service endpoints") + + for { + conn, err := (&net.Dialer{Timeout: dialTimeout}).DialContext(ctx, "tcp", webhookServiceAddr) + if err == nil { + conn.Close() + setupLog.Info("webhook service endpoints are ready") + return nil + } + if time.Since(start) > timeout { + return fmt.Errorf("timeout waiting for webhook service endpoints to be ready") + } + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(pollingFreq): + } } }