From 4f3f4f23c5db0b0af61bb493c8968cdc23de0947 Mon Sep 17 00:00:00 2001 From: Joao Morais Date: Wed, 13 May 2026 15:31:04 -0300 Subject: [PATCH] adding logs and exponential backoff on execPod There are a few failures in the reported issue regarding execPod: * Timeout calling the router endpoint from the loopback interface of the router container; * Timeout resolving DNS; * A "container not found" error, suggesting a race between the presence of the router pod and the absence of its counterpart in the container runtime. It was not possible to be sure about the root cause of some failures because the router logs were missing; this update adds them. For the timeouts, the values are increased and now come from a const, which should give the router some more time to synchronize any pending configuration. Also, exponential backoff is added to the execPod call, so that an unexpected failure gives the environment another chance to recover. https://redhat.atlassian.net/browse/OCPBUGS-85696 --- test/extended/router/config_manager.go | 3 +- .../extended/router/config_manager_ingress.go | 28 ++++++++++++++++--- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/test/extended/router/config_manager.go b/test/extended/router/config_manager.go index 25fa4a44bc38..a04e75e9a8c7 100644 --- a/test/extended/router/config_manager.go +++ b/test/extended/router/config_manager.go @@ -31,6 +31,7 @@ import ( ) const timeoutSeconds = 3 * 60 +const fastTimeoutSeconds = 15 var _ = g.Describe("[sig-network][Feature:Router][apigroup:route.openshift.io]", func() { defer g.GinkgoRecover() @@ -862,7 +863,7 @@ func readURL(ns, execPodName, host, abspath, ipaddr string) (string, error) { return output, nil } -func waitForRouteToRespond(ns, execPodName, proto, host, abspath, ipaddr string, port int) error { +func waitForRouteToRespond(ns, execPodName, proto, host, abspath, ipaddr string, _ int) error { execPod := execPodRef{ NamespacedName: types.NamespacedName{ Namespace: ns, diff 
--git a/test/extended/router/config_manager_ingress.go b/test/extended/router/config_manager_ingress.go index 86b970845876..c74c0d02c4f9 100644 --- a/test/extended/router/config_manager_ingress.go +++ b/test/extended/router/config_manager_ingress.go @@ -87,6 +87,10 @@ var _ = g.Describe("[sig-network-edge][Feature:Router][apigroup:route.openshift. } else { outputEndpointSlice(epsList.Items...) } + if execPod.Name != "" { + // only dump logs if execPod was already assigned + exutil.DumpPodLogsStartingWithInNamespace(execPod.Name, execPod.Namespace, oc) + } } if controller.Name != "" { err := oc.AdminOperatorClient().OperatorV1().IngressControllers(controller.Namespace).Delete(ctx, controller.Name, *metav1.NewDeleteOptions(1)) @@ -700,7 +704,23 @@ func eventuallyRouteAllServers(execPod execPodRef, hostname string, secure bool, // execPodReadURL executes a `curl` in the provided exec pod, retuning the response code and response content. // In case the server response is empty, the response code is `0` and no error is reported. +// It makes 5 attempts to run in case of an error. 
func execPodReadURL(execPod execPodRef, host string, secure bool, abspath string) (code int, output string, err error) { + // 5x attempts, 2x factor, 2s first interval, so intervals between attempts are: 2s, 4s, 8s, 16s + backoff := wait.Backoff{ + Steps: 5, + Duration: 2 * time.Second, + Factor: 2, + } + _ = wait.ExponentialBackoff(backoff, func() (bool, error) { + code, output, err = innerExecPodReadURL(execPod, host, secure, abspath) + return err == nil, nil + }) + return +} + +// innerExecPodReadURL does the actual execPod call +func innerExecPodReadURL(execPod execPodRef, host string, secure bool, abspath string) (code int, output string, err error) { host = exutil.IPUrl(host) proto := "http" port := 80 @@ -709,7 +729,7 @@ func execPodReadURL(execPod execPodRef, host string, secure bool, abspath string port = 443 } uri := fmt.Sprintf("%s://%s:%d%s", proto, host, port, abspath) - cmd := fmt.Sprintf("curl -ksS -m 5 -w '\n%%{http_code}' --resolve %s:%d:%s %q", host, port, execPod.ipAddress, uri) + cmd := fmt.Sprintf("curl -ksS --max-time %d -w '\n%%{http_code}' --resolve %s:%d:%s %q", fastTimeoutSeconds, host, port, execPod.ipAddress, uri) output, err = e2eoutput.RunHostCmd(execPod.Namespace, execPod.Name, cmd) // Checking for curl's "(52) empty response from server", this means a FIN or RST from the server side. @@ -739,7 +759,7 @@ func execPodReadURL(execPod execPodRef, host string, secure bool, abspath string // if expectedCode is `0`, an empty response and FIN or RST is expected from the server side. 
func execPodWaitURL(ctx context.Context, execPod execPodRef, host string, secure bool, abspath string, expectedCode int, timeout time.Duration) (output string, err error) { err = wait.PollUntilContextTimeout(ctx, 2*time.Second, timeout, true, func(ctx context.Context) (done bool, err error) { - code, out, err := execPodReadURL(execPod, host, secure, abspath) + code, out, err := innerExecPodReadURL(execPod, host, secure, abspath) if err != nil || code != expectedCode { framework.Logf("URL is not ready. Expected code: %d; Response code: %d, err: %v", expectedCode, code, err) return false, nil @@ -858,7 +878,7 @@ func (r *routeStackBuilder) createDetachedService(ctx context.Context) (serviceN // we also need the deprecated Endpoints API, since router still uses it depending on the ROUTER_WATCH_ENDPOINTS envvar var epCurrent *corev1.Endpoints - err = wait.PollUntilContextTimeout(ctx, time.Second, 10*time.Second, false, func(ctx context.Context) (done bool, err error) { + err = wait.PollUntilContextTimeout(ctx, time.Second, fastTimeoutSeconds*time.Second, false, func(ctx context.Context) (done bool, err error) { epCurrent, err = r.kubeClient.CoreV1().Endpoints(svcCurrent.Namespace).Get(ctx, svcCurrent.Name, metav1.GetOptions{}) if err != nil { framework.Logf("error fetching Endpoints: %s", err.Error()) @@ -881,7 +901,7 @@ func (r *routeStackBuilder) createDetachedService(ctx context.Context) (serviceN } // EndpointSlice use to be created as soon as the Endpoints resource is created. Lets wait for it, and create ourselves in case it is missing - err = wait.PollUntilContextTimeout(ctx, time.Second, 10*time.Second, false, func(ctx context.Context) (done bool, err error) { + err = wait.PollUntilContextTimeout(ctx, time.Second, fastTimeoutSeconds*time.Second, false, func(ctx context.Context) (done bool, err error) { _, err = r.fetchEndpointSlice(ctx, serviceName) if err != nil { framework.Logf("error fetching EndpointSlice: %s", err.Error())