From 5b0beed97a8f65171bf46be046fb004833fba722 Mon Sep 17 00:00:00 2001
From: Ned Petrov
Date: Mon, 15 Jun 2026 08:17:51 +0300
Subject: [PATCH] Fix two flaky Windows unit tests in jobsupervisor

Windows Job Supervisor (AfterEach/AfterSuite):
After Stop()+RemoveAllJobs(), the Windows SCM marks services deleted but
the underlying WinSW/pipe.exe process may still hold file handles on
job-service-wrapper.exe and its log files briefly. With multiple services
and flapping restart cycles on a busy CI runner, this can exceed 60
seconds. Increase the Eventually timeout to 2 minutes with a 1s poll
interval, and apply the same pattern to logDir (AfterEach) and TempDir
(AfterSuite) which previously used a hard Expect.

Monit Job Supervisor (StopAndWait timer test):
The test used a racy Eventually(WatcherCount)+Increment pattern. Between
the Eventually seeing count=2 and Increment running, the goroutine could
transition through checkServices() (where count drops to 1), causing
Increment to miss the stop-loop Sleep watcher and leave the goroutine
blocked on a fake timer that would never fire. Replace both pairs with
advanceTime(), which atomically waits for the required watcher count
before incrementing.
--- jobsupervisor/monit_job_supervisor_test.go | 7 ++----- jobsupervisor/windows_job_supervisor_test.go | 10 ++++++---- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/jobsupervisor/monit_job_supervisor_test.go b/jobsupervisor/monit_job_supervisor_test.go index 8ab24dae3..461a1b323 100644 --- a/jobsupervisor/monit_job_supervisor_test.go +++ b/jobsupervisor/monit_job_supervisor_test.go @@ -378,18 +378,15 @@ var _ = Describe("monitJobSupervisor", func() { errchan <- monit.StopAndWait() }() - Eventually(timeService.WatcherCount).Should(Equal(2)) // we hit the pending sleep + advanceTime(timeService, 3*time.Minute, 2) // wait for pending sleep, then advance client.StatusStatus = fakemonit.FakeMonitStatus{ Services: []boshmonit.Service{ {Monitored: true, Name: "foo", Status: "unknown", Pending: false}, }, } - timeService.Increment(3 * time.Minute) - Eventually(timeService.WatcherCount).Should(Equal(2)) // we hit the stop sleep - - timeService.Increment(3 * time.Minute) + advanceTime(timeService, 3*time.Minute, 2) // wait for stop sleep, then advance Eventually(errchan).Should(Receive(Equal(errors.New("Timed out waiting for services 'foo' to stop after 5 minutes")))) }) diff --git a/jobsupervisor/windows_job_supervisor_test.go b/jobsupervisor/windows_job_supervisor_test.go index 9a6f59159..f6777fae6 100755 --- a/jobsupervisor/windows_job_supervisor_test.go +++ b/jobsupervisor/windows_job_supervisor_test.go @@ -64,8 +64,7 @@ var ( ) var _ = AfterSuite(func() { - err := os.RemoveAll(TempDir) - Expect(err).NotTo(HaveOccurred()) + Eventually(func() error { return os.RemoveAll(TempDir) }, 2*time.Minute, time.Second).Should(Succeed()) gexec.CleanupBuildArtifacts() match := func(s string) bool { @@ -486,8 +485,11 @@ var _ = Describe("WindowsJobSupervisor", func() { AfterEach(func() { Expect(jobSupervisor.Stop()).To(Succeed()) Expect(jobSupervisor.RemoveAllJobs()).To(Succeed()) - Eventually(func() error { return fs.RemoveAll(jobDir) }, 
60*time.Second).Should(Succeed()) - Expect(fs.RemoveAll(logDir)).To(Succeed()) + // Windows releases file handles on the service wrapper exe and log + // files asynchronously after the SCM reports services as deleted. + // Use a long timeout with a slow poll to avoid spinning on busy CI. + Eventually(func() error { return fs.RemoveAll(jobDir) }, 2*time.Minute, time.Second).Should(Succeed()) + Eventually(func() error { return fs.RemoveAll(logDir) }, 2*time.Minute, time.Second).Should(Succeed()) }) Describe("AddJob", func() {