# strace -fyp "$(pidof initdb)"
strace: Process 1110494 attached
writev(4<pipe:[1501970]>, [{iov_base="COMMENT ON FUNCTION iso8859_1_to"..., iov_len=138}, {iov_base=NULL, iov_len=0}], 2
# strace -fyp "$(pidof postgres)"
strace: Process 1110552 attached
futex(0x7fffecd311b8, FUTEX_WAIT, 4294967295, NULL
# ps aux | grep Setup
nixbld16 1107329 0.0 0.1 1073757344 30608 ? Sl 08:23 0:00 ./Setup test
# strace -fyp "$(pidof 'Setup')"
strace: Process 1107329 attached with 5 threads
[pid 1107334] restart_syscall(<... resuming interrupted read ...> <unfinished ...>
[pid 1107344] futex(0x7ffff7e0793c, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 1107329] futex(0x7fffffff9dbc, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 1107331] futex(0x7ffff7e98a0c, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 1107333] futex(0x7ffff7e98a0c, FUTEX_WAKE_PRIVATE, 1 <unfinished ...>
[pid 1107331] <... futex resumed>) = -1 EAGAIN (Resource temporarily unavailable)
[pid 1107333] <... futex resumed>) = 0
[pid 1107333] epoll_pwait(4<anon_inode:[eventpoll]>, <unfinished ...>
[pid 1107331] read(3<anon_inode:[timerfd]>, <unfinished ...>
[pid 1107333] <... epoll_pwait resumed>[], 64, 0, NULL, 8) = 0
[pid 1107331] <... read resumed>"\256\1\0\0\0\0\0\0", 8) = 8
[pid 1107333] epoll_pwait(4<anon_inode:[eventpoll]>, <unfinished ...>
[pid 1107331] read(3<anon_inode:[timerfd]>, <unfinished ...>
[pid 1107333] <... epoll_pwait resumed>[], 64, 0, NULL, 8) = 0
[pid 1107333] epoll_pwait(4<anon_inode:[eventpoll]>, <unfinished ...>
[pid 1107331] <... read resumed>"\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, "\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] write(10<anon_inode:[eventfd]>, "\377\0\0\0\0\0\0\0", 8) = 8
[pid 1107334] <... restart_syscall resumed>) = 1
[pid 1107331] read(3<anon_inode:[timerfd]>, <unfinished ...>
[pid 1107334] read(10<anon_inode:[eventfd]>, "\377\0\0\0\0\0\0\0", 8) = 8
[pid 1107334] futex(0x7ffff7e0793c, FUTEX_WAKE_PRIVATE, 1) = 1
[pid 1107344] <... futex resumed>) = 0
[pid 1107334] poll([{fd=8<pipe:[1492976]>, events=POLLIN}, {fd=10<anon_inode:[eventfd]>, events=POLLIN}], 2, -1 <unfinished ...>
[pid 1107344] rt_sigprocmask(SIG_BLOCK, [INT], [], 8) = 0
[pid 1107331] <... read resumed>"\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] read(3<anon_inode:[timerfd]>, <unfinished ...>
[pid 1107344] rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
[pid 1107344] futex(0x7ffff7e0793c, FUTEX_WAIT_PRIVATE, 2, NULL <unfinished ...>
[pid 1107331] <... read resumed>"\1\0\0\0\0\0\0\0", 8) = 8
[pid 1107331] futex(0x7ffff7e98a0c, FUTEX_WAIT_PRIVATE, 2, NULL
Hi, for static-haskell-nix I run a lot of packages in its CI.
For
tmp-postgres, that CI seems to have discovered a way to get the test suite stuck indefinitely after a postgres crash (unclear of what nature that crash exactly is,coredumpctlsuggests signals3and6were involved).The failure doesn't seem to be deterministic, because sometimes the tests ran through fine.
With the test hanging for hours, this is the process tree as shown in
htop:This is how I observe the postgres crash/shutdown:
Unfortunately I didn't have core dump files enabled so I can't
gdbinto the postgres process to get more info.To provide additional data, here are
straceinvocations of the 3 involved processes, in case that helps:Details
The CI run in which this occurred is here.
If I were to make a super rough guess at what's happening, I'd say that there's a code path that allows some postgres process to die (for whatever reason), but that information isn't propagated up (or blocked on some IO pipe) to shut down the other processes and display an error message.
This is really just a drive-by issue report, as I'm not currently a user of
tmp-postgresnor have a good understanding of its working.But maybe it can be useful for you to catch the odd async-exception-not-handled or other race condition, thus improving the package.
Cheers!