diff --git a/.github/workflows/pg-ci.yml b/.github/workflows/pg-ci.yml
index 8560e9389f66f..fb8bf2848b87c 100644
--- a/.github/workflows/pg-ci.yml
+++ b/.github/workflows/pg-ci.yml
@@ -453,10 +453,17 @@ jobs:
- name: Configure
shell: *su_postgres_shell
+ # --enable-wait-event-timing is added on this one task so the
+ # wait-event-timing build path -- including the expected output at
+ # src/test/regress/expected/wait_event_timing.out -- is exercised by
+ # CI on every push. The other tasks build without the flag, so the
+ # stub path and its alternate output wait_event_timing_1.out stay
+ # covered as well.
run: |
./configure \
--enable-cassert --enable-injection-points --enable-debug \
--enable-tap-tests --enable-nls \
+ --enable-wait-event-timing \
--with-segsize-blocks=6 \
--with-libnuma \
--with-liburing \
diff --git a/configure b/configure
index 5f77f3cac29f3..2c488e10f31d0 100755
--- a/configure
+++ b/configure
@@ -850,6 +850,7 @@ enable_debug
enable_profiling
enable_coverage
enable_dtrace
+enable_wait_event_timing
enable_tap_tests
enable_injection_points
with_blocksize
@@ -1551,6 +1552,8 @@ Optional Features:
--enable-profiling build with profiling enabled
--enable-coverage build with coverage testing instrumentation
--enable-dtrace build with DTrace support
+ --enable-wait-event-timing
+ build with wait event timing instrumentation
--enable-tap-tests enable TAP tests (requires Perl and IPC::Run)
--enable-injection-points
enable injection points (for testing)
@@ -3633,6 +3636,35 @@ fi
+#
+# --enable-wait-event-timing adds wait event timing instrumentation
+#
+
+
+# Check whether --enable-wait-event-timing was given.
+if test "${enable_wait_event_timing+set}" = set; then :
+ enableval=$enable_wait_event_timing;
+ case $enableval in
+ yes)
+
+$as_echo "#define USE_WAIT_EVENT_TIMING 1" >>confdefs.h
+
+ ;;
+ no)
+ :
+ ;;
+ *)
+ as_fn_error $? "no argument expected for --enable-wait-event-timing option" "$LINENO" 5
+ ;;
+ esac
+
+else
+ enable_wait_event_timing=no
+
+fi
+
+
+
#
# TAP tests
#
diff --git a/configure.ac b/configure.ac
index 61cee42daa721..f3f1d64eb777a 100644
--- a/configure.ac
+++ b/configure.ac
@@ -225,6 +225,14 @@ fi
AC_SUBST(DTRACEFLAGS)])
AC_SUBST(enable_dtrace)
+#
+# --enable-wait-event-timing adds wait event timing instrumentation
+#
+PGAC_ARG_BOOL(enable, wait-event-timing, no,
+ [build with wait event timing instrumentation],
+ [AC_DEFINE([USE_WAIT_EVENT_TIMING], 1,
+ [Define to 1 to build with wait event timing. (--enable-wait-event-timing)])])
+
#
# TAP tests
#
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index fa566c9e5535a..f45fa020b5635 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9110,6 +9110,112 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
+
+ wait_event_capture (enum)
+
+ wait_event_capture configuration parameter
+
+
+
+
+ Controls collection of wait event timing instrumentation. Requires
+ the server to be compiled with
+ . Possible values are
+ off (the default), stats, and
+ trace; each level is a strict superset of the
+ previous one.
+
+
+ At stats, the server records per-backend wait
+ event statistics — counts, total and average durations, and a
+ log2 duration histogram — visible in the
+
+ pg_stat_wait_event_timing view.
+ Two clock_gettime()-grade timestamps are taken
+ around every wait event transition, costing roughly
+ 40–100 ns each on modern hardware; while
+ wait_event_capture is off the
+ hot path adds only a single predictable branch.
+
+
+ At trace, the server additionally records every
+ individual wait event, together with query-attribution markers,
+ into a per-session ring buffer exposed via the
+
+ pg_backend_wait_event_trace view.
+ The ring is allocated lazily from dynamic shared memory on first
+ use (default 4 MB per backend; see
+ ), so only
+ sessions that enable trace pay the per-ring memory cost. Query
+ attribution — matching each wait to the query that incurred
+ it — requires both and
+ ; a warning is logged if
+ either is missing when trace is enabled.
+
+
+ Only superusers and users with the appropriate SET
+ privilege can change this setting. Read access to the resulting
+ statistics is controlled separately by membership in the
+ pg_read_all_stats
+ role (which the pg_monitor role inherits). To
+ delegate the ability to change the setting to a monitoring role,
+ use GRANT SET ON PARAMETER:
+
+GRANT SET ON PARAMETER wait_event_capture TO pg_monitor;
+
+
+
+
+
+
+ wait_event_timing_max_tranches (integer)
+
+ wait_event_timing_max_tranches configuration parameter
+
+
+
+
+ Sets the maximum number of distinct LWLock tranches whose timing is
+ recorded individually per backend. PostgreSQL maintains a
+ per-backend hash table mapping each tranche the backend encounters
+ to its histogram; once the table fills, further tranches are
+ counted against lwlock_overflow_count in
+
+ pg_stat_wait_event_timing_overflow
+ and not individually timed. Sized at server start; this parameter
+ has no effect on builds compiled without
+ . The default is
+ 192; raise it if your installation loads many
+ extensions that register their own LWLock tranches and you observe
+ a non-zero lwlock_overflow_count. This
+ parameter can only be set at server start.
+
+
+
+
+
+ wait_event_trace_ring_size (integer)
+
+ wait_event_trace_ring_size configuration parameter
+
+
+
+
+ Sets the size of the per-backend ring buffer used at the
+ trace level of
+ . Each backend that enables
+ trace allocates a ring of this size from a cluster-wide dynamic
+ shared memory area. The value must be a power of two; larger rings
+ retain a longer history of wait events before wrapping. If this
+ parameter is specified without units, it is taken as kilobytes. The
+ default is 4096 (4 MB). This parameter has
+ no effect on builds compiled without
+ , and can only be set at
+ server start.
+
+
+
+
track_functions (enum)
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 08d5b8245529f..8d1307bedbb8c 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -551,6 +551,33 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
+
+ pg_stat_wait_event_timingpg_stat_wait_event_timing
+ One row per backend per wait event, showing accumulated timing
+ statistics. See
+
+ pg_stat_wait_event_timing for details.
+
+
+
+
+ pg_stat_wait_event_timing_overflowpg_stat_wait_event_timing_overflow
+ Per-backend truncation and reset counters for the wait-event
+ timing subsystem. See
+
+ pg_stat_wait_event_timing_overflow for details.
+
+
+
+
+ pg_backend_wait_event_tracepg_backend_wait_event_trace
+ Individual wait event records from the current backend's trace
+ ring buffer. See
+
+ pg_backend_wait_event_trace for details.
+
+
+
@@ -3699,6 +3726,605 @@ description | Waiting for a newly initialized WAL file to reach durable storage
+
+ pg_stat_wait_event_timing
+
+
+ pg_stat_wait_event_timing
+
+
+
+ The pg_stat_wait_event_timing view contains one
+ row for each combination of backend and wait event that has a non-zero
+ call count. It shows accumulated timing statistics collected when
+ is set to stats
+ or trace. Requires the server to be compiled with
+ .
+
+
+
+ pg_stat_wait_event_timing View
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+
+ pidinteger
+
+
+ Process ID of the backend
+
+
+
+
+
+ backend_typetext
+
+
+ Type of the backend (e.g. client backend,
+ checkpointer, walwriter)
+
+
+
+
+
+ procnumberinteger
+
+
+ Internal slot number (0-based process number) of the backend.
+
+
+
+
+
+ wait_event_typetext
+
+
+ Wait event type (e.g. IO, LWLock,
+ Timeout)
+
+
+
+
+
+ wait_eventtext
+
+
+ Wait event name (e.g. DataFileRead,
+ WALWrite, PgSleep)
+
+
+
+
+
+ callsbigint
+
+
+ Number of times this wait event occurred
+
+
+
+
+
+ total_time_msdouble precision
+
+
+ Total time spent in this wait event, in milliseconds
+
+
+
+
+
+ avg_time_usdouble precision
+
+
+ Average wait duration, in microseconds
+
+
+
+
+
+ max_time_usdouble precision
+
+
+ Maximum single wait duration, in microseconds
+
+
+
+
+
+ histogrambigint[]
+
+
+ Log2 histogram of wait durations with 32 buckets. Bin edges are
+ powers of two on the nanosecond axis: bucket 0 covers
+ [0, 1024) ns, bucket k covers
+ [2^(k+9),
+ 2^(k+10)) ns, and the last bucket covers
+ [2^40, ∞) ns. The boundaries approximate the
+ decimal-microsecond grid (1024 ns ≈ 1 μs, 2048 ns ≈
+ 2 μs, ...); the exact edges are chosen to let the hot path skip
+ a division by 1000. The
+
+ pg_wait_event_timing_histogram_buckets
+ view provides the numeric bin edges and human-readable labels for
+ each index; the canonical join pattern is:
+
+SELECT w.wait_event, b.label, h.count
+FROM pg_stat_wait_event_timing w,
+ LATERAL unnest(w.histogram) WITH ORDINALITY AS h(count, idx)
+JOIN pg_wait_event_timing_histogram_buckets b ON b.bucket_idx = h.idx - 1
+WHERE w.wait_event = 'PgSleep'
+ORDER BY b.bucket_idx;
+
+
+
+
+
+
+
+
+
+
+ pg_wait_event_timing_histogram_buckets
+
+
+ pg_wait_event_timing_histogram_buckets
+
+
+
+ The pg_wait_event_timing_histogram_buckets
+ view describes the 32 bins used by the
+ histogram column of
+
+ pg_stat_wait_event_timing. It always
+ contains 32 rows in ascending order of
+ bucket_idx, and is independent of runtime
+ state; a join against it attaches numeric bin edges and human
+ labels to any histogram array. Bins are powers of two on the
+ nanosecond axis: bin 0 covers [0, 1us), each
+ subsequent bin doubles its lower edge, and the final bin
+ (bucket_idx = 31) is open-ended at
+ approximately 1024 seconds.
+
+
+
+ The 32-bin layout (rather than the more common 16-bin choice for
+ log-scale histograms) is deliberate: real-world wait-event
+ distributions have long tails routinely extending past 16 ms
+ into multi-second territory (slow-disk
+ DataFileRead, lock contention waits, replication
+ apply waits, vacuum waits). A 16-bin histogram would collapse all
+ of those into a single overflow bin, hiding the very signal that
+ wait-event timing exists to surface. The 32-bin layout keeps the
+ long tail individually addressable up to about 17 minutes
+ before the open-ended bin; single waits beyond that belong in
+ auto_explain
+ or pg_stat_activity, not a histogram.
+
+
+
+ pg_wait_event_timing_histogram_buckets View
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+
+ bucket_idxinteger
+
+
+ Zero-based bin index (0–31). Matches the offset into the
+ histogram array of
+ pg_stat_wait_event_timing.
+
+
+
+
+
+ lower_nsbigint
+
+
+ Inclusive lower edge of this bin in nanoseconds.
+
+
+
+
+
+ upper_nsbigint
+
+
+ Exclusive upper edge of this bin in nanoseconds, or
+ NULL for the final bin which extends to
+ infinity.
+
+
+
+
+
+ labeltext
+
+
+ Short human-readable label for the bin (e.g.
+ <1us, 1-2us,
+ >=1024s), expressed on the approximate
+ decimal-microsecond grid the bin edges are aligned to.
+
+
+
+
+
+
+
+
+ pg_stat_wait_event_timing_overflow
+
+
+ pg_stat_wait_event_timing_overflow
+
+
+
+ The pg_stat_wait_event_timing_overflow view
+ exposes per-backend truncation counters for the wait-event timing
+ subsystem. Each backend owns a bounded LWLock timing hash
+ ( tranches) and a bounded flat event array; events that cannot
+ be mapped to a slot are counted here. A non-zero value means the
+ corresponding row(s) in
+
+ pg_stat_wait_event_timing
+ are incomplete for that backend. Requires the server to be
+ compiled with .
+
+
+
+ pg_stat_wait_event_timing_overflow View
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+
+ pidinteger
+
+
+ Process ID of the backend
+
+
+
+
+
+ backend_typetext
+
+
+ Type of the backend (e.g. client backend,
+ checkpointer, walwriter)
+
+
+
+
+
+ procnumberinteger
+
+
+ Internal slot number (0-based process number) of the backend.
+
+
+
+
+
+ lwlock_overflow_countbigint
+
+
+ Number of LWLock wait events dropped because the per-backend
+ LWLock timing hash was already full (more distinct tranches
+ observed in this session than
+ allows).
+ Zero means no LWLock truncation. A one-time
+ WARNING is also emitted to the server log on
+ first overflow. If you see this counter rising, raise
+ wait_event_timing_max_tranches at server
+ start (the per-backend memory cost is proportional and
+ described under that GUC).
+
+
+
+
+
+ flat_overflow_countbigint
+
+
+ Number of non-LWLock wait events dropped because the event
+ could not be mapped to a known class / index. This almost
+ always indicates a code path emitting a wait event of a class
+ the timing infrastructure was not compiled for; it should be
+ zero in supported builds.
+
+
+
+
+
+ reset_countbigint
+
+
+ Number of resets this backend has observed and acted
+ on; not a request counter. Own-backend resets via
+ pg_stat_reset_wait_event_timing(NULL) (or
+ passing the caller's own PID) are synchronous and bump this
+ column once per call. Cross-backend reset requests
+ coalesce: if several
+ pg_stat_reset_wait_event_timing(pid)
+ calls land between two of the target's wait events, the target
+ observes them as a single reset and increments
+ reset_count only once. Callers
+ polling for asynchronous-reset acknowledgment should watch for
+ any N → N+1 transition.
+
+
+
+
+
+
+
+
+ pg_backend_wait_event_trace
+
+
+ pg_backend_wait_event_trace
+
+
+
+ The pg_backend_wait_event_trace view shows
+ individual wait event records from the current backend's
+ trace ring buffer. Each record captures either a single wait event
+ (with timestamp and duration) or a query-attribution marker. Two
+ marker families exist: ExecStart/ExecEnd
+ bracket every executor invocation, and
+ QueryStart/QueryEnd mark
+ top-level query-id transitions and the transition to idle. See
+ for the gating rules of
+ each marker family.
+ Requires to be set to
+ trace. The ring buffer is sized by
+ (default 4 MB =
+ 131072 records of 32 bytes each); older records are overwritten in
+ FIFO order. The view is session-local
+ and analogous in scope to
+
+ pg_backend_memory_contexts; querying it
+ from a superuser session still returns only that session's own
+ records, never another backend's.
+
+
+
+ The pg_backend_wait_event_trace view is
+ intended for session-local interactive diagnostics:
+ running ad-hoc SELECT queries against your own
+ session's trace from psql while
+ investigating wait-event behaviour. The view materialises up to
+ one ring's worth of records (default ~4 MB, controlled by
+ ) into a
+ tuplestore on each call, which is bounded and acceptable for that
+ use; for narrow result sets, append
+ ORDER BY seq DESC LIMIT N
+ to get the most recent records.
+
+
+
+ Cross-backend monitoring tools — extensions and background
+ workers that read wait events losslessly from every backend's
+ ring — should not consume through this
+ view. The in-tree cross-backend reader is
+ pg_get_wait_event_trace
+ (see ); the underlying
+ per-session SQL function returns only the calling backend's own
+ ring, so a background worker invoking
+ SELECT * FROM pg_backend_wait_event_trace via
+ SPI would receive only its own (typically empty) ring, not the
+ target backend's data. External tools that need cross-backend
+ access without going through SQL use the shared-memory snapshot
+ pattern documented on
+ WaitEventTraceControl in
+ src/include/utils/wait_event_timing.h:
+ snapshot trace_slots[procNumber].generation,
+ acquire WaitEventTraceCtl->lock in
+ LW_SHARED, resolve the target slot's
+ ring_ptr via
+ dsa_get_address, snapshot the relevant slice
+ of the ring into local memory, release the lock, re-snapshot
+ generation and discard the read if it
+ changed, then process the snapshot off the lock. That bypasses
+ this view entirely and is the supported cross-backend interface
+ for monitoring extensions.
+
+
+
+ Slot lifecycle. Per-backend trace rings are
+ not freed when their owner backend exits. The ring stays
+ allocated in shared memory in an orphaned state
+ so the dying backend's final waits remain readable by the
+ cross-backend interface —
+ pg_get_wait_event_trace
+ (see ) for in-tree
+ access, or external background workers that follow the
+ snapshot pattern documented above.
+ This does not change the behaviour of this view,
+ which always reads the calling backend's own ring and is
+ unaffected by orphan-state slots belonging to other
+ procnumbers. The lifecycle change matters for short-lived
+ backends that exit before any monitoring tool has read their
+ data: parallel workers in particular exit in milliseconds at
+ end-of-parallel-query, well below typical reader polling
+ intervals, and without orphan-persistence their final waits
+ would be lost. Orphaned rings are reclaimed automatically when a new
+ backend takes over the same procNumber
+ slot, and the DBA can force a sweep at any time via
+ pg_stat_clear_orphaned_wait_event_rings.
+ The worst-case orphan-memory footprint is bounded by the slot
+ count times ~4 MB; see
+ pg_stat_clear_orphaned_wait_event_rings
+ under for details and
+ the deployment patterns where the function is most useful.
+
+
+
+ The ring buffer is designed as a lock-free transport mechanism for
+ external consumption. At high wait event rates (e.g., 220K events/sec),
+ the ring wraps in roughly 0.5–1 seconds. External consumers
+ (background workers, extensions) can attribute events to queries by
+ scanning for ExecStart markers (or, when the
+ executor markers are unavailable, QueryStart); if
+ both have been overwritten, events before the next visible marker are
+ unattributed. Consumers should poll the ring buffer before it wraps
+ and can use the query_id column of
+
+ pg_stat_activity as a fallback for the
+ current query context.
+
+
+
+ The seq column is the absolute write
+ position of each record; it is monotonically increasing and never
+ resets while the ring is alive. A consumer polling the ring
+ repeatedly can detect wraparound losses by tracking
+ max(seq) between successive scrapes: given two
+ consecutive polls returning N2 rows with
+ maximum seq values
+ S1 (previous poll) and
+ S2 (current poll), the number of records
+ overwritten before the second poll could read them is
+ max(0, (S2 - S1) - N2). No separate
+ trace overflow counter is exposed because this
+ information is exact and derivable from seq
+ alone.
+
+
+
+ QueryStart/QueryEnd markers are
+ emitted as matched pairs around each protocol phase that touches a
+ query_id. In simple protocol that is one
+ pair per statement. In extended protocol there is one pair around
+ each of Parse, Bind, and
+ Execute for the same
+ query_id — so a single parameterized
+ statement produces three nested pairs, plus the surrounding
+ ExecStart/ExecEnd pair from the
+ executor. This per-phase pairing lets consumers measure how much
+ time a query spent in each protocol phase (parse vs. bind vs.
+ execute) by computing the duration between each pair, and lets a
+ total-time-per-query rollup be expressed as the sum of pair
+ durations rather than a single subtraction. Consumers that just want
+ "how long did this query take in the executor" should use the
+ ExecStart/ExecEnd pair, which
+ brackets each executor invocation regardless of protocol. (Statements
+ that invoke the executor recursively — for example via
+ PL/pgSQL or other server-side code —
+ produce one pair per invocation.)
+
+
+
+ pg_backend_wait_event_trace View
+
+
+
+
+ Column Type
+
+
+ Description
+
+
+
+
+
+
+
+ seqbigint
+
+
+ Sequence number of this record in the ring buffer
+
+
+
+
+
+ timestamp_nsbigint
+
+
+ Monotonic clock timestamp in nanoseconds
+
+
+
+
+
+ wait_event_typetext
+
+
+ Wait event type, or Query for query markers
+
+
+
+
+
+ wait_eventtext
+
+
+ Wait event name, or one of ExecStart,
+ ExecEnd, QueryStart,
+ QueryEnd for query-attribution markers.
+
+
+
+
+
+ duration_usdouble precision
+
+
+ Wait duration in microseconds (0 for query markers)
+
+
+
+
+
+ query_idbigint
+
+
+ Query identifier for query markers (0 for wait events)
+
+
+
+
+
+
+
+
pg_stat_database
@@ -5736,6 +6362,152 @@ description | Waiting for a newly initialized WAL file to reach durable storage
+
+
+
+ pg_stat_reset_wait_event_timing
+
+ pg_stat_reset_wait_event_timing ( pidintegerDEFAULTNULL )
+ void
+
+
+ Resets wait event timing counters for a single backend, identified
+ by its process ID (see pid in
+
+ pg_stat_activity).
+ Passing NULL (or the caller's own
+ pg_backend_pid()) resets the current session;
+ any user may do this. Passing any other PID resets that backend
+ and requires membership in the
+ pg_signal_backend
+ role — the same role required by
+ pg_stat_reset_backend_stats,
+ pg_terminate_backend, and
+ pg_cancel_backend. Unknown or
+ already-exited PIDs are silent no-ops, matching the behavior of
+ pg_stat_reset_backend_stats.
+
+
+
+
+
+
+ pg_stat_reset_wait_event_timing_all
+
+ pg_stat_reset_wait_event_timing_all ()
+ void
+
+
+ Resets wait event timing counters for every backend in the
+ cluster. Requires superuser. This is intentionally stricter
+ than the per-backend variant
+ pg_stat_reset_wait_event_timing(pid),
+ which only requires pg_signal_backend: the
+ cluster-wide form has unbounded blast radius (it affects every
+ backend in a single call) and would erase forensic patterns
+ that span multiple backends, so it is gated to the cluster
+ owner. Returns before the resets have been observed by their
+ target backends; callers that need strict read-after-reset
+ semantics should poll each target's
+ reset_count column.
+
+
+
+
+
+
+ pg_get_wait_event_trace
+
+ pg_get_wait_event_trace ( procnumberinteger )
+ setof record
+
+
+ Returns individual wait event records from the trace ring of
+ the backend that currently or previously occupied the slot
+ identified by procnumber. Reads slots
+ in OWNED state (live writer) and
+ ORPHANED state (writer has exited but the
+ ring is preserved for post-mortem reading) uniformly. An
+ empty result indicates the slot is in FREE
+ state (no ring) or no records have been written. Concurrent
+ slot transitions cannot interrupt the read because the
+ function holds the cross-backend trace lock in
+ SHARED mode throughout the iteration; the
+ per-record seqlock protocol skips any record being written
+ by a concurrent live writer.
+
+
+ This is the canonical cross-backend reader. External
+ monitoring extensions that need cross-backend access without
+ going through SQL should follow the same snapshot pattern
+ documented on WaitEventTraceControl
+ in src/include/utils/wait_event_timing.h;
+ this function serves as both the reference implementation and
+ a DBA-facing diagnostic tool. The
+ procnumber argument can be obtained
+ from the procnumber column of
+ pg_stat_get_wait_event_timing or
+ pg_stat_get_wait_event_timing_overflow
+ for live backends. For post-mortem reads of short-lived
+ backends (parallel workers, autovacuum, walsender) the
+ procnumber must be captured while the
+ backend is still alive, or discovered by iterating slots in a
+ monitoring background worker. A pid-keyed lookup for live
+ backends only is one query away:
+
+
+SELECT * FROM pg_get_wait_event_trace(
+ (SELECT procnumber FROM pg_stat_get_wait_event_timing(target_pid)
+ WHERE pid = target_pid LIMIT 1));
+
+
+
+ Requires membership in pg_read_all_stats
+ (matching the privilege model of the session-local view
+ pg_backend_wait_event_trace).
+
+
+
+
+
+
+ pg_stat_clear_orphaned_wait_event_rings
+
+ pg_stat_clear_orphaned_wait_event_rings ()
+ bigint
+
+
+ Frees every wait-event-trace ring whose owner backend has
+ exited. Returns the number of rings released. Requires
+ superuser.
+
+
+ When a backend that had wait_event_capture =
+ trace exits, its ~4 MB trace ring is
+ intentionally not freed at exit so that
+ cross-backend consumers
+ (pg_get_wait_event_trace and extensions
+ following the snapshot pattern) can still read the dying
+ backend's final waits. The
+ memory is reclaimed lazily: in the common case, the ring is
+ freed automatically when a new backend takes over the same
+ procNumber slot. This function is the
+ explicit DBA-driven sweep for the pathological case where
+ capture was briefly enabled, then disabled, on a cluster with
+ long-lived pooled connections that never recycle the
+ procNumber. The maximum amount of memory
+ this can release is bounded by the slot count times the
+ per-ring size (~400 MB at max_connections
+ = 100, ~4 GB at 1000); on most deployments the function will
+ report 0 because connection churn already drained orphans
+ naturally.
+
+
+ Safe to call when capture is currently off
+ and even when no orphans exist (returns 0 in both cases).
+
+
+
diff --git a/meson.build b/meson.build
index 568e0e150bfa8..1a31fe3faf2ab 100644
--- a/meson.build
+++ b/meson.build
@@ -505,6 +505,7 @@ meson_bin = find_program(meson_binpath, native: true)
cdata.set('USE_ASSERT_CHECKING', get_option('cassert') ? 1 : false)
cdata.set('USE_INJECTION_POINTS', get_option('injection_points') ? 1 : false)
+cdata.set('USE_WAIT_EVENT_TIMING', get_option('wait_event_timing') ? 1 : false)
blocksize = get_option('blocksize').to_int() * 1024
diff --git a/meson_options.txt b/meson_options.txt
index 6a793f3e47943..1f191d3a9d621 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -40,6 +40,9 @@ option('pgport', type: 'integer', value: 5432,
option('cassert', type: 'boolean', value: false,
description: 'Enable assertion checks (for debugging)')
+option('wait_event_timing', type: 'boolean', value: false,
+ description: 'Enable wait event timing instrumentation')
+
option('tap_tests', type: 'feature', value: 'auto',
description: 'Enable TAP tests')
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 8f129baec9061..4bd9c1e8a86ed 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1557,3 +1557,106 @@ CREATE VIEW pg_aios AS
SELECT * FROM pg_get_aios();
REVOKE ALL ON pg_aios FROM PUBLIC;
GRANT SELECT ON pg_aios TO pg_read_all_stats;
+
+-- Taxonomy for the histogram column on pg_stat_wait_event_timing. The
+-- histogram array has one entry per bucket, in ascending order. This
+-- view names them so callers do not have to memorise the layout; join
+-- against it via unnest(histogram) WITH ORDINALITY.
+--
+-- WARNING: keep this list in lock-step with WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS
+-- and wait_event_timing_bucket() in src/backend/utils/activity/wait_event_timing.c.
+-- Bin edges are powers of two in nanoseconds; labels are the approximate
+-- decimal-microsecond grid documented in src/include/utils/wait_event_timing.h.
+CREATE VIEW pg_wait_event_timing_histogram_buckets AS
+ SELECT bucket_idx, lower_ns, upper_ns, label
+ FROM (VALUES
+ ( 0, 0::bigint, 1024::bigint, '<1us'::text),
+ ( 1, 1024::bigint, 2048::bigint, '1-2us'),
+ ( 2, 2048::bigint, 4096::bigint, '2-4us'),
+ ( 3, 4096::bigint, 8192::bigint, '4-8us'),
+ ( 4, 8192::bigint, 16384::bigint, '8-16us'),
+ ( 5, 16384::bigint, 32768::bigint, '16-32us'),
+ ( 6, 32768::bigint, 65536::bigint, '32-64us'),
+ ( 7, 65536::bigint, 131072::bigint, '64-128us'),
+ ( 8, 131072::bigint, 262144::bigint, '128-256us'),
+ ( 9, 262144::bigint, 524288::bigint, '256-512us'),
+ (10, 524288::bigint, 1048576::bigint, '512us-1ms'),
+ (11, 1048576::bigint, 2097152::bigint, '1-2ms'),
+ (12, 2097152::bigint, 4194304::bigint, '2-4ms'),
+ (13, 4194304::bigint, 8388608::bigint, '4-8ms'),
+ (14, 8388608::bigint, 16777216::bigint, '8-16ms'),
+ (15, 16777216::bigint, 33554432::bigint, '16-32ms'),
+ (16, 33554432::bigint, 67108864::bigint, '32-64ms'),
+ (17, 67108864::bigint, 134217728::bigint, '64-128ms'),
+ (18, 134217728::bigint, 268435456::bigint, '128-256ms'),
+ (19, 268435456::bigint, 536870912::bigint, '256-512ms'),
+ (20, 536870912::bigint, 1073741824::bigint, '512ms-1s'),
+ (21, 1073741824::bigint, 2147483648::bigint, '1-2s'),
+ (22, 2147483648::bigint, 4294967296::bigint, '2-4s'),
+ (23, 4294967296::bigint, 8589934592::bigint, '4-8s'),
+ (24, 8589934592::bigint, 17179869184::bigint, '8-16s'),
+ (25, 17179869184::bigint, 34359738368::bigint, '16-32s'),
+ (26, 34359738368::bigint, 68719476736::bigint, '32-64s'),
+ (27, 68719476736::bigint, 137438953472::bigint, '64-128s'),
+ (28, 137438953472::bigint, 274877906944::bigint, '128-256s'),
+ (29, 274877906944::bigint, 549755813888::bigint, '256-512s'),
+ (30, 549755813888::bigint, 1099511627776::bigint, '512s-1024s'),
+ (31, 1099511627776::bigint, NULL::bigint, '>=1024s')
+ ) AS t(bucket_idx, lower_ns, upper_ns, label);
+
+CREATE VIEW pg_stat_wait_event_timing AS
+ SELECT
+ t.pid,
+ t.backend_type,
+ t.procnumber,
+ t.wait_event_type,
+ t.wait_event,
+ t.calls,
+ t.total_time_ms,
+ t.avg_time_us,
+ t.max_time_us,
+ t.histogram
+ FROM pg_stat_get_wait_event_timing(NULL) t;
+REVOKE ALL ON pg_stat_wait_event_timing FROM PUBLIC;
+GRANT SELECT ON pg_stat_wait_event_timing TO pg_read_all_stats;
+
+CREATE VIEW pg_stat_wait_event_timing_overflow AS
+ SELECT
+ t.pid,
+ t.backend_type,
+ t.procnumber,
+ t.lwlock_overflow_count,
+ t.flat_overflow_count,
+ t.reset_count
+ FROM pg_stat_get_wait_event_timing_overflow(NULL) t;
+REVOKE ALL ON pg_stat_wait_event_timing_overflow FROM PUBLIC;
+GRANT SELECT ON pg_stat_wait_event_timing_overflow TO pg_read_all_stats;
+
+-- Per-session wait event trace ring, one record per completed wait
+-- (wait_event_capture = trace).
+-- Reading a session's trace exposes its query_id and wait sequence, which can
+-- leak across SECURITY DEFINER call chains, so the view AND both underlying
+-- SRFs are locked to pg_read_all_stats.
+CREATE VIEW pg_backend_wait_event_trace AS
+ SELECT
+ t.seq,
+ t.timestamp_ns,
+ t.wait_event_type,
+ t.wait_event,
+ t.duration_us,
+ t.query_id
+ FROM pg_get_backend_wait_event_trace() t;
+REVOKE ALL ON pg_backend_wait_event_trace FROM PUBLIC;
+GRANT SELECT ON pg_backend_wait_event_trace TO pg_read_all_stats;
+-- Revoke the session-local SRF itself, not just the view, so a role that can
+-- enable trace cannot read its own ring via the function and bypass the view.
+REVOKE EXECUTE ON FUNCTION pg_get_backend_wait_event_trace() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_backend_wait_event_trace() TO pg_read_all_stats;
+-- Cross-backend reader, keyed by procnumber (reads OWNED and ORPHANED slots).
+REVOKE EXECUTE ON FUNCTION pg_get_wait_event_trace(int4) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_wait_event_trace(int4) TO pg_read_all_stats;
+-- Cluster-scope operations: revoked from PUBLIC (administrators can
+-- delegate with GRANT EXECUTE); not granted to pg_read_all_stats because
+-- they mutate state rather than read it.
+REVOKE EXECUTE ON FUNCTION pg_stat_reset_wait_event_timing_all() FROM PUBLIC;
+REVOKE EXECUTE ON FUNCTION pg_stat_clear_orphaned_wait_event_rings() FROM PUBLIC;
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 4b30f7686801a..e0e64eab7ff5a 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -64,6 +64,7 @@
#include "utils/partcache.h"
#include "utils/rls.h"
#include "utils/snapmgr.h"
+#include "utils/wait_event_timing.h"
/* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */
@@ -133,6 +134,8 @@ ExecutorStart(QueryDesc *queryDesc, int eflags)
*/
pgstat_report_query_id(queryDesc->plannedstmt->queryId, false);
+ wait_event_trace_exec_start(queryDesc->plannedstmt->queryId);
+
if (ExecutorStart_hook)
(*ExecutorStart_hook) (queryDesc, eflags);
else
@@ -476,6 +479,8 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
void
ExecutorEnd(QueryDesc *queryDesc)
{
+ wait_event_trace_exec_end(queryDesc->plannedstmt->queryId);
+
if (ExecutorEnd_hook)
(*ExecutorEnd_hook) (queryDesc);
else
diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c
index ad4bf4bd2a8e9..6575dec4f00c3 100644
--- a/src/backend/postmaster/auxprocess.c
+++ b/src/backend/postmaster/auxprocess.c
@@ -26,6 +26,7 @@
#include "utils/memutils.h"
#include "utils/ps_status.h"
#include "utils/wait_event.h"
+#include "utils/wait_event_timing.h"
static void ShutdownAuxiliaryProcess(int code, Datum arg);
@@ -113,6 +114,16 @@ AuxiliaryProcessMainCommon(void)
*/
CreateAuxProcessResourceOwner();
+#ifdef USE_WAIT_EVENT_TIMING
+
+ /*
+ * Attach trace ring if wait_event_capture = trace was set via
+ * postgresql.conf
+ */
+ if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && my_trace_proc_number >= 0)
+ wait_event_trace_attach(my_trace_proc_number);
+#endif
+
/* Initialize backend status information */
pgstat_beinit();
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 6fa9de33e1ca2..4c2e4e6ec0253 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -57,6 +57,7 @@
#include "utils/timeout.h"
#include "utils/timestamp.h"
#include "utils/wait_event.h"
+#include "utils/wait_event_timing.h"
/* GUC variables */
int DeadlockTimeout = 1000;
@@ -541,6 +542,7 @@ InitProcess(void)
/* now that we have a proc, report wait events to shared memory */
pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+ pgstat_set_wait_event_timing_storage(MyProcNumber);
/*
* We might be reusing a semaphore that belonged to a failed process. So
@@ -713,6 +715,7 @@ InitAuxiliaryProcess(void)
/* now that we have a proc, report wait events to shared memory */
pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+ pgstat_set_wait_event_timing_storage(MyProcNumber);
/* Check that group locking fields are in a proper initial state. */
Assert(MyProc->lockGroupLeader == NULL);
@@ -1039,6 +1042,7 @@ ProcKill(int code, Datum arg)
/* See comment above, close to DisownLatch() */
pgstat_reset_wait_event_storage();
+ pgstat_reset_wait_event_timing_storage();
MyProc = NULL;
MyProcNumber = INVALID_PROC_NUMBER;
@@ -1101,6 +1105,7 @@ AuxiliaryProcKill(int code, Datum arg)
/* look at the equivalent ProcKill() code for comments */
SwitchBackToLocalLatch();
pgstat_reset_wait_event_storage();
+ pgstat_reset_wait_event_timing_storage();
proc = MyProc;
MyProc = NULL;
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index dbef734a93f15..2b716b0c45d87 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -86,6 +86,7 @@
#include "utils/timeout.h"
#include "utils/timestamp.h"
#include "utils/varlena.h"
+#include "utils/wait_event_timing.h"
/* ----------------
* global variables
@@ -1423,6 +1424,17 @@ exec_parse_message(const char *query_string, /* string to execute */
*/
debug_query_string = query_string;
+ /*
+ * In pipelined extended protocol, a Parse can arrive while the previous
+ * query's st_query_id is still set and st_state is still RUNNING (no
+ * Sync->idle between queries). Flush the prior id with force=true so the
+ * QUERY_END marker fires before pgstat_report_activity below zeros
+ * st_query_id. Skip when st_state != RUNNING: coming from idle means
+ * send_ready_for_query already emitted the QUERY_END.
+ */
+ if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE &&
+ MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING)
+ pgstat_report_query_id(0, true);
pgstat_report_activity(STATE_RUNNING, query_string);
set_ps_display("PARSE");
@@ -1692,6 +1704,14 @@ exec_bind_message(StringInfo input_message)
*/
debug_query_string = psrc->query_string;
+ /*
+ * See exec_parse_message: flush the prior query_id's QUERY_END before
+ * pgstat_report_activity zeros it; the state gate avoids a duplicate
+ * QUERY_END right after a Sync->idle transition.
+ */
+ if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE &&
+ MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING)
+ pgstat_report_query_id(0, true);
pgstat_report_activity(STATE_RUNNING, psrc->query_string);
foreach(lc, psrc->query_list)
@@ -2183,6 +2203,14 @@ exec_execute_message(const char *portal_name, long max_rows)
*/
debug_query_string = sourceText;
+ /*
+ * See exec_parse_message: flush the prior query_id's QUERY_END before
+ * pgstat_report_activity zeros it; the state gate avoids a duplicate
+ * QUERY_END right after a Sync->idle transition.
+ */
+ if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE &&
+ MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING)
+ pgstat_report_query_id(0, true);
pgstat_report_activity(STATE_RUNNING, sourceText);
foreach(lc, portal->stmts)
@@ -4654,6 +4682,17 @@ PostgresMain(const char *dbname, const char *username)
*/
if (send_ready_for_query)
{
+ /*
+ * Emit QUERY_END before going idle so idle waits (ClientRead
+ * etc.) are not attributed to the finished query.
+ */
+ {
+ volatile PgBackendStatus *beentry = MyBEEntry;
+
+ if (beentry != NULL && beentry->st_query_id != 0)
+ wait_event_trace_query_end(beentry->st_query_id);
+ }
+
if (IsAbortedTransactionBlockState())
{
set_ps_display("idle in transaction (aborted)");
diff --git a/src/backend/utils/.gitignore b/src/backend/utils/.gitignore
index fa9cfb39693db..5051e36d1f01f 100644
--- a/src/backend/utils/.gitignore
+++ b/src/backend/utils/.gitignore
@@ -7,4 +7,5 @@
/errcodes.h
/pgstat_wait_event.c
/wait_event_funcs_data.c
+/wait_event_timing_data.h
/wait_event_types.h
diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile
index 81b4a956bda3f..5c11d8294f01a 100644
--- a/src/backend/utils/Makefile
+++ b/src/backend/utils/Makefile
@@ -43,7 +43,7 @@ generated-header-symlinks: $(top_builddir)/src/include/utils/header-stamp submak
submake-adt-headers:
$(MAKE) -C adt jsonpath_gram.h
-$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h
+$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h
# fmgr-stamp records the last time we ran Gen_fmgrtab.pl. We don't rely on
# the timestamps of the individual output files, because the Perl script
@@ -60,6 +60,7 @@ guc_tables.inc.c: $(top_srcdir)/src/backend/utils/misc/guc_parameters.dat $(top_
pgstat_wait_event.c: wait_event_types.h
wait_event_funcs_data.c: wait_event_types.h
+wait_event_timing_data.h: wait_event_types.h
wait_event_types.h: $(top_srcdir)/src/backend/utils/activity/wait_event_names.txt $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl
$(PERL) $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl --code $<
@@ -79,8 +80,8 @@ endif
# These generated headers must be symlinked into src/include/.
# We use header-stamp to record that we've done this because the symlinks
# themselves may appear older than fmgr-stamp.
-$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h
- cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h; do \
+$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h
+ cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h; do \
rm -f $$file && $(LN_S) "../../../$(subdir)/$$file" . ; \
done
touch $@
@@ -99,4 +100,4 @@ uninstall-data:
clean:
rm -f probes.h probes.h.tmp
rm -f fmgroids.h fmgrprotos.h fmgrtab.c fmgr-stamp errcodes.h guc_tables.inc.c
- rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c
+ rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h
diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile
index ca3ef89bf5997..60154d8055780 100644
--- a/src/backend/utils/activity/Makefile
+++ b/src/backend/utils/activity/Makefile
@@ -35,10 +35,12 @@ OBJS = \
pgstat_wal.o \
pgstat_xact.o \
wait_event.o \
- wait_event_funcs.o
+ wait_event_funcs.o \
+ wait_event_timing.o
# Force these dependencies to be known even without dependency info built:
wait_event.o: wait_event.c $(top_builddir)/src/backend/utils/pgstat_wait_event.c
wait_event_funcs.o: wait_event_funcs.c $(top_builddir)/src/backend/utils/wait_event_funcs_data.c
+wait_event_timing.o: wait_event_timing.c $(top_builddir)/src/backend/utils/wait_event_timing_data.h
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index d685fc5cd87c0..19a6dd48aef9e 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -24,6 +24,7 @@
#include "utils/ascii.h"
#include "utils/guc.h" /* for application_name */
#include "utils/memutils.h"
+#include "utils/wait_event_timing.h"
/* ----------
@@ -670,6 +671,18 @@ pgstat_report_query_id(int64 query_id, bool force)
if (beentry->st_query_id != INT64CONST(0) && !force)
return;
+ /*
+ * Emit trace markers for query-to-query transitions. QUERY_END fires
+ * here when st_query_id transitions from one non-zero value to another
+ * (multi-statement simple protocol, pipelined extended protocol). The
+ * last-query-to-idle QUERY_END is emitted in PostgresMain() at
+ * send_ready_for_query.
+ */
+ if (beentry->st_query_id != 0 && beentry->st_query_id != query_id)
+ wait_event_trace_query_end(beentry->st_query_id);
+ if (query_id != 0 && query_id != beentry->st_query_id)
+ wait_event_trace_query_start(query_id);
+
/*
* Update my status entry, following the protocol of bumping
* st_changecount before and after. We use a volatile pointer here to
diff --git a/src/backend/utils/activity/generate-wait_event_types.pl b/src/backend/utils/activity/generate-wait_event_types.pl
index d39a30d04783d..f3f1f107a4c04 100644
--- a/src/backend/utils/activity/generate-wait_event_types.pl
+++ b/src/backend/utils/activity/generate-wait_event_types.pl
@@ -5,6 +5,7 @@
# - wait_event_types.h (if --code is passed)
# - pgstat_wait_event.c (if --code is passed)
# - wait_event_funcs_data.c (if --code is passed)
+# - wait_event_timing_data.h (if --code is passed)
# - wait_event_types.sgml (if --docs is passed)
#
# Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
@@ -269,17 +270,195 @@
}
}
+ # -----------------------------------------------------------
+ # Compute wait_event_timing class mapping data.
+ #
+ # The dense class table maps raw classId (0x00..max) to a
+ # dense index, with per-class slot counts rounded up to the
+ # next power of 2 (minimum 16). Extension and InjectionPoint
+ # are fixed at 128 because extensions register custom events.
+ # LWLock uses a hash table (dense = -1).
+ # -----------------------------------------------------------
+
+ # Map section name -> raw classId (from wait_classes.h constants)
+ my %class_to_raw = (
+ 'Lock' => 0x03,
+ 'Buffer' => 0x04,
+ 'Activity' => 0x05,
+ 'Client' => 0x06,
+ 'Extension' => 0x07,
+ 'IPC' => 0x08,
+ 'Timeout' => 0x09,
+ 'IO' => 0x0A,
+ 'InjectionPoint' => 0x0B,
+ );
+
+ # Classes that need fixed large slot counts (dynamically extensible)
+ my %fixed_slot_classes = (
+ 'Extension' => 128,
+ 'InjectionPoint' => 128,
+ );
+
+ # Count events per class from the parsed data.
+ # Build a list of (className, rawId, actualCount) sorted by rawId.
+ my @timing_classes;
+ foreach my $waitclass (keys %hashwe)
+ {
+ my $short = $waitclass;
+ $short =~ s/^WaitEvent//;
+
+ # Skip LWLock -- uses hash table, not flat array
+ next unless exists $class_to_raw{$short};
+
+ my $raw_id = $class_to_raw{$short};
+ my $count = scalar @{ $hashwe{$waitclass} };
+
+ push @timing_classes, {
+ name => $short,
+ raw_id => $raw_id,
+ actual => $count,
+ };
+ }
+
+ # InjectionPoint (0x0B) has no section in wait_event_names.txt
+ # because its events are dynamically registered at runtime.
+ # Add it explicitly with actual=0 and a fixed slot count.
+ if (!grep { $_->{name} eq 'InjectionPoint' } @timing_classes)
+ {
+ push @timing_classes, {
+ name => 'InjectionPoint',
+ raw_id => $class_to_raw{'InjectionPoint'},
+ actual => 0,
+ };
+ }
+
+ # Sort by raw classId
+ @timing_classes = sort { $a->{raw_id} <=> $b->{raw_id} } @timing_classes;
+
+ # Compute slot counts: next power of 2, minimum 16, or fixed
+ foreach my $cls (@timing_classes)
+ {
+ if (exists $fixed_slot_classes{$cls->{name}})
+ {
+ $cls->{slots} = $fixed_slot_classes{$cls->{name}};
+ }
+ else
+ {
+ my $slots = 16; # minimum
+ $slots *= 2 while $slots < $cls->{actual};
+ $cls->{slots} = $slots;
+ }
+ }
+
+ # Compute cumulative offsets
+ my $offset = 0;
+ foreach my $cls (@timing_classes)
+ {
+ $cls->{offset} = $offset;
+ $offset += $cls->{slots};
+ }
+ my $total_events = $offset;
+
+ # Determine max raw classId for array sizing
+ my $max_raw = 0;
+ foreach my $cls (@timing_classes)
+ {
+ $max_raw = $cls->{raw_id} if $cls->{raw_id} > $max_raw;
+ }
+ my $raw_classes = $max_raw + 1;
+ my $dense_classes = scalar @timing_classes;
+
+ # Emit timing defines into wait_event_types.h
+ printf $h "\n/* Wait event timing flat array sizing (generated) */\n";
+ printf $h "#define WAIT_EVENT_TIMING_RAW_CLASSES\t%d\n", $raw_classes;
+ printf $h "#define WAIT_EVENT_TIMING_DENSE_CLASSES\t%d\n", $dense_classes;
+ printf $h "#define WAIT_EVENT_TIMING_NUM_EVENTS\t%d\n\n", $total_events;
+
printf $h "#endif /* WAIT_EVENT_TYPES_H */\n";
close $h;
close $c;
close $wc;
+ # Generate wait_event_timing_data.h with the mapping arrays.
+ # A header (rather than a .c file) keeps the file-extension category
+ # straight: it is included into a single TU (wait_event_timing.c) and
+ # defines static const tables there. The include guard makes the
+ # single-owner intent explicit and prevents accidental double inclusion.
+ my $ttmp = "$output_path/wait_event_timing_data.h.tmp$$";
+ open my $t, '>', $ttmp or die "Could not open $ttmp: $!";
+ printf $t $header_comment, 'wait_event_timing_data.h';
+
+ printf $t "#ifndef WAIT_EVENT_TIMING_DATA_H\n";
+ printf $t "#define WAIT_EVENT_TIMING_DATA_H\n\n";
+
+ # Emit wait_event_class_dense[]
+ printf $t "static const int8 wait_event_class_dense[WAIT_EVENT_TIMING_RAW_CLASSES] = {\n";
+ for (my $i = 0; $i < $raw_classes; $i++)
+ {
+ my $dense = -1;
+ my $comment = "unused";
+ for (my $d = 0; $d < $dense_classes; $d++)
+ {
+ if ($timing_classes[$d]->{raw_id} == $i)
+ {
+ $dense = $d;
+ $comment = $timing_classes[$d]->{name};
+ last;
+ }
+ }
+ # classId 0x01 is LWLock
+ if ($i == 0x01)
+ {
+ $comment = "LWLock (uses hash)";
+ }
+ my $comma = ($i < $raw_classes - 1) ? "," : "";
+ printf $t "\t%2d$comma\t\t/* 0x%02x: %s */\n", $dense, $i, $comment;
+ }
+ printf $t "};\n\n";
+
+ # Emit wait_event_class_nevents[]
+ printf $t "static const int wait_event_class_nevents[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n";
+ for (my $d = 0; $d < $dense_classes; $d++)
+ {
+ my $cls = $timing_classes[$d];
+ my $comma = ($d < $dense_classes - 1) ? "," : "";
+ printf $t "\t%d$comma\t\t/* %s (actual: %d) */\n",
+ $cls->{slots}, $cls->{name}, $cls->{actual};
+ }
+ printf $t "};\n\n";
+
+ # Emit wait_event_class_offset[]
+ printf $t "static const int wait_event_class_offset[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n";
+ for (my $d = 0; $d < $dense_classes; $d++)
+ {
+ my $cls = $timing_classes[$d];
+ my $comma = ($d < $dense_classes - 1) ? "," : "";
+ printf $t "\t%d$comma\t\t/* %s */\n", $cls->{offset}, $cls->{name};
+ }
+ printf $t "};\n\n";
+
+ # Emit wait_event_dense_to_classid[]
+ printf $t "static const uint8 wait_event_dense_to_classid[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n\t";
+ for (my $d = 0; $d < $dense_classes; $d++)
+ {
+ my $cls = $timing_classes[$d];
+ my $comma = ($d < $dense_classes - 1) ? ", " : "";
+ printf $t "0x%02x$comma", $cls->{raw_id};
+ }
+ printf $t "\n};\n\n";
+
+ printf $t "#endif /* WAIT_EVENT_TIMING_DATA_H */\n";
+
+ close $t;
+
rename($htmp, "$output_path/wait_event_types.h")
|| die "rename: $htmp to $output_path/wait_event_types.h: $!";
rename($ctmp, "$output_path/pgstat_wait_event.c")
|| die "rename: $ctmp to $output_path/pgstat_wait_event.c: $!";
rename($wctmp, "$output_path/wait_event_funcs_data.c")
|| die "rename: $wctmp to $output_path/wait_event_funcs_data.c: $!";
+ rename($ttmp, "$output_path/wait_event_timing_data.h")
+ || die "rename: $ttmp to $output_path/wait_event_timing_data.h: $!";
}
# Generate the .sgml file.
elsif ($gen_docs)
diff --git a/src/backend/utils/activity/meson.build b/src/backend/utils/activity/meson.build
index 1aa7ece52908c..1da4e216c4263 100644
--- a/src/backend/utils/activity/meson.build
+++ b/src/backend/utils/activity/meson.build
@@ -19,6 +19,7 @@ backend_sources += files(
'pgstat_subscription.c',
'pgstat_wal.c',
'pgstat_xact.c',
+ 'wait_event_timing.c',
)
# this includes a .c file with contents generated in ../../../include/activity,
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index 95635c7f56ce7..c8fab55b36321 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -41,8 +41,7 @@ static const char *pgstat_get_wait_io(WaitEventIO w);
static uint32 local_my_wait_event_info;
uint32 *my_wait_event_info = &local_my_wait_event_info;
-#define WAIT_EVENT_CLASS_MASK 0xFF000000
-#define WAIT_EVENT_ID_MASK 0x0000FFFF
+/* WAIT_EVENT_CLASS_MASK / WAIT_EVENT_ID_MASK are defined in utils/wait_classes.h */
/*
* Hash tables for storing custom wait event ids and their names in
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 560659f956856..fe3fac660c96d 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -417,6 +417,8 @@ XactSLRU "Waiting to access the transaction status SLRU cache."
ParallelVacuumDSA "Waiting for parallel vacuum dynamic shared memory allocation."
AioUringCompletion "Waiting for another process to complete IO via io_uring."
ShmemIndex "Waiting to find or allocate space in shared memory."
+WaitEventTimingDSA "Waiting for wait event timing dynamic shared memory allocation."
+WaitEventTraceDSA "Waiting for wait event trace dynamic shared memory allocation."
# No "ABI_compatibility" region here as WaitEventLWLock has its own C code.
diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c
new file mode 100644
index 0000000000000..26a13fe8d9fb7
--- /dev/null
+++ b/src/backend/utils/activity/wait_event_timing.c
@@ -0,0 +1,3111 @@
+/*-------------------------------------------------------------------------
+ *
+ * wait_event_timing.c
+ * Per-backend wait event timing and histogram accumulation.
+ *
+ * Every transition through pgstat_report_wait_start()/_end() records the
+ * wait duration with INSTR_TIME (a clock_gettime()-grade timestamp) and
+ * accumulates per-(backend, event) statistics -- count, total/max
+ * nanoseconds, and a log2 duration histogram -- in shared memory. Each
+ * backend writes only to its own slot, so the hot path needs no locking.
+ *
+ * The per-backend slot array is allocated lazily in a DSA the first time
+ * any backend in the cluster enables capture, so a build that compiles the
+ * feature in but never enables it pays no per-backend memory. Backends
+ * attach on their first wait event under capture; a before_shmem_exit gate
+ * keeps the hot path away from DSA mappings that proc_exit has torn down.
+ *
+ * At the trace level, every completed wait (and a set of query-attribution
+ * markers) is additionally pushed into a per-session DSA ring buffer that
+ * survives backend exit for post-mortem reading; see the trace section
+ * below and the lifecycle discussion on WaitEventTraceControl.
+ *
+ * Controlled by the wait_event_capture GUC (off | stats | trace, default
+ * off) and the compile-time option --enable-wait-event-timing. In builds
+ * without that option the file still compiles (the GUC backing variables,
+ * the enum table, the rejecting check hook, and empty-result SQL stubs),
+ * so the GUC and the catalog functions exist uniformly.
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/utils/activity/wait_event_timing.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "utils/guc.h"
+#include "utils/guc_hooks.h"
+#include "utils/wait_event_timing.h"
+
+/*
+ * GUC variables -- always defined so the GUC system has backing variables
+ * even when compiled without --enable-wait-event-timing.
+ */
+int wait_event_capture = WAIT_EVENT_CAPTURE_OFF;
+int wait_event_timing_max_tranches = 192;
+int wait_event_trace_ring_size = 4096;
+
+/*
+ * Records-per-ring derived from wait_event_trace_ring_size at server
+ * start. Read by the writer (via the per-ring cached mask) and by the ring
+ * allocator. Zero until the GUC framework commits the boot value.
+ */
+uint32 WaitEventTraceRingSize = 0;
+
+/*
+ * Enum value table consumed by guc.c. Order matches the
+ * WaitEventCaptureLevel enum and the documented "off < stats < trace".
+ */
+const struct config_enum_entry wait_event_capture_options[] = {
+ {"off", WAIT_EVENT_CAPTURE_OFF, false},
+ {"stats", WAIT_EVENT_CAPTURE_STATS, false},
+ {"trace", WAIT_EVENT_CAPTURE_TRACE, false},
+ {NULL, 0, false}
+};
+
+/*
+ * GUC check hook for wait_event_trace_ring_size. The ring size in records
+ * must be a power of two for the writer's mask-indexing (pos & ring_mask).
+ * Each record is 32 bytes, so kb is a power of two iff the record count is.
+ * Defined for both build configurations so the GUC framework validates it
+ * uniformly.
+ */
+bool
+check_wait_event_trace_ring_size(int *newval, void **extra, GucSource source)
+{
+ int v = *newval;
+
+ if (v <= 0 || (v & (v - 1)) != 0)
+ {
+ GUC_check_errdetail("wait_event_trace_ring_size must be a positive power of two.");
+ return false;
+ }
+ return true;
+}
+
+#ifndef USE_WAIT_EVENT_TIMING
+
+/*
+ * Stub build: no instrumentation. Provide the symbols referenced by
+ * pg_proc.dat, the GUC machinery, and the shmem subsystem registry.
+ */
+#include "fmgr.h"
+#include "funcapi.h"
+
+Datum pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS);
+Datum pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS);
+Datum pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS);
+Datum pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS);
+Datum pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS);
+Datum pg_get_wait_event_trace(PG_FUNCTION_ARGS);
+Datum pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS);
+
+Datum
+pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS)
+{
+ InitMaterializedSRF(fcinfo, 0);
+ PG_RETURN_VOID();
+}
+
+Datum
+pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS)
+{
+ InitMaterializedSRF(fcinfo, 0);
+ PG_RETURN_VOID();
+}
+
+Datum
+pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS)
+{
+ InitMaterializedSRF(fcinfo, 0);
+ PG_RETURN_VOID();
+}
+
+Datum
+pg_get_wait_event_trace(PG_FUNCTION_ARGS)
+{
+ InitMaterializedSRF(fcinfo, 0);
+ PG_RETURN_VOID();
+}
+
+Datum
+pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS)
+{
+ /* No trace infrastructure in this build, so never any orphans. */
+ PG_RETURN_INT64(0);
+}
+
+Datum
+pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("wait event capture is not supported by this build"),
+ errhint("Compile PostgreSQL with --enable-wait-event-timing.")));
+ PG_RETURN_VOID();
+}
+
+Datum
+pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("wait event capture is not supported by this build"),
+ errhint("Compile PostgreSQL with --enable-wait-event-timing.")));
+ PG_RETURN_VOID();
+}
+
+/*
+ * GUC check hook for the stub build. Any value other than 'off' is
+ * meaningless without --enable-wait-event-timing, so reject it -- or
+ * downgrade to 'off' with a warning when the value comes from a
+ * non-interactive source, so a leftover setting does not block startup.
+ */
+bool
+check_wait_event_capture(int *newval, void **extra, GucSource source)
+{
+ if (*newval != WAIT_EVENT_CAPTURE_OFF)
+ {
+ if (source < PGC_S_INTERACTIVE)
+ {
+ ereport(WARNING,
+ (errmsg("wait_event_capture is not supported by this build, "
+ "forcing to \"off\""),
+ errhint("Compile PostgreSQL with "
+ "--enable-wait-event-timing.")));
+ *newval = WAIT_EVENT_CAPTURE_OFF;
+ return true;
+ }
+ GUC_check_errdetail("This build does not support wait event capture.");
+ GUC_check_errhint("Compile PostgreSQL with --enable-wait-event-timing.");
+ return false;
+ }
+ return true;
+}
+
+void
+assign_wait_event_capture(int newval, void *extra)
+{
+}
+
+/* No shared memory is reserved in the stub build. */
+const ShmemCallbacks WaitEventTimingShmemCallbacks = {0};
+const ShmemCallbacks WaitEventTraceControlShmemCallbacks = {0};
+
+/* Defined so every extern in wait_event_timing.h resolves in stub builds. */
+WaitEventTimingState *my_wait_event_timing = NULL;
+int my_trace_proc_number = -1;
+
+void
+pgstat_set_wait_event_timing_storage(int procNumber)
+{
+}
+
+void
+pgstat_reset_wait_event_timing_storage(void)
+{
+}
+
+void
+wait_event_trace_attach(int procNumber)
+{
+}
+
+/*
+ * Stub trace-marker entry points -- declared unconditionally in the header
+ * so call sites in execMain.c/postgres.c/backend_status.c need no #ifdef.
+ */
+void
+wait_event_trace_query_start(int64 query_id)
+{
+}
+
+void
+wait_event_trace_query_end(int64 query_id)
+{
+}
+
+void
+wait_event_trace_exec_start(int64 query_id)
+{
+}
+
+void
+wait_event_trace_exec_end(int64 query_id)
+{
+}
+
+#else /* USE_WAIT_EVENT_TIMING */
+
+#include "catalog/pg_authid.h"
+#include "catalog/pg_type_d.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "nodes/queryjumble.h"
+#include "port/pg_bitutils.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procnumber.h"
+#include "utils/acl.h"
+#include "utils/array.h"
+#include "utils/backend_status.h"
+#include "utils/builtins.h"
+#include "utils/dsa.h"
+#include "utils/injection_point.h"
+#include "utils/tuplestore.h"
+#include "utils/wait_event.h"
+
+#define NUM_WAIT_EVENT_TIMING_SLOTS (MaxBackends + NUM_AUXILIARY_PROCS)
+
+#define HAS_PGSTAT_PERMISSIONS(role) \
+ (has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS) || \
+ has_privs_of_role(GetUserId(), role))
+
+/* Pointer to this backend's timing state in shared memory. */
+WaitEventTimingState *my_wait_event_timing = NULL;
+
+/*
+ * Backend-local copy of the last reset generation this backend acted on.
+ * Compared against the shared reset_generation at every wait_end; when they
+ * differ, the owning backend performs the reset of its own counters on
+ * behalf of whoever called pg_stat_reset_wait_event_timing(target).
+ */
+static uint32 my_last_reset_generation = 0;
+
+/*
+ * DSA-based control struct in fixed shared memory. The large per-backend
+ * WaitEventTimingState array is allocated lazily in DSA the first time any
+ * backend in the cluster sets wait_event_capture != off, so a build that
+ * compiles the feature in but never enables it pays no per-backend memory.
+ */
+typedef struct WaitEventTimingControl
+{
+ LWLock lock; /* protects first-time DSA create + array
+ * alloc */
+ dsa_handle timing_dsa_handle; /* DSA_HANDLE_INVALID until first enable */
+ dsa_pointer timing_array; /* InvalidDsaPointer until first enable */
+} WaitEventTimingControl;
+
+static WaitEventTimingControl *WaitEventTimingCtl = NULL;
+static dsa_area *timing_dsa = NULL;
+
+/*
+ * Per-backend gate raised by the before_shmem_exit callback once proc_exit
+ * begins tearing down DSA mappings (dsm_backend_shutdown runs as a later
+ * on_shmem_exit callback). Once set, the wait-event hot path skips all
+ * timing work -- including the lazy re-attach branch -- so it cannot
+ * dereference my_wait_event_timing or run DSA primitives on already-detached
+ * memory. Per-backend (process-local), so the hot-path check is a single
+ * cache-warm load.
+ */
+static bool wait_event_timing_writes_disabled = false;
+
+/*
+ * Backend-local cached pointer to the start of the shared slot array, set on
+ * first lazy-attach. Slots are NOT a simple C array: each has a
+ * runtime-determined stride (header + variable-size hash arrays); use
+ * wet_slot() to index.
+ */
+static char *WaitEventTimingArray = NULL;
+
+/*
+ * Per-backend slot stride and the hash dimensions, all derived from the
+ * GUC wait_event_timing_max_tranches. Because the GUC is PGC_POSTMASTER,
+ * every backend in the cluster derives identical values, so the shared
+ * layout is consistent across fork() and EXEC_BACKEND.
+ */
+static Size wait_event_timing_per_backend_stride = 0;
+static int wait_event_timing_hash_size = 0;
+static int wait_event_timing_max_entries = 0;
+
+/* ---- Trace-level (wait_event_capture = trace) state ---- */
+/* Pointer to this backend's trace ring buffer */
+static WaitEventTraceState *my_wait_event_trace = NULL;
+
+/* DSA-based trace ring buffer control */
+static WaitEventTraceControl *WaitEventTraceCtl = NULL;
+static dsa_area *trace_dsa = NULL;
+int my_trace_proc_number = -1;
+
+/*
+ * Same-backend coordination between pg_get_backend_wait_event_trace (the
+ * own-session SRF reader) and wait_event_trace_release_slot (the GUC
+ * step-down path that frees this backend's ring). Both paths run in this
+ * same backend, single-threaded, so a plain bool is sufficient -- no
+ * atomics needed.
+ *
+ * srf_in_progress set true while the SRF is iterating the ring; the
+ * release path observes this and defers the dsa_free
+ * instead of yanking the chunk out from under us.
+ *
+ * release_pending set by the release path when it had to defer; the
+ * SRF's PG_FINALLY checks it and performs the deferred
+ * dsa_free after the iteration completes.
+ *
+ * Cross-backend readers (extensions, bgworkers reading another backend's
+ * ring) cannot use this mechanism -- they coordinate with the release
+ * path via WaitEventTraceCtl->lock instead. See the header for the
+ * recommended snapshot-under-lock pattern for those consumers.
+ */
+static bool wait_event_trace_srf_in_progress = false;
+static bool wait_event_trace_release_pending = false;
+
+/*
+ * Per-backend gate that disables the trace-ring writer in the wait-
+ * event hot path while a slot-state transition is in progress.
+ *
+ * Set true around code paths that either free the local trace ring
+ * (wait_event_trace_release_slot's dsa_free) or transition the slot
+ * out of OWNED (wait_event_trace_before_shmem_exit's OWNED ->
+ * ORPHANED publish). In both cases an internal LWLock inside
+ * dsa_free / dsa_attach / dsa_pin_mapping / dsa_pin can in
+ * principle contend long enough to dispatch a wait event; that
+ * wait event's pgstat_report_wait_end_timing inline path runs in
+ * the SAME backend, sees capture_level == TRACE (the GUC hasn't
+ * been committed yet by the time the assign hook runs), and would:
+ *
+ * * during release_slot's dsa_free: write into a ring that has
+ * already been returned to the DSA freelist -- if another
+ * allocator has since reused the chunk, this is a stray write
+ * into someone else's allocation.
+ *
+ * * during release_slot's dsa_free, alternative timing: see
+ * my_wait_event_trace == NULL on a naive "clear before free"
+ * fix and recurse into wait_event_trace_attach, which would
+ * either deadlock on the WaitEventTraceCtl->lock the outer
+ * release_slot already holds, or (on a lock-free moment)
+ * allocate a fresh ring that the outer release_slot would
+ * then free again as part of its post-acquire DsaPointerIsValid
+ * check -- a different use-after-free of a freshly-allocated
+ * chunk.
+ *
+ * * during before_shmem_exit: write into the ring after the slot
+ * has been published as ORPHANED, violating the post-mortem
+ * read-only contract that cross-backend readers rely on.
+ *
+ * The flag is per-backend (static at file scope means per-process
+ * in PG's process-per-backend model), so the hot path's check is a
+ * single cache-warm load and a branch; no atomic, no fence. The
+ * trace branch is already gated by capture_level == TRACE so the
+ * additional check costs nothing in the common case where capture
+ * is off or stats-only. The flag is set on the very same backend
+ * that may later read it from the hot path, so there is no
+ * cross-process visibility concern.
+ *
+ * See the release_slot and before_shmem_exit doc comments for the
+ * specific transition each uses this flag around.
+ */
+static bool wait_event_trace_writes_disabled = false;
+
+/*
+ * Re-entrancy guard for the trace-record writer in
+ * pgstat_report_wait_end_timing(): wait events emitted while a trace
+ * record is mid-write are not themselves ring-recorded. In production the
+ * record write is a handful of plain stores and cannot wait, so this never
+ * fires; it exists for the injection point inside the writer, whose wait
+ * machinery emits nested wait events that would otherwise recurse into the
+ * writer (and, by re-running the injection point, self-deadlock on locks
+ * the outer invocation holds). Per-backend, single-threaded -- a plain
+ * bool suffices.
+ */
+static bool wait_event_trace_in_write = false;
+
+/*
+ * Forward declarations for trace functions referenced before their
+ * definitions (the storage/assign hooks call them; the bodies are in the
+ * trace machinery section below).
+ */
+static void wait_event_trace_detach(int procNumber);
+static void wait_event_trace_release_slot(int procNumber);
+static void wait_event_trace_clear_orphan_at_init(int procNumber);
+
+/*
+ * Mapping arrays for the flat events[] array, generated from
+ * wait_event_names.txt by generate-wait_event_types.pl. Defines
+ * wait_event_class_dense / _nevents / _offset / _dense_to_classid.
+ */
+#include "utils/wait_event_timing_data.h"
+
+/*
+ * Round up to the next power of two, with a minimum of 32. The hash slot
+ * count must be a power of two for the mask-based modulo in the lookup hot
+ * path; we target >= 2x the entry cap so the load factor stays <= 50%.
+ */
+static int
+wait_event_timing_hash_size_for(int max_entries)
+{
+ int size = 32;
+
+ while (size < max_entries * 2)
+ size <<= 1;
+ return size;
+}
+
+/*
+ * Compute the per-backend slot size for the given max_entries. Layout:
+ *
+ * [ WaitEventTimingState header ]
+ * [ LWLockTimingHashEntry[hash_size] ]
+ * [ WaitEventTimingEntry[max_entries] <- lwlock_events[] ]
+ */
+static Size
+wait_event_timing_slot_size(int max_entries)
+{
+ int hash_size = wait_event_timing_hash_size_for(max_entries);
+
+ return add_size(sizeof(WaitEventTimingState),
+ add_size(mul_size(hash_size, sizeof(LWLockTimingHashEntry)),
+ mul_size(max_entries, sizeof(WaitEventTimingEntry))));
+}
+
+/*
+ * Ensure this backend is attached to the timing DSA. The DSA is created by
+ * whichever backend first reaches here with an empty control struct;
+ * subsequent callers attach to the existing handle. The backend-local
+ * dsa_area pointer is cached in timing_dsa for the backend's lifetime.
+ */
+static void
+wait_event_timing_ensure_dsa(void)
+{
+ MemoryContext oldcontext;
+
+ if (timing_dsa != NULL)
+ return;
+
+ if (WaitEventTimingCtl == NULL)
+ return; /* pre-ShmemInit; nothing to attach to */
+
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ LWLockAcquire(&WaitEventTimingCtl->lock, LW_EXCLUSIVE);
+
+ if (WaitEventTimingCtl->timing_dsa_handle == DSA_HANDLE_INVALID)
+ {
+ timing_dsa = dsa_create(LWTRANCHE_WAIT_EVENT_TIMING_DSA);
+ dsa_pin(timing_dsa);
+ dsa_pin_mapping(timing_dsa);
+ WaitEventTimingCtl->timing_dsa_handle = dsa_get_handle(timing_dsa);
+ }
+ else
+ {
+ timing_dsa = dsa_attach(WaitEventTimingCtl->timing_dsa_handle);
+ dsa_pin_mapping(timing_dsa);
+ }
+
+ LWLockRelease(&WaitEventTimingCtl->lock);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Attach this backend to the shared per-backend slot array, allocating it in
+ * DSA on first use when allocate_if_missing is true. Returns true if the
+ * array is now available (WaitEventTimingArray non-NULL). Readers pass
+ * false so a SELECT against an empty view does not force a big allocation;
+ * the hot path passes true so the first wait event under capture != off
+ * creates the storage.
+ *
+ * Re-entrancy guard: dsa_create / dsa_allocate / the LWLockAcquire inside
+ * ensure_dsa can themselves emit LWLock wait events, which feed back into
+ * the wait-end timing hot path, which lazy-attaches by calling this
+ * function. The in_attach guard prevents deadlock on the control lock and
+ * recursion with a half-initialised pointer.
+ */
+static bool
+wait_event_timing_attach_array(bool allocate_if_missing)
+{
+ static bool in_attach = false;
+ bool attached = false;
+
+ if (WaitEventTimingArray != NULL)
+ return true;
+
+ if (WaitEventTimingCtl == NULL)
+ return false;
+
+ /*
+ * Reader fast path: if no backend has ever enabled capture, the DSA was
+ * never created. Return without calling ensure_dsa() so a plain SELECT
+ * against an empty view does not create (and pin for the postmaster's
+ * lifetime) an otherwise-unused DSA segment. Only the first enabler
+ * (allocate_if_missing) brings the DSA into existence.
+ */
+ if (!allocate_if_missing &&
+ WaitEventTimingCtl->timing_dsa_handle == DSA_HANDLE_INVALID)
+ return false;
+
+ if (in_attach)
+ return false;
+
+ in_attach = true;
+ PG_TRY();
+ {
+ wait_event_timing_ensure_dsa();
+
+ if (WaitEventTimingCtl->timing_array == InvalidDsaPointer)
+ {
+ if (!allocate_if_missing)
+ {
+ attached = false;
+ }
+ else
+ {
+ int max_entries;
+ int hash_size;
+ Size stride;
+ Size total;
+
+ /*
+ * Snapshot the GUC once for the cluster-wide first-enable
+ * allocation; every slot shares these dimensions for the
+ * cluster's lifetime (the GUC is PGC_POSTMASTER).
+ */
+ max_entries = wait_event_timing_max_tranches;
+ hash_size = wait_event_timing_hash_size_for(max_entries);
+ stride = wait_event_timing_slot_size(max_entries);
+ total = mul_size(NUM_WAIT_EVENT_TIMING_SLOTS, stride);
+
+ LWLockAcquire(&WaitEventTimingCtl->lock, LW_EXCLUSIVE);
+
+ if (WaitEventTimingCtl->timing_array == InvalidDsaPointer)
+ {
+ dsa_pointer p;
+ char *region;
+ int i;
+
+ p = dsa_allocate_extended(timing_dsa, total, DSA_ALLOC_ZERO);
+ region = (char *) dsa_get_address(timing_dsa, p);
+
+ for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++)
+ {
+ WaitEventTimingState *slot;
+ LWLockTimingHashEntry *slot_entries;
+ int j;
+
+ slot = (WaitEventTimingState *) (region + (Size) i * stride);
+
+ pg_atomic_init_u32(&slot->reset_generation, 0);
+ slot->lwlock_hash.num_used = 0;
+ slot->lwlock_hash.hash_size = hash_size;
+ slot->lwlock_hash.max_entries = max_entries;
+
+ /*
+ * DSA_ALLOC_ZERO zeroed the region, but the empty
+ * sentinel is 0xFFFF, not 0.
+ */
+ slot_entries = (LWLockTimingHashEntry *)
+ ((char *) slot + sizeof(WaitEventTimingState));
+ for (j = 0; j < hash_size; j++)
+ slot_entries[j].tranche_id = LWLOCK_TIMING_EMPTY_SLOT;
+ }
+
+ WaitEventTimingCtl->timing_array = p;
+ }
+
+ LWLockRelease(&WaitEventTimingCtl->lock);
+ attached = true;
+ }
+ }
+ else
+ attached = true;
+
+ if (attached)
+ {
+ WaitEventTimingState *first;
+
+ WaitEventTimingArray = (char *)
+ dsa_get_address(timing_dsa, WaitEventTimingCtl->timing_array);
+
+ /*
+ * Recover the dimensions from the first slot's header (all slots
+ * share them) and cache the stride so wet_slot() is a single
+ * multiply-and-add.
+ */
+ first = (WaitEventTimingState *) WaitEventTimingArray;
+ wait_event_timing_max_entries = first->lwlock_hash.max_entries;
+ wait_event_timing_hash_size = first->lwlock_hash.hash_size;
+ wait_event_timing_per_backend_stride =
+ wait_event_timing_slot_size(wait_event_timing_max_entries);
+ }
+ }
+ PG_FINALLY();
+ {
+ in_attach = false;
+ }
+ PG_END_TRY();
+
+ return WaitEventTimingArray != NULL;
+}
+
+/* Resolve the address of slot `idx` within WaitEventTimingArray. */
+static inline WaitEventTimingState *
+wet_slot(int idx)
+{
+ return (WaitEventTimingState *)
+ (WaitEventTimingArray + (Size) idx * wait_event_timing_per_backend_stride);
+}
+
+/*
+ * Address of the LWLock hash slot table for a slot (immediately follows
+ * the WaitEventTimingState header).
+ */
+static inline LWLockTimingHashEntry *
+wet_lwlock_hash_entries(WaitEventTimingState *state)
+{
+ return (LWLockTimingHashEntry *) ((char *) state + sizeof(WaitEventTimingState));
+}
+
+/*
+ * Address of the dense LWLock events array for a slot (immediately follows
+ * the slot table).
+ */
+static inline WaitEventTimingEntry *
+wet_lwlock_hash_events(WaitEventTimingState *state)
+{
+ return (WaitEventTimingEntry *)
+ ((char *) state + sizeof(WaitEventTimingState)
+ + (Size) state->lwlock_hash.hash_size * sizeof(LWLockTimingHashEntry));
+}
+
+/*
+ * Convert wait_event_info to a flat index for the events[] array. Returns
+ * WAIT_EVENT_TIMING_IDX_LWLOCK for LWLock events (which use the hash) and
+ * -1 for events outside the mapped classes.
+ */
+static int
+wait_event_timing_index(uint32 wait_event_info)
+{
+ uint32 classId = wait_event_info & WAIT_EVENT_CLASS_MASK;
+ int eventId = wait_event_info & WAIT_EVENT_ID_MASK;
+ int class_byte;
+ int dense;
+
+ if (classId == PG_WAIT_LWLOCK)
+ return WAIT_EVENT_TIMING_IDX_LWLOCK;
+
+ class_byte = classId >> 24;
+ if (class_byte >= WAIT_EVENT_TIMING_RAW_CLASSES)
+ return -1;
+
+ dense = wait_event_class_dense[class_byte];
+ if (dense < 0)
+ return -1;
+
+ if (eventId >= wait_event_class_nevents[dense])
+ return -1;
+
+ return wait_event_class_offset[dense] + eventId;
+}
+
+/*
+ * Reset a slot's LWLockTimingHash to its empty initial state. The hash
+ * header's hash_size and max_entries are immutable and not reset here.
+ */
+static void
+lwlock_timing_hash_clear(WaitEventTimingState *state)
+{
+ LWLockTimingHash *ht = &state->lwlock_hash;
+ LWLockTimingHashEntry *entries = wet_lwlock_hash_entries(state);
+ WaitEventTimingEntry *events = wet_lwlock_hash_events(state);
+ int i;
+
+ ht->num_used = 0;
+ memset(events, 0, (Size) ht->max_entries * sizeof(WaitEventTimingEntry));
+ for (i = 0; i < ht->hash_size; i++)
+ {
+ entries[i].tranche_id = LWLOCK_TIMING_EMPTY_SLOT;
+ entries[i].dense_idx = 0;
+ }
+}
+
+/*
+ * Maximum probes attempted on the lookup hot path once the table is at
+ * capacity. At cap an unknown tranche cannot be inserted, so bounding the
+ * scan caps the per-event cost instead of walking clustered occupied slots
+ * on every unknown-tranche wait_end. 8 is well above the expected probe
+ * distance at the target load factor.
+ */
+#define LWLOCK_TIMING_LOOKUP_AT_CAP_PROBE_LIMIT 8
+
+/*
+ * Look up (or insert) the timing entry for an LWLock tranche id. Returns
+ * NULL when the table is at capacity and the tranche is not already
+ * present.
+ */
+static WaitEventTimingEntry *
+lwlock_timing_lookup(WaitEventTimingState *state, uint16 tranche_id)
+{
+ LWLockTimingHash *ht = &state->lwlock_hash;
+ LWLockTimingHashEntry *entries = wet_lwlock_hash_entries(state);
+ WaitEventTimingEntry *events = wet_lwlock_hash_events(state);
+ uint32 hash = (uint32) tranche_id * 2654435761U;
+ int slot = hash & (ht->hash_size - 1);
+ int limit;
+ int i;
+
+ limit = (ht->num_used >= ht->max_entries)
+ ? LWLOCK_TIMING_LOOKUP_AT_CAP_PROBE_LIMIT
+ : ht->hash_size;
+
+ for (i = 0; i < limit; i++)
+ {
+ LWLockTimingHashEntry *e = &entries[slot];
+
+ if (e->tranche_id == tranche_id)
+ return &events[e->dense_idx];
+
+ if (e->tranche_id == LWLOCK_TIMING_EMPTY_SLOT)
+ {
+ if (ht->num_used >= ht->max_entries)
+ return NULL;
+
+ e->tranche_id = tranche_id;
+ e->dense_idx = ht->num_used++;
+ return &events[e->dense_idx];
+ }
+
+ slot = (slot + 1) & (ht->hash_size - 1);
+ }
+
+ return NULL;
+}
+
+/*
+ * Compute the histogram bucket index for a duration in nanoseconds. See
+ * the rationale on WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS in the header.
+ */
+static int
+wait_event_timing_bucket(int64 duration_ns)
+{
+ int bucket;
+
+ /*
+ * Everything under ~1us (and 0, undefined for pg_leftmost_one_pos64)
+ * lands in bucket 0.
+ */
+ if (duration_ns < 1024)
+ return 0;
+
+ bucket = pg_leftmost_one_pos64((uint64) duration_ns) - 9;
+
+ if (bucket >= WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS)
+ bucket = WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS - 1;
+
+ return bucket;
+}
+
+/*
+ * ShmemRequest/Init: reserve only the small control struct in fixed shmem.
+ * The large per-backend array is allocated lazily in DSA on first enable.
+ */
+static void
+WaitEventTimingShmemRequest(void *arg)
+{
+ ShmemRequestStruct(.name = "WaitEventTimingControl",
+ .size = sizeof(WaitEventTimingControl),
+ .ptr = (void **) &WaitEventTimingCtl);
+}
+
+static void
+WaitEventTimingShmemInit(void *arg)
+{
+ LWLockInitialize(&WaitEventTimingCtl->lock,
+ LWTRANCHE_WAIT_EVENT_TIMING_DSA);
+ WaitEventTimingCtl->timing_dsa_handle = DSA_HANDLE_INVALID;
+ WaitEventTimingCtl->timing_array = InvalidDsaPointer;
+ WaitEventTimingArray = NULL;
+}
+
+const ShmemCallbacks WaitEventTimingShmemCallbacks = {
+ .request_fn = WaitEventTimingShmemRequest,
+ .init_fn = WaitEventTimingShmemInit,
+};
+
+/*
+ * before_shmem_exit callback: disable the inline hot path for the rest of
+ * proc_exit, so it does not dereference my_wait_event_timing or attempt a
+ * fresh lazy-attach after dsm_backend_shutdown has unmapped the DSA segment
+ * behind the slot. We do NOT null my_wait_event_timing here: a NULL pointer
+ * would route the hot path through the lazy-attach branch, which then
+ * re-attaches using DSA primitives that operate on already-detached memory.
+ */
+static void
+pgstat_wait_event_timing_before_shmem_exit(int code, Datum arg)
+{
+ wait_event_timing_writes_disabled = true;
+}
+
+/*
+ * Point my_wait_event_timing at this backend's slot, allocating the DSA
+ * array on first call. Reached from the hot path the first time this
+ * backend observes wait_event_capture != off; after the first successful
+ * attach the cached pointer stays valid for the backend's lifetime, so this
+ * is a cold branch.
+ */
+static void
+pgstat_wait_event_timing_lazy_attach(void)
+{
+ int procNumber;
+ WaitEventTimingState *slot;
+
+ if (my_wait_event_timing != NULL)
+ return;
+
+ if (MyProc == NULL)
+ return;
+
+ /*
+ * Lazy attach allocates memory (dsa_attach -> dsm_attach ->
+ * MemoryContextAlloc), which Assert-fails inside a critical section. A
+ * backend's very first wait event after capture is enabled can land in
+ * one (e.g. a parallel worker in XLogInsert). Skipping silently drops
+ * that in-flight event but keeps the backend alive; the next wait event
+ * outside any critical section attaches successfully.
+ */
+ if (CritSectionCount > 0)
+ return;
+
+ /*
+ * Skip if MyProc is already on an LWLock wait queue: we run inside
+ * LWLockAcquire after LWLockQueueSelf set MyProc->lwWaiting, and our
+ * attach path's own LWLockAcquire would hit the "queueing while waiting
+ * on another lock" PANIC. The next wait outside an LWLock-wait context
+ * retries successfully.
+ */
+ if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
+ return;
+
+ procNumber = GetNumberFromPGProc(MyProc);
+ if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+ return;
+
+ if (!wait_event_timing_attach_array(true))
+ return;
+
+ slot = wet_slot(procNumber);
+
+ /*
+ * Clear the slot before publishing it: the DSA region is zeroed at
+ * creation, but a later backend may inherit a slot from an exited one.
+ * Zero through the local `slot` first, THEN publish to
+ * my_wait_event_timing, so a non-NULL pointer always means the slot is
+ * ready for the very next store.
+ */
+ memset(slot->events, 0, sizeof(slot->events));
+ lwlock_timing_hash_clear(slot);
+ slot->reset_count = 0;
+ slot->lwlock_overflow_count = 0;
+ slot->flat_overflow_count = 0;
+ slot->current_event = 0;
+ INSTR_TIME_SET_ZERO(slot->wait_start);
+
+ my_last_reset_generation = pg_atomic_read_u32(&slot->reset_generation);
+
+ /* Publish only after the slot is fully initialised. */
+ my_wait_event_timing = slot;
+
+ /*
+ * Register a before_shmem_exit callback (once) to raise the
+ * writes-disabled gate before the DSA mappings go away. shmem_exit()
+ * runs all before_shmem_exit callbacks (which sets the gate) and only
+ * then calls dsm_backend_shutdown(), so the gate is guaranteed up before
+ * any DSA segment is unmapped -- independent of callback ordering.
+ */
+ {
+ static bool registered = false;
+
+ if (!registered)
+ {
+ before_shmem_exit(pgstat_wait_event_timing_before_shmem_exit,
+ (Datum) 0);
+ registered = true;
+ }
+ }
+}
+
+/*
+ * Called from InitProcess()/InitAuxiliaryProcess(). In the lazy-DSA design
+ * there is no slot to point at yet -- the slot attaches on the first wait
+ * event under capture != off (pgstat_wait_event_timing_lazy_attach) -- so a
+ * backend that never enables capture pays zero per-backend memory. Just
+ * make sure the pointer starts NULL.
+ */
+void
+pgstat_set_wait_event_timing_storage(int procNumber)
+{
+ my_wait_event_timing = NULL;
+ my_wait_event_trace = NULL;
+
+ if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+ {
+ my_trace_proc_number = -1;
+ return;
+ }
+
+ my_trace_proc_number = procNumber;
+
+ /*
+ * If the previous occupant of this procNumber left an ORPHANED trace ring
+ * behind (we do not free trace rings at backend exit -- see
+ * WaitEventTraceControl), free it now so this backend starts with a clean
+ * FREE slot.
+ */
+ wait_event_trace_clear_orphan_at_init(procNumber);
+}
+
+/*
+ * Detach from the timing slot on backend exit. The slots stay in DSA;
+ * clearing the local pointers keeps the late-shutdown hot path from touching
+ * already-detached memory.
+ */
+void
+pgstat_reset_wait_event_timing_storage(void)
+{
+ if (my_trace_proc_number >= 0)
+ wait_event_trace_detach(my_trace_proc_number);
+
+ my_wait_event_timing = NULL;
+ my_wait_event_trace = NULL;
+ my_trace_proc_number = -1;
+}
+
+/*
+ * GUC check hook for wait_event_capture (timing build). All enum values
+ * are accepted; there is nothing to validate beyond the enum mapping.
+ */
+bool
+check_wait_event_capture(int *newval, void **extra, GucSource source)
+{
+ return true;
+}
+
+/*
+ * GUC assign hook for wait_event_capture (timing build).
+ *
+ * Drop any in-flight wait state: after the capture level changes, the
+ * existing wait_start / current_event can no longer be trusted (a wait
+ * that started under one level and ends under another would be miscredited
+ * or use a stale start time). Forfeiting at most one in-flight sample per
+ * GUC change is negligible and eliminates all such miscredits.
+ */
+void
+assign_wait_event_capture(int newval, void *extra)
+{
+ if (my_wait_event_timing != NULL)
+ {
+ INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start);
+ my_wait_event_timing->current_event = 0;
+ }
+
+ /*
+ * Stepping down from TRACE: release the ~4 MB DSA ring now rather than
+ * holding it pinned for the rest of the session. Only fires when a ring
+ * is actually attached, so OFF -> TRACE -> OFF without ever emitting a
+ * trace record stays a no-op. Re-enable re-allocates a fresh ring on the
+ * first wait event via wait_event_trace_attach. dsa_free is non-raising
+ * LWLock bookkeeping, so it is safe from an assign hook.
+ */
+ if (newval != WAIT_EVENT_CAPTURE_TRACE && my_wait_event_trace != NULL)
+ wait_event_trace_release_slot(my_trace_proc_number);
+
+ /*
+ * Trace-level query attribution needs a non-zero query_id (from
+ * compute_query_id) and the activity reporting that drives the markers
+ * (track_activities). Warn -- but never error -- if either is missing;
+ * assign hooks must not ereport(ERROR). Trace still records wait events;
+ * only the query markers are affected.
+ */
+ if (newval == WAIT_EVENT_CAPTURE_TRACE && !pgstat_track_activities)
+ ereport(WARNING,
+ (errmsg("query attribution at \"wait_event_capture\" = \"trace\" requires \"track_activities\" to be enabled")));
+
+ if (newval == WAIT_EVENT_CAPTURE_TRACE &&
+ compute_query_id == COMPUTE_QUERY_ID_OFF)
+ ereport(WARNING,
+ (errmsg("query attribution at \"wait_event_capture\" = \"trace\" requires \"compute_query_id\" to be enabled"),
+ errhint("Set \"compute_query_id\" to \"on\" or \"auto\", or load a module that enables it.")));
+}
+
+/* ================= Trace-level ring machinery ================= */
+
+
+/*
+ * Write a trace ring marker record. Shared helper for all marker types.
+ */
+static void
+wait_event_trace_write_marker(uint8 record_type, int64 query_id)
+{
+ uint64 pos;
+ WaitEventTraceRecord *rec;
+ uint32 seq;
+ instr_time now;
+
+ /*
+ * Single capture-level gate: markers only land in the ring when
+ * wait_event_capture is at TRACE. This guarantees consistency with the
+ * wait-event hot path (also gated on the same level) -- there is no
+ * configuration in which one half of the trace fires and the other
+ * doesn't. query_id == 0 means "no query ID available" (utility command
+ * or compute_query_id = off), which we skip.
+ *
+ * wait_event_trace_writes_disabled is the same per-backend gate the
+ * wait-event hot path uses; it is raised by release_slot and
+ * before_shmem_exit around slot-state transitions to keep both writers
+ * consistent. Markers cannot fire during those transitions today
+ * (single-threaded execution, no nested executor), but checking here
+ * keeps the contract uniform across all trace-ring writers and is robust
+ * to future code paths that might invoke a marker from a nested context.
+ *
+ * No likely()/unlikely() annotation: this function is called at
+ * query/exec boundaries (a handful per query, not per wait event), so
+ * neither side of the branch dominates often enough for static layout to
+ * matter, and the meaningful production configuration (wait_event_capture
+ * = trace) is exactly when the body is hot -- an annotation on the
+ * early-return would point the wrong way.
+ */
+ if (wait_event_capture != WAIT_EVENT_CAPTURE_TRACE ||
+ wait_event_trace_writes_disabled ||
+ query_id == 0)
+ return;
+
+ /*
+ * Lazy attach on first use. Allocation lives here (not in the assign
+ * hook) because dsa_allocate_extended() can ereport(ERROR) on OOM, which
+ * is forbidden in assign-hook context but legitimate here. Idempotent:
+ * wait_event_trace_attach() short-circuits on subsequent calls.
+ */
+ if (my_wait_event_trace == NULL)
+ {
+ if (my_trace_proc_number < 0)
+ return;
+ wait_event_trace_attach(my_trace_proc_number);
+ if (my_wait_event_trace == NULL)
+ return; /* attach path unable to allocate */
+ }
+
+ /*
+ * Claim the next slot. Single-writer counter (only the owning backend
+ * writes its own ring), so a plain read+write is sufficient and avoids
+ * the LOCK XADD that pg_atomic_fetch_add_u64 would emit -- a wasted
+ * cache-coherence trip on an unshared cache line at this rate (one per
+ * wait event). Cross-backend readers use pg_atomic_read_u64, which
+ * compiles to a plain MOV on x86 and tolerates concurrent writes here
+ * (their actual safety against the records[] window is the per-record
+ * seqlock below). Same idiom as injection_point.c's per-entry generation
+ * counter (single writer + multiple lock-free readers).
+ */
+ pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos);
+ pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1);
+ rec = &my_wait_event_trace->records[pos & my_wait_event_trace->ring_mask];
+ seq = (uint32) (pos * 2 + 1);
+
+ rec->seq = seq;
+ pg_write_barrier(); /* release: payload stores must not rise above
+ * seq=odd */
+
+ INSTR_TIME_SET_CURRENT(now);
+ rec->record_type = record_type;
+ rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now);
+ rec->data.query.query_id = query_id;
+ rec->data.query.pad2 = 0;
+
+ pg_write_barrier(); /* release: payload stores must land before
+ * seq=even */
+ rec->seq = seq + 1;
+}
+
+void
+wait_event_trace_query_start(int64 query_id)
+{
+ wait_event_trace_write_marker(TRACE_QUERY_START, query_id);
+}
+
+void
+wait_event_trace_query_end(int64 query_id)
+{
+ wait_event_trace_write_marker(TRACE_QUERY_END, query_id);
+}
+
+void
+wait_event_trace_exec_start(int64 query_id)
+{
+ wait_event_trace_write_marker(TRACE_EXEC_START, query_id);
+}
+
+void
+wait_event_trace_exec_end(int64 query_id)
+{
+ wait_event_trace_write_marker(TRACE_EXEC_END, query_id);
+}
+
+/*
+ * Report the shared memory space needed for trace ring buffer control.
+ * Only a small control struct is in fixed shmem; the actual ring buffers
+ * are allocated lazily via DSA. At ~24 bytes/slot, the slot array adds
+ * ~26 KB at a default MaxBackends, negligible compared to the ring
+ * memory itself.
+ */
+static Size
+WaitEventTraceControlShmemSize(void)
+{
+ return add_size(offsetof(WaitEventTraceControl, trace_slots),
+ mul_size(NUM_WAIT_EVENT_TIMING_SLOTS,
+ sizeof(WaitEventTraceSlot)));
+}
+
+static void
+WaitEventTraceControlShmemRequest(void *arg)
+{
+ ShmemRequestStruct(.name = "WaitEventTraceControl",
+ .size = WaitEventTraceControlShmemSize(),
+ .ptr = (void **) &WaitEventTraceCtl);
+}
+
+/*
+ * Initialize shared memory for trace ring buffer control.
+ */
+static void
+WaitEventTraceControlShmemInit(void *arg)
+{
+ int i;
+
+ WaitEventTraceCtl->trace_dsa_handle = DSA_HANDLE_INVALID;
+ LWLockInitialize(&WaitEventTraceCtl->lock,
+ LWTRANCHE_WAIT_EVENT_TRACE_DSA);
+ for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++)
+ {
+ WaitEventTraceSlot *s = &WaitEventTraceCtl->trace_slots[i];
+
+ pg_atomic_init_u64(&s->generation, 0);
+ pg_atomic_init_u32(&s->state, WAIT_EVENT_TRACE_SLOT_FREE);
+ s->pad = 0;
+ s->ring_ptr = InvalidDsaPointer;
+ }
+}
+
+const ShmemCallbacks WaitEventTraceControlShmemCallbacks = {
+ .request_fn = WaitEventTraceControlShmemRequest,
+ .init_fn = WaitEventTraceControlShmemInit,
+};
+
+/*
+ * Ensure the shared DSA for trace ring buffers exists and is attached.
+ * Creates it on first call (any backend), attaches on subsequent calls.
+ * Must be called from a backend context (not postmaster).
+ */
+static void
+wait_event_trace_ensure_dsa(void)
+{
+ MemoryContext oldcontext;
+
+ if (trace_dsa != NULL)
+ return;
+
+ oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+ LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+ if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID)
+ {
+ trace_dsa = dsa_create(LWTRANCHE_WAIT_EVENT_TRACE_DSA);
+ dsa_pin(trace_dsa);
+ dsa_pin_mapping(trace_dsa);
+ WaitEventTraceCtl->trace_dsa_handle = dsa_get_handle(trace_dsa);
+ }
+ else
+ {
+ trace_dsa = dsa_attach(WaitEventTraceCtl->trace_dsa_handle);
+ dsa_pin_mapping(trace_dsa);
+ }
+
+ LWLockRelease(&WaitEventTraceCtl->lock);
+
+ MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Transition our trace ring slot to ORPHANED on backend exit.
+ *
+ * Registered as a before_shmem_exit callback. Runs BEFORE
+ * dsm_backend_shutdown() detaches the DSA.
+ *
+ * Crucially, we do NOT free the ring here. The ring stays allocated in
+ * DSA so that cross-backend consumers -- the in-tree
+ * pg_get_wait_event_trace SRF and any extension following the
+ * snapshot pattern documented on WaitEventTraceControl -- can read
+ * the dying backend's final waits. The original "free at exit"
+ * design lost data the instant a worker terminated, which was
+ * particularly bad for parallel workers exiting in milliseconds at
+ * end-of-parallel-query. See the lifecycle comment on
+ * WaitEventTraceControl for the full design
+ * rationale and the bounded-memory cost we accept in exchange.
+ *
+ * The ORPHANED slot is reclaimed in one of two ways:
+ * (a) a new backend at this procNumber calls
+ * wait_event_trace_clear_orphan_at_init() at backend init, or
+ * (b) the DBA calls pg_stat_clear_orphaned_wait_event_rings().
+ *
+ * State transition order matters: bump generation BEFORE storing the
+ * new state, so cross-backend readers that snapshot
+ * (generation_before, state, ring_ptr, generation_after) under the
+ * lock see a consistent (state, ring_ptr) pair iff generation didn't
+ * change. We hold the lock for the whole transition, but readers do
+ * not have to (they just take it briefly to snapshot the ring
+ * contents); the generation check is what makes the unlocked-read
+ * path safe.
+ */
+static void
+wait_event_trace_before_shmem_exit(int code, Datum arg)
+{
+ int procNumber = DatumGetInt32(arg);
+ WaitEventTraceSlot *slot;
+
+ if (WaitEventTraceCtl == NULL)
+ return;
+
+ if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+ return;
+
+ slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+ /*
+ * If this backend never ended up with an OWNED slot (e.g. capture was off
+ * the whole session, or the trace was released back to FREE via
+ * assign_wait_event_capture going trace -> off), there is nothing to
+ * transition. Read state without the lock first as a fast-path check;
+ * the authoritative re-check happens under the lock below.
+ */
+ if (pg_atomic_read_u32(&slot->state) != WAIT_EVENT_TRACE_SLOT_OWNED)
+ {
+ wait_event_trace_writes_disabled = true;
+ my_wait_event_trace = NULL;
+ return;
+ }
+
+ /*
+ * Disable trace-ring writes on this backend before we touch the lock.
+ * Writes after this point would race with the OWNED -> ORPHANED state
+ * publish below: a wait event whose end-timing path runs after the state
+ * has been published as ORPHANED would write into a ring that the patch
+ * contract declares read-only post-mortem. Cross-backend readers
+ * snapshot ORPHANED rings without expecting concurrent writes from the
+ * dying owner. See wait_event_trace_writes_disabled for the full UAF /
+ * contract-violation analysis.
+ *
+ * The flag stays true for the remainder of this backend's life (we are in
+ * proc_exit; there is no subsequent capture re-enable to handle), so we
+ * do not reset it.
+ */
+ wait_event_trace_writes_disabled = true;
+
+ LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+ /*
+ * Drop the local pointer inside the lock-held region as a second line of
+ * defense; the writes-disabled flag above is the primary gate.
+ */
+ my_wait_event_trace = NULL;
+
+ if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_OWNED &&
+ DsaPointerIsValid(slot->ring_ptr))
+ {
+ /*
+ * Bump generation first so any reader that snapped the old generation
+ * will detect the change on its post-read recheck and discard its
+ * read. Then publish the ORPHANED state. Keep ring_ptr valid -- the
+ * data is what we want to preserve.
+ */
+ pg_atomic_fetch_add_u64(&slot->generation, 1);
+ pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_ORPHANED);
+ }
+
+ LWLockRelease(&WaitEventTraceCtl->lock);
+}
+
+/*
+ * Allocate (or re-acquire) a trace ring buffer for this backend via DSA.
+ * Called when wait_event_capture is set to 'trace'.
+ *
+ * Slot state at entry will be one of:
+ *
+ * FREE fresh slot (or one cleared on this backend's init by
+ * wait_event_trace_clear_orphan_at_init): allocate a new
+ * ring, transition slot to OWNED, bump generation.
+ *
+ * OWNED we already attached earlier in this same backend's life
+ * (e.g. user toggled capture trace->stats->trace; the
+ * stats step calls wait_event_trace_release_slot which
+ * transitions back to FREE, but our cached
+ * my_wait_event_trace was cleared on the way down -- so
+ * seeing OWNED here at attach time means a different
+ * backend somehow ended up with this procNumber, which
+ * cannot happen because procNumber is per-backend and a
+ * single backend can only run one attach at a time. We
+ * still tolerate this state defensively by re-mapping the
+ * existing ring rather than leaking a second allocation.
+ *
+ * ORPHANED can never be observed here: a new backend's
+ * pgstat_set_wait_event_timing_storage() called
+ * wait_event_trace_clear_orphan_at_init() before any
+ * wait-event capture path can run, so any prior orphan has
+ * already been demoted to FREE. Treated as a safety check
+ * (Assert in debug builds).
+ */
+void
+wait_event_trace_attach(int procNumber)
+{
+ /*
+ * Re-entrancy guard. dsa_create / dsa_allocate_extended below can emit
+ * wait events internally; those reach the lazy-attach hot path which
+ * calls back into this function while we still hold
+ * WaitEventTraceCtl->lock or are mid-allocation. See the
+ * function-local-static-bool pattern explainer on
+ * wait_event_timing_attach_array.
+ */
+ static bool in_attach = false;
+ static bool shmem_exit_registered = false;
+ WaitEventTraceSlot *slot;
+ dsa_pointer p;
+ WaitEventTraceState *ts;
+ uint32 state_now;
+
+ if (in_attach)
+ return;
+
+ if (WaitEventTraceCtl == NULL)
+ return;
+
+ if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+ return;
+
+ /*
+ * Skip the attach if we are inside a critical section. Below this point
+ * we call dsa_create / dsa_attach / dsa_allocate_extended, all of which
+ * can allocate memory via MemoryContextAlloc and Assert-fail on
+ * "CritSectionCount == 0 || allowInCritSection". The very-first wait
+ * event after wait_event_capture = trace can land inside a critical
+ * section (e.g. a parallel worker scanning a heap page hits
+ * BufferSetHintBits16 -> XLogSaveBufferForHint -> XLogInsert ->
+ * LWLockAcquire, with the XLogInsert critical section open).
+ *
+ * Skipping here silently drops the in-flight wait event (it is not
+ * traced) but keeps the backend alive. The next wait event outside any
+ * critical section will hit this function again and attach successfully.
+ * See the matching guard in pgstat_wait_event_timing_lazy_attach.
+ */
+ if (CritSectionCount > 0)
+ return;
+
+ /*
+ * Skip the attach if MyProc is already on an LWLock wait queue. We are
+ * called from the wait-event hot path which fires AFTER LWLockQueueSelf
+ * has set MyProc->lwWaiting; a nested LWLockAcquire on our internal lock
+ * (via wait_event_trace_ensure_dsa) would PANIC at lwlock.c:1029. See the
+ * matching guard in pgstat_wait_event_timing_lazy_ attach for the full
+ * rationale.
+ */
+ if (MyProc != NULL && MyProc->lwWaiting != LW_WS_NOT_WAITING)
+ return;
+
+ slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+ in_attach = true;
+ PG_TRY();
+ {
+ state_now = pg_atomic_read_u32(&slot->state);
+
+ /*
+ * ORPHANED is normally impossible at attach time --
+ * pgstat_set_wait_event_timing_storage() at backend init calls
+ * wait_event_trace_clear_orphan_at_init() which demotes any inherited
+ * orphan to FREE. But there is one case where this backend can
+ * legitimately observe its own slot in the ORPHANED state: after we
+ * have already run wait_event_trace_before_shmem_exit()
+ * (transitioning the slot to ORPHANED on exit), a later
+ * before_shmem_exit callback (e.g. pgstat_io_flush_cb during
+ * proc_exit shutdown) can contend on an LWLock that emits a wait
+ * event, which calls pgstat_report_wait_end_timing() ->
+ * wait_event_trace_attach() after my_wait_event_trace has been
+ * cleared. We must not re-attach in that case: we are dying, the
+ * ring is now post-mortem data for cross-backend readers, and the
+ * writer invariant must hold. Skip the trace for any wait events
+ * emitted after our own exit transition.
+ */
+ if (state_now == WAIT_EVENT_TRACE_SLOT_ORPHANED)
+ {
+ /* PG_FINALLY below clears in_attach. */
+ }
+ else if (state_now == WAIT_EVENT_TRACE_SLOT_OWNED &&
+ DsaPointerIsValid(slot->ring_ptr))
+ {
+ /* Already have a ring buffer; re-map to it. */
+ wait_event_trace_ensure_dsa();
+ my_wait_event_trace = dsa_get_address(trace_dsa, slot->ring_ptr);
+ my_trace_proc_number = procNumber;
+ }
+ else
+ {
+ Size alloc_size;
+
+ wait_event_trace_ensure_dsa();
+
+ /*
+ * Cache the cluster-wide ring size on first allocation in this
+ * backend. wait_event_trace_ring_size is PGC_POSTMASTER, so by
+ * the time any backend reaches here, its boot value has been
+ * committed by the GUC framework. All rings in the postmaster
+ * run share the same dimensions.
+ */
+ if (WaitEventTraceRingSize == 0)
+ WaitEventTraceRingSize =
+ (uint32) wait_event_trace_ring_size * 1024U /
+ (uint32) sizeof(WaitEventTraceRecord);
+
+ alloc_size = offsetof(WaitEventTraceState, records) +
+ (Size) WaitEventTraceRingSize * sizeof(WaitEventTraceRecord);
+
+ p = dsa_allocate_extended(trace_dsa, alloc_size, DSA_ALLOC_ZERO);
+ ts = dsa_get_address(trace_dsa, p);
+ pg_atomic_init_u64(&ts->write_pos, 0);
+ ts->ring_mask = WaitEventTraceRingSize - 1;
+
+ LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+ /*
+ * Publish ring_ptr BEFORE transitioning state to OWNED.
+ * Cross-backend readers that observe state==OWNED outside the
+ * lock then see a valid ring_ptr. Bump generation last so any
+ * reader that snapped the prior generation will detect the
+ * change.
+ */
+ slot->ring_ptr = p;
+ pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_OWNED);
+ pg_atomic_fetch_add_u64(&slot->generation, 1);
+ LWLockRelease(&WaitEventTraceCtl->lock);
+
+ my_wait_event_trace = ts;
+ my_trace_proc_number = procNumber;
+
+ /*
+ * Register cleanup to run BEFORE dsm_backend_shutdown() detaches
+ * the DSA. The before_shmem_exit callbacks run in LIFO order
+ * before DSM detach, so the ORPHANED transition (which does not
+ * actually free the ring) is safe at that point.
+ *
+ * Guarded by shmem_exit_registered because under the
+ * release-on-disable policy (see wait_event_trace_release_slot
+ * and assign_wait_event_capture) the allocate branch can run
+ * multiple times per backend lifetime -- once per off/stats ->
+ * trace re-enable cycle. The cleanup itself is idempotent (it
+ * short-circuits when state is not OWNED), so it is safe to
+ * invoke after a release-then-reattach cycle, but we still avoid
+ * growing the before_shmem_exit list.
+ */
+ if (!shmem_exit_registered)
+ {
+ before_shmem_exit(wait_event_trace_before_shmem_exit,
+ Int32GetDatum(procNumber));
+ shmem_exit_registered = true;
+ }
+ }
+ }
+ PG_FINALLY();
+ {
+ in_attach = false;
+ }
+ PG_END_TRY();
+}
+
+/*
+ * Free trace ring buffer for this backend on exit.
+ */
+static void
+wait_event_trace_detach(int procNumber)
+{
+ /*
+ * Only clear local pointers here. The actual DSA free happens in
+ * wait_event_trace_before_shmem_exit(), which runs before
+ * dsm_backend_shutdown() detaches the DSA segments.
+ */
+ my_wait_event_trace = NULL;
+ my_trace_proc_number = -1;
+}
+
+/*
+ * Release this backend's trace ring buffer back to DSA immediately.
+ *
+ * Called from assign_wait_event_capture when the user steps down from
+ * TRACE to STATS or OFF. Without this, a ~4 MB ring allocated by a
+ * brief investigation would remain pinned for the rest of the session's
+ * lifetime, which can leak gigabytes across large connection pools.
+ *
+ * Important contrast with wait_event_trace_before_shmem_exit: backend
+ * exit transitions the slot to ORPHANED (preserving data for
+ * cross-backend readers); release_slot fully frees and returns to FREE
+ * because the operator has explicitly disabled trace -- they have
+ * affirmatively decided not to keep the data, so we honour that and
+ * reclaim the memory immediately. Subsequent re-enable allocates a
+ * fresh ring via wait_event_trace_attach's allocate branch.
+ *
+ * The operation is LWLock-safe and does not raise -- dsa_free is pure
+ * bookkeeping on the DSA freelist, no allocation and no ereport paths.
+ * Safe to call from a GUC assign hook.
+ *
+ * If pg_get_backend_wait_event_trace is currently iterating our own ring
+ * (wait_event_trace_srf_in_progress), we must NOT free the chunk out
+ * from under it: that would be a use-after-free on the records[] the SRF
+ * is still reading. Set wait_event_trace_release_pending instead and
+ * return; the SRF's PG_FINALLY block will perform the deferred free
+ * after iteration completes. In practice this branch is unreachable in
+ * current PG (assign hooks fire only at command boundaries and the SRF
+ * is a single command), but it makes the invariant explicit and the
+ * future-proofing free.
+ */
+static void
+wait_event_trace_release_slot(int procNumber)
+{
+ /*
+ * Re-entrancy guard. dsa_free takes a DSA-internal LWLock which can in
+ * principle emit a wait event; if a nested assign hook re-enters we must
+ * not recurse. See the function-local-static-bool pattern explainer on
+ * wait_event_timing_attach_array.
+ */
+ static bool in_release = false;
+ WaitEventTraceSlot *slot;
+
+ if (in_release)
+ return;
+
+ if (WaitEventTraceCtl == NULL || trace_dsa == NULL)
+ return;
+
+ /*
+ * Same-backend SRF is iterating our own ring. Defer the free until the
+ * SRF's PG_FINALLY runs.
+ */
+ if (wait_event_trace_srf_in_progress)
+ {
+ wait_event_trace_release_pending = true;
+ return;
+ }
+
+ if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+ return;
+
+ slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+ in_release = true;
+
+ /*
+ * Disable trace-ring writes on this backend before we touch the lock or
+ * call dsa_free. An internal LWLock inside dsa_free can dispatch a wait
+ * event whose end-timing path would otherwise see capture_level == TRACE
+ * (the GUC assign hook is in flight; the variable has not been committed
+ * by the framework yet) and write into the very chunk we are returning to
+ * the DSA freelist. See the comment on wait_event_trace_writes_disabled
+ * for the full UAF analysis.
+ */
+ wait_event_trace_writes_disabled = true;
+
+ PG_TRY();
+ {
+ LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+ /*
+ * Drop the local pointer BEFORE the dsa_free as a second line of
+ * defense (the writes-disabled flag above is the primary gate). Any
+ * wait event whose hot path slips past the gate check via a compiler
+ * or memory-ordering surprise would at least see my_wait_event_trace
+ * == NULL and skip the write.
+ */
+ my_wait_event_trace = NULL;
+
+ if (DsaPointerIsValid(slot->ring_ptr))
+ {
+ /*
+ * Bump generation first to invalidate any concurrent
+ * cross-backend snapshot, then free, then publish the FREE state
+ * with a NULL ring_ptr. Order matters for unlocked readers that
+ * have already passed the state check.
+ */
+ pg_atomic_fetch_add_u64(&slot->generation, 1);
+ dsa_free(trace_dsa, slot->ring_ptr);
+ slot->ring_ptr = InvalidDsaPointer;
+ pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+ }
+ LWLockRelease(&WaitEventTraceCtl->lock);
+ }
+ PG_FINALLY();
+ {
+ wait_event_trace_writes_disabled = false;
+ in_release = false;
+ }
+ PG_END_TRY();
+}
+
+/*
+ * Clear an orphaned trace ring at backend init time.
+ *
+ * Called from pgstat_set_wait_event_timing_storage() once the new
+ * backend has its procNumber. If the slot we're inheriting was left
+ * ORPHANED by a previous backend (because we deliberately do not free
+ * trace rings on backend exit -- see the lifecycle discussion on
+ * WaitEventTraceControl), free the ring now so the new backend starts
+ * with a clean FREE slot. Subsequent wait_event_trace_attach() calls
+ * (when this backend itself enables trace) will then take the
+ * allocate branch.
+ *
+ * No-op when the slot is already FREE or OWNED: FREE means there's
+ * nothing to clear; OWNED is impossible at backend init (only a
+ * not-yet-exited backend can leave a slot OWNED, and procNumbers are
+ * assigned exclusively). We assert OWNED is not observed in debug
+ * builds and conservatively skip the free in production.
+ *
+ * Robustness: this runs during InitProcess() (before the backend can
+ * accept any work), and the work it performs -- dsa_attach() and
+ * dsa_free() -- can raise ERROR on rare runtime failures (corrupted
+ * DSA segment headers, descriptor exhaustion, mmap ENOMEM, etc.).
+ * An uncaught ERROR here would propagate out of InitProcess() and
+ * abort backend startup entirely, even for sessions that never
+ * intended to use wait_event_capture. To prevent the trace
+ * feature's housekeeping from gating connection establishment, the
+ * body is wrapped in PG_TRY()/PG_CATCH(): any error from dsa_attach
+ * or dsa_free is captured, downgraded to a WARNING with a hint
+ * pointing at the admin sweep function, and execution continues.
+ * The orphan stays in place; it can be reclaimed by the next
+ * backend that inherits the same procNumber (if the underlying
+ * problem was transient), by pg_stat_clear_orphaned_wait_event_rings(),
+ * or at next cluster restart.
+ */
+static void
+wait_event_trace_clear_orphan_at_init(int procNumber)
+{
+ WaitEventTraceSlot *slot;
+ uint32 state_now;
+ MemoryContext caller_cxt;
+
+ if (WaitEventTraceCtl == NULL)
+ return;
+
+ if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+ return;
+
+ slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+ state_now = pg_atomic_read_u32(&slot->state);
+ if (state_now != WAIT_EVENT_TRACE_SLOT_ORPHANED)
+ {
+ Assert(state_now != WAIT_EVENT_TRACE_SLOT_OWNED);
+ return;
+ }
+
+ /*
+ * Save CurrentMemoryContext so the PG_CATCH path can copy the error data
+ * into a context that survives FlushErrorState(). FlushErrorState() calls
+ * MemoryContextReset(ErrorContext), so CopyErrorData() must run in a
+ * different context or the returned ErrorData becomes a dangling pointer.
+ */
+ caller_cxt = CurrentMemoryContext;
+
+ PG_TRY();
+ {
+ /*
+ * The trace DSA is shared across the cluster. We must attach to it
+ * before calling dsa_free (which needs the dsa_area pointer). The
+ * DSA was created by some earlier backend that wrote a trace record
+ * (otherwise the slot couldn't have ended up ORPHANED), so the handle
+ * in WaitEventTraceCtl is valid; ensure_dsa() will attach. Both
+ * ensure_dsa() and dsa_free() can raise ERROR; the PG_CATCH below
+ * downgrades any such error to a WARNING so backend startup is not
+ * blocked.
+ */
+ wait_event_trace_ensure_dsa();
+
+ LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+ if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_ORPHANED &&
+ DsaPointerIsValid(slot->ring_ptr))
+ {
+ pg_atomic_fetch_add_u64(&slot->generation, 1);
+ dsa_free(trace_dsa, slot->ring_ptr);
+ slot->ring_ptr = InvalidDsaPointer;
+ pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+ }
+ LWLockRelease(&WaitEventTraceCtl->lock);
+ }
+ PG_CATCH();
+ {
+ ErrorData *edata;
+
+ /*
+ * Release any LWLocks we (or anything we called) might still hold.
+ * Two paths can leave WaitEventTraceCtl->lock held when control
+ * reaches here:
+ *
+ * 1. The outer LWLockAcquire above succeeded and dsa_free raised
+ * before we reached LWLockRelease. 2. wait_event_trace_ensure_dsa()
+ * raised inside its own LWLockAcquire/dsa_attach/LWLockRelease
+ * region.
+ *
+ * We are running during InitProcess(), BEFORE any transaction or
+ * PostgresMain sigsetjmp has been set up, so PG's standard
+ * "AbortTransaction -> LWLockReleaseAll" cleanup does NOT fire on the
+ * longjmp into PG_CATCH. Without an explicit release here the lock
+ * would stay held for the lifetime of this backend, blocking every
+ * future LW_EXCLUSIVE acquirer (the orphan-clear sweep, release_slot,
+ * before_shmem_exit transitions, and subsequent backends'
+ * clear_orphan_at_init). That would be strictly worse than the
+ * original failure-startup behavior this commit set out to fix.
+ *
+ * LWLockReleaseAll() is the idiomatic catch-path lock cleanup used by
+ * the standalone aux-process error handlers (walwriter.c,
+ * checkpointer.c, pgarch.c). It is safe to call broadly here because
+ * pgstat_set_wait_ event_timing_storage runs at a fixed point in
+ * InitProcess where the caller frame holds no other LWLocks across
+ * our return: the earlier InitProcess steps that touch LWLocks
+ * (ProcArrayAdd, etc.) release them before returning, and the
+ * subsequent steps that acquire LWLocks have not yet run.
+ */
+ LWLockReleaseAll();
+
+ /*
+ * Switch BACK to the caller's context before CopyErrorData so that
+ * edata is allocated in a context that survives FlushErrorState().
+ * FlushErrorState() calls MemoryContextReset(ErrorContext);
+ * allocating edata in ErrorContext (the default at PG_CATCH entry on
+ * the error path) would make it a dangling pointer the moment we
+ * flush. See the matching pattern in spi.c PG_CATCH branches.
+ */
+ MemoryContextSwitchTo(caller_cxt);
+ edata = CopyErrorData();
+ FlushErrorState();
+
+ ereport(WARNING,
+ (errcode(edata->sqlerrcode),
+ errmsg("could not clear orphaned wait-event trace ring "
+ "at backend init: %s", edata->message),
+ errdetail("Backend startup proceeds with the orphan "
+ "still allocated for procnumber %d.",
+ procNumber),
+ errhint("Run pg_stat_clear_orphaned_wait_event_rings() "
+ "to release the orphan when the underlying "
+ "condition is resolved.")));
+
+ FreeErrorData(edata);
+ }
+ PG_END_TRY();
+}
+
+/*
+ * Out-of-line body for pgstat_report_wait_start()'s timing path. Records
+ * the start timestamp and the event being waited on. Reached only when
+ * wait_event_capture != OFF.
+ */
+void
+pgstat_report_wait_start_timing(uint32 wait_event_info)
+{
+ /*
+ * Stay out of the timing path once proc_exit has begun tearing down DSA
+ * mappings (see the before_shmem_exit callback).
+ */
+ if (wait_event_timing_writes_disabled)
+ return;
+
+ if (my_wait_event_timing == NULL)
+ {
+ pgstat_wait_event_timing_lazy_attach();
+
+ /*
+ * lazy_attach can dispatch nested wait events while it sets up DSA
+ * (dsa_attach takes an internal LWLock); those nested wait_end calls
+ * clear my_wait_event_info to 0. Re-publish so the outer wait stays
+ * visible in pg_stat_activity. Only needed on the first-attach path.
+ */
+ *(volatile uint32 *) my_wait_event_info = wait_event_info;
+ }
+
+ if (my_wait_event_timing != NULL)
+ {
+ INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start);
+ my_wait_event_timing->current_event = wait_event_info;
+ }
+}
+
+/*
+ * Out-of-line body for pgstat_report_wait_end()'s timing path. Computes
+ * the wait duration, accumulates per-event statistics, and at the trace
+ * level pushes the completed wait into the per-session ring.
+ *
+ * capture_level is the value of wait_event_capture observed at the inline
+ * gate. Passing it through (rather than re-loading the global here) avoids
+ * a redundant load on the trace branch below -- the function-call boundary
+ * defeats CSE -- and means a concurrent GUC change cannot half-apply to
+ * this call: we run entirely at the gate's view of the level.
+ */
+void
+pgstat_report_wait_end_timing(int capture_level)
+{
+ uint32 event;
+ uint32 cur_reset_gen;
+
+ if (wait_event_timing_writes_disabled)
+ return;
+
+ if (my_wait_event_timing == NULL)
+ {
+ pgstat_wait_event_timing_lazy_attach();
+ if (my_wait_event_timing == NULL)
+ return;
+ }
+
+ event = my_wait_event_timing->current_event;
+
+ /*
+ * Service a pending cross-backend reset request. A single relaxed atomic
+ * load; when the shared generation has advanced past the value we last
+ * acted on, clear our own counters on behalf of the requester and record
+ * the reset. wait_start is left untouched so the in-flight measurement
+ * still lands (in the freshly-zeroed counters), and current_event is
+ * zeroed so external readers do not see stale state.
+ */
+ cur_reset_gen = pg_atomic_read_u32(&my_wait_event_timing->reset_generation);
+ if (cur_reset_gen != my_last_reset_generation)
+ {
+ memset(my_wait_event_timing->events, 0,
+ sizeof(my_wait_event_timing->events));
+ lwlock_timing_hash_clear(my_wait_event_timing);
+ my_wait_event_timing->reset_count++;
+ my_wait_event_timing->lwlock_overflow_count = 0;
+ my_wait_event_timing->flat_overflow_count = 0;
+ my_wait_event_timing->current_event = 0;
+ my_last_reset_generation = cur_reset_gen;
+ }
+
+ if (event != 0 && !INSTR_TIME_IS_ZERO(my_wait_event_timing->wait_start))
+ {
+ instr_time now;
+ int64 duration_ns;
+ int idx;
+ WaitEventTimingEntry *entry = NULL;
+ bool warn_lwlock_overflow = false;
+ bool warn_flat_overflow = false;
+
+ INSTR_TIME_SET_CURRENT(now);
+ duration_ns = INSTR_TIME_GET_NANOSEC(now) -
+ INSTR_TIME_GET_NANOSEC(my_wait_event_timing->wait_start);
+
+ if (duration_ns < 0)
+ duration_ns = 0;
+
+ idx = wait_event_timing_index(event);
+
+ /*
+ * Single-writer hot path: each slot has exactly one writer (the
+ * owning backend), and the SRF reader is lock-free, so no locking is
+ * needed here. Overflow WARNINGs are deferred to after the
+ * bookkeeping so ereport() cannot recurse through a wait event while
+ * counters are in an intermediate state.
+ */
+ if (idx == WAIT_EVENT_TIMING_IDX_LWLOCK)
+ entry = lwlock_timing_lookup(my_wait_event_timing, event & 0xFFFF);
+ else if (idx >= 0)
+ entry = &my_wait_event_timing->events[idx];
+
+ if (entry != NULL)
+ {
+ entry->count++;
+ entry->total_ns += duration_ns;
+ if (duration_ns > entry->max_ns)
+ entry->max_ns = duration_ns;
+ entry->histogram[wait_event_timing_bucket(duration_ns)]++;
+ }
+ else if (idx == WAIT_EVENT_TIMING_IDX_LWLOCK)
+ {
+ if (my_wait_event_timing->lwlock_overflow_count++ == 0)
+ warn_lwlock_overflow = true;
+ }
+ else if (idx == -1)
+ {
+ if (my_wait_event_timing->flat_overflow_count++ == 0)
+ warn_flat_overflow = true;
+ }
+
+ if (warn_lwlock_overflow)
+ ereport(WARNING,
+ (errmsg("wait event timing data for some LWLock tranches will be lost"),
+ errdetail("This backend has used more than %d distinct LWLock tranches.",
+ wait_event_timing_max_entries),
+ errhint("Consider raising \"wait_event_timing_max_tranches\".")));
+ else if (warn_flat_overflow)
+ ereport(WARNING,
+ (errmsg("some wait events will not be timed"),
+ errdetail("A wait event was reported with a class unknown to the timing infrastructure.")));
+
+ /* Trace level: push the completed wait into the per-session ring. */
+ if (capture_level == WAIT_EVENT_CAPTURE_TRACE &&
+ !wait_event_trace_writes_disabled &&
+ !wait_event_trace_in_write)
+ {
+ /*
+ * Lazy-attach the ring on first use here (not in the assign hook,
+ * which must not ereport on OOM). The writes-disabled gate also
+ * blocks this re-attach during slot-state transitions
+ * (release_slot / before_shmem_exit): without it a nested wait
+ * event mid-transition could recurse into a fresh attach that
+ * deadlocks on the lock the transition already holds.
+ */
+ if (my_wait_event_trace == NULL && my_trace_proc_number >= 0)
+ wait_event_trace_attach(my_trace_proc_number);
+
+ if (my_wait_event_trace != NULL)
+ {
+ /*
+ * Single-writer claim: a plain read+write avoids the LOCK
+ * XADD that pg_atomic_fetch_add_u64 would emit on every wait
+ * event. Cross-backend readers use pg_atomic_read_u64 and
+ * rely on the per-record seqlock below for safety.
+ */
+ uint64 pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos);
+ WaitEventTraceRecord *rec;
+ uint32 seq;
+
+ /*
+ * Wait events emitted while a trace record is being written
+ * must not themselves be ring-recorded (the gate above): the
+ * record write is plain stores and cannot wait, but the
+ * injection point below can, and the wait machinery it runs
+ * (DSM registry lookup, condition-variable sleep) emits wait
+ * events whose wait_end would recurse into this block --
+ * re-running the injection point and self-deadlocking on
+ * locks the outer invocation still holds. Costs one
+ * process-local store each way; nested waits still
+ * accumulate in the stats above.
+ */
+ wait_event_trace_in_write = true;
+
+ pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1);
+
+ /*
+ * Injection point for the regression test of the
+ * position-encoded identity seqlock: stalling here widens the
+ * window between the write_pos store and the rec->seq store,
+ * simulating weak-memory visibility that would otherwise be
+ * unreachable on x86. Compiled out unless
+ * --enable-injection-points.
+ */
+ INJECTION_POINT("wait-event-trace-after-write-pos", NULL);
+
+ rec = &my_wait_event_trace->records[pos & my_wait_event_trace->ring_mask];
+ seq = (uint32) (pos * 2 + 1);
+
+ rec->seq = seq;
+ pg_write_barrier(); /* payload stores must not rise above
+ * seq=odd */
+
+ rec->record_type = TRACE_WAIT_EVENT;
+ rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now);
+ rec->data.wait.event = event;
+ rec->data.wait.pad2 = 0;
+ rec->data.wait.duration_ns = duration_ns;
+
+ pg_write_barrier(); /* payload stores must land before
+ * seq=even */
+ rec->seq = seq + 1;
+
+ wait_event_trace_in_write = false;
+ }
+ }
+
+ INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start);
+ }
+}
+
+/*
+ * Resolve the optional pid SRF argument to a procNumber range
+ * [out_start, out_end). Returns false if the SRF should emit zero rows
+ * (unknown pid -- silent no-op).
+ */
+static bool
+wait_event_timing_pid_range(FunctionCallInfo fcinfo,
+ int *out_start, int *out_end)
+{
+ if (PG_ARGISNULL(0))
+ {
+ *out_start = 0;
+ *out_end = NUM_WAIT_EVENT_TIMING_SLOTS;
+ return true;
+ }
+ else
+ {
+ int target_pid = PG_GETARG_INT32(0);
+ PGPROC *proc;
+ int procNumber;
+
+ proc = BackendPidGetProc(target_pid);
+ if (proc == NULL)
+ proc = AuxiliaryPidGetProc(target_pid);
+ if (proc == NULL)
+ return false;
+
+ procNumber = GetNumberFromPGProc(proc);
+ if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+ return false;
+
+ *out_start = procNumber;
+ *out_end = procNumber + 1;
+ return true;
+ }
+}
+
+/* Emit one SRF row for a populated timing entry. */
+static void
+wait_event_timing_emit_row(ReturnSetInfo *rsinfo, PgBackendStatus *beentry,
+ int procnumber, uint32 wait_event_info,
+ WaitEventTimingEntry *entry,
+ ArrayType *hist_array, int64 *hist_payload)
+{
+ Datum values[10];
+ bool nulls[10];
+ const char *event_type;
+ const char *event_name;
+ int bucket;
+
+ event_type = pgstat_get_wait_event_type(wait_event_info);
+ event_name = pgstat_get_wait_event(wait_event_info);
+ if (event_type == NULL || event_name == NULL)
+ return;
+
+ memset(nulls, 0, sizeof(nulls));
+
+ values[0] = Int32GetDatum(beentry->st_procpid);
+ values[1] = CStringGetTextDatum(GetBackendTypeDesc(beentry->st_backendType));
+ values[2] = Int32GetDatum(procnumber);
+ values[3] = CStringGetTextDatum(event_type);
+ values[4] = CStringGetTextDatum(event_name);
+ values[5] = Int64GetDatum(entry->count);
+ values[6] = Float8GetDatum((double) entry->total_ns / 1000000.0);
+ values[7] = Float8GetDatum(entry->count > 0
+ ? (double) entry->total_ns / entry->count / 1000.0
+ : 0.0);
+ values[8] = Float8GetDatum((double) entry->max_ns / 1000.0);
+
+ for (bucket = 0; bucket < WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS; bucket++)
+ hist_payload[bucket] = entry->histogram[bucket];
+ values[9] = PointerGetDatum(hist_array);
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+}
+
+/*
+ * SQL function: pg_stat_get_wait_event_timing(pid int4, OUT ...)
+ *
+ * Returns one row per (backend, wait_event) with a non-zero count. pid is
+ * optional: NULL means all backends; a non-NULL value restricts the sweep
+ * to that backend (silently empty for unknown pids).
+ */
+Datum
+pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ int start_idx;
+ int end_idx;
+ int backend_idx;
+ ArrayType *hist_array;
+ int64 *hist_payload;
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ if (!wait_event_timing_attach_array(false))
+ PG_RETURN_VOID();
+
+ if (!wait_event_timing_pid_range(fcinfo, &start_idx, &end_idx))
+ PG_RETURN_VOID();
+
+ /*
+ * Allocate the histogram ArrayType once and reuse it across every row;
+ * tuplestore_putvalues flattens the varlena into its stored tuple, so
+ * rewriting the payload cannot corrupt previously emitted rows.
+ */
+ {
+ Datum zero_elems[WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS];
+
+ memset(zero_elems, 0, sizeof(zero_elems));
+ hist_array = construct_array_builtin(zero_elems,
+ WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS,
+ INT8OID);
+ hist_payload = (int64 *) ARR_DATA_PTR(hist_array);
+ }
+
+ for (backend_idx = start_idx; backend_idx < end_idx; backend_idx++)
+ {
+ WaitEventTimingState *state = wet_slot(backend_idx);
+ PgBackendStatus *beentry;
+ int i;
+
+ /* Skip dead backend slots and enforce stats permissions. */
+ beentry = pgstat_get_beentry_by_proc_number(backend_idx);
+ if (beentry == NULL)
+ continue;
+ if (!HAS_PGSTAT_PERMISSIONS(beentry->st_userid))
+ continue;
+
+ /* Flat array rows (all classes except LWLock). */
+ for (i = 0; i < WAIT_EVENT_TIMING_DENSE_CLASSES; i++)
+ {
+ int base = wait_event_class_offset[i];
+ int nevents = wait_event_class_nevents[i];
+ uint32 classId = wait_event_dense_to_classid[i];
+ int j;
+
+ for (j = 0; j < nevents; j++)
+ {
+ WaitEventTimingEntry *entry = &state->events[base + j];
+
+ if (entry->count == 0)
+ continue;
+
+ wait_event_timing_emit_row(rsinfo, beentry, backend_idx,
+ ((uint32) classId << 24) | j,
+ entry, hist_array, hist_payload);
+ }
+ }
+
+ /* LWLock hash rows. */
+ {
+ LWLockTimingHashEntry *entries = wet_lwlock_hash_entries(state);
+ WaitEventTimingEntry *events = wet_lwlock_hash_events(state);
+ int hash_size = state->lwlock_hash.hash_size;
+
+ for (i = 0; i < hash_size; i++)
+ {
+ LWLockTimingHashEntry *he = &entries[i];
+ WaitEventTimingEntry *entry;
+
+ if (he->tranche_id == LWLOCK_TIMING_EMPTY_SLOT)
+ continue;
+
+ entry = &events[he->dense_idx];
+ if (entry->count == 0)
+ continue;
+
+ wait_event_timing_emit_row(rsinfo, beentry, backend_idx,
+ PG_WAIT_LWLOCK | he->tranche_id,
+ entry, hist_array, hist_payload);
+ }
+ }
+ }
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * SQL function: pg_stat_get_wait_event_timing_overflow(pid int4, OUT ...)
+ *
+ * Exposes the per-backend truncation counters that the recording path
+ * maintains: lwlock_overflow_count (LWLock waits dropped because the
+ * per-backend tranche hash was full), flat_overflow_count (events whose
+ * class index was out of range), and reset_count (resets the backend has
+ * observed and acted on). pid has the same optional semantics as
+ * pg_stat_get_wait_event_timing().
+ */
+Datum
+pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ int start_idx;
+ int end_idx;
+ int backend_idx;
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ if (!wait_event_timing_attach_array(false))
+ PG_RETURN_VOID();
+
+ if (!wait_event_timing_pid_range(fcinfo, &start_idx, &end_idx))
+ PG_RETURN_VOID();
+
+ for (backend_idx = start_idx; backend_idx < end_idx; backend_idx++)
+ {
+ WaitEventTimingState *state = wet_slot(backend_idx);
+ PgBackendStatus *beentry;
+ Datum values[6];
+ bool nulls[6];
+
+ beentry = pgstat_get_beentry_by_proc_number(backend_idx);
+ if (beentry == NULL)
+ continue;
+ if (!HAS_PGSTAT_PERMISSIONS(beentry->st_userid))
+ continue;
+
+ memset(nulls, 0, sizeof(nulls));
+
+ values[0] = Int32GetDatum(beentry->st_procpid);
+ values[1] = CStringGetTextDatum(GetBackendTypeDesc(beentry->st_backendType));
+ values[2] = Int32GetDatum(backend_idx);
+ values[3] = Int64GetDatum(state->lwlock_overflow_count);
+ values[4] = Int64GetDatum(state->flat_overflow_count);
+ values[5] = Int64GetDatum(state->reset_count);
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+ }
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * Request a cross-backend self-reset on the given slot: bump the slot's
+ * reset_generation and wake the target so it promptly observes the change
+ * and clears its own counters at its next wait_end. Lock-free: only the
+ * owning backend ever writes its statistics.
+ */
+static void
+wait_event_timing_request_reset(int slot_idx)
+{
+ Assert(slot_idx >= 0 && slot_idx < NUM_WAIT_EVENT_TIMING_SLOTS);
+
+ if (!wait_event_timing_attach_array(false))
+ return;
+
+ pg_atomic_fetch_add_u32(&wet_slot(slot_idx)->reset_generation, 1);
+
+ /*
+ * The slot index is also the PGPROC array index. Waking the target
+ * shortens the time before it completes its current wait and notices the
+ * request; setting a latch on a slot with no live owner is harmless.
+ */
+ if (ProcGlobal != NULL && ProcGlobal->allProcs != NULL)
+ SetLatch(&ProcGlobal->allProcs[slot_idx].procLatch);
+}
+
+/*
+ * SQL function: pg_stat_reset_wait_event_timing(pid int4)
+ *
+ * NULL or own pid : reset the caller's own counters synchronously.
+ * another pid : request a cross-backend reset (pg_signal_backend).
+ * unknown pid : silent no-op.
+ *
+ * Cross-backend resets are asynchronous: the target clears its counters at
+ * its next wait_end. Callers needing read-after-reset semantics should
+ * target their own backend, or poll reset_count in
+ * pg_stat_wait_event_timing_overflow until it increments.
+ */
+Datum
+pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS)
+{
+ int target_pid;
+ PGPROC *proc;
+ int procNumber;
+
+ if (PG_ARGISNULL(0) || PG_GETARG_INT32(0) == MyProcPid)
+ {
+ /*
+ * Own backend: synchronous, no lock needed (single writer).
+ * wait_start is already zero (every wait_end zeroes it and we cannot
+ * be mid-wait while running this function), so there is no in-flight
+ * measurement to preserve.
+ */
+ if (my_wait_event_timing != NULL)
+ {
+ memset(my_wait_event_timing->events, 0,
+ sizeof(my_wait_event_timing->events));
+ lwlock_timing_hash_clear(my_wait_event_timing);
+ my_wait_event_timing->reset_count++;
+ my_wait_event_timing->lwlock_overflow_count = 0;
+ my_wait_event_timing->flat_overflow_count = 0;
+ my_wait_event_timing->current_event = 0;
+ }
+ PG_RETURN_VOID();
+ }
+
+ /*
+ * Cross-backend reset requires pg_signal_backend, matching
+ * pg_stat_reset_backend_stats(pid): anyone who can terminate the target
+ * backend can already destroy more forensic state than a counter wipe.
+ */
+ if (!has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND))
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied to reset another backend's wait event timing"),
+ errdetail("Only roles with privileges of the \"pg_signal_backend\" role may reset another backend's wait event timing.")));
+
+ target_pid = PG_GETARG_INT32(0);
+
+ proc = BackendPidGetProc(target_pid);
+ if (proc == NULL)
+ proc = AuxiliaryPidGetProc(target_pid);
+ if (proc == NULL)
+ PG_RETURN_VOID(); /* unknown/dead pid: silent no-op */
+
+ procNumber = GetNumberFromPGProc(proc);
+ if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+ PG_RETURN_VOID();
+
+ wait_event_timing_request_reset(procNumber);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * SQL function: pg_stat_reset_wait_event_timing_all()
+ *
+ * Request a reset on every backend. Execution is revoked from PUBLIC by
+ * default (the blast radius is the whole cluster, a different decision
+ * from the per-backend variant); administrators can delegate with GRANT.
+ */
+Datum
+pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS)
+{
+ int i;
+
+ /*
+ * Execution is revoked from PUBLIC in system_views.sql; administrators
+ * can delegate with GRANT EXECUTE.
+ */
+ if (!wait_event_timing_attach_array(false))
+ PG_RETURN_VOID();
+
+ for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++)
+ wait_event_timing_request_reset(i);
+
+ PG_RETURN_VOID();
+}
+
+/* ================= Trace-level readers ================= */
+
+/*
+ * SQL function: pg_get_backend_wait_event_trace()
+ *
+ * Returns trace records from the current backend's own ring buffer. This
+ * function is deliberately session-local; cross-backend reading goes
+ * through pg_get_wait_event_trace(procnumber) below, which implements the
+ * snapshot-under-lock protocol documented on WaitEventTraceControl in
+ * wait_event_timing.h. The name mirrors pg_get_backend_memory_contexts()
+ * to make the session-local scope explicit at the API level.
+ *
+ * Same-backend coordination with wait_event_trace_release_slot uses the
+ * wait_event_trace_srf_in_progress / _release_pending flags rather than
+ * an LWLock: same-backend serialization is implicit, so a per-backend
+ * bool plus a deferred-free path is sufficient and avoids any of the
+ * cross-backend lock-hold latency that the cross-backend reader pattern
+ * has to manage. PG_TRY/PG_FINALLY guarantees the flag is cleared and
+ * any deferred dsa_free is performed even on ereport(ERROR).
+ *
+ * Uses InitMaterializedSRF (materialize-all). The ring holds up to
+ * WaitEventTraceRingSize records (set at server start from the
+ * wait_event_trace_ring_size GUC; default 4 MB = 131072 records);
+ * full materialization caps the per-call cost at the ring size of
+ * tuplestore memory, which is acceptable for the use case this SRF
+ * is designed for: interactive own-session diagnostics from psql.
+ *
+ * This SRF is NOT the path for cross-backend monitoring tools: it is
+ * hard-coded to the calling backend's own ring, so a bgworker selecting
+ * from pg_backend_wait_event_trace via SPI would see only its own
+ * (typically empty) ring. Cross-backend consumers use
+ * pg_get_wait_event_trace(procnumber) for SQL access, or implement the
+ * snapshot-under-lock protocol on WaitEventTraceControl directly.
+ *
+ * value-per-call (deferred) SRF mode would let an interactive
+ * "SELECT ... FROM pg_backend_wait_event_trace LIMIT N" short-circuit
+ * the materialisation, but converting this function would require
+ * spanning the wait_event_trace_srf_in_progress flag (and its
+ * deferred-free coordination with assign_wait_event_capture) across
+ * multiple SRF callbacks plus a transaction-cleanup registration to
+ * handle LIMIT abandonment. The complexity is not
+ * justified for the diagnostic use case, especially since cross-
+ * backend monitoring (the consumer that would actually benefit from
+ * streaming) goes through the snapshot pattern above instead.
+ * Interactive callers who want only recent records should use
+ * "ORDER BY seq DESC LIMIT N" -- the LIMIT is applied after
+ * materialisation but the cost stays bounded by the ring size.
+ */
+Datum
+pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ WaitEventTraceState *ts;
+ uint64 write_pos;
+ uint64 read_start;
+ uint64 i;
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ if (my_wait_event_trace == NULL)
+ PG_RETURN_VOID();
+
+ ts = my_wait_event_trace;
+
+ write_pos = pg_atomic_read_u64(&ts->write_pos);
+
+ if (write_pos == 0)
+ PG_RETURN_VOID();
+
+ /* Read from oldest available to newest */
+ {
+ uint64 ring_size = (uint64) ts->ring_mask + 1;
+
+ read_start = (write_pos > ring_size)
+ ? write_pos - ring_size : 0;
+ }
+
+ /*
+ * Mark the iteration in progress so wait_event_trace_release_slot defers
+ * any concurrent dsa_free of our own ring (see the comment on that
+ * function for the deferral protocol). PG_FINALLY clears the flag and
+ * performs any deferred free, even on ereport(ERROR).
+ */
+ wait_event_trace_srf_in_progress = true;
+ PG_TRY();
+ {
+ for (i = read_start; i < write_pos; i++)
+ {
+ WaitEventTraceRecord *rec =
+ &ts->records[i & ts->ring_mask];
+ Datum values[6];
+ bool nulls[6];
+ const char *event_type;
+ const char *event_name;
+ uint32 seq_before;
+ uint32 seq_after;
+ uint8 rtype;
+ int64 timestamp_ns;
+ uint32 event_info;
+ int64 duration_ns;
+ int64 query_id;
+
+ /* Seqlock read */
+ seq_before = rec->seq;
+ pg_read_barrier(); /* acquire: payload loads below must not rise
+ * above this */
+
+ if (seq_before & 1)
+ continue;
+
+ rtype = rec->record_type;
+ timestamp_ns = rec->timestamp_ns;
+
+ if (rtype == TRACE_WAIT_EVENT)
+ {
+ event_info = rec->data.wait.event;
+ duration_ns = rec->data.wait.duration_ns;
+ query_id = 0;
+ }
+ else if (rtype == TRACE_QUERY_START || rtype == TRACE_QUERY_END ||
+ rtype == TRACE_EXEC_START || rtype == TRACE_EXEC_END)
+ {
+ event_info = 0;
+ duration_ns = 0;
+ query_id = rec->data.query.query_id;
+ }
+ else
+ {
+ pg_read_barrier(); /* acquire: pair with seq_before read
+ * above before skipping */
+ continue;
+ }
+
+ pg_read_barrier(); /* acquire: payload loads must have landed
+ * before seq_after */
+ seq_after = rec->seq;
+
+ if (seq_before != seq_after)
+ continue;
+
+ /* Skip empty wait events */
+ if (rtype == TRACE_WAIT_EVENT && event_info == 0)
+ continue;
+
+ if (rtype == TRACE_WAIT_EVENT)
+ {
+ event_type = pgstat_get_wait_event_type(event_info);
+ event_name = pgstat_get_wait_event(event_info);
+ }
+ else if (rtype == TRACE_QUERY_START)
+ {
+ event_type = "Query";
+ event_name = "QueryStart";
+ }
+ else if (rtype == TRACE_EXEC_START)
+ {
+ event_type = "Query";
+ event_name = "ExecStart";
+ }
+ else if (rtype == TRACE_EXEC_END)
+ {
+ event_type = "Query";
+ event_name = "ExecEnd";
+ }
+ else
+ {
+ event_type = "Query";
+ event_name = "QueryEnd";
+ }
+
+ if (event_type == NULL || event_name == NULL)
+ continue;
+
+ memset(nulls, 0, sizeof(nulls));
+
+ values[0] = Int64GetDatum((int64) i);
+ values[1] = Int64GetDatum(timestamp_ns);
+ values[2] = CStringGetTextDatum(event_type);
+ values[3] = CStringGetTextDatum(event_name);
+ values[4] = Float8GetDatum((double) duration_ns / 1000.0);
+ values[5] = Int64GetDatum(query_id);
+
+ tuplestore_putvalues(rsinfo->setResult,
+ rsinfo->setDesc,
+ values, nulls);
+ }
+ }
+ PG_FINALLY();
+ {
+ wait_event_trace_srf_in_progress = false;
+
+ /*
+ * If a GUC step-down fired during iteration, it deferred the
+ * dsa_free. Process it now that we're safely past the loop. Re-check
+ * release_pending under the same flag to handle the
+ * (impossible-today, possible-tomorrow) case of a nested SRF.
+ */
+ if (wait_event_trace_release_pending)
+ {
+ wait_event_trace_release_pending = false;
+ if (my_trace_proc_number >= 0)
+ wait_event_trace_release_slot(my_trace_proc_number);
+ }
+ }
+ PG_END_TRY();
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * One element of the local result buffer. Pairs a per-record copy
+ * with the original ring index (used as the seq output column).
+ */
+typedef struct WetValidRecord
+{
+ uint64 ring_index; /* original index in the writer's ring */
+ WaitEventTraceRecord rec;
+} WetValidRecord;
+
+/*
+ * Snapshot the trace ring for a given procNumber and emit records into
+ * the SRF's tuplestore. Returns silently for FREE slots, out-of-range
+ * procnumbers, slots whose ring was never allocated, and slots whose
+ * write_pos is zero.
+ *
+ * Cross-backend reader protocol implemented here:
+ *
+ * 1. Read slot->state without the lock as a cheap "worth visiting"
+ * check; FREE -> nothing to emit.
+ * 2. Allocate the worst-case result buffer BEFORE taking the lock,
+ * so the palloc -- which can bottom out in a glibc mmap syscall
+ * for the worst-case (full-ring) size -- runs without holding the
+ * WaitEventTraceCtl lock.
+ * 3. Acquire WaitEventTraceCtl->lock in LW_SHARED. All slot
+ * transitions take LW_EXCLUSIVE, so the slot's identity, state,
+ * and ring_ptr are stable for the duration of the iteration.
+ * 4. Re-check state under the lock and resolve ring_ptr via
+ * dsa_get_address. Read write_pos.
+ * 5. Iterate every live ring index [read_start, write_pos). For
+ * each record do the per-record POSITION-ENCODED IDENTITY
+ * seqlock check ON SHARED MEMORY (see the comment on the loop
+ * below).
+ * 6. Release the lock.
+ * 7. Walk the local result array and emit rows into the tuplestore.
+ * This is the expensive part (potential disk spill); doing it
+ * after release minimises lock-hold time.
+ *
+ * Why per-record seqlock against shared memory, not against a local
+ * memcpy of the full ring: the protocol requires the two seq reads
+ * to go to the SAME shared-memory location at DIFFERENT TIMES, with
+ * the payload read between them. A bulk memcpy then seqlock-on-
+ * local-copy reads the same frozen byte twice, the check degenerates
+ * to a no-op, and torn / stale-cycle reads slip through.
+ *
+ * Why position-encoded identity, not just parity: the writer encodes
+ * the ring position into the seq value (mid-write = pos*2+1, complete
+ * = pos*2+2). After RING_SIZE writes the slot wraps and is rewritten
+ * with a new numerically-distinct seq. A parity-only check accepts
+ * any stable even seq -- including the PREVIOUS cycle's seq if cross-
+ * process visibility puts the new write_pos ahead of the new seq
+ * update. See the loop body for the four failure modes the identity
+ * check rejects.
+ *
+ * Holding LW_SHARED throughout the iteration also makes the
+ * generation-counter retry unnecessary for this caller: slot
+ * transitions take LW_EXCLUSIVE and therefore cannot happen while we
+ * hold LW_SHARED. The generation counter is still part of the
+ * cross-backend reader contract on WaitEventTraceControl for external
+ * readers that follow a different lock-release pattern (e.g. an
+ * extension that wants to release the lock between batches of records
+ * and re-acquire), but this in-tree implementation does not release
+ * the lock mid-iteration.
+ *
+ * Both OWNED and ORPHANED slots are read uniformly. For OWNED the
+ * live owner is concurrently writing; the seqlock catches torn reads.
+ * For ORPHANED the records are immutable post-mortem so the check is
+ * essentially a pass-through (it still correctly skips at most one
+ * trailing odd-seq record if the owner died mid-write).
+ *
+ * Lock-hold is O(write_pos - read_start) shared-memory loads, at
+ * roughly the same wall-clock cost as a single 4 MB memcpy of the
+ * full ring (~1 ms on modern hardware), with no I/O and no syscalls.
+ */
+static void
+emit_wait_event_trace_for_procnumber(int procNumber, ReturnSetInfo *rsinfo)
+{
+ WaitEventTraceSlot *slot;
+ WaitEventTraceState *ts;
+ WetValidRecord *valid_records = NULL;
+ uint64 valid_count = 0;
+ uint64 write_pos;
+ uint64 read_start;
+ uint64 i;
+ uint32 state_now;
+
+ if (WaitEventTraceCtl == NULL)
+ return;
+
+ /*
+ * Range check. Negative or out-of-range procnumbers return an empty
+ * result rather than ERRORing because the most natural use pattern for
+ * cross-backend readers is to iterate every possible slot index (a
+ * monitoring background worker doesn't know the exact
+ * NUM_WAIT_EVENT_TIMING_SLOTS at SQL level), and silent- empty for
+ * out-of-range matches the behaviour of sister functions like
+ * pg_stat_get_wait_event_timing(NULL) which iterate the shared array
+ * internally. FREE-but-in-range slots also return empty (see the state
+ * check below); the caller cannot distinguish out-of-range from FREE,
+ * which is fine.
+ */
+ if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+ return;
+
+ slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+ /*
+ * If the trace DSA was never created (no backend in the cluster has ever
+ * set wait_event_capture = trace), every slot is still in its initial
+ * FREE state. Skip without taking the lock.
+ */
+ if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID)
+ return;
+
+ /*
+ * Unlocked fast-path check; the authoritative check is under the lock
+ * below.
+ */
+ if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_FREE)
+ return;
+
+ wait_event_trace_ensure_dsa();
+ if (trace_dsa == NULL)
+ return;
+
+ /*
+ * Allocate the worst-case result buffer BEFORE taking the lock. The
+ * buffer is sized for the full ring (sizeof(WetValidRecord) *
+ * WaitEventTraceRingSize, e.g. ~5 MB at the 4 MB default ring and up to
+ * ~42 MB at the 32 MB maximum); on a near-empty ring most goes unused,
+ * but that is preferable to holding the WaitEventTraceCtl lock during a
+ * palloc that may bottom out in a glibc mmap() syscall (allocations above
+ * the malloc-mmap threshold). Glibc's arena-internal mutex around the
+ * syscall would serialise every concurrent reader of this lock through
+ * one VMA-modifying kernel operation; sizing the alloc outside the lock
+ * keeps the lock-hold time bounded by the per-record loop alone.
+ *
+ * After we acquire the lock we will either consume this buffer (writing
+ * up to (write_pos - read_start) entries) or release it unused on an
+ * early return.
+ */
+
+ /*
+ * Worst-case size = ring size. Derive it from the GUC on first use in
+ * this backend; subsequent calls see the cached value. The GUC is
+ * PGC_POSTMASTER so the value is the same across every backend in this
+ * postmaster run and never changes.
+ */
+ if (WaitEventTraceRingSize == 0)
+ WaitEventTraceRingSize =
+ (uint32) wait_event_trace_ring_size * 1024U /
+ (uint32) sizeof(WaitEventTraceRecord);
+ valid_records = palloc(sizeof(WetValidRecord) * WaitEventTraceRingSize);
+
+ LWLockAcquire(&WaitEventTraceCtl->lock, LW_SHARED);
+
+ state_now = pg_atomic_read_u32(&slot->state);
+ if (state_now == WAIT_EVENT_TRACE_SLOT_FREE ||
+ !DsaPointerIsValid(slot->ring_ptr))
+ {
+ LWLockRelease(&WaitEventTraceCtl->lock);
+ pfree(valid_records);
+ return;
+ }
+
+ ts = (WaitEventTraceState *) dsa_get_address(trace_dsa, slot->ring_ptr);
+ write_pos = pg_atomic_read_u64(&ts->write_pos);
+
+ if (write_pos == 0)
+ {
+ LWLockRelease(&WaitEventTraceCtl->lock);
+ pfree(valid_records);
+ return;
+ }
+
+ /* Live range: oldest available to newest. */
+ {
+ uint64 ring_size = (uint64) ts->ring_mask + 1;
+
+ read_start = (write_pos > ring_size)
+ ? write_pos - ring_size : 0;
+ }
+
+ for (i = read_start; i < write_pos; i++)
+ {
+ WaitEventTraceRecord *rec_shared =
+ &ts->records[i & ts->ring_mask];
+ WetValidRecord *out = &valid_records[valid_count];
+ uint32 expected_seq;
+ uint32 seq_before;
+ uint32 seq_after;
+
+ /*
+ * Position-encoded seqlock identity check (NOT just parity).
+ *
+ * The writer encodes the ring position into the seq value: mid-write
+ * -> (uint32)(pos * 2 + 1), complete -> + 2. After RING_SIZE writes
+ * the slot wraps and the same memory location gets a new seq value
+ * (next_pos * 2 + 2) that is numerically distinct from the previous
+ * cycle's seq.
+ *
+ * A parity-only check (skip on odd seq, accept on stable even) is
+ * INSUFFICIENT for this layout in the cross-backend case: if the
+ * writer just incremented write_pos to pos+1 but cross-process cache
+ * coherence has not yet propagated the subsequent rec->seq =
+ * (pos*2+1) store, this reader at i = pos would see the previous
+ * cycle's complete-even seq (from logical position pos - RING_SIZE).
+ * Both seq_before and seq_after would read that stale even value,
+ * parity passes, identity-against-itself passes, and a record
+ * belonging to the PREVIOUS cycle gets emitted with the new
+ * ring_index = pos. Silent data corruption (wrong attribution, not
+ * torn bytes).
+ *
+ * The fix is identity against EXPECTED: a record is valid for
+ * iterator position i if and only if its seq equals (uint32)(i * 2 +
+ * 2) -- the writer's encoded "complete" value for that exact ring
+ * position. This rejects:
+ *
+ * * Stale prior cycle (seq < expected): writer hasn't yet advanced
+ * rec->seq for the current cycle. * Mid-write current cycle (seq ==
+ * expected - 1, odd): writer is in the payload write window. * Ring
+ * wrapped past us (seq > expected): the writer completed a later
+ * cycle on this slot during our read.
+ *
+ * The uint32 wraparound at 2^31 cycles is safe: we use exact
+ * equality, and the writer's existing wrap-safety argument
+ * (sizeof(seq) > worst-case in-flight window by 11 orders of
+ * magnitude) covers the seq value.
+ */
+ expected_seq = (uint32) (i * 2 + 2);
+
+ seq_before = rec_shared->seq;
+ pg_read_barrier();
+
+ if (seq_before != expected_seq)
+ continue;
+
+ out->rec = *rec_shared; /* one 32-byte structure copy */
+
+ pg_read_barrier();
+ seq_after = rec_shared->seq;
+
+ if (seq_after != expected_seq)
+ continue;
+
+ out->ring_index = i;
+ valid_count++;
+ }
+
+ LWLockRelease(&WaitEventTraceCtl->lock);
+
+ /*
+ * Walk the local result array and emit rows. No shared-memory access
+ * from here on, so spills to disk by the tuplestore (if the result is
+ * large) do not hold any wait-event-timing lock.
+ */
+ for (i = 0; i < valid_count; i++)
+ {
+ WetValidRecord *vr = &valid_records[i];
+ WaitEventTraceRecord *rec = &vr->rec;
+ Datum values[6];
+ bool nulls[6];
+ const char *event_type;
+ const char *event_name;
+ uint8 rtype = rec->record_type;
+ uint32 event_info;
+ int64 duration_ns;
+ int64 query_id;
+
+ if (rtype == TRACE_WAIT_EVENT)
+ {
+ event_info = rec->data.wait.event;
+ duration_ns = rec->data.wait.duration_ns;
+ query_id = 0;
+
+ /* Skip empty wait events. */
+ if (event_info == 0)
+ continue;
+
+ event_type = pgstat_get_wait_event_type(event_info);
+ event_name = pgstat_get_wait_event(event_info);
+ }
+ else if (rtype == TRACE_QUERY_START)
+ {
+ event_info = 0;
+ duration_ns = 0;
+ query_id = rec->data.query.query_id;
+ event_type = "Query";
+ event_name = "QueryStart";
+ }
+ else if (rtype == TRACE_QUERY_END)
+ {
+ event_info = 0;
+ duration_ns = 0;
+ query_id = rec->data.query.query_id;
+ event_type = "Query";
+ event_name = "QueryEnd";
+ }
+ else if (rtype == TRACE_EXEC_START)
+ {
+ event_info = 0;
+ duration_ns = 0;
+ query_id = rec->data.query.query_id;
+ event_type = "Query";
+ event_name = "ExecStart";
+ }
+ else if (rtype == TRACE_EXEC_END)
+ {
+ event_info = 0;
+ duration_ns = 0;
+ query_id = rec->data.query.query_id;
+ event_type = "Query";
+ event_name = "ExecEnd";
+ }
+ else
+ {
+ /* Unrecognised record_type -- skip defensively. */
+ continue;
+ }
+
+ if (event_type == NULL || event_name == NULL)
+ continue;
+
+ memset(nulls, 0, sizeof(nulls));
+
+ values[0] = Int64GetDatum((int64) vr->ring_index);
+ values[1] = Int64GetDatum(rec->timestamp_ns);
+ values[2] = CStringGetTextDatum(event_type);
+ values[3] = CStringGetTextDatum(event_name);
+ values[4] = Float8GetDatum((double) duration_ns / 1000.0);
+ values[5] = Int64GetDatum(query_id);
+
+ tuplestore_putvalues(rsinfo->setResult,
+ rsinfo->setDesc,
+ values, nulls);
+ }
+
+ pfree(valid_records);
+}
+
+/*
+ * SQL function: pg_get_wait_event_trace(procnumber int4)
+ *
+ * Cross-backend trace ring reader. Returns the records from the trace
+ * ring belonging to the backend that currently or previously occupied
+ * the given procNumber slot. Reads OWNED and ORPHANED slots uniformly;
+ * FREE slots return an empty result.
+ *
+ * This SRF is the in-tree consumer of the orphan-preserved trace data:
+ * a backend that exited while wait_event_capture = trace leaves its
+ * ring allocated in DSA in ORPHANED state, and this function reads it
+ * until either a new backend takes over the same procNumber or the
+ * DBA calls pg_stat_clear_orphaned_wait_event_rings(). External
+ * extensions that need cross-backend access follow the same
+ * snapshot pattern documented on WaitEventTraceControl in
+ * wait_event_timing.h; this function serves as both the reference
+ * implementation and a DBA-facing diagnostic tool.
+ *
+ * Privileges: REVOKE'd from PUBLIC and GRANT'ed to pg_read_all_stats
+ * in system_views.sql, matching the privilege model of the session-
+ * local view pg_backend_wait_event_trace.
+ *
+ * The procnumber argument can be obtained from the procnumber column
+ * of pg_stat_get_wait_event_timing or pg_stat_get_wait_event_timing_
+ * overflow. For pid-keyed access against live backends, callers can
+ * do:
+ *
+ * SELECT * FROM pg_get_wait_event_trace(
+ * (SELECT procnumber FROM pg_stat_get_wait_event_timing()
+ * WHERE pid = LIMIT 1));
+ *
+ * Note that pid-keyed access cannot read ORPHANED slots because a
+ * dying backend's pid is removed from procArray on exit; for
+ * post-mortem reading of short-lived backends (parallel workers,
+ * autovacuum, walsender) the procNumber must be captured before the
+ * backend exits, or discovered by iterating procnumbers in a
+ * monitoring background worker.
+ */
+Datum
+pg_get_wait_event_trace(PG_FUNCTION_ARGS)
+{
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ int32 procNumber = PG_GETARG_INT32(0);
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ emit_wait_event_trace_for_procnumber((int) procNumber, rsinfo);
+
+ PG_RETURN_VOID();
+}
+
+/*
+ * SQL function: pg_stat_clear_orphaned_wait_event_rings()
+ *
+ * Free every trace ring whose owner has exited (slot state ORPHANED).
+ * Returns the number of rings released.
+ *
+ * Why this exists. When a backend that had wait_event_capture = trace
+ * exits, we deliberately do NOT free its ~4 MB trace ring (see the
+ * lifecycle discussion on WaitEventTraceControl): the data must remain
+ * readable by cross-backend consumers -- the in-tree
+ * pg_get_wait_event_trace SRF and any extension following the
+ * snapshot pattern on WaitEventTraceControl -- and an exit-time
+ * dsa_free would defeat that.
+ * The reclaim instead happens lazily in two places:
+ *
+ * (a) wait_event_trace_clear_orphan_at_init(): when a new backend
+ * inherits the same procNumber slot at init, it frees the prior
+ * orphan as part of starting clean. This handles the common
+ * case (busy clusters with connection churn) automatically.
+ *
+ * (b) THIS FUNCTION: an explicit DBA-driven sweep that releases
+ * every currently orphaned ring at once.
+ *
+ * The pathological case (a) does not handle is "capture briefly
+ * enabled, then disabled, on a cluster with long-lived pooled
+ * connections that never exit". In that scenario procNumbers do not
+ * recycle, so prior orphans persist until cluster restart unless the
+ * DBA calls this function. Worst-case bound is
+ * NUM_WAIT_EVENT_TIMING_SLOTS * sizeof(WaitEventTraceState) which is
+ * ~400 MB at MaxBackends=100, ~4 GB at MaxBackends=1000 -- bounded
+ * but worth a kill switch.
+ *
+ * Permissions: execution is revoked from PUBLIC by default, matching the
+ * cluster-wide reset (pg_stat_reset_wait_event_timing_all). This is a
+ * cluster-scope memory-reclamation operation: it can disrupt any
+ * concurrent cross-backend reader on any orphaned slot. The
+ * disruption is bounded (readers retry via the generation counter
+ * and at worst skip one read) but the operation is still
+ * cluster-wide, so the default privilege matches the reset variant
+ * with the same blast radius; administrators can delegate with GRANT.
+ *
+ * The function is safe to call even when no orphans exist (returns
+ * 0) and even when capture is currently OFF (the slot array exists
+ * unconditionally; only the rings are lazy).
+ */
+Datum
+pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS)
+{
+ int64 freed = 0;
+ int i;
+
+ /*
+ * Execution is revoked from PUBLIC in system_views.sql; administrators
+ * can delegate with GRANT EXECUTE.
+ */
+ if (WaitEventTraceCtl == NULL)
+ PG_RETURN_INT64(0);
+
+ /*
+ * If no backend has ever enabled trace, the trace DSA was never created
+ * and there cannot be any ORPHANED slots: every slot is still in its
+ * initial FREE state. Nothing to do.
+ */
+ if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID)
+ PG_RETURN_INT64(0);
+
+ /* Attach to the trace DSA so dsa_free() can be called. */
+ wait_event_trace_ensure_dsa();
+ if (trace_dsa == NULL)
+ PG_RETURN_INT64(0);
+
+ /*
+ * Walk every slot, taking and releasing WaitEventTraceCtl->lock per slot
+ * rather than holding it across the entire sweep.
+ *
+ * Rationale: at MaxBackends = 1000 with a fully-orphaned cluster the
+ * per-slot work (atomic state read + dsa_free + ring_ptr clear + atomic
+ * state write) totals a few microseconds; holding the lock across all
+ * slots would yield a millisecond-scale lock-hold window during which
+ * every concurrent backend startup (the lazy
+ * wait_event_trace_clear_orphan_at_init path), every cross-backend reader
+ * (pg_get_wait_event_trace and the external snapshot pattern), and every
+ * capture step-down or restore would stall. PG's general convention is
+ * to keep LWLock-held windows in paths that compete with regular activity
+ * well under 100 microseconds; per-slot release/reacquire gives us a
+ * worst- case lock-hold of one slot's worth of work regardless of how
+ * many orphans exist cluster-wide.
+ *
+ * An unlocked fast-path read of slot->state skips non-ORPHANED slots
+ * without an LWLockAcquire/Release pair. This is safe: if a slot races
+ * from non-ORPHANED to ORPHANED after we read it, we miss that orphan --
+ * but the function is documented as a snapshot sweep, the missed orphan
+ * can be cleared by a subsequent call, and the same race exists for
+ * orphans that appear after the loop ends. The authoritative re-check
+ * under the lock prevents racing on the dsa_free direction (we never free
+ * a slot whose owner became OWNED again).
+ *
+ * CHECK_FOR_INTERRUPTS at the top of the loop body lets the caller cancel
+ * a long sweep; with the previous single-lock structure the
+ * InterruptHoldoffCount elevation from LWLockAcquire deferred all
+ * cancellation until release.
+ */
+ for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++)
+ {
+ WaitEventTraceSlot *slot = &WaitEventTraceCtl->trace_slots[i];
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Unlocked fast-path: skip non-ORPHANED slots cheaply. */
+ if (pg_atomic_read_u32(&slot->state) != WAIT_EVENT_TRACE_SLOT_ORPHANED)
+ continue;
+
+ LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+ /*
+ * Authoritative re-check under the lock. A concurrent
+ * clear_orphan_at_init may have already freed this slot.
+ */
+ if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_ORPHANED &&
+ DsaPointerIsValid(slot->ring_ptr))
+ {
+ pg_atomic_fetch_add_u64(&slot->generation, 1);
+ dsa_free(trace_dsa, slot->ring_ptr);
+ slot->ring_ptr = InvalidDsaPointer;
+ pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+ freed++;
+ }
+
+ LWLockRelease(&WaitEventTraceCtl->lock);
+ }
+
+ PG_RETURN_INT64(freed);
+}
+
+#endif /* USE_WAIT_EVENT_TIMING */
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index c1457eb34f02b..b1cf75f3b7d2d 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -70,6 +70,7 @@
#include "utils/snapmgr.h"
#include "utils/syscache.h"
#include "utils/timeout.h"
+#include "utils/wait_event_timing.h"
/* has this backend called EmitConnectionWarnings()? */
static bool ConnectionWarningsEmitted;
@@ -1244,6 +1245,16 @@ InitPostgres(const char *in_dbname, Oid dboid,
/* Process pg_db_role_setting options */
process_settings(MyDatabaseId, GetSessionUserId());
+#ifdef USE_WAIT_EVENT_TIMING
+
+ /*
+ * Attach trace ring if wait_event_capture = trace was set via
+ * config/db/role settings
+ */
+ if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && my_trace_proc_number >= 0)
+ wait_event_trace_attach(my_trace_proc_number);
+#endif
+
/* Apply PostAuthDelay as soon as we've read all options */
if (PostAuthDelay > 0)
pg_usleep(PostAuthDelay * 1000000L);
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index afaa058b046c9..b9b51015651db 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -3425,6 +3425,35 @@
boot_val => 'true',
},
+{ name => 'wait_event_capture', type => 'enum', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE',
+ short_desc => 'Controls collection of per-wait-event timing statistics.',
+ variable => 'wait_event_capture',
+ boot_val => 'WAIT_EVENT_CAPTURE_OFF',
+ options => 'wait_event_capture_options',
+ check_hook => 'check_wait_event_capture',
+ assign_hook => 'assign_wait_event_capture',
+},
+
+{ name => 'wait_event_timing_max_tranches', type => 'int', context => 'PGC_POSTMASTER', group => 'STATS_CUMULATIVE',
+ short_desc => 'Sets the maximum number of distinct LWLock tranches whose timing is recorded per backend.',
+ long_desc => 'Each backend\'s wait-event-timing hash table can hold this many distinct LWLock tranches; subsequent tranches are not individually timed. Sized at server start; raise this if your installation loads many extensions that register their own LWLock tranches.',
+ variable => 'wait_event_timing_max_tranches',
+ boot_val => '192',
+ min => '16',
+ max => '65534',
+},
+
+{ name => 'wait_event_trace_ring_size', type => 'int', context => 'PGC_POSTMASTER', group => 'STATS_CUMULATIVE',
+ short_desc => 'Sets the size of each backend\'s wait-event-trace ring buffer.',
+ long_desc => 'Each backend that enables wait_event_capture = trace allocates a ring buffer of this size from a cluster-wide DSA. The value must be a power of two and is fixed at server start. Larger rings retain longer histories before wrapping.',
+ flags => 'GUC_UNIT_KB',
+ variable => 'wait_event_trace_ring_size',
+ boot_val => '4096',
+ min => '8',
+ max => '32768',
+ check_hook => 'check_wait_event_trace_ring_size',
+},
+
{ name => 'wal_block_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS',
short_desc => 'Shows the block size in the write ahead log.',
flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE',
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 290ccbc543e25..25a3b139523e8 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -103,6 +103,7 @@
#include "utils/plancache.h"
#include "utils/ps_status.h"
#include "utils/rls.h"
+#include "utils/wait_event_timing.h"
#include "utils/xml.h"
#ifdef TRACE_SYNCSCAN
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ac38cddaaf9a6..1b440a14d616e 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -700,6 +700,11 @@
#track_cost_delay_timing = off
#track_io_timing = off
#track_wal_io_timing = off
+#wait_event_capture = off # off, stats, trace
+ # (requires --enable-wait-event-timing)
+#wait_event_timing_max_tranches = 192 # (change requires restart)
+#wait_event_trace_ring_size = 4MB # power of two, 8kB .. 32MB
+ # (change requires restart)
#track_functions = none # none, pl, all
#stats_fetch_consistency = cache # cache, none, snapshot
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 87007e725c161..d2b2964f0bdb4 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
*/
/* yyyymmddN */
-#define CATALOG_VERSION_NO 202606051
+#define CATALOG_VERSION_NO 202606112
#endif
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index be157a5fbe90c..9cab4efc5b45a 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12693,4 +12693,63 @@
proname => 'hashoid8extended', prorettype => 'int8',
proargtypes => 'oid8 int8', prosrc => 'hashoid8extended' },
+{ oid => '9956',
+ descr => 'statistics: per-backend wait event timing (count, duration, histogram)',
+ proname => 'pg_stat_get_wait_event_timing', prorows => '1000',
+ proisstrict => 'f', proretset => 't', provolatile => 's', proparallel => 'r',
+ prorettype => 'record', proargtypes => 'int4',
+ proallargtypes => '{int4,int4,text,int4,text,text,int8,float8,float8,float8,_int8}',
+ proargmodes => '{i,o,o,o,o,o,o,o,o,o,o}',
+ proargnames => '{pid,pid,backend_type,procnumber,wait_event_type,wait_event,calls,total_time_ms,avg_time_us,max_time_us,histogram}',
+ prosrc => 'pg_stat_get_wait_event_timing' },
+
+{ oid => '9958',
+ descr => 'statistics: reset wait event timing counters for the given backend (NULL = own)',
+ proname => 'pg_stat_reset_wait_event_timing', proisstrict => 'f',
+ provolatile => 'v', prorettype => 'void', proargtypes => 'int4',
+ proargnames => '{pid}', proargdefaults => '{NULL}',
+ prosrc => 'pg_stat_reset_wait_event_timing' },
+
+{ oid => '9959',
+ descr => 'statistics: per-backend wait event timing overflow counters (rows lost to LWLock hash / flat array overflow)',
+ proname => 'pg_stat_get_wait_event_timing_overflow', prorows => '1000',
+ proisstrict => 'f', proretset => 't', provolatile => 's', proparallel => 'r',
+ prorettype => 'record', proargtypes => 'int4',
+ proallargtypes => '{int4,int4,text,int4,int8,int8,int8}',
+ proargmodes => '{i,o,o,o,o,o,o}',
+ proargnames => '{pid,pid,backend_type,procnumber,lwlock_overflow_count,flat_overflow_count,reset_count}',
+ prosrc => 'pg_stat_get_wait_event_timing_overflow' },
+
+{ oid => '9960',
+ descr => 'statistics: reset wait event timing counters for all backends (superuser only)',
+ proname => 'pg_stat_reset_wait_event_timing_all',
+ provolatile => 'v', prorettype => 'void', proargtypes => '',
+ prosrc => 'pg_stat_reset_wait_event_timing_all' },
+
+{ oid => '9957',
+ descr => 'current backend wait event trace ring buffer',
+ proname => 'pg_get_backend_wait_event_trace', prorows => '1000',
+ proretset => 't', provolatile => 's', proparallel => 'r',
+ prorettype => 'record', proargtypes => '',
+ proallargtypes => '{int8,int8,text,text,float8,int8}',
+ proargmodes => '{o,o,o,o,o,o}',
+ proargnames => '{seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}',
+ prosrc => 'pg_get_backend_wait_event_trace' },
+
+{ oid => '9962',
+ descr => 'wait event trace ring for the given procnumber slot, live or post-mortem',
+ proname => 'pg_get_wait_event_trace', prorows => '1000',
+ proretset => 't', provolatile => 'v', proparallel => 'r',
+ prorettype => 'record', proargtypes => 'int4',
+ proallargtypes => '{int4,int8,int8,text,text,float8,int8}',
+ proargmodes => '{i,o,o,o,o,o,o}',
+ proargnames => '{procnumber,seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}',
+ prosrc => 'pg_get_wait_event_trace' },
+
+{ oid => '9961',
+ descr => 'statistics: free wait-event-trace rings whose owner backend has exited (superuser only); returns count freed',
+ proname => 'pg_stat_clear_orphaned_wait_event_rings',
+ provolatile => 'v', prorettype => 'int8', proargtypes => '',
+ prosrc => 'pg_stat_clear_orphaned_wait_event_rings' },
+
]
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 4f8113c144b0c..ed0b7f26f9f5a 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -762,6 +762,9 @@
/* Define to select unnamed POSIX semaphores. */
#undef USE_UNNAMED_POSIX_SEMAPHORES
+/* Define to 1 to build with wait event timing. (--enable-wait-event-timing) */
+#undef USE_WAIT_EVENT_TIMING
+
/* Define to select Win32-style semaphores. */
#undef USE_WIN32_SEMAPHORES
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index d7eb648bd2758..ab842923db45e 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -140,3 +140,5 @@ PG_LWLOCKTRANCHE(XACT_SLRU, XactSLRU)
PG_LWLOCKTRANCHE(PARALLEL_VACUUM_DSA, ParallelVacuumDSA)
PG_LWLOCKTRANCHE(AIO_URING_COMPLETION, AioUringCompletion)
PG_LWLOCKTRANCHE(SHMEM_INDEX, ShmemIndex)
+PG_LWLOCKTRANCHE(WAIT_EVENT_TIMING_DSA, WaitEventTimingDSA)
+PG_LWLOCKTRANCHE(WAIT_EVENT_TRACE_DSA, WaitEventTraceDSA)
diff --git a/src/include/storage/subsystemlist.h b/src/include/storage/subsystemlist.h
index 9ad619080be22..90b142354644c 100644
--- a/src/include/storage/subsystemlist.h
+++ b/src/include/storage/subsystemlist.h
@@ -79,6 +79,8 @@ PG_SHMEM_SUBSYSTEM(SyncScanShmemCallbacks)
PG_SHMEM_SUBSYSTEM(AsyncShmemCallbacks)
PG_SHMEM_SUBSYSTEM(StatsShmemCallbacks)
PG_SHMEM_SUBSYSTEM(WaitEventCustomShmemCallbacks)
+PG_SHMEM_SUBSYSTEM(WaitEventTimingShmemCallbacks)
+PG_SHMEM_SUBSYSTEM(WaitEventTraceControlShmemCallbacks)
#ifdef USE_INJECTION_POINTS
PG_SHMEM_SUBSYSTEM(InjectionPointShmemCallbacks)
#endif
diff --git a/src/include/utils/.gitignore b/src/include/utils/.gitignore
index ff6f61cd7ee7b..8a489b7769b16 100644
--- a/src/include/utils/.gitignore
+++ b/src/include/utils/.gitignore
@@ -6,4 +6,5 @@
/header-stamp
/pgstat_wait_event.c
/wait_event_funcs_data.c
+/wait_event_timing_data.h
/wait_event_types.h
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index dc406d6651aa2..309d5e87967a0 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -346,6 +346,7 @@ extern PGDLLIMPORT const struct config_enum_entry dynamic_shared_memory_options[
extern PGDLLIMPORT const struct config_enum_entry io_method_options[];
extern PGDLLIMPORT const struct config_enum_entry recovery_target_action_options[];
extern PGDLLIMPORT const struct config_enum_entry server_message_level_options[];
+extern PGDLLIMPORT const struct config_enum_entry wait_event_capture_options[];
extern PGDLLIMPORT const struct config_enum_entry wal_level_options[];
extern PGDLLIMPORT const struct config_enum_entry wal_sync_method_options[];
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index 307f4fbaefe08..afd9017e45cb0 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -172,6 +172,9 @@ extern bool check_transaction_isolation(int *newval, void **extra, GucSource sou
extern bool check_transaction_read_only(bool *newval, void **extra, GucSource source);
extern void assign_transaction_timeout(int newval, void *extra);
extern const char *show_unix_socket_permissions(void);
+extern bool check_wait_event_capture(int *newval, void **extra, GucSource source);
+extern void assign_wait_event_capture(int newval, void *extra);
+extern bool check_wait_event_trace_ring_size(int *newval, void **extra, GucSource source);
extern bool check_wal_buffers(int *newval, void **extra, GucSource source);
extern bool check_wal_consistency_checking(char **newval, void **extra,
GucSource source);
diff --git a/src/include/utils/meson.build b/src/include/utils/meson.build
index fd3a2352df5d4..ef8b2dc261811 100644
--- a/src/include/utils/meson.build
+++ b/src/include/utils/meson.build
@@ -1,6 +1,6 @@
# Copyright (c) 2022-2026, PostgreSQL Global Development Group
-wait_event_output = ['wait_event_types.h', 'pgstat_wait_event.c', 'wait_event_funcs_data.c']
+wait_event_output = ['wait_event_types.h', 'pgstat_wait_event.c', 'wait_event_funcs_data.c', 'wait_event_timing_data.h']
wait_event_target = custom_target('wait_event_names',
input: files('../../backend/utils/activity/wait_event_names.txt'),
output: wait_event_output,
@@ -11,7 +11,7 @@ wait_event_target = custom_target('wait_event_names',
],
build_by_default: true,
install: true,
- install_dir: [dir_include_server / 'utils', false, false],
+ install_dir: [dir_include_server / 'utils', false, false, false],
)
wait_event_types_h = wait_event_target[0]
diff --git a/src/include/utils/wait_classes.h b/src/include/utils/wait_classes.h
index b91690a22c63b..c6c692a1e9391 100644
--- a/src/include/utils/wait_classes.h
+++ b/src/include/utils/wait_classes.h
@@ -26,4 +26,13 @@
#define PG_WAIT_IO 0x0A000000U
#define PG_WAIT_INJECTIONPOINT 0x0B000000U
+/*
+ * Bit-layout masks for wait_event_info. The high byte encodes the
+ * class (one of the PG_WAIT_* constants above); the low 16 bits
+ * encode the per-class event id; the middle byte is currently
+ * reserved (see pgstat_report_wait_start in wait_event.h).
+ */
+#define WAIT_EVENT_CLASS_MASK 0xFF000000U
+#define WAIT_EVENT_ID_MASK 0x0000FFFFU
+
#endif /* WAIT_CLASSES_H */
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index 86ee348220d7f..3f94cd090993a 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -13,6 +13,10 @@
/* enums for wait events */
#include "utils/wait_event_types.h"
+#ifdef USE_WAIT_EVENT_TIMING
+#include "utils/wait_event_timing.h"
+#endif
+
extern const char *pgstat_get_wait_event(uint32 wait_event_info);
extern const char *pgstat_get_wait_event_type(uint32 wait_event_info);
static inline void pgstat_report_wait_start(uint32 wait_event_info);
@@ -22,6 +26,11 @@ extern void pgstat_reset_wait_event_storage(void);
extern PGDLLIMPORT uint32 *my_wait_event_info;
+#ifdef USE_WAIT_EVENT_TIMING
+extern void pgstat_report_wait_start_timing(uint32 wait_event_info);
+extern void pgstat_report_wait_end_timing(int capture_level);
+#endif
+
/*
* Wait Events - Extension, InjectionPoint
@@ -61,6 +70,9 @@ extern char **GetWaitEventCustomNames(uint32 classId, int *nwaitevents);
*
* my_wait_event_info initially points to local memory, making it safe to
* call this before MyProc has been initialized.
+ *
+ * When compiled with --enable-wait-event-timing, also records the start
+ * timestamp for later duration computation in pgstat_report_wait_end().
* ----------
*/
static inline void
@@ -71,17 +83,54 @@ pgstat_report_wait_start(uint32 wait_event_info)
* four-bytes, updates are atomic.
*/
*(volatile uint32 *) my_wait_event_info = wait_event_info;
+
+#ifdef USE_WAIT_EVENT_TIMING
+
+ /*
+ * Minimal inline gate: one global load and a branch. The body --
+ * lazy/eager slot resolution, INSTR_TIME read, current-event write --
+ * lives out-of-line in pgstat_report_wait_start_timing() so the many
+ * inlined call sites (LWLockAcquire, XLogInsert, ...) stay compact and
+ * the off-mode codegen impact is a load + test per site.
+ *
+ * No unlikely(): wait_event_capture is monomorphic for long stretches, so
+ * the dynamic branch predictor handles it perfectly with or without the
+ * hint, and a hint would point the wrong way once capture is on.
+ */
+ if (wait_event_capture != WAIT_EVENT_CAPTURE_OFF)
+ pgstat_report_wait_start_timing(wait_event_info);
+#endif
}
/* ----------
* pgstat_report_wait_end() -
*
* Called to report end of a wait.
+ *
+ * When compiled with --enable-wait-event-timing and the GUC is enabled,
+ * calls the out-of-line pgstat_report_wait_end_timing() to compute the
+ * wait duration and accumulate statistics.
* ----------
*/
static inline void
pgstat_report_wait_end(void)
{
+#ifdef USE_WAIT_EVENT_TIMING
+ /*
+ * The load of wait_event_capture is reused as the argument to
+ * pgstat_report_wait_end_timing(), so the out-of-line body does not have
+ * to re-load it across the call boundary (CSE doesn't cross function
+ * calls). See pgstat_report_wait_start() for the no-unlikely()
+ * rationale.
+ */
+ {
+ int capture_level = wait_event_capture;
+
+ if (capture_level != WAIT_EVENT_CAPTURE_OFF)
+ pgstat_report_wait_end_timing(capture_level);
+ }
+#endif
+
/* see pgstat_report_wait_start() */
*(volatile uint32 *) my_wait_event_info = 0;
}
diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h
new file mode 100644
index 0000000000000..d5766934b8c91
--- /dev/null
+++ b/src/include/utils/wait_event_timing.h
@@ -0,0 +1,374 @@
+/*-------------------------------------------------------------------------
+ *
+ * wait_event_timing.h
+ * Per-backend wait event timing instrumentation.
+ *
+ * When enabled via the wait_event_capture GUC, every transition through
+ * pgstat_report_wait_start()/pgstat_report_wait_end() records the wait
+ * duration and accumulates per-(backend, event) statistics -- count,
+ * total/maximum duration, and a log2 duration histogram -- in shared
+ * memory. Each backend writes only to its own slot, so the hot path
+ * needs no locking; cross-backend readers (the pg_stat_wait_event_timing
+ * SRF) read lock-free and tolerate torn reads of 64-bit fields on 32-bit
+ * platforms, which is acceptable for statistics.
+ *
+ * The per-backend slot array is allocated lazily in a DSA the first time
+ * any backend in the cluster sets wait_event_capture != off, so a build
+ * that compiles the feature in but never enables it pays no per-backend
+ * memory. Backends attach to the array on their first wait event under
+ * capture (pgstat_wait_event_timing_lazy_attach), and a before_shmem_exit
+ * gate stops the hot path from touching DSA mappings that proc_exit has
+ * already torn down.
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * src/include/utils/wait_event_timing.h
+ *-------------------------------------------------------------------------
+ */
+#ifndef WAIT_EVENT_TIMING_H
+#define WAIT_EVENT_TIMING_H
+
+#include "port/atomics.h"
+#include "portability/instr_time.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/dsa.h"
+#include "utils/wait_event_types.h"
+
+/*
+ * Capture levels for the wait_event_capture GUC. Order is significant:
+ * higher values are strict supersets of lower ones, so code paths can
+ * test for activation with "level >= WAIT_EVENT_CAPTURE_STATS".
+ *
+ * OFF - No instrumentation, no hot-path cost.
+ * STATS - Aggregated per-event statistics (counts, durations, histogram)
+ * exposed via pg_stat_wait_event_timing.
+ * TRACE - Everything in STATS plus a per-session ring buffer of individual
+ * wait events and query-attribution markers, exposed via
+ * pg_backend_wait_event_trace.
+ */
+typedef enum WaitEventCaptureLevel
+{
+ WAIT_EVENT_CAPTURE_OFF = 0,
+ WAIT_EVENT_CAPTURE_STATS,
+ WAIT_EVENT_CAPTURE_TRACE,
+} WaitEventCaptureLevel;
+
+/*
+ * Pin the enum ordering at compile time so code that compares with >= keeps
+ * working, and so reordering is caught at build time rather than via
+ * mysterious runtime mode switches.
+ */
+StaticAssertDecl(WAIT_EVENT_CAPTURE_OFF == 0 &&
+ WAIT_EVENT_CAPTURE_STATS == 1 &&
+ WAIT_EVENT_CAPTURE_TRACE == 2,
+ "WaitEventCaptureLevel values must be 0=OFF < 1=STATS < 2=TRACE");
+
+/*
+ * Number of log2 histogram buckets. Bin edges are powers of two on the
+ * nanosecond axis: bucket 0 covers [0, 1024) ns, bucket i covers
+ * [2^(i+9), 2^(i+10)) ns, and the last bucket is open-ended at
+ * [2^(NBUCKETS+8), inf) ns. These boundaries approximate the
+ * decimal-microsecond grid (1024 ~ 1 us, 2048 ~ 2 us, ...), which lets
+ * wait_event_timing_bucket() avoid a divide on the hot path.
+ *
+ * 32 buckets cover from <1us through the open-ended overflow at 2^40 ns
+ * (~18 minutes), so the long tail (lock contention, vacuum, replication
+ * apply, noisy-neighbour I/O spikes) lands in a real bucket rather than a
+ * single overflow bin -- which is exactly where tail/P99 analysis pays
+ * off.
+ */
+#define WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS 32
+
+/* Sentinel returned by wait_event_timing_index() for LWLock events. */
+#define WAIT_EVENT_TIMING_IDX_LWLOCK (-2)
+
+/*
+ * Per-event accumulated statistics. One entry per distinct wait event per
+ * backend, written only by the owning backend.
+ */
+typedef struct WaitEventTimingEntry
+{
+ int64 count; /* number of occurrences */
+ int64 total_ns; /* total wait duration in nanoseconds */
+ int64 max_ns; /* longest single wait in nanoseconds */
+ int64 histogram[WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS];
+} WaitEventTimingEntry;
+
+/*
+ * Sentinel marking an empty LWLock-hash slot. We reserve the top of the
+ * uint16 range (0xFFFF) rather than 0 so that any legal tranche id --
+ * including the currently-unused tranche 0 -- can be stored and matched.
+ */
+#define LWLOCK_TIMING_EMPTY_SLOT ((uint16) 0xFFFF)
+
+/*
+ * Open-addressing hash slot mapping an LWLock tranche id to a dense index
+ * into the per-backend lwlock_events[] array. Per-backend, single-writer.
+ */
+typedef struct LWLockTimingHashEntry
+{
+ uint16 tranche_id; /* LWLOCK_TIMING_EMPTY_SLOT marks empty */
+ uint16 dense_idx; /* index into lwlock_events[] */
+} LWLockTimingHashEntry;
+
+/*
+ * Header for the per-backend LWLock-timing hash. The slot table and the
+ * dense events array follow the WaitEventTimingState in memory (their
+ * lengths are runtime-determined by wait_event_timing_max_tranches), so
+ * they are not struct members; resolve them via the helpers in
+ * wait_event_timing.c.
+ */
+typedef struct LWLockTimingHash
+{
+ int num_used; /* count of occupied entries */
+ int hash_size; /* slot-table size (power of two); immutable */
+ int max_entries; /* cap on distinct tranches; immutable */
+} LWLockTimingHash;
+
+/* Declaration of the GUC (see guc_parameters.dat). */
+extern PGDLLIMPORT int wait_event_timing_max_tranches;
+
+/*
+ * Per-backend wait event timing state. One slot per
+ * MaxBackends + NUM_AUXILIARY_PROCS, written exclusively by the owning
+ * backend. Shared-memory layout of one slot:
+ *
+ * [ WaitEventTimingState header ]
+ * [ LWLockTimingHashEntry[hash_size] ]
+ * [ WaitEventTimingEntry[max_entries] <- lwlock_events[] ]
+ *
+ * where hash_size and max_entries are runtime-derived from the GUC
+ * wait_event_timing_max_tranches and recorded in lwlock_hash. Slots are
+ * laid out contiguously in the lazily-allocated DSA region using a runtime
+ * stride rather than C array indexing, since the per-backend size is
+ * determined at first enable.
+ */
+typedef struct WaitEventTimingState
+{
+ /*
+ * Generation counter for cross-backend reset requests. Bumped atomically
+ * by pg_stat_reset_wait_event_timing(target); the owning backend notices
+ * the change at its next wait_end and clears its own counters. This
+ * keeps the hot path lock-free: only the owning backend ever writes its
+ * statistics, so there is no writer/resetter race.
+ */
+ pg_atomic_uint32 reset_generation;
+
+ /* Current wait start timestamp (set by pgstat_report_wait_start). */
+ instr_time wait_start;
+
+ /* Current wait_event_info (cached for use in wait_end). */
+ uint32 current_event;
+
+ /*
+ * Number of resets this backend has observed and acted on. Own-backend
+ * resets are synchronous (one bump per call); cross-backend resets
+ * coalesce (multiple requests between two wait_ends count as one).
+ */
+ int64 reset_count;
+
+ /* Per-event statistics: flat array for bounded classes. */
+ WaitEventTimingEntry events[WAIT_EVENT_TIMING_NUM_EVENTS];
+
+ /* Per-event statistics: hash for the LWLock class (unbounded ids). */
+ LWLockTimingHash lwlock_hash;
+
+ /*
+ * Count of LWLock events dropped because the LWLock-timing hash reached
+ * its cap (wait_event_timing_max_tranches). Exposed via
+ * pg_stat_wait_event_timing_overflow.
+ */
+ int64 lwlock_overflow_count;
+
+ /* Count of events dropped because the class index was out of range. */
+ int64 flat_overflow_count;
+} WaitEventTimingState;
+
+/* GUC variables (see guc_parameters.dat). */
+extern PGDLLIMPORT int wait_event_capture;
+
+/* Pointer to this backend's timing state in shared memory. */
+extern PGDLLIMPORT WaitEventTimingState *my_wait_event_timing;
+
+/*
+ * Shared memory setup -- registered via the shmem subsystem registry
+ * (src/include/storage/subsystemlist.h). Stub builds expose a no-op
+ * callbacks struct so the subsystemlist.h reference resolves either way.
+ */
+extern PGDLLIMPORT const ShmemCallbacks WaitEventTimingShmemCallbacks;
+
+/*
+ * Called from InitProcess()/InitAuxiliaryProcess() to set up this backend's
+ * timing/trace bookkeeping, and from ProcKill() to clear it.
+ */
+extern void pgstat_set_wait_event_timing_storage(int procNumber);
+extern void pgstat_reset_wait_event_timing_storage(void);
+
+
+/* ----------------------------------------------------------------------
+ * Trace level (wait_event_capture = trace)
+ *
+ * In addition to the STATS aggregates, every completed wait (and a set of
+ * query-attribution markers) is pushed into a per-session ring buffer --
+ * one record per completed wait. The ring is allocated lazily in DSA on
+ * first use,
+ * so only sessions that enable trace pay the per-ring memory cost. External
+ * tools read a session's ring via pg_get_backend_wait_event_trace() (own
+ * session) or pg_get_wait_event_trace(procnumber) (cross-backend).
+ *
+ * Query attribution is by scanning the ring at read time: QUERY/EXEC
+ * START/END markers delimit which wait events belong to which query_id.
+ *
+ * The ring size is set cluster-wide at server start by the
+ * wait_event_trace_ring_size GUC (PGC_POSTMASTER, default 4 MB). It
+ * MUST be a power of two: the writer indexes the ring as (pos & ring_mask).
+ * ----------------------------------------------------------------------
+ */
+
+/* Trace record types */
+#define TRACE_WAIT_EVENT 0
+#define TRACE_QUERY_START 1
+#define TRACE_QUERY_END 2
+#define TRACE_EXEC_START 3
+#define TRACE_EXEC_END 4
+
+typedef struct WaitEventTraceRecord
+{
+ /*
+ * Seqlock for torn-read detection. Writers set seq odd before filling
+ * fields, then even after; readers check seq before and after and skip
+ * the record if either is odd or they differ. uint32 wrap is irrelevant
+ * over the ~10-20 ns reader access window.
+ */
+ uint32 seq;
+ uint8 record_type; /* TRACE_WAIT_EVENT / QUERY_* / EXEC_* */
+ uint8 pad[3];
+ int64 timestamp_ns; /* monotonic clock */
+ union
+ {
+ struct /* record_type = TRACE_WAIT_EVENT */
+ {
+ uint32 event; /* wait_event_info */
+ uint32 pad2;
+ int64 duration_ns;
+ } wait;
+ struct /* QUERY_START/END or EXEC_START/END */
+ {
+ int64 query_id;
+ int64 pad2;
+ } query;
+ } data;
+} WaitEventTraceRecord; /* 32 bytes */
+
+/*
+ * The seqlock wrap-safety argument and the mask-index math both rely on a
+ * fixed 32-byte record stride; make a stray field addition a build failure.
+ */
+StaticAssertDecl(sizeof(WaitEventTraceRecord) == 32,
+ "WaitEventTraceRecord must be exactly 32 bytes");
+
+/*
+ * Per-backend trace ring header followed by the records array. records[]
+ * is variably sized at allocation time (wait_event_trace_ring_size
+ * decides the row count). write_pos and ring_mask share a cache line so
+ * the hot-path index calculation touches one line.
+ */
+typedef struct WaitEventTraceState
+{
+ pg_atomic_uint64 write_pos; /* monotonically increasing, wraps via mask */
+ uint32 ring_mask; /* ring_size - 1; ring_size is a power of two */
+ uint32 ring_size_pad; /* keep the records[] slab 16-byte aligned */
+ WaitEventTraceRecord records[FLEXIBLE_ARRAY_MEMBER];
+} WaitEventTraceState;
+
+/*
+ * Per-procNumber trace-ring slot lifecycle. Decoupled from backend
+ * lifecycle on purpose: when a backend exits we transition its slot to
+ * ORPHANED and leave the ring in DSA so cross-backend consumers can still
+ * read the dying backend's final waits. An orphan is reclaimed when a new
+ * backend takes the same procNumber, or by
+ * pg_stat_clear_orphaned_wait_event_rings().
+ *
+ * FREE no ring allocated (ring_ptr invalid).
+ * OWNED a live backend at this procNumber is writing to the ring.
+ * ORPHANED the owner exited; the ring is post-mortem and immutable.
+ */
+typedef enum WaitEventTraceSlotState
+{
+ WAIT_EVENT_TRACE_SLOT_FREE = 0,
+ WAIT_EVENT_TRACE_SLOT_OWNED,
+ WAIT_EVENT_TRACE_SLOT_ORPHANED,
+} WaitEventTraceSlotState;
+
+/*
+ * Per-procNumber slot. generation is bumped on every owner transition;
+ * cross-backend readers snapshot it before+after their read and retry if it
+ * changed (the BackendStatusArray st_changecount idiom). state is atomic
+ * only for cheap unlocked "worth visiting" probes; authoritative reads of
+ * (state, ring_ptr) are done under WaitEventTraceCtl->lock in LW_SHARED,
+ * while every transition holds it LW_EXCLUSIVE.
+ */
+typedef struct WaitEventTraceSlot
+{
+ pg_atomic_uint64 generation; /* bumped on every owner transition */
+ pg_atomic_uint32 state; /* WaitEventTraceSlotState */
+ uint32 pad; /* keep ring_ptr 8-aligned */
+ dsa_pointer ring_ptr; /* InvalidDsaPointer when FREE; else the
+ * WaitEventTraceState chunk */
+} WaitEventTraceSlot;
+
+/*
+ * Control struct in fixed shared memory. trace_slots[] is indexed by
+ * procNumber.
+ *
+ * External cross-backend reader protocol (pg_get_wait_event_trace is the
+ * reference implementation):
+ * 1. read trace_slots[procNumber].state unlocked as a "worth visiting"
+ * probe; FREE -> nothing to read.
+ * 2. acquire lock LW_SHARED (all transitions take LW_EXCLUSIVE, so the
+ * slot's state/ring_ptr/ring memory are stable for the iteration).
+ * 3. re-check state under the lock; resolve ring_ptr via dsa_get_address;
+ * read write_pos.
+ * 4. iterate [read_start, write_pos): for each record do the per-record
+ * POSITION-ENCODED IDENTITY seqlock check against shared memory --
+ * expected_seq = (uint32)(i*2 + 2); read seq, barrier, copy record,
+ * barrier, re-read seq; accept only if both equal expected_seq. This
+ * rejects stale previous-cycle reads (parity alone would not).
+ * 5. release the lock; emit the buffered records afterwards.
+ * 6. optional: snapshot generation before/after if releasing the lock
+ * between batches.
+ */
+typedef struct WaitEventTraceControl
+{
+ dsa_handle trace_dsa_handle; /* DSA_HANDLE_INVALID until first use */
+ LWLock lock; /* protects DSA creation and slot transitions */
+ WaitEventTraceSlot trace_slots[FLEXIBLE_ARRAY_MEMBER]; /* per procNumber */
+} WaitEventTraceControl;
+
+/* Trace GUC and the records-per-ring value derived from it at startup. */
+extern PGDLLIMPORT int wait_event_trace_ring_size;
+extern PGDLLIMPORT uint32 WaitEventTraceRingSize;
+
+/* This backend's procNumber for the trace ring, or -1 if not set. */
+extern PGDLLIMPORT int my_trace_proc_number;
+
+/*
+ * Trace control shmem -- registered via subsystemlist.h; a no-op stub when
+ * the feature is compiled out.
+ */
+extern PGDLLIMPORT const ShmemCallbacks WaitEventTraceControlShmemCallbacks;
+
+/*
+ * Lazy DSA-based trace ring allocation -- called on first trace write and
+ * at backend startup when capture = trace was set via configuration.
+ */
+extern void wait_event_trace_attach(int procNumber);
+
+/* Query-attribution markers (defined in wait_event_timing.c). */
+extern void wait_event_trace_query_start(int64 query_id);
+extern void wait_event_trace_query_end(int64 query_id);
+extern void wait_event_trace_exec_start(int64 query_id);
+extern void wait_event_trace_exec_end(int64 query_id);
+
+#endif /* WAIT_EVENT_TIMING_H */
diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build
index 969e90b396da0..f87c8bed52167 100644
--- a/src/test/modules/test_misc/meson.build
+++ b/src/test/modules/test_misc/meson.build
@@ -22,6 +22,7 @@ tests += {
't/011_lock_stats.pl',
't/012_ddlutils.pl',
't/013_temp_obj_multisession.pl',
+ 't/014_wait_event_trace_seqlock.pl',
],
# The injection points are cluster-wide, so disable installcheck
'runningcheck': false,
diff --git a/src/test/modules/test_misc/t/014_wait_event_trace_seqlock.pl b/src/test/modules/test_misc/t/014_wait_event_trace_seqlock.pl
new file mode 100644
index 0000000000000..0f0e49970c89e
--- /dev/null
+++ b/src/test/modules/test_misc/t/014_wait_event_trace_seqlock.pl
@@ -0,0 +1,122 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Test the position-encoded identity seqlock that protects cross-backend
+# reads of the wait-event trace ring (wait_event_capture = trace).
+#
+# The hazard: the trace writer advances write_pos and only then stamps the
+# record's seq. A cross-backend reader that observes the new write_pos
+# before the seq store has propagated sees, at the in-flight ring slot, the
+# PREVIOUS cycle's record -- complete, with an even seq. A parity-only
+# seqlock would accept it and emit a stale record attributed to the wrong
+# ring index; the identity check (seq must equal the writer's completion
+# value for that exact position) must reject it.
+#
+# That window is unobservable on TSO hardware without instrumentation, so
+# the writer carries INJECTION_POINT("wait-event-trace-after-write-pos")
+# between the write_pos advance and the seq stamp. This test:
+#
+# 1. fills and wraps a minimum-size ring (8kB = 256 records), so every
+# slot holds a complete record from the previous cycle;
+# 2. wedges the writer at the injection point, mid-record;
+# 3. reads the ring cross-backend: the reader must return exactly
+# ring_size - 1 records, skipping the in-flight slot whose stale
+# prior-cycle record a parity-only check would have emitted;
+# 4. releases the writer and verifies the ring reads full again.
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+plan skip_all => 'Injection points not supported by this build'
+ unless $ENV{enable_injection_points} eq 'yes';
+
+my $ring_records = 256; # 8kB ring / 32-byte records
+
+my $node = PostgreSQL::Test::Cluster->new('seqlock');
+$node->init;
+$node->append_conf(
+ 'postgresql.conf', q[
+wait_event_trace_ring_size = '8kB'
+]);
+$node->start;
+
+# Skip if the server was not built with --enable-wait-event-timing.
+my ($ret, $stdout, $stderr) =
+ $node->psql('postgres', 'SET wait_event_capture = trace;');
+if ($ret != 0)
+{
+ $node->stop;
+ plan skip_all => 'server not built with --enable-wait-event-timing';
+}
+
+$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
+
+# Writer session: enable trace and wrap the ring. 400 pg_sleep calls emit
+# at least 400 wait events into a 256-record ring, so every slot holds a
+# complete record from the current window.
+my $writer = $node->background_psql('postgres');
+$writer->query_safe('SET wait_event_capture = trace;');
+$writer->query_safe(
+ 'SELECT count(pg_sleep(0.001)) FROM generate_series(1, 400);');
+
+my $writer_proc = $writer->query_safe(
+ 'SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid())'
+ . ' LIMIT 1;');
+chomp $writer_proc;
+like($writer_proc, qr/^\d+$/, 'writer reported its procnumber');
+
+# With the ring wrapped and the writer idle, a cross-backend read returns
+# exactly ring_size records: every slot is complete and identity-valid.
+my $count_full = $node->safe_psql('postgres',
+ "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+is($count_full, $ring_records, 'wrapped ring reads full before the wedge');
+
+# Wedge the writer mid-record: arm the injection point, then send a
+# statement. The arrival of the statement completes the writer's
+# ClientRead wait; its trace write advances write_pos and then blocks at
+# the injection point, before stamping the record's seq.
+$node->safe_psql('postgres',
+ "SELECT injection_points_attach('wait-event-trace-after-write-pos', 'wait');"
+);
+$writer->query_until(
+ qr/wedge_sent/, q[
+\echo wedge_sent
+SELECT 1;
+]);
+$node->wait_for_event('client backend', 'wait-event-trace-after-write-pos');
+
+# The decisive read: the in-flight slot still holds the previous cycle's
+# complete record. A parity-only seqlock would emit it (ring_size rows,
+# one misattributed); the identity check must skip exactly that slot.
+my $count_wedged = $node->safe_psql('postgres',
+ "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+is($count_wedged, $ring_records - 1,
+ 'reader skips the in-flight slot instead of emitting the stale prior-cycle record'
+);
+
+# The read is stable and repeatable while the writer is wedged.
+my $count_wedged2 = $node->safe_psql('postgres',
+ "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+is($count_wedged2, $count_wedged, 'wedged-ring read is stable');
+
+# Release the writer: detach first so the nested wakeup wait does not
+# re-arm, then wake it.
+$node->safe_psql('postgres',
+ "SELECT injection_points_detach('wait-event-trace-after-write-pos');");
+$node->safe_psql('postgres',
+ "SELECT injection_points_wakeup('wait-event-trace-after-write-pos');");
+
+# The writer completes the wedged record (and its pending statement); the
+# ring must read full again.
+$writer->query_safe("SELECT 'resync';");
+my $count_after = $node->safe_psql('postgres',
+ "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+is($count_after, $ring_records, 'ring reads full again after release');
+
+$writer->quit;
+$node->stop;
+
+done_testing();
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index a65a5bf0c4fbc..0a00c00cd5c91 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1332,6 +1332,13 @@ pg_backend_memory_contexts| SELECT name,
free_chunks,
used_bytes
FROM pg_get_backend_memory_contexts() pg_get_backend_memory_contexts(name, ident, type, level, path, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes);
+pg_backend_wait_event_trace| SELECT seq,
+ timestamp_ns,
+ wait_event_type,
+ wait_event,
+ duration_us,
+ query_id
+ FROM pg_get_backend_wait_event_trace() t(seq, timestamp_ns, wait_event_type, wait_event, duration_us, query_id);
pg_config| SELECT name,
setting
FROM pg_config() pg_config(name, setting);
@@ -2415,6 +2422,24 @@ pg_stat_user_tables| SELECT relid,
stats_reset
FROM pg_stat_all_tables
WHERE ((schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (schemaname !~ '^pg_toast'::text));
+pg_stat_wait_event_timing| SELECT pid,
+ backend_type,
+ procnumber,
+ wait_event_type,
+ wait_event,
+ calls,
+ total_time_ms,
+ avg_time_us,
+ max_time_us,
+ histogram
+ FROM pg_stat_get_wait_event_timing(NULL::integer) t(pid, backend_type, procnumber, wait_event_type, wait_event, calls, total_time_ms, avg_time_us, max_time_us, histogram);
+pg_stat_wait_event_timing_overflow| SELECT pid,
+ backend_type,
+ procnumber,
+ lwlock_overflow_count,
+ flat_overflow_count,
+ reset_count
+ FROM pg_stat_get_wait_event_timing_overflow(NULL::integer) t(pid, backend_type, procnumber, lwlock_overflow_count, flat_overflow_count, reset_count);
pg_stat_wal| SELECT wal_records,
wal_fpi,
wal_bytes,
@@ -2891,6 +2916,11 @@ pg_views| SELECT n.nspname AS schemaname,
FROM (pg_class c
LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)))
WHERE (c.relkind = 'v'::"char");
+pg_wait_event_timing_histogram_buckets| SELECT bucket_idx,
+ lower_ns,
+ upper_ns,
+ label
+ FROM ( VALUES (0,(0)::bigint,(1024)::bigint,'<1us'::text), (1,(1024)::bigint,(2048)::bigint,'1-2us'::text), (2,(2048)::bigint,(4096)::bigint,'2-4us'::text), (3,(4096)::bigint,(8192)::bigint,'4-8us'::text), (4,(8192)::bigint,(16384)::bigint,'8-16us'::text), (5,(16384)::bigint,(32768)::bigint,'16-32us'::text), (6,(32768)::bigint,(65536)::bigint,'32-64us'::text), (7,(65536)::bigint,(131072)::bigint,'64-128us'::text), (8,(131072)::bigint,(262144)::bigint,'128-256us'::text), (9,(262144)::bigint,(524288)::bigint,'256-512us'::text), (10,(524288)::bigint,(1048576)::bigint,'512us-1ms'::text), (11,(1048576)::bigint,(2097152)::bigint,'1-2ms'::text), (12,(2097152)::bigint,(4194304)::bigint,'2-4ms'::text), (13,(4194304)::bigint,(8388608)::bigint,'4-8ms'::text), (14,(8388608)::bigint,(16777216)::bigint,'8-16ms'::text), (15,(16777216)::bigint,(33554432)::bigint,'16-32ms'::text), (16,(33554432)::bigint,(67108864)::bigint,'32-64ms'::text), (17,(67108864)::bigint,(134217728)::bigint,'64-128ms'::text), (18,(134217728)::bigint,(268435456)::bigint,'128-256ms'::text), (19,(268435456)::bigint,(536870912)::bigint,'256-512ms'::text), (20,(536870912)::bigint,(1073741824)::bigint,'512ms-1s'::text), (21,(1073741824)::bigint,'2147483648'::bigint,'1-2s'::text), (22,'2147483648'::bigint,'4294967296'::bigint,'2-4s'::text), (23,'4294967296'::bigint,'8589934592'::bigint,'4-8s'::text), (24,'8589934592'::bigint,'17179869184'::bigint,'8-16s'::text), (25,'17179869184'::bigint,'34359738368'::bigint,'16-32s'::text), (26,'34359738368'::bigint,'68719476736'::bigint,'32-64s'::text), (27,'68719476736'::bigint,'137438953472'::bigint,'64-128s'::text), (28,'137438953472'::bigint,'274877906944'::bigint,'128-256s'::text), (29,'274877906944'::bigint,'549755813888'::bigint,'256-512s'::text), (30,'549755813888'::bigint,'1099511627776'::bigint,'512s-1024s'::text), (31,'1099511627776'::bigint,NULL::bigint,'>=1024s'::text)) t(bucket_idx, lower_ns, upper_ns, label);
pg_wait_events| SELECT type,
name,
description
diff --git a/src/test/regress/expected/wait_event_timing.out b/src/test/regress/expected/wait_event_timing.out
new file mode 100644
index 0000000000000..8e94a4ce6c5de
--- /dev/null
+++ b/src/test/regress/expected/wait_event_timing.out
@@ -0,0 +1,188 @@
+--
+-- WAIT_EVENT_TIMING
+--
+-- Exercises the wait_event_capture instrumentation: the GUC, the stats
+-- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing
+-- and histogram-buckets views, overflow counters, resets), and the trace
+-- surface (the per-session ring, its readers, and the query markers).
+--
+-- Two expected outputs are maintained:
+-- wait_event_timing.out -- --enable-wait-event-timing builds
+-- wait_event_timing_1.out -- builds without the option (stub)
+-- The difference is whether SET wait_event_capture = stats succeeds and
+-- whether the SRF records anything; durations are never printed, so the
+-- timing-build output is deterministic.
+--
+-- Default is off.
+SHOW wait_event_capture;
+ wait_event_capture
+--------------------
+ off
+(1 row)
+
+-- The taxonomy view is pure SQL and identical in both build configs.
+SELECT count(*) AS buckets FROM pg_wait_event_timing_histogram_buckets;
+ buckets
+---------
+ 32
+(1 row)
+
+SELECT bucket_idx, lower_ns, upper_ns, label
+FROM pg_wait_event_timing_histogram_buckets
+WHERE bucket_idx IN (0, 1, 31)
+ORDER BY bucket_idx;
+ bucket_idx | lower_ns | upper_ns | label
+------------+---------------+----------+---------
+ 0 | 0 | 1024 | <1us
+ 1 | 1024 | 2048 | 1-2us
+ 31 | 1099511627776 | | >=1024s
+(3 rows)
+
+-- Enable stats capture and generate a deterministic wait: pg_sleep emits a
+-- Timeout / PgSleep wait of ~0.1s. (In a stub build the SET errors and the
+-- SRF stays empty; that is the documented difference between the two
+-- expected files.)
+SET wait_event_capture = stats;
+SELECT pg_sleep(0.1);
+ pg_sleep
+----------
+
+(1 row)
+
+-- PgSleep must now be recorded for this backend, with the per-event
+-- invariants holding. We print only booleans so the output is stable.
+SELECT calls >= 1 AS calls_ok,
+ calls = (SELECT sum(h) FROM unnest(histogram) AS h) AS hist_sum_eq_calls,
+ total_time_ms > 0 AS total_positive,
+ max_time_us > 0 AS max_positive,
+ array_length(histogram, 1)
+ = (SELECT count(*)::int FROM pg_wait_event_timing_histogram_buckets)
+ AS histogram_len_ok
+FROM pg_stat_get_wait_event_timing(pg_backend_pid())
+WHERE wait_event = 'PgSleep';
+ calls_ok | hist_sum_eq_calls | total_positive | max_positive | histogram_len_ok
+----------+-------------------+----------------+--------------+------------------
+ t | t | t | t | t
+(1 row)
+
+-- The view surfaces the same row (type/name only; durations omitted).
+SELECT wait_event_type, wait_event
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+ wait_event_type | wait_event
+-----------------+------------
+ Timeout | PgSleep
+(1 row)
+
+-- A non-NULL pid that does not exist yields no rows (silent, not an error).
+SELECT count(*) AS rows_for_bogus_pid
+FROM pg_stat_get_wait_event_timing(-1);
+ rows_for_bogus_pid
+--------------------
+ 0
+(1 row)
+
+-- Overflow/reset counters for this backend. A simple test backend uses
+-- few LWLock tranches and no out-of-range classes, so both overflow
+-- counters are zero, and a fresh backend has not been reset.
+SELECT lwlock_overflow_count, flat_overflow_count, reset_count
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+ lwlock_overflow_count | flat_overflow_count | reset_count
+-----------------------+---------------------+-------------
+ 0 | 0 | 0
+(1 row)
+
+-- Resetting our own backend is synchronous: the PgSleep row is cleared and
+-- reset_count advances. (We filter to PgSleep because inter-command waits
+-- such as ClientRead may be recorded again before the next statement runs.)
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ pg_stat_reset_wait_event_timing
+---------------------------------
+
+(1 row)
+
+SELECT count(*) AS pgsleep_rows_after_reset
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+ pgsleep_rows_after_reset
+--------------------------
+ 0
+(1 row)
+
+SELECT reset_count
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+ reset_count
+-------------
+ 1
+(1 row)
+
+-- The pid argument defaults to NULL, so a no-argument call resets the
+-- caller's own backend.
+SELECT pg_stat_reset_wait_event_timing();
+ pg_stat_reset_wait_event_timing
+---------------------------------
+
+(1 row)
+
+-- Resetting an unknown pid is a silent no-op, not an error.
+SELECT pg_stat_reset_wait_event_timing(2147483647);
+ pg_stat_reset_wait_event_timing
+---------------------------------
+
+(1 row)
+
+RESET wait_event_capture;
+--
+-- Trace level: per-session ring of individual waits + query markers.
+-- (In a stub build SET trace errors and the trace readers stay empty;
+-- that is the documented difference between the two expected files.)
+--
+SET compute_query_id = on;
+SET wait_event_capture = trace;
+SELECT pg_sleep(0.1);
+ pg_sleep
+----------
+
+(1 row)
+
+-- PgSleep is recorded in this backend's ring with a positive duration.
+SELECT count(*) >= 1 AS pgsleep_in_ring,
+ coalesce(bool_and(duration_us > 0), false) AS durations_positive
+FROM pg_get_backend_wait_event_trace()
+WHERE wait_event = 'PgSleep';
+ pgsleep_in_ring | durations_positive
+-----------------+--------------------
+ t | t
+(1 row)
+
+-- The same records are visible through the view.
+SELECT count(*) >= 1 AS view_has_pgsleep
+FROM pg_backend_wait_event_trace
+WHERE wait_event = 'PgSleep';
+ view_has_pgsleep
+------------------
+ t
+(1 row)
+
+-- Executor markers bracket the work and carry a non-zero query_id.
+SELECT count(*) >= 1 AS has_exec_markers,
+ coalesce(bool_and(query_id <> 0), true) AS markers_have_query_id
+FROM pg_get_backend_wait_event_trace()
+WHERE wait_event_type = 'Query'
+ AND wait_event IN ('ExecStart', 'ExecEnd');
+ has_exec_markers | markers_have_query_id
+------------------+-----------------------
+ t | t
+(1 row)
+
+-- Clearing orphaned rings is a no-op here (no orphans) but must succeed.
+SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok;
+ clear_orphans_ok
+------------------
+ t
+(1 row)
+
+RESET compute_query_id;
+RESET wait_event_capture;
diff --git a/src/test/regress/expected/wait_event_timing_1.out b/src/test/regress/expected/wait_event_timing_1.out
new file mode 100644
index 0000000000000..31ccd005ff7b6
--- /dev/null
+++ b/src/test/regress/expected/wait_event_timing_1.out
@@ -0,0 +1,181 @@
+--
+-- WAIT_EVENT_TIMING
+--
+-- Exercises the wait_event_capture instrumentation: the GUC, the stats
+-- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing
+-- and histogram-buckets views, overflow counters, resets), and the trace
+-- surface (the per-session ring, its readers, and the query markers).
+--
+-- Two expected outputs are maintained:
+-- wait_event_timing.out -- --enable-wait-event-timing builds
+-- wait_event_timing_1.out -- builds without the option (stub)
+-- The difference is whether SET wait_event_capture = stats succeeds and
+-- whether the SRF records anything; durations are never printed, so the
+-- timing-build output is deterministic.
+--
+-- Default is off.
+SHOW wait_event_capture;
+ wait_event_capture
+--------------------
+ off
+(1 row)
+
+-- The taxonomy view is pure SQL and identical in both build configs.
+SELECT count(*) AS buckets FROM pg_wait_event_timing_histogram_buckets;
+ buckets
+---------
+ 32
+(1 row)
+
+SELECT bucket_idx, lower_ns, upper_ns, label
+FROM pg_wait_event_timing_histogram_buckets
+WHERE bucket_idx IN (0, 1, 31)
+ORDER BY bucket_idx;
+ bucket_idx | lower_ns | upper_ns | label
+------------+---------------+----------+---------
+ 0 | 0 | 1024 | <1us
+ 1 | 1024 | 2048 | 1-2us
+ 31 | 1099511627776 | | >=1024s
+(3 rows)
+
+-- Enable stats capture and generate a deterministic wait: pg_sleep emits a
+-- Timeout / PgSleep wait of ~0.1s. (In a stub build the SET errors and the
+-- SRF stays empty; that is the documented difference between the two
+-- expected files.)
+SET wait_event_capture = stats;
+ERROR: invalid value for parameter "wait_event_capture": "stats"
+DETAIL: This build does not support wait event capture.
+HINT: Compile PostgreSQL with --enable-wait-event-timing.
+SELECT pg_sleep(0.1);
+ pg_sleep
+----------
+
+(1 row)
+
+-- PgSleep must now be recorded for this backend, with the per-event
+-- invariants holding. We print only booleans so the output is stable.
+SELECT calls >= 1 AS calls_ok,
+ calls = (SELECT sum(h) FROM unnest(histogram) AS h) AS hist_sum_eq_calls,
+ total_time_ms > 0 AS total_positive,
+ max_time_us > 0 AS max_positive,
+ array_length(histogram, 1)
+ = (SELECT count(*)::int FROM pg_wait_event_timing_histogram_buckets)
+ AS histogram_len_ok
+FROM pg_stat_get_wait_event_timing(pg_backend_pid())
+WHERE wait_event = 'PgSleep';
+ calls_ok | hist_sum_eq_calls | total_positive | max_positive | histogram_len_ok
+----------+-------------------+----------------+--------------+------------------
+(0 rows)
+
+-- The view surfaces the same row (type/name only; durations omitted).
+SELECT wait_event_type, wait_event
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+ wait_event_type | wait_event
+-----------------+------------
+(0 rows)
+
+-- A non-NULL pid that does not exist yields no rows (silent, not an error).
+SELECT count(*) AS rows_for_bogus_pid
+FROM pg_stat_get_wait_event_timing(-1);
+ rows_for_bogus_pid
+--------------------
+ 0
+(1 row)
+
+-- Overflow/reset counters for this backend. A simple test backend uses
+-- few LWLock tranches and no out-of-range classes, so both overflow
+-- counters are zero, and a fresh backend has not been reset.
+SELECT lwlock_overflow_count, flat_overflow_count, reset_count
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+ lwlock_overflow_count | flat_overflow_count | reset_count
+-----------------------+---------------------+-------------
+(0 rows)
+
+-- Resetting our own backend is synchronous: the PgSleep row is cleared and
+-- reset_count advances. (We filter to PgSleep because inter-command waits
+-- such as ClientRead may be recorded again before the next statement runs.)
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ERROR: wait event capture is not supported by this build
+HINT: Compile PostgreSQL with --enable-wait-event-timing.
+SELECT count(*) AS pgsleep_rows_after_reset
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+ pgsleep_rows_after_reset
+--------------------------
+ 0
+(1 row)
+
+SELECT reset_count
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+ reset_count
+-------------
+(0 rows)
+
+-- The pid argument defaults to NULL, so a no-argument call resets the
+-- caller's own backend.
+SELECT pg_stat_reset_wait_event_timing();
+ERROR: wait event capture is not supported by this build
+HINT: Compile PostgreSQL with --enable-wait-event-timing.
+-- Resetting an unknown pid is a silent no-op, not an error.
+SELECT pg_stat_reset_wait_event_timing(2147483647);
+ERROR: wait event capture is not supported by this build
+HINT: Compile PostgreSQL with --enable-wait-event-timing.
+RESET wait_event_capture;
+--
+-- Trace level: per-session ring of individual waits + query markers.
+-- (In a stub build SET trace errors and the trace readers stay empty;
+-- that is the documented difference between the two expected files.)
+--
+SET compute_query_id = on;
+SET wait_event_capture = trace;
+ERROR: invalid value for parameter "wait_event_capture": "trace"
+DETAIL: This build does not support wait event capture.
+HINT: Compile PostgreSQL with --enable-wait-event-timing.
+SELECT pg_sleep(0.1);
+ pg_sleep
+----------
+
+(1 row)
+
+-- PgSleep is recorded in this backend's ring with a positive duration.
+SELECT count(*) >= 1 AS pgsleep_in_ring,
+ coalesce(bool_and(duration_us > 0), false) AS durations_positive
+FROM pg_get_backend_wait_event_trace()
+WHERE wait_event = 'PgSleep';
+ pgsleep_in_ring | durations_positive
+-----------------+--------------------
+ f | f
+(1 row)
+
+-- The same records are visible through the view.
+SELECT count(*) >= 1 AS view_has_pgsleep
+FROM pg_backend_wait_event_trace
+WHERE wait_event = 'PgSleep';
+ view_has_pgsleep
+------------------
+ f
+(1 row)
+
+-- Executor markers bracket the work and carry a non-zero query_id.
+SELECT count(*) >= 1 AS has_exec_markers,
+ coalesce(bool_and(query_id <> 0), true) AS markers_have_query_id
+FROM pg_get_backend_wait_event_trace()
+WHERE wait_event_type = 'Query'
+ AND wait_event IN ('ExecStart', 'ExecEnd');
+ has_exec_markers | markers_have_query_id
+------------------+-----------------------
+ f | t
+(1 row)
+
+-- Clearing orphaned rings is a no-op here (no orphans) but must succeed.
+SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok;
+ clear_orphans_ok
+------------------
+ t
+(1 row)
+
+RESET compute_query_id;
+RESET wait_event_capture;
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 8fa0a6c47fb30..b5e6712a1e83d 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -130,6 +130,10 @@ test: partition_merge partition_split partition_join partition_prune reloptions
# ----------
test: compression compression_lz4 compression_pglz cluster
+# wait_event_timing reads only the calling backend's own stats, so it is
+# safe on its own line; pg_sleep keeps its runtime small.
+test: wait_event_timing
+
# event_trigger depends on create_am and cannot run concurrently with
# any test that runs DDL
# oidjoins is read-only, though, and should run late for best coverage
diff --git a/src/test/regress/sql/wait_event_timing.sql b/src/test/regress/sql/wait_event_timing.sql
new file mode 100644
index 0000000000000..259ffb5f2d920
--- /dev/null
+++ b/src/test/regress/sql/wait_event_timing.sql
@@ -0,0 +1,113 @@
+--
+-- WAIT_EVENT_TIMING
+--
+-- Exercises the wait_event_capture instrumentation: the GUC, the stats
+-- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing
+-- and histogram-buckets views, overflow counters, resets), and the trace
+-- surface (the per-session ring, its readers, and the query markers).
+--
+-- Two expected outputs are maintained:
+-- wait_event_timing.out -- --enable-wait-event-timing builds
+-- wait_event_timing_1.out -- builds without the option (stub)
+-- The difference is whether SET wait_event_capture = stats succeeds and
+-- whether the SRF records anything; durations are never printed, so the
+-- timing-build output is deterministic.
+--
+
+-- Default is off.
+SHOW wait_event_capture;
+
+-- The taxonomy view is pure SQL and identical in both build configs.
+SELECT count(*) AS buckets FROM pg_wait_event_timing_histogram_buckets;
+SELECT bucket_idx, lower_ns, upper_ns, label
+FROM pg_wait_event_timing_histogram_buckets
+WHERE bucket_idx IN (0, 1, 31)
+ORDER BY bucket_idx;
+
+-- Enable stats capture and generate a deterministic wait: pg_sleep emits a
+-- Timeout / PgSleep wait of ~0.1s. (In a stub build the SET errors and the
+-- SRF stays empty; that is the documented difference between the two
+-- expected files.)
+SET wait_event_capture = stats;
+SELECT pg_sleep(0.1);
+
+-- PgSleep must now be recorded for this backend, with the per-event
+-- invariants holding. We print only booleans so the output is stable.
+SELECT calls >= 1 AS calls_ok,
+ calls = (SELECT sum(h) FROM unnest(histogram) AS h) AS hist_sum_eq_calls,
+ total_time_ms > 0 AS total_positive,
+ max_time_us > 0 AS max_positive,
+ array_length(histogram, 1)
+ = (SELECT count(*)::int FROM pg_wait_event_timing_histogram_buckets)
+ AS histogram_len_ok
+FROM pg_stat_get_wait_event_timing(pg_backend_pid())
+WHERE wait_event = 'PgSleep';
+
+-- The view surfaces the same row (type/name only; durations omitted).
+SELECT wait_event_type, wait_event
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+
+-- A non-NULL pid that does not exist yields no rows (silent, not an error).
+SELECT count(*) AS rows_for_bogus_pid
+FROM pg_stat_get_wait_event_timing(-1);
+
+-- Overflow/reset counters for this backend. A simple test backend uses
+-- few LWLock tranches and no out-of-range classes, so both overflow
+-- counters are zero, and a fresh backend has not been reset.
+SELECT lwlock_overflow_count, flat_overflow_count, reset_count
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+
+-- Resetting our own backend is synchronous: the PgSleep row is cleared and
+-- reset_count advances. (We filter to PgSleep because inter-command waits
+-- such as ClientRead may be recorded again before the next statement runs.)
+SELECT pg_stat_reset_wait_event_timing(NULL);
+SELECT count(*) AS pgsleep_rows_after_reset
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+SELECT reset_count
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+
+-- The pid argument defaults to NULL, so a no-argument call resets the
+-- caller's own backend.
+SELECT pg_stat_reset_wait_event_timing();
+
+-- Resetting an unknown pid is a silent no-op, not an error.
+SELECT pg_stat_reset_wait_event_timing(2147483647);
+
+RESET wait_event_capture;
+
+--
+-- Trace level: per-session ring of individual waits + query markers.
+-- (In a stub build SET trace errors and the trace readers stay empty;
+-- that is the documented difference between the two expected files.)
+--
+SET compute_query_id = on;
+SET wait_event_capture = trace;
+SELECT pg_sleep(0.1);
+
+-- PgSleep is recorded in this backend's ring with a positive duration.
+SELECT count(*) >= 1 AS pgsleep_in_ring,
+ coalesce(bool_and(duration_us > 0), false) AS durations_positive
+FROM pg_get_backend_wait_event_trace()
+WHERE wait_event = 'PgSleep';
+
+-- The same records are visible through the view.
+SELECT count(*) >= 1 AS view_has_pgsleep
+FROM pg_backend_wait_event_trace
+WHERE wait_event = 'PgSleep';
+
+-- Executor markers bracket the work and carry a non-zero query_id.
+SELECT count(*) >= 1 AS has_exec_markers,
+ coalesce(bool_and(query_id <> 0), true) AS markers_have_query_id
+FROM pg_get_backend_wait_event_trace()
+WHERE wait_event_type = 'Query'
+ AND wait_event IN ('ExecStart', 'ExecEnd');
+
+-- Clearing orphaned rings is a no-op here (no orphans) but must succeed.
+SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok;
+
+RESET compute_query_id;
+RESET wait_event_capture;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 8cf40c87043f2..fe770fd6086cb 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1609,6 +1609,8 @@ LWLock
LWLockHandle
LWLockMode
LWLockPadded
+LWLockTimingHash
+LWLockTimingHashEntry
LWLockTrancheShmemData
LZ4F_compressionContext_t
LZ4F_decompressOptions_t
@@ -3416,6 +3418,7 @@ WaitEvent
WaitEventActivity
WaitEventBuffer
WaitEventClient
+WaitEventCaptureLevel
WaitEventCustomCounterData
WaitEventCustomEntryByInfo
WaitEventCustomEntryByName
@@ -3423,6 +3426,14 @@ WaitEventIO
WaitEventIPC
WaitEventSet
WaitEventTimeout
+WaitEventTimingControl
+WaitEventTimingEntry
+WaitEventTimingState
+WaitEventTraceControl
+WaitEventTraceRecord
+WaitEventTraceSlot
+WaitEventTraceSlotState
+WaitEventTraceState
WaitLSNProcInfo
WaitLSNResult
WaitLSNState
@@ -3453,6 +3464,7 @@ WalUsage
WalWriteMethod
WalWriteMethodOps
Walfile
+WetValidRecord
WindowAgg
WindowAggPath
WindowAggState