diff --git a/.cirrus.tasks.yml b/.cirrus.tasks.yml index a22cef063f337..5a747254c93ae 100644 --- a/.cirrus.tasks.yml +++ b/.cirrus.tasks.yml @@ -514,12 +514,19 @@ task: # code not being exercised much. Thus specify a very small segment size # here. Use a non-power-of-two segment size, given we currently allow # that. + # --enable-wait-event-timing is tacked on to this entry so the timing + # build path (including the expected output at + # src/test/regress/expected/wait_event_timing.out) actually gets + # exercised by CI; without it, only the stub alt output + # wait_event_timing_1.out is consumed and any regression in the + # timing-enabled code is invisible to upstream. configure_script: | su postgres <<-EOF set -e ./configure \ --enable-cassert --enable-injection-points --enable-debug \ --enable-tap-tests --enable-nls \ + --enable-wait-event-timing \ --with-segsize-blocks=6 \ --with-libnuma \ --with-liburing \ diff --git a/configure b/configure index f66c1054a7a1e..a535703d3a5ce 100755 --- a/configure +++ b/configure @@ -774,6 +774,7 @@ CC enable_injection_points PG_TEST_EXTRA enable_tap_tests +enable_wait_event_timing enable_dtrace DTRACEFLAGS DTRACE @@ -850,6 +851,7 @@ enable_debug enable_profiling enable_coverage enable_dtrace +enable_wait_event_timing enable_tap_tests enable_injection_points with_blocksize @@ -1551,6 +1553,8 @@ Optional Features: --enable-profiling build with profiling enabled --enable-coverage build with coverage testing instrumentation --enable-dtrace build with DTrace support + --enable-wait-event-timing + build with wait event timing instrumentation --enable-tap-tests enable TAP tests (requires Perl and IPC::Run) --enable-injection-points enable injection points (for testing) @@ -3632,6 +3636,34 @@ fi +# +# --enable-wait-event-timing adds wait event timing instrumentation +# + + +# Check whether --enable-wait-event-timing was given. 
+if test "${enable_wait_event_timing+set}" = set; then : + enableval=$enable_wait_event_timing; + case $enableval in + yes) + +$as_echo "#define USE_WAIT_EVENT_TIMING 1" >>confdefs.h + + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --enable-wait-event-timing option" "$LINENO" 5 + ;; + esac + +else + enable_wait_event_timing=no + +fi + + # # TAP tests diff --git a/configure.ac b/configure.ac index 8d176bd3468e9..0d37b77ac53b6 100644 --- a/configure.ac +++ b/configure.ac @@ -225,6 +225,15 @@ fi AC_SUBST(DTRACEFLAGS)]) AC_SUBST(enable_dtrace) +# +# --enable-wait-event-timing adds wait event timing instrumentation +# +PGAC_ARG_BOOL(enable, wait-event-timing, no, + [build with wait event timing instrumentation], + [AC_DEFINE([USE_WAIT_EVENT_TIMING], 1, + [Define to 1 to build with wait event timing. (--enable-wait-event-timing)])]) +AC_SUBST(enable_wait_event_timing) + # # TAP tests # diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 73cc04123303d..d059dc095a2a0 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -9110,6 +9110,209 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; + + wait_event_capture (enum) + + wait_event_capture configuration parameter + + + + + Controls collection of wait event instrumentation data. Requires + the server to be compiled with + . Possible values are + off, stats, and + trace; each level is a strict superset of the + previous one. + + + At stats, the server records per-backend wait + event statistics (counts, total and average durations, log2 + histograms) visible in the + + pg_stat_wait_event_timing view. + Two clock_gettime() calls are added around + every wait event transition, costing approximately + 40–100 ns each on modern hardware. + + + At trace, the server additionally records every + individual wait event into a per-session ring buffer (~4 MB of + DSA per backend, allocated lazily on first enable), exposed via the + + pg_backend_wait_event_trace view. 
+ Each record carries either a wait event or a query-attribution + marker; consumers reconstruct which query owns which wait by + interleaving the two streams. + + + Two marker families are emitted into the ring: + + + + ExecStart/ExecEnd markers + bracket every executor invocation + (ExecutorStart/ExecutorEnd). + They are the primary attribution signal: every executable + statement, including those run inside parallel workers and + pipelined extended-protocol messages, is bracketed. Emission + requires to produce a + non-zero query_id; otherwise the + markers are silently skipped. They are not + gated on track_activities. + + + + + QueryStart/QueryEnd markers + fire at top-level query identifier transitions and at the + transition to idle, providing inter-statement boundaries that + the executor markers cannot (e.g. the + ClientRead wait between statements). They + require both and + to be enabled. + + + + A WARNING is logged at the time + wait_event_capture is set to trace + if either prerequisite is missing. + + + The default is off. Only superusers and users + with the appropriate SET privilege can change + this setting. + + + The setting is gated to superuser by default because + trace mode allocates approximately 4 MB + of dynamic shared memory per backend that enables it; an + unprivileged role enabling trace on every connection in a + large pool could consume substantial cluster-wide memory. + Read access to the resulting statistics is controlled + separately by membership in the + pg_read_all_stats + role (which the pg_monitor role inherits), + so a monitoring operator can typically read + pg_stat_wait_event_timing but cannot + toggle wait_event_capture itself. 
+ + + To delegate the ability to change this setting to a + non-superuser role — for example, the + pg_monitor role in environments where the + cluster owner is not the operator on call — use the + standard PostgreSQL GRANT SET ON PARAMETER + mechanism: + +GRANT SET ON PARAMETER wait_event_capture TO pg_monitor; + + After this, any role that has the pg_monitor + role membership can run + SET wait_event_capture = stats (or + = trace) for its own session. The grant is + per-installation policy rather than baked into the GUC, so + managed-PostgreSQL environments and self-hosted clusters can + choose independently whether monitoring roles should be able to + flip this on. + + + + + + wait_event_timing_max_tranches (integer) + + wait_event_timing_max_tranches configuration parameter + + + + + Sets the maximum number of distinct LWLock tranches whose timing + is recorded individually per backend. PostgreSQL maintains a + per-backend hash table that maps each tranche the backend + encounters to its histogram bucket; once the table fills, further + tranches encountered by that backend are counted against + lwlock_overflow_count in + + pg_stat_wait_event_timing_overflow + and not individually timed. Sized at server start; this + parameter has no effect on builds compiled without + . The default is + 192; raise it if your installation loads many + extensions that register their own LWLock tranches and you + observe non-zero + lwlock_overflow_count. + + + The shared-memory cost is per-backend and proportional to this + setting. Each entry is approximately 152 bytes (an + LWLock-timing histogram), and the slot table that resolves + tranche IDs adds another 4 bytes per slot, with the slot + count rounded up to the next power of two of twice this value. + At default 192 entries (512 slots) the per-backend overhead is + roughly 31 KB; at 512 entries (1024 slots) roughly + 80 KB. 
The total cluster-wide cost is paid only when the + first backend in the cluster sets + wait_event_capture to a non-off + value, and remains allocated for the postmaster's lifetime + regardless of subsequent GUC changes. Builds compiled without + --enable-wait-event-timing pay zero memory for + this setting. + + + This parameter can only be set at server start. It cannot be + changed with SET in a running session, so the + SET privilege does not apply to it. + + + + + + wait_event_trace_ring_size_kb (integer) + + wait_event_trace_ring_size_kb configuration parameter + + + + + Per-backend size, in kilobytes, of the wait-event-trace ring + buffer allocated when a session sets + wait_event_capture to + trace. Must be a power of two. Sized at + server start (PGC_POSTMASTER); all rings in + a given postmaster run have the same size. This parameter has + no effect on builds compiled without + --enable-wait-event-timing. + + + Each record is 32 bytes, so the record count is the kilobyte + value times 32. The default of 4096 KB + (= 131072 records, ~4 MB) gives roughly 0.5–1 + second of retention at peak wait-event rates of 200K/s. + Larger values give longer retention before the FIFO wrap + overwrites the oldest records; smaller values reduce + per-backend memory at high max_connections. + Allowed range is 8–32768 + KB (256 records to ~1 million records per ring). + + + Worst-case total memory is approximately + max_connections * + wait_event_trace_ring_size_kb, allocated + lazily from a cluster-wide DSA only as backends enable + wait_event_capture = trace. + Memory is reclaimed when backends exit and their slots are + recycled, or explicitly via + pg_stat_clear_orphaned_wait_event_rings. + + + This parameter can only be set at server start. It cannot be + changed with SET in a running session, so the + SET privilege does not apply to it. 
+ + + + track_functions (enum) diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml index b345a1056740a..9ddf46328e2c7 100644 --- a/doc/src/sgml/installation.sgml +++ b/doc/src/sgml/installation.sgml @@ -1594,6 +1594,33 @@ build-postgresql: + + + + + Compiles in per-backend wait event timing instrumentation. + When enabled, every call to + pgstat_report_wait_start()/pgstat_report_wait_end() + records the wait duration and accumulates per-event statistics + (count, total time, histogram) in shared memory. + The overhead is two clock_gettime(CLOCK_MONOTONIC) + calls per wait event transition (~40–100 ns via VDSO). + When not compiled in, the wait_event_capture + GUC still exists but only accepts off, and the + SQL functions return empty result sets. + The compile flag allocates approximately 120 KB of shared + memory per backend slot for timing statistics (regardless of GUC + setting). At max_connections = 200 + this is roughly 26 MB; at 1000 it is roughly 120 MB. + Trace ring buffers are allocated lazily via DSA only when + wait_event_capture is set to + trace (~4 MB per traced backend). + See for the runtime + control. + + + + diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 08d5b8245529f..5f12b700700b1 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -551,6 +551,24 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser + + pg_stat_wait_event_timingpg_stat_wait_event_timing + One row per backend per wait event, showing accumulated timing + statistics. See + + pg_stat_wait_event_timing for details. + + + + + pg_backend_wait_event_tracepg_backend_wait_event_trace + Individual wait event records from the current backend's trace + ring buffer. See + + pg_backend_wait_event_trace for details. 
+ + + @@ -3699,6 +3717,603 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + <structname>pg_stat_wait_event_timing</structname> + + + pg_stat_wait_event_timing + + + + The pg_stat_wait_event_timing view contains one + row for each combination of backend and wait event that has a non-zero + call count. It shows accumulated timing statistics collected when + wait_event_capture is set to stats + or trace. Requires the server to be compiled with + --enable-wait-event-timing. + + + + <structname>pg_stat_wait_event_timing</structname> View + + + + + Column Type + + + Description + + + + + + + + pid integer + + + Process ID of the backend + + + + + + backend_type text + + + Type of the backend (e.g. client backend, + checkpointer, walwriter) + + + + + + procnumber integer + + + Internal slot number (0-based process number). Suitable for + passing directly to pg_get_wait_event_trace. + + + + + + wait_event_type text + + + Wait event type (e.g. IO, LWLock, + Timeout) + + + + + + wait_event text + + + Wait event name (e.g. DataFileRead, + WALWrite, PgSleep) + + + + + + calls bigint + + + Number of times this wait event occurred + + + + + + total_time_ms double precision + + + Total time spent in this wait event, in milliseconds + + + + + + avg_time_us double precision + + + Average wait duration, in microseconds + + + + + + max_time_us double precision + + + Maximum single wait duration, in microseconds + + + + + + histogram bigint[] + + + Log2 histogram of wait durations with 32 buckets. Bin edges are + powers of two on the nanosecond axis: bucket 0 covers + [0, 1024) ns, bucket k covers + [2^(k+9), + 2^(k+10)) ns, and the last bucket covers + [2^40, ∞) ns. The boundaries approximate the + decimal-microsecond grid (1024 ns ≈ 1 μs, 2048 ns ≈ + 2 μs, ..., 2^40 ns ≈ 1024 s); the exact edges are chosen + to let the hot path skip a division by 1000. 
The + + pg_wait_event_timing_histogram_buckets + view provides the numeric bin edges and human-readable labels for + each index; the canonical join pattern is: + +SELECT w.wait_event, b.label, h.count +FROM pg_stat_wait_event_timing w, + LATERAL unnest(w.histogram) WITH ORDINALITY AS h(count, idx) +JOIN pg_wait_event_timing_histogram_buckets b ON b.bucket_idx = h.idx - 1 +WHERE w.wait_event = 'PgSleep' +ORDER BY b.bucket_idx; + + + + + + +
+
+ + + <structname>pg_wait_event_timing_histogram_buckets</structname> + + + pg_wait_event_timing_histogram_buckets + + + + The pg_wait_event_timing_histogram_buckets + view describes the 32 bins used by the + histogram column of + + pg_stat_wait_event_timing. It always + contains 32 rows in ascending order of + bucket_idx, and is independent of runtime + state; a join against it attaches numeric bin edges and human + labels to any histogram array. Bins are powers of two on the + nanosecond axis: bin 0 covers [0, 1us), each + subsequent bin doubles its lower edge, and the final bin + (bucket_idx = 31) is open-ended at + approximately 1024 seconds. + + + + The 32-bin layout (rather than the more common 16-bin choice for + log-scale histograms) is deliberate: real-world wait-event + distributions have long tails routinely extending past 16 ms + into multi-second territory (slow-disk + DataFileRead, lock contention waits, replication + apply waits, vacuum waits). A 16-bin histogram would collapse all + of those into a single overflow bin, hiding the very signal that + wait-event timing exists to surface. The 32-bin layout keeps the + long tail individually addressable up to about 17 minutes + before the open-ended bin; single waits beyond that belong in + auto_explain + or pg_stat_activity, not a histogram. + + + + <structname>pg_wait_event_timing_histogram_buckets</structname> View + + + + + Column Type + + + Description + + + + + + + + bucket_idx integer + + + Zero-based bin index (0–31). Equals the one-based subscript into the + histogram array of + pg_stat_wait_event_timing minus 1. + + + + + + lower_ns bigint + + + Inclusive lower edge of this bin in nanoseconds. + + + + + + upper_ns bigint + + + Exclusive upper edge of this bin in nanoseconds, or + NULL for the final bin which extends to + infinity. + + + + + + label text + + + Short human-readable label for the bin (e.g. 
+ <1us, 1-2us, + >=1024s), expressed on the approximate + decimal-microsecond grid the bin edges are aligned to. 
+ + + + +
+
+ + + <structname>pg_stat_wait_event_timing_overflow</structname> + + + pg_stat_wait_event_timing_overflow + + + + The pg_stat_wait_event_timing_overflow view + exposes per-backend truncation counters for the wait-event timing + subsystem. Each backend owns a bounded LWLock timing hash + (192 tranches by default) and a bounded flat event array; events that cannot + be mapped to a slot are counted here. A non-zero value means the + corresponding row(s) in + + pg_stat_wait_event_timing + are incomplete for that backend. Requires the server to be + compiled with --enable-wait-event-timing. + + + + <structname>pg_stat_wait_event_timing_overflow</structname> View + + + + + Column Type + + + Description + + + + + + + + pid integer + + + Process ID of the backend + + + + + + backend_type text + + + Type of the backend (e.g. client backend, + checkpointer, walwriter) + + + + + + procnumber integer + + + Internal slot number (0-based process number). Suitable for + passing directly to pg_get_wait_event_trace. + + + + + + lwlock_overflow_count bigint + + + Number of LWLock wait events dropped because the per-backend + LWLock timing hash was already full (more distinct tranches + observed in this session than + wait_event_timing_max_tranches allows). + Zero means no LWLock truncation. A one-time + WARNING is also emitted to the server log on + first overflow. If you see this counter rising, raise + wait_event_timing_max_tranches at server + start (the per-backend memory cost is proportional and + described under that GUC). + + + + + + flat_overflow_count bigint + + + Number of non-LWLock wait events dropped because the event + could not be mapped to a known class / index. This almost + always indicates a code path emitting a wait event of a class + the timing infrastructure was not compiled for; it should be + zero in supported builds. + + + + + + reset_count bigint + + + Number of resets this backend has observed and acted + on; not a request counter. 
Own-backend resets via + pg_stat_reset_wait_event_timing(NULL) (or + passing the caller's own PID) are synchronous and bump this + column once per call. Cross-backend reset requests + coalesce: if several + pg_stat_reset_wait_event_timing(pid) + calls land between two of the target's wait events, the target + observes them as a single reset and increments + reset_count only once. Callers + polling for asynchronous-reset acknowledgment should watch for + any N → N+1 transition. + + + + +
+
+ + + <structname>pg_backend_wait_event_trace</structname> + + + pg_backend_wait_event_trace + + + + The pg_backend_wait_event_trace view shows + individual wait event records from the current backend's + trace ring buffer. Each record captures either a single wait event + (with timestamp and duration) or a query-attribution marker. Two + marker families exist: ExecStart/ExecEnd + bracket every executor invocation, and + QueryStart/QueryEnd mark + top-level query-id transitions and the transition to idle. See + for the gating rules of + each marker family. + Requires to be set to + trace. The ring buffer holds up to + kilobytes of + records (default 4096 KB = 131072 records of 32 bytes each); + older records are overwritten in FIFO order. The view is session-local + and analogous in scope to + + pg_backend_memory_contexts; querying it + from a superuser session still returns only that session's own + records, never another backend's. + + + + The pg_backend_wait_event_trace view is + intended for session-local interactive diagnostics: + running ad-hoc SELECT queries against your own + session's trace from psql while + investigating wait-event behaviour. The view materialises up to + one ring's worth of records (default ~4 MB, controlled by + ) into a + tuplestore on each call, which is bounded and acceptable for that + use; for narrow result sets, append + ORDER BY seq DESC LIMIT N + to get the most recent records. + + + + Cross-backend monitoring tools — extensions and background + workers that read wait events losslessly from every backend's + ring — should not consume through this + view. The in-tree cross-backend reader is + pg_get_wait_event_trace + (see ); the underlying + per-session SQL function returns only the calling backend's own + ring, so a background worker invoking + SELECT * FROM pg_backend_wait_event_trace via + SPI would receive only its own (typically empty) ring, not the + target backend's data. 
External tools that need cross-backend + access without going through SQL use the shared-memory snapshot + pattern documented on + WaitEventTraceControl in + src/include/utils/wait_event_timing.h: + snapshot trace_slots[procNumber].generation, + acquire WaitEventTraceCtl->lock in + LW_SHARED, resolve the target slot's + ring_ptr via + dsa_get_address, snapshot the relevant slice + of the ring into local memory, release the lock, re-snapshot + generation and discard the read if it + changed, then process the snapshot off the lock. That bypasses + this view entirely and is the supported cross-backend interface + for monitoring extensions. + + + + Slot lifecycle. Per-backend trace rings are + not freed when their owner backend exits. The ring stays + allocated in shared memory in an orphaned state + so the dying backend's final waits remain readable by the + cross-backend interface — + pg_get_wait_event_trace + (see ) for in-tree + access, or external background workers that follow the + snapshot pattern documented above. + This does not change the behaviour of this view, + which always reads the calling backend's own ring and is + unaffected by orphan-state slots belonging to other + procnumbers. The lifecycle change matters for short-lived + backends that exit before any monitoring tool has read their + data: parallel workers in particular exit in milliseconds at + end-of-parallel-query, well below typical reader polling + intervals, and without orphan-persistence their final waits + would be lost. Orphaned rings are reclaimed automatically when a new + backend takes over the same procNumber + slot, and the DBA can force a sweep at any time via + pg_stat_clear_orphaned_wait_event_rings. + The worst-case orphan-memory footprint is bounded by the slot + count times ~4 MB; see + pg_stat_clear_orphaned_wait_event_rings + under for details and + the deployment patterns where the function is most useful. 
+ + + + The ring buffer is designed as a lock-free transport mechanism for + external consumption. At high wait event rates (e.g., 220K events/sec), + the ring wraps in roughly 0.5–1 seconds. External consumers + (background workers, extensions) can attribute events to queries by + scanning for ExecStart markers (or, when the + executor markers are unavailable, QueryStart); if + both have been overwritten, events before the next visible marker are + unattributed. Consumers should poll the ring buffer before it wraps + and can use st_query_id from + PgBackendStatus as a fallback for the current + query context. + + + + The seq column is the absolute write + position of each record; it is monotonically increasing and never + resets while the ring is alive. A consumer polling the ring + repeatedly can detect wraparound losses by tracking + max(seq) between successive scrapes: given two + consecutive polls returning N2 rows with + maximum seq values + S1 (previous poll) and + S2 (current poll), the number of records + overwritten before the second poll could read them is + max(0, (S2 - S1) - N2). No separate + trace overflow counter is exposed because this + information is exact and derivable from seq + alone. + + + + QueryStart/QueryEnd markers are + emitted as matched pairs around each protocol phase that touches a + query_id. In simple protocol that is one + pair per statement. In extended protocol there is one pair around + each of Parse, Bind, and + Execute for the same + query_id — so a single parameterized + statement produces three nested pairs, plus the surrounding + ExecStart/ExecEnd pair from the + executor. This per-phase pairing lets consumers measure how much + time a query spent in each protocol phase (parse vs. bind vs. + execute) by computing the duration between each pair, and lets a + total-time-per-query rollup be expressed as the sum of pair + durations rather than a single subtraction. 
Consumers that just want + "how long did this query take in the executor" should use the + ExecStart/ExecEnd pair, which + fires exactly once per statement regardless of protocol. + + + + <structname>pg_backend_wait_event_trace</structname> View + + + + + Column Type + + + Description + + + + + + + + seq bigint + + + Sequence number of this record in the ring buffer + + + + + + timestamp_ns bigint + + + Monotonic clock timestamp in nanoseconds + + + + + + wait_event_type text + + + Wait event type, or Query for query markers + + + + + + wait_event text + + + Wait event name, or one of ExecStart, + ExecEnd, QueryStart, + QueryEnd for query-attribution markers. + + + + + + duration_us double precision + + + Wait duration in microseconds (0 for query markers) + + + + + + query_id bigint + + + Query identifier for query markers (0 for wait events) + + + + + +
+
+ <structname>pg_stat_database</structname> @@ -5736,6 +6351,208 @@ description | Waiting for a newly initialized WAL file to reach durable storage
+ + + + pg_stat_get_wait_event_timing + + pg_stat_get_wait_event_timing () + setof record + + + Returns one row for each combination of backend and wait event with + non-zero counts. Output columns include pid, + backend_type, event identity, timing + statistics, and a log2 histogram. Unprivileged users see only their + own backend. Superusers and members of + pg_read_all_stats see all backends. + Requires . + + + + + + + pg_get_backend_wait_event_trace + + pg_get_backend_wait_event_trace () + setof record + + + Returns individual wait event records from the current session's + trace ring buffer. For another session's ring (live or + post-mortem orphaned), use + pg_get_wait_event_trace below. + + + + + + + pg_get_wait_event_trace + + pg_get_wait_event_trace ( procnumber integer ) + setof record + + + Returns individual wait event records from the trace ring of + the backend that currently or previously occupied the slot + identified by procnumber. Reads slots + in OWNED state (live writer) and + ORPHANED state (writer has exited but the + ring is preserved for post-mortem reading) uniformly. An + empty result indicates the slot is in FREE + state (no ring) or no records have been written. Concurrent + slot transitions cannot interrupt the read because the + function holds the cross-backend trace lock in + SHARED mode throughout the iteration; the + per-record seqlock protocol skips any record being written + by a concurrent live writer. + + + This is the canonical cross-backend reader. External + monitoring extensions that need cross-backend access without + going through SQL should follow the same snapshot pattern + documented on WaitEventTraceControl + in src/include/utils/wait_event_timing.h; + this function serves as both the reference implementation and + a DBA-facing diagnostic tool. The + procnumber argument can be obtained + from the procnumber column of + pg_stat_get_wait_event_timing or + pg_stat_get_wait_event_timing_overflow + for live backends. 
For post-mortem reads of short-lived + backends (parallel workers, autovacuum, walsender) the + procnumber must be captured while the + backend is still alive, or discovered by iterating slots in a + monitoring background worker. A pid-keyed lookup for live + backends only is one query away: + + +SELECT * FROM pg_get_wait_event_trace( + (SELECT procnumber FROM pg_stat_get_wait_event_timing(target_pid) + WHERE pid = target_pid LIMIT 1)); + + + + Requires membership in pg_read_all_stats + (matching the privilege model of the session-local view + pg_backend_wait_event_trace). + + + + + + + pg_stat_get_wait_event_timing_overflow + + pg_stat_get_wait_event_timing_overflow () + setof record + + + Returns one row per live backend with per-backend truncation + counters for the wait event timing subsystem. Use this view to + confirm that + + pg_stat_wait_event_timing + rows for a backend are complete rather than truncated. + Unprivileged users see only their own backend; superusers and + members of pg_read_all_stats see all + backends. Requires . + + + + + + + pg_stat_reset_wait_event_timing + + pg_stat_reset_wait_event_timing ( pid integer DEFAULT NULL ) + void + + + Resets wait event timing counters for a single backend, identified + by its process ID (see pid in + + pg_stat_activity). + Passing NULL (or the caller's own + pg_backend_pid()) resets the current session; + any user may do this. Passing any other PID resets that backend + and requires membership in the + pg_signal_backend + role — the same role required by + pg_stat_reset_backend_stats, + pg_terminate_backend, and + pg_cancel_backend. Unknown or + already-exited PIDs are silent no-ops, matching the behavior of + pg_stat_reset_backend_stats. + + + + + + + pg_stat_reset_wait_event_timing_all + + pg_stat_reset_wait_event_timing_all () + void + + + Resets wait event timing counters for every backend in the + cluster. Requires superuser. 
This is intentionally stricter + than the per-backend variant + pg_stat_reset_wait_event_timing(pid), + which only requires pg_signal_backend: the + cluster-wide form has unbounded blast radius (it affects every + backend in a single call) and would erase forensic patterns + that span multiple backends, so it is gated to the cluster + owner. Returns before the resets have been observed by their + target backends; callers that need strict read-after-reset + semantics should poll each target's + reset_count column. + + + + + + + pg_stat_clear_orphaned_wait_event_rings + + pg_stat_clear_orphaned_wait_event_rings () + bigint + + + Frees every wait-event-trace ring whose owner backend has + exited. Returns the number of rings released. Requires + superuser. + + + When a backend that had wait_event_capture = + trace exits, its ~4 MB trace ring is + intentionally not freed at exit so that + cross-backend consumers + (pg_get_wait_event_trace and extensions + following the snapshot pattern) can still read the dying + backend's final waits. The + memory is reclaimed lazily: in the common case, the ring is + freed automatically when a new backend takes over the same + procNumber slot. This function is the + explicit DBA-driven sweep for the pathological case where + capture was briefly enabled, then disabled, on a cluster with + long-lived pooled connections that never recycle the + procNumber. The maximum amount of memory + this can release is bounded by the slot count times the + per-ring size (~400 MB at max_connections + = 100, ~4 GB at 1000); on most deployments the function will + report 0 because connection churn already drained orphans + naturally. + + + Safe to call when capture is currently off + and even when no orphans exist (returns 0 in both cases). 
+ + + diff --git a/meson.build b/meson.build index 20b887f1a1bc1..f786901189660 100644 --- a/meson.build +++ b/meson.build @@ -505,6 +505,7 @@ meson_bin = find_program(meson_binpath, native: true) cdata.set('USE_ASSERT_CHECKING', get_option('cassert') ? 1 : false) cdata.set('USE_INJECTION_POINTS', get_option('injection_points') ? 1 : false) +cdata.set('USE_WAIT_EVENT_TIMING', get_option('wait_event_timing') ? 1 : false) blocksize = get_option('blocksize').to_int() * 1024 diff --git a/meson_options.txt b/meson_options.txt index 6a793f3e47943..1f191d3a9d621 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -40,6 +40,9 @@ option('pgport', type: 'integer', value: 5432, option('cassert', type: 'boolean', value: false, description: 'Enable assertion checks (for debugging)') +option('wait_event_timing', type: 'boolean', value: false, + description: 'Enable wait event timing instrumentation') + option('tap_tests', type: 'feature', value: 'auto', description: 'Enable TAP tests') diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 73a1c1c46703a..0fd75f8289abf 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1556,3 +1556,106 @@ CREATE VIEW pg_aios AS SELECT * FROM pg_get_aios(); REVOKE ALL ON pg_aios FROM PUBLIC; GRANT SELECT ON pg_aios TO pg_read_all_stats; + +-- Taxonomy for the histogram column on pg_stat_wait_event_timing. The +-- histogram array has one entry per bucket, in ascending order. This +-- view names them so callers do not have to memorise the layout; join +-- against it via unnest(histogram) WITH ORDINALITY. +-- +-- WARNING: keep this list in lock-step with WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS +-- and wait_event_timing_bucket() in src/backend/utils/activity/wait_event_timing.c. +-- Bin edges are powers of two in nanoseconds; labels are the approximate +-- decimal-microsecond grid documented in src/include/utils/wait_event_timing.h. 
+CREATE VIEW pg_wait_event_timing_histogram_buckets AS + SELECT bucket_idx, lower_ns, upper_ns, label + FROM (VALUES + ( 0, 0::bigint, 1024::bigint, '<1us'::text), + ( 1, 1024::bigint, 2048::bigint, '1-2us'), + ( 2, 2048::bigint, 4096::bigint, '2-4us'), + ( 3, 4096::bigint, 8192::bigint, '4-8us'), + ( 4, 8192::bigint, 16384::bigint, '8-16us'), + ( 5, 16384::bigint, 32768::bigint, '16-32us'), + ( 6, 32768::bigint, 65536::bigint, '32-64us'), + ( 7, 65536::bigint, 131072::bigint, '64-128us'), + ( 8, 131072::bigint, 262144::bigint, '128-256us'), + ( 9, 262144::bigint, 524288::bigint, '256-512us'), + (10, 524288::bigint, 1048576::bigint, '512us-1ms'), + (11, 1048576::bigint, 2097152::bigint, '1-2ms'), + (12, 2097152::bigint, 4194304::bigint, '2-4ms'), + (13, 4194304::bigint, 8388608::bigint, '4-8ms'), + (14, 8388608::bigint, 16777216::bigint, '8-16ms'), + (15, 16777216::bigint, 33554432::bigint, '16-32ms'), + (16, 33554432::bigint, 67108864::bigint, '32-64ms'), + (17, 67108864::bigint, 134217728::bigint, '64-128ms'), + (18, 134217728::bigint, 268435456::bigint, '128-256ms'), + (19, 268435456::bigint, 536870912::bigint, '256-512ms'), + (20, 536870912::bigint, 1073741824::bigint, '512ms-1s'), + (21, 1073741824::bigint, 2147483648::bigint, '1-2s'), + (22, 2147483648::bigint, 4294967296::bigint, '2-4s'), + (23, 4294967296::bigint, 8589934592::bigint, '4-8s'), + (24, 8589934592::bigint, 17179869184::bigint, '8-16s'), + (25, 17179869184::bigint, 34359738368::bigint, '16-32s'), + (26, 34359738368::bigint, 68719476736::bigint, '32-64s'), + (27, 68719476736::bigint, 137438953472::bigint, '64-128s'), + (28, 137438953472::bigint, 274877906944::bigint, '128-256s'), + (29, 274877906944::bigint, 549755813888::bigint, '256-512s'), + (30, 549755813888::bigint, 1099511627776::bigint, '512s-1024s'), + (31, 1099511627776::bigint, NULL::bigint, '>=1024s') + ) AS t(bucket_idx, lower_ns, upper_ns, label); + +CREATE VIEW pg_stat_wait_event_timing AS + SELECT + t.pid, + t.backend_type, + 
t.procnumber, + t.wait_event_type, + t.wait_event, + t.calls, + t.total_time_ms, + t.avg_time_us, + t.max_time_us, + t.histogram + FROM pg_stat_get_wait_event_timing(NULL) t; +REVOKE ALL ON pg_stat_wait_event_timing FROM PUBLIC; +GRANT SELECT ON pg_stat_wait_event_timing TO pg_read_all_stats; + +CREATE VIEW pg_stat_wait_event_timing_overflow AS + SELECT + t.pid, + t.backend_type, + t.procnumber, + t.lwlock_overflow_count, + t.flat_overflow_count, + t.reset_count + FROM pg_stat_get_wait_event_timing_overflow(NULL) t; +REVOKE ALL ON pg_stat_wait_event_timing_overflow FROM PUBLIC; +GRANT SELECT ON pg_stat_wait_event_timing_overflow TO pg_read_all_stats; + + +-- Session-local view: mirrors pg_backend_memory_contexts in both naming +-- and access control. The SRF is hardcoded to the caller's own ring, +-- so a non-superuser only ever sees their own session's data; but as +-- with pg_backend_memory_contexts, the row contents (query_id values +-- joinable against pg_stat_statements, per-event timings) are +-- information that ordinary roles should not see across SECURITY +-- DEFINER call chains. Lock the view to pg_read_all_stats to match +-- the precedent set in commit f8a2afa12 (PG 17) for the namesake view. +CREATE VIEW pg_backend_wait_event_trace AS + SELECT + t.seq, + t.timestamp_ns, + t.wait_event_type, + t.wait_event, + t.duration_us, + t.query_id + FROM pg_get_backend_wait_event_trace() t; +REVOKE ALL ON pg_backend_wait_event_trace FROM PUBLIC; +GRANT SELECT ON pg_backend_wait_event_trace TO pg_read_all_stats; + +-- Cross-backend trace ring reader. Keyed by procnumber (reads OWNED +-- and ORPHANED slots uniformly so post-mortem data from short-lived +-- backends remains observable). Same privilege model as the +-- session-local view above: REVOKE'd from PUBLIC and GRANT'ed to +-- pg_read_all_stats. 
+REVOKE EXECUTE ON FUNCTION pg_get_wait_event_trace(int4) FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_wait_event_trace(int4) TO pg_read_all_stats; diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 4b30f7686801a..7f03c6875140f 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -57,6 +57,7 @@ #include "parser/parse_relation.h" #include "pgstat.h" #include "rewrite/rewriteHandler.h" +#include "utils/wait_event_timing.h" #include "tcop/utility.h" #include "utils/acl.h" #include "utils/backend_status.h" @@ -133,6 +134,8 @@ ExecutorStart(QueryDesc *queryDesc, int eflags) */ pgstat_report_query_id(queryDesc->plannedstmt->queryId, false); + wait_event_trace_exec_start(queryDesc->plannedstmt->queryId); + if (ExecutorStart_hook) (*ExecutorStart_hook) (queryDesc, eflags); else @@ -476,6 +479,8 @@ standard_ExecutorFinish(QueryDesc *queryDesc) void ExecutorEnd(QueryDesc *queryDesc) { + wait_event_trace_exec_end(queryDesc->plannedstmt->queryId); + if (ExecutorEnd_hook) (*ExecutorEnd_hook) (queryDesc); else diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index 9803a0ee2a141..50e27fb4f702b 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -26,6 +26,7 @@ #include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/wait_event.h" +#include "utils/wait_event_timing.h" static void ShutdownAuxiliaryProcess(int code, Datum arg); @@ -113,6 +114,11 @@ AuxiliaryProcessMainCommon(void) */ CreateAuxProcessResourceOwner(); +#ifdef USE_WAIT_EVENT_TIMING + /* Attach trace ring if wait_event_capture = trace was set via postgresql.conf */ + if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && my_trace_proc_number >= 0) + wait_event_trace_attach(my_trace_proc_number); +#endif /* Initialize backend status information */ pgstat_beinit(); diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 
1ac25068d62f2..b68ea684c5f42 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -57,6 +57,7 @@ #include "utils/timeout.h" #include "utils/timestamp.h" #include "utils/wait_event.h" +#include "utils/wait_event_timing.h" /* GUC variables */ int DeadlockTimeout = 1000; @@ -541,6 +542,7 @@ InitProcess(void) /* now that we have a proc, report wait events to shared memory */ pgstat_set_wait_event_storage(&MyProc->wait_event_info); + pgstat_set_wait_event_timing_storage(MyProcNumber); /* * We might be reusing a semaphore that belonged to a failed process. So @@ -713,6 +715,7 @@ InitAuxiliaryProcess(void) /* now that we have a proc, report wait events to shared memory */ pgstat_set_wait_event_storage(&MyProc->wait_event_info); + pgstat_set_wait_event_timing_storage(MyProcNumber); /* Check that group locking fields are in a proper initial state. */ Assert(MyProc->lockGroupLeader == NULL); @@ -1003,6 +1006,7 @@ ProcKill(int code, Datum arg) */ SwitchBackToLocalLatch(); pgstat_reset_wait_event_storage(); + pgstat_reset_wait_event_timing_storage(); proc = MyProc; MyProc = NULL; @@ -1068,6 +1072,7 @@ AuxiliaryProcKill(int code, Datum arg) /* look at the equivalent ProcKill() code for comments */ SwitchBackToLocalLatch(); pgstat_reset_wait_event_storage(); + pgstat_reset_wait_event_timing_storage(); proc = MyProc; MyProc = NULL; diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index dbef734a93f15..38b7606a0c852 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -78,6 +78,7 @@ #include "tcop/tcopprot.h" #include "tcop/utility.h" #include "utils/guc_hooks.h" +#include "utils/wait_event_timing.h" #include "utils/injection_point.h" #include "utils/lsyscache.h" #include "utils/memutils.h" @@ -1423,6 +1424,19 @@ exec_parse_message(const char *query_string, /* string to execute */ */ debug_query_string = query_string; + /* + * In pipelined extended protocol, a Parse can arrive while the previous + * 
query's st_query_id is still set and st_state is still RUNNING (no + * Sync->idle between queries, so send_ready_for_query has not yet + * emitted the prior QUERY_END marker). Flush the prior id with + * force=true so the QUERY_END marker fires before pgstat_report_activity + * below silently zeros st_query_id. Skip when st_state != RUNNING: + * coming from idle means send_ready_for_query has already emitted the + * QUERY_END for whatever residual st_query_id remains, and re-emitting + * here would double-count. + */ + if (MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING) + pgstat_report_query_id(0, true); pgstat_report_activity(STATE_RUNNING, query_string); set_ps_display("PARSE"); @@ -1692,6 +1706,12 @@ exec_bind_message(StringInfo input_message) */ debug_query_string = psrc->query_string; + /* See exec_parse_message for rationale. In particular, the state + * gate prevents a duplicate QUERY_END when this Bind is the first + * message after a Sync->idle transition (where send_ready_for_query + * has already emitted QUERY_END for any residual st_query_id). */ + if (MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING) + pgstat_report_query_id(0, true); pgstat_report_activity(STATE_RUNNING, psrc->query_string); foreach(lc, psrc->query_list) @@ -2183,6 +2203,14 @@ exec_execute_message(const char *portal_name, long max_rows) */ debug_query_string = sourceText; + /* See exec_parse_message. Closes the per-phase + * QUERY_START..QUERY_END pair from the preceding Bind (or from the + * prior pipelined Execute) so trace consumers see balanced markers + * across Parse/Bind/Execute. State gate avoids a duplicate + * QUERY_END when this Execute is the first message after a + * Sync->idle transition. 
*/ + if (MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING) + pgstat_report_query_id(0, true); pgstat_report_activity(STATE_RUNNING, sourceText); foreach(lc, portal->stmts) @@ -4654,6 +4682,18 @@ PostgresMain(const char *dbname, const char *username) */ if (send_ready_for_query) { + /* + * Emit QUERY_END trace marker before going idle so that + * idle waits (ClientRead etc.) are not attributed to the + * finished query. + */ + { + volatile PgBackendStatus *beentry = MyBEEntry; + + if (beentry != NULL && beentry->st_query_id != 0) + wait_event_trace_query_end(beentry->st_query_id); + } + if (IsAbortedTransactionBlockState()) { set_ps_display("idle in transaction (aborted)"); diff --git a/src/backend/utils/.gitignore b/src/backend/utils/.gitignore index fa9cfb39693db..5051e36d1f01f 100644 --- a/src/backend/utils/.gitignore +++ b/src/backend/utils/.gitignore @@ -7,4 +7,5 @@ /errcodes.h /pgstat_wait_event.c /wait_event_funcs_data.c +/wait_event_timing_data.h /wait_event_types.h diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile index 81b4a956bda3f..5c11d8294f01a 100644 --- a/src/backend/utils/Makefile +++ b/src/backend/utils/Makefile @@ -43,7 +43,7 @@ generated-header-symlinks: $(top_builddir)/src/include/utils/header-stamp submak submake-adt-headers: $(MAKE) -C adt jsonpath_gram.h -$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h +$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h # fmgr-stamp records the last time we ran Gen_fmgrtab.pl. 
We don't rely on # the timestamps of the individual output files, because the Perl script @@ -60,6 +60,7 @@ guc_tables.inc.c: $(top_srcdir)/src/backend/utils/misc/guc_parameters.dat $(top_ pgstat_wait_event.c: wait_event_types.h wait_event_funcs_data.c: wait_event_types.h +wait_event_timing_data.h: wait_event_types.h wait_event_types.h: $(top_srcdir)/src/backend/utils/activity/wait_event_names.txt $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl $(PERL) $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl --code $< @@ -79,8 +80,8 @@ endif # These generated headers must be symlinked into src/include/. # We use header-stamp to record that we've done this because the symlinks # themselves may appear older than fmgr-stamp. -$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h - cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h; do \ +$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h + cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h; do \ rm -f $$file && $(LN_S) "../../../$(subdir)/$$file" . 
; \ done touch $@ @@ -99,4 +100,4 @@ uninstall-data: clean: rm -f probes.h probes.h.tmp rm -f fmgroids.h fmgrprotos.h fmgrtab.c fmgr-stamp errcodes.h guc_tables.inc.c - rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c + rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile index ca3ef89bf5997..60154d8055780 100644 --- a/src/backend/utils/activity/Makefile +++ b/src/backend/utils/activity/Makefile @@ -35,10 +35,12 @@ OBJS = \ pgstat_wal.o \ pgstat_xact.o \ wait_event.o \ - wait_event_funcs.o + wait_event_funcs.o \ + wait_event_timing.o # Force these dependencies to be known even without dependency info built: wait_event.o: wait_event.c $(top_builddir)/src/backend/utils/pgstat_wait_event.c wait_event_funcs.o: wait_event_funcs.c $(top_builddir)/src/backend/utils/wait_event_funcs_data.c +wait_event_timing.o: wait_event_timing.c $(top_builddir)/src/backend/utils/wait_event_timing_data.h include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c index d685fc5cd87c0..a97f554704af5 100644 --- a/src/backend/utils/activity/backend_status.c +++ b/src/backend/utils/activity/backend_status.c @@ -22,6 +22,7 @@ #include "storage/shmem.h" #include "storage/subsystems.h" #include "utils/ascii.h" +#include "utils/wait_event_timing.h" #include "utils/guc.h" /* for application_name */ #include "utils/memutils.h" @@ -670,6 +671,18 @@ pgstat_report_query_id(int64 query_id, bool force) if (beentry->st_query_id != INT64CONST(0) && !force) return; + /* + * Emit trace markers for query-to-query transitions. QUERY_END fires + * here when st_query_id transitions from one non-zero value to another + * (multi-statement simple protocol, pipelined extended protocol). 
+ * The last-query-to-idle QUERY_END is emitted separately in + * PostgresMain() at send_ready_for_query. + */ + if (beentry->st_query_id != 0 && beentry->st_query_id != query_id) + wait_event_trace_query_end(beentry->st_query_id); + if (query_id != 0 && query_id != beentry->st_query_id) + wait_event_trace_query_start(query_id); + /* * Update my status entry, following the protocol of bumping * st_changecount before and after. We use a volatile pointer here to diff --git a/src/backend/utils/activity/generate-wait_event_types.pl b/src/backend/utils/activity/generate-wait_event_types.pl index d39a30d04783d..f3f1f107a4c04 100644 --- a/src/backend/utils/activity/generate-wait_event_types.pl +++ b/src/backend/utils/activity/generate-wait_event_types.pl @@ -5,6 +5,7 @@ # - wait_event_types.h (if --code is passed) # - pgstat_wait_event.c (if --code is passed) # - wait_event_funcs_data.c (if --code is passed) +# - wait_event_timing_data.h (if --code is passed) # - wait_event_types.sgml (if --docs is passed) # # Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group @@ -269,17 +270,195 @@ } } + # ----------------------------------------------------------- + # Compute wait_event_timing class mapping data. + # + # The dense class table maps raw classId (0x00..max) to a + # dense index, with per-class slot counts rounded up to the + # next power of 2 (minimum 16). Extension and InjectionPoint + # are fixed at 128 because extensions register custom events. + # LWLock uses a hash table (dense = -1). 
+ # ----------------------------------------------------------- + + # Map section name -> raw classId (from wait_classes.h constants) + my %class_to_raw = ( + 'Lock' => 0x03, + 'Buffer' => 0x04, + 'Activity' => 0x05, + 'Client' => 0x06, + 'Extension' => 0x07, + 'IPC' => 0x08, + 'Timeout' => 0x09, + 'IO' => 0x0A, + 'InjectionPoint' => 0x0B, + ); + + # Classes that need fixed large slot counts (dynamically extensible) + my %fixed_slot_classes = ( + 'Extension' => 128, + 'InjectionPoint' => 128, + ); + + # Count events per class from the parsed data. + # Build a list of (className, rawId, actualCount) sorted by rawId. + my @timing_classes; + foreach my $waitclass (keys %hashwe) + { + my $short = $waitclass; + $short =~ s/^WaitEvent//; + + # Skip LWLock -- uses hash table, not flat array + next unless exists $class_to_raw{$short}; + + my $raw_id = $class_to_raw{$short}; + my $count = scalar @{ $hashwe{$waitclass} }; + + push @timing_classes, { + name => $short, + raw_id => $raw_id, + actual => $count, + }; + } + + # InjectionPoint (0x0B) has no section in wait_event_names.txt + # because its events are dynamically registered at runtime. + # Add it explicitly with actual=0 and a fixed slot count. 
+ if (!grep { $_->{name} eq 'InjectionPoint' } @timing_classes) + { + push @timing_classes, { + name => 'InjectionPoint', + raw_id => $class_to_raw{'InjectionPoint'}, + actual => 0, + }; + } + + # Sort by raw classId + @timing_classes = sort { $a->{raw_id} <=> $b->{raw_id} } @timing_classes; + + # Compute slot counts: next power of 2, minimum 16, or fixed + foreach my $cls (@timing_classes) + { + if (exists $fixed_slot_classes{$cls->{name}}) + { + $cls->{slots} = $fixed_slot_classes{$cls->{name}}; + } + else + { + my $slots = 16; # minimum + $slots *= 2 while $slots < $cls->{actual}; + $cls->{slots} = $slots; + } + } + + # Compute cumulative offsets + my $offset = 0; + foreach my $cls (@timing_classes) + { + $cls->{offset} = $offset; + $offset += $cls->{slots}; + } + my $total_events = $offset; + + # Determine max raw classId for array sizing + my $max_raw = 0; + foreach my $cls (@timing_classes) + { + $max_raw = $cls->{raw_id} if $cls->{raw_id} > $max_raw; + } + my $raw_classes = $max_raw + 1; + my $dense_classes = scalar @timing_classes; + + # Emit timing defines into wait_event_types.h + printf $h "\n/* Wait event timing flat array sizing (generated) */\n"; + printf $h "#define WAIT_EVENT_TIMING_RAW_CLASSES\t%d\n", $raw_classes; + printf $h "#define WAIT_EVENT_TIMING_DENSE_CLASSES\t%d\n", $dense_classes; + printf $h "#define WAIT_EVENT_TIMING_NUM_EVENTS\t%d\n\n", $total_events; + printf $h "#endif /* WAIT_EVENT_TYPES_H */\n"; close $h; close $c; close $wc; + # Generate wait_event_timing_data.h with the mapping arrays. + # A header (rather than a .c file) keeps the file-extension category + # straight: it is included into a single TU (wait_event_timing.c) and + # defines static const tables there. The include guard makes the + # single-owner intent explicit and prevents accidental double inclusion. 
+ my $ttmp = "$output_path/wait_event_timing_data.h.tmp$$"; + open my $t, '>', $ttmp or die "Could not open $ttmp: $!"; + printf $t $header_comment, 'wait_event_timing_data.h'; + + printf $t "#ifndef WAIT_EVENT_TIMING_DATA_H\n"; + printf $t "#define WAIT_EVENT_TIMING_DATA_H\n\n"; + + # Emit wait_event_class_dense[] + printf $t "static const int8 wait_event_class_dense[WAIT_EVENT_TIMING_RAW_CLASSES] = {\n"; + for (my $i = 0; $i < $raw_classes; $i++) + { + my $dense = -1; + my $comment = "unused"; + for (my $d = 0; $d < $dense_classes; $d++) + { + if ($timing_classes[$d]->{raw_id} == $i) + { + $dense = $d; + $comment = $timing_classes[$d]->{name}; + last; + } + } + # classId 0x01 is LWLock + if ($i == 0x01) + { + $comment = "LWLock (uses hash)"; + } + my $comma = ($i < $raw_classes - 1) ? "," : ""; + printf $t "\t%2d$comma\t\t/* 0x%02x: %s */\n", $dense, $i, $comment; + } + printf $t "};\n\n"; + + # Emit wait_event_class_nevents[] + printf $t "static const int wait_event_class_nevents[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n"; + for (my $d = 0; $d < $dense_classes; $d++) + { + my $cls = $timing_classes[$d]; + my $comma = ($d < $dense_classes - 1) ? "," : ""; + printf $t "\t%d$comma\t\t/* %s (actual: %d) */\n", + $cls->{slots}, $cls->{name}, $cls->{actual}; + } + printf $t "};\n\n"; + + # Emit wait_event_class_offset[] + printf $t "static const int wait_event_class_offset[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n"; + for (my $d = 0; $d < $dense_classes; $d++) + { + my $cls = $timing_classes[$d]; + my $comma = ($d < $dense_classes - 1) ? "," : ""; + printf $t "\t%d$comma\t\t/* %s */\n", $cls->{offset}, $cls->{name}; + } + printf $t "};\n\n"; + + # Emit wait_event_dense_to_classid[] + printf $t "static const uint8 wait_event_dense_to_classid[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n\t"; + for (my $d = 0; $d < $dense_classes; $d++) + { + my $cls = $timing_classes[$d]; + my $comma = ($d < $dense_classes - 1) ? 
", " : ""; + printf $t "0x%02x$comma", $cls->{raw_id}; + } + printf $t "\n};\n\n"; + + printf $t "#endif /* WAIT_EVENT_TIMING_DATA_H */\n"; + + close $t; + rename($htmp, "$output_path/wait_event_types.h") || die "rename: $htmp to $output_path/wait_event_types.h: $!"; rename($ctmp, "$output_path/pgstat_wait_event.c") || die "rename: $ctmp to $output_path/pgstat_wait_event.c: $!"; rename($wctmp, "$output_path/wait_event_funcs_data.c") || die "rename: $wctmp to $output_path/wait_event_funcs_data.c: $!"; + rename($ttmp, "$output_path/wait_event_timing_data.h") + || die "rename: $ttmp to $output_path/wait_event_timing_data.h: $!"; } # Generate the .sgml file. elsif ($gen_docs) diff --git a/src/backend/utils/activity/meson.build b/src/backend/utils/activity/meson.build index 1aa7ece52908c..1da4e216c4263 100644 --- a/src/backend/utils/activity/meson.build +++ b/src/backend/utils/activity/meson.build @@ -19,6 +19,7 @@ backend_sources += files( 'pgstat_subscription.c', 'pgstat_wal.c', 'pgstat_xact.c', + 'wait_event_timing.c', ) # this includes a .c file with contents generated in ../../../include/activity, diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 95635c7f56ce7..c8fab55b36321 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -41,8 +41,7 @@ static const char *pgstat_get_wait_io(WaitEventIO w); static uint32 local_my_wait_event_info; uint32 *my_wait_event_info = &local_my_wait_event_info; -#define WAIT_EVENT_CLASS_MASK 0xFF000000 -#define WAIT_EVENT_ID_MASK 0x0000FFFF +/* WAIT_EVENT_CLASS_MASK / WAIT_EVENT_ID_MASK are defined in utils/wait_classes.h */ /* * Hash tables for storing custom wait event ids and their names in diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 560659f956856..35f8b3f359dc5 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ 
b/src/backend/utils/activity/wait_event_names.txt @@ -417,6 +417,8 @@ XactSLRU "Waiting to access the transaction status SLRU cache." ParallelVacuumDSA "Waiting for parallel vacuum dynamic shared memory allocation." AioUringCompletion "Waiting for another process to complete IO via io_uring." ShmemIndex "Waiting to find or allocate space in shared memory." +WaitEventTraceDSA "Waiting for wait event trace dynamic shared memory allocation." +WaitEventTimingDSA "Waiting for wait event timing dynamic shared memory allocation." # No "ABI_compatibility" region here as WaitEventLWLock has its own C code. diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c new file mode 100644 index 0000000000000..b745d84eb1051 --- /dev/null +++ b/src/backend/utils/activity/wait_event_timing.c @@ -0,0 +1,3582 @@ +/*------------------------------------------------------------------------- + * + * wait_event_timing.c + * Per-backend wait event timing and histogram accumulation. + * + * This module provides Oracle-style wait event instrumentation: every + * call to pgstat_report_wait_start()/pgstat_report_wait_end() records + * the wait duration using clock_gettime() and accumulates per-event + * statistics (count, total nanoseconds, max, histogram) in shared memory. + * + * Overhead: two VDSO clock_gettime() calls per wait event transition + * (~40-100 ns total), plus a few memory writes to per-backend arrays. + * No locking is needed since each backend writes only to its own slot. + * + * Controlled by the wait_event_capture GUC (off | stats | trace, + * default off). The 'stats' level activates the aggregated per-event + * counters; 'trace' additionally enables a per-session DSA-backed ring + * buffer of individual events for 10046-style analysis. 
+ * + * Copyright (c) 2026, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/utils/activity/wait_event_timing.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "utils/guc.h" +#include "utils/guc_hooks.h" +#include "utils/wait_event_timing.h" + +/* + * GUC variable -- always defined so the GUC system works even when + * compiled without --enable-wait-event-timing. In stub builds the + * check_hook below rejects any value other than OFF. + */ +int wait_event_capture = WAIT_EVENT_CAPTURE_OFF; + +/* + * GUC: cap on distinct LWLock tranches the per-backend hash table + * tracks individually. Sized at server start (PGC_POSTMASTER). See + * the description in guc_parameters.dat. Always defined so the GUC + * machinery has a backing variable even on builds compiled without + * --enable-wait-event-timing; the value is unused outside that gate. + */ +int wait_event_timing_max_tranches = 192; + +/* + * GUC: per-backend wait-event-trace ring buffer size, in kilobytes. + * Power of two; sized at server start. Always defined so the GUC + * machinery has a backing variable even in stub builds. + */ +int wait_event_trace_ring_size_kb = 4096; + +/* + * Records-per-ring derived from wait_event_trace_ring_size_kb at + * server start. Set once during the postmaster's GUC initialisation; + * read by the writer hot path (via the per-ring cached mask) and by + * the allocator. Stays at zero until the GUC framework has committed + * the boot value, after which any code reading it sees the final + * cluster-wide ring size. + */ +uint32 WaitEventTraceRingSize = 0; + +/* + * Enum value table consumed by guc.c. Order matches the + * WaitEventCaptureLevel enum and the documented "off < stats < trace" + * ordering. 
+ */ +const struct config_enum_entry wait_event_capture_options[] = { + {"off", WAIT_EVENT_CAPTURE_OFF, false}, + {"stats", WAIT_EVENT_CAPTURE_STATS, false}, + {"trace", WAIT_EVENT_CAPTURE_TRACE, false}, + {NULL, 0, false} +}; + +StaticAssertDecl(lengthof(wait_event_capture_options) == (WAIT_EVENT_CAPTURE_TRACE + 2), + "wait_event_capture_options length mismatch"); + +/* + * GUC check hook for wait_event_trace_ring_size_kb. + * + * The ring size in records must be a power of two so the writer's + * mask-indexing (pos & ring_mask) works. Since each record is exactly + * 32 bytes, the kilobyte value is a power of two iff records-count is + * (kb * 32 is a power of two iff kb is, as 32 itself is). + * + * Defined for both build configurations so the GUC framework can + * validate the value uniformly; the value itself is unused in stub + * builds. + */ +bool +check_wait_event_trace_ring_size_kb(int *newval, void **extra, GucSource source) +{ + int v = *newval; + + if (v <= 0 || (v & (v - 1)) != 0) + { + GUC_check_errdetail("wait_event_trace_ring_size_kb must be a positive power of two."); + return false; + } + return true; +} + +#ifndef USE_WAIT_EVENT_TIMING + +/* + * Stub SQL functions when compiled without --enable-wait-event-timing. + * These are referenced by pg_proc.dat and must exist as symbols. 
+ */ +#include "fmgr.h" +#include "funcapi.h" +#include "utils/guc_hooks.h" + +Datum pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS); +Datum pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS); +Datum pg_get_wait_event_trace(PG_FUNCTION_ARGS); +Datum pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS); +Datum pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS); +Datum pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS); +Datum pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS); + +Datum +pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS) +{ + InitMaterializedSRF(fcinfo, 0); + PG_RETURN_VOID(); +} + +Datum +pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS) +{ + InitMaterializedSRF(fcinfo, 0); + PG_RETURN_VOID(); +} + +Datum +pg_get_wait_event_trace(PG_FUNCTION_ARGS) +{ + InitMaterializedSRF(fcinfo, 0); + PG_RETURN_VOID(); +} + +Datum +pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS) +{ + InitMaterializedSRF(fcinfo, 0); + PG_RETURN_VOID(); +} + +Datum +pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("wait event capture is not supported by this build"), + errhint("Compile PostgreSQL with --enable-wait-event-timing."))); + PG_RETURN_VOID(); +} + +Datum +pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS) +{ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("wait event capture is not supported by this build"), + errhint("Compile PostgreSQL with --enable-wait-event-timing."))); + PG_RETURN_VOID(); +} + +Datum +pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS) +{ + /* + * In stub builds the trace ring infrastructure does not exist, so + * there can never be any orphaned rings to clear. Return 0 rather + * than erroring; this lets monitoring scripts call the function + * unconditionally without branching on the build flag. + */ + PG_RETURN_INT64(0); +} + +/* + * Extern variables referenced by backend_status.c unconditionally. 
+ * In timing builds these are defined after the #else. + */ +/* + * GUC check hook for the stub build. Any value other than 'off' is + * meaningless without --enable-wait-event-timing, so we reject it + * (or downgrade to 'off' silently when the value comes from the + * config file at startup, matching the old per-GUC behavior). + */ +bool +check_wait_event_capture(int *newval, void **extra, GucSource source) +{ + if (*newval != WAIT_EVENT_CAPTURE_OFF) + { + if (source < PGC_S_INTERACTIVE) + { + ereport(WARNING, + (errmsg("wait_event_capture is not supported by this build, " + "forcing to \"off\""), + errhint("Compile PostgreSQL with " + "--enable-wait-event-timing."))); + *newval = WAIT_EVENT_CAPTURE_OFF; + return true; + } + GUC_check_errdetail("This build does not support wait event capture."); + GUC_check_errhint("Compile PostgreSQL with --enable-wait-event-timing."); + return false; + } + return true; +} + +/* Stub GUC assign hook -- nothing to do without compile-time support. */ +void +assign_wait_event_capture(int newval, void *extra) +{ +} + +/* + * Stub shmem callbacks registered from storage/subsystemlist.h. In the + * non-timing build no shared memory is reserved: both request_fn and + * init_fn are NULL, which RegisterShmemCallbacks() treats as no-ops. + */ +const ShmemCallbacks WaitEventTimingShmemCallbacks = {0}; +const ShmemCallbacks WaitEventTraceControlShmemCallbacks = {0}; + +void +pgstat_set_wait_event_timing_storage(int procNumber) +{ +} + +void +pgstat_reset_wait_event_timing_storage(void) +{ +} + +/* + * Stub trace-marker entry points. Declared unconditionally in + * wait_event_timing.h so that call sites in execMain.c, + * backend_status.c, and postgres.c do not need #ifdef + * USE_WAIT_EVENT_TIMING guards around the call. No-ops here in the + * stub build: there is no ring to write to and no infrastructure to + * initialise. 
+ */ +void +wait_event_trace_query_start(int64 query_id) +{ +} + +void +wait_event_trace_query_end(int64 query_id) +{ +} + +void +wait_event_trace_exec_start(int64 query_id) +{ +} + +void +wait_event_trace_exec_end(int64 query_id) +{ +} + +#else /* USE_WAIT_EVENT_TIMING */ + +#include "catalog/pg_authid.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/queryjumble.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procnumber.h" +#include "storage/shmem.h" +#include "catalog/pg_type_d.h" +#include "utils/acl.h" +#include "utils/array.h" +#include "utils/backend_status.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/guc_hooks.h" +#include "utils/injection_point.h" +#include "utils/tuplestore.h" +#include "utils/wait_event.h" + +#define NUM_WAIT_EVENT_TIMING_SLOTS (MaxBackends + NUM_AUXILIARY_PROCS) + +#define HAS_PGSTAT_PERMISSIONS(role) \ + (has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS) || \ + has_privs_of_role(GetUserId(), role)) + +/* Pointer to this backend's timing state */ +WaitEventTimingState *my_wait_event_timing = NULL; + +/* Pointer to this backend's trace ring buffer */ +static WaitEventTraceState *my_wait_event_trace = NULL; + +/* + * Backend-local copy of the last reset generation we acted on. Compared + * against the shared pg_atomic_uint32 reset_generation in this backend's + * WaitEventTimingState slot at every wait_end. When the shared value + * differs, the owning backend performs the reset of its own counters on + * behalf of whoever called pg_stat_reset_wait_event_timing(target). + * + * This makes cross-backend reset a lock-free request-response: the caller + * bumps the atomic (and wakes the target's latch so idle backends notice); + * the owning backend clears its counters at a safe point. 
Because only the + * owning backend ever writes its slot, there is no race between writers and + * resetters -- the reset happens inline inside the single-writer hot path. + */ +static uint32 my_last_reset_generation = 0; + +/* + * DSA-based shared timing array control. + * + * The per-backend WaitEventTimingState array is allocated lazily in DSA + * on the first SET wait_event_capture = stats|trace in the cluster. + * This avoids ~11-113 MB of eager shmem allocation at postmaster start + * when the feature is compiled in but turned off at runtime (the common + * case). See wait_event_timing_attach_array(). + * + * The control struct itself lives in the small fixed shmem region; it + * holds a DSA handle and a dsa_pointer to the allocated array. + */ +typedef struct WaitEventTimingControl +{ + LWLock lock; /* protects first-time DSA create + array alloc */ + dsa_handle timing_dsa_handle; /* DSA_HANDLE_INVALID until first enable */ + dsa_pointer timing_array; /* InvalidDsaPointer until first enable */ +} WaitEventTimingControl; + +static WaitEventTimingControl *WaitEventTimingCtl = NULL; +static dsa_area *timing_dsa = NULL; + +/* + * Backend-local cached pointer to the start of the shared array, set + * on first lazy-attach. Readers of other backends' slots (pg_stat_*) + * attach on demand and use this cache for the rest of the SRF call. + * Writers access their own slot exclusively via my_wait_event_timing. + * + * Slots in this region are NOT laid out as a simple C array -- per + * the layout description on WaitEventTimingState (in + * src/include/utils/wait_event_timing.h), each slot has a + * runtime-determined stride (header + variable-size hash arrays). + * Use wet_slot(idx) below to index into it. + */ +static char *WaitEventTimingArray = NULL; + +/* + * Per-backend slot stride within WaitEventTimingArray. Set at first + * attach from the GUC value at the time of allocation; constant for + * the cluster's lifetime once the DSA is allocated. 
+ */ +static Size wait_event_timing_per_backend_stride = 0; + +/* + * Effective hash sizing. Both values are derived from the GUC + * wait_event_timing_max_tranches at allocation time and stored in + * each slot's LWLockTimingHash header; cached here as backend-local + * for use by code that needs the values before resolving a slot + * (e.g., the allocation code itself). + */ +static int wait_event_timing_hash_size = 0; +static int wait_event_timing_max_entries = 0; + +/* + * Round up to the next power of two, with a minimum of 32. The hash + * slot count must be a power of two for the mask-based modulo in the + * lookup hot path; we target >= 2x the entry cap so the load factor + * stays at or below 50%. + */ +static int +wait_event_timing_hash_size_for(int max_entries) +{ + int size = 32; + + while (size < max_entries * 2) + size <<= 1; + return size; +} + +/* + * Compute the per-backend slot size for the given max_entries. Each + * slot is laid out as + * + * [ WaitEventTimingState header ] + * [ LWLockTimingHashEntry[hash_size] ] + * [ WaitEventTimingEntry[max_entries] <- lwlock_events[] ] + * + * with no padding between sections (the structs already pack + * 8-byte-aligned). + */ +static Size +wait_event_timing_slot_size(int max_entries) +{ + int hash_size = wait_event_timing_hash_size_for(max_entries); + + return add_size(sizeof(WaitEventTimingState), + add_size(mul_size(hash_size, sizeof(LWLockTimingHashEntry)), + mul_size(max_entries, sizeof(WaitEventTimingEntry)))); +} + +/* Resolve the address of slot `idx` within WaitEventTimingArray. */ +static inline WaitEventTimingState * +wet_slot(int idx) +{ + return (WaitEventTimingState *) + (WaitEventTimingArray + (Size) idx * wait_event_timing_per_backend_stride); +} + +/* + * Address of the LWLock hash slot table for the given slot's lwlock_hash + * header. 
The slot table immediately follows the WaitEventTimingState + * header in memory; hash_size in the LWLockTimingHash header tells us + * how many entries follow. + */ +static inline LWLockTimingHashEntry * +wet_lwlock_hash_entries(WaitEventTimingState *state) +{ + return (LWLockTimingHashEntry *)((char *) state + sizeof(WaitEventTimingState)); +} + +/* + * Address of the dense LWLock events array for the given slot. It + * immediately follows the slot table. + */ +static inline WaitEventTimingEntry * +wet_lwlock_hash_events(WaitEventTimingState *state) +{ + return (WaitEventTimingEntry *) + ((char *) state + sizeof(WaitEventTimingState) + + (Size) state->lwlock_hash.hash_size * sizeof(LWLockTimingHashEntry)); +} + +/* DSA-based trace ring buffer control */ +static WaitEventTraceControl *WaitEventTraceCtl = NULL; +static dsa_area *trace_dsa = NULL; +int my_trace_proc_number = -1; + +/* + * Same-backend coordination between pg_get_backend_wait_event_trace (the + * own-session SRF reader) and wait_event_trace_release_slot (the GUC + * step-down path that frees this backend's ring). Both paths run in this + * same backend, single-threaded, so a plain bool is sufficient -- no + * atomics needed. + * + * srf_in_progress set true while the SRF is iterating the ring; the + * release path observes this and defers the dsa_free + * instead of yanking the chunk out from under us. + * + * release_pending set by the release path when it had to defer; the + * SRF's PG_FINALLY checks it and performs the deferred + * dsa_free after the iteration completes. + * + * Cross-backend readers (extensions, bgworkers reading another backend's + * ring) cannot use this mechanism -- they coordinate with the release + * path via WaitEventTraceCtl->lock instead. See the header for the + * recommended snapshot-under-lock pattern for those consumers. 
+ */
+static bool wait_event_trace_srf_in_progress = false;
+static bool wait_event_trace_release_pending = false;
+
+/*
+ * Per-backend gate that disables the trace-ring writer in the wait-
+ * event hot path while a slot-state transition is in progress.
+ *
+ * Set true around code paths that either free the local trace ring
+ * (wait_event_trace_release_slot's dsa_free) or transition the slot
+ * out of OWNED (wait_event_trace_before_shmem_exit's OWNED ->
+ * ORPHANED publish).  In both cases an internal LWLock inside
+ * dsa_free / dsa_attach / dsa_pin_mapping / dsa_pin can in
+ * principle contend long enough to dispatch a wait event; that
+ * wait event's pgstat_report_wait_end_timing inline path runs in
+ * the SAME backend, sees capture_level == TRACE (the GUC hasn't
+ * been committed yet by the time the assign hook runs), and would:
+ *
+ *	 * during release_slot's dsa_free: write into a ring that has
+ *	   already been returned to the DSA freelist -- if another
+ *	   allocator has since reused the chunk, this is a stray write
+ *	   into someone else's allocation.
+ *
+ *	 * during release_slot's dsa_free, alternative timing: see
+ *	   my_wait_event_trace == NULL on a naive "clear before free"
+ *	   fix and recurse into wait_event_trace_attach, which would
+ *	   either deadlock on the WaitEventTraceCtl->lock the outer
+ *	   release_slot already holds, or (on a lock-free moment)
+ *	   allocate a fresh ring that the outer release_slot would
+ *	   then free again as part of its post-acquire DsaPointerIsValid
+ *	   check -- a different use-after-free of a freshly-allocated
+ *	   chunk.
+ *
+ *	 * during before_shmem_exit: write into the ring after the slot
+ *	   has been published as ORPHANED, violating the post-mortem
+ *	   read-only contract that cross-backend readers rely on.
+ *
+ * The flag is per-backend (static at file scope means per-process
+ * in PG's process-per-backend model), so the hot path's check is a
+ * single cache-warm load and a branch; no atomic, no fence.  The
+ * trace branch is already gated by capture_level == TRACE so the
+ * additional check costs nothing in the common case where capture
+ * is off or stats-only.  The flag is set on the very same backend
+ * that may later read it from the hot path, so there is no
+ * cross-process visibility concern.
+ *
+ * See the release_slot and before_shmem_exit doc comments for the
+ * specific transition each uses this flag around, and review_6.md
+ * issue #10 for the UAF analysis.
+ *
+ * NOTE(review): review_6.md does not look like a file in the source
+ * tree; fold the relevant analysis into this comment or drop the
+ * reference before commit.
+ */
+static bool wait_event_trace_writes_disabled = false;
+
+/* Forward declarations for lazy-attach helpers */
+static void wait_event_timing_ensure_dsa(void);
+static void pgstat_wait_event_timing_before_shmem_exit(int code, Datum arg);
+
+/*
+ * Per-backend shutdown gate.  Set true in the before_shmem_exit
+ * callback so the wait-event hot path can detect that DSA mappings
+ * may already be torn down by dsm_backend_shutdown (which runs as
+ * a LATER on_shmem_exit callback) and skip every code path that
+ * would dereference my_wait_event_timing or attempt a fresh
+ * lazy_attach.  Once true, the backend's wait events are silently
+ * dropped for the remainder of proc_exit -- the backend is going
+ * away anyway, and the alternative is a SIGSEGV.
+ *
+ * NOTE(review): as far as I can tell shmem_exit() calls
+ * dsm_backend_shutdown() directly, after the before_shmem_exit
+ * callbacks and before the on_shmem_exit callbacks, rather than
+ * through a registered callback -- confirm against ipc.c and reword
+ * the parenthetical above if so.  The conclusion (it runs after our
+ * before_shmem_exit callback) would be unchanged.
+ */
+bool		wait_event_timing_writes_disabled = false;
+static bool wait_event_timing_attach_array(bool allocate_if_missing);
+static void wait_event_trace_release_slot(int procNumber);
+
+/*
+ * Mapping arrays for the flat events[] array, generated from
+ * wait_event_names.txt by generate-wait_event_types.pl.
+ * Defines: WAIT_EVENT_TIMING_RAW_CLASSES, WAIT_EVENT_TIMING_DENSE_CLASSES,
+ * WAIT_EVENT_TIMING_NUM_EVENTS, and the four mapping arrays.
+ */
+#include "utils/wait_event_timing_data.h"
+
+/*
+ * Convert wait_event_info to a flat index for the events[] array.
+ * For bounded classes, eventId equals the array index within the class
+ * (the enum values start at PG_WAIT_<CLASS> and increment by one).
+ *
+ * Class extraction follows the same idiom as pgstat_get_wait_event_type:
+ * mask off the class bits and compare against the full PG_WAIT_*
+ * constants, rather than shifting both sides down to a byte.  The
+ * dense-table lookup still needs the byte-form class id, but that
+ * conversion is now an isolated array-index step rather than a
+ * load-bearing piece of encoding-layout knowledge in the comparison.
+ *
+ * Returns WAIT_EVENT_TIMING_IDX_LWLOCK for the LWLock class, the flat
+ * index for any other class present in the generated tables, and -1
+ * when the class byte, its dense mapping, or the event id is out of
+ * range.
+ */
+static int
+wait_event_timing_index(uint32 wait_event_info)
+{
+	uint32		class_bits = wait_event_info & WAIT_EVENT_CLASS_MASK;
+	int			raw_class;
+	int			dense_class;
+	int			event_no;
+
+	/* The LWLock class maps to its own dedicated index. */
+	if (class_bits == PG_WAIT_LWLOCK)
+		return WAIT_EVENT_TIMING_IDX_LWLOCK;
+
+	/* Byte-form class id, used only to index the dense-class table. */
+	raw_class = class_bits >> 24;
+	if (unlikely(raw_class >= WAIT_EVENT_TIMING_RAW_CLASSES))
+		return -1;
+
+	dense_class = wait_event_class_dense[raw_class];
+	if (unlikely(dense_class < 0))
+		return -1;
+
+	/* The event id must fall inside the class's slice of events[]. */
+	event_no = wait_event_info & WAIT_EVENT_ID_MASK;
+	if (unlikely(event_no >= wait_event_class_nevents[dense_class]))
+		return -1;
+
+	return wait_event_class_offset[dense_class] + event_no;
+}
+
+/*
+ * Reset a slot's LWLockTimingHash to its empty initial state.
+ *
+ * Takes a WaitEventTimingState rather than a bare LWLockTimingHash
+ * because the slot table (entries[]) and dense events array
+ * (lwlock_events[]) live as variable-size regions following the
+ * WaitEventTimingState header in memory; their sizes are runtime-
+ * determined by wait_event_timing_max_tranches.  The hash header's
+ * hash_size and max_entries fields are immutable after allocation
+ * and are NOT reset here.
+ */
+static void
+lwlock_timing_hash_clear(WaitEventTimingState *state)
+{
+	LWLockTimingHash *hash = &state->lwlock_hash;
+	LWLockTimingHashEntry *cur = wet_lwlock_hash_entries(state);
+	LWLockTimingHashEntry *end = cur + hash->hash_size;
+
+	/* Mark every probe slot as empty. */
+	for (; cur < end; cur++)
+	{
+		cur->tranche_id = LWLOCK_TIMING_EMPTY_SLOT;
+		cur->dense_idx = 0;
+	}
+
+	/* Zero the dense per-tranche entries, then forget how many were in use. */
+	memset(wet_lwlock_hash_events(state), 0,
+		   (Size) hash->max_entries * sizeof(WaitEventTimingEntry));
+	hash->num_used = 0;
+}
+
+/*
+ * Maximum number of probes attempted on the lookup hot path once the
+ * table is at capacity.  At cap there is no further insertion
+ * possible, so an unknown tranche cannot be recorded; the only useful
+ * work the loop can do is find an existing entry within its
+ * probe-distance window.  Bounding the scan caps the per-event cost at
+ * the cap-overflow regime to a constant, instead of paying ~2-3 probes
+ * (worst-case clusters: many more) on every unknown-tranche wait_end
+ * for the remainder of the backend lifetime.
+ *
+ * The bound (8) is well above the expected probe distance at this
+ * table's load factor (linear-probing miss expected length ~1.78 at
+ * 37.5% load; P99 fits comfortably in 8).  Entries inserted with a
+ * collision distance > 8 from their hash slot will fail to be found at
+ * cap, which is theoretically possible but unlikely at the load
+ * factors we target (probability < 1e-3) and is the right trade
+ * against the common at-cap unknown-tranche cost.
+ */
+#define LWLOCK_TIMING_LOOKUP_AT_CAP_PROBE_LIMIT 8
+
+/*
+ * Look up (or insert) timing entry for an LWLock tranche ID.
+ *
+ * Takes WaitEventTimingState (rather than just the hash header) so the
+ * variable-size entries[] and lwlock_events[] arrays following the
+ * header can be addressed via the wet_lwlock_hash_*() helpers.
+ */
+static WaitEventTimingEntry *
+lwlock_timing_lookup(WaitEventTimingState *state, uint16 tranche_id)
+{
+	LWLockTimingHash *ht = &state->lwlock_hash;
+	LWLockTimingHashEntry *entries = wet_lwlock_hash_entries(state);
+	WaitEventTimingEntry *events = wet_lwlock_hash_events(state);
+	uint32		hash = (uint32) tranche_id * 2654435761U;
+	int			slot = hash & (ht->hash_size - 1);
+	int			limit;
+	int			i;
+
+	/*
+	 * Returns the entry for tranche_id, inserting one if the tranche is
+	 * new and the table is below max_entries.  Returns NULL when the
+	 * tranche cannot be recorded: the table is at capacity, the probe
+	 * limit was exhausted, or tranche_id is the empty-slot sentinel.
+	 *
+	 * LWLOCK_TIMING_EMPTY_SLOT doubles as the "unused" marker in
+	 * entries[], so a tranche_id equal to it would compare equal to the
+	 * first empty slot probed and hand back events[dense_idx] of a slot
+	 * that was never inserted (dense_idx 0 after a clear), aliasing
+	 * another tranche's counters.  Treat such an id as unrecordable.
+	 */
+	if (unlikely(tranche_id == LWLOCK_TIMING_EMPTY_SLOT))
+		return NULL;
+
+	/*
+	 * At cap, bound the probe distance so unknown tranches return NULL
+	 * quickly instead of walking through clustered occupied slots.  See
+	 * the comment on LWLOCK_TIMING_LOOKUP_AT_CAP_PROBE_LIMIT.
+	 */
+	limit = (ht->num_used >= ht->max_entries)
+		? LWLOCK_TIMING_LOOKUP_AT_CAP_PROBE_LIMIT
+		: ht->hash_size;
+
+	/* Linear probing; hash_size is a power of two, so the mask wraps. */
+	for (i = 0; i < limit; i++)
+	{
+		LWLockTimingHashEntry *e = &entries[slot];
+
+		if (e->tranche_id == tranche_id)
+			return &events[e->dense_idx];
+
+		if (e->tranche_id == LWLOCK_TIMING_EMPTY_SLOT)
+		{
+			/* Unknown tranche: insert here unless the dense array is full. */
+			if (ht->num_used >= ht->max_entries)
+				return NULL;
+
+			e->tranche_id = tranche_id;
+			e->dense_idx = ht->num_used++;
+			return &events[e->dense_idx];
+		}
+
+		slot = (slot + 1) & (ht->hash_size - 1);
+	}
+
+	return NULL;
+}
+
+/*
+ * Compute histogram bucket index for a duration in nanoseconds.
+ *
+ * Bin edges are powers of two directly on nanoseconds: bucket i covers
+ * [2^(i+9), 2^(i+10)) ns for 0 < i < NBUCKETS-1, bucket 0 covers
+ * [0, 1024) ns, and the last bucket covers [2^(NBUCKETS+8), inf) ns.
+ * The boundaries approximate the decimal-microsecond grid (1024 ≈ 1 us,
+ * 2048 ≈ 2 us, ... 2^33 ≈ 8.6 s) close enough for a diagnostic
+ * histogram while letting us skip the strength-reduced /1000 on the
+ * hot path.
+ *
+ * NOTE(review): if the bucket count really is 32, the formula above
+ * puts the last bucket's lower edge at 2^40 ns (~18 min); a 2^33 ns
+ * (~8.6 s) edge corresponds to 25 buckets.  Confirm the figure quoted
+ * below against wait_event_timing.h.
+ *
+ * See the rationale comment on WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS in
+ * wait_event_timing.h for why the bucket count is 32 (covering up to
+ * 8.6s) rather than 16 (which would have capped at 16ms).
+ */
+static int
+wait_event_timing_bucket(int64 duration_ns)
+{
+	int			bucket;
+
+	/*
+	 * Everything under 1024 ns ("~1 us") lands in bucket 0.  Also handles
+	 * duration_ns == 0, which would otherwise be undefined input to
+	 * pg_leftmost_one_pos64.  Negative durations take this branch too,
+	 * since the comparison is signed.
+	 */
+	if (duration_ns < 1024)
+		return 0;
+
+	bucket = pg_leftmost_one_pos64((uint64) duration_ns) - 9;	/* >= 1 here: bit position is at least 10 */
+
+	if (bucket >= WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS)
+		bucket = WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS - 1;	/* clamp into the open-ended last bucket */
+
+	return bucket;
+}
+
+/*
+ * Write a trace ring marker record.  Shared helper for all marker types.
+ *
+ * record_type: the TRACE_* marker kind stored in the record.
+ * query_id: query identifier stored in the record; 0 means "none" and
+ * makes the call a no-op.
+ *
+ * Does nothing unless wait_event_capture is at TRACE, trace writes are
+ * not disabled, and this backend has (or can lazily attach) a ring.
+ */
+static void
+wait_event_trace_write_marker(uint8 record_type, int64 query_id)
+{
+	uint64		pos;
+	WaitEventTraceRecord *rec;
+	uint32		seq;
+	instr_time	now;
+
+	/*
+	 * Single capture-level gate: markers only land in the ring when
+	 * wait_event_capture is at TRACE.  This guarantees consistency with
+	 * the wait-event hot path (also gated on the same level) -- there is
+	 * no configuration in which one half of the trace fires and the
+	 * other doesn't.  query_id == 0 means "no query ID available"
+	 * (utility command or compute_query_id = off), which we skip.
+	 *
+	 * wait_event_trace_writes_disabled is the same per-backend gate
+	 * the wait-event hot path uses; it is raised by release_slot and
+	 * before_shmem_exit around slot-state transitions to keep both
+	 * writers consistent.  Markers cannot fire during those
+	 * transitions today (single-threaded execution, no nested
+	 * executor), but checking here keeps the contract uniform
+	 * across all trace-ring writers and is robust to future code
+	 * paths that might invoke a marker from a nested context.
+	 *
+	 * No likely()/unlikely() annotation: this function is called at
+	 * query/exec boundaries (a handful per query, not per wait event),
+	 * so neither side of the branch dominates often enough for static
+	 * layout to matter, and the meaningful production configuration
+	 * (wait_event_capture = trace) is exactly when the body is hot --
+	 * an annotation on the early-return would point the wrong way.
+	 */
+	if (wait_event_capture != WAIT_EVENT_CAPTURE_TRACE ||
+		wait_event_trace_writes_disabled ||
+		query_id == 0)
+		return;
+
+	/*
+	 * Lazy attach on first use.  Allocation lives here (not in the
+	 * assign hook) because dsa_allocate_extended() can ereport(ERROR)
+	 * on OOM, which is forbidden in assign-hook context but legitimate
+	 * here.  Idempotent: wait_event_trace_attach() short-circuits on
+	 * subsequent calls.
+	 */
+	if (my_wait_event_trace == NULL)
+	{
+		if (my_trace_proc_number < 0)
+			return;				/* no trace slot number assigned to this backend */
+		wait_event_trace_attach(my_trace_proc_number);
+		if (my_wait_event_trace == NULL)
+			return;				/* attach path unable to allocate */
+	}
+
+	/*
+	 * Claim the next slot.  Single-writer counter (only the owning backend
+	 * writes its own ring), so a plain read+write is sufficient and avoids
+	 * the LOCK XADD that pg_atomic_fetch_add_u64 would emit -- a wasted
+	 * cache-coherence trip on an unshared cache line.  (Earlier wording
+	 * here said "one per wait event"; this helper itself is only reached
+	 * at query/exec boundaries, as noted at the top of the function.)
+	 * Cross-backend readers use pg_atomic_read_u64, which
+	 * compiles to a plain MOV on x86 and tolerates concurrent writes here
+	 * (their actual safety against the records[] window is the per-record
+	 * seqlock below).  Same idiom as injection_point.c's per-entry
+	 * generation counter (single writer + multiple lock-free readers).
+	 *
+	 * Seqlock protocol as written below: seq is set to an odd value
+	 * (pos * 2 + 1) before the payload stores and to the following even
+	 * value after them, with a write barrier on each side of the payload.
+	 */
+	pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos);
+	pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1);
+	rec = &my_wait_event_trace->records[pos & my_wait_event_trace->ring_mask];
+	seq = (uint32)(pos * 2 + 1);
+
+	rec->seq = seq;
+	pg_write_barrier();			/* release: payload stores must not rise above seq=odd */
+
+	INSTR_TIME_SET_CURRENT(now);
+	rec->record_type = record_type;
+	rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now);
+	rec->data.query.query_id = query_id;
+	rec->data.query.pad2 = 0;
+
+	pg_write_barrier();			/* release: payload stores must land before seq=even */
+	rec->seq = seq + 1;
+}
+
+void
+wait_event_trace_query_start(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_QUERY_START, query_id);
+}
+
+void
+wait_event_trace_query_end(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_QUERY_END, query_id);
+}
+
+void
+wait_event_trace_exec_start(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_EXEC_START, query_id);
+}
+
+void
+wait_event_trace_exec_end(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_EXEC_END, query_id);
+}
+
+/*
+ * Report and initialize shared memory for wait event timing.
+ *
+ * Registered via the shmem subsystem registry in
+ * src/include/storage/subsystemlist.h.  Only the small control struct
+ * is in fixed shmem; the per-backend WaitEventTimingState array
+ * (~30 KB/backend) is allocated lazily in DSA on first enable by any
+ * backend (see wait_event_timing_attach_array).
+ */
+static void
+WaitEventTimingShmemRequest(void *arg)
+{
+	ShmemRequestStruct(.name = "WaitEventTimingControl",
+					   .size = sizeof(WaitEventTimingControl),
+					   .ptr = (void **) &WaitEventTimingCtl);
+}
+
+static void
+WaitEventTimingShmemInit(void *arg)
+{
+	LWLockInitialize(&WaitEventTimingCtl->lock,
+					 LWTRANCHE_WAIT_EVENT_TIMING_DSA);
+	WaitEventTimingCtl->timing_dsa_handle = DSA_HANDLE_INVALID;	/* no DSA created yet */
+	WaitEventTimingCtl->timing_array = InvalidDsaPointer;	/* no array allocated yet */
+
+	WaitEventTimingArray = NULL;	/* backend-local cache starts unattached */
+}
+
+const ShmemCallbacks WaitEventTimingShmemCallbacks = {
+	.request_fn = WaitEventTimingShmemRequest,
+	.init_fn = WaitEventTimingShmemInit,
+};
+
+/*
+ * Ensure the backend is attached to the timing DSA.
+ *
+ * The DSA is created by whichever backend first hits this function with
+ * an empty control struct; subsequent callers just attach to the
+ * existing handle.  The backend-local dsa_area pointer is cached in
+ * timing_dsa for the backend's lifetime.
+ *
+ * No-op when timing_dsa is already set or when WaitEventTimingCtl has
+ * not been set up yet.  Create-or-attach runs under
+ * WaitEventTimingCtl->lock held exclusively, with TopMemoryContext as
+ * the current memory context; the previous context is restored on the
+ * normal return path.
+ */
+static void
+wait_event_timing_ensure_dsa(void)
+{
+	MemoryContext oldcontext;
+
+	if (timing_dsa != NULL)
+		return;
+
+	if (WaitEventTimingCtl == NULL)
+		return;					/* pre-ShmemInit; nothing to attach to */
+
+	oldcontext = MemoryContextSwitchTo(TopMemoryContext);	/* presumably so the dsa_area bookkeeping outlives the caller's context */
+
+	LWLockAcquire(&WaitEventTimingCtl->lock, LW_EXCLUSIVE);
+
+	if (WaitEventTimingCtl->timing_dsa_handle == DSA_HANDLE_INVALID)
+	{
+		timing_dsa = dsa_create(LWTRANCHE_WAIT_EVENT_TIMING_DSA);
+		dsa_pin(timing_dsa);	/* presumably keeps the area alive with no backend attached; see dsa.c */
+		dsa_pin_mapping(timing_dsa);	/* presumably keeps this backend's mapping for the session; see dsa.c */
+		WaitEventTimingCtl->timing_dsa_handle = dsa_get_handle(timing_dsa);	/* publish only after pinning */
+	}
+	else
+	{
+		timing_dsa = dsa_attach(WaitEventTimingCtl->timing_dsa_handle);
+		dsa_pin_mapping(timing_dsa);
+	}
+
+	LWLockRelease(&WaitEventTimingCtl->lock);
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Attach this backend to the shared WaitEventTimingArray, allocating
+ * it in DSA on first use if allocate_if_missing is true.
+ *
+ * Returns true if the array is now available (WaitEventTimingArray is
+ * non-NULL on return); false otherwise.
Readers pass allocate_if_missing
+ * = false to avoid allocating a big array just because somebody ran
+ * SELECT against an empty pg_stat view.  Writers (hot path) pass true
+ * so that the first wait event under wait_event_capture != off creates
+ * the storage.
+ *
+ * Re-entrancy guard.  Internal operations below (dsa_create,
+ * dsa_allocate_extended, the LWLockAcquire inside ensure_dsa) can
+ * emit LWLock wait events of their own, which feed into the wait-end
+ * timing hot path; under wait_event_capture >= STATS that hot path
+ * lazy-attaches by calling back into this function.  Without the
+ * guard we would either deadlock on WaitEventTimingCtl->lock or
+ * recurse with a half-initialised slot pointer.
+ *
+ * The same hazard applies in wait_event_trace_attach (which also runs
+ * dsa_allocate / LWLock under its body) and in
+ * wait_event_trace_release_slot (whose dsa_free takes a DSA-internal
+ * LWLock that can in principle emit a wait event during shutdown
+ * sequences).  Each function carries its own static bool guard close
+ * to the code it protects, matching the established PG idiom for
+ * function-local re-entry guards (see, e.g., in_vacuum in
+ * src/backend/commands/vacuum.c, in_streamed_transaction in
+ * src/backend/replication/logical/worker.c).  We deliberately do NOT
+ * collapse these into a shared bitmask because:
+ *	 1. PG style places re-entry flags adjacent to the function they
+ *		protect, not in a centralised module-level state structure.
+ *	 2. The three guarded functions are independent: a re-entry into
+ *		one of them while another is in flight is a legitimate pattern
+ *		(e.g., release_slot can be triggered by an assign hook that
+ *		itself ran while attach was in progress earlier).  A shared
+ *		flag would conservatively block those legal cases.
+ *
+ * If you add a fourth re-entrant function in this file, follow the
+ * same shape: a `static bool in_<name> = false;` at the top of the
+ * function, an early-return `if (in_<name>) return ...;`, set true
+ * before the body, clear in PG_FINALLY so an ereport(ERROR) cannot
+ * leave the flag stuck set.
+ *
+ * NOTE(review): the first test of WaitEventTimingCtl->timing_array
+ * below is made without holding WaitEventTimingCtl->lock; only the
+ * allocation branch re-checks under the lock.  Confirm that a backend
+ * taking the unlocked "already allocated" branch cannot observe the
+ * published pointer before the slot headers it then reads
+ * (first->lwlock_hash.*) are visible.
+ */
+static bool
+wait_event_timing_attach_array(bool allocate_if_missing)
+{
+	static bool in_attach = false;
+	bool		attached = false;
+
+	if (WaitEventTimingArray != NULL)
+		return true;
+
+	if (WaitEventTimingCtl == NULL)
+		return false;
+
+	if (in_attach)
+		return false;			/* re-entered from a nested wait event; see above */
+
+	in_attach = true;
+	PG_TRY();
+	{
+		wait_event_timing_ensure_dsa();
+
+		if (WaitEventTimingCtl->timing_array == InvalidDsaPointer)
+		{
+			if (!allocate_if_missing)
+			{
+				attached = false;
+			}
+			else
+			{
+				int			max_entries;
+				int			hash_size;
+				Size		stride;
+				Size		total;
+
+				/*
+				 * Snapshot the GUC at allocation time and use the same
+				 * value for every slot in the cluster.  This is the
+				 * cluster-wide first-enable allocation; subsequent
+				 * backends that attach reuse these dimensions, even if
+				 * the GUC has somehow been changed in between (it
+				 * shouldn't, since it is PGC_POSTMASTER, but reading
+				 * once and storing the result keeps the contract
+				 * explicit).
+				 */
+				max_entries = wait_event_timing_max_tranches;
+				hash_size = wait_event_timing_hash_size_for(max_entries);
+				stride = wait_event_timing_slot_size(max_entries);
+				total = mul_size(NUM_WAIT_EVENT_TIMING_SLOTS, stride);
+
+				LWLockAcquire(&WaitEventTimingCtl->lock, LW_EXCLUSIVE);
+
+				if (WaitEventTimingCtl->timing_array == InvalidDsaPointer)	/* re-check under the lock */
+				{
+					dsa_pointer p;
+					char	   *region;
+					int			i;
+
+					p = dsa_allocate_extended(timing_dsa, total,
+											  DSA_ALLOC_ZERO);
+					region = (char *) dsa_get_address(timing_dsa, p);
+
+					for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++)
+					{
+						WaitEventTimingState *slot;
+						LWLockTimingHashEntry *slot_entries;
+						int			j;
+
+						slot = (WaitEventTimingState *) (region + (Size) i * stride);
+
+						pg_atomic_init_u32(&slot->reset_generation, 0);
+						slot->lwlock_hash.num_used = 0;
+						slot->lwlock_hash.hash_size = hash_size;
+						slot->lwlock_hash.max_entries = max_entries;
+
+						/*
+						 * Initialise the hash slot table to the empty
+						 * sentinel.  The DSA region was zeroed above
+						 * (DSA_ALLOC_ZERO), but the empty sentinel is
+						 * 0xFFFF, not 0.
+						 */
+						slot_entries = (LWLockTimingHashEntry *)
+							((char *) slot + sizeof(WaitEventTimingState));
+						for (j = 0; j < hash_size; j++)
+							slot_entries[j].tranche_id = LWLOCK_TIMING_EMPTY_SLOT;
+					}
+
+					WaitEventTimingCtl->timing_array = p;	/* publish only after every slot header is initialised */
+				}
+
+				LWLockRelease(&WaitEventTimingCtl->lock);
+
+				attached = true;
+			}
+		}
+		else
+		{
+			attached = true;
+		}
+
+		if (attached)
+		{
+			WaitEventTimingState *first;
+
+			WaitEventTimingArray = (char *)
+				dsa_get_address(timing_dsa,
+								WaitEventTimingCtl->timing_array);
+
+			/*
+			 * Recover the dimensions from the first slot's lwlock_hash
+			 * header.  All slots share the same dimensions, set at
+			 * allocation time.  Cache the stride backend-locally so
+			 * wet_slot() is a single multiply-and-add.
+			 */
+			first = (WaitEventTimingState *) WaitEventTimingArray;
+			wait_event_timing_max_entries = first->lwlock_hash.max_entries;
+			wait_event_timing_hash_size = first->lwlock_hash.hash_size;
+			wait_event_timing_per_backend_stride =
+				wait_event_timing_slot_size(wait_event_timing_max_entries);
+		}
+	}
+	PG_FINALLY();
+	{
+		in_attach = false;
+	}
+	PG_END_TRY();
+
+	return WaitEventTimingArray != NULL;
+}
+
+/*
+ * Point my_wait_event_timing at this backend's slot within the shared
+ * timing array, allocating the array in DSA on first call.
+ *
+ * Called from the hot path entry points pgstat_report_wait_start_timing()
+ * and pgstat_report_wait_end_timing() when wait_event_capture is non-OFF
+ * and my_wait_event_timing is still NULL.  After the first successful
+ * attach, my_wait_event_timing stays non-NULL for the backend's lifetime,
+ * so this function is reached only on the cold first-attach path.
+ *
+ * May return with my_wait_event_timing still NULL: MyProc not set, a
+ * critical section is open, MyProc is queued on an LWLock, the proc
+ * number is out of range, or the array attach failed.
+ */
+static void
+pgstat_wait_event_timing_lazy_attach(void)
+{
+	int			procNumber;
+	WaitEventTimingState *slot;
+
+	if (my_wait_event_timing != NULL)
+		return;
+
+	if (MyProc == NULL)
+		return;
+
+	/*
+	 * Lazy attach allocates memory (via wait_event_timing_attach_array ->
+	 * dsa_attach -> dsm_attach -> MemoryContextAlloc).  In a critical
+	 * section, MemoryContextAlloc Assert-fails on
+	 * "CritSectionCount == 0 || allowInCritSection".  A backend's very
+	 * first wait event after wait_event_capture is enabled can land
+	 * inside a critical section -- e.g. a parallel worker that hasn't
+	 * yet emitted any wait events does so for the first time in
+	 * BufferSetHintBits16 -> XLogSaveBufferForHint -> XLogInsert ->
+	 * LWLockAcquire, with XLogInsert holding a critical section.
+	 *
+	 * Skipping the attach in that case silently drops the in-flight
+	 * wait event but keeps the backend alive.  The very next wait
+	 * event outside any critical section will hit this function again
+	 * and attach successfully, after which the hot path no longer
+	 * routes through here.  Wait events emitted inside critical
+	 * sections are by their nature brief, infrequent (critical
+	 * sections are short by design), and would be dropped anyway if
+	 * the backend exited from a crash here -- so losing them at the
+	 * very-first-attach moment is an acceptable tradeoff against the
+	 * Assert-induced abort.
+	 */
+	if (CritSectionCount > 0)
+		return;
+
+	/*
+	 * Skip the attach if MyProc is already on an LWLock wait queue.
+	 * The wait-event hot path that called us runs INSIDE
+	 * LWLockAcquire after LWLockQueueSelf has set
+	 * MyProc->lwWaiting = LW_WS_WAITING, but BEFORE the actual
+	 * sleep.  Our wait_event_timing_attach_array calls
+	 * dsa_attach -> LWLockAcquire on its own LWLock; if that lock
+	 * is contended the nested LWLockQueueSelf would hit the
+	 * "queueing for lock while waiting on another one" PANIC at
+	 * lwlock.c:1029 (Assert(MyProc->lwWaiting == LW_WS_NOT_WAITING)
+	 * just before the queue insert).
+	 *
+	 * NOTE(review): the hard-coded line number will drift; consider
+	 * naming only LWLockQueueSelf().  Also confirm whether the check
+	 * there is an Assert or an elog(PANIC) -- the text above says both.
+	 *
+	 * Skipping here drops the in-flight wait event from timing
+	 * stats but keeps the backend alive.  The next wait event
+	 * outside any LWLock-wait context will retry the attach
+	 * successfully; in practice every backend hits an uncontended
+	 * latch or PgSleep wait well within its first few seconds,
+	 * so the lost stats are at most a handful of contention
+	 * waits at backend startup.
+	 */
+	if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
+		return;
+
+	procNumber = GetNumberFromPGProc(MyProc);
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	if (!wait_event_timing_attach_array(true))
+		return;
+
+	slot = wet_slot(procNumber);
+
+	/*
+	 * Clear this backend's slot the first time it is used after backend
+	 * start.  The DSA-allocated region is zeroed on creation, but a later
+	 * backend may inherit a slot previously occupied by an exited
+	 * backend; explicit zero here keeps stats accurate across slot reuse.
+	 * Matches the old per-backend init performed by
+	 * pgstat_set_wait_event_timing_storage() in the eager-shmem design.
+	 *
+	 * Initialisation order: zero the slot through the local `slot` first,
+	 * THEN publish the result to my_wait_event_timing.  This keeps the
+	 * single-backend invariant clean: at no point in this backend can
+	 * `my_wait_event_timing != NULL` coincide with `*my_wait_event_timing`
+	 * being partially initialised.  The hot-path inline gate
+	 *
+	 *		if (unlikely(my_wait_event_timing == NULL))
+	 *			pgstat_wait_event_timing_lazy_attach();
+	 *		... my_wait_event_timing->wait_start = ... ;
+	 *
+	 * relies on that ordering: a non-NULL pointer means the slot is
+	 * ready for the very next store.
+	 *
+	 * Note that cross-backend readers do NOT go through
+	 * my_wait_event_timing -- they index WaitEventTimingArray[procNumber]
+	 * directly via pgstat_get_wait_event_timing(), guarded by
+	 * pgstat_get_beentry_by_proc_number() which filters dead/recycled
+	 * slots.  So this reordering is a same-backend tidiness fix; it does
+	 * not address (and does not need to address) any cross-backend
+	 * publication ordering, of which there is none.
+	 */
+	memset(slot->events, 0, sizeof(slot->events));
+	lwlock_timing_hash_clear(slot);
+	slot->reset_count = 0;
+	slot->lwlock_overflow_count = 0;
+	slot->flat_overflow_count = 0;
+	slot->current_event = 0;
+	INSTR_TIME_SET_ZERO(slot->wait_start);
+
+	my_last_reset_generation = pg_atomic_read_u32(&slot->reset_generation);
+
+	/* Publish only after the slot is fully initialised. */
+	my_wait_event_timing = slot;
+
+	/*
+	 * Register a before_shmem_exit callback that disables the inline hot
+	 * path BEFORE dsm_backend_shutdown unmaps the DSA segment that backs
+	 * the slot.  The callback sets wait_event_timing_writes_disabled; it
+	 * deliberately does not null my_wait_event_timing (see
+	 * pgstat_wait_event_timing_before_shmem_exit).  Without this,
+	 * late-shutdown wait events (e.g. ProcArrayLock
+	 * contention inside ProcArrayRemove during shmem_exit) fire the
+	 * inline hot path, dereference the now-dangling slot pointer through
+	 * INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start), and
+	 * segfault.  Callbacks run in LIFO order; dsm_backend_shutdown is
+	 * registered very early in InitProcess so it always runs AFTER this
+	 * one, giving us a safe window to raise the gate.
+	 *
+	 * NOTE(review): as far as I can tell shmem_exit() invokes
+	 * dsm_backend_shutdown() directly once the before_shmem_exit
+	 * callbacks have run, rather than via a callback registered in
+	 * InitProcess -- confirm against ipc.c and reword the sentence
+	 * above.  The ordering conclusion would still hold.
+	 */
+	{
+		static bool registered = false;
+
+		if (!registered)
+		{
+			before_shmem_exit(pgstat_wait_event_timing_before_shmem_exit,
+							  (Datum) 0);
+			registered = true;
+		}
+	}
+}
+
+/*
+ * before_shmem_exit callback.  Disables the inline hot path so it
+ * does NOT dereference my_wait_event_timing during the rest of the
+ * proc_exit cascade (after dsm_backend_shutdown unmaps the DSA
+ * segment behind that pointer).  We deliberately do NOT null out
+ * my_wait_event_timing here: a NULL pointer would route the hot
+ * path through the lazy-attach branch, which then re-attaches a
+ * fresh slot using DSA primitives that themselves operate on
+ * already-detached memory.  Setting the gate flag stops both the
+ * dereference and the re-attach.
+ *
+ * code and arg are unused.
+ */
+static void
+pgstat_wait_event_timing_before_shmem_exit(int code, Datum arg)
+{
+	wait_event_timing_writes_disabled = true;
+}
+
+/*
+ * Report the shared memory space needed for trace ring buffer control.
+ * Only a small control struct is in fixed shmem; the actual ring buffers
+ * are allocated lazily via DSA.  At ~24 bytes/slot, the slot array adds
+ * ~26 KB at a default MaxBackends, negligible compared to the ring
+ * memory itself.
+ */
+static Size
+WaitEventTraceControlShmemSize(void)
+{
+    Size        slot_array_bytes;
+
+    /* One WaitEventTraceSlot per timing slot ... */
+    slot_array_bytes = mul_size(NUM_WAIT_EVENT_TIMING_SLOTS,
+                                sizeof(WaitEventTraceSlot));
+
+    /* ... placed after the fixed header that precedes trace_slots[]. */
+    return add_size(offsetof(WaitEventTraceControl, trace_slots),
+                    slot_array_bytes);
+}
+
+/*
+ * Shmem request callback: ask for the "WaitEventTraceControl" struct and
+ * have its address stored in WaitEventTraceCtl.
+ */
+static void
+WaitEventTraceControlShmemRequest(void *arg)
+{
+    ShmemRequestStruct(.name = "WaitEventTraceControl",
+                       .size = WaitEventTraceControlShmemSize(),
+                       .ptr = (void **) &WaitEventTraceCtl);
+}
+
+/*
+ * Shmem init callback: put the trace control struct into its initial
+ * state -- no DSA handle published, lock initialised, and every slot
+ * FREE with generation 0 and no ring attached.
+ */
+static void
+WaitEventTraceControlShmemInit(void *arg)
+{
+    WaitEventTraceSlot *slot = WaitEventTraceCtl->trace_slots;
+    WaitEventTraceSlot *const slots_end = slot + NUM_WAIT_EVENT_TIMING_SLOTS;
+
+    WaitEventTraceCtl->trace_dsa_handle = DSA_HANDLE_INVALID;
+    LWLockInitialize(&WaitEventTraceCtl->lock,
+                     LWTRANCHE_WAIT_EVENT_TRACE_DSA);
+
+    for (; slot < slots_end; slot++)
+    {
+        pg_atomic_init_u64(&slot->generation, 0);
+        pg_atomic_init_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+        slot->pad = 0;
+        slot->ring_ptr = InvalidDsaPointer;
+    }
+}
+
+/* Callback table handed to the shmem machinery for the trace control struct. */
+const ShmemCallbacks WaitEventTraceControlShmemCallbacks = {
+    .request_fn = WaitEventTraceControlShmemRequest,
+    .init_fn = WaitEventTraceControlShmemInit,
+};
+
+/*
+ * Ensure the shared DSA for trace ring buffers exists and is attached.
+ * Creates it on first call (any backend), attaches on subsequent calls.
+ * Must be called from a backend context (not postmaster).
+ */
+static void
+wait_event_trace_ensure_dsa(void)
+{
+    MemoryContext saved_cxt;
+    dsa_handle  published;
+
+    /* Already created or attached earlier in this backend. */
+    if (trace_dsa != NULL)
+        return;
+
+    /* Run dsa_create/dsa_attach with TopMemoryContext current. */
+    saved_cxt = MemoryContextSwitchTo(TopMemoryContext);
+
+    /* Create-or-attach is decided and performed under the control lock. */
+    LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+    published = WaitEventTraceCtl->trace_dsa_handle;
+    if (published != DSA_HANDLE_INVALID)
+    {
+        /* A handle is already published: attach to that area. */
+        trace_dsa = dsa_attach(published);
+        dsa_pin_mapping(trace_dsa);
+    }
+    else
+    {
+        /* No handle yet: create the area, pin it, then publish its handle. */
+        trace_dsa = dsa_create(LWTRANCHE_WAIT_EVENT_TRACE_DSA);
+        dsa_pin(trace_dsa);
+        dsa_pin_mapping(trace_dsa);
+        WaitEventTraceCtl->trace_dsa_handle = dsa_get_handle(trace_dsa);
+    }
+
+    LWLockRelease(&WaitEventTraceCtl->lock);
+
+    MemoryContextSwitchTo(saved_cxt);
+}
+
+/*
+ * Transition our trace ring slot to ORPHANED on backend exit.
+ *
+ * Registered as a before_shmem_exit callback.  Runs BEFORE
+ * dsm_backend_shutdown() detaches the DSA.
+ *
+ * Crucially, we do NOT free the ring here.  The ring stays allocated in
+ * DSA so that cross-backend consumers -- the in-tree
+ * pg_get_wait_event_trace SRF and any extension following the
+ * snapshot pattern documented on WaitEventTraceControl -- can read
+ * the dying backend's final waits.  The original "free at exit"
+ * design lost data the instant a worker terminated, which was
+ * particularly bad for parallel workers exiting in milliseconds at
+ * end-of-parallel-query.  See the lifecycle comment on
+ * WaitEventTraceControl for the full design
+ * rationale and the bounded-memory cost we accept in exchange.
+ *
+ * The ORPHANED slot is reclaimed in one of two ways:
+ *   (a) a new backend at this procNumber calls
+ *       wait_event_trace_clear_orphan_at_init() at backend init, or
+ *   (b) the DBA calls pg_stat_clear_orphaned_wait_event_rings().
+ *
+ * State transition order matters: the generation is bumped BEFORE the
+ * new state is stored.  A cross-backend reader snapshots
+ * (generation_before, state, ring_ptr, generation_after) and treats the
+ * (state, ring_ptr) pair as consistent only if the generation did not
+ * change across the read.  We hold the lock for the whole transition;
+ * readers only take it briefly to copy ring contents, so the
+ * generation check is what protects their unlocked reads.
+ */
+static void
+wait_event_trace_before_shmem_exit(int code, Datum arg)
+{
+    int         slotno = DatumGetInt32(arg);
+    WaitEventTraceSlot *slot;
+
+    /* Nothing to do without the control struct or a usable slot index. */
+    if (WaitEventTraceCtl == NULL ||
+        slotno < 0 || slotno >= NUM_WAIT_EVENT_TIMING_SLOTS)
+        return;
+
+    slot = &WaitEventTraceCtl->trace_slots[slotno];
+
+    /*
+     * From here on this backend must not write to a trace ring again.
+     * Once the slot is published as ORPHANED the ring is read-only
+     * post-mortem data, and cross-backend readers do not expect
+     * concurrent writes from the dying owner.  We are in proc_exit, so
+     * the flag is never reset.
+     */
+    wait_event_trace_writes_disabled = true;
+
+    /*
+     * Unlocked fast path: if we do not hold an OWNED slot (capture was
+     * never at trace, or the ring was already released back to FREE by
+     * assign_wait_event_capture), there is no transition to perform.
+     * The authoritative re-check is done under the lock below.
+     */
+    if (pg_atomic_read_u32(&slot->state) != WAIT_EVENT_TRACE_SLOT_OWNED)
+    {
+        my_wait_event_trace = NULL;
+        return;
+    }
+
+    LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+    /* Second line of defense behind the writes-disabled flag. */
+    my_wait_event_trace = NULL;
+
+    if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_OWNED &&
+        DsaPointerIsValid(slot->ring_ptr))
+    {
+        /*
+         * Generation first, so a reader holding the old generation
+         * discards its snapshot; then publish ORPHANED.  ring_ptr is
+         * left valid on purpose: the ring contents are what we keep.
+         */
+        pg_atomic_fetch_add_u64(&slot->generation, 1);
+        pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_ORPHANED);
+    }
+
+    LWLockRelease(&WaitEventTraceCtl->lock);
+}
+
+/*
+ * Allocate (or re-acquire) a trace ring buffer for this backend via DSA.
+ * Called when wait_event_capture is set to 'trace'.
+ *
+ * Slot state at entry will be one of:
+ *
+ *   FREE      fresh slot (or one cleared on this backend's init by
+ *             wait_event_trace_clear_orphan_at_init): allocate a new
+ *             ring, transition slot to OWNED, bump generation.
+ *
+ *   OWNED     not expected in practice: stepping down from trace goes
+ *             through wait_event_trace_release_slot, which returns the
+ *             slot to FREE and clears my_wait_event_trace, so a later
+ *             re-enable starts from FREE; and procNumber is
+ *             per-backend, so no other live backend can own this slot.
+ *             We still tolerate this state defensively by re-mapping the
+ *             existing ring rather than leaking a second allocation.
+ *
+ *   ORPHANED  normally not inherited: backend init demotes a previous
+ *             occupant's orphan to FREE via
+ *             wait_event_trace_clear_orphan_at_init().  This backend
+ *             can, however, observe its own slot as ORPHANED once its
+ *             exit callback has run and a later wait event reaches this
+ *             function.  In that state the attach is skipped and the
+ *             wait event is not traced.
+ */
+void
+wait_event_trace_attach(int procNumber)
+{
+    /*
+     * Re-entrancy guard.  dsa_create / dsa_allocate_extended can emit
+     * wait events internally; those reach the lazy-attach hot path,
+     * which would call back in here while we hold
+     * WaitEventTraceCtl->lock or are mid-allocation.
+     */
+    static bool in_attach = false;
+    /* The exit callback is registered at most once per backend. */
+    static bool exit_hook_registered = false;
+    WaitEventTraceSlot *slot;
+
+    if (in_attach || WaitEventTraceCtl == NULL)
+        return;
+
+    if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+        return;
+
+    /*
+     * Not inside a critical section: the DSA calls below can allocate
+     * memory, which Assert-fails there.  The first wait event after
+     * wait_event_capture = trace can land inside one (e.g. hint-bit WAL
+     * logging under XLogInsert).  The in-flight event is dropped; the
+     * next wait event outside a critical section retries.
+     */
+    if (CritSectionCount > 0)
+        return;
+
+    /*
+     * Not while MyProc is already queued on an LWLock: the hot path
+     * that calls us runs after LWLockQueueSelf has set lwWaiting, and a
+     * nested LWLockAcquire on our own lock (via
+     * wait_event_trace_ensure_dsa) would PANIC.
+     */
+    if (MyProc != NULL && MyProc->lwWaiting != LW_WS_NOT_WAITING)
+        return;
+
+    slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+    in_attach = true;
+    PG_TRY();
+    {
+        uint32      observed = pg_atomic_read_u32(&slot->state);
+
+        /*
+         * ORPHANED means this backend already ran
+         * wait_event_trace_before_shmem_exit() and a later exit
+         * callback emitted a wait event, or init-time cleanup of an
+         * inherited orphan did not succeed.  Either way the ring is
+         * post-mortem data for readers: do not re-attach.  PG_FINALLY
+         * below clears in_attach.
+         */
+        if (observed != WAIT_EVENT_TRACE_SLOT_ORPHANED)
+        {
+            bool        reuse_ring;
+
+            reuse_ring = (observed == WAIT_EVENT_TRACE_SLOT_OWNED &&
+                          DsaPointerIsValid(slot->ring_ptr));
+
+            wait_event_trace_ensure_dsa();
+
+            if (reuse_ring)
+            {
+                /* Already have a ring buffer; re-map to it. */
+                my_wait_event_trace = dsa_get_address(trace_dsa,
+                                                      slot->ring_ptr);
+                my_trace_proc_number = procNumber;
+            }
+            else
+            {
+                dsa_pointer ring;
+                WaitEventTraceState *ring_state;
+                Size        ring_bytes;
+
+                /*
+                 * Cache the ring size (in records) on first allocation
+                 * in this backend.  wait_event_trace_ring_size_kb is
+                 * PGC_POSTMASTER, so every ring in this postmaster run
+                 * has the same dimensions.
+                 */
+                if (WaitEventTraceRingSize == 0)
+                    WaitEventTraceRingSize =
+                        (uint32) wait_event_trace_ring_size_kb * 1024U /
+                        (uint32) sizeof(WaitEventTraceRecord);
+
+                ring_bytes = offsetof(WaitEventTraceState, records) +
+                    (Size) WaitEventTraceRingSize * sizeof(WaitEventTraceRecord);
+
+                ring = dsa_allocate_extended(trace_dsa, ring_bytes,
+                                             DSA_ALLOC_ZERO);
+                ring_state = dsa_get_address(trace_dsa, ring);
+                pg_atomic_init_u64(&ring_state->write_pos, 0);
+                ring_state->ring_mask = WaitEventTraceRingSize - 1;
+
+                /*
+                 * Publish ring_ptr BEFORE the OWNED state, so a reader
+                 * that sees OWNED outside the lock also sees a valid
+                 * ring_ptr.  The generation bump comes last so readers
+                 * holding the prior generation notice the change.
+                 */
+                LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+                slot->ring_ptr = ring;
+                pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_OWNED);
+                pg_atomic_fetch_add_u64(&slot->generation, 1);
+                LWLockRelease(&WaitEventTraceCtl->lock);
+
+                my_wait_event_trace = ring_state;
+                my_trace_proc_number = procNumber;
+
+                /*
+                 * Register the exit-time OWNED -> ORPHANED transition.
+                 * This branch can run once per off/stats -> trace
+                 * re-enable cycle, so guard the registration to avoid
+                 * growing the before_shmem_exit list.  The callback
+                 * does nothing when the slot is not OWNED.
+                 */
+                if (!exit_hook_registered)
+                {
+                    before_shmem_exit(wait_event_trace_before_shmem_exit,
+                                      Int32GetDatum(procNumber));
+                    exit_hook_registered = true;
+                }
+            }
+        }
+    }
+    PG_FINALLY();
+    {
+        in_attach = false;
+    }
+    PG_END_TRY();
+}
+
+/*
+ * Forget this backend's trace ring on exit.
+ *
+ * Only the backend-local pointers are cleared here; procNumber is not
+ * used.  The shared slot is handled by
+ * wait_event_trace_before_shmem_exit(), which marks it ORPHANED and
+ * leaves the ring allocated rather than freeing it.
+ */
+static void
+wait_event_trace_detach(int procNumber)
+{
+    my_trace_proc_number = -1;
+    my_wait_event_trace = NULL;
+}
+
+/*
+ * Release this backend's trace ring buffer back to DSA immediately.
+ *
+ * Called from assign_wait_event_capture when the user steps down from
+ * TRACE to STATS or OFF.  Without this, a ~4 MB ring allocated by a
+ * brief investigation would remain pinned for the rest of the session's
+ * lifetime, which can leak gigabytes across large connection pools.
+ *
+ * Important contrast with wait_event_trace_before_shmem_exit: backend
+ * exit transitions the slot to ORPHANED (preserving data for
+ * cross-backend readers); release_slot fully frees and returns to FREE
+ * because the operator has explicitly disabled trace -- they have
+ * affirmatively decided not to keep the data, so we honour that and
+ * reclaim the memory immediately.  Subsequent re-enable allocates a
+ * fresh ring via wait_event_trace_attach's allocate branch.
+ *
+ * The operation is intended to be safe from a GUC assign hook: it takes
+ * our control lock and calls dsa_free, with no allocation of its own.
+ * PG_TRY/PG_FINALLY below restores the local guard flags even if
+ * something underneath does raise.
+ *
+ * If pg_get_backend_wait_event_trace is currently iterating our own ring
+ * (wait_event_trace_srf_in_progress), we must NOT free the chunk out
+ * from under it: that would be a use-after-free on the records[] the SRF
+ * is still reading.  Set wait_event_trace_release_pending instead and
+ * return; the SRF's PG_FINALLY block is expected to perform the deferred
+ * free after iteration completes.
+ *
+ * wait_event_trace_writes_disabled is raised for the duration of the
+ * release and then restored to its PREVIOUS value, not forced to false.
+ * wait_event_trace_before_shmem_exit() raises the same flag for the
+ * rest of the backend's life, and that must survive a release that runs
+ * after it.
+ */
+static void
+wait_event_trace_release_slot(int procNumber)
+{
+    /*
+     * Re-entrancy guard.  dsa_free takes a DSA-internal LWLock which can
+     * in principle emit a wait event; if a nested assign hook re-enters
+     * we must not recurse.
+     */
+    static bool in_release = false;
+    WaitEventTraceSlot *slot;
+    bool        writes_were_disabled;
+
+    if (in_release)
+        return;
+
+    if (WaitEventTraceCtl == NULL || trace_dsa == NULL)
+        return;
+
+    /* Same-backend SRF is reading our ring: defer the free to it. */
+    if (wait_event_trace_srf_in_progress)
+    {
+        wait_event_trace_release_pending = true;
+        return;
+    }
+
+    if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+        return;
+
+    slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+    in_release = true;
+
+    /*
+     * Disable trace-ring writes before touching the lock or calling
+     * dsa_free.  A wait event dispatched from inside dsa_free could
+     * otherwise still see capture at TRACE (the assign hook is in
+     * flight) and write into the chunk being returned to the freelist.
+     * Remember the prior value so PG_FINALLY can put it back.
+     */
+    writes_were_disabled = wait_event_trace_writes_disabled;
+    wait_event_trace_writes_disabled = true;
+
+    PG_TRY();
+    {
+        LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+        /*
+         * Drop the local pointer BEFORE dsa_free as a second line of
+         * defense behind the writes-disabled flag.
+         */
+        my_wait_event_trace = NULL;
+
+        if (DsaPointerIsValid(slot->ring_ptr))
+        {
+            /*
+             * Bump generation first to invalidate any concurrent
+             * cross-backend snapshot, then free, then publish FREE with
+             * an invalid ring_ptr.  The order matters for unlocked
+             * readers that have already passed the state check.
+             */
+            pg_atomic_fetch_add_u64(&slot->generation, 1);
+            dsa_free(trace_dsa, slot->ring_ptr);
+            slot->ring_ptr = InvalidDsaPointer;
+            pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+        }
+        LWLockRelease(&WaitEventTraceCtl->lock);
+    }
+    PG_FINALLY();
+    {
+        /* Restore, don't clear: an exit-time disable must stay in force. */
+        wait_event_trace_writes_disabled = writes_were_disabled;
+        in_release = false;
+    }
+    PG_END_TRY();
+}
+
+/*
+ * Clear an orphaned trace ring at backend init time.
+ *
+ * Called from pgstat_set_wait_event_timing_storage() once the new
+ * backend has its procNumber.  If the slot we're inheriting was left
+ * ORPHANED by a previous backend (because we deliberately do not free
+ * trace rings on backend exit -- see the lifecycle discussion on
+ * WaitEventTraceControl), free the ring now so the new backend starts
+ * with a clean FREE slot.
+ * Subsequent wait_event_trace_attach() calls
+ * (when this backend itself enables trace) will then take the
+ * allocate branch.
+ *
+ * No-op when the slot is FREE (nothing to clear) or OWNED.  OWNED is
+ * not expected at backend init, since only a not-yet-exited backend can
+ * leave a slot OWNED and procNumbers are assigned exclusively; it is
+ * asserted against in debug builds and skipped in production.
+ *
+ * Robustness: this runs during InitProcess(), and the work it does --
+ * attaching to the DSA and dsa_free() -- can raise ERROR on rare
+ * runtime failures.  An uncaught ERROR here would abort backend
+ * startup even for sessions that never use wait_event_capture.  The
+ * body is therefore wrapped in PG_TRY()/PG_CATCH(): any error is
+ * downgraded to a WARNING with a hint pointing at the admin sweep
+ * function, and startup continues.  The orphan stays in place and can
+ * be reclaimed by the next backend that inherits the same procNumber,
+ * by pg_stat_clear_orphaned_wait_event_rings(), or at the next cluster
+ * restart.
+ */
+static void
+wait_event_trace_clear_orphan_at_init(int procNumber)
+{
+    WaitEventTraceSlot *slot;
+    uint32      inherited_state;
+    MemoryContext outer_cxt;
+
+    if (WaitEventTraceCtl == NULL ||
+        procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+        return;
+
+    slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+    inherited_state = pg_atomic_read_u32(&slot->state);
+    if (inherited_state != WAIT_EVENT_TRACE_SLOT_ORPHANED)
+    {
+        Assert(inherited_state != WAIT_EVENT_TRACE_SLOT_OWNED);
+        return;
+    }
+
+    /*
+     * Remember the caller's memory context.  The catch path copies the
+     * error data before FlushErrorState(), which resets ErrorContext;
+     * the copy must therefore live in some other context.
+     */
+    outer_cxt = CurrentMemoryContext;
+
+    PG_TRY();
+    {
+        /*
+         * dsa_free needs the dsa_area pointer, so attach to the shared
+         * trace DSA first.  A slot can only be ORPHANED if some backend
+         * allocated a ring, so the published handle should be valid.
+         */
+        wait_event_trace_ensure_dsa();
+
+        LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+        if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_ORPHANED &&
+            DsaPointerIsValid(slot->ring_ptr))
+        {
+            /* Same order as the other transitions: generation first. */
+            pg_atomic_fetch_add_u64(&slot->generation, 1);
+            dsa_free(trace_dsa, slot->ring_ptr);
+            slot->ring_ptr = InvalidDsaPointer;
+            pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+        }
+        LWLockRelease(&WaitEventTraceCtl->lock);
+    }
+    PG_CATCH();
+    {
+        ErrorData  *err;
+
+        /*
+         * WaitEventTraceCtl->lock may still be held: either dsa_free
+         * raised after our LWLockAcquire, or
+         * wait_event_trace_ensure_dsa() raised inside its own locked
+         * region.  Per the design notes for this function, no
+         * transaction-abort cleanup runs this early in InitProcess(),
+         * so release explicitly; otherwise the lock would stay held for
+         * this backend's lifetime and block every later LW_EXCLUSIVE
+         * acquirer.  The caller is assumed to hold no other LWLocks
+         * across this call.
+         */
+        LWLockReleaseAll();
+
+        /* Copy the error outside ErrorContext, then clear error state. */
+        MemoryContextSwitchTo(outer_cxt);
+        err = CopyErrorData();
+        FlushErrorState();
+
+        ereport(WARNING,
+                (errcode(err->sqlerrcode),
+                 errmsg("could not clear orphaned wait-event trace ring "
+                        "at backend init: %s", err->message),
+                 errdetail("Backend startup proceeds with the orphan "
+                           "still allocated for procnumber %d.",
+                           procNumber),
+                 errhint("Run pg_stat_clear_orphaned_wait_event_rings() "
+                         "to release the orphan when the underlying "
+                         "condition is resolved.")));
+
+        FreeErrorData(err);
+    }
+    PG_END_TRY();
+}
+
+/*
+ * GUC check hook for wait_event_capture (timing build).
+ *
+ * Every value is accepted here; side effects (releasing the trace ring,
+ * warning about track_activities, etc.) belong to the assign hook.
+ */
+bool
+check_wait_event_capture(int *newval, void **extra, GucSource source)
+{
+    return true;
+}
+
+/*
+ * GUC assign hook for wait_event_capture.
+ *
+ * Three responsibilities, all correctness- or resource-critical:
+ *
+ * 1) Drop any in-flight wait state.  After the capture level changes,
+ * the existing wait_start / current_event in our per-backend slot can
+ * no longer be trusted.  Consider this sequence:
+ *
+ *   capture = STATS, wait on E1 starts   -> wait_start=T0, current_event=E1
+ *   capture flips to OFF mid-wait
+ *   wait_end inline skips (guard fails)  -> state still T0/E1
+ *   new wait on E2 starts under OFF      -> inline skips, state still T0/E1
+ *   capture flips back to STATS
+ *   wait_end for E2                      -> guard passes, credits (now - T0) to E1
+ *
+ * Zeroing both fields on every assignment forfeits at most one
+ * in-flight sample per GUC change (negligible) but eliminates all
+ * such miscredits.
+ *
+ * 2) Release the trace ring buffer when stepping down from TRACE.
+ * The per-backend trace ring is ~4 MB of DSA memory, and leaving it
+ * pinned for the rest of the session's lifetime leaks shmem across
+ * large connection pools that briefly enable trace.
+ * Freeing here makes "wait_event_capture = off" semantically release
+ * resources.  The next re-enable re-allocates a fresh ring on first
+ * wait event via wait_event_trace_attach.
+ *
+ * 3) Warn (but never error) about secondary preconditions for TRACE
+ * level.  GUC assign hooks MUST NOT ereport(ERROR) -- see
+ * src/backend/utils/misc/README.  In particular, the trace ring's DSA
+ * allocation is NOT performed here (it can raise on OOM); the ring is
+ * attached lazily from the wait-event reporting path instead.
+ *
+ * The slot reset in (1) is skipped once
+ * wait_event_timing_writes_disabled is set, matching the gate in
+ * pgstat_report_wait_start_timing() and pgstat_report_wait_end_timing():
+ * after the exit callback raises that flag, my_wait_event_timing is
+ * deliberately left non-NULL and must not be dereferenced any more.
+ */
+void
+assign_wait_event_capture(int newval, void *extra)
+{
+    /* (1) Forget any in-flight wait, unless slot writes are shut off. */
+    if (my_wait_event_timing != NULL && !wait_event_timing_writes_disabled)
+    {
+        INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start);
+        my_wait_event_timing->current_event = 0;
+    }
+
+    if (newval != WAIT_EVENT_CAPTURE_TRACE)
+    {
+        /*
+         * (2) Step-down from TRACE: release the ring now instead of at
+         * backend exit.  Only fires when a ring is actually attached,
+         * so OFF -> TRACE -> OFF without any trace record is a no-op.
+         */
+        if (my_wait_event_trace != NULL)
+            wait_event_trace_release_slot(my_trace_proc_number);
+        return;
+    }
+
+    /* (3) TRACE level: warn about settings that query attribution needs. */
+    if (!pgstat_track_activities)
+        ereport(WARNING,
+                (errmsg("wait_event_capture = trace query attribution "
+                        "requires track_activities to be enabled")));
+
+    if (compute_query_id == COMPUTE_QUERY_ID_OFF)
+        ereport(WARNING,
+                (errmsg("wait_event_capture = trace query attribution "
+                        "requires compute_query_id to be enabled"),
+                 errhint("Set compute_query_id to \"on\" or \"auto\", or "
+                         "load an extension that enables it (e.g. "
+                         "pg_stat_statements).")));
+}
+
+/*
+ * Record this backend's procNumber for later lazy attach, and reclaim
+ * any orphaned trace ring left in that slot.
+ * Called from InitProcess() after the backend has a valid procNumber.
+ *
+ * procNumber is the PGPROC array index (from GetNumberFromPGProc).
+ * Covers both regular backends and auxiliary processes (bgwriter,
+ * checkpointer, walwriter, etc.).
+ *
+ * Nothing is attached here.  The timing array and the trace ring are
+ * both set up lazily, so a backend that never enables capture pays no
+ * shmem cost; my_wait_event_timing and my_wait_event_trace start out
+ * NULL.  An out-of-range procNumber leaves the backend with no slot
+ * (my_trace_proc_number = -1).
+ */
+void
+pgstat_set_wait_event_timing_storage(int procNumber)
+{
+    bool        in_range = (procNumber >= 0 &&
+                            procNumber < NUM_WAIT_EVENT_TIMING_SLOTS);
+
+    my_wait_event_timing = NULL;
+    my_wait_event_trace = NULL;
+    my_trace_proc_number = in_range ? procNumber : -1;
+
+    if (!in_range)
+        return;
+
+    /*
+     * If the previous occupant of this procNumber was a tracing backend
+     * that exited, its ring is still allocated in DSA in ORPHANED state
+     * (see wait_event_trace_before_shmem_exit).  Free it now so this
+     * backend starts from a FREE slot and a later attach allocates a
+     * fresh ring instead of appending to a previous backend's records.
+     */
+    wait_event_trace_clear_orphan_at_init(procNumber);
+}
+
+/*
+ * Detach from timing state on backend exit.
+ *
+ * This function is invoked from ProcKill() as an on_shmem_exit callback,
+ * which runs AFTER dsm_backend_shutdown() has detached DSA mappings.
+ * Writing through my_wait_event_timing at this point would touch
+ * DSA-backed memory that is no longer mapped and would segfault.
+ *
+ * We therefore only clear the backend-local pointers here.  The shared
+ * slot itself is dealt with elsewhere:
+ *   - the next backend to attach to the slot zeroes it (lazy_attach),
+ *   - SRF readers filter dead backends via
+ *     pgstat_get_beentry_by_proc_number, so stale data in the slot
+ *     never becomes user-visible.
+ */
+void
+pgstat_reset_wait_event_timing_storage(void)
+{
+    /*
+     * Trace ring: the shared slot was already handled by the
+     * before_shmem_exit callback; wait_event_trace_detach only clears
+     * backend-local pointers.
+     */
+    if (my_trace_proc_number >= 0)
+        wait_event_trace_detach(my_trace_proc_number);
+
+    my_wait_event_timing = NULL;
+    my_wait_event_trace = NULL;
+    my_trace_proc_number = -1;
+}
+
+/*
+ * Out-of-line body for pgstat_report_wait_start() timing path.
+ *
+ * Called when wait_event_capture != OFF.  The inline gate keeps just
+ * one global load + branch at every call site; everything else -- the
+ * writes-disabled check, lazy attach, INSTR_TIME read and current_event
+ * write -- runs here, so the cost is paid only when capture is enabled.
+ */
+void
+pgstat_report_wait_start_timing(uint32 wait_event_info)
+{
+    /*
+     * Stay out of the timing path once the before_shmem_exit callback
+     * has raised this flag, so we do not touch DSA memory whose mapping
+     * dsm_backend_shutdown has torn down.
+     */
+    if (wait_event_timing_writes_disabled)
+        return;
+
+    /*
+     * Lazy attach: the per-backend timing slot lives in a DSA created
+     * the first time any backend enables wait_event_capture.  Once the
+     * attach succeeds the cached pointer is reused, so this branch is
+     * cold.
+     */
+    if (unlikely(my_wait_event_timing == NULL))
+    {
+        pgstat_wait_event_timing_lazy_attach();
+
+        /*
+         * lazy_attach() can dispatch nested wait events while setting
+         * up DSA.  Their wait_end() calls clear my_wait_event_info to
+         * 0, so re-publish the outer wait's value to keep it visible in
+         * pg_stat_activity.
+         */
+        *(volatile uint32 *) my_wait_event_info = wait_event_info;
+    }
+
+    /* The attach may have been skipped; only record when we have a slot. */
+    if (likely(my_wait_event_timing != NULL))
+    {
+        INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start);
+        my_wait_event_timing->current_event = wait_event_info;
+    }
+}
+
+/*
+ * Out-of-line body for pgstat_report_wait_end() timing path.
+ * Called when wait_event_capture is at STATS or higher.  Performs the
+ * writes-disabled check, lazy-attach, computes wait duration,
+ * accumulates per-event stats, and (at TRACE level) writes the event
+ * into the per-session trace ring buffer.
+ *
+ * The capture_level argument is the value of wait_event_capture as
+ * observed at the inline gate.  Passing it through (rather than
+ * re-loading the global here) avoids a redundant memory load on the
+ * trace hot path: the function-call boundary defeats CSE, so without
+ * the parameter the compiler must emit a second load to test for
+ * TRACE level below.  Using the gate's view also means a concurrent
+ * GUC change cannot half-update this call -- we either ran in the
+ * old level or we don't run at all.
+ */
+void
+pgstat_report_wait_end_timing(int capture_level)
+{
+	uint32		event;
+	uint32		cur_reset_gen;
+	bool		warn_lwlock_overflow = false;
+	bool		warn_flat_overflow = false;
+
+	if (wait_event_timing_writes_disabled)
+		return;
+
+	if (unlikely(my_wait_event_timing == NULL))
+	{
+		pgstat_wait_event_timing_lazy_attach();
+		if (my_wait_event_timing == NULL)
+			return;
+	}
+
+	event = my_wait_event_timing->current_event;
+
+	/*
+	 * Fast check for a pending cross-backend reset request. Single
+	 * atomic load; almost always hits the fast path (branch well
+	 * predicted). When we detect that our shared reset_generation has
+	 * advanced, clear our own counters on behalf of the requester, then
+	 * continue with normal accumulation. wait_start is deliberately
+	 * left untouched so we don't lose the measurement that's already
+	 * running; the completing event will land in the freshly-zeroed
+	 * counters, which is the desired behaviour. current_event is safe
+	 * to zero here because the local "event" above already captured its
+	 * value before the reset block; zeroing it kills a source of stale
+	 * state that external readers would otherwise observe on the slot
+	 * between waits.
+	 */
+	cur_reset_gen = pg_atomic_read_u32(&my_wait_event_timing->reset_generation);
+	if (unlikely(cur_reset_gen != my_last_reset_generation))
+	{
+		memset(my_wait_event_timing->events, 0,
+			   sizeof(my_wait_event_timing->events));
+		lwlock_timing_hash_clear(my_wait_event_timing);
+		my_wait_event_timing->reset_count++;
+		my_wait_event_timing->lwlock_overflow_count = 0;
+		my_wait_event_timing->flat_overflow_count = 0;
+		my_wait_event_timing->current_event = 0;
+		my_last_reset_generation = cur_reset_gen;
+	}
+
+	if (event != 0 && !INSTR_TIME_IS_ZERO(my_wait_event_timing->wait_start))
+	{
+		instr_time	now;
+		int64		duration_ns;
+		int			idx;
+		WaitEventTimingEntry *entry = NULL;
+
+		INSTR_TIME_SET_CURRENT(now);
+		duration_ns = INSTR_TIME_GET_NANOSEC(now) -
+			INSTR_TIME_GET_NANOSEC(my_wait_event_timing->wait_start);
+
+		if (unlikely(duration_ns < 0))
+			duration_ns = 0;
+
+		idx = wait_event_timing_index(event);
+
+		/*
+		 * No lock needed on the hot path: each WaitEventTimingState slot
+		 * has a single writer (the owning backend), and the SRF reader
+		 * pg_stat_get_wait_event_timing() is lock-free by design. Cross-
+		 * backend reset is handled by the reset_generation check at the
+		 * top of this function: the requester bumps the atomic and the
+		 * owning backend (us) clears the counters at the next wait_end.
+		 *
+		 * Overflow is only noted in the warn_* flags here. The WARNING
+		 * itself is emitted at the very end of the function, after the
+		 * trace record has been appended and wait_start has been zeroed,
+		 * so a wait event raised from inside ereport() finds this slot
+		 * fully settled.
+		 */
+		if (idx == WAIT_EVENT_TIMING_IDX_LWLOCK)
+			entry = lwlock_timing_lookup(my_wait_event_timing,
+										 event & 0xFFFF);
+		else if (likely(idx >= 0))
+			entry = &my_wait_event_timing->events[idx];
+
+		if (likely(entry != NULL))
+		{
+			entry->count++;
+			entry->total_ns += duration_ns;
+			if (duration_ns > entry->max_ns)
+				entry->max_ns = duration_ns;
+			entry->histogram[wait_event_timing_bucket(duration_ns)]++;
+		}
+		else if (idx == WAIT_EVENT_TIMING_IDX_LWLOCK)
+		{
+			/* Only the first overflow since the last reset warns. */
+			if (my_wait_event_timing->lwlock_overflow_count++ == 0)
+				warn_lwlock_overflow = true;
+		}
+		else if (idx == -1)
+		{
+			if (my_wait_event_timing->flat_overflow_count++ == 0)
+				warn_flat_overflow = true;
+		}
+
+		/* 10046-style per-session trace ring buffer (DSA-backed) */
+		if (unlikely(capture_level == WAIT_EVENT_CAPTURE_TRACE) &&
+			likely(!wait_event_trace_writes_disabled))
+		{
+			/*
+			 * Lazy attach on first use -- allocation happens here rather
+			 * than in assign_wait_event_capture() to respect the GUC
+			 * assign-hook "must not ereport" contract. See the comment
+			 * on assign_wait_event_capture() for rationale.
+			 *
+			 * wait_event_trace_writes_disabled (checked above) also
+			 * blocks this re-attach during slot-state transitions
+			 * driven by release_slot / before_shmem_exit; without that
+			 * gate, a nested wait event mid-transition could see
+			 * my_wait_event_trace == NULL and recurse into a fresh
+			 * attach that deadlocks on the lock the outer transition
+			 * already holds.
+			 */
+			if (my_wait_event_trace == NULL && my_trace_proc_number >= 0)
+				wait_event_trace_attach(my_trace_proc_number);
+
+			if (my_wait_event_trace != NULL)
+			{
+				/*
+				 * Single-writer claim: read+write avoids the LOCK XADD that
+				 * pg_atomic_fetch_add_u64 would emit on every wait event.
+				 * See wait_event_trace_write_marker for the full rationale.
+				 */
+				uint64		pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos);
+				WaitEventTraceRecord *rec;
+				uint32		seq;
+
+				pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1);
+
+				/*
+				 * Injection point used by the regression test for the
+				 * position-encoded identity seqlock in
+				 * emit_wait_event_trace_for_procnumber(). Stalling here
+				 * widens the window between the write_pos store and the
+				 * rec->seq store, simulating the weak-memory visibility
+				 * order that would otherwise be unreachable on x86. A
+				 * cross-backend reader observing the new write_pos
+				 * while the rec->seq update has not yet happened MUST
+				 * skip this slot via the identity check; without the
+				 * identity check the reader would emit a stale record
+				 * from the previous ring cycle with the wrong ring
+				 * index. Compiled out unless --enable-injection-points
+				 * is set.
+				 */
+				INJECTION_POINT("wait-event-trace-after-write-pos", NULL);
+
+				rec = &my_wait_event_trace->records[pos & my_wait_event_trace->ring_mask];
+				seq = (uint32)(pos * 2 + 1);
+
+				rec->seq = seq;
+				pg_write_barrier(); /* release: payload stores must not rise above seq=odd */
+
+				rec->record_type = TRACE_WAIT_EVENT;
+				rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now);
+				rec->data.wait.event = event;
+				rec->data.wait.pad2 = 0;
+				rec->data.wait.duration_ns = duration_ns;
+
+				pg_write_barrier(); /* release: payload stores must land before seq=even */
+				rec->seq = seq + 1;
+			}
+		}
+
+		INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start);
+	}
+
+	/*
+	 * Emit any pending overflow warning last. By now the counters are
+	 * updated, the trace record (if any) is complete, and wait_start is
+	 * zero, so a nested wait_start/wait_end pair triggered by ereport()
+	 * cannot be ordered ahead of this event in the ring, and a stray
+	 * nested wait_end cannot account this event a second time.
+	 */
+	if (unlikely(warn_lwlock_overflow))
+		ereport(WARNING,
+				(errmsg("wait_event_timing: LWLock hash table full, "
+						"timing data for some LWLock tranches will be lost"),
+				 errhint("This backend uses more than %d distinct LWLock tranches; raise wait_event_timing_max_tranches.",
+						 wait_event_timing_max_entries)));
+	else if (unlikely(warn_flat_overflow))
+		ereport(WARNING,
+				(errmsg("wait_event_timing: event class overflow, "
+						"some events will not be timed")));
+}
+
+/*
+ * Resolve the optional pid SRF argument to a procNumber range
+ * [out_start, out_end). Returns true on success, false if the SRF
+ * should emit zero rows (unknown pid -- silent no-op, matching the
+ * pg_stat_reset_wait_event_timing convention).
+ *
+ * PID NULL -> sweep all NUM_WAIT_EVENT_TIMING_SLOTS slots.
+ * PID known -> sweep the single slot belonging to that backend.
+ * PID unknown / invalid -> emit no rows.
+ */
+static bool
+wait_event_timing_pid_range(FunctionCallInfo fcinfo,
+							int *out_start, int *out_end)
+{
+	if (PG_ARGISNULL(0))
+	{
+		*out_start = 0;
+		*out_end = NUM_WAIT_EVENT_TIMING_SLOTS;
+		return true;
+	}
+	else
+	{
+		int			target_pid = PG_GETARG_INT32(0);
+		PGPROC	   *proc;
+		int			procNumber;
+
+		/* Regular backends first, then auxiliary processes. */
+		proc = BackendPidGetProc(target_pid);
+		if (proc == NULL)
+			proc = AuxiliaryPidGetProc(target_pid);
+		if (proc == NULL)
+			return false;
+
+		procNumber = GetNumberFromPGProc(proc);
+		if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+			return false;
+
+		*out_start = procNumber;
+		*out_end = procNumber + 1;
+		return true;
+	}
+}
+
+/*
+ * SQL function: pg_stat_get_wait_event_timing(pid int4, OUT ...)
+ *
+ * Returns one row per (backend, wait_event) with non-zero counts.
+ * pid is optional: NULL means all backends; a non-NULL value restricts + * the sweep to that single backend (silently empty if the PID is + * unknown, matching pg_stat_reset_wait_event_timing(pid) semantics). + * + * The PID-filtered fast path turns the cost of cluster-wide monitoring + * loops that poll a specific PID from O(MaxBackends * events) into + * O(events) per call -- the same precedent as pg_stat_get_activity(pid). + * + * Uses InitMaterializedSRF (materialize-all) for simplicity. The result + * set is bounded by (NUM_WAIT_EVENT_TIMING_SLOTS * WAIT_EVENT_TIMING_NUM_EVENTS) + * rows, so deferred (value-per-call) mode is not needed. + */ +Datum +pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int start_idx; + int end_idx; + int backend_idx; + ArrayType *hist_array; + int64 *hist_payload; + + InitMaterializedSRF(fcinfo, 0); + + /* + * If no backend has ever enabled wait_event_capture since the last + * postmaster start, the shared timing array has not been allocated + * yet -- return zero rows rather than forcing an allocation just for + * a read. + */ + if (!wait_event_timing_attach_array(false)) + PG_RETURN_VOID(); + + if (!wait_event_timing_pid_range(fcinfo, &start_idx, &end_idx)) + PG_RETURN_VOID(); + + /* + * Allocate the histogram ArrayType once and reuse it across every row + * emitted below. Per-row we overwrite the 16 int8 payload slots via + * ARR_DATA_PTR; tuplestore_putvalues flattens the varlena into its + * stored tuple, so subsequent rewrites cannot corrupt previously + * emitted rows. Saves one palloc per row on SRFs that can easily + * produce tens of thousands of rows on large clusters. 
+ */ + { + Datum zero_elems[WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS]; + + memset(zero_elems, 0, sizeof(zero_elems)); + hist_array = construct_array_builtin(zero_elems, + WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS, + INT8OID); + hist_payload = (int64 *) ARR_DATA_PTR(hist_array); + } + + for (backend_idx = start_idx; backend_idx < end_idx; backend_idx++) + { + WaitEventTimingState *state = wet_slot(backend_idx); + PgBackendStatus *beentry; + int i; + + /* Skip dead backend slots and check permissions */ + beentry = pgstat_get_beentry_by_proc_number(backend_idx); + if (beentry == NULL) + continue; + if (!HAS_PGSTAT_PERMISSIONS(beentry->st_userid)) + continue; + + /* Emit rows from the flat array (all classes except LWLock) */ + for (i = 0; i < WAIT_EVENT_TIMING_DENSE_CLASSES; i++) + { + int base = wait_event_class_offset[i]; + int nevents = wait_event_class_nevents[i]; + uint32 classId = wait_event_dense_to_classid[i]; + int j; + + for (j = 0; j < nevents; j++) + { + WaitEventTimingEntry *entry = &state->events[base + j]; + Datum values[10]; + bool nulls[10]; + uint32 wait_event_info; + const char *event_type; + const char *event_name; + int bucket; + + if (entry->count == 0) + continue; + + /* Reconstruct wait_event_info from class and event ID */ + wait_event_info = ((uint32) classId << 24) | j; + + event_type = pgstat_get_wait_event_type(wait_event_info); + event_name = pgstat_get_wait_event(wait_event_info); + + if (event_type == NULL || event_name == NULL) + continue; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(beentry->st_procpid); + values[1] = CStringGetTextDatum(GetBackendTypeDesc(beentry->st_backendType)); + values[2] = Int32GetDatum(backend_idx); + values[3] = CStringGetTextDatum(event_type); + values[4] = CStringGetTextDatum(event_name); + values[5] = Int64GetDatum(entry->count); + values[6] = Float8GetDatum((double) entry->total_ns / 1000000.0); + values[7] = Float8GetDatum(entry->count > 0 + ? 
(double) entry->total_ns / entry->count / 1000.0 + : 0.0); + values[8] = Float8GetDatum((double) entry->max_ns / 1000.0); + + for (bucket = 0; bucket < WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS; bucket++) + hist_payload[bucket] = entry->histogram[bucket]; + values[9] = PointerGetDatum(hist_array); + + tuplestore_putvalues(rsinfo->setResult, + rsinfo->setDesc, + values, nulls); + } + } + + /* Emit rows from the LWLock hash table */ + { + LWLockTimingHashEntry *entries = wet_lwlock_hash_entries(state); + WaitEventTimingEntry *events = wet_lwlock_hash_events(state); + int hash_size = state->lwlock_hash.hash_size; + + for (i = 0; i < hash_size; i++) + { + LWLockTimingHashEntry *he = &entries[i]; + WaitEventTimingEntry *entry; + Datum values[10]; + bool nulls[10]; + uint32 wait_event_info; + const char *event_type; + const char *event_name; + int bucket; + + if (he->tranche_id == LWLOCK_TIMING_EMPTY_SLOT) + continue; + + entry = &events[he->dense_idx]; + if (entry->count == 0) + continue; + + wait_event_info = PG_WAIT_LWLOCK | he->tranche_id; + + event_type = pgstat_get_wait_event_type(wait_event_info); + event_name = pgstat_get_wait_event(wait_event_info); + + if (event_type == NULL || event_name == NULL) + continue; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(beentry->st_procpid); + values[1] = CStringGetTextDatum(GetBackendTypeDesc(beentry->st_backendType)); + values[2] = Int32GetDatum(backend_idx); + values[3] = CStringGetTextDatum(event_type); + values[4] = CStringGetTextDatum(event_name); + values[5] = Int64GetDatum(entry->count); + values[6] = Float8GetDatum((double) entry->total_ns / 1000000.0); + values[7] = Float8GetDatum(entry->count > 0 + ? 
(double) entry->total_ns / entry->count / 1000.0 + : 0.0); + values[8] = Float8GetDatum((double) entry->max_ns / 1000.0); + + for (bucket = 0; bucket < WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS; bucket++) + hist_payload[bucket] = entry->histogram[bucket]; + values[9] = PointerGetDatum(hist_array); + + tuplestore_putvalues(rsinfo->setResult, + rsinfo->setDesc, + values, nulls); + } + } + } + + PG_RETURN_VOID(); +} + +/* + * SQL function: pg_get_backend_wait_event_trace() + * + * Returns trace records from the current backend's own ring buffer. + * Cross-backend ring reading is intentionally not supported: the ring + * lives in per-backend DSA and reading another session's segment would + * require attaching/detaching under the trace control lock, which is + * the responsibility of external consumers (extensions, background + * workers). The recommended cross-backend reader pattern is documented + * on WaitEventTraceControl in wait_event_timing.h. The name mirrors + * pg_get_backend_memory_contexts() to make the session-local scope + * explicit at the API level. + * + * Same-backend coordination with wait_event_trace_release_slot uses the + * wait_event_trace_srf_in_progress / _release_pending flags rather than + * an LWLock: same-backend serialization is implicit, so a per-backend + * bool plus a deferred-free path is sufficient and avoids any of the + * cross-backend lock-hold latency that the cross-backend reader pattern + * has to manage. PG_TRY/PG_FINALLY guarantees the flag is cleared and + * any deferred dsa_free is performed even on ereport(ERROR). + * + * Uses InitMaterializedSRF (materialize-all). The ring holds up to + * WaitEventTraceRingSize records (set at server start from the + * wait_event_trace_ring_size_kb GUC; defaults to 131072 = 4 MB); + * full materialization caps the per-call cost at the ring size of + * tuplestore memory, which is acceptable for the use case this SRF + * is designed for: interactive own-session diagnostics from psql. 
+ * + * This SRF is NOT the path for cross-backend monitoring tools -- + * cross-backend readers should use pg_get_wait_event_trace for SQL + * access, or follow the shared-memory snapshot pattern documented + * on WaitEventTraceControl in wait_event_timing.h to consume the + * per-backend trace rings directly. They should NOT call this + * function via SPI. + * It is hard-coded to return only the calling backend's own ring, + * so a bgworker calling SELECT * FROM pg_backend_wait_event_trace + * would get only the bgworker's own (typically empty) ring, not the + * target backend's data. + * + * Cross-backend consumers must instead use the lock + DSA-snapshot + * pattern documented on WaitEventTraceControl in wait_event_timing.h: + * acquire WaitEventTraceCtl->lock in LW_SHARED, resolve trace_ptrs[ + * procNumber] via dsa_get_address, snapshot the records of interest + * into local memory, release the lock, then process the snapshot. + * That path bypasses this SRF entirely and is the supported + * cross-backend interface for monitoring extensions and bgworkers. + * + * value-per-call (deferred) SRF mode would let an interactive + * "SELECT ... FROM pg_backend_wait_event_trace LIMIT N" short-circuit + * the materialisation, but converting this function would require + * spanning the wait_event_trace_srf_in_progress flag (and its + * deferred-free coordination with assign_wait_event_capture; see + * issue #8) across multiple SRF callbacks plus a transaction-cleanup + * registration to handle LIMIT abandonment. The complexity is not + * justified for the diagnostic use case, especially since cross- + * backend monitoring (the consumer that would actually benefit from + * streaming) goes through the snapshot pattern above instead. + * Interactive callers who want only recent records should use + * "ORDER BY seq DESC LIMIT N" -- the LIMIT is applied after + * materialisation but the cost stays bounded by the ring size. 
+ */ +Datum +pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + WaitEventTraceState *ts; + uint64 write_pos; + uint64 read_start; + uint64 i; + + InitMaterializedSRF(fcinfo, 0); + + if (my_wait_event_trace == NULL) + PG_RETURN_VOID(); + + ts = my_wait_event_trace; + + write_pos = pg_atomic_read_u64(&ts->write_pos); + + if (write_pos == 0) + PG_RETURN_VOID(); + + /* Read from oldest available to newest */ + { + uint64 ring_size = (uint64) ts->ring_mask + 1; + + read_start = (write_pos > ring_size) + ? write_pos - ring_size : 0; + } + + /* + * Mark the iteration in progress so wait_event_trace_release_slot + * defers any concurrent dsa_free of our own ring (see the comment on + * that function for the deferral protocol). PG_FINALLY clears the + * flag and performs any deferred free, even on ereport(ERROR). + */ + wait_event_trace_srf_in_progress = true; + PG_TRY(); + { + for (i = read_start; i < write_pos; i++) + { + WaitEventTraceRecord *rec = + &ts->records[i & ts->ring_mask]; + Datum values[6]; + bool nulls[6]; + const char *event_type; + const char *event_name; + uint32 seq_before; + uint32 seq_after; + uint8 rtype; + int64 timestamp_ns; + uint32 event_info; + int64 duration_ns; + int64 query_id; + + /* Seqlock read */ + seq_before = rec->seq; + pg_read_barrier(); /* acquire: payload loads below must not rise above this */ + + if (seq_before & 1) + continue; + + rtype = rec->record_type; + timestamp_ns = rec->timestamp_ns; + + if (rtype == TRACE_WAIT_EVENT) + { + event_info = rec->data.wait.event; + duration_ns = rec->data.wait.duration_ns; + query_id = 0; + } + else if (rtype == TRACE_QUERY_START || rtype == TRACE_QUERY_END || + rtype == TRACE_EXEC_START || rtype == TRACE_EXEC_END) + { + event_info = 0; + duration_ns = 0; + query_id = rec->data.query.query_id; + } + else + { + pg_read_barrier(); /* acquire: pair with seq_before read above before skipping */ + continue; + } + + pg_read_barrier(); 
/* acquire: payload loads must have landed before seq_after */ + seq_after = rec->seq; + + if (seq_before != seq_after) + continue; + + /* Skip empty wait events */ + if (rtype == TRACE_WAIT_EVENT && event_info == 0) + continue; + + if (rtype == TRACE_WAIT_EVENT) + { + event_type = pgstat_get_wait_event_type(event_info); + event_name = pgstat_get_wait_event(event_info); + } + else if (rtype == TRACE_QUERY_START) + { + event_type = "Query"; + event_name = "QueryStart"; + } + else if (rtype == TRACE_EXEC_START) + { + event_type = "Query"; + event_name = "ExecStart"; + } + else if (rtype == TRACE_EXEC_END) + { + event_type = "Query"; + event_name = "ExecEnd"; + } + else + { + event_type = "Query"; + event_name = "QueryEnd"; + } + + if (event_type == NULL || event_name == NULL) + continue; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int64GetDatum((int64) i); + values[1] = Int64GetDatum(timestamp_ns); + values[2] = CStringGetTextDatum(event_type); + values[3] = CStringGetTextDatum(event_name); + values[4] = Float8GetDatum((double) duration_ns / 1000.0); + values[5] = Int64GetDatum(query_id); + + tuplestore_putvalues(rsinfo->setResult, + rsinfo->setDesc, + values, nulls); + } + } + PG_FINALLY(); + { + wait_event_trace_srf_in_progress = false; + + /* + * If a GUC step-down fired during iteration, it deferred the + * dsa_free. Process it now that we're safely past the loop. + * Re-check release_pending under the same flag to handle the + * (impossible-today, possible-tomorrow) case of a nested SRF. + */ + if (wait_event_trace_release_pending) + { + wait_event_trace_release_pending = false; + if (my_trace_proc_number >= 0) + wait_event_trace_release_slot(my_trace_proc_number); + } + } + PG_END_TRY(); + + PG_RETURN_VOID(); +} + +/* + * One element of the local result buffer. Pairs a per-record copy + * with the original ring index (used as the seq output column). 
+ */ +typedef struct WetValidRecord +{ + uint64 ring_index; /* original index in the writer's ring */ + WaitEventTraceRecord rec; +} WetValidRecord; + +/* + * Snapshot the trace ring for a given procNumber and emit records into + * the SRF's tuplestore. Returns silently for FREE slots, out-of-range + * procnumbers, slots whose ring was never allocated, and slots whose + * write_pos is zero. + * + * Cross-backend reader protocol implemented here: + * + * 1. Read slot->state without the lock as a cheap "worth visiting" + * check; FREE -> nothing to emit. + * 2. Allocate the worst-case result buffer BEFORE taking the lock, + * so the palloc -- which can bottom out in a glibc mmap syscall + * for the ~5 MB worst-case size -- runs without holding the + * WaitEventTraceCtl lock. + * 3. Acquire WaitEventTraceCtl->lock in LW_SHARED. All slot + * transitions take LW_EXCLUSIVE, so the slot's identity, state, + * and ring_ptr are stable for the duration of the iteration. + * 4. Re-check state under the lock and resolve ring_ptr via + * dsa_get_address. Read write_pos. + * 5. Iterate every live ring index [read_start, write_pos). For + * each record do the per-record POSITION-ENCODED IDENTITY + * seqlock check ON SHARED MEMORY (see the comment on the loop + * below). + * 6. Release the lock. + * 7. Walk the local result array and emit rows into the tuplestore. + * This is the expensive part (potential disk spill); doing it + * after release minimises lock-hold time. + * + * Why per-record seqlock against shared memory, not against a local + * memcpy of the full ring: the protocol requires the two seq reads + * to go to the SAME shared-memory location at DIFFERENT TIMES, with + * the payload read between them. A bulk memcpy then seqlock-on- + * local-copy reads the same frozen byte twice, the check degenerates + * to a no-op, and torn / stale-cycle reads slip through. 
+ * + * Why position-encoded identity, not just parity: the writer encodes + * the ring position into the seq value (mid-write = pos*2+1, complete + * = pos*2+2). After RING_SIZE writes the slot wraps and is rewritten + * with a new numerically-distinct seq. A parity-only check accepts + * any stable even seq -- including the PREVIOUS cycle's seq if cross- + * process visibility puts the new write_pos ahead of the new seq + * update. See the loop body for the four failure modes the identity + * check rejects. + * + * Holding LW_SHARED throughout the iteration also makes the + * generation-counter retry unnecessary for this caller: slot + * transitions take LW_EXCLUSIVE and therefore cannot happen while we + * hold LW_SHARED. The generation counter is still part of the + * cross-backend reader contract on WaitEventTraceControl for external + * readers that follow a different lock-release pattern (e.g. an + * extension that wants to release the lock between batches of records + * and re-acquire), but this in-tree implementation does not release + * the lock mid-iteration. + * + * Both OWNED and ORPHANED slots are read uniformly. For OWNED the + * live owner is concurrently writing; the seqlock catches torn reads. + * For ORPHANED the records are immutable post-mortem so the check is + * essentially a pass-through (it still correctly skips at most one + * trailing odd-seq record if the owner died mid-write). + * + * Lock-hold is O(write_pos - read_start) shared-memory loads, at + * roughly the same wall-clock cost as a single 4 MB memcpy of the + * full ring (~1 ms on modern hardware), with no I/O and no syscalls. + */ +static void +emit_wait_event_trace_for_procnumber(int procNumber, ReturnSetInfo *rsinfo) +{ + WaitEventTraceSlot *slot; + WaitEventTraceState *ts; + WetValidRecord *valid_records = NULL; + uint64 valid_count = 0; + uint64 write_pos; + uint64 read_start; + uint64 i; + uint32 state_now; + + if (WaitEventTraceCtl == NULL) + return; + + /* + * Range check. 
Negative or out-of-range procnumbers return an + * empty result rather than ERRORing because the most natural use + * pattern for cross-backend readers is to iterate every possible + * slot index (a monitoring background worker doesn't know the + * exact NUM_WAIT_EVENT_TIMING_SLOTS at SQL level), and silent- + * empty for out-of-range matches the behaviour of sister functions + * like pg_stat_get_wait_event_timing(NULL) which iterate the + * shared array internally. FREE-but-in-range slots also return + * empty (see the state check below); the caller cannot + * distinguish out-of-range from FREE, which is fine. + */ + if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS) + return; + + slot = &WaitEventTraceCtl->trace_slots[procNumber]; + + /* + * If the trace DSA was never created (no backend in the cluster + * has ever set wait_event_capture = trace), every slot is still + * in its initial FREE state. Skip without taking the lock. + */ + if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID) + return; + + /* Unlocked fast-path check; the authoritative check is under the + * lock below. */ + if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_FREE) + return; + + wait_event_trace_ensure_dsa(); + if (trace_dsa == NULL) + return; + + /* + * Allocate the worst-case result buffer BEFORE taking the lock. + * The buffer is sized for the full ring (~5 MB at default + * RING_SIZE=128K); on a near-empty ring most of it goes unused, + * but that is preferable to holding the WaitEventTraceCtl lock + * during a palloc that may bottom out in a glibc mmap() syscall + * (allocations above the malloc-mmap threshold). Glibc's + * arena-internal mutex around the syscall would serialise every + * concurrent reader of this lock through one VMA-modifying + * kernel operation; sizing the alloc outside the lock keeps the + * lock-hold time bounded by the per-record loop alone. 
+ * + * After we acquire the lock we will either consume this buffer + * (writing up to (write_pos - read_start) entries) or release + * it unused on an early return. + */ + /* + * Worst-case size = ring size. Derive it from the GUC on first + * use in this backend; subsequent calls see the cached value. + * The GUC is PGC_POSTMASTER so the value is the same across + * every backend in this postmaster run and never changes. + */ + if (WaitEventTraceRingSize == 0) + WaitEventTraceRingSize = + (uint32) wait_event_trace_ring_size_kb * 1024U / + (uint32) sizeof(WaitEventTraceRecord); + valid_records = palloc(sizeof(WetValidRecord) * WaitEventTraceRingSize); + + LWLockAcquire(&WaitEventTraceCtl->lock, LW_SHARED); + + state_now = pg_atomic_read_u32(&slot->state); + if (state_now == WAIT_EVENT_TRACE_SLOT_FREE || + !DsaPointerIsValid(slot->ring_ptr)) + { + LWLockRelease(&WaitEventTraceCtl->lock); + pfree(valid_records); + return; + } + + ts = (WaitEventTraceState *) dsa_get_address(trace_dsa, slot->ring_ptr); + write_pos = pg_atomic_read_u64(&ts->write_pos); + + if (write_pos == 0) + { + LWLockRelease(&WaitEventTraceCtl->lock); + pfree(valid_records); + return; + } + + /* Live range: oldest available to newest. */ + { + uint64 ring_size = (uint64) ts->ring_mask + 1; + + read_start = (write_pos > ring_size) + ? write_pos - ring_size : 0; + } + + for (i = read_start; i < write_pos; i++) + { + WaitEventTraceRecord *rec_shared = + &ts->records[i & ts->ring_mask]; + WetValidRecord *out = &valid_records[valid_count]; + uint32 expected_seq; + uint32 seq_before; + uint32 seq_after; + + /* + * Position-encoded seqlock identity check (NOT just parity). + * + * The writer encodes the ring position into the seq value: + * mid-write -> (uint32)(pos * 2 + 1), complete -> + 2. After + * RING_SIZE writes the slot wraps and the same memory location + * gets a new seq value (next_pos * 2 + 2) that is numerically + * distinct from the previous cycle's seq. 
+ * + * A parity-only check (skip on odd seq, accept on stable even) + * is INSUFFICIENT for this layout in the cross-backend case: + * if the writer just incremented write_pos to pos+1 but + * cross-process cache coherence has not yet propagated the + * subsequent rec->seq = (pos*2+1) store, this reader at + * i = pos would see the previous cycle's complete-even seq + * (from logical position pos - RING_SIZE). Both seq_before + * and seq_after would read that stale even value, parity + * passes, identity-against-itself passes, and a record + * belonging to the PREVIOUS cycle gets emitted with the new + * ring_index = pos. Silent data corruption (wrong attribution, + * not torn bytes). + * + * The fix is identity against EXPECTED: a record is valid for + * iterator position i if and only if its seq equals + * (uint32)(i * 2 + 2) -- the writer's encoded "complete" value + * for that exact ring position. This rejects: + * + * * Stale prior cycle (seq < expected): writer hasn't yet + * advanced rec->seq for the current cycle. + * * Mid-write current cycle (seq == expected - 1, odd): + * writer is in the payload write window. + * * Ring wrapped past us (seq > expected): the writer + * completed a later cycle on this slot during our read. + * + * The uint32 wraparound at 2^31 cycles is safe: we use exact + * equality, and the writer's existing wrap-safety argument + * (sizeof(seq) > worst-case in-flight window by 11 orders of + * magnitude) covers the seq value. + */ + expected_seq = (uint32)(i * 2 + 2); + + seq_before = rec_shared->seq; + pg_read_barrier(); + + if (seq_before != expected_seq) + continue; + + out->rec = *rec_shared; /* one 32-byte structure copy */ + + pg_read_barrier(); + seq_after = rec_shared->seq; + + if (seq_after != expected_seq) + continue; + + out->ring_index = i; + valid_count++; + } + + LWLockRelease(&WaitEventTraceCtl->lock); + + /* + * Walk the local result array and emit rows. 
No shared-memory + * access from here on, so spills to disk by the tuplestore (if + * the result is large) do not hold any wait-event-timing lock. + */ + for (i = 0; i < valid_count; i++) + { + WetValidRecord *vr = &valid_records[i]; + WaitEventTraceRecord *rec = &vr->rec; + Datum values[6]; + bool nulls[6]; + const char *event_type; + const char *event_name; + uint8 rtype = rec->record_type; + uint32 event_info; + int64 duration_ns; + int64 query_id; + + if (rtype == TRACE_WAIT_EVENT) + { + event_info = rec->data.wait.event; + duration_ns = rec->data.wait.duration_ns; + query_id = 0; + + /* Skip empty wait events. */ + if (event_info == 0) + continue; + + event_type = pgstat_get_wait_event_type(event_info); + event_name = pgstat_get_wait_event(event_info); + } + else if (rtype == TRACE_QUERY_START) + { + event_info = 0; + duration_ns = 0; + query_id = rec->data.query.query_id; + event_type = "Query"; + event_name = "QueryStart"; + } + else if (rtype == TRACE_QUERY_END) + { + event_info = 0; + duration_ns = 0; + query_id = rec->data.query.query_id; + event_type = "Query"; + event_name = "QueryEnd"; + } + else if (rtype == TRACE_EXEC_START) + { + event_info = 0; + duration_ns = 0; + query_id = rec->data.query.query_id; + event_type = "Query"; + event_name = "ExecStart"; + } + else if (rtype == TRACE_EXEC_END) + { + event_info = 0; + duration_ns = 0; + query_id = rec->data.query.query_id; + event_type = "Query"; + event_name = "ExecEnd"; + } + else + { + /* Unrecognised record_type -- skip defensively. 
*/ + continue; + } + + if (event_type == NULL || event_name == NULL) + continue; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int64GetDatum((int64) vr->ring_index); + values[1] = Int64GetDatum(rec->timestamp_ns); + values[2] = CStringGetTextDatum(event_type); + values[3] = CStringGetTextDatum(event_name); + values[4] = Float8GetDatum((double) duration_ns / 1000.0); + values[5] = Int64GetDatum(query_id); + + tuplestore_putvalues(rsinfo->setResult, + rsinfo->setDesc, + values, nulls); + } + + pfree(valid_records); +} + +/* + * SQL function: pg_get_wait_event_trace(procnumber int4) + * + * Cross-backend trace ring reader. Returns the records from the trace + * ring belonging to the backend that currently or previously occupied + * the given procNumber slot. Reads OWNED and ORPHANED slots uniformly; + * FREE slots return an empty result. + * + * This SRF is the in-tree consumer of the orphan-preserved trace data: + * a backend that exited while wait_event_capture = trace leaves its + * ring allocated in DSA in ORPHANED state, and this function reads it + * until either a new backend takes over the same procNumber or the + * DBA calls pg_stat_clear_orphaned_wait_event_rings(). External + * extensions that need cross-backend access follow the same + * snapshot pattern documented on WaitEventTraceControl in + * wait_event_timing.h; this function serves as both the reference + * implementation and a DBA-facing diagnostic tool. + * + * Privileges: REVOKE'd from PUBLIC and GRANT'ed to pg_read_all_stats + * in system_views.sql, matching the privilege model of the session- + * local view pg_backend_wait_event_trace. + * + * The procnumber argument can be obtained from the procnumber column + * of pg_stat_get_wait_event_timing or pg_stat_get_wait_event_timing_ + * overflow. 
For pid-keyed access against live backends, callers can + * do: + * + * SELECT * FROM pg_get_wait_event_trace( + * (SELECT procnumber FROM pg_stat_get_wait_event_timing() + * WHERE pid = :pid LIMIT 1)); + * + * Note that pid-keyed access cannot read ORPHANED slots because a + * dying backend's pid is removed from procArray on exit; for + * post-mortem reading of short-lived backends (parallel workers, + * autovacuum, walsender) the procNumber must be captured before the + * backend exits, or discovered by iterating procnumbers in a + * monitoring background worker. + */ +Datum +pg_get_wait_event_trace(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int32 procNumber = PG_GETARG_INT32(0); + + InitMaterializedSRF(fcinfo, 0); + + emit_wait_event_trace_for_procnumber((int) procNumber, rsinfo); + + PG_RETURN_VOID(); +} + +/* + * Request a self-reset on the given backend slot. + * + * Lock-free: atomically bumps the slot's reset_generation, then sets the + * target's process latch so an idle backend wakes up and completes its + * current wait event (which triggers pgstat_report_wait_end_timing, which + * observes the generation change and performs the reset). If the target + * slot is currently unoccupied the SetLatch is a harmless no-op. + */ +static void +wait_event_timing_request_reset(int slot_idx) +{ + Assert(slot_idx >= 0 && slot_idx < NUM_WAIT_EVENT_TIMING_SLOTS); + + /* + * If no backend has ever enabled capture, the shared array does not + * exist yet -- there is nothing to reset. Attach read-only; callers + * ultimately want the target backend to observe a generation bump, + * so if the array isn't allocated we return early and skip both the + * bump and the latch set below (no live backend is tracking).
+ */ + if (!wait_event_timing_attach_array(false)) + return; + + pg_atomic_fetch_add_u32(&wet_slot(slot_idx)->reset_generation, 1); + + /* + * Wake the target if it is sleeping in WaitLatch/WaitEventSetWait so + * that it completes its current wait promptly and observes the reset + * request. The slot index is also the PGPROC array index + * (pgstat_set_wait_event_timing_storage is called with procNumber). + * + * Even if no live backend currently owns the slot, setting the latch + * on the stale PGPROC is harmless -- latches in shared memory are + * durable and no process is waiting on it. + */ + if (ProcGlobal != NULL && ProcGlobal->allProcs != NULL) + SetLatch(&ProcGlobal->allProcs[slot_idx].procLatch); +} + +/* + * SQL function: pg_stat_get_wait_event_timing_overflow() + * + * Exposes the per-backend truncation counters that are otherwise + * write-only: without these, a user has no way to tell from SQL whether + * their stats are complete or whether the hash table / flat array was + * saturated mid-session and silently dropped events. + * + * lwlock_overflow_count: number of LWLock wait events that could not + * be recorded because the per-backend LWLock timing hash + * (capped by wait_event_timing_max_tranches) was full. + * flat_overflow_count: number of non-LWLock wait events that + * resolved to an unknown / out-of-range class index and therefore + * could not be mapped to a histogram slot. + * reset_count: number of resets this backend has *observed + * and acted on*, NOT a request counter. Own-backend resets are + * synchronous and bump this once per call. Cross-backend resets + * coalesce: if multiple pg_stat_reset_wait_event_timing(target) + * calls land between two of the target's wait_ends, the target + * observes them as a single reset and reset_count increments + * only once. Callers polling for asynchronous-reset + * acknowledgment should watch for any increment (N -> N+1).
+ * + * One row per live backend; filtered by HAS_PGSTAT_PERMISSIONS like + * pg_stat_get_wait_event_timing(). The pid argument is optional with + * the same semantics as pg_stat_get_wait_event_timing(): NULL means + * all backends, a non-NULL value restricts the sweep to that single + * backend (silently empty for unknown PIDs). + */ +Datum +pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int start_idx; + int end_idx; + int backend_idx; + + InitMaterializedSRF(fcinfo, 0); + + if (!wait_event_timing_attach_array(false)) + PG_RETURN_VOID(); + + if (!wait_event_timing_pid_range(fcinfo, &start_idx, &end_idx)) + PG_RETURN_VOID(); + + for (backend_idx = start_idx; backend_idx < end_idx; backend_idx++) + { + WaitEventTimingState *state = wet_slot(backend_idx); + PgBackendStatus *beentry; + Datum values[6]; + bool nulls[6]; + + /* Skip slots with no backend status entry or that fail the caller's permission check. */ + beentry = pgstat_get_beentry_by_proc_number(backend_idx); + if (beentry == NULL) + continue; + if (!HAS_PGSTAT_PERMISSIONS(beentry->st_userid)) + continue; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(beentry->st_procpid); + values[1] = CStringGetTextDatum(GetBackendTypeDesc(beentry->st_backendType)); + values[2] = Int32GetDatum(backend_idx); + values[3] = Int64GetDatum(state->lwlock_overflow_count); + values[4] = Int64GetDatum(state->flat_overflow_count); + values[5] = Int64GetDatum(state->reset_count); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + PG_RETURN_VOID(); +} + +/* + * SQL function: pg_stat_reset_wait_event_timing(pid int4) + * + * Resets wait-event-timing counters for a single backend, identified by PID. + * + * NULL (or MyProcPid): reset caller's own session synchronously -- + * single writer, no lock needed. + * another PID: request a cross-backend reset (pg_signal_backend only). + * unknown / dead PID: silent no-op, matching pg_stat_reset_backend_stats.
+ * + * To reset every backend, use pg_stat_reset_wait_event_timing_all(). + * + * Cross-backend resets are asynchronous by design: the function atomically + * bumps the target slot's reset_generation counter and wakes the target's + * latch; the owning backend observes the change on its next wait_end and + * clears its own counters. This keeps the hot path lock-free and avoids + * the cross-writer races that plagued an earlier LWLock-based design. + * + * Visibility is near-immediate for active backends (their next event ends + * within microseconds) and is bounded by the target's wait duration for + * idle backends -- SetLatch shortens that by interrupting any current + * WaitLatch. The function returns before the reset has been observed; + * callers that need strict read-after-reset semantics should either + * target their own backend (where reset is synchronous) or poll the + * target's reset_count column in pg_stat_wait_event_timing_overflow + * until it increments. + */ +Datum +pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS) +{ + int target_pid; + PGPROC *proc; + int procNumber; + + if (PG_ARGISNULL(0) || PG_GETARG_INT32(0) == MyProcPid) + { + /* + * Reset own backend. Synchronous: no lock or atomic indirection + * needed. If capture has never been enabled in this backend yet, + * my_wait_event_timing is still NULL; nothing to reset. + * + * wait_start is already zero here -- pgstat_report_wait_end_timing + * zeros it at the end of every wait, and the backend cannot be mid- + * wait while it is executing this SQL function -- so there is no + * in-flight measurement to preserve. We zero current_event for the + * same hygiene reason as the cross-backend reset path above: keep + * external readers of the slot from seeing stale state between + * waits.
+ */ + if (my_wait_event_timing != NULL) + { + memset(my_wait_event_timing->events, 0, + sizeof(my_wait_event_timing->events)); + lwlock_timing_hash_clear(my_wait_event_timing); + my_wait_event_timing->reset_count++; + my_wait_event_timing->lwlock_overflow_count = 0; + my_wait_event_timing->flat_overflow_count = 0; + my_wait_event_timing->current_event = 0; + } + PG_RETURN_VOID(); + } + + /* + * Cross-backend reset requires pg_signal_backend membership, matching + * the privilege model of pg_stat_reset_backend_stats(int4 pid) (the + * closest existing per-backend reset in the wider stats family). + * + * Why pg_signal_backend rather than naked superuser(): + * + * 1) Operational alignment. The role pg_signal_backend exists + * specifically for "the operator who acts on other backends' + * state" -- it gates pg_terminate_backend, pg_cancel_backend, + * and pg_stat_reset_backend_stats already. Resetting another + * backend's wait-event timing is structurally the same kind of + * operation (per-PID, addressable, bounded blast radius), so it + * belongs to the same role. Demanding superuser would create a + * surplus-privilege gap: a DBA who can already TERMINATE the + * target backend (strictly more invasive than resetting its + * counters) would need to escalate to superuser just to wipe + * its stats, which is operationally backwards. + * + * 2) Cluster-wide reset is a different decision. See + * pg_stat_reset_wait_event_timing_all() below, which keeps the + * stricter superuser() gate -- different blast radius, different + * role. This split (per-backend = pg_signal_backend, cluster-wide + * = superuser) reflects the principle that the role required for + * an operation should match what the operation can affect. The + * fact that pg_stat_reset() (cluster-wide) actually only requires + * pg_read_all_stats today is an inconsistency in PG's existing + * surface; we deliberately do not extend that inconsistency here. + * + * NOTE(review): upstream documentation describes pg_stat_reset() + * and pg_stat_reset_backend_stats() as restricted to superusers by + * default (EXECUTE revoked from PUBLIC), not as gated by + * pg_read_all_stats or pg_signal_backend -- confirm the comparisons + * in points 1 and 2 before relying on them.
+ * + * 3) Evidence-destruction concern is bounded. The only + * "destructive" property of a stats reset is that it erases + * forensic evidence of past wait events. Anyone with + * pg_signal_backend can already terminate the target backend -- + * which terminates that forensic record by destroying the + * backend itself. A counter wipe is strictly less invasive. + */ + if (!has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to reset another backend's wait event timing"), + errdetail("Only roles with privileges of the \"pg_signal_backend\" role may reset another backend's wait event timing."))); + + target_pid = PG_GETARG_INT32(0); + + /* Look up the target. Try regular backends first, then aux. */ + proc = BackendPidGetProc(target_pid); + if (proc == NULL) + proc = AuxiliaryPidGetProc(target_pid); + + /* Unknown / dead PID: silent no-op, matching pg_stat_reset_backend_stats. */ + if (proc == NULL) + PG_RETURN_VOID(); + + procNumber = GetNumberFromPGProc(proc); + + /* Defensive: silently ignore a proc number outside the timing slot array. */ + if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS) + PG_RETURN_VOID(); + + wait_event_timing_request_reset(procNumber); + + PG_RETURN_VOID(); +} + +/* + * Reset wait-event-timing counters for every backend. Superuser only. + * + * Each slot is asked to self-reset on its next wait event (owner-cleared); + * see wait_event_timing_request_reset for the protocol. Returns before the + * resets have been observed -- callers that need strict read-after-reset + * semantics should poll the targets' reset_count columns. + * + * Privilege model rationale (intentional asymmetry with the per-backend + * variant pg_stat_reset_wait_event_timing(pid)): + * + * * Per-backend reset uses pg_signal_backend, matching + * pg_stat_reset_backend_stats(pid). The blast radius is one PID; + * anyone who can pg_terminate_backend the target can already + * destroy more forensic state than a counter wipe would.
+ * + * * Cluster-wide reset is gated tighter because the blast radius is + * every backend in the cluster. An operator with pg_signal_backend + * can disrupt one PID at a time (and must specify which); the + * cluster-wide reset wipes ALL backends' historical counters in a + * single call, which is meaningfully different in two ways: + * + * (a) it can hide cross-tenant patterns that a forensic audit + * would have wanted to compare across backends, and + * + * (b) it removes the per-call addressability that makes the + * per-backend variant auditable -- a log entry showing "user + * X reset PID Y" is more actionable than "user X wiped + * everything." + * + * Requiring superuser for the cluster-wide variant matches the + * general PG principle that scope of authority should match scope + * of effect. We deliberately do NOT mirror pg_stat_reset(), which + * today is gated only on pg_read_all_stats despite being similarly + * cluster-wide -- that's a pre-existing inconsistency in the wider + * stats family and not one we want to extend. + */ +Datum +pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS) +{ + /* + * NOTE(review): the header comment compares this gate against + * pg_stat_reset() being gated on pg_read_all_stats; upstream + * documentation describes pg_stat_reset() as restricted to + * superusers by default -- confirm that comparison. + */ + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a superuser to reset wait event timing for all backends"))); + + /* Request an owner-side reset on every slot; see wait_event_timing_request_reset. */ + for (int i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++) + wait_event_timing_request_reset(i); + + PG_RETURN_VOID(); +} + +/* + * SQL function: pg_stat_clear_orphaned_wait_event_rings() + * + * Free every trace ring whose owner has exited (slot state ORPHANED). + * Returns the number of rings released. + * + * Why this exists.
When a backend that had wait_event_capture = trace + * exits, we deliberately do NOT free its ~4 MB trace ring (see the + * lifecycle discussion on WaitEventTraceControl): the data must remain + * readable by cross-backend consumers -- the in-tree + * pg_get_wait_event_trace SRF and any extension following the + * snapshot pattern on WaitEventTraceControl -- and an exit-time + * dsa_free would defeat that. + * The reclaim instead happens lazily in two places: + * + * (a) wait_event_trace_clear_orphan_at_init(): when a new backend + * inherits the same procNumber slot at init, it frees the prior + * orphan as part of starting clean. This handles the common + * case (busy clusters with connection churn) automatically. + * + * (b) THIS FUNCTION: an explicit DBA-driven sweep that releases + * every currently orphaned ring at once. + * + * The pathological case that (a) does not handle is "capture briefly + * enabled, then disabled, on a cluster with long-lived pooled + * connections that never exit". In that scenario procNumbers do not + * recycle, so prior orphans persist until cluster restart unless the + * DBA calls this function. Worst-case bound is + * NUM_WAIT_EVENT_TIMING_SLOTS * sizeof(WaitEventTraceState) which, at + * the default 4 MB ring size, is ~400 MB at MaxBackends=100 and ~4 GB + * at MaxBackends=1000 -- bounded but worth a kill switch. + * + * Permissions: superuser-only, matching the cluster-wide reset + * (pg_stat_reset_wait_event_timing_all). This is a + * cluster-scope memory-reclamation operation: it can disrupt any + * concurrent cross-backend reader on any orphaned slot. The + * disruption is bounded (readers retry via the generation counter + * and at worst skip one read) but the operation is still + * cluster-wide, so the privilege model matches the reset variant + * with the same blast radius. + * + * The function is safe to call even when no orphans exist (returns + * 0) and even when capture is currently OFF (the slot array exists + * unconditionally; only the rings are lazy).
+ */ +Datum +pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS) +{ + int64 freed = 0; + int i; + + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a superuser to clear orphaned wait event " + "trace rings"))); + + if (WaitEventTraceCtl == NULL) + PG_RETURN_INT64(0); + + /* + * If no backend has ever enabled trace, the trace DSA was never + * created and there cannot be any ORPHANED slots: every slot is + * still in its initial FREE state. Nothing to do. + */ + if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID) + PG_RETURN_INT64(0); + + /* Attach to the trace DSA so dsa_free() can be called. */ + wait_event_trace_ensure_dsa(); + if (trace_dsa == NULL) + PG_RETURN_INT64(0); + + /* + * Walk every slot, taking and releasing WaitEventTraceCtl->lock per + * slot rather than holding it across the entire sweep. + * + * Rationale: at MaxBackends = 1000 with a fully-orphaned cluster + * the per-slot work (atomic state read + dsa_free + ring_ptr + * clear + atomic state write) totals a few microseconds; holding + * the lock across all slots would yield a millisecond-scale + * lock-hold window during which every concurrent backend startup + * (the lazy wait_event_trace_clear_orphan_at_init path), every + * cross-backend reader (pg_get_wait_event_trace and the external + * snapshot pattern), and every capture step-down or restore + * would stall. PG's general convention is to keep LWLock-held + * windows in paths that compete with regular activity well under + * 100 microseconds; per-slot release/reacquire gives us a worst- + * case lock-hold of one slot's worth of work regardless of how + * many orphans exist cluster-wide. + * + * An unlocked fast-path read of slot->state skips non-ORPHANED + * slots without an LWLockAcquire/Release pair.
This is safe: if + * a slot races from non-ORPHANED to ORPHANED after we read it, + * we miss that orphan -- but the function is documented as a + * snapshot sweep, the missed orphan can be cleared by a + * subsequent call, and the same race exists for orphans that + * appear after the loop ends. The authoritative re-check under + * the lock prevents racing on the dsa_free direction (we never + * free a slot whose owner became OWNED again). + * + * CHECK_FOR_INTERRUPTS at the top of the loop body lets the + * caller cancel a long sweep; holding one lock across the whole + * sweep would instead defer all cancellation until release, + * because LWLockAcquire raises InterruptHoldoffCount. + */ + for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++) + { + WaitEventTraceSlot *slot = &WaitEventTraceCtl->trace_slots[i]; + + CHECK_FOR_INTERRUPTS(); + + /* Unlocked fast-path: skip non-ORPHANED slots cheaply. */ + if (pg_atomic_read_u32(&slot->state) != WAIT_EVENT_TRACE_SLOT_ORPHANED) + continue; + + LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE); + + /* + * Authoritative re-check under the lock. A concurrent + * clear_orphan_at_init may have already freed this slot. + * + * NOTE(review): generation is bumped before dsa_free(); + * presumably this is what lets generation-checking readers + * notice the reclaim -- confirm against the reader protocol.
+ */ + if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_ORPHANED && + DsaPointerIsValid(slot->ring_ptr)) + { + pg_atomic_fetch_add_u64(&slot->generation, 1); + dsa_free(trace_dsa, slot->ring_ptr); + slot->ring_ptr = InvalidDsaPointer; + pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE); + freed++; + } + + LWLockRelease(&WaitEventTraceCtl->lock); + } + + PG_RETURN_INT64(freed); +} + +#endif /* USE_WAIT_EVENT_TIMING */ diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 2460e550f96e2..d1dceda12df7b 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -70,6 +70,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/timeout.h" +#include "utils/wait_event_timing.h" /* has this backend called EmitConnectionWarnings()? */ static bool ConnectionWarningsEmitted; @@ -1244,6 +1245,12 @@ InitPostgres(const char *in_dbname, Oid dboid, /* Process pg_db_role_setting options */ process_settings(MyDatabaseId, GetSessionUserId()); +#ifdef USE_WAIT_EVENT_TIMING + /* Attach trace ring if wait_event_capture = trace was set via config/db/role settings */ + if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && my_trace_proc_number >= 0) + wait_event_trace_attach(my_trace_proc_number); +#endif + /* Apply PostAuthDelay as soon as we've read all options */ if (PostAuthDelay > 0) pg_usleep(PostAuthDelay * 1000000L); diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index afaa058b046c9..a1cf02c6ce91c 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -3425,6 +3425,35 @@ boot_val => 'true', }, +{ name => 'wait_event_capture', type => 'enum', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE', + short_desc => 'Controls collection of per-wait-event timing statistics and (optionally) per-session event tracing.', + variable => 'wait_event_capture', + boot_val =>
'WAIT_EVENT_CAPTURE_OFF', + options => 'wait_event_capture_options', + check_hook => 'check_wait_event_capture', + assign_hook => 'assign_wait_event_capture', +}, + +{ name => 'wait_event_timing_max_tranches', type => 'int', context => 'PGC_POSTMASTER', group => 'STATS_CUMULATIVE', + short_desc => 'Sets the maximum number of distinct LWLock tranches whose timing is recorded per backend.', + long_desc => 'Each backend\'s wait-event-timing hash table can hold this many distinct LWLock tranches; subsequent tranches are counted against lwlock_overflow_count and not individually timed. Sized at server start; raise this if your installation loads many extensions that register their own LWLock tranches and you observe non-zero lwlock_overflow_count in pg_stat_wait_event_timing_overflow.', + variable => 'wait_event_timing_max_tranches', + boot_val => '192', + min => '16', + max => '65534', +}, + +{ name => 'wait_event_trace_ring_size_kb', type => 'int', context => 'PGC_POSTMASTER', group => 'STATS_CUMULATIVE', + short_desc => 'Per-backend wait-event-trace ring buffer size, in kilobytes.', + long_desc => 'Each backend that enables wait_event_capture = trace allocates a ring buffer of this size from a cluster-wide DSA. The value must be a power of two and is sized at server start. Larger rings retain longer histories before wrapping; smaller rings reduce per-backend memory at high max_connections. 
Worst-case total memory is approximately max_connections times this value.', + flags => 'GUC_UNIT_KB', + variable => 'wait_event_trace_ring_size_kb', + boot_val => '4096', + min => '8', + max => '32768', + check_hook => 'check_wait_event_trace_ring_size_kb', +}, + { name => 'wal_block_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', short_desc => 'Shows the block size in the write ahead log.', flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 290ccbc543e25..25a3b139523e8 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -103,6 +103,7 @@ #include "utils/plancache.h" #include "utils/ps_status.h" #include "utils/rls.h" +#include "utils/wait_event_timing.h" #include "utils/xml.h" #ifdef TRACE_SYNCSCAN diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index ac38cddaaf9a6..e854ad329a375 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -700,6 +700,10 @@ #track_cost_delay_timing = off #track_io_timing = off #track_wal_io_timing = off +#wait_event_capture = off # off, stats, trace +#wait_event_timing_max_tranches = 192 # (change requires restart) +#wait_event_trace_ring_size_kb = 4096 # (change requires restart) + # must be power of two, 8 .. 
32768 #track_functions = none # none, pl, all #stats_fetch_consistency = cache # cache, none, snapshot diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index a1416260abcbf..32ba1d391e146 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -57,6 +57,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202605131 +#define CATALOG_VERSION_NO 202605151 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index be157a5fbe90c..1bb610167cb6b 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12693,4 +12693,64 @@ proname => 'hashoid8extended', prorettype => 'int8', proargtypes => 'oid8 int8', prosrc => 'hashoid8extended' }, +{ oid => '9956', + descr => 'statistics: per-backend wait event timing (count, duration, histogram)', + proname => 'pg_stat_get_wait_event_timing', prorows => '1000', + proisstrict => 'f', proretset => 't', provolatile => 's', proparallel => 'r', + prorettype => 'record', proargtypes => 'int4', + proallargtypes => '{int4,int4,text,int4,text,text,int8,float8,float8,float8,_int8}', + proargmodes => '{i,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{pid,pid,backend_type,procnumber,wait_event_type,wait_event,calls,total_time_ms,avg_time_us,max_time_us,histogram}', + prosrc => 'pg_stat_get_wait_event_timing' }, + + +{ oid => '9957', + descr => 'current backend wait event trace ring buffer', + proname => 'pg_get_backend_wait_event_trace', prorows => '1000', + proretset => 't', provolatile => 's', proparallel => 'r', + prorettype => 'record', proargtypes => '', + proallargtypes => '{int8,int8,text,text,float8,int8}', + proargmodes => '{o,o,o,o,o,o}', + proargnames => '{seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}', + prosrc => 'pg_get_backend_wait_event_trace' }, + +{ oid => '9958', + descr => 'statistics: reset wait event timing counters for the given backend (NULL = own)', + proname => 
'pg_stat_reset_wait_event_timing', proisstrict => 'f', + provolatile => 'v', prorettype => 'void', proargtypes => 'int4', + proargnames => '{pid}', + prosrc => 'pg_stat_reset_wait_event_timing' }, + +{ oid => '9959', + descr => 'statistics: per-backend wait event timing overflow counters (rows lost to LWLock hash / flat array overflow)', + proname => 'pg_stat_get_wait_event_timing_overflow', prorows => '1000', + proisstrict => 'f', proretset => 't', provolatile => 's', proparallel => 'r', + prorettype => 'record', proargtypes => 'int4', + proallargtypes => '{int4,int4,text,int4,int8,int8,int8}', + proargmodes => '{i,o,o,o,o,o,o}', + proargnames => '{pid,pid,backend_type,procnumber,lwlock_overflow_count,flat_overflow_count,reset_count}', + prosrc => 'pg_stat_get_wait_event_timing_overflow' }, + +{ oid => '9960', + descr => 'statistics: reset wait event timing counters for all backends (superuser only)', + proname => 'pg_stat_reset_wait_event_timing_all', + provolatile => 'v', prorettype => 'void', proargtypes => '', + prosrc => 'pg_stat_reset_wait_event_timing_all' }, + +{ oid => '9961', + descr => 'statistics: free wait-event-trace rings whose owner backend has exited (superuser only); returns count freed', + proname => 'pg_stat_clear_orphaned_wait_event_rings', + provolatile => 'v', prorettype => 'int8', proargtypes => '', + prosrc => 'pg_stat_clear_orphaned_wait_event_rings' }, + +{ oid => '9962', + descr => 'wait event trace ring for the given procnumber slot (OWNED or ORPHANED)', + proname => 'pg_get_wait_event_trace', prorows => '1000', + proretset => 't', provolatile => 'v', proparallel => 'r', + prorettype => 'record', proargtypes => 'int4', + proallargtypes => '{int4,int8,int8,text,text,float8,int8}', + proargmodes => '{i,o,o,o,o,o,o}', + proargnames => '{procnumber,seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}', + prosrc => 'pg_get_wait_event_trace' }, + ] diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 
4f8113c144b0c..ed0b7f26f9f5a 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -762,6 +762,9 @@ /* Define to select unnamed POSIX semaphores. */ #undef USE_UNNAMED_POSIX_SEMAPHORES +/* Define to 1 to build with wait event timing. (--enable-wait-event-timing) */ +#undef USE_WAIT_EVENT_TIMING + /* Define to select Win32-style semaphores. */ #undef USE_WIN32_SEMAPHORES diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index d7eb648bd2758..26ccc0cf486f9 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -140,3 +140,5 @@ PG_LWLOCKTRANCHE(XACT_SLRU, XactSLRU) PG_LWLOCKTRANCHE(PARALLEL_VACUUM_DSA, ParallelVacuumDSA) PG_LWLOCKTRANCHE(AIO_URING_COMPLETION, AioUringCompletion) PG_LWLOCKTRANCHE(SHMEM_INDEX, ShmemIndex) +PG_LWLOCKTRANCHE(WAIT_EVENT_TRACE_DSA, WaitEventTraceDSA) +PG_LWLOCKTRANCHE(WAIT_EVENT_TIMING_DSA, WaitEventTimingDSA) diff --git a/src/include/storage/subsystemlist.h b/src/include/storage/subsystemlist.h index 9ad619080be22..90b142354644c 100644 --- a/src/include/storage/subsystemlist.h +++ b/src/include/storage/subsystemlist.h @@ -79,6 +79,8 @@ PG_SHMEM_SUBSYSTEM(SyncScanShmemCallbacks) PG_SHMEM_SUBSYSTEM(AsyncShmemCallbacks) PG_SHMEM_SUBSYSTEM(StatsShmemCallbacks) PG_SHMEM_SUBSYSTEM(WaitEventCustomShmemCallbacks) +PG_SHMEM_SUBSYSTEM(WaitEventTimingShmemCallbacks) +PG_SHMEM_SUBSYSTEM(WaitEventTraceControlShmemCallbacks) #ifdef USE_INJECTION_POINTS PG_SHMEM_SUBSYSTEM(InjectionPointShmemCallbacks) #endif diff --git a/src/include/utils/.gitignore b/src/include/utils/.gitignore index ff6f61cd7ee7b..8a489b7769b16 100644 --- a/src/include/utils/.gitignore +++ b/src/include/utils/.gitignore @@ -6,4 +6,5 @@ /header-stamp /pgstat_wait_event.c /wait_event_funcs_data.c +/wait_event_timing_data.h /wait_event_types.h diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index dc406d6651aa2..309d5e87967a0 100644 --- a/src/include/utils/guc.h +++ 
b/src/include/utils/guc.h @@ -346,6 +346,7 @@ extern PGDLLIMPORT const struct config_enum_entry dynamic_shared_memory_options[ extern PGDLLIMPORT const struct config_enum_entry io_method_options[]; extern PGDLLIMPORT const struct config_enum_entry recovery_target_action_options[]; extern PGDLLIMPORT const struct config_enum_entry server_message_level_options[]; +extern PGDLLIMPORT const struct config_enum_entry wait_event_capture_options[]; extern PGDLLIMPORT const struct config_enum_entry wal_level_options[]; extern PGDLLIMPORT const struct config_enum_entry wal_sync_method_options[]; diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index 307f4fbaefe08..0cd528ecfb3f7 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -172,6 +172,9 @@ extern bool check_transaction_isolation(int *newval, void **extra, GucSource sou extern bool check_transaction_read_only(bool *newval, void **extra, GucSource source); extern void assign_transaction_timeout(int newval, void *extra); extern const char *show_unix_socket_permissions(void); +extern bool check_wait_event_capture(int *newval, void **extra, GucSource source); +extern void assign_wait_event_capture(int newval, void *extra); +extern bool check_wait_event_trace_ring_size_kb(int *newval, void **extra, GucSource source); extern bool check_wal_buffers(int *newval, void **extra, GucSource source); extern bool check_wal_consistency_checking(char **newval, void **extra, GucSource source); diff --git a/src/include/utils/meson.build b/src/include/utils/meson.build index fd3a2352df5d4..ef8b2dc261811 100644 --- a/src/include/utils/meson.build +++ b/src/include/utils/meson.build @@ -1,6 +1,6 @@ # Copyright (c) 2022-2026, PostgreSQL Global Development Group -wait_event_output = ['wait_event_types.h', 'pgstat_wait_event.c', 'wait_event_funcs_data.c'] +wait_event_output = ['wait_event_types.h', 'pgstat_wait_event.c', 'wait_event_funcs_data.c', 'wait_event_timing_data.h'] 
wait_event_target = custom_target('wait_event_names', input: files('../../backend/utils/activity/wait_event_names.txt'), output: wait_event_output, @@ -11,7 +11,7 @@ wait_event_target = custom_target('wait_event_names', ], build_by_default: true, install: true, - install_dir: [dir_include_server / 'utils', false, false], + install_dir: [dir_include_server / 'utils', false, false, false], ) wait_event_types_h = wait_event_target[0] diff --git a/src/include/utils/wait_classes.h b/src/include/utils/wait_classes.h index b91690a22c63b..c6c692a1e9391 100644 --- a/src/include/utils/wait_classes.h +++ b/src/include/utils/wait_classes.h @@ -26,4 +26,13 @@ #define PG_WAIT_IO 0x0A000000U #define PG_WAIT_INJECTIONPOINT 0x0B000000U +/* + * Bit-layout masks for wait_event_info. The high byte encodes the + * class (one of the PG_WAIT_* constants above); the low 16 bits + * encode the per-class event id; the middle byte is currently + * reserved (see pgstat_report_wait_start in wait_event.h). + */ +#define WAIT_EVENT_CLASS_MASK 0xFF000000U +#define WAIT_EVENT_ID_MASK 0x0000FFFFU + #endif /* WAIT_CLASSES_H */ diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 86ee348220d7f..0ea5066027d19 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -13,6 +13,10 @@ /* enums for wait events */ #include "utils/wait_event_types.h" +#ifdef USE_WAIT_EVENT_TIMING +#include "utils/wait_event_timing.h" +#endif + extern const char *pgstat_get_wait_event(uint32 wait_event_info); extern const char *pgstat_get_wait_event_type(uint32 wait_event_info); static inline void pgstat_report_wait_start(uint32 wait_event_info); @@ -22,6 +26,11 @@ extern void pgstat_reset_wait_event_storage(void); extern PGDLLIMPORT uint32 *my_wait_event_info; +#ifdef USE_WAIT_EVENT_TIMING +extern void pgstat_report_wait_start_timing(uint32 wait_event_info); +extern void pgstat_report_wait_end_timing(int capture_level); +#endif + /* * Wait Events - Extension, 
InjectionPoint @@ -61,6 +70,9 @@ extern char **GetWaitEventCustomNames(uint32 classId, int *nwaitevents); * * my_wait_event_info initially points to local memory, making it safe to * call this before MyProc has been initialized. + * + * When compiled with --enable-wait-event-timing, also records the start + * timestamp for later duration computation in pgstat_report_wait_end(). * ---------- */ static inline void @@ -71,17 +83,53 @@ pgstat_report_wait_start(uint32 wait_event_info) * four-bytes, updates are atomic. */ *(volatile uint32 *) my_wait_event_info = wait_event_info; + +#ifdef USE_WAIT_EVENT_TIMING + /* + * Minimal inline gate: one global load + predicted-not-taken branch. + * Keeping the gate body out-of-line in pgstat_report_wait_start_timing() + * shrinks the inlined call sites and limits the codegen impact on host + * functions (LWLockAcquire, XLogInsert, etc.) to a few bytes each. + * + * unlikely(): wait_event_capture defaults to OFF and is OFF on the + * vast majority of installations. The annotation steers the compiler + * to lay out the no-op fall-through as the straight-line hot path. + */ + if (unlikely(wait_event_capture != WAIT_EVENT_CAPTURE_OFF)) + pgstat_report_wait_start_timing(wait_event_info); +#endif } /* ---------- * pgstat_report_wait_end() - * * Called to report end of a wait. + * + * When compiled with --enable-wait-event-timing and the GUC is enabled, + * calls the out-of-line pgstat_report_wait_end_timing() to compute the + * wait duration and accumulate statistics. The body is kept out-of-line + * to reduce I-cache pressure at the many call sites. * ---------- */ static inline void pgstat_report_wait_end(void) { +#ifdef USE_WAIT_EVENT_TIMING + /* + * Minimal inline gate. See pgstat_report_wait_start() for the + * unlikely() rationale. 
The load of wait_event_capture is reused + * as the argument to pgstat_report_wait_end_timing(), so the + * out-of-line body does not have to re-load it across the call + * boundary (CSE doesn't cross function calls). + */ + { + int capture_level = wait_event_capture; + + if (unlikely(capture_level != WAIT_EVENT_CAPTURE_OFF)) + pgstat_report_wait_end_timing(capture_level); + } +#endif + /* see pgstat_report_wait_start() */ *(volatile uint32 *) my_wait_event_info = 0; } diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h new file mode 100644 index 0000000000000..77563be29d2b2 --- /dev/null +++ b/src/include/utils/wait_event_timing.h @@ -0,0 +1,742 @@ +/*------------------------------------------------------------------------- + * + * wait_event_timing.h + * Per-backend wait event timing and histogram infrastructure. + * + * When enabled via the wait_event_capture GUC, every call to + * pgstat_report_wait_start()/pgstat_report_wait_end() records the wait + * duration and accumulates per-event statistics (count, total time, + * histogram) in shared memory. + * + * The overhead is two clock_gettime(CLOCK_MONOTONIC) calls per wait event + * transition (~40-100 ns via VDSO), plus a few memory writes to per-backend + * arrays. No locking is needed because each backend writes only to its own + * stats slot. + * + * Statistics are exposed via the pg_stat_wait_event_timing view + * and pg_stat_get_wait_event_timing() SQL function. + * + * Copyright (c) 2026, PostgreSQL Global Development Group + * + * src/include/utils/wait_event_timing.h + *------------------------------------------------------------------------- + */ +#ifndef WAIT_EVENT_TIMING_H +#define WAIT_EVENT_TIMING_H + +#include "port/atomics.h" +#include "port/pg_bitutils.h" +#include "portability/instr_time.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "utils/dsa.h" +#include "utils/wait_event_types.h" + +/* + * Number of log2 histogram buckets.
Bin edges are powers of two on the + * nanosecond axis: bucket i covers [2^(i+9), 2^(i+10)) ns, except bucket + * 0 which covers [0, 1024) ns and the last bucket which covers + * [2^(NBUCKETS+8), infinity) ns. These boundaries approximate the + * decimal-microsecond grid (1024 ≈ 1 us, 2048 ≈ 2 us, ...), which lets + * wait_event_timing_bucket() avoid a /1000 on the hot path. + * + * 32 buckets cover from <1us through ~512s-1024s, with the last + * bucket open-ended at 2^40 ns ≈ 1099 s ≈ ~18 minutes. Sample edges: + * + * bucket 0: [0, 1024) ns <1us + * bucket 1: [1024, 2048) ns 1-2us + * bucket 14: [2^23, 2^24) ns 8-16ms + * bucket 23: [2^32, 2^33) ns 4-8s + * bucket 30: [2^39, 2^40) ns 512s-1024s + * bucket 31: [2^40, inf) ns >=1024s (overflow) + * + * Why 32 (and not 16, the original): + * + * The original 16 buckets capped at 16ms in the last open-ended + * bucket. In real production workloads the long tail routinely + * extends well past 16ms -- HDD seek-and-queue, cloud-EBS noisy- + * neighbour spikes, lock-contention waits during table-level + * conflict, vacuum waits, replication apply waits, all commonly + * land in the 50ms-to-multi-second range. Collapsing all of those + * into a single overflow bucket made the histogram much less useful + * for the diagnostic case it primarily exists to serve: P99 / tail + * analysis is precisely where wait-event timing pays for itself, + * and that signal lives in the long tail. + * + * Doubling to 32 buckets pushes the open-ended overflow out to + * ~18 minutes (2^40 ns). Anything beyond that genuinely belongs in + * EXPLAIN / auto_explain or pg_stat_activity rather than a wait- + * event distribution: a single wait of more than ~18 minutes is a + * query-shape or stuck-process problem, not a histogram-bucket + * problem. The 32-bucket layout therefore covers the entire + * useful diagnostic range without leaving the long tail in an + * overflow bucket the operator cannot reason about.
+ * + * Cost: 16 extra int8 slots per WaitEventTimingEntry, increasing + * the per-entry size from 152 to 280 bytes (each int8 = 8 bytes). + * At default 192-tranche cap that adds ~24 KB to the per-backend + * lwlock_events array, plus ~32 KB to the per-backend events array + * (~250 distinct events), so ~56 KB more per backend -- about + * double the previous baseline, still bounded. The hot-path cost + * is unchanged: histogram[bucket]++ is the same single store + * regardless of array length, and the bucket index computation + * (pg_leftmost_one_pos64 - 9) doesn't depend on the array size. + * + * ABI note: pg_proc.dat declares pg_stat_get_wait_event_timing's + * histogram return type as _int8 (variable-length int8 array). The + * array is constructed at SRF emit time via construct_array_builtin + * sized by this constant, so changing the constant changes the + * row-payload length but not the catalog row type. External + * consumers that addressed buckets by absolute index (e.g. + * "histogram[15] is the overflow bucket") need to be updated; + * consumers that join against pg_wait_event_timing_histogram_buckets + * (the canonical name-and-edge table) continue to work transparently + * because that view is also extended to 32 rows in lockstep. + */ +#define WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS 32 + +/* + * Compact per-class mapping for the flat events[] array. + * + * WAIT_EVENT_TIMING_RAW_CLASSES, WAIT_EVENT_TIMING_DENSE_CLASSES, and + * WAIT_EVENT_TIMING_NUM_EVENTS are generated into wait_event_types.h by + * generate-wait_event_types.pl from wait_event_names.txt. + * + * The mapping arrays (wait_event_class_dense, wait_event_class_nevents, + * wait_event_class_offset, wait_event_dense_to_classid) and internal + * helper functions are in wait_event_timing.c (included from the + * generated wait_event_timing_data.h). 
+ */ + +/* Sentinel returned by wait_event_timing_index() for LWLock events */ +#define WAIT_EVENT_TIMING_IDX_LWLOCK (-2) + +/* + * Per-event accumulated statistics. One entry per distinct wait event + * per backend. These are written only by the owning backend, so no + * locking is needed. External readers may see torn reads for 64-bit + * fields on 32-bit platforms, but that is acceptable for statistics. + */ +typedef struct WaitEventTimingEntry +{ + int64 count; /* number of occurrences */ + int64 total_ns; /* total wait duration in nanoseconds */ + int64 max_ns; /* longest single wait in nanoseconds */ + int64 histogram[WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS]; +} WaitEventTimingEntry; + +/* + * LWLock-specific open-addressing hash table for unbounded tranche IDs. + * Per-backend, written only by the owning backend -- no locking needed. + * Tranche IDs are dynamically allocated by LWLockNewTrancheId() starting + * at LWTRANCHE_FIRST_USER_DEFINED (~88) with no upper bound. The hash + * maps tranche_id -> dense index into lwlock_events[]. + */ +/* + * Hash slot count vs. entry cap. + * + * The cap on distinct LWLock tranches per backend (and the slot count + * of the open-addressing hash that resolves them) is configured at + * server start by the GUC wait_event_timing_max_tranches. Default 192 + * matches real-world ceilings on deployments without many custom + * extensions; raise it for installations that load many extensions + * which register their own LWLock tranches. See guc_parameters.dat. + * + * The slot count is derived as the next power of two of (2 × + * max_tranches), giving a load factor of at most 50% (typically ~37% + * because the next-pow2 jump usually overshoots). Linear probing gets + * expensive fast above 50% load (avg ~8.5 probes on miss at 75%, ~1.8 + * at 37.5%), and this table sits inside the single-writer hot path in + * pgstat_report_wait_end_timing, so probe length matters.
The slot- + * table memory cost is small relative to the entry array (4 bytes per + * slot vs. ~280 bytes per entry). + * + * Both the slot table (entries[]) and the dense events array + * (lwlock_events[]) are sized at allocation time and stored in the + * per-backend DSA region following the WaitEventTimingState header + * for that backend; see the layout description there. The + * LWLockTimingHash struct below holds only the immutable size metadata + * and the runtime num_used counter -- the arrays themselves are not + * struct members because their length is runtime-determined. + */ + +/* + * Sentinel marking an empty hash slot. We deliberately reserve the + * upper end of the uint16 range (0xFFFF) instead of 0 so that any + * legal LWLock tranche ID -- including the currently-unused tranche 0 + * (lwlocklist.h: "0 is available; was formerly BufFreelistLock") -- + * can be stored and matched correctly. Keeping the sentinel decoupled + * from the LWLock numbering makes this hash table robust to future + * changes in lwlocklist.h. + */ +#define LWLOCK_TIMING_EMPTY_SLOT ((uint16) 0xFFFF) + +typedef struct LWLockTimingHashEntry +{ + uint16 tranche_id; /* LWLOCK_TIMING_EMPTY_SLOT (0xFFFF) + * marks an unoccupied slot. Real + * tranche IDs are uint16 and use the + * remaining range. */ + uint16 dense_idx; /* index into lwlock_events[] */ +} LWLockTimingHashEntry; + +/* + * Header-only struct. The actual hash slot array and dense events + * array live in the per-backend DSA region immediately after the + * WaitEventTimingState (in that order); their addresses are recovered + * via wait_event_timing_lwlock_entries() / _lwlock_events() helpers + * defined in wait_event_timing.c.
+ */ +typedef struct LWLockTimingHash +{ + int num_used; /* count of occupied entries */ + int hash_size; /* size of slot table (power of 2); + * immutable after allocation */ + int max_entries; /* cap on distinct tranches; immutable + * after allocation, == GUC value at + * postmaster start */ +} LWLockTimingHash; + +/* Declaration of the GUC (see guc_parameters.dat). */ +extern PGDLLIMPORT int wait_event_timing_max_tranches; + +/* + * Per-backend wait event timing state. Allocated in shared memory, + * one per MaxBackends + NUM_AUXILIARY_PROCS slot. + * + * Synchronization: each slot is written exclusively by its owning backend. + * Cross-backend readers (pg_stat_get_wait_event_timing) are lock-free and + * tolerate torn reads of 64-bit fields on 32-bit platforms (acceptable for + * statistics). Cross-backend reset is request-based: the caller atomically + * bumps reset_generation, and the owning backend observes the change on + * its next wait_end and performs the reset itself. This keeps the hot + * path lock-free while guaranteeing atomic, race-free resets. + * + * DSA layout: each backend's slot is laid out as + * + * [ WaitEventTimingState header ] + * [ LWLockTimingHashEntry[hash_size] ] + * [ WaitEventTimingEntry[max_entries] <- lwlock_events[] ] + * + * where hash_size and max_entries are runtime-derived from the GUC + * wait_event_timing_max_tranches and recorded in the + * WaitEventTimingState->lwlock_hash header. Slots are laid out + * contiguously in the shared array using a runtime stride + * (wait_event_timing_per_backend_stride in wait_event_timing.c) rather + * than the C array-indexing operator [], because per-backend size is + * determined at server start. + */ +typedef struct WaitEventTimingState +{ + /* + * Generation counter for cross-backend reset requests. Incremented + * atomically by pg_stat_reset_wait_event_timing(target). 
The owning + * backend tracks a local last-observed value; when it differs from the + * shared value, the owner performs the reset before the next event + * accumulation. Pure request-response: no locks needed on any path. + */ + pg_atomic_uint32 reset_generation; + + /* Current wait start timestamp (set by pgstat_report_wait_start) */ + instr_time wait_start; + + /* Current wait_event_info (cached for use in wait_end) */ + uint32 current_event; + + /* + * Counter of resets that have been *observed and acted on* by this + * backend. Own-backend resets (pg_stat_reset_wait_event_timing(NULL) + * or own-pid) are synchronous and bump this once per call. + * Cross-backend resets COALESCE: if multiple resets are requested + * for this backend between two of its wait_ends, the owner observes + * them as one and bumps reset_count once. Callers polling for "did + * my async reset land?" should rely on the N -> N+1 transition; + * do not use this column as a request counter. + */ + int64 reset_count; + + /* Per-event statistics: flat array for bounded classes */ + WaitEventTimingEntry events[WAIT_EVENT_TIMING_NUM_EVENTS]; + + /* Per-event statistics: hash table for LWLock class (unbounded IDs) */ + LWLockTimingHash lwlock_hash; + + /* Count of LWLock events dropped because the LWLock-timing hash + * table reached its cap (the GUC wait_event_timing_max_tranches). */ + int64 lwlock_overflow_count; + + /* Count of flat array events dropped due to eventId exceeding slot count */ + int64 flat_overflow_count; +} WaitEventTimingState; + + +/* + * Per-session wait event trace ring buffer (10046-style). + * When wait_event_capture is set to trace for a session, every wait_end writes + * a record to a per-backend ring buffer. External tools read the buffer + * via pg_get_backend_wait_event_trace(). + * + * Query attribution is done by scanning the ring at read time: QUERY_START + * and QUERY_END markers delimit which wait events belong to which query_id.
+ * This eliminates the previous per-backend shared-memory hash table. + * + * The ring buffer is allocated lazily via DSA (Dynamic Shared Memory Areas) + * on first use. Only backends that enable wait_event_trace pay the + * per-ring memory cost. A small control struct in fixed shmem holds + * per-backend DSA pointers. + * + * The ring size is configurable via the wait_event_trace_ring_size_kb + * GUC (PGC_POSTMASTER, default 4096 KB = 4 MB = 131072 records of 32 + * bytes each). The size is fixed cluster-wide at server start, so all + * rings in a given postmaster run have the same dimensions; each ring + * still caches its mask in the WaitEventTraceState header (next to + * write_pos) so the hot-path index computation is a single + * cache-warm load. + * + * The size MUST be a power of two: the writer indexes the ring as + * (pos & ring_mask), and ring_mask = ring_size - 1 only equals "low + * log2(ring_size) bits" when ring_size is a power of two. The GUC + * check hook enforces this. + */ + +/* Trace record types */ +#define TRACE_WAIT_EVENT 0 +#define TRACE_QUERY_START 1 +#define TRACE_QUERY_END 2 +#define TRACE_EXEC_START 3 +#define TRACE_EXEC_END 4 + +typedef struct WaitEventTraceRecord +{ + /* + * Seqlock for torn-read detection. Writers set seq to an odd value + * before filling fields, then to even after. Readers check seq before + * and after; if either is odd or they differ, the record is skipped. + * + * uint32 wraps after pos > 2^31 (~2.7 hours at 220K events/sec), but + * the protection only needs to hold for the reader's access window + * (~10-20 ns between seq_before and seq_after reads). A collision + * requires advancing 2^31 positions in that window -- physically + * impossible by 11 orders of magnitude. 
+ */ + uint32 seq; + uint8 record_type; /* TRACE_WAIT_EVENT / QUERY_START / QUERY_END */ + uint8 pad[3]; + int64 timestamp_ns; /* monotonic clock */ + union + { + struct /* record_type = TRACE_WAIT_EVENT */ + { + uint32 event; /* wait_event_info */ + uint32 pad2; + int64 duration_ns; + } wait; + struct /* record_type = TRACE_QUERY_START/END */ + { + int64 query_id; + int64 pad2; + } query; + } data; +} WaitEventTraceRecord; /* 32 bytes */ + +/* + * Compile-time invariants for the trace ring. These used to live as + * prose in the header comment above; the asserts make accidental + * violations (e.g. someone adding a field to WaitEventTraceRecord) a + * build failure instead of a silently-broken ring. + * + * The ring size itself is now runtime-configurable via the + * wait_event_trace_ring_size_kb GUC; the power-of-two invariant + * (required for the mask-indexing pos & ring_mask) is enforced by the + * GUC check hook, and the minimum-size invariant by the GUC bounds. + */ +StaticAssertDecl(sizeof(WaitEventTraceRecord) == 32, + "WaitEventTraceRecord must be exactly 32 bytes: the " + "seqlock wrap-safety argument relies on single-record, " + "single-cache-line writes, and ARR_DATA_PTR / mask-index " + "math assumes a fixed record stride."); + +/* + * Per-backend trace ring header followed by the records array. The + * records[] slab is variably sized at allocation time (the postmaster's + * value of wait_event_trace_ring_size_kb determines the row count). + * write_pos and ring_mask live on the same cache line so the hot path + * touches a single line for the index calculation. 
+ */ +typedef struct WaitEventTraceState +{ + pg_atomic_uint64 write_pos; /* monotonically increasing, wraps via mask */ + uint32 ring_mask; /* (ring_size - 1); ring_size is a power of two */ + uint32 ring_size_pad; /* keep 16-byte alignment for the records[] slab */ + WaitEventTraceRecord records[FLEXIBLE_ARRAY_MEMBER]; +} WaitEventTraceState; +/* ~4 MB per backend (allocated lazily via DSA). When the ring wraps, + * old records are silently overwritten. Readers detect overwritten + * records via the seqlock (odd seq = in-flight write). */ + +/* + * Per-procNumber trace-ring slot state. + * + * Slot lifecycle is decoupled from backend lifecycle on purpose: when a + * backend exits we deliberately do NOT free its ring. Instead we + * transition the slot to ORPHANED and leave the ring allocated in DSA. + * That preserves trace data past backend exit so it remains readable by + * cross-backend consumers: the in-tree pg_get_wait_event_trace SRF and + * any extension that follows the snapshot pattern documented on + * WaitEventTraceControl below. The original per-backend-ring design + * lost data the + * instant a parallel worker (or any short-lived backend) terminated, + * because the worker's before_shmem_exit callback ran dsa_free before + * any consumer could observe the final waits. See "Slot lifecycle and + * orphan-memory accounting" on WaitEventTraceControl below for the + * rationale and the bounded-memory cost of this choice. + * + * FREE no ring is allocated; ring_ptr is InvalidDsaPointer. + * This is the initial state of every slot at postmaster + * startup, and the state a slot returns to after + * pg_stat_clear_orphaned_wait_event_rings() or after a new + * backend at this procNumber clears the prior orphan. + * + * OWNED ring is allocated and a live backend at this procNumber + * is writing to it. Single-writer invariant holds: only + * the owner backend writes to records[]. 
Cross-backend + * consumers may read concurrently using the per-record + * seqlock protocol. + * + * ORPHANED ring is allocated but the previous owner has exited. + * Data is post-mortem and immutable -- no writer will + * touch it again. The ring stays in DSA until either + * (a) a new backend takes this procNumber and clears it, + * or (b) the DBA calls + * pg_stat_clear_orphaned_wait_event_rings() + * to release the memory. Worst-case orphan footprint is + * bounded at NUM_WAIT_EVENT_TIMING_SLOTS times the + * per-backend ring size set by + * wait_event_trace_ring_size_kb (default 4 MB; one + * orphaned ring per procNumber); see WaitEventTraceControl. + */ +typedef enum WaitEventTraceSlotState +{ + WAIT_EVENT_TRACE_SLOT_FREE = 0, + WAIT_EVENT_TRACE_SLOT_OWNED, + WAIT_EVENT_TRACE_SLOT_ORPHANED, +} WaitEventTraceSlotState; + +/* + * Per-procNumber slot in the trace control struct. + * + * Synchronization model + * --------------------- + * + * generation is bumped on every owner transition (FREE->OWNED at attach, + * OWNED->ORPHANED at backend exit, anything->FREE at orphan cleanup or + * release-on-disable). Cross-backend readers snapshot generation + * before and after their critical section; if it changed they discard + * the read and retry, matching the BackendStatusArray st_changecount + * idiom. Writers never read generation on the hot path -- it is + * touched only on slot transitions, which are rare (once per backend + * lifecycle plus admin cleanups). + * + * state is pg_atomic_uint32 only for cheap unlocked "is this slot + * worth visiting" probes (e.g. an iterating reader that walks all + * MaxBackends slots and skips FREE ones without taking the lock). + * Authoritative + * reads of state-and-ring_ptr together MUST be done under + * WaitEventTraceCtl->lock in LW_SHARED, paired with the + * generation-snapshot retry loop above. 
Writers always hold the lock + * in LW_EXCLUSIVE for the full transition, so a reader holding + * LW_SHARED observes an internally consistent slot. + * + * ring_ptr is touched only under WaitEventTraceCtl->lock; both writers + * (transitions) and readers (resolving the DSA pointer to read records) + * take the lock around it. The lock-hold for readers is bounded to + * the dsa_get_address + memcpy of the records of interest -- per-record + * processing must happen after the lock is released, both for + * latency and to avoid lock-ordering issues with other PG subsystems. + * + * Size: 8 + 4 + 4(pad) + 8 = 24 bytes per slot. At MaxBackends + AUX + * = ~1100 on a default cluster, ~26 KB of fixed shared memory total + * for the slot array -- negligible compared to the ring memory itself. + */ +typedef struct WaitEventTraceSlot +{ + pg_atomic_uint64 generation; /* bumped on every owner transition; + * cross-backend readers snapshot + * before+after their read and retry + * if it changed (BackendStatusArray + * st_changecount idiom) */ + pg_atomic_uint32 state; /* WaitEventTraceSlotState */ + uint32 pad; /* explicit pad to keep ring_ptr 8-aligned */ + dsa_pointer ring_ptr; /* InvalidDsaPointer when state == FREE; + * valid DSA pointer to the + * WaitEventTraceState chunk otherwise */ +} WaitEventTraceSlot; + +/* + * Control struct for lazy DSA-based trace ring allocation. + * Lives in fixed shared memory, one per cluster. + * + * The per-backend trace ring is a lock-free transport for external consumers. + * Writers (owning backend) update write_pos and use a per-record seqlock + * for torn-read detection. + * + * Slot lifecycle and orphan-memory accounting + * ------------------------------------------- + * + * The trace_slots[] array is indexed by procNumber. 
Each slot's + * lifecycle is independent of the backend lifecycle that briefly + * occupies it: when a backend exits we transition its slot to + * ORPHANED and leave the DSA-allocated ring in place, instead of the + * older design that called dsa_free in the backend's + * before_shmem_exit callback. That older design lost trace data the + * instant a backend exited, because the data was gone before any + * cross-backend reader could observe it. This was particularly + * acute for parallel workers, which exit in milliseconds at + * end-of-parallel-query; a reader polling at 1 Hz would never + * observe their waits before the data was freed. + * + * Persisting the ring past backend exit pays a bounded memory cost: + * up to NUM_WAIT_EVENT_TIMING_SLOTS orphaned rings can simultaneously + * exist, each sized by wait_event_trace_ring_size_kb (default 4 MB). + * At the default 4 MB and MaxBackends=100 + auxiliaries that ceiling + * is ~400 MB; at MaxBackends=1000 it is ~4 GB. Operators who need + * a tighter memory cap can lower wait_event_trace_ring_size_kb at + * server start (minimum 8 KB); operators who need longer retention + * before the FIFO wrap can raise it (maximum 32 MB). The ceiling is only + * reached if every procNumber has been used by a tracing backend and + * none of those procNumbers has been reused since. In typical + * deployments this does not happen: + * + * * Always-on tracing: connection churn keeps slots cycling, so + * orphans drain naturally as new backends claim procNumbers. + * * Brief diagnostic tracing: capture is enabled, a few backends + * trace, then capture is disabled. Slots gradually clear as + * the procNumbers are reused; or the DBA calls + * pg_stat_clear_orphaned_wait_event_rings() to release them + * immediately. + * * Long-lived pooled connections that never recycle: the worst + * pathological case. Operators who hit this should call the + * orphan-clear function after diagnostic sessions. 
+ * + * Compared to the alternatives, accepting the bounded orphan-memory + * cost wins on every other axis we care about: hot-path overhead is + * unchanged (single writer, lock-free), correctness is universal + * (parallel workers, autovacuum, walsender, all transient backends + * preserve their data), DSA's lazy-allocation property is preserved + * (capture=off pays zero memory), and the cross-backend reader + * pattern below is what pg_get_wait_event_trace uses; extensions + * implementing similar tools follow the same pattern with no further + * plumbing. See review_5.md issue #26 for the design discussion. + * + * External reader pattern (cross-backend consumers) + * ------------------------------------------------- + * + * External readers (extensions, background workers reading another + * backend's ring) MUST follow this protocol; the in-tree SRF + * pg_get_wait_event_trace() is the reference implementation. + * + * 1. Read trace_slots[procNumber].state without the lock as a cheap + * "worth visiting" check. If FREE, there is no ring -- nothing + * to do. Otherwise proceed to step 2. + * + * 2. Acquire WaitEventTraceCtl->lock in LW_SHARED. All slot + * transitions (FREE <-> OWNED <-> ORPHANED, including + * dsa_allocate / dsa_free of the ring) take LW_EXCLUSIVE, so the + * SHARED hold makes the slot's state, ring_ptr, and ring memory + * stable for the entire iteration that follows. This is what + * makes the per-slot generation counter optional for callers + * that, like this in-tree reader, keep the lock held across the + * iteration; callers that release and re-acquire the lock + * between batches must use the generation idiom from step 7 + * instead. + * + * 3. Re-check state under the lock. If FREE, the slot was + * reassigned between step 1 and the lock acquire; release the + * lock and return. + * + * 4. Resolve trace_slots[procNumber].ring_ptr via dsa_get_address + * and read write_pos = pg_atomic_read_u64(&ts->write_pos). 
No + * barrier is required here: the position-encoded identity + * seqlock check in step 5 rejects any stale-cycle visibility + * (writer's write_pos store seen by reader before the rec->seq + * store) by comparing rec->seq against the expected value for + * iterator position i, which the previous cycle's seq cannot + * equal. An ordering mismatch on weak-memory architectures + * simply causes the reader to skip the in-flight slot until the + * next call. + * + * 5. Iterate ring indices [read_start, write_pos), masking each + * through the ring (i & ts->ring_mask, where ring_mask is the + * per-ring mask cached next to write_pos in the ring header). + * For EACH record do the per-record seqlock protocol AGAINST + * SHARED MEMORY, using a POSITION-ENCODED IDENTITY check + * (not just parity): + * + * expected_seq = (uint32)(i * 2 + 2); / writer's complete-even + * value for ring position i / + * seq_before = rec_shared->seq; + * pg_read_barrier(); + * if (seq_before != expected_seq) continue; + * local_copy = *rec_shared; / 32-byte struct copy / + * pg_read_barrier(); + * seq_after = rec_shared->seq; + * if (seq_after != expected_seq) continue; + * + * Append valid records to a local result buffer for emission + * after the lock is released. + * + * The writer encodes the ring position into seq: mid-write is + * (pos * 2 + 1), complete is (pos * 2 + 2). Identity against + * (i * 2 + 2) rejects four distinct failure modes: + * + * - Stale previous cycle (seq < expected): writer just + * advanced write_pos to i+1 but the seq store for cycle i + * has not propagated to this CPU's view yet, so we see the + * even seq value from (i - RING_SIZE) -- the slot's + * previous occupant. Parity-only seqlock would accept + * this and emit a record belonging to the previous cycle + * with the new ring_index, a silent data-attribution bug. + * - Mid-write (seq == expected - 1, odd): writer is in the + * payload-write window between seq=odd and seq=even. 
+ * - Ring wrapped past us (seq > expected): a later cycle on + * this slot completed during our read. + * - Torn write completed mid-read (seq_after differs from + * seq_before): the writer crossed a full cycle while we + * copied the record. + * + * Do NOT memcpy the full records[] array up front and then do + * the seqlock check against the local copy: both seq reads + * would hit the same frozen byte in local memory, the check + * degenerates to a no-op, and torn / stale-cycle reads slip + * through. The seqlock protocol requires the two seq reads to + * go to shared memory at distinct times around the payload + * read, and they must be compared against the expected + * position-encoded value. + * + * 6. Release the lock. Per-record post-processing (event-name + * lookups, tuplestore population, network I/O) happens off the + * lock so spills to disk or slow consumers do not extend + * lock-hold. Lock-hold time is O(records_in_range) loads from + * shared memory; for the full ring this is ~1 ms on modern + * hardware -- on par with a single 4 MB memcpy and acceptable + * given the lock is contended only by other transitions + * (themselves rare) and other readers (which share with us). + * + * 7. Optional: snapshot trace_slots[procNumber].generation BEFORE + * step 2 and AFTER step 6; if it changed, the slot was + * reassigned across some lock-release boundary. This in-tree + * reader does not need the snapshot because it holds the lock + * throughout, but readers that batch their work across multiple + * lock-acquire windows (e.g. an extension that polls many slots + * in sequence without holding any single lock too long) should + * use the generation idiom to detect slot reassignment between + * batches. The generation counter is bumped under LW_EXCLUSIVE + * on every transition (FREE -> OWNED at attach, OWNED -> + * ORPHANED at backend exit, anything -> FREE at release/clear). 
+ * + * Same-backend readers (the in-tree pg_get_backend_wait_event_trace + * SRF) do NOT use the LWLock above -- same-backend serialization is + * implicit because a backend can only run one command at a time, + * and the SRF coordinates with wait_event_trace_release_slot via + * per-backend flags. That mechanism is private to + * wait_event_timing.c; external code should use the cross-backend + * protocol described above. + */ +typedef struct WaitEventTraceControl +{ + dsa_handle trace_dsa_handle; /* DSA_HANDLE_INVALID until first use */ + LWLock lock; /* protects DSA creation and slot + * transitions (FREE<->OWNED<-> + * ORPHANED including ring_ptr + * dsa_allocate / dsa_free) */ + WaitEventTraceSlot trace_slots[FLEXIBLE_ARRAY_MEMBER]; /* per procNumber */ +} WaitEventTraceControl; + + +/* + * Capture levels for the wait_event_capture GUC. Order is significant: + * higher values are strict supersets of lower ones, so a code path may use + * "level >= WAIT_EVENT_CAPTURE_STATS" to test for "at least STATS". + * + * OFF - No instrumentation, no hot-path cost. + * STATS - Aggregated per-event statistics in pg_stat_wait_event_timing + * (counts, durations, histograms). Hot path samples wall time + * around every wait. + * TRACE - Everything in STATS plus a per-session ring buffer of + * individual events and query markers, exposed via + * pg_backend_wait_event_trace. Adds ~4 MB DSA per session. + */ +typedef enum WaitEventCaptureLevel +{ + WAIT_EVENT_CAPTURE_OFF = 0, + WAIT_EVENT_CAPTURE_STATS, + WAIT_EVENT_CAPTURE_TRACE, +} WaitEventCaptureLevel; + +/* + * The hot path uses (capture_level != OFF) as the "any capture + * mode" gate and (capture_level == TRACE) for the trace-specific + * gate. Either form is order-independent, but the values are + * also constrained to a strict OFF < STATS < TRACE order so that + * future code paths needing "at least STATS" can compare with + * >= safely. 
Pin the invariant explicitly to catch enum + * reordering at compile time rather than via mysterious runtime + * mode switches. + */ +StaticAssertDecl(WAIT_EVENT_CAPTURE_OFF == 0 && + WAIT_EVENT_CAPTURE_STATS == 1 && + WAIT_EVENT_CAPTURE_TRACE == 2, + "WaitEventCaptureLevel values must be 0=OFF < 1=STATS < 2=TRACE"); + +/* GUC variables */ +extern PGDLLIMPORT int wait_event_capture; +extern PGDLLIMPORT int wait_event_trace_ring_size_kb; + +/* + * Records-per-ring value derived from wait_event_trace_ring_size_kb at + * server start. Cached file-scope so the allocator and any caller + * that wants the total record count (rather than the mask) does not + * have to redo the divide. Set once by ProcessConfigFile()'s startup + * sync of POSTMASTER-context GUCs; never updated thereafter. + */ +extern PGDLLIMPORT uint32 WaitEventTraceRingSize; + +/* Pointer to this backend's timing state in shared memory */ +extern PGDLLIMPORT WaitEventTimingState *my_wait_event_timing; + +/* + * Per-backend gate raised by the before_shmem_exit callback when + * proc_exit begins tearing down DSA mappings. The inline wait-event + * hot path checks this and skips ALL wait-event-timing work + * (including the lazy re-attach branch) once the gate is up, to + * avoid SIGSEGV on dangling pointers after dsm_backend_shutdown. + */ +extern PGDLLIMPORT bool wait_event_timing_writes_disabled; + +/* This backend's procNumber for the trace ring, or -1 if not set */ +extern PGDLLIMPORT int my_trace_proc_number; + +/* + * Shared memory setup -- registered via the shmem subsystem registry + * (src/include/storage/subsystemlist.h). Stub builds expose a no-op + * callbacks struct so subsystemlist.h references resolve either way. 
+ */ +extern PGDLLIMPORT const ShmemCallbacks WaitEventTimingShmemCallbacks; +extern PGDLLIMPORT const ShmemCallbacks WaitEventTraceControlShmemCallbacks; + +/* Called from InitProcess() to point my_wait_event_timing at our slot */ +extern void pgstat_set_wait_event_timing_storage(int procNumber); +extern void pgstat_reset_wait_event_timing_storage(void); + +/* Lazy DSA-based trace ring buffer allocation */ +extern void wait_event_trace_attach(int procNumber); + +/* GUC hooks declared in guc_hooks.h */ + +/* Trace marker functions (defined in wait_event_timing.c) */ +extern void wait_event_trace_query_start(int64 query_id); +extern void wait_event_trace_query_end(int64 query_id); +extern void wait_event_trace_exec_start(int64 query_id); +extern void wait_event_trace_exec_end(int64 query_id); + +#endif /* WAIT_EVENT_TIMING_H */ diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build index 4bca42bb3706a..9ae79c3ce6746 100644 --- a/src/test/modules/meson.build +++ b/src/test/modules/meson.build @@ -39,6 +39,7 @@ subdir('test_json_parser') subdir('test_lfind') subdir('test_lwlock_tranches') subdir('test_misc') +subdir('test_wait_event_stress') subdir('test_oat_hooks') subdir('test_parser') subdir('test_pg_dump') diff --git a/src/test/modules/test_wait_event_stress/Makefile b/src/test/modules/test_wait_event_stress/Makefile new file mode 100644 index 0000000000000..69d10db51831b --- /dev/null +++ b/src/test/modules/test_wait_event_stress/Makefile @@ -0,0 +1,19 @@ +MODULES = test_wait_event_stress +PGFILEDESC = "test_wait_event_stress - wait event timing overhead measurement" + +EXTENSION = test_wait_event_stress +DATA = test_wait_event_stress--1.0.sql + +REGRESS = test_wait_event_stress +TAP_TESTS = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_wait_event_stress +top_builddir = ../../../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/test_wait_event_stress/expected/test_wait_event_stress.out b/src/test/modules/test_wait_event_stress/expected/test_wait_event_stress.out new file mode 100644 index 0000000000000..cba09b3c594f4 --- /dev/null +++ b/src/test/modules/test_wait_event_stress/expected/test_wait_event_stress.out @@ -0,0 +1,121 @@ +CREATE EXTENSION test_wait_event_stress; +-- Start from a clean slate so this test is idempotent against any state +-- left behind by earlier queries in the same session. +SELECT pg_stat_reset_wait_event_timing(NULL); + pg_stat_reset_wait_event_timing +--------------------------------- + +(1 row) + +-- Basic stress test: verify function works (requires capture to be on so +-- the instrumentation path actually executes work we can time). +SET wait_event_capture = stats; +SELECT stress_wait_events(10000) > 0 AS stress_ok; + stress_ok +----------- + t +(1 row) + +RESET wait_event_capture; +-- Deterministic exact-count coverage. Core regression's wait_event_timing +-- test uses pg_sleep(), which can emit a non-deterministic number of +-- PgSleep wait events under CPU contention, so it cannot assert exact +-- counts. stress_wait_events(N) calls pgstat_report_wait_start/end in a +-- tight loop exactly N times, giving us strictly deterministic input for +-- the ring + aggregated-stats pipeline. This catches symmetric +-- duplication bugs that the count-agnostic core assertions would miss. +SELECT pg_stat_reset_wait_event_timing(NULL); + pg_stat_reset_wait_event_timing +--------------------------------- + +(1 row) + +SET wait_event_capture = trace; +-- stress_wait_events returns elapsed microseconds; on fast TSC-based +-- timers 5 iterations can round to 0 us, so check IS NOT NULL (the +-- function succeeded) rather than > 0. 
+SELECT stress_wait_events(5) IS NOT NULL AS deterministic_input_ok; + deterministic_input_ok +------------------------ + t +(1 row) + +SELECT count(*) = 5 AS ring_has_exactly_five +FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep'; + ring_has_exactly_five +----------------------- + t +(1 row) + +SELECT calls = 5 AS aggregated_has_exactly_five +FROM pg_stat_wait_event_timing +WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep'; + aggregated_has_exactly_five +----------------------------- + t +(1 row) + +RESET wait_event_capture; +-- LWLock hash overflow test: register 200 tranches (> 192 limit) +-- This should trigger a WARNING about hash table being full +SET wait_event_capture = stats; +-- Start from a clean slate so we can make deterministic assertions +-- about the overflow counter. +SELECT pg_stat_reset_wait_event_timing(NULL); + pg_stat_reset_wait_event_timing +--------------------------------- + +(1 row) + +SELECT lwlock_overflow_count AS before_overflow +FROM pg_stat_wait_event_timing_overflow +WHERE pid = pg_backend_pid(); + before_overflow +----------------- + 0 +(1 row) + +SET client_min_messages = warning; +SELECT test_lwlock_hash_overflow(200); +WARNING: wait_event_timing: LWLock hash table full, timing data for some LWLock tranches will be lost +HINT: This backend uses more than 192 distinct LWLock tranches; raise wait_event_timing_max_tranches. + test_lwlock_hash_overflow +--------------------------- + 200 +(1 row) + +RESET client_min_messages; +-- After overflow the counter must be visible from SQL. +SELECT lwlock_overflow_count > 0 AS overflow_visible +FROM pg_stat_wait_event_timing_overflow +WHERE pid = pg_backend_pid(); + overflow_visible +------------------ + t +(1 row) + +-- Reset clears the overflow counter (pins the fix for issue #9). 
+SELECT pg_stat_reset_wait_event_timing(NULL); + pg_stat_reset_wait_event_timing +--------------------------------- + +(1 row) + +SELECT lwlock_overflow_count = 0 AS lw_cleared, + flat_overflow_count = 0 AS flat_cleared +FROM pg_stat_wait_event_timing_overflow +WHERE pid = pg_backend_pid(); + lw_cleared | flat_cleared +------------+-------------- + t | t +(1 row) + +-- Verify the function returns the count +SELECT test_lwlock_hash_overflow(10); + test_lwlock_hash_overflow +--------------------------- + 10 +(1 row) + +RESET wait_event_capture; +DROP EXTENSION test_wait_event_stress; diff --git a/src/test/modules/test_wait_event_stress/meson.build b/src/test/modules/test_wait_event_stress/meson.build new file mode 100644 index 0000000000000..ef00737017497 --- /dev/null +++ b/src/test/modules/test_wait_event_stress/meson.build @@ -0,0 +1,38 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +test_wait_event_stress_sources = files( + 'test_wait_event_stress.c', +) + +if host_system == 'windows' + test_wait_event_stress_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'test_wait_event_stress', + '--FILEDESC', 'test_wait_event_stress - wait event timing overhead measurement',]) +endif + +test_wait_event_stress = shared_module('test_wait_event_stress', + test_wait_event_stress_sources, + kwargs: pg_test_mod_args, +) +test_install_libs += test_wait_event_stress + +test_install_data += files( + 'test_wait_event_stress.control', + 'test_wait_event_stress--1.0.sql', +) + +tests += { + 'name': 'test_wait_event_stress', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'test_wait_event_stress', + ], + }, + 'tap': { + 'tests': [ + 't/001_orphan_roundtrip.pl', + ], + }, +} diff --git a/src/test/modules/test_wait_event_stress/sql/test_wait_event_stress.sql b/src/test/modules/test_wait_event_stress/sql/test_wait_event_stress.sql new file mode 100644 index 0000000000000..4579d00eb2897 --- /dev/null +++ 
b/src/test/modules/test_wait_event_stress/sql/test_wait_event_stress.sql @@ -0,0 +1,66 @@ +CREATE EXTENSION test_wait_event_stress; + +-- Start from a clean slate so this test is idempotent against any state +-- left behind by earlier queries in the same session. +SELECT pg_stat_reset_wait_event_timing(NULL); + +-- Basic stress test: verify function works (requires capture to be on so +-- the instrumentation path actually executes work we can time). +SET wait_event_capture = stats; +SELECT stress_wait_events(10000) > 0 AS stress_ok; +RESET wait_event_capture; + +-- Deterministic exact-count coverage. Core regression's wait_event_timing +-- test uses pg_sleep(), which can emit a non-deterministic number of +-- PgSleep wait events under CPU contention, so it cannot assert exact +-- counts. stress_wait_events(N) calls pgstat_report_wait_start/end in a +-- tight loop exactly N times, giving us strictly deterministic input for +-- the ring + aggregated-stats pipeline. This catches symmetric +-- duplication bugs that the count-agnostic core assertions would miss. +SELECT pg_stat_reset_wait_event_timing(NULL); +SET wait_event_capture = trace; +-- stress_wait_events returns elapsed microseconds; on fast TSC-based +-- timers 5 iterations can round to 0 us, so check IS NOT NULL (the +-- function succeeded) rather than > 0. +SELECT stress_wait_events(5) IS NOT NULL AS deterministic_input_ok; + +SELECT count(*) = 5 AS ring_has_exactly_five +FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep'; + +SELECT calls = 5 AS aggregated_has_exactly_five +FROM pg_stat_wait_event_timing +WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep'; +RESET wait_event_capture; + +-- LWLock hash overflow test: register 200 tranches (> 192 limit) +-- This should trigger a WARNING about hash table being full +SET wait_event_capture = stats; + +-- Start from a clean slate so we can make deterministic assertions +-- about the overflow counter. 
+SELECT pg_stat_reset_wait_event_timing(NULL); +SELECT lwlock_overflow_count AS before_overflow +FROM pg_stat_wait_event_timing_overflow +WHERE pid = pg_backend_pid(); + +SET client_min_messages = warning; +SELECT test_lwlock_hash_overflow(200); +RESET client_min_messages; + +-- After overflow the counter must be visible from SQL. +SELECT lwlock_overflow_count > 0 AS overflow_visible +FROM pg_stat_wait_event_timing_overflow +WHERE pid = pg_backend_pid(); + +-- Reset clears the overflow counter (pins the fix for issue #9). +SELECT pg_stat_reset_wait_event_timing(NULL); +SELECT lwlock_overflow_count = 0 AS lw_cleared, + flat_overflow_count = 0 AS flat_cleared +FROM pg_stat_wait_event_timing_overflow +WHERE pid = pg_backend_pid(); + +-- Verify the function returns the count +SELECT test_lwlock_hash_overflow(10); + +RESET wait_event_capture; +DROP EXTENSION test_wait_event_stress; diff --git a/src/test/modules/test_wait_event_stress/t/001_orphan_roundtrip.pl b/src/test/modules/test_wait_event_stress/t/001_orphan_roundtrip.pl new file mode 100644 index 0000000000000..d5bcde4302325 --- /dev/null +++ b/src/test/modules/test_wait_event_stress/t/001_orphan_roundtrip.pl @@ -0,0 +1,519 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group +# +# End-to-end test for wait-event trace orphan persistence and the +# in-tree cross-backend reader pg_get_wait_event_trace(). +# +# Four scenarios (the fourth is described in its own section below): +# +# 1. Plain backend orphan roundtrip +# A writer session enables wait_event_capture = trace, emits a +# handful of waits, captures its own procnumber, and disconnects. +# A separate long-lived reader session then asserts that +# pg_get_wait_event_trace() returns the +# writer's recorded events post-mortem, that +# pg_stat_clear_orphaned_wait_event_rings() releases the orphan, +# and that a subsequent read returns empty. +# +# 2. 
Parallel-worker orphan roundtrip (the patch's stated motivation) +# A query is forced through parallel workers via +# debug_parallel_query=on plus zero parallel costs; the workers +# exit at end-of-parallel-query in milliseconds. The test +# then asserts pg_stat_clear_orphaned_wait_event_rings() +# returns at least 2 -- the leader and at least one worker -- +# confirming that short-lived parallel workers do leave +# readable orphans, the case the orphan-persistence lifecycle +# was designed for. +# +# 3. OWNED-slot read with a concurrent live writer +# A long-lived writer session emits a steady stream of PgSleep +# wait events while a separate reader calls +# pg_get_wait_event_trace(writer_procnumber) repeatedly. All +# rows must be well-formed (no NULL/empty event_type or +# event_name, no negative durations) -- this exercises the +# per-record seqlock protocol that protects against torn +# reads of records mid-write. Without the seqlock the reader +# would emit malformed records during contention windows. +# +# Race-hardening: the reader session is held open for the entire +# run so its procnumber slot cannot be a recycle target for any +# writer or parallel worker when they exit, and the test asserts +# no unrelated client backend is present at the orphan-read +# moment. Skipped on builds without --enable-wait-event-timing. + +use strict; +use warnings FATAL => 'all'; + +use Time::HiRes qw(usleep); +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +my $node = PostgreSQL::Test::Cluster->new('wet_orphan_roundtrip'); +$node->init; +# A high max_connections gives plenty of unused procnumber slots so a +# new backend started during the test window is unlikely to recycle +# the just-exited writer's slot. Combined with the long-lived reader +# session below (which pins its own slot for the full run), this +# closes the race window to negligible width on a quiet test node. 
+$node->append_conf('postgresql.conf', q{ +max_connections = 100 +}); +$node->start; + +# Skip when wait-event-timing isn't compiled in. GUC check hook +# rejects 'trace' on stub builds; detect via a probe SET. +my ($rc, $stdout, $stderr) = $node->psql( + 'postgres', + "SET wait_event_capture = trace;"); +if ($stderr =~ /not supported by this build/) +{ + plan skip_all => + 'wait_event_timing not compiled in (--enable-wait-event-timing)'; +} + +# Long-lived reader session. Stays connected for the entire test so +# its procnumber slot is in OWNED state and therefore not eligible as +# a recycle target for writer/parallel-worker slots when they exit. +my $reader = $node->background_psql('postgres'); + +# ------------------------------------------------------------------ +# Scenario 1: plain backend orphan roundtrip +# ------------------------------------------------------------------ + +# Spawn the writer as a one-shot psql. It enables trace, emits a +# handful of waits inside a DO block (PERFORM avoids empty-row +# pollution of the captured output), then SELECTs its own procnumber. +# Because psql one-shot commands spawn a fresh backend that exits +# when the SQL completes, the writer's slot transitions to ORPHANED +# on exit. +my $writer_proc = $node->safe_psql( + 'postgres', q{ + SET wait_event_capture = trace; + DO $$ + BEGIN + PERFORM pg_sleep(0.02); + PERFORM pg_sleep(0.02); + PERFORM pg_sleep(0.02); + END + $$; + SELECT procnumber + FROM pg_stat_get_wait_event_timing(pg_backend_pid()) + WHERE pid = pg_backend_pid() + LIMIT 1; + }); +chomp $writer_proc; +like($writer_proc, qr/^\d+$/, 'writer reported its procnumber'); + +# Wait for the writer backend to fully exit. pg_stat_activity loses +# the row before the before_shmem_exit callback finishes; we then +# additionally assert that no other client backend has inherited the +# writer's procnumber, which would clear the orphan via +# wait_event_trace_clear_orphan_at_init. 
+my $other_clients_query = + "SELECT count(*) FROM pg_stat_activity " + . "WHERE backend_type = 'client backend' AND pid <> pg_backend_pid();"; + +my $writer_gone = 0; +for (my $i = 0; $i < 100; $i++) +{ + my $count = $reader->query_safe($other_clients_query); + chomp $count; + if ($count eq '0') { $writer_gone = 1; last; } + usleep(20_000); +} +ok($writer_gone, 'writer backend has exited (slot should be ORPHANED)'); + +# Race-harden: confirm no client backend has taken over the writer's +# procnumber between its exit and our read. This is what would +# clear the orphan; if some other test artifact triggered it we want +# the test to fail loudly rather than spuriously report "no orphan". +my $recycler_count = $reader->query_safe( + "SELECT count(*) FROM pg_stat_activity " + . "WHERE backend_type = 'client backend' " + . " AND pid <> pg_backend_pid();"); +chomp $recycler_count; +is($recycler_count, '0', + 'no other client backend present at orphan-read time (slot not recycled)'); + +# Read the orphaned ring via the cross-backend reader. At least one +# record is expected (we emitted three pg_sleep waits). +my $orphan_rows = $reader->query_safe( + "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);"); +chomp $orphan_rows; +cmp_ok($orphan_rows, '>=', 1, + "pg_get_wait_event_trace($writer_proc) reads ORPHANED ring (rows: $orphan_rows)"); + +# Admin sweep: clear the orphan. Should report >= 1 since we know one +# ORPHANED slot exists. +my $cleared = $reader->query_safe( + "SELECT pg_stat_clear_orphaned_wait_event_rings();"); +chomp $cleared; +cmp_ok($cleared, '>=', 1, + "pg_stat_clear_orphaned_wait_event_rings released $cleared ring(s)"); + +# After the sweep, the orphan is gone. Reading the same procnumber +# returns empty. 
+my $after_clear = $reader->query_safe( + "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);"); +chomp $after_clear; +is($after_clear, '0', + "pg_get_wait_event_trace($writer_proc) returns empty after sweep"); + +# ------------------------------------------------------------------ +# Scenario 2: parallel-worker orphan roundtrip +# ------------------------------------------------------------------ +# +# Force parallel workers to participate in a sorted scan, wait for +# the leader and workers to exit, then assert via the admin sweep +# that those exits left ORPHANED rings behind (workers exit in +# milliseconds at end-of-parallel-query). + +# Create the table used to force parallelism. Done from the reader +# session so it survives across the writer's lifetime. Suppress +# NOTICE so query_safe doesn't treat the "does not exist, skipping" +# message as a failure (BackgroundPsql::query_safe treats any stderr +# output as a query failure). The table is sized large enough +# (1M rows) and the query is structured (ORDER BY + count(*) under +# a Gather) so workers reliably emit wait events (tuple queue +# operations, latch waits) before they exit at end-of-parallel- +# query. A smaller table with a plain count(*) can be processed +# entirely from cache without any wait points, leaving worker +# trace rings never lazily allocated, never transitioning to +# OWNED, and never producing ORPHANED slots -- the orphan-count +# assertion below would then have no worker rings to count. +$reader->query_safe("SET client_min_messages = warning;"); +$reader->query_safe(q{ + DROP TABLE IF EXISTS wet_parallel_target; + CREATE TABLE wet_parallel_target AS + SELECT i FROM generate_series(1, 1000000) i; +}); + +# Spawn a writer session that enables trace and runs a forced- +# parallel query. Workers run, then exit at end-of-parallel-query; +# the leader (this safe_psql backend) then exits when safe_psql +# returns. 
After return, both leader and workers are gone -- each +# leaves an ORPHANED slot whose ring should be readable. +$node->safe_psql( + 'postgres', q{ + SET wait_event_capture = trace; + SET min_parallel_table_scan_size = 0; + SET parallel_setup_cost = 0; + SET parallel_tuple_cost = 0; + SET max_parallel_workers_per_gather = 2; + SET debug_parallel_query = on; + -- ORDER BY forces a parallel sort and a Gather Merge, + -- which routes tuples through shm_mq queues -- workers + -- reliably emit MessageQueueSend / MessageQueueReceive + -- wait events here, guaranteeing lazy trace-ring + -- allocation and OWNED->ORPHANED transition on exit. + SELECT count(*) FROM ( + SELECT i FROM wet_parallel_target ORDER BY i + ) s; + }); + +# Wait for all client/worker backends to fully exit. At +# safe_psql return the leader has exited, but worker +# before_shmem_exit callbacks may still be running -- +# pg_stat_clear_orphaned_wait_event_rings counts only +# slots that have completed their OWNED -> ORPHANED +# transition, so racing the callbacks under-counts. +my $exit_drained = 0; +for (my $i = 0; $i < 200; $i++) +{ + my $count = $reader->query_safe( + "SELECT count(*) FROM pg_stat_activity " + . "WHERE backend_type IN ('client backend', 'parallel worker') " + . " AND pid <> pg_backend_pid();"); + chomp $count; + if ($count eq '0') { $exit_drained = 1; last; } + usleep(20_000); +} +ok($exit_drained, + 'all parallel-query backends have exited before counting orphans'); + +# Count parallel-produced orphans via the admin sweep, which +# returns the number of rings released. After a forced-parallel +# query with the leader and workers all exited, we expect at +# least 2 orphans (leader + at least one worker). +# +# Using the sweep is cheaper than iterating every procnumber and +# calling pg_get_wait_event_trace on each: it's a single lock +# acquisition and tells us the count directly. 
The read path +# itself is already covered by scenario 1 above; here we only +# need to confirm that parallel-worker exits do produce orphans. +my $parallel_orphans = $reader->query_safe( + "SELECT pg_stat_clear_orphaned_wait_event_rings();"); +chomp $parallel_orphans; +cmp_ok($parallel_orphans, '>=', 2, + "parallel-query exit produced >= 2 orphans (leader + worker(s)): $parallel_orphans"); + +# ------------------------------------------------------------------ +# Scenario 3: OWNED-slot read with a concurrent live writer +# ------------------------------------------------------------------ +# +# Exercises the per-record seqlock protocol against an actively +# writing backend. OWNED is the case where the seqlock check is +# load-bearing: the writer is concurrently appending records to +# the ring while the reader iterates. A torn read (writer +# mid-record at the moment of the reader's payload copy) must be +# detected and the record skipped; well-formed records must be +# emitted intact, never with a malformed event_type or event_name +# that would otherwise crash pgstat_get_wait_event_type() or +# materialise NULL strings into the result. +# +# Setup: a long-lived BackgroundPsql writer that has +# wait_event_capture = trace and runs a tight pg_sleep loop +# producing a steady stream of PgSleep wait events. While the +# writer is emitting, the reader calls +# pg_get_wait_event_trace(writer_procnumber) repeatedly and +# asserts every observed row has well-formed event_type, +# event_name, and a non-negative duration. Any torn record that +# slipped through the seqlock surfaces here as a NULL or empty +# string (or worse, a crash inside the SRF). + +my $writer_bg = $node->background_psql('postgres'); +$writer_bg->query_safe("SET client_min_messages = warning;"); +$writer_bg->query_safe("SET wait_event_capture = trace;"); +# Generate at least one wait so the ring is allocated and the +# procnumber appears in pg_stat_get_wait_event_timing. 
+$writer_bg->query_safe("SELECT pg_sleep(0.01);"); +my $writer_bg_proc = $writer_bg->query_safe( + "SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) " + . "WHERE pid = pg_backend_pid() LIMIT 1;"); +chomp $writer_bg_proc; +like($writer_bg_proc, qr/^\d+$/, + 'live writer reported its procnumber'); + +# Start a burst of wait events asynchronously. query_until +# returns as soon as it sees the \echo banner, leaving the DO +# block executing pg_sleep(0.001) in a tight 1000-iteration loop +# (~1 s wall, ~1000 PgSleep wait events) in the background. +$writer_bg->query_until( + qr/burst_started/, + "\\echo burst_started\n" + . "DO \$\$ BEGIN FOR i IN 1..1000 LOOP PERFORM pg_sleep(0.001); END LOOP; END \$\$;\n"); + +# Read concurrently from the reader session. Each read iterates +# the writer's ring under LW_SHARED; the writer is freely +# appending records. Any torn row surfaces as NULL/empty +# event_type, event_name, or negative duration. +my $live_read_attempts = 10; +my $live_reads_ok = 1; +my $live_total_observed = 0; +for (my $r = 0; $r < $live_read_attempts; $r++) +{ + my $bad = $reader->query_safe( + "SELECT count(*) FROM pg_get_wait_event_trace($writer_bg_proc) t " + . "WHERE t.wait_event_type IS NULL " + . " OR t.wait_event_type = '' " + . " OR t.wait_event IS NULL " + . " OR t.wait_event = '' " + . 
" OR t.duration_us < 0;"); + chomp $bad; + if ($bad ne '0') + { + $live_reads_ok = 0; + diag("read $r against live writer returned $bad malformed row(s)"); + last; + } + + my $total = $reader->query_safe( + "SELECT count(*) FROM pg_get_wait_event_trace($writer_bg_proc);"); + chomp $total; + $live_total_observed += $total; + + usleep(50_000); +} +ok($live_reads_ok, + 'OWNED-slot reads against live writer produced only well-formed rows'); +cmp_ok($live_total_observed, '>', 0, + "OWNED-slot reads observed records across $live_read_attempts reads (total: $live_total_observed)"); + +# Wait for the writer's DO block to finish; this query_safe +# blocks until psql is ready to receive new input. +$writer_bg->query_safe("SELECT 1;"); +$writer_bg->quit; + +# ------------------------------------------------------------------ +# Scenario 4: wait_event_trace_clear_orphan_at_init reclaims an +# orphan when a new backend inherits the same procNumber slot +# ------------------------------------------------------------------ +# +# review_6.md issue #8 asked specifically for coverage of the +# clear_orphan_at_init path (the lazy lifecycle reclaim that +# runs at every backend's InitProcess and frees a prior orphan +# whose procNumber the new backend has inherited). Scenarios 1 +# and 2 above exercise the admin-driven sweep +# (pg_stat_clear_orphaned_wait_event_rings) but not the init-time +# per-slot reclaim, leaving a gap in regression coverage for the +# lifecycle's "common case" path. +# +# Strategy: +# 1. Spawn writer W4, enable wait_event_capture=trace, emit a +# wait so W4's slot transitions FREE -> OWNED with a real +# trace ring allocated, capture W4's procnumber, disconnect. +# W4's slot is now ORPHANED with non-empty ring contents. +# 2. Verify from the reader session that the orphan is visible +# via pg_get_wait_event_trace(w4_proc). +# 3. Spawn a new backend B with wait_event_capture=stats (so B +# does NOT allocate a trace ring of its own). Query B's +# procnumber. 
If B inherited W4's procnumber slot, then B's +# clear_orphan_at_init must have transitioned the slot from +# ORPHANED -> FREE at InitProcess time; we verify that by +# asserting pg_get_wait_event_trace(w4_proc) is empty. +# 4. Retry up to a bounded number of times: procNumber +# assignment is determined by ProcGlobal's free list, which +# hands slots out in FIFO order: the just-freed slot is reused +# only after the slots ahead of it, and that reuse is not strictly +# guaranteed (aux processes, autovacuum workers, etc. can +# take the slot in between). If we exhaust retries without +# a same-procnumber hit, mark the scenario as skipped rather +# than fail -- the failure mode is environment-dependent, +# not a defect under test. + +my $w4 = $node->background_psql('postgres'); +$w4->query_safe("SET client_min_messages = warning;"); +$w4->query_safe("SET wait_event_capture = trace;"); +$w4->query_safe("SELECT pg_sleep(0.01);"); + +my $w4_proc = $w4->query_safe( + "SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) " + . "WHERE pid = pg_backend_pid() LIMIT 1;"); +chomp $w4_proc; +like($w4_proc, qr/^\d+$/, + 'scenario-4 writer reported its procnumber'); + +$w4->quit; + +# Wait for W4's full server-side exit before reading the orphan +# or starting the retry loop. pg_stat_activity loses the row +# before the before_shmem_exit callbacks finish (the +# OWNED -> ORPHANED transition lives in such a callback), so +# polling pg_stat_activity isn't a perfect signal for the +# transition itself -- but it IS a perfect signal for "the slot +# has been returned to ProcGlobal->freeProcs and may now be +# inherited by a new backend", which is what we need before +# entering the retry loop. 
Without this poll, on slower test +# environments the next query can race ahead of the cleanup and +# either observe the slot in OWNED state (which the reader +# still reads correctly, but is a different invariant from what +# this scenario tests) or observe $w4 still in pg_stat_activity +# and fail the "no other client backend" assertion below. +my $w4_gone = 0; +for (my $i = 0; $i < 100; $i++) +{ + my $count = $reader->query_safe( + "SELECT count(*) FROM pg_stat_activity " + . "WHERE backend_type = 'client backend' " + . " AND pid <> pg_backend_pid();"); + chomp $count; + if ($count eq '0') { $w4_gone = 1; last; } + usleep(20_000); +} +ok($w4_gone, 'scenario-4 writer backend has exited'); + +my $orphan_rows4 = $reader->query_safe( + "SELECT count(*) FROM pg_get_wait_event_trace($w4_proc);"); +chomp $orphan_rows4; +cmp_ok($orphan_rows4, '>=', 1, + "scenario-4 orphan visible at procnumber $w4_proc before any reclaim " + . "(rows: $orphan_rows4)"); + +# Retry loop: spawn new backends with capture = stats (does NOT +# allocate a trace ring), capture procnumber, check whether the +# inheritance landed on w4_proc. +my $reclaimed = 0; +my $attempts = 0; +# ProcGlobal->freeProcs is a FIFO (dlist_push_tail on backend exit, +# dlist_pop_head_node on new backend init), so after W4 disconnects +# the just-freed slot goes to the tail of the queue. To cycle the +# queue back around to W4's procnumber, a new backend has to be +# spawned for every free slot ahead of W4's in the queue. The +# cluster's max_connections is the upper bound; query it and add +# a 20% safety margin to absorb any walsender/bgworker free-list +# overlap or queue-occupancy fluctuations from autovacuum/etc. +# This adapts automatically if the test config in $node->append_conf +# above is changed. 
+my $max_connections = $node->safe_psql('postgres', + 'SHOW max_connections;'); +chomp $max_connections; +my $max_attempts = int($max_connections * 1.2); +my @observed_procs; + +for (my $i = 0; $i < $max_attempts; $i++) +{ + $attempts++; + my $backend = $node->background_psql('postgres'); + $backend->query_safe("SET client_min_messages = warning;"); + $backend->query_safe("SET wait_event_capture = stats;"); + # Sleep duration kept slightly above the millisecond range so the + # pg_stat_get_wait_event_timing query reliably observes a non-zero + # entry count on slow CI hosts that may aggressively optimise + # sub-millisecond WaitLatch dispatches. + $backend->query_safe("SELECT pg_sleep(0.005);"); + + my $b_proc = $backend->query_safe( + "SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) " + . "WHERE pid = pg_backend_pid() LIMIT 1;"); + chomp $b_proc; + push @observed_procs, $b_proc; + + if ($b_proc eq $w4_proc) + { + # Inheritance landed. B's clear_orphan_at_init at + # InitProcess time must have FREE'd W4's ORPHANED + # slot. Since B used capture = stats (not trace), B + # allocated no new trace ring at this slot, so a + # subsequent pg_get_wait_event_trace($w4_proc) should + # return zero rows. If it returns >= 1 row, those + # rows are W4's stale records -- the reclaim did not + # happen and the test fails. + my $rows_after = $reader->query_safe( + "SELECT count(*) FROM pg_get_wait_event_trace($w4_proc);"); + chomp $rows_after; + is($rows_after, '0', + "clear_orphan_at_init reclaimed W4's orphan when B " + . "inherited procnumber $w4_proc (attempt $attempts)") + or diag("expected 0 rows but pg_get_wait_event_trace(" + . "$w4_proc) returned $rows_after; observed procnumber " + . "sequence: " . join(",", @observed_procs)); + $reclaimed = 1; + $backend->quit; + last; + } + $backend->quit; +} + +SKIP: +{ + if (!$reclaimed) + { + diag("observed procnumbers (W4 was $w4_proc): " + . join(",", @observed_procs)); + skip("procnumber $w4_proc was not reused within $max_attempts attempts; " + . 
"the ProcGlobal free-list order is environment-dependent. " + . "The clear_orphan_at_init code path is exercised at every " + . "backend init that DOES inherit an orphan; scenarios 1 and 2 " + . "above also cover orphan reclamation via the admin sweep.", + 1); + } +} + +# Note on test coverage: the position-encoded identity seqlock +# in emit_wait_event_trace_for_procnumber() has no direct +# regression test. The bug it prevents (reader observing the +# writer's new write_pos before the writer's rec->seq update has +# propagated, then emitting a stale record with the wrong ring +# index) is unreachable on x86 TSO without instrumentation. The +# writer-side INJECTION_POINT("wait-event-trace-after-write-pos") +# is in place to support such a test -- a future TAP scenario +# can attach with action 'wait' and verify the reader skips the +# in-flight slot. Wiring an async BackgroundPsql to wedge inside +# the wait-event recording path proved fiddly enough to defer to +# a follow-up; the identity check is correct by construction and +# the protocol is documented on WaitEventTraceControl. + +$reader->quit; +$node->stop; + +done_testing; diff --git a/src/test/modules/test_wait_event_stress/t/002_ring_wrap.pl b/src/test/modules/test_wait_event_stress/t/002_ring_wrap.pl new file mode 100644 index 0000000000000..ae3c101bb4934 --- /dev/null +++ b/src/test/modules/test_wait_event_stress/t/002_ring_wrap.pl @@ -0,0 +1,134 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group +# +# Wraparound regression test for the wait-event-trace ring buffer. +# +# Provisions a cluster with the smallest legal +# wait_event_trace_ring_size_kb (8 KB = 256 records) and a small +# max_connections, then drives a session through enough wait +# events to force the ring to wrap many times. Verifies that: +# +# 1. 
The session-local SRF (pg_get_backend_wait_event_trace) +# remains queryable when the ring has wrapped: the result is +# bounded by the ring size, well-formed, and the seq column +# reflects the most-recent records (not the oldest). +# +# 2. The cross-backend reader (pg_get_wait_event_trace) on a +# wrapped, currently-OWNED slot also returns well-formed +# records bounded by the ring size, with the per-record +# position-encoded identity seqlock correctly distinguishing +# current-cycle records from overwritten earlier-cycle ones. +# +# If the writer's `pos & ring_mask` indexing or the seqlock's +# identity check (expected_seq = pos*2 + 2) is wrong, this test +# either crashes the reader, produces NULL columns, or returns +# more records than the ring can hold. + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use Time::HiRes qw(usleep); + +my $node = PostgreSQL::Test::Cluster->new('wet_ring_wrap'); +$node->init; + +# Smallest legal ring size (8 KB = 256 records of 32 bytes each). +# Combined with the loops below this guarantees many ring wraps in +# the writer session. max_connections kept small so the +# administrative cost of starting/stopping backends stays low. +$node->append_conf( + 'postgresql.conf', q{ +max_connections = 20 +wait_event_trace_ring_size_kb = 8 +}); +$node->start; + +# Skip if wait-event-timing wasn't compiled in. Detect via a probe SET; +# stub builds reject non-OFF values ("... does not support wait event capture"). +my ($rc, $stdout, $stderr) = $node->psql( + 'postgres', + "SET wait_event_capture = trace;", + on_error_stop => 0); +if ($stderr =~ /does not support wait event capture|wait event capture is not supported/) +{ + plan skip_all => 'wait-event-timing not compiled in'; +} + +# Verify the GUC is what we asked for. 
+my $ring_kb = $node->safe_psql('postgres', "SHOW wait_event_trace_ring_size_kb;"); +chomp $ring_kb; +is($ring_kb, '8kB', + "wait_event_trace_ring_size_kb is the configured value: $ring_kb"); + +# Drive the writer past many ring wraps. +# Ring = 256 records. Each pg_sleep(0.0001) emits one wait event +# (the PgSleep latch wait at end). 1024 sleeps => at least 4x the +# ring size from PgSleep alone, so the min(seq) >= 256 checks below +# do not depend on incidental extra waits being recorded. +my $writer = $node->background_psql('postgres'); +$writer->query_safe("SET client_min_messages = warning;"); +$writer->query_safe("SET wait_event_capture = trace;"); +$writer->query_safe( + "DO \$\$ BEGIN FOR i IN 1..1024 LOOP PERFORM pg_sleep(0.0001); END LOOP; END \$\$;" +); + +my $writer_proc = $writer->query_safe( + "SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) " + . "WHERE pid = pg_backend_pid() LIMIT 1;"); +chomp $writer_proc; +like($writer_proc, qr/^\d+$/, 'writer reported its procnumber'); + +# Session-local read: at most ring-size records, all well-formed, +# seq values reflect the wrapped state (not 0..N-1). +my $local_count = $writer->query_safe( + "SELECT count(*) FROM pg_get_backend_wait_event_trace();"); +chomp $local_count; +cmp_ok($local_count, '<=', 256, + "session-local read returns at most ring_size records ($local_count <= 256)"); +cmp_ok($local_count, '>=', 1, + "session-local read returns at least one record"); + +my $local_min_seq = $writer->query_safe( + "SELECT min(seq) FROM pg_get_backend_wait_event_trace();"); +chomp $local_min_seq; +cmp_ok($local_min_seq, '>=', 256, + "session-local read sees post-wrap seq (min=$local_min_seq >= 256)"); + +# Cross-backend read of the OWNED slot via pg_get_wait_event_trace. +# This exercises the identity-seqlock check under the wrap regime. 
+my $reader = $node->background_psql('postgres'); + +my $cross_count = $reader->query_safe( + "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);"); +chomp $cross_count; +cmp_ok($cross_count, '<=', 256, + "cross-backend read returns at most ring_size records ($cross_count <= 256)"); +cmp_ok($cross_count, '>=', 1, + "cross-backend read sees the wrapped ring's records"); + +my $bad = $reader->query_safe( + "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc) " + . "WHERE wait_event_type IS NULL " + . " OR wait_event IS NULL " + . " OR wait_event_type = '' " + . " OR wait_event = '' " + . " OR duration_us < 0;"); +chomp $bad; +is($bad, '0', + 'cross-backend read after wrap returns only well-formed rows'); + +# The seq column on the cross-backend SRF reports the writer's +# ring index, which after many wraps should be far above 256. +my $cross_min_seq = $reader->query_safe( + "SELECT min(seq) FROM pg_get_wait_event_trace($writer_proc);"); +chomp $cross_min_seq; +cmp_ok($cross_min_seq, '>=', 256, + "cross-backend reader sees post-wrap seq (min=$cross_min_seq >= 256)"); + +$writer->quit; +$reader->quit; +$node->stop; + +done_testing; diff --git a/src/test/modules/test_wait_event_stress/test_wait_event_stress--1.0.sql b/src/test/modules/test_wait_event_stress/test_wait_event_stress--1.0.sql new file mode 100644 index 0000000000000..916fe9456a197 --- /dev/null +++ b/src/test/modules/test_wait_event_stress/test_wait_event_stress--1.0.sql @@ -0,0 +1,14 @@ +/* src/test/modules/test_wait_event_stress/test_wait_event_stress--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_wait_event_stress" to load this file. \quit + +CREATE FUNCTION stress_wait_events(integer) +RETURNS bigint +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; + +CREATE FUNCTION test_lwlock_hash_overflow(integer) +RETURNS integer +AS 'MODULE_PATHNAME' +LANGUAGE C STRICT; diff --git a/src/test/modules/test_wait_event_stress/test_wait_event_stress.c b/src/test/modules/test_wait_event_stress/test_wait_event_stress.c new file mode 100644 index 0000000000000..5ebb740dafb71 --- /dev/null +++ b/src/test/modules/test_wait_event_stress/test_wait_event_stress.c @@ -0,0 
+1,107 @@ +#include "postgres.h" +#include "fmgr.h" +#include "funcapi.h" +#include "pgstat.h" +#include "storage/lwlock.h" +#include "utils/wait_event.h" +#include "utils/wait_event_types.h" +#include "utils/timestamp.h" + +PG_MODULE_MAGIC; + +/* + * stress_wait_events(n int) -> bigint + * + * Calls pgstat_report_wait_start()/pgstat_report_wait_end() in a tight loop + * n times. Returns the elapsed time in microseconds. + * Raises an ERROR if n is negative. + * + * This measures the pure overhead of the wait event timing instrumentation: + * - 2x clock_gettime(CLOCK_MONOTONIC) via VDSO per iteration + * - 1x histogram bucket calculation (CLZ instruction) + * - 1x accumulator update (counter + total_ns) + * - optionally 1x trace ring buffer write + * + * Usage: + * SELECT stress_wait_events(1000000); -- 1M iterations + * -- returns elapsed microseconds + * -- overhead per iteration = result / 1000000 microseconds + */ +PG_FUNCTION_INFO_V1(stress_wait_events); + +Datum +stress_wait_events(PG_FUNCTION_ARGS) +{ + int32 iterations = PG_GETARG_INT32(0); + instr_time start, + end; + int i; + + if (iterations < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("iterations must be non-negative"))); + + INSTR_TIME_SET_CURRENT(start); + + for (i = 0; i < iterations; i++) + { + pgstat_report_wait_start(WAIT_EVENT_PG_SLEEP); + pgstat_report_wait_end(); + } + + INSTR_TIME_SET_CURRENT(end); + + /* Take the difference as instr_time, then convert to microseconds once. */ + INSTR_TIME_SUBTRACT(end, start); + + PG_RETURN_INT64(INSTR_TIME_GET_MICROSEC(end)); +} + +/* + * test_lwlock_hash_overflow(n_tranches int) -> int + * + * Registers n_tranches custom LWLock tranches and triggers a + * pgstat_report_wait_start()/pgstat_report_wait_end() cycle on each. + * Returns the number of tranches that were triggered. + * + * With n_tranches > LWLOCK_TIMING_MAX_ENTRIES (192), this exercises the + * hash overflow path; the caller checks that the one-time WARNING fires. 
+ * + * Usage: + * SET wait_event_capture = stats; + * SET client_min_messages = warning; + * SELECT test_lwlock_hash_overflow(200); + * -- expect WARNING about LWLock hash table full + */ +PG_FUNCTION_INFO_V1(test_lwlock_hash_overflow); + +Datum +test_lwlock_hash_overflow(PG_FUNCTION_ARGS) +{ + int32 n_tranches = PG_GETARG_INT32(0); + int i; + char name[64]; + + if (n_tranches < 0 || n_tranches > 1000) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("n_tranches must be between 0 and 1000"))); + + for (i = 0; i < n_tranches; i++) + { + int tranche_id; + uint32 event; + + snprintf(name, sizeof(name), "test_lwlock_overflow_%d", i); + tranche_id = LWLockNewTrancheId(name); + + /* Construct wait_event_info: PG_WAIT_LWLOCK | tranche_id */ + event = PG_WAIT_LWLOCK | (uint32) tranche_id; + + pgstat_report_wait_start(event); + pgstat_report_wait_end(); + } + + PG_RETURN_INT32(n_tranches); +} diff --git a/src/test/modules/test_wait_event_stress/test_wait_event_stress.control b/src/test/modules/test_wait_event_stress/test_wait_event_stress.control new file mode 100644 index 0000000000000..8c2b50e2af620 --- /dev/null +++ b/src/test/modules/test_wait_event_stress/test_wait_event_stress.control @@ -0,0 +1,4 @@ +comment = 'Stress test for wait event timing overhead' +default_version = '1.0' +module_pathname = '$libdir/test_wait_event_stress' +relocatable = true diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index a65a5bf0c4fbc..0a00c00cd5c91 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1332,6 +1332,13 @@ pg_backend_memory_contexts| SELECT name, free_chunks, used_bytes FROM pg_get_backend_memory_contexts() pg_get_backend_memory_contexts(name, ident, type, level, path, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes); +pg_backend_wait_event_trace| SELECT seq, + timestamp_ns, + wait_event_type, + wait_event, + duration_us, + query_id + FROM 
pg_get_backend_wait_event_trace() t(seq, timestamp_ns, wait_event_type, wait_event, duration_us, query_id); pg_config| SELECT name, setting FROM pg_config() pg_config(name, setting); @@ -2415,6 +2422,24 @@ pg_stat_user_tables| SELECT relid, stats_reset FROM pg_stat_all_tables WHERE ((schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (schemaname !~ '^pg_toast'::text)); +pg_stat_wait_event_timing| SELECT pid, + backend_type, + procnumber, + wait_event_type, + wait_event, + calls, + total_time_ms, + avg_time_us, + max_time_us, + histogram + FROM pg_stat_get_wait_event_timing(NULL::integer) t(pid, backend_type, procnumber, wait_event_type, wait_event, calls, total_time_ms, avg_time_us, max_time_us, histogram); +pg_stat_wait_event_timing_overflow| SELECT pid, + backend_type, + procnumber, + lwlock_overflow_count, + flat_overflow_count, + reset_count + FROM pg_stat_get_wait_event_timing_overflow(NULL::integer) t(pid, backend_type, procnumber, lwlock_overflow_count, flat_overflow_count, reset_count); pg_stat_wal| SELECT wal_records, wal_fpi, wal_bytes, @@ -2891,6 +2916,11 @@ pg_views| SELECT n.nspname AS schemaname, FROM (pg_class c LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = 'v'::"char"); +pg_wait_event_timing_histogram_buckets| SELECT bucket_idx, + lower_ns, + upper_ns, + label + FROM ( VALUES (0,(0)::bigint,(1024)::bigint,'<1us'::text), (1,(1024)::bigint,(2048)::bigint,'1-2us'::text), (2,(2048)::bigint,(4096)::bigint,'2-4us'::text), (3,(4096)::bigint,(8192)::bigint,'4-8us'::text), (4,(8192)::bigint,(16384)::bigint,'8-16us'::text), (5,(16384)::bigint,(32768)::bigint,'16-32us'::text), (6,(32768)::bigint,(65536)::bigint,'32-64us'::text), (7,(65536)::bigint,(131072)::bigint,'64-128us'::text), (8,(131072)::bigint,(262144)::bigint,'128-256us'::text), (9,(262144)::bigint,(524288)::bigint,'256-512us'::text), (10,(524288)::bigint,(1048576)::bigint,'512us-1ms'::text), 
(11,(1048576)::bigint,(2097152)::bigint,'1-2ms'::text), (12,(2097152)::bigint,(4194304)::bigint,'2-4ms'::text), (13,(4194304)::bigint,(8388608)::bigint,'4-8ms'::text), (14,(8388608)::bigint,(16777216)::bigint,'8-16ms'::text), (15,(16777216)::bigint,(33554432)::bigint,'16-32ms'::text), (16,(33554432)::bigint,(67108864)::bigint,'32-64ms'::text), (17,(67108864)::bigint,(134217728)::bigint,'64-128ms'::text), (18,(134217728)::bigint,(268435456)::bigint,'128-256ms'::text), (19,(268435456)::bigint,(536870912)::bigint,'256-512ms'::text), (20,(536870912)::bigint,(1073741824)::bigint,'512ms-1s'::text), (21,(1073741824)::bigint,'2147483648'::bigint,'1-2s'::text), (22,'2147483648'::bigint,'4294967296'::bigint,'2-4s'::text), (23,'4294967296'::bigint,'8589934592'::bigint,'4-8s'::text), (24,'8589934592'::bigint,'17179869184'::bigint,'8-16s'::text), (25,'17179869184'::bigint,'34359738368'::bigint,'16-32s'::text), (26,'34359738368'::bigint,'68719476736'::bigint,'32-64s'::text), (27,'68719476736'::bigint,'137438953472'::bigint,'64-128s'::text), (28,'137438953472'::bigint,'274877906944'::bigint,'128-256s'::text), (29,'274877906944'::bigint,'549755813888'::bigint,'256-512s'::text), (30,'549755813888'::bigint,'1099511627776'::bigint,'512s-1024s'::text), (31,'1099511627776'::bigint,NULL::bigint,'>=1024s'::text)) t(bucket_idx, lower_ns, upper_ns, label); pg_wait_events| SELECT type, name, description diff --git a/src/test/regress/expected/wait_event_timing.out b/src/test/regress/expected/wait_event_timing.out new file mode 100644 index 0000000000000..339e98507779e --- /dev/null +++ b/src/test/regress/expected/wait_event_timing.out @@ -0,0 +1,397 @@ +-- +-- Test wait event timing infrastructure +-- +-- These tests verify the wait event timing SQL interface. +-- They require --enable-wait-event-timing (or -Dwait_event_timing=true for +-- meson) at compile time. Without it, the alternate expected output +-- wait_event_timing_1.out is used. 
The default CI (Cirrus) runs without +-- timing enabled, so the non-timing path is tested automatically. +-- +-- Check GUC default +SHOW wait_event_capture; + wait_event_capture +-------------------- + off +(1 row) + +-- Enable stats-level capture for this test (PGC_SUSET, requires superuser) +SET wait_event_capture = stats; +-- Verify views exist (zero rows is fine, just checking structure) +SELECT * FROM pg_stat_wait_event_timing LIMIT 0; + pid | backend_type | procnumber | wait_event_type | wait_event | calls | total_time_ms | avg_time_us | max_time_us | histogram +-----+--------------+------------+-----------------+------------+-------+---------------+-------------+-------------+----------- +(0 rows) + +SELECT * FROM pg_backend_wait_event_trace LIMIT 0; + seq | timestamp_ns | wait_event_type | wait_event | duration_us | query_id +-----+--------------+-----------------+------------+-------------+---------- +(0 rows) + +-- The histogram-buckets taxonomy view is constant: 16 ordered rows, +-- ascending bin edges, last bucket open-ended. Available in both +-- timing and non-timing builds (defined in system_views.sql, not gated +-- on the compile flag). 
+SELECT count(*) = 32 AS thirty_two_rows, + min(bucket_idx) = 0 AS idx_starts_at_zero, + max(bucket_idx) = 31 AS idx_ends_at_thirty_one, + bool_and(lower_ns IS NOT NULL) AS all_lowers_present, + count(*) FILTER (WHERE upper_ns IS NULL) = 1 AS one_open_bucket +FROM pg_wait_event_timing_histogram_buckets; + thirty_two_rows | idx_starts_at_zero | idx_ends_at_thirty_one | all_lowers_present | one_open_bucket +-----------------+--------------------+------------------------+--------------------+----------------- + t | t | t | t | t +(1 row) + +-- Verify column types of timing view +SELECT + a.attname, + pg_catalog.format_type(a.atttypid, a.atttypmod) as type +FROM pg_catalog.pg_attribute a +JOIN pg_catalog.pg_class c ON a.attrelid = c.oid +JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid +WHERE n.nspname = 'pg_catalog' + AND c.relname = 'pg_stat_wait_event_timing' + AND a.attnum > 0 + AND NOT a.attisdropped +ORDER BY a.attnum; + attname | type +-----------------+------------------ + pid | integer + backend_type | text + procnumber | integer + wait_event_type | text + wait_event | text + calls | bigint + total_time_ms | double precision + avg_time_us | double precision + max_time_us | double precision + histogram | bigint[] +(10 rows) + +-- Generate a wait event +SELECT pg_sleep(0.1); + pg_sleep +---------- + +(1 row) + +-- Verify PgSleep event appears with correct structure +SELECT + pid = pg_backend_pid() AS pid_ok, + backend_type, + wait_event_type, + wait_event, + calls >= 1 AS has_calls, + total_time_ms > 0 AS has_time, + avg_time_us > 0 AS has_avg, + max_time_us > 0 AS has_max, + pg_typeof(histogram) AS hist_type, + array_length(histogram, 1) AS hist_len, + calls = (SELECT sum(x) FROM unnest(histogram) x) AS hist_invariant +FROM pg_stat_wait_event_timing +WHERE wait_event = 'PgSleep'; + pid_ok | backend_type | wait_event_type | wait_event | has_calls | has_time | has_avg | has_max | hist_type | hist_len | hist_invariant 
+--------+----------------+-----------------+------------+-----------+----------+---------+---------+-----------+----------+---------------- + t | client backend | Timeout | PgSleep | t | t | t | t | bigint[] | 32 | t +(1 row) + +-- Test reset function (own backend) +SELECT pg_stat_reset_wait_event_timing(NULL); + pg_stat_reset_wait_event_timing +--------------------------------- + +(1 row) + +SELECT count(*) AS after_reset +FROM pg_stat_wait_event_timing +WHERE wait_event = 'PgSleep'; + after_reset +------------- + 0 +(1 row) + +-- Test trace ring buffer (need compute_query_id for query markers) +SET compute_query_id = on; +SET wait_event_capture = trace; +SELECT pg_sleep(0.01); + pg_sleep +---------- + +(1 row) + +SELECT + wait_event_type, + wait_event, + duration_us >= 0 AS dur_ok, + seq >= 0 AS seq_ok +FROM pg_backend_wait_event_trace +WHERE wait_event = 'PgSleep'; + wait_event_type | wait_event | dur_ok | seq_ok +-----------------+------------+--------+-------- + Timeout | PgSleep | t | t +(1 row) + +-- Test query markers exist in trace +SELECT count(*) > 0 AS has_query_markers +FROM pg_backend_wait_event_trace +WHERE wait_event_type = 'Query'; + has_query_markers +------------------- + t +(1 row) + +-- Reset does not crash: NULL and own PID are equivalent +SELECT pg_stat_reset_wait_event_timing(NULL); + pg_stat_reset_wait_event_timing +--------------------------------- + +(1 row) + +SELECT pg_stat_reset_wait_event_timing(pg_backend_pid()); + pg_stat_reset_wait_event_timing +--------------------------------- + +(1 row) + +-- Unknown PID is a silent no-op (matches pg_stat_reset_backend_stats) +SELECT pg_stat_reset_wait_event_timing(2147483647); + pg_stat_reset_wait_event_timing +--------------------------------- + +(1 row) + +-- Cluster-wide reset (superuser-only) +SELECT pg_stat_reset_wait_event_timing_all(); + pg_stat_reset_wait_event_timing_all +------------------------------------- + +(1 row) + +-- Trace read (no arguments; always returns own session) 
+SELECT count(*) >= 0 AS trace_readable +FROM pg_get_backend_wait_event_trace(); + trace_readable +---------------- + t +(1 row) + +-- Test trace lifecycle: drop to stats, then back up to trace +SET compute_query_id = on; +SET wait_event_capture = stats; +SET wait_event_capture = trace; +SELECT 1 AS reattach_test; + reattach_test +--------------- + 1 +(1 row) + +SELECT count(*) >= 0 AS trace_reattach_ok +FROM pg_backend_wait_event_trace; + trace_reattach_ok +------------------- + t +(1 row) + +SET wait_event_capture = stats; +-- Pin issue #15 fix: TRACE -> OFF (or STATS) must release the DSA ring, +-- and a subsequent re-enable must allocate a fresh, empty ring. Old +-- trace records do NOT survive the disable, but aggregated stats in +-- pg_stat_wait_event_timing DO (they live in a separate DSA allocation). +-- +-- The assertions below are strict-equal on count-agnostic invariants. +-- We deliberately avoid "count(*) = N" style assertions here: pg_sleep() +-- loops around WaitLatch and can emit more than one PgSleep wait event +-- per call under CPU contention (spurious latch wakes), so a fixed count +-- would be flaky on busy CI runners. Instead: +-- +-- * ring_reallocated is decided by comparing phase 2's max(seq) against +-- phase 1's (seq is derived from write_pos, which resets to 0 on a +-- freshly allocated ring -- phase 2's records must have strictly +-- smaller seq than phase 1's last record iff the ring was freed). +-- +-- * stats_preserved_exactly checks that aggregated "calls" equals the +-- exact sum of events seen in the two phase rings. Whatever each +-- phase's ring count happens to be, the aggregated counter must land +-- on that sum; any drop, asymmetric duplication, or reset-on-toggle +-- bug breaks the equality. +-- +-- The symmetric-duplication case (both ring and aggregated doubled +-- identically) is covered separately in test_wait_event_stress using +-- deterministic exact-count input via stress_wait_events(). 
+SELECT pg_stat_reset_wait_event_timing(NULL); + pg_stat_reset_wait_event_timing +--------------------------------- + +(1 row) + +SET wait_event_capture = trace; +SELECT pg_sleep(0.001); + pg_sleep +---------- + +(1 row) + +SELECT pg_sleep(0.001); + pg_sleep +---------- + +(1 row) + +-- Stash phase 1's ring count + highest seq (all phase-1 records). +CREATE TEMP TABLE wet_phase1 AS +SELECT count(*) AS n, max(seq) AS max_seq +FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep'; +-- At least two PgSleep events captured (one per pg_sleep call, ignoring +-- spurious wakes). Catches drop bugs. +SELECT n >= 2 AS phase1_captured_both_sleeps +FROM wet_phase1; + phase1_captured_both_sleeps +----------------------------- + t +(1 row) + +SET wait_event_capture = off; +SET wait_event_capture = trace; +SELECT pg_sleep(0.001); + pg_sleep +---------- + +(1 row) + +-- Phase 2: stash fresh-ring count + max(seq). +CREATE TEMP TABLE wet_phase2 AS +SELECT count(*) AS n, max(seq) AS max_seq +FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep'; +-- The ring was freed iff phase 2's records all have seq strictly smaller +-- than phase 1's last seq (write_pos started over at 0). If the ring +-- had persisted, phase 2 would contain phase 1's records plus new ones, +-- so max(seq) would be >= phase1.max_seq. Strict-equal on semantic. +SELECT n >= 1 AND max_seq < (SELECT max_seq FROM wet_phase1) + AS ring_freed_and_reallocated +FROM wet_phase2; + ring_freed_and_reallocated +---------------------------- + t +(1 row) + +-- Aggregated stats must equal the exact sum of the two phase ring counts. +-- Catches drops (aggregated < sum), asymmetric duplication, and any +-- reset-on-toggle bug that would wipe aggregated counters. 
+SELECT calls = (SELECT n FROM wet_phase1) + (SELECT n FROM wet_phase2) + AS stats_preserved_exactly +FROM pg_stat_wait_event_timing +WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep'; + stats_preserved_exactly +------------------------- + t +(1 row) + +DROP TABLE wet_phase1, wet_phase2; +SET wait_event_capture = stats; +-- Overflow counters view: should be readable and overflow counts should +-- be zero for a freshly-reset session that hasn't exceeded limits. +-- reset_count must have incremented at least once (we called reset above). +SELECT + pid = pg_backend_pid() AS pid_ok, + lwlock_overflow_count >= 0 AS lw_nonneg, + flat_overflow_count >= 0 AS flat_nonneg, + reset_count >= 1 AS reset_count_bumped +FROM pg_stat_wait_event_timing_overflow +WHERE pid = pg_backend_pid(); + pid_ok | lw_nonneg | flat_nonneg | reset_count_bumped +--------+-----------+-------------+-------------------- + t | t | t | t +(1 row) + +-- Orphan-clear admin function: smoke-test that it returns a non-negative +-- count and is callable without error. Actual orphan-creation requires +-- a backend exit, which the regression harness can't easily orchestrate +-- in a portable way; we verify here only that the API works. Returns +-- bigint (count of rings freed); typically 0 in a fresh test run. +SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; + clear_orphans_ok +------------------ + t +(1 row) + +-- PID-filter fast path on the cluster-wide SRFs. Smoke-test that the +-- single-slot branch returns rows for the calling backend and zero rows +-- for a known-bad PID (matching pg_stat_reset_wait_event_timing +-- semantics). 
+SELECT + (SELECT count(*) FROM pg_stat_get_wait_event_timing(pg_backend_pid()) + WHERE pid = pg_backend_pid()) >= 0 AS own_pid_returns_rows, + (SELECT count(*) FROM pg_stat_get_wait_event_timing(2147483647)) = 0 + AS unknown_pid_empty, + (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(pg_backend_pid()) + WHERE pid = pg_backend_pid()) = 1 AS overflow_own_pid_one_row, + (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(2147483647)) = 0 + AS overflow_unknown_pid_empty; + own_pid_returns_rows | unknown_pid_empty | overflow_own_pid_one_row | overflow_unknown_pid_empty +----------------------+-------------------+--------------------------+---------------------------- + t | t | t | t +(1 row) + +-- Cross-backend trace SRF: smoke-test that pg_get_wait_event_trace +-- (procnumber-keyed) is callable and returns sensible results. +-- Full orphan-readability and the parallel-worker case are exercised +-- by the TAP test (which can orchestrate backend exits). +SET wait_event_capture = trace; +-- generate at least one wait event so the ring is allocated +SELECT pg_sleep(0.01); + pg_sleep +---------- + +(1 row) + +SELECT + -- Own session: pull our procnumber from the timing SRF, then read + -- our own trace ring through the cross-backend SRF. + (SELECT count(*) FROM pg_get_wait_event_trace( + (SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) + WHERE pid = pg_backend_pid() LIMIT 1))) >= 0 + AS by_procnumber_self_ok, + -- Out-of-range procnumber: empty result, no error. 
+ (SELECT count(*) FROM pg_get_wait_event_trace(-1)) = 0 + AS negative_procnumber_empty, + (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0 + AS huge_procnumber_empty; + by_procnumber_self_ok | negative_procnumber_empty | huge_procnumber_empty +-----------------------+---------------------------+----------------------- + t | t | t +(1 row) + +-- With capture disabled, a never-allocated slot still reads as empty +-- (the function short-circuits when the trace DSA was never created +-- or when the slot is FREE). +SET wait_event_capture = off; +SELECT (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0 + AS capture_off_empty; + capture_off_empty +------------------- + t +(1 row) + +-- Permission gating: a role without pg_read_all_stats cannot call the +-- function. Cover both the public role and a freshly-created one. +SET wait_event_capture = stats; +CREATE ROLE regress_wet_reader_nopriv NOLOGIN; +DO $$ +DECLARE + err text; +BEGIN + SET LOCAL ROLE regress_wet_reader_nopriv; + BEGIN + PERFORM count(*) FROM pg_get_wait_event_trace(0); + err := 'NO ERROR (unexpected: function should be denied)'; + EXCEPTION WHEN insufficient_privilege THEN + err := 'permission denied (expected)'; + END; + RAISE NOTICE 'permission gate: %', err; +END +$$; +NOTICE: permission gate: permission denied (expected) +DROP ROLE regress_wet_reader_nopriv; +-- Clean up +RESET wait_event_capture; +RESET compute_query_id; diff --git a/src/test/regress/expected/wait_event_timing_1.out b/src/test/regress/expected/wait_event_timing_1.out new file mode 100644 index 0000000000000..2df1b0c3e870a --- /dev/null +++ b/src/test/regress/expected/wait_event_timing_1.out @@ -0,0 +1,405 @@ +-- +-- Test wait event timing infrastructure +-- +-- These tests verify the wait event timing SQL interface. +-- They require --enable-wait-event-timing (or -Dwait_event_timing=true for +-- meson) at compile time. Without it, the alternate expected output +-- wait_event_timing_1.out is used. 
The default CI (Cirrus) runs without +-- timing enabled, so the non-timing path is tested automatically. +-- +-- Check GUC default +SHOW wait_event_capture; + wait_event_capture +-------------------- + off +(1 row) + +-- Enable stats-level capture for this test (PGC_SUSET, requires superuser) +SET wait_event_capture = stats; +ERROR: invalid value for parameter "wait_event_capture": "stats" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. +-- Verify views exist (zero rows is fine, just checking structure) +SELECT * FROM pg_stat_wait_event_timing LIMIT 0; + pid | backend_type | procnumber | wait_event_type | wait_event | calls | total_time_ms | avg_time_us | max_time_us | histogram +-----+--------------+------------+-----------------+------------+-------+---------------+-------------+-------------+----------- +(0 rows) + +SELECT * FROM pg_backend_wait_event_trace LIMIT 0; + seq | timestamp_ns | wait_event_type | wait_event | duration_us | query_id +-----+--------------+-----------------+------------+-------------+---------- +(0 rows) + +-- The histogram-buckets taxonomy view is constant: 16 ordered rows, +-- ascending bin edges, last bucket open-ended. Available in both +-- timing and non-timing builds (defined in system_views.sql, not gated +-- on the compile flag). 
+SELECT count(*) = 32 AS thirty_two_rows, + min(bucket_idx) = 0 AS idx_starts_at_zero, + max(bucket_idx) = 31 AS idx_ends_at_thirty_one, + bool_and(lower_ns IS NOT NULL) AS all_lowers_present, + count(*) FILTER (WHERE upper_ns IS NULL) = 1 AS one_open_bucket +FROM pg_wait_event_timing_histogram_buckets; + thirty_two_rows | idx_starts_at_zero | idx_ends_at_thirty_one | all_lowers_present | one_open_bucket +-----------------+--------------------+------------------------+--------------------+----------------- + t | t | t | t | t +(1 row) + +-- Verify column types of timing view +SELECT + a.attname, + pg_catalog.format_type(a.atttypid, a.atttypmod) as type +FROM pg_catalog.pg_attribute a +JOIN pg_catalog.pg_class c ON a.attrelid = c.oid +JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid +WHERE n.nspname = 'pg_catalog' + AND c.relname = 'pg_stat_wait_event_timing' + AND a.attnum > 0 + AND NOT a.attisdropped +ORDER BY a.attnum; + attname | type +-----------------+------------------ + pid | integer + backend_type | text + procnumber | integer + wait_event_type | text + wait_event | text + calls | bigint + total_time_ms | double precision + avg_time_us | double precision + max_time_us | double precision + histogram | bigint[] +(10 rows) + +-- Generate a wait event +SELECT pg_sleep(0.1); + pg_sleep +---------- + +(1 row) + +-- Verify PgSleep event appears with correct structure +SELECT + pid = pg_backend_pid() AS pid_ok, + backend_type, + wait_event_type, + wait_event, + calls >= 1 AS has_calls, + total_time_ms > 0 AS has_time, + avg_time_us > 0 AS has_avg, + max_time_us > 0 AS has_max, + pg_typeof(histogram) AS hist_type, + array_length(histogram, 1) AS hist_len, + calls = (SELECT sum(x) FROM unnest(histogram) x) AS hist_invariant +FROM pg_stat_wait_event_timing +WHERE wait_event = 'PgSleep'; + pid_ok | backend_type | wait_event_type | wait_event | has_calls | has_time | has_avg | has_max | hist_type | hist_len | hist_invariant 
+--------+--------------+-----------------+------------+-----------+----------+---------+---------+-----------+----------+---------------- +(0 rows) + +-- Test reset function (own backend) +SELECT pg_stat_reset_wait_event_timing(NULL); +ERROR: wait event capture is not supported by this build +HINT: Compile PostgreSQL with --enable-wait-event-timing. +SELECT count(*) AS after_reset +FROM pg_stat_wait_event_timing +WHERE wait_event = 'PgSleep'; + after_reset +------------- + 0 +(1 row) + +-- Test trace ring buffer (need compute_query_id for query markers) +SET compute_query_id = on; +SET wait_event_capture = trace; +ERROR: invalid value for parameter "wait_event_capture": "trace" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. +SELECT pg_sleep(0.01); + pg_sleep +---------- + +(1 row) + +SELECT + wait_event_type, + wait_event, + duration_us >= 0 AS dur_ok, + seq >= 0 AS seq_ok +FROM pg_backend_wait_event_trace +WHERE wait_event = 'PgSleep'; + wait_event_type | wait_event | dur_ok | seq_ok +-----------------+------------+--------+-------- +(0 rows) + +-- Test query markers exist in trace +SELECT count(*) > 0 AS has_query_markers +FROM pg_backend_wait_event_trace +WHERE wait_event_type = 'Query'; + has_query_markers +------------------- + f +(1 row) + +-- Reset does not crash: NULL and own PID are equivalent +SELECT pg_stat_reset_wait_event_timing(NULL); +ERROR: wait event capture is not supported by this build +HINT: Compile PostgreSQL with --enable-wait-event-timing. +SELECT pg_stat_reset_wait_event_timing(pg_backend_pid()); +ERROR: wait event capture is not supported by this build +HINT: Compile PostgreSQL with --enable-wait-event-timing. +-- Unknown PID is a silent no-op (matches pg_stat_reset_backend_stats) +SELECT pg_stat_reset_wait_event_timing(2147483647); +ERROR: wait event capture is not supported by this build +HINT: Compile PostgreSQL with --enable-wait-event-timing. 
+-- Cluster-wide reset (superuser-only) +SELECT pg_stat_reset_wait_event_timing_all(); +ERROR: wait event capture is not supported by this build +HINT: Compile PostgreSQL with --enable-wait-event-timing. +-- Trace read (no arguments; always returns own session) +SELECT count(*) >= 0 AS trace_readable +FROM pg_get_backend_wait_event_trace(); + trace_readable +---------------- + t +(1 row) + +-- Test trace lifecycle: drop to stats, then back up to trace +SET compute_query_id = on; +SET wait_event_capture = stats; +ERROR: invalid value for parameter "wait_event_capture": "stats" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. +SET wait_event_capture = trace; +ERROR: invalid value for parameter "wait_event_capture": "trace" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. +SELECT 1 AS reattach_test; + reattach_test +--------------- + 1 +(1 row) + +SELECT count(*) >= 0 AS trace_reattach_ok +FROM pg_backend_wait_event_trace; + trace_reattach_ok +------------------- + t +(1 row) + +SET wait_event_capture = stats; +ERROR: invalid value for parameter "wait_event_capture": "stats" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. +-- Pin issue #15 fix: TRACE -> OFF (or STATS) must release the DSA ring, +-- and a subsequent re-enable must allocate a fresh, empty ring. Old +-- trace records do NOT survive the disable, but aggregated stats in +-- pg_stat_wait_event_timing DO (they live in a separate DSA allocation). +-- +-- The assertions below are strict-equal on count-agnostic invariants. +-- We deliberately avoid "count(*) = N" style assertions here: pg_sleep() +-- loops around WaitLatch and can emit more than one PgSleep wait event +-- per call under CPU contention (spurious latch wakes), so a fixed count +-- would be flaky on busy CI runners. 
Instead: +-- +-- * ring_reallocated is decided by comparing phase 2's max(seq) against +-- phase 1's (seq is derived from write_pos, which resets to 0 on a +-- freshly allocated ring -- phase 2's records must have strictly +-- smaller seq than phase 1's last record iff the ring was freed). +-- +-- * stats_preserved_exactly checks that aggregated "calls" equals the +-- exact sum of events seen in the two phase rings. Whatever each +-- phase's ring count happens to be, the aggregated counter must land +-- on that sum; any drop, asymmetric duplication, or reset-on-toggle +-- bug breaks the equality. +-- +-- The symmetric-duplication case (both ring and aggregated doubled +-- identically) is covered separately in test_wait_event_stress using +-- deterministic exact-count input via stress_wait_events(). +SELECT pg_stat_reset_wait_event_timing(NULL); +ERROR: wait event capture is not supported by this build +HINT: Compile PostgreSQL with --enable-wait-event-timing. +SET wait_event_capture = trace; +ERROR: invalid value for parameter "wait_event_capture": "trace" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. +SELECT pg_sleep(0.001); + pg_sleep +---------- + +(1 row) + +SELECT pg_sleep(0.001); + pg_sleep +---------- + +(1 row) + +-- Stash phase 1's ring count + highest seq (all phase-1 records). +CREATE TEMP TABLE wet_phase1 AS +SELECT count(*) AS n, max(seq) AS max_seq +FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep'; +-- At least two PgSleep events captured (one per pg_sleep call, ignoring +-- spurious wakes). Catches drop bugs. +SELECT n >= 2 AS phase1_captured_both_sleeps +FROM wet_phase1; + phase1_captured_both_sleeps +----------------------------- + f +(1 row) + +SET wait_event_capture = off; +SET wait_event_capture = trace; +ERROR: invalid value for parameter "wait_event_capture": "trace" +DETAIL: This build does not support wait event capture. 
+HINT: Compile PostgreSQL with --enable-wait-event-timing. +SELECT pg_sleep(0.001); + pg_sleep +---------- + +(1 row) + +-- Phase 2: stash fresh-ring count + max(seq). +CREATE TEMP TABLE wet_phase2 AS +SELECT count(*) AS n, max(seq) AS max_seq +FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep'; +-- The ring was freed iff phase 2's records all have seq strictly smaller +-- than phase 1's last seq (write_pos started over at 0). If the ring +-- had persisted, phase 2 would contain phase 1's records plus new ones, +-- so max(seq) would be >= phase1.max_seq. Strict-equal on semantic. +SELECT n >= 1 AND max_seq < (SELECT max_seq FROM wet_phase1) + AS ring_freed_and_reallocated +FROM wet_phase2; + ring_freed_and_reallocated +---------------------------- + f +(1 row) + +-- Aggregated stats must equal the exact sum of the two phase ring counts. +-- Catches drops (aggregated < sum), asymmetric duplication, and any +-- reset-on-toggle bug that would wipe aggregated counters. +SELECT calls = (SELECT n FROM wet_phase1) + (SELECT n FROM wet_phase2) + AS stats_preserved_exactly +FROM pg_stat_wait_event_timing +WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep'; + stats_preserved_exactly +------------------------- +(0 rows) + +DROP TABLE wet_phase1, wet_phase2; +SET wait_event_capture = stats; +ERROR: invalid value for parameter "wait_event_capture": "stats" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. +-- Overflow counters view: should be readable and overflow counts should +-- be zero for a freshly-reset session that hasn't exceeded limits. +-- reset_count must have incremented at least once (we called reset above). 
+SELECT + pid = pg_backend_pid() AS pid_ok, + lwlock_overflow_count >= 0 AS lw_nonneg, + flat_overflow_count >= 0 AS flat_nonneg, + reset_count >= 1 AS reset_count_bumped +FROM pg_stat_wait_event_timing_overflow +WHERE pid = pg_backend_pid(); + pid_ok | lw_nonneg | flat_nonneg | reset_count_bumped +--------+-----------+-------------+-------------------- +(0 rows) + +-- Orphan-clear admin function: smoke-test that it returns a non-negative +-- count and is callable without error. Actual orphan-creation requires +-- a backend exit, which the regression harness can't easily orchestrate +-- in a portable way; we verify here only that the API works. Returns +-- bigint (count of rings freed); typically 0 in a fresh test run. +SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; + clear_orphans_ok +------------------ + t +(1 row) + +-- PID-filter fast path on the cluster-wide SRFs. Smoke-test that the +-- single-slot branch returns rows for the calling backend and zero rows +-- for a known-bad PID (matching pg_stat_reset_wait_event_timing +-- semantics). +SELECT + (SELECT count(*) FROM pg_stat_get_wait_event_timing(pg_backend_pid()) + WHERE pid = pg_backend_pid()) >= 0 AS own_pid_returns_rows, + (SELECT count(*) FROM pg_stat_get_wait_event_timing(2147483647)) = 0 + AS unknown_pid_empty, + (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(pg_backend_pid()) + WHERE pid = pg_backend_pid()) = 1 AS overflow_own_pid_one_row, + (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(2147483647)) = 0 + AS overflow_unknown_pid_empty; + own_pid_returns_rows | unknown_pid_empty | overflow_own_pid_one_row | overflow_unknown_pid_empty +----------------------+-------------------+--------------------------+---------------------------- + t | t | f | t +(1 row) + +-- Cross-backend trace SRF: smoke-test that pg_get_wait_event_trace +-- (procnumber-keyed) is callable and returns sensible results. 
+-- Full orphan-readability and the parallel-worker case are exercised +-- by the TAP test (which can orchestrate backend exits). +SET wait_event_capture = trace; +ERROR: invalid value for parameter "wait_event_capture": "trace" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. +-- generate at least one wait event so the ring is allocated +SELECT pg_sleep(0.01); + pg_sleep +---------- + +(1 row) + +SELECT + -- Own session: pull our procnumber from the timing SRF, then read + -- our own trace ring through the cross-backend SRF. + (SELECT count(*) FROM pg_get_wait_event_trace( + (SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) + WHERE pid = pg_backend_pid() LIMIT 1))) >= 0 + AS by_procnumber_self_ok, + -- Out-of-range procnumber: empty result, no error. + (SELECT count(*) FROM pg_get_wait_event_trace(-1)) = 0 + AS negative_procnumber_empty, + (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0 + AS huge_procnumber_empty; + by_procnumber_self_ok | negative_procnumber_empty | huge_procnumber_empty +-----------------------+---------------------------+----------------------- + t | t | t +(1 row) + +-- With capture disabled, a never-allocated slot still reads as empty +-- (the function short-circuits when the trace DSA was never created +-- or when the slot is FREE). +SET wait_event_capture = off; +SELECT (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0 + AS capture_off_empty; + capture_off_empty +------------------- + t +(1 row) + +-- Permission gating: a role without pg_read_all_stats cannot call the +-- function. Cover both the public role and a freshly-created one. +SET wait_event_capture = stats; +ERROR: invalid value for parameter "wait_event_capture": "stats" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. 
+CREATE ROLE regress_wet_reader_nopriv NOLOGIN; +DO $$ +DECLARE + err text; +BEGIN + SET LOCAL ROLE regress_wet_reader_nopriv; + BEGIN + PERFORM count(*) FROM pg_get_wait_event_trace(0); + err := 'NO ERROR (unexpected: function should be denied)'; + EXCEPTION WHEN insufficient_privilege THEN + err := 'permission denied (expected)'; + END; + RAISE NOTICE 'permission gate: %', err; +END +$$; +NOTICE: permission gate: permission denied (expected) +DROP ROLE regress_wet_reader_nopriv; +-- Clean up +RESET wait_event_capture; +RESET compute_query_id; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 8fa0a6c47fb30..b47560f85fd04 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -135,6 +135,12 @@ test: compression compression_lz4 compression_pglz cluster # oidjoins is read-only, though, and should run late for best coverage test: oidjoins event_trigger +# wait_event_timing creates and drops temp tables to capture trace-ring +# state across phases (see wet_phase1/wet_phase2 in the .sql file). Its +# DDL would be polluted by event_trigger's ddl_command_end trigger if +# they ran concurrently, so it gets its own scheduling slot rather than +# sharing event_trigger's parallel group. +test: wait_event_timing # event_trigger_login cannot run concurrently with any other tests because # on-login event handling could catch connection of a concurrent test. diff --git a/src/test/regress/sql/wait_event_timing.sql b/src/test/regress/sql/wait_event_timing.sql new file mode 100644 index 0000000000000..6531fd7490ca0 --- /dev/null +++ b/src/test/regress/sql/wait_event_timing.sql @@ -0,0 +1,260 @@ +-- +-- Test wait event timing infrastructure +-- +-- These tests verify the wait event timing SQL interface. +-- They require --enable-wait-event-timing (or -Dwait_event_timing=true for +-- meson) at compile time. Without it, the alternate expected output +-- wait_event_timing_1.out is used. 
The default CI (Cirrus) runs without +-- timing enabled, so the non-timing path is tested automatically. +-- + +-- Check GUC default +SHOW wait_event_capture; + +-- Enable stats-level capture for this test (PGC_SUSET, requires superuser) +SET wait_event_capture = stats; + +-- Verify views exist (zero rows is fine, just checking structure) +SELECT * FROM pg_stat_wait_event_timing LIMIT 0; +SELECT * FROM pg_backend_wait_event_trace LIMIT 0; + +-- The histogram-buckets taxonomy view is constant: 16 ordered rows, +-- ascending bin edges, last bucket open-ended. Available in both +-- timing and non-timing builds (defined in system_views.sql, not gated +-- on the compile flag). +SELECT count(*) = 32 AS thirty_two_rows, + min(bucket_idx) = 0 AS idx_starts_at_zero, + max(bucket_idx) = 31 AS idx_ends_at_thirty_one, + bool_and(lower_ns IS NOT NULL) AS all_lowers_present, + count(*) FILTER (WHERE upper_ns IS NULL) = 1 AS one_open_bucket +FROM pg_wait_event_timing_histogram_buckets; + +-- Verify column types of timing view +SELECT + a.attname, + pg_catalog.format_type(a.atttypid, a.atttypmod) as type +FROM pg_catalog.pg_attribute a +JOIN pg_catalog.pg_class c ON a.attrelid = c.oid +JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid +WHERE n.nspname = 'pg_catalog' + AND c.relname = 'pg_stat_wait_event_timing' + AND a.attnum > 0 + AND NOT a.attisdropped +ORDER BY a.attnum; + +-- Generate a wait event +SELECT pg_sleep(0.1); + +-- Verify PgSleep event appears with correct structure +SELECT + pid = pg_backend_pid() AS pid_ok, + backend_type, + wait_event_type, + wait_event, + calls >= 1 AS has_calls, + total_time_ms > 0 AS has_time, + avg_time_us > 0 AS has_avg, + max_time_us > 0 AS has_max, + pg_typeof(histogram) AS hist_type, + array_length(histogram, 1) AS hist_len, + calls = (SELECT sum(x) FROM unnest(histogram) x) AS hist_invariant +FROM pg_stat_wait_event_timing +WHERE wait_event = 'PgSleep'; + +-- Test reset function (own backend) +SELECT 
pg_stat_reset_wait_event_timing(NULL); +SELECT count(*) AS after_reset +FROM pg_stat_wait_event_timing +WHERE wait_event = 'PgSleep'; + +-- Test trace ring buffer (need compute_query_id for query markers) +SET compute_query_id = on; +SET wait_event_capture = trace; +SELECT pg_sleep(0.01); + +SELECT + wait_event_type, + wait_event, + duration_us >= 0 AS dur_ok, + seq >= 0 AS seq_ok +FROM pg_backend_wait_event_trace +WHERE wait_event = 'PgSleep'; + +-- Test query markers exist in trace +SELECT count(*) > 0 AS has_query_markers +FROM pg_backend_wait_event_trace +WHERE wait_event_type = 'Query'; + +-- Reset does not crash: NULL and own PID are equivalent +SELECT pg_stat_reset_wait_event_timing(NULL); +SELECT pg_stat_reset_wait_event_timing(pg_backend_pid()); + +-- Unknown PID is a silent no-op (matches pg_stat_reset_backend_stats) +SELECT pg_stat_reset_wait_event_timing(2147483647); + +-- Cluster-wide reset (superuser-only) +SELECT pg_stat_reset_wait_event_timing_all(); + +-- Trace read (no arguments; always returns own session) +SELECT count(*) >= 0 AS trace_readable +FROM pg_get_backend_wait_event_trace(); + +-- Test trace lifecycle: drop to stats, then back up to trace +SET compute_query_id = on; +SET wait_event_capture = stats; +SET wait_event_capture = trace; +SELECT 1 AS reattach_test; +SELECT count(*) >= 0 AS trace_reattach_ok +FROM pg_backend_wait_event_trace; +SET wait_event_capture = stats; + +-- Pin issue #15 fix: TRACE -> OFF (or STATS) must release the DSA ring, +-- and a subsequent re-enable must allocate a fresh, empty ring. Old +-- trace records do NOT survive the disable, but aggregated stats in +-- pg_stat_wait_event_timing DO (they live in a separate DSA allocation). +-- +-- The assertions below are strict-equal on count-agnostic invariants. 
+-- We deliberately avoid "count(*) = N" style assertions here: pg_sleep() +-- loops around WaitLatch and can emit more than one PgSleep wait event +-- per call under CPU contention (spurious latch wakes), so a fixed count +-- would be flaky on busy CI runners. Instead: +-- +-- * ring_reallocated is decided by comparing phase 2's max(seq) against +-- phase 1's (seq is derived from write_pos, which resets to 0 on a +-- freshly allocated ring -- phase 2's records must have strictly +-- smaller seq than phase 1's last record iff the ring was freed). +-- +-- * stats_preserved_exactly checks that aggregated "calls" equals the +-- exact sum of events seen in the two phase rings. Whatever each +-- phase's ring count happens to be, the aggregated counter must land +-- on that sum; any drop, asymmetric duplication, or reset-on-toggle +-- bug breaks the equality. +-- +-- The symmetric-duplication case (both ring and aggregated doubled +-- identically) is covered separately in test_wait_event_stress using +-- deterministic exact-count input via stress_wait_events(). +SELECT pg_stat_reset_wait_event_timing(NULL); +SET wait_event_capture = trace; +SELECT pg_sleep(0.001); +SELECT pg_sleep(0.001); + +-- Stash phase 1's ring count + highest seq (all phase-1 records). +CREATE TEMP TABLE wet_phase1 AS +SELECT count(*) AS n, max(seq) AS max_seq +FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep'; + +-- At least two PgSleep events captured (one per pg_sleep call, ignoring +-- spurious wakes). Catches drop bugs. +SELECT n >= 2 AS phase1_captured_both_sleeps +FROM wet_phase1; + +SET wait_event_capture = off; +SET wait_event_capture = trace; +SELECT pg_sleep(0.001); + +-- Phase 2: stash fresh-ring count + max(seq). 
+CREATE TEMP TABLE wet_phase2 AS +SELECT count(*) AS n, max(seq) AS max_seq +FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep'; + +-- The ring was freed iff phase 2's records all have seq strictly smaller +-- than phase 1's last seq (write_pos started over at 0). If the ring +-- had persisted, phase 2 would contain phase 1's records plus new ones, +-- so max(seq) would be >= phase1.max_seq. Strict-equal on semantic. +SELECT n >= 1 AND max_seq < (SELECT max_seq FROM wet_phase1) + AS ring_freed_and_reallocated +FROM wet_phase2; + +-- Aggregated stats must equal the exact sum of the two phase ring counts. +-- Catches drops (aggregated < sum), asymmetric duplication, and any +-- reset-on-toggle bug that would wipe aggregated counters. +SELECT calls = (SELECT n FROM wet_phase1) + (SELECT n FROM wet_phase2) + AS stats_preserved_exactly +FROM pg_stat_wait_event_timing +WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep'; + +DROP TABLE wet_phase1, wet_phase2; +SET wait_event_capture = stats; + +-- Overflow counters view: should be readable and overflow counts should +-- be zero for a freshly-reset session that hasn't exceeded limits. +-- reset_count must have incremented at least once (we called reset above). +SELECT + pid = pg_backend_pid() AS pid_ok, + lwlock_overflow_count >= 0 AS lw_nonneg, + flat_overflow_count >= 0 AS flat_nonneg, + reset_count >= 1 AS reset_count_bumped +FROM pg_stat_wait_event_timing_overflow +WHERE pid = pg_backend_pid(); + +-- Orphan-clear admin function: smoke-test that it returns a non-negative +-- count and is callable without error. Actual orphan-creation requires +-- a backend exit, which the regression harness can't easily orchestrate +-- in a portable way; we verify here only that the API works. Returns +-- bigint (count of rings freed); typically 0 in a fresh test run. +SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; + +-- PID-filter fast path on the cluster-wide SRFs. 
Smoke-test that the +-- single-slot branch returns rows for the calling backend and zero rows +-- for a known-bad PID (matching pg_stat_reset_wait_event_timing +-- semantics). +SELECT + (SELECT count(*) FROM pg_stat_get_wait_event_timing(pg_backend_pid()) + WHERE pid = pg_backend_pid()) >= 0 AS own_pid_returns_rows, + (SELECT count(*) FROM pg_stat_get_wait_event_timing(2147483647)) = 0 + AS unknown_pid_empty, + (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(pg_backend_pid()) + WHERE pid = pg_backend_pid()) = 1 AS overflow_own_pid_one_row, + (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(2147483647)) = 0 + AS overflow_unknown_pid_empty; + +-- Cross-backend trace SRF: smoke-test that pg_get_wait_event_trace +-- (procnumber-keyed) is callable and returns sensible results. +-- Full orphan-readability and the parallel-worker case are exercised +-- by the TAP test (which can orchestrate backend exits). +SET wait_event_capture = trace; +-- generate at least one wait event so the ring is allocated +SELECT pg_sleep(0.01); +SELECT + -- Own session: pull our procnumber from the timing SRF, then read + -- our own trace ring through the cross-backend SRF. + (SELECT count(*) FROM pg_get_wait_event_trace( + (SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) + WHERE pid = pg_backend_pid() LIMIT 1))) >= 0 + AS by_procnumber_self_ok, + -- Out-of-range procnumber: empty result, no error. + (SELECT count(*) FROM pg_get_wait_event_trace(-1)) = 0 + AS negative_procnumber_empty, + (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0 + AS huge_procnumber_empty; + +-- With capture disabled, a never-allocated slot still reads as empty +-- (the function short-circuits when the trace DSA was never created +-- or when the slot is FREE). 
+SET wait_event_capture = off; +SELECT (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0 + AS capture_off_empty; + +-- Permission gating: a role without pg_read_all_stats cannot call the +-- function. Cover both the public role and a freshly-created one. +SET wait_event_capture = stats; +CREATE ROLE regress_wet_reader_nopriv NOLOGIN; +DO $$ +DECLARE + err text; +BEGIN + SET LOCAL ROLE regress_wet_reader_nopriv; + BEGIN + PERFORM count(*) FROM pg_get_wait_event_trace(0); + err := 'NO ERROR (unexpected: function should be denied)'; + EXCEPTION WHEN insufficient_privilege THEN + err := 'permission denied (expected)'; + END; + RAISE NOTICE 'permission gate: %', err; +END +$$; +DROP ROLE regress_wet_reader_nopriv; + +-- Clean up +RESET wait_event_capture; +RESET compute_query_id; diff --git a/src/tools/pgindent/exclude_file_patterns b/src/tools/pgindent/exclude_file_patterns index 4976a373f9e53..68269fe0c6175 100644 --- a/src/tools/pgindent/exclude_file_patterns +++ b/src/tools/pgindent/exclude_file_patterns @@ -17,6 +17,7 @@ src/backend/nodes/\w+\.switch\.c$ # looks worse with pgindent. src/backend/utils/activity/pgstat_wait_event\.c$ src/backend/utils/activity/wait_event_funcs_data\.c$ +src/backend/utils/activity/wait_event_timing_data\.h$ src/backend/utils/activity/wait_event_types\.h$ # # This confuses pgindent, and it's a derived file anyway. 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index cbd9e10fc1d47..631f5ede8312c 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1609,6 +1609,8 @@ LWLock LWLockHandle LWLockMode LWLockPadded +LWLockTimingHash +LWLockTimingHashEntry LWLockTrancheShmemData LZ4F_compressionContext_t LZ4F_decompressOptions_t @@ -3414,6 +3416,7 @@ WSAPROTOCOL_INFO WaitEvent WaitEventActivity WaitEventBuffer +WaitEventCaptureLevel WaitEventClient WaitEventCustomCounterData WaitEventCustomEntryByInfo @@ -3422,6 +3425,14 @@ WaitEventIO WaitEventIPC WaitEventSet WaitEventTimeout +WaitEventTimingControl +WaitEventTimingEntry +WaitEventTimingState +WaitEventTraceControl +WaitEventTraceRecord +WaitEventTraceSlot +WaitEventTraceSlotState +WaitEventTraceState WaitLSNProcInfo WaitLSNResult WaitLSNState @@ -3452,6 +3463,7 @@ WalUsage WalWriteMethod WalWriteMethodOps Walfile +WetValidRecord WindowAgg WindowAggPath WindowAggState