diff --git a/.cirrus.tasks.yml b/.cirrus.tasks.yml
index a22cef063f337..5a747254c93ae 100644
--- a/.cirrus.tasks.yml
+++ b/.cirrus.tasks.yml
@@ -514,12 +514,19 @@ task:
       # code not being exercised much. Thus specify a very small segment size
       # here. Use a non-power-of-two segment size, given we currently allow
       # that.
+      # --enable-wait-event-timing is tacked on to this entry so the timing
+      # build path (including the expected output at
+      # src/test/regress/expected/wait_event_timing.out) actually gets
+      # exercised by CI; without it, only the stub alt output
+      # wait_event_timing_1.out is consumed and any regression in the
+      # timing-enabled code is invisible to upstream.
       configure_script: |
         su postgres <<-EOF
           set -e
           ./configure \
             --enable-cassert --enable-injection-points --enable-debug \
             --enable-tap-tests --enable-nls \
+            --enable-wait-event-timing \
             --with-segsize-blocks=6 \
             --with-libnuma \
             --with-liburing \
diff --git a/configure b/configure
index f66c1054a7a1e..a535703d3a5ce 100755
--- a/configure
+++ b/configure
@@ -774,6 +774,7 @@ CC
 enable_injection_points
 PG_TEST_EXTRA
 enable_tap_tests
+enable_wait_event_timing
 enable_dtrace
 DTRACEFLAGS
 DTRACE
@@ -850,6 +851,7 @@ enable_debug
 enable_profiling
 enable_coverage
 enable_dtrace
+enable_wait_event_timing
 enable_tap_tests
 enable_injection_points
 with_blocksize
@@ -1551,6 +1553,8 @@ Optional Features:
   --enable-profiling      build with profiling enabled
   --enable-coverage       build with coverage testing instrumentation
   --enable-dtrace         build with DTrace support
+  --enable-wait-event-timing
+                          build with wait event timing instrumentation
   --enable-tap-tests      enable TAP tests (requires Perl and IPC::Run)
   --enable-injection-points
                           enable injection points (for testing)
@@ -3632,6 +3636,34 @@ fi
 
 
 
+#
+# --enable-wait-event-timing adds wait event timing instrumentation
+#
+
+
+# Check whether --enable-wait-event-timing was given.
+if test "${enable_wait_event_timing+set}" = set; then :
+  enableval=$enable_wait_event_timing;
+  case $enableval in
+    yes)
+
+$as_echo "#define USE_WAIT_EVENT_TIMING 1" >>confdefs.h
+
+      ;;
+    no)
+      :
+      ;;
+    *)
+      as_fn_error $? "no argument expected for --enable-wait-event-timing option" "$LINENO" 5
+      ;;
+  esac
+
+else
+  enable_wait_event_timing=no
+
+fi
+
+
 
 #
 # TAP tests
diff --git a/configure.ac b/configure.ac
index 8d176bd3468e9..0d37b77ac53b6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -225,6 +225,14 @@ fi
 AC_SUBST(DTRACEFLAGS)])
 AC_SUBST(enable_dtrace)
 
+#
+# --enable-wait-event-timing adds wait event timing instrumentation
+#
+PGAC_ARG_BOOL(enable, wait-event-timing, no,
+              [build with wait event timing instrumentation],
+              [AC_DEFINE([USE_WAIT_EVENT_TIMING], 1,
+                         [Define to 1 to build with wait event timing. (--enable-wait-event-timing)])])
+
 #
 # TAP tests
 #
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 73cc04123303d..d059dc095a2a0 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9110,6 +9110,209 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-wait-event-capture" xreflabel="wait_event_capture">
+      <term><varname>wait_event_capture</varname> (<type>enum</type>)
+      <indexterm>
+       <primary><varname>wait_event_capture</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Controls collection of wait event instrumentation data.  Requires
+        the server to be compiled with
+        <option>--enable-wait-event-timing</option>.  Possible values are
+        <literal>off</literal>, <literal>stats</literal>, and
+        <literal>trace</literal>; each level is a strict superset of the
+        previous one.
+       </para>
+       <para>
+        At <literal>stats</literal>, the server records per-backend wait
+        event statistics (counts, total and average durations, log2
+        histograms) visible in the
+        <link linkend="monitoring-pg-stat-wait-event-timing-view">
+        <structname>pg_stat_wait_event_timing</structname></link> view.
+        Two <function>clock_gettime()</function> calls are added around
+        every wait event transition, costing approximately
+        40&ndash;100&nbsp;ns each on modern hardware.
+       </para>
+       <para>
+        At <literal>trace</literal>, the server additionally records every
+        individual wait event into a per-session ring buffer (~4&nbsp;MB of
+        DSA per backend, allocated lazily on first enable), exposed via the
+        <link linkend="monitoring-pg-backend-wait-event-trace-view">
+        <structname>pg_backend_wait_event_trace</structname></link> view.
+        Each record carries either a wait event or a query-attribution
+        marker; consumers reconstruct which query owns which wait by
+        interleaving the two streams.
+       </para>
+       <para>
+        Two marker families are emitted into the ring:
+        <itemizedlist>
+         <listitem>
+          <para>
+           <literal>ExecStart</literal>/<literal>ExecEnd</literal> markers
+           bracket every executor invocation
+           (<function>ExecutorStart</function>/<function>ExecutorEnd</function>).
+           They are the primary attribution signal: every executable
+           statement, including those run inside parallel workers and
+           pipelined extended-protocol messages, is bracketed.  Emission
+           requires <xref linkend="guc-compute-query-id"/> to produce a
+           non-zero <structfield>query_id</structfield>; otherwise the
+           markers are silently skipped.  They are <emphasis>not</emphasis>
+           gated on <varname>track_activities</varname>.
+          </para>
+         </listitem>
+         <listitem>
+          <para>
+           <literal>QueryStart</literal>/<literal>QueryEnd</literal> markers
+           fire at top-level query identifier transitions and at the
+           transition to idle, providing inter-statement boundaries that
+           the executor markers cannot (e.g. the
+           <literal>ClientRead</literal> wait between statements).  They
+           require both <xref linkend="guc-track-activities"/> and
+           <xref linkend="guc-compute-query-id"/> to be enabled.
+          </para>
+         </listitem>
+        </itemizedlist>
+        A <literal>WARNING</literal> is logged at the time
+        <varname>wait_event_capture</varname> is set to <literal>trace</literal>
+        if either prerequisite is missing.
+       </para>
+       <para>
+        The default is <literal>off</literal>.  Only superusers and users
+        with the appropriate <literal>SET</literal> privilege can change
+        this setting.
+       </para>
+       <para>
+        The setting is gated to superuser by default because
+        <literal>trace</literal> mode allocates approximately 4&nbsp;MB
+        of dynamic shared memory per backend that enables it; an
+        unprivileged role enabling trace on every connection in a
+        large pool could consume substantial cluster-wide memory.
+        Read access to the resulting statistics is controlled
+        separately by membership in the
+        <link linkend="predefined-roles"><literal>pg_read_all_stats</literal></link>
+        role (which the <literal>pg_monitor</literal> role inherits),
+        so a monitoring operator can typically read
+        <structname>pg_stat_wait_event_timing</structname> but cannot
+        toggle <varname>wait_event_capture</varname> itself.
+       </para>
+       <para>
+        To delegate the ability to change this setting to a
+        non-superuser role &mdash; for example, the
+        <literal>pg_monitor</literal> role in environments where the
+        cluster owner is not the operator on call &mdash; use the
+        standard PostgreSQL <command>GRANT SET ON PARAMETER</command>
+        mechanism:
+<programlisting>
+GRANT SET ON PARAMETER wait_event_capture TO pg_monitor;
+</programlisting>
+        After this, any role that has the <literal>pg_monitor</literal>
+        role membership can run
+        <command>SET wait_event_capture = stats</command> (or
+        <literal>= trace</literal>) for its own session.  The grant is
+        per-installation policy rather than baked into the GUC, so
+        managed-PostgreSQL environments and self-hosted clusters can
+        choose independently whether monitoring roles should be able to
+        flip this on.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry id="guc-wait-event-timing-max-tranches" xreflabel="wait_event_timing_max_tranches">
+      <term><varname>wait_event_timing_max_tranches</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>wait_event_timing_max_tranches</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Sets the maximum number of distinct LWLock tranches whose timing
+        is recorded individually per backend.  PostgreSQL maintains a
+        per-backend hash table that maps each tranche the backend
+        encounters to its histogram bucket; once the table fills, further
+        tranches encountered by that backend are counted against
+        <structfield>lwlock_overflow_count</structfield> in
+        <link linkend="monitoring-pg-stat-wait-event-timing-overflow-view">
+        <structname>pg_stat_wait_event_timing_overflow</structname></link>
+        and not individually timed.  Sized at server start; this
+        parameter has no effect on builds compiled without
+        <option>--enable-wait-event-timing</option>.  The default is
+        <literal>192</literal>; raise it if your installation loads many
+        extensions that register their own LWLock tranches and you
+        observe non-zero
+        <structfield>lwlock_overflow_count</structfield>.
+       </para>
+       <para>
+        The shared-memory cost is per-backend and proportional to this
+        setting.  Each entry is approximately 152&nbsp;bytes (an
+        LWLock-timing histogram), and the slot table that resolves
+        tranche IDs adds another 4&nbsp;bytes per slot, with the slot
+        count rounded up to the next power of two of twice this value.
+        At default 192 entries (512 slots) the per-backend overhead is
+        roughly 31&nbsp;KB; at 512 entries (1024 slots) roughly
+        80&nbsp;KB.  The total cluster-wide cost is paid only when the
+        first backend in the cluster sets
+        <xref linkend="guc-wait-event-capture"/> to a non-<literal>off</literal>
+        value, and remains allocated for the postmaster's lifetime
+        regardless of subsequent GUC changes.  Builds compiled without
+        <option>--enable-wait-event-timing</option> pay zero memory for
+        this setting.
+       </para>
+       <para>
+        This parameter can only be set at server start.  Because it
+        is fixed for the lifetime of the server, it cannot be changed
+        with <command>SET</command>, regardless of privileges.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry id="guc-wait-event-trace-ring-size-kb" xreflabel="wait_event_trace_ring_size_kb">
+      <term><varname>wait_event_trace_ring_size_kb</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>wait_event_trace_ring_size_kb</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Per-backend size, in kilobytes, of the wait-event-trace ring
+        buffer allocated when a session sets
+        <xref linkend="guc-wait-event-capture"/> to
+        <literal>trace</literal>.  Must be a power of two.  Sized at
+        server start (<literal>PGC_POSTMASTER</literal>); all rings in
+        a given postmaster run have the same size.  This parameter has
+        no effect on builds compiled without
+        <option>--enable-wait-event-timing</option>.
+       </para>
+       <para>
+        Each record is 32 bytes, so the record count is the kilobyte
+        value times 32.  The default of <literal>4096</literal> KB
+        (= 131072 records, ~4&nbsp;MB) gives roughly 0.5&ndash;1
+        second of retention at peak wait-event rates of 200K/s.
+        Larger values give longer retention before the FIFO wrap
+        overwrites the oldest records; smaller values reduce
+        per-backend memory at high <varname>max_connections</varname>.
+        Allowed range is <literal>8</literal>&ndash;<literal>32768</literal>
+        KB (256 records to ~1 million records per ring).
+       </para>
+       <para>
+        Worst-case total memory is approximately
+        <varname>max_connections</varname> *
+        <varname>wait_event_trace_ring_size_kb</varname>, allocated
+        lazily from a cluster-wide DSA only as backends enable
+        <varname>wait_event_capture</varname> = <literal>trace</literal>.
+        Memory is reclaimed when backends exit and their slots are
+        recycled, or explicitly via
+        <function>pg_stat_clear_orphaned_wait_event_rings</function>.
+       </para>
+       <para>
+        This parameter can only be set at server start.  Because it
+        is fixed for the lifetime of the server, it cannot be changed
+        with <command>SET</command>, regardless of privileges.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-track-functions" xreflabel="track_functions">
       <term><varname>track_functions</varname> (<type>enum</type>)
       <indexterm>
diff --git a/doc/src/sgml/installation.sgml b/doc/src/sgml/installation.sgml
index b345a1056740a..9ddf46328e2c7 100644
--- a/doc/src/sgml/installation.sgml
+++ b/doc/src/sgml/installation.sgml
@@ -1594,6 +1594,33 @@ build-postgresql:
        </listitem>
       </varlistentry>
 
+      <varlistentry id="configure-option-enable-wait-event-timing">
+       <term><option>--enable-wait-event-timing</option></term>
+       <listitem>
+        <para>
+         Compiles in per-backend wait event timing instrumentation.
+         When enabled, every call to
+         <function>pgstat_report_wait_start()</function>/<function>pgstat_report_wait_end()</function>
+         records the wait duration and accumulates per-event statistics
+         (count, total time, histogram) in shared memory.
+         The overhead is two <function>clock_gettime(CLOCK_MONOTONIC)</function>
+         calls per wait event transition (~40&ndash;100&nbsp;ns via VDSO).
+         When not compiled in, the <varname>wait_event_capture</varname>
+         GUC still exists but only accepts <literal>off</literal>, and the
+         SQL functions return empty result sets.
+         The compile flag allocates approximately 120&nbsp;KB of shared
+         memory per backend slot for timing statistics (regardless of GUC
+         setting).  At <varname>max_connections</varname>&nbsp;=&nbsp;200
+         this is roughly 26&nbsp;MB; at 1000 it is roughly 120&nbsp;MB.
+         Trace ring buffers are allocated lazily via DSA only when
+         <varname>wait_event_capture</varname> is set to
+         <literal>trace</literal> (~4&nbsp;MB per traced backend).
+         See <xref linkend="guc-wait-event-capture"/> for the runtime
+         control.
+        </para>
+       </listitem>
+      </varlistentry>
+
       <varlistentry id="configure-option-enable-tap-tests">
        <term><option>--enable-tap-tests</option></term>
        <listitem>
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 08d5b8245529f..5f12b700700b1 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -551,6 +551,24 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
       </entry>
      </row>
 
+     <row>
+      <entry><structname>pg_stat_wait_event_timing</structname><indexterm><primary>pg_stat_wait_event_timing</primary></indexterm></entry>
+      <entry>One row per backend per wait event, showing accumulated timing
+       statistics.  See
+       <link linkend="monitoring-pg-stat-wait-event-timing-view">
+       <structname>pg_stat_wait_event_timing</structname></link> for details.
+      </entry>
+     </row>
+
+     <row>
+      <entry><structname>pg_backend_wait_event_trace</structname><indexterm><primary>pg_backend_wait_event_trace</primary></indexterm></entry>
+      <entry>Individual wait event records from the current backend's trace
+       ring buffer.  See
+       <link linkend="monitoring-pg-backend-wait-event-trace-view">
+       <structname>pg_backend_wait_event_trace</structname></link> for details.
+      </entry>
+     </row>
+
      <!-- all "stat" for schema objects, by "importance" -->
 
      <row>
@@ -3699,6 +3717,603 @@ description | Waiting for a newly initialized WAL file to reach durable storage
 
 </sect2>
 
+ <sect2 id="monitoring-pg-stat-wait-event-timing-view">
+  <title><structname>pg_stat_wait_event_timing</structname></title>
+
+  <indexterm>
+   <primary>pg_stat_wait_event_timing</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_stat_wait_event_timing</structname> view contains one
+   row for each combination of backend and wait event that has a non-zero
+   call count.  It shows accumulated timing statistics collected when
+   <xref linkend="guc-wait-event-capture"/> is set to <literal>stats</literal>
+   or <literal>trace</literal>.  Requires the server to be compiled with
+   <option>--enable-wait-event-timing</option>.
+  </para>
+
+  <table id="pg-stat-wait-event-timing-view" xreflabel="pg_stat_wait_event_timing">
+   <title><structname>pg_stat_wait_event_timing</structname> View</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>pid</structfield> <type>integer</type>
+      </para>
+      <para>
+       Process ID of the backend
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>backend_type</structfield> <type>text</type>
+      </para>
+      <para>
+       Type of the backend (e.g. <literal>client backend</literal>,
+       <literal>checkpointer</literal>, <literal>walwriter</literal>)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>procnumber</structfield> <type>integer</type>
+      </para>
+      <para>
+       Internal slot number (0-based process number).  Suitable for
+       passing directly to <function>pg_get_wait_event_trace</function>.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>wait_event_type</structfield> <type>text</type>
+      </para>
+      <para>
+       Wait event type (e.g. <literal>IO</literal>, <literal>LWLock</literal>,
+       <literal>Timeout</literal>)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>wait_event</structfield> <type>text</type>
+      </para>
+      <para>
+       Wait event name (e.g. <literal>DataFileRead</literal>,
+       <literal>WALWrite</literal>, <literal>PgSleep</literal>)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>calls</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of times this wait event occurred
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>total_time_ms</structfield> <type>double precision</type>
+      </para>
+      <para>
+       Total time spent in this wait event, in milliseconds
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>avg_time_us</structfield> <type>double precision</type>
+      </para>
+      <para>
+       Average wait duration, in microseconds
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>max_time_us</structfield> <type>double precision</type>
+      </para>
+      <para>
+       Maximum single wait duration, in microseconds
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>histogram</structfield> <type>bigint[]</type>
+      </para>
+      <para>
+       Log2 histogram of wait durations with 32 buckets.  Bin edges are
+       powers of two on the nanosecond axis: bucket 0 covers
+       [0, 1024) ns, bucket <replaceable>k</replaceable> (1&ndash;30) covers
+       [2^(<replaceable>k</replaceable>+9),
+       2^(<replaceable>k</replaceable>+10)) ns, and the last bucket covers
+       [2^40, &infin;) ns.  The boundaries approximate the
+       decimal-microsecond grid (1024 ns &asymp; 1 &mu;s, 2048 ns &asymp;
+       2 &mu;s, ..., 2^40 ns &asymp; 1024 s); the exact edges are chosen
+       to let the hot path skip a division by 1000.  The
+       <link linkend="monitoring-pg-wait-event-timing-histogram-buckets-view">
+       <structname>pg_wait_event_timing_histogram_buckets</structname></link>
+       view provides the numeric bin edges and human-readable labels for
+       each index; the canonical join pattern is:
+<programlisting>
+SELECT w.wait_event, b.label, h.count
+FROM   pg_stat_wait_event_timing w,
+       LATERAL unnest(w.histogram) WITH ORDINALITY AS h(count, idx)
+JOIN   pg_wait_event_timing_histogram_buckets b ON b.bucket_idx = h.idx - 1
+WHERE  w.wait_event = 'PgSleep'
+ORDER  BY b.bucket_idx;
+</programlisting>
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+ </sect2>
+
+ <sect2 id="monitoring-pg-wait-event-timing-histogram-buckets-view">
+  <title><structname>pg_wait_event_timing_histogram_buckets</structname></title>
+
+  <indexterm>
+   <primary>pg_wait_event_timing_histogram_buckets</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_wait_event_timing_histogram_buckets</structname>
+   view describes the 32 bins used by the
+   <structfield>histogram</structfield> column of
+   <link linkend="monitoring-pg-stat-wait-event-timing-view">
+   <structname>pg_stat_wait_event_timing</structname></link>.  It always
+   contains 32 rows in ascending order of
+   <structfield>bucket_idx</structfield>, and is independent of runtime
+   state; a join against it attaches numeric bin edges and human
+   labels to any histogram array.  Bins are powers of two on the
+   nanosecond axis: bin 0 covers <literal>[0,&nbsp;1us)</literal>, each
+   subsequent bin doubles its lower edge, and the final bin
+   (<structfield>bucket_idx</structfield> = 31) is open-ended, starting
+   at approximately 1024&nbsp;seconds (2^40&nbsp;ns).
+  </para>
+
+  <para>
+   The 32-bin layout (rather than the more common 16-bin choice for
+   log-scale histograms) is deliberate: real-world wait-event
+   distributions have long tails routinely extending past 16&nbsp;ms
+   into multi-second territory (slow-disk
+   <literal>DataFileRead</literal>, lock contention waits, replication
+   apply waits, vacuum waits).  A 16-bin histogram would collapse all
+   of those into a single overflow bin, hiding the very signal that
+   wait-event timing exists to surface.  The 32-bin layout keeps the
+   long tail individually addressable up to about 17&nbsp;minutes
+   before the open-ended bin; single waits beyond that belong in
+   <link linkend="auto-explain"><structname>auto_explain</structname></link>
+   or <structname>pg_stat_activity</structname>, not a histogram.
+  </para>
+
+  <table id="pg-wait-event-timing-histogram-buckets-view" xreflabel="pg_wait_event_timing_histogram_buckets">
+   <title><structname>pg_wait_event_timing_histogram_buckets</structname> View</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>bucket_idx</structfield> <type>integer</type>
+      </para>
+      <para>
+       Zero-based bin index (0&ndash;31).  Matches the offset into the
+       <structfield>histogram</structfield> array of
+       <structname>pg_stat_wait_event_timing</structname>.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>lower_ns</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Inclusive lower edge of this bin in nanoseconds.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>upper_ns</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Exclusive upper edge of this bin in nanoseconds, or
+       <literal>NULL</literal> for the final bin which extends to
+       infinity.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>label</structfield> <type>text</type>
+      </para>
+      <para>
+       Short human-readable label for the bin (e.g.
+       <literal>&lt;1us</literal>, <literal>1-2us</literal>,
+       <literal>&gt;=16ms</literal>), expressed on the approximate
+       decimal-microsecond grid the bin edges are aligned to.
+      </para></entry>
+     </row>
+    </tbody>
+   </tgroup>
+  </table>
+ </sect2>
+
+ <sect2 id="monitoring-pg-stat-wait-event-timing-overflow-view">
+  <title><structname>pg_stat_wait_event_timing_overflow</structname></title>
+
+  <indexterm>
+   <primary>pg_stat_wait_event_timing_overflow</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_stat_wait_event_timing_overflow</structname> view
+   exposes per-backend truncation counters for the wait-event timing
+   subsystem.  Each backend owns a bounded LWLock timing hash (sized by
+   <xref linkend="guc-wait-event-timing-max-tranches"/>, 192 tranches by
+   default) and a bounded flat event array; events that cannot be mapped
+   to a slot are counted here.  A non-zero value means the corresponding row(s) in
+   <link linkend="monitoring-pg-stat-wait-event-timing-view">
+   <structname>pg_stat_wait_event_timing</structname></link>
+   are incomplete for that backend.  Requires the server to be
+   compiled with <option>--enable-wait-event-timing</option>.
+  </para>
+
+  <table id="pg-stat-wait-event-timing-overflow-view" xreflabel="pg_stat_wait_event_timing_overflow">
+   <title><structname>pg_stat_wait_event_timing_overflow</structname> View</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>pid</structfield> <type>integer</type>
+      </para>
+      <para>
+       Process ID of the backend
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>backend_type</structfield> <type>text</type>
+      </para>
+      <para>
+       Type of the backend (e.g. <literal>client backend</literal>,
+       <literal>checkpointer</literal>, <literal>walwriter</literal>)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>procnumber</structfield> <type>integer</type>
+      </para>
+      <para>
+       Internal slot number (0-based process number).  Suitable for
+       passing directly to <function>pg_get_wait_event_trace</function>.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>lwlock_overflow_count</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of LWLock wait events dropped because the per-backend
+       LWLock timing hash was already full (more distinct tranches
+       observed in this session than
+       <xref linkend="guc-wait-event-timing-max-tranches"/> allows).
+       Zero means no LWLock truncation.  A one-time
+       <literal>WARNING</literal> is also emitted to the server log on
+       first overflow.  If you see this counter rising, raise
+       <varname>wait_event_timing_max_tranches</varname> at server
+       start (the per-backend memory cost is proportional and
+       described under that GUC).
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>flat_overflow_count</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of non-LWLock wait events dropped because the event
+       could not be mapped to a known class / index.  This almost
+       always indicates a code path emitting a wait event of a class
+       the timing infrastructure was not compiled for; it should be
+       zero in supported builds.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>reset_count</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of resets this backend has <emphasis>observed and acted
+       on</emphasis>; not a request counter.  Own-backend resets via
+       <function>pg_stat_reset_wait_event_timing(NULL)</function> (or
+       passing the caller's own PID) are synchronous and bump this
+       column once per call.  Cross-backend reset requests
+       <emphasis>coalesce</emphasis>: if several
+       <function>pg_stat_reset_wait_event_timing(<replaceable>pid</replaceable>)</function>
+       calls land between two of the target's wait events, the target
+       observes them as a single reset and increments
+       <structfield>reset_count</structfield> only once.  Callers
+       polling for asynchronous-reset acknowledgment should watch for
+       any <literal>N &rarr; N+1</literal> transition.
+      </para></entry>
+     </row>
+    </tbody>
+   </tgroup>
+  </table>
+ </sect2>
+
+ <sect2 id="monitoring-pg-backend-wait-event-trace-view">
+  <title><structname>pg_backend_wait_event_trace</structname></title>
+
+  <indexterm>
+   <primary>pg_backend_wait_event_trace</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_backend_wait_event_trace</structname> view shows
+   individual wait event records from the <emphasis>current backend's</emphasis>
+   trace ring buffer.  Each record captures either a single wait event
+   (with timestamp and duration) or a query-attribution marker.  Two
+   marker families exist: <literal>ExecStart</literal>/<literal>ExecEnd</literal>
+   bracket every executor invocation, and
+   <literal>QueryStart</literal>/<literal>QueryEnd</literal> mark
+   top-level query-id transitions and the transition to idle.  See
+   <xref linkend="guc-wait-event-capture"/> for the gating rules of
+   each marker family.
+   Requires <xref linkend="guc-wait-event-capture"/> to be set to
+   <literal>trace</literal>.  The ring buffer holds up to
+   <xref linkend="guc-wait-event-trace-ring-size-kb"/> kilobytes of
+   records (default 4096 KB = 131072 records of 32 bytes each);
+   older records are overwritten in FIFO order.  The view is session-local
+   and analogous in scope to
+   <link linkend="view-pg-backend-memory-contexts">
+   <structname>pg_backend_memory_contexts</structname></link>; querying it
+   from a superuser session still returns only that session's own
+   records, never another backend's.
+  </para>
+
+  <para>
+   The <structname>pg_backend_wait_event_trace</structname> view is
+   intended for <emphasis>session-local interactive diagnostics</emphasis>:
+   running ad-hoc <literal>SELECT</literal> queries against your own
+   session's trace from <application>psql</application> while
+   investigating wait-event behaviour.  The view materialises up to
+   one ring's worth of records (default ~4&nbsp;MB, controlled by
+   <xref linkend="guc-wait-event-trace-ring-size-kb"/>) into a
+   tuplestore on each call, which is bounded and acceptable for that
+   use; for narrow result sets, append
+   <literal>ORDER BY seq DESC LIMIT <replaceable>N</replaceable></literal>
+   to get the most recent records.
+  </para>
+
+  <para>
+   Cross-backend monitoring tools &mdash; extensions and background
+   workers that read wait events losslessly from every backend's
+   ring &mdash; should <emphasis>not</emphasis> consume through this
+   view.  The in-tree cross-backend reader is
+   <function>pg_get_wait_event_trace</function>
+   (see <xref linkend="monitoring-stats-funcs"/>).  The function behind
+   this view, <function>pg_get_backend_wait_event_trace</function>,
+   returns only the calling backend's own ring, so a background worker invoking
+   <command>SELECT * FROM pg_backend_wait_event_trace</command> via
+   SPI would receive only its own (typically empty) ring, not the
+   target backend's data.  External tools that need cross-backend
+   access without going through SQL use the shared-memory snapshot
+   pattern documented on
+   <structname>WaitEventTraceControl</structname> in
+   <filename>src/include/utils/wait_event_timing.h</filename>:
+   snapshot <structfield>trace_slots[procNumber].generation</structfield>,
+   acquire <structname>WaitEventTraceCtl->lock</structname> in
+   <literal>LW_SHARED</literal>, resolve the target slot's
+   <structfield>ring_ptr</structfield> via
+   <function>dsa_get_address</function>, snapshot the relevant slice
+   of the ring into local memory, release the lock, re-snapshot
+   <structfield>generation</structfield> and discard the read if it
+   changed, then process the snapshot off the lock.  That bypasses
+   this view entirely and is the supported cross-backend interface
+   for monitoring extensions.
+  </para>
+
+  <para>
+   <emphasis>Slot lifecycle.</emphasis>  Per-backend trace rings are
+   not freed when their owner backend exits.  The ring stays
+   allocated in shared memory in an <quote>orphaned</quote> state
+   so the dying backend's final waits remain readable by the
+   cross-backend interface &mdash;
+   <function>pg_get_wait_event_trace</function>
+   (see <xref linkend="monitoring-stats-funcs"/>) for in-tree
+   access, or external background workers that follow the
+   snapshot pattern documented above.
+   <emphasis>This does not change the behaviour of this view</emphasis>,
+   which always reads the calling backend's own ring and is
+   unaffected by orphan-state slots belonging to other
+   procnumbers.  The lifecycle change matters for short-lived
+   backends that exit before any monitoring tool has read their
+   data: parallel workers in particular exit in milliseconds at
+   end-of-parallel-query, well below typical reader polling
+   intervals, and without orphan-persistence their final waits
+   would be lost.  Orphaned rings are reclaimed automatically when a new
+   backend takes over the same <literal>procNumber</literal>
+   slot, and the DBA can force a sweep at any time via
+   <function>pg_stat_clear_orphaned_wait_event_rings</function>.
+   The worst-case orphan-memory footprint is bounded by the slot
+   count times the per-ring size (~4&nbsp;MB by default); see
+   <function>pg_stat_clear_orphaned_wait_event_rings</function>
+   under <xref linkend="monitoring-stats-funcs"/> for details and
+   the deployment patterns where the function is most useful.
+  </para>
+
+  <para>
+   The ring buffer is designed as a lock-free transport mechanism for
+   external consumption.  At high wait event rates (e.g., 220K events/sec),
+   the ring wraps in roughly 0.5&ndash;1 seconds.  External consumers
+   (background workers, extensions) can attribute events to queries by
+   scanning for <literal>ExecStart</literal> markers (or, when the
+   executor markers are unavailable, <literal>QueryStart</literal>); if
+   both have been overwritten, events before the next visible marker are
+   unattributed.  Consumers should poll the ring buffer before it wraps
+   and can use <structfield>st_query_id</structfield> from
+   <structname>PgBackendStatus</structname> as a fallback for the current
+   query context.
+  </para>
+
+  <para>
+   The <structfield>seq</structfield> column is the absolute write
+   position of each record; it is monotonically increasing and never
+   resets while the ring is alive.  A consumer polling the ring
+   repeatedly can detect wraparound losses by tracking
+   <function>max(seq)</function> between successive scrapes: given two
+   consecutive polls with maximum <structfield>seq</structfield> values
+   <replaceable>S1</replaceable> (previous poll) and
+   <replaceable>S2</replaceable> (current poll), where the current poll
+   returned <replaceable>N2</replaceable> rows, the number of records
+   overwritten before the second poll could read them is
+   <literal>max(0, (S2 - S1) - N2)</literal>.  No separate
+   <quote>trace overflow</quote> counter is exposed because this
+   information is exact and derivable from <structfield>seq</structfield>
+   alone.
+  </para>
+
+  <para>
+   <literal>QueryStart</literal>/<literal>QueryEnd</literal> markers are
+   emitted as matched pairs around each protocol phase that touches a
+   <structfield>query_id</structfield>.  In simple protocol that is one
+   pair per statement.  In extended protocol there is one pair around
+   each of <literal>Parse</literal>, <literal>Bind</literal>, and
+   <literal>Execute</literal> for the same
+   <structfield>query_id</structfield> &mdash; so a single parameterized
+   statement produces three consecutive pairs, in addition to the
+   <literal>ExecStart</literal>/<literal>ExecEnd</literal> pair from the
+   executor.  This per-phase pairing lets consumers measure how much
+   time a query spent in each protocol phase (parse vs. bind vs.
+   execute) by computing the duration between each pair, and lets a
+   total-time-per-query rollup be expressed as the sum of pair
+   durations rather than a single subtraction.  Consumers that just want
+   <quote>how long did this query take in the executor</quote> should use the
+   <literal>ExecStart</literal>/<literal>ExecEnd</literal> pair, which
+   fires exactly once per statement regardless of protocol.
+  </para>
+
+  <table id="pg-backend-wait-event-trace-view" xreflabel="pg_backend_wait_event_trace">
+   <title><structname>pg_backend_wait_event_trace</structname> View</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>seq</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Sequence number of this record in the ring buffer
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>timestamp_ns</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Monotonic clock timestamp in nanoseconds
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>wait_event_type</structfield> <type>text</type>
+      </para>
+      <para>
+       Wait event type, or <literal>Query</literal> for query markers
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>wait_event</structfield> <type>text</type>
+      </para>
+      <para>
+       Wait event name, or one of <literal>ExecStart</literal>,
+       <literal>ExecEnd</literal>, <literal>QueryStart</literal>,
+       <literal>QueryEnd</literal> for query-attribution markers.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>duration_us</structfield> <type>double precision</type>
+      </para>
+      <para>
+       Wait duration in microseconds (0 for query markers)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>query_id</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Query identifier for query markers (0 for wait events)
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+ </sect2>
+
  <sect2 id="monitoring-pg-stat-database-view">
   <title><structname>pg_stat_database</structname></title>
 
@@ -5736,6 +6351,208 @@ description | Waiting for a newly initialized WAL file to reach durable storage
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_stat_get_wait_event_timing</primary>
+        </indexterm>
+        <function>pg_stat_get_wait_event_timing</function> ( <parameter>pid</parameter> <type>integer</type> )
+        <returnvalue>setof record</returnvalue>
+       </para>
+       <para>
+        Returns one row for each combination of backend and wait event with
+        non-zero counts.  Output columns include <structfield>pid</structfield>,
+        <structfield>backend_type</structfield>, event identity, timing
+        statistics, and a log2 histogram.  Unprivileged users see only their
+        own backend.  Superusers and members of
+        <literal>pg_read_all_stats</literal> see all backends.
+        Requires <option>--enable-wait-event-timing</option>.
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_get_backend_wait_event_trace</primary>
+        </indexterm>
+        <function>pg_get_backend_wait_event_trace</function> ()
+        <returnvalue>setof record</returnvalue>
+       </para>
+       <para>
+        Returns individual wait event records from the current session's
+        trace ring buffer.  For another session's ring (live or
+        post-mortem orphaned), use
+        <function>pg_get_wait_event_trace</function> below.
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_get_wait_event_trace</primary>
+        </indexterm>
+        <function>pg_get_wait_event_trace</function> ( <parameter>procnumber</parameter> <type>integer</type> )
+        <returnvalue>setof record</returnvalue>
+       </para>
+       <para>
+        Returns individual wait event records from the trace ring of
+        the backend that currently or previously occupied the slot
+        identified by <parameter>procnumber</parameter>.  Reads slots
+        in <literal>OWNED</literal> state (live writer) and
+        <literal>ORPHANED</literal> state (writer has exited but the
+        ring is preserved for post-mortem reading) uniformly.  An
+        empty result indicates the slot is in <literal>FREE</literal>
+        state (no ring) or no records have been written.  Concurrent
+        slot transitions cannot interrupt the read because the
+        function holds the cross-backend trace lock in
+        <literal>LW_SHARED</literal> mode throughout the iteration; the
+        per-record seqlock protocol skips any record being written
+        by a concurrent live writer.
+       </para>
+       <para>
+        This is the canonical cross-backend reader.  External
+        monitoring extensions that need cross-backend access without
+        going through SQL should follow the same snapshot pattern
+        documented on <structname>WaitEventTraceControl</structname>
+        in <filename>src/include/utils/wait_event_timing.h</filename>;
+        this function serves as both the reference implementation and
+        a DBA-facing diagnostic tool.  The
+        <parameter>procnumber</parameter> argument can be obtained
+        from the <structfield>procnumber</structfield> column of
+        <function>pg_stat_get_wait_event_timing</function> or
+        <function>pg_stat_get_wait_event_timing_overflow</function>
+        for live backends.  For post-mortem reads of short-lived
+        backends (parallel workers, autovacuum, walsender) the
+        <parameter>procnumber</parameter> must be captured while the
+        backend is still alive, or discovered by iterating slots in a
+        monitoring background worker.  A pid-keyed lookup for live
+        backends only is one query away:
+
+<programlisting>
+SELECT * FROM pg_get_wait_event_trace(
+    (SELECT procnumber FROM pg_stat_get_wait_event_timing(<replaceable>target_pid</replaceable>)
+     WHERE pid = <replaceable>target_pid</replaceable> LIMIT 1));
+</programlisting>
+       </para>
+       <para>
+        Requires membership in <literal>pg_read_all_stats</literal>
+        (matching the privilege model of the session-local view
+        <structname>pg_backend_wait_event_trace</structname>).
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_stat_get_wait_event_timing_overflow</primary>
+        </indexterm>
+        <function>pg_stat_get_wait_event_timing_overflow</function> ( <parameter>pid</parameter> <type>integer</type> )
+        <returnvalue>setof record</returnvalue>
+       </para>
+       <para>
+        Returns one row per live backend with per-backend truncation
+        counters for the wait event timing subsystem.  Use this function to
+        confirm that
+        <link linkend="monitoring-pg-stat-wait-event-timing-view">
+        <structname>pg_stat_wait_event_timing</structname></link>
+        rows for a backend are complete rather than truncated.
+        Unprivileged users see only their own backend; superusers and
+        members of <literal>pg_read_all_stats</literal> see all
+        backends.  Requires <option>--enable-wait-event-timing</option>.
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_stat_reset_wait_event_timing</primary>
+        </indexterm>
+        <function>pg_stat_reset_wait_event_timing</function> ( <parameter>pid</parameter> <type>integer</type> <literal>DEFAULT</literal> <literal>NULL</literal> )
+        <returnvalue>void</returnvalue>
+       </para>
+       <para>
+        Resets wait event timing counters for a single backend, identified
+        by its process ID (see <structfield>pid</structfield> in
+        <link linkend="monitoring-pg-stat-activity-view">
+        <structname>pg_stat_activity</structname></link>).
+        Passing <literal>NULL</literal> (or the caller's own
+        <function>pg_backend_pid()</function>) resets the current session;
+        any user may do this.  Passing any other PID resets that backend
+        and requires membership in the
+        <link linkend="predefined-roles"><literal>pg_signal_backend</literal></link>
+        role &mdash; the same role required by
+        <function>pg_stat_reset_backend_stats</function>,
+        <function>pg_terminate_backend</function>, and
+        <function>pg_cancel_backend</function>.  Unknown or
+        already-exited PIDs are silent no-ops, matching the behavior of
+        <function>pg_stat_reset_backend_stats</function>.
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_stat_reset_wait_event_timing_all</primary>
+        </indexterm>
+        <function>pg_stat_reset_wait_event_timing_all</function> ()
+        <returnvalue>void</returnvalue>
+       </para>
+       <para>
+        Resets wait event timing counters for every backend in the
+        cluster.  Requires superuser.  This is intentionally stricter
+        than the per-backend variant
+        <function>pg_stat_reset_wait_event_timing</function>(<parameter>pid</parameter>),
+        which only requires <literal>pg_signal_backend</literal>: the
+        cluster-wide form has unbounded blast radius (it affects every
+        backend in a single call) and would erase forensic patterns
+        that span multiple backends, so it is gated to the cluster
+        owner.  Returns before the resets have been observed by their
+        target backends; callers that need strict read-after-reset
+        semantics should poll each target's
+        <structfield>reset_count</structfield> column.
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_stat_clear_orphaned_wait_event_rings</primary>
+        </indexterm>
+        <function>pg_stat_clear_orphaned_wait_event_rings</function> ()
+        <returnvalue>bigint</returnvalue>
+       </para>
+       <para>
+        Frees every wait-event-trace ring whose owner backend has
+        exited.  Returns the number of rings released.  Requires
+        superuser.
+       </para>
+       <para>
+        When a backend that had <varname>wait_event_capture</varname> =
+        <literal>trace</literal> exits, its trace ring (~4&nbsp;MB by default) is
+        intentionally <emphasis>not</emphasis> freed at exit so that
+        cross-backend consumers
+        (<function>pg_get_wait_event_trace</function> and extensions
+        following the snapshot pattern) can still read the dying
+        backend's final waits.  The
+        memory is reclaimed lazily: in the common case, the ring is
+        freed automatically when a new backend takes over the same
+        <literal>procNumber</literal> slot.  This function is the
+        explicit DBA-driven sweep for the pathological case where
+        capture was briefly enabled, then disabled, on a cluster with
+        long-lived pooled connections that never recycle the
+        <literal>procNumber</literal>.  The maximum amount of memory
+        this can release is bounded by the slot count times the
+        per-ring size (roughly 400&nbsp;MB at <varname>max_connections</varname>
+        = 100 and 4&nbsp;GB at 1000 with the default ring size); on most deployments the function will
+        report 0 because connection churn already drained orphans
+        naturally.
+       </para>
+       <para>
+        Safe to call when capture is currently <literal>off</literal>
+        and even when no orphans exist (returns 0 in both cases).
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
diff --git a/meson.build b/meson.build
index 20b887f1a1bc1..f786901189660 100644
--- a/meson.build
+++ b/meson.build
@@ -505,6 +505,7 @@ meson_bin = find_program(meson_binpath, native: true)
 
 cdata.set('USE_ASSERT_CHECKING', get_option('cassert') ? 1 : false)
 cdata.set('USE_INJECTION_POINTS', get_option('injection_points') ? 1 : false)
+cdata.set('USE_WAIT_EVENT_TIMING', get_option('wait_event_timing') ? 1 : false)
 
 blocksize = get_option('blocksize').to_int() * 1024
 
diff --git a/meson_options.txt b/meson_options.txt
index 6a793f3e47943..1f191d3a9d621 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -40,6 +40,9 @@ option('pgport', type: 'integer', value: 5432,
 option('cassert', type: 'boolean', value: false,
   description: 'Enable assertion checks (for debugging)')
 
+option('wait_event_timing', type: 'boolean', value: false,
+  description: 'Enable wait event timing instrumentation')
+
 option('tap_tests', type: 'feature', value: 'auto',
   description: 'Enable TAP tests')
 
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 73a1c1c46703a..0fd75f8289abf 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1556,3 +1556,106 @@ CREATE VIEW pg_aios AS
     SELECT * FROM pg_get_aios();
 REVOKE ALL ON pg_aios FROM PUBLIC;
 GRANT SELECT ON pg_aios TO pg_read_all_stats;
+
+-- Taxonomy for the histogram column on pg_stat_wait_event_timing.  The
+-- histogram array has one entry per bucket, in ascending order.  This
+-- view names them so callers do not have to memorise the layout; join
+-- against it via unnest(histogram) WITH ORDINALITY.
+--
+-- WARNING: keep this list in lock-step with WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS
+-- and wait_event_timing_bucket() in src/backend/utils/activity/wait_event_timing.c.
+-- Bin edges are powers of two in nanoseconds; labels are the approximate
+-- decimal-microsecond grid documented in src/include/utils/wait_event_timing.h.
+CREATE VIEW pg_wait_event_timing_histogram_buckets AS
+    SELECT bucket_idx, lower_ns, upper_ns, label
+    FROM (VALUES
+        ( 0,             0::bigint,         1024::bigint,  '<1us'::text),
+        ( 1,          1024::bigint,         2048::bigint,  '1-2us'),
+        ( 2,          2048::bigint,         4096::bigint,  '2-4us'),
+        ( 3,          4096::bigint,         8192::bigint,  '4-8us'),
+        ( 4,          8192::bigint,        16384::bigint,  '8-16us'),
+        ( 5,         16384::bigint,        32768::bigint,  '16-32us'),
+        ( 6,         32768::bigint,        65536::bigint,  '32-64us'),
+        ( 7,         65536::bigint,       131072::bigint,  '64-128us'),
+        ( 8,        131072::bigint,       262144::bigint,  '128-256us'),
+        ( 9,        262144::bigint,       524288::bigint,  '256-512us'),
+        (10,        524288::bigint,      1048576::bigint,  '512us-1ms'),
+        (11,       1048576::bigint,      2097152::bigint,  '1-2ms'),
+        (12,       2097152::bigint,      4194304::bigint,  '2-4ms'),
+        (13,       4194304::bigint,      8388608::bigint,  '4-8ms'),
+        (14,       8388608::bigint,     16777216::bigint,  '8-16ms'),
+        (15,      16777216::bigint,     33554432::bigint,  '16-32ms'),
+        (16,      33554432::bigint,     67108864::bigint,  '32-64ms'),
+        (17,      67108864::bigint,    134217728::bigint,  '64-128ms'),
+        (18,     134217728::bigint,    268435456::bigint,  '128-256ms'),
+        (19,     268435456::bigint,    536870912::bigint,  '256-512ms'),
+        (20,     536870912::bigint,   1073741824::bigint,  '512ms-1s'),
+        (21,    1073741824::bigint,   2147483648::bigint,  '1-2s'),
+        (22,    2147483648::bigint,   4294967296::bigint,  '2-4s'),
+        (23,    4294967296::bigint,   8589934592::bigint,  '4-8s'),
+        (24,    8589934592::bigint,  17179869184::bigint,  '8-16s'),
+        (25,   17179869184::bigint,  34359738368::bigint,  '16-32s'),
+        (26,   34359738368::bigint,  68719476736::bigint,  '32-64s'),
+        (27,   68719476736::bigint, 137438953472::bigint,  '64-128s'),
+        (28,  137438953472::bigint, 274877906944::bigint,  '128-256s'),
+        (29,  274877906944::bigint, 549755813888::bigint,  '256-512s'),
+        (30,  549755813888::bigint, 1099511627776::bigint, '512s-1024s'),
+        (31, 1099511627776::bigint, NULL::bigint,          '>=1024s')
+    ) AS t(bucket_idx, lower_ns, upper_ns, label);
+
+CREATE VIEW pg_stat_wait_event_timing AS
+    SELECT
+        t.pid,
+        t.backend_type,
+        t.procnumber,
+        t.wait_event_type,
+        t.wait_event,
+        t.calls,
+        t.total_time_ms,
+        t.avg_time_us,
+        t.max_time_us,
+        t.histogram
+    FROM pg_stat_get_wait_event_timing(NULL) t;
+REVOKE ALL ON pg_stat_wait_event_timing FROM PUBLIC;
+GRANT SELECT ON pg_stat_wait_event_timing TO pg_read_all_stats;
+
+CREATE VIEW pg_stat_wait_event_timing_overflow AS
+    SELECT
+        t.pid,
+        t.backend_type,
+        t.procnumber,
+        t.lwlock_overflow_count,
+        t.flat_overflow_count,
+        t.reset_count
+    FROM pg_stat_get_wait_event_timing_overflow(NULL) t;
+REVOKE ALL ON pg_stat_wait_event_timing_overflow FROM PUBLIC;
+GRANT SELECT ON pg_stat_wait_event_timing_overflow TO pg_read_all_stats;
+
+
+-- Session-local view: mirrors pg_backend_memory_contexts in both naming
+-- and access control.  The SRF is hardcoded to the caller's own ring,
+-- so a non-superuser only ever sees their own session's data; but as
+-- with pg_backend_memory_contexts, the row contents (query_id values
+-- joinable against pg_stat_statements, per-event timings) are
+-- information that ordinary roles should not see across SECURITY
+-- DEFINER call chains.  Lock the view to pg_read_all_stats to match
+-- the precedent set in commit f8a2afa12 (PG 17) for the namesake view.
+CREATE VIEW pg_backend_wait_event_trace AS
+    SELECT
+        t.seq,
+        t.timestamp_ns,
+        t.wait_event_type,
+        t.wait_event,
+        t.duration_us,
+        t.query_id
+    FROM pg_get_backend_wait_event_trace() t;
+REVOKE ALL ON pg_backend_wait_event_trace FROM PUBLIC;
+GRANT SELECT ON pg_backend_wait_event_trace TO pg_read_all_stats;
+
+-- Cross-backend trace ring reader.  Keyed by procnumber (reads OWNED
+-- and ORPHANED slots uniformly so post-mortem data from short-lived
+-- backends remains observable).  Same privilege model as the
+-- session-local view above: REVOKE'd from PUBLIC and GRANT'ed to
+-- pg_read_all_stats.
+REVOKE EXECUTE ON FUNCTION pg_get_wait_event_trace(int4) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_wait_event_trace(int4) TO pg_read_all_stats;
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 4b30f7686801a..7f03c6875140f 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -57,6 +57,7 @@
 #include "parser/parse_relation.h"
 #include "pgstat.h"
 #include "rewrite/rewriteHandler.h"
+#include "utils/wait_event_timing.h"
 #include "tcop/utility.h"
 #include "utils/acl.h"
 #include "utils/backend_status.h"
@@ -133,6 +134,8 @@ ExecutorStart(QueryDesc *queryDesc, int eflags)
 	 */
 	pgstat_report_query_id(queryDesc->plannedstmt->queryId, false);
 
+	wait_event_trace_exec_start(queryDesc->plannedstmt->queryId);
+
 	if (ExecutorStart_hook)
 		(*ExecutorStart_hook) (queryDesc, eflags);
 	else
@@ -476,6 +479,8 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
 void
 ExecutorEnd(QueryDesc *queryDesc)
 {
+	wait_event_trace_exec_end(queryDesc->plannedstmt->queryId);
+
 	if (ExecutorEnd_hook)
 		(*ExecutorEnd_hook) (queryDesc);
 	else
diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c
index 9803a0ee2a141..50e27fb4f702b 100644
--- a/src/backend/postmaster/auxprocess.c
+++ b/src/backend/postmaster/auxprocess.c
@@ -26,6 +26,7 @@
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
 #include "utils/wait_event.h"
+#include "utils/wait_event_timing.h"
 
 
 static void ShutdownAuxiliaryProcess(int code, Datum arg);
@@ -113,6 +114,11 @@ AuxiliaryProcessMainCommon(void)
 	 */
 	CreateAuxProcessResourceOwner();
 
+#ifdef USE_WAIT_EVENT_TIMING
+	/* Attach trace ring if wait_event_capture = trace was set via postgresql.conf */
+	if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && my_trace_proc_number >= 0)
+		wait_event_trace_attach(my_trace_proc_number);
+#endif
 
 	/* Initialize backend status information */
 	pgstat_beinit();
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 1ac25068d62f2..b68ea684c5f42 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -57,6 +57,7 @@
 #include "utils/timeout.h"
 #include "utils/timestamp.h"
 #include "utils/wait_event.h"
+#include "utils/wait_event_timing.h"
 
 /* GUC variables */
 int			DeadlockTimeout = 1000;
@@ -541,6 +542,7 @@ InitProcess(void)
 
 	/* now that we have a proc, report wait events to shared memory */
 	pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+	pgstat_set_wait_event_timing_storage(MyProcNumber);
 
 	/*
 	 * We might be reusing a semaphore that belonged to a failed process. So
@@ -713,6 +715,7 @@ InitAuxiliaryProcess(void)
 
 	/* now that we have a proc, report wait events to shared memory */
 	pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+	pgstat_set_wait_event_timing_storage(MyProcNumber);
 
 	/* Check that group locking fields are in a proper initial state. */
 	Assert(MyProc->lockGroupLeader == NULL);
@@ -1003,6 +1006,7 @@ ProcKill(int code, Datum arg)
 	 */
 	SwitchBackToLocalLatch();
 	pgstat_reset_wait_event_storage();
+	pgstat_reset_wait_event_timing_storage();
 
 	proc = MyProc;
 	MyProc = NULL;
@@ -1068,6 +1072,7 @@ AuxiliaryProcKill(int code, Datum arg)
 	/* look at the equivalent ProcKill() code for comments */
 	SwitchBackToLocalLatch();
 	pgstat_reset_wait_event_storage();
+	pgstat_reset_wait_event_timing_storage();
 
 	proc = MyProc;
 	MyProc = NULL;
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index dbef734a93f15..38b7606a0c852 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -78,6 +78,7 @@
 #include "tcop/tcopprot.h"
 #include "tcop/utility.h"
 #include "utils/guc_hooks.h"
+#include "utils/wait_event_timing.h"
 #include "utils/injection_point.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
@@ -1423,6 +1424,19 @@ exec_parse_message(const char *query_string,	/* string to execute */
 	 */
 	debug_query_string = query_string;
 
+	/*
+	 * In pipelined extended protocol, a Parse can arrive while the previous
+	 * query's st_query_id is still set and st_state is still RUNNING (no
+	 * Sync->idle between queries, so send_ready_for_query has not yet
+	 * emitted the prior QUERY_END marker).  Flush the prior id with
+	 * force=true so the QUERY_END marker fires before pgstat_report_activity
+	 * below silently zeros st_query_id.  Skip when st_state != RUNNING:
+	 * coming from idle means send_ready_for_query has already emitted the
+	 * QUERY_END for whatever residual st_query_id remains, and re-emitting
+	 * here would double-count.
+	 */
+	if (MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING)
+		pgstat_report_query_id(0, true);
 	pgstat_report_activity(STATE_RUNNING, query_string);
 
 	set_ps_display("PARSE");
@@ -1692,6 +1706,12 @@ exec_bind_message(StringInfo input_message)
 	 */
 	debug_query_string = psrc->query_string;
 
+	/* See exec_parse_message for rationale.  In particular, the state
+	 * gate prevents a duplicate QUERY_END when this Bind is the first
+	 * message after a Sync->idle transition (where send_ready_for_query
+	 * has already emitted QUERY_END for any residual st_query_id). */
+	if (MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING)
+		pgstat_report_query_id(0, true);
 	pgstat_report_activity(STATE_RUNNING, psrc->query_string);
 
 	foreach(lc, psrc->query_list)
@@ -2183,6 +2203,14 @@ exec_execute_message(const char *portal_name, long max_rows)
 	 */
 	debug_query_string = sourceText;
 
+	/* See exec_parse_message.  Closes the per-phase
+	 * QUERY_START..QUERY_END pair from the preceding Bind (or from the
+	 * prior pipelined Execute) so trace consumers see balanced markers
+	 * across Parse/Bind/Execute.  State gate avoids a duplicate
+	 * QUERY_END when this Execute is the first message after a
+	 * Sync->idle transition. */
+	if (MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING)
+		pgstat_report_query_id(0, true);
 	pgstat_report_activity(STATE_RUNNING, sourceText);
 
 	foreach(lc, portal->stmts)
@@ -4654,6 +4682,18 @@ PostgresMain(const char *dbname, const char *username)
 		 */
 		if (send_ready_for_query)
 		{
+			/*
+			 * Emit QUERY_END trace marker before going idle so that
+			 * idle waits (ClientRead etc.) are not attributed to the
+			 * finished query.
+			 */
+			{
+				volatile PgBackendStatus *beentry = MyBEEntry;
+
+				if (beentry != NULL && beentry->st_query_id != 0)
+					wait_event_trace_query_end(beentry->st_query_id);
+			}
+
 			if (IsAbortedTransactionBlockState())
 			{
 				set_ps_display("idle in transaction (aborted)");
diff --git a/src/backend/utils/.gitignore b/src/backend/utils/.gitignore
index fa9cfb39693db..5051e36d1f01f 100644
--- a/src/backend/utils/.gitignore
+++ b/src/backend/utils/.gitignore
@@ -7,4 +7,5 @@
 /errcodes.h
 /pgstat_wait_event.c
 /wait_event_funcs_data.c
+/wait_event_timing_data.h
 /wait_event_types.h
diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile
index 81b4a956bda3f..5c11d8294f01a 100644
--- a/src/backend/utils/Makefile
+++ b/src/backend/utils/Makefile
@@ -43,7 +43,7 @@ generated-header-symlinks: $(top_builddir)/src/include/utils/header-stamp submak
 submake-adt-headers:
 	$(MAKE) -C adt jsonpath_gram.h
 
-$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h
+$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h
 
 # fmgr-stamp records the last time we ran Gen_fmgrtab.pl.  We don't rely on
 # the timestamps of the individual output files, because the Perl script
@@ -60,6 +60,7 @@ guc_tables.inc.c: $(top_srcdir)/src/backend/utils/misc/guc_parameters.dat $(top_
 
 pgstat_wait_event.c: wait_event_types.h
 wait_event_funcs_data.c: wait_event_types.h
+wait_event_timing_data.h: wait_event_types.h
 
 wait_event_types.h: $(top_srcdir)/src/backend/utils/activity/wait_event_names.txt $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl
 	$(PERL) $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl --code $<
@@ -79,8 +80,8 @@ endif
 # These generated headers must be symlinked into src/include/.
 # We use header-stamp to record that we've done this because the symlinks
 # themselves may appear older than fmgr-stamp.
-$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h
-	cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h; do \
+$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h
+	cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h; do \
 	  rm -f $$file && $(LN_S) "../../../$(subdir)/$$file" . ; \
 	done
 	touch $@
@@ -99,4 +100,4 @@ uninstall-data:
 clean:
 	rm -f probes.h probes.h.tmp
 	rm -f fmgroids.h fmgrprotos.h fmgrtab.c fmgr-stamp errcodes.h guc_tables.inc.c
-	rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c
+	rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h
diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile
index ca3ef89bf5997..60154d8055780 100644
--- a/src/backend/utils/activity/Makefile
+++ b/src/backend/utils/activity/Makefile
@@ -35,10 +35,12 @@ OBJS = \
 	pgstat_wal.o \
 	pgstat_xact.o \
 	wait_event.o \
-	wait_event_funcs.o
+	wait_event_funcs.o \
+	wait_event_timing.o
 
 # Force these dependencies to be known even without dependency info built:
 wait_event.o: wait_event.c $(top_builddir)/src/backend/utils/pgstat_wait_event.c
 wait_event_funcs.o: wait_event_funcs.c $(top_builddir)/src/backend/utils/wait_event_funcs_data.c
+wait_event_timing.o: wait_event_timing.c $(top_builddir)/src/backend/utils/wait_event_timing_data.h
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c
index d685fc5cd87c0..a97f554704af5 100644
--- a/src/backend/utils/activity/backend_status.c
+++ b/src/backend/utils/activity/backend_status.c
@@ -22,6 +22,7 @@
 #include "storage/shmem.h"
 #include "storage/subsystems.h"
 #include "utils/ascii.h"
+#include "utils/wait_event_timing.h"
 #include "utils/guc.h"			/* for application_name */
 #include "utils/memutils.h"
 
@@ -670,6 +671,18 @@ pgstat_report_query_id(int64 query_id, bool force)
 	if (beentry->st_query_id != INT64CONST(0) && !force)
 		return;
 
+	/*
+	 * Emit trace markers for query-id transitions.  QUERY_END fires here
+	 * whenever a non-zero st_query_id is replaced by a different value,
+	 * including a forced reset to zero (multi-statement simple protocol,
+	 * pipelined extended protocol); QUERY_START fires for a new non-zero
+	 * id.  PostgresMain() emits its own QUERY_END at send_ready_for_query.
+	 */
+	if (beentry->st_query_id != 0 && beentry->st_query_id != query_id)
+		wait_event_trace_query_end(beentry->st_query_id);
+	if (query_id != 0 && query_id != beentry->st_query_id)
+		wait_event_trace_query_start(query_id);
+
 	/*
 	 * Update my status entry, following the protocol of bumping
 	 * st_changecount before and after.  We use a volatile pointer here to
diff --git a/src/backend/utils/activity/generate-wait_event_types.pl b/src/backend/utils/activity/generate-wait_event_types.pl
index d39a30d04783d..f3f1f107a4c04 100644
--- a/src/backend/utils/activity/generate-wait_event_types.pl
+++ b/src/backend/utils/activity/generate-wait_event_types.pl
@@ -5,6 +5,7 @@
 # - wait_event_types.h (if --code is passed)
 # - pgstat_wait_event.c (if --code is passed)
 # - wait_event_funcs_data.c (if --code is passed)
+# - wait_event_timing_data.h (if --code is passed)
 # - wait_event_types.sgml (if --docs is passed)
 #
 # Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
@@ -269,17 +270,195 @@
 		}
 	}
 
+	# -----------------------------------------------------------
+	# Compute wait_event_timing class mapping data.
+	#
+	# The dense class table maps raw classId (0x00..max) to a
+	# dense index, with per-class slot counts rounded up to the
+	# next power of 2 (minimum 16).  Extension and InjectionPoint
+	# are fixed at 128 because extensions register custom events.
+	# LWLock uses a hash table (dense = -1).
+	# -----------------------------------------------------------
+
+	# Map section name -> raw classId (from wait_classes.h constants)
+	my %class_to_raw = (
+		'Lock'           => 0x03,
+		'Buffer'         => 0x04,
+		'Activity'       => 0x05,
+		'Client'         => 0x06,
+		'Extension'      => 0x07,
+		'IPC'            => 0x08,
+		'Timeout'        => 0x09,
+		'IO'             => 0x0A,
+		'InjectionPoint' => 0x0B,
+	);
+
+	# Classes that need fixed large slot counts (dynamically extensible)
+	my %fixed_slot_classes = (
+		'Extension'      => 128,
+		'InjectionPoint' => 128,
+	);
+
+	# Count events per class from the parsed data.
+	# Build a list of (className, rawId, actualCount) sorted by rawId.
+	my @timing_classes;
+	foreach my $waitclass (keys %hashwe)
+	{
+		my $short = $waitclass;
+		$short =~ s/^WaitEvent//;
+
+		# LWLock uses a hash table, not the flat array; any other unmapped class is a bug
+		next if $short eq 'LWLock';
+		die "no raw classId mapping for wait class $short" unless exists $class_to_raw{$short};
+		my $raw_id = $class_to_raw{$short};
+		my $count = scalar @{ $hashwe{$waitclass} };
+
+		push @timing_classes, {
+			name     => $short,
+			raw_id   => $raw_id,
+			actual   => $count,
+		};
+	}
+
+	# InjectionPoint (0x0B) has no section in wait_event_names.txt
+	# because its events are dynamically registered at runtime.
+	# Add it explicitly with actual=0 and a fixed slot count.
+	if (!grep { $_->{name} eq 'InjectionPoint' } @timing_classes)
+	{
+		push @timing_classes, {
+			name     => 'InjectionPoint',
+			raw_id   => $class_to_raw{'InjectionPoint'},
+			actual   => 0,
+		};
+	}
+
+	# Sort by raw classId
+	@timing_classes = sort { $a->{raw_id} <=> $b->{raw_id} } @timing_classes;
+
+	# Compute slot counts: next power of 2, minimum 16, or fixed
+	foreach my $cls (@timing_classes)
+	{
+		if (exists $fixed_slot_classes{$cls->{name}})
+		{
+			$cls->{slots} = $fixed_slot_classes{$cls->{name}};
+		}
+		else
+		{
+			my $slots = 16;	# minimum
+			$slots *= 2 while $slots < $cls->{actual};
+			$cls->{slots} = $slots;
+		}
+	}
+
+	# Compute cumulative offsets
+	my $offset = 0;
+	foreach my $cls (@timing_classes)
+	{
+		$cls->{offset} = $offset;
+		$offset += $cls->{slots};
+	}
+	my $total_events = $offset;
+
+	# Determine max raw classId for array sizing
+	my $max_raw = 0;
+	foreach my $cls (@timing_classes)
+	{
+		$max_raw = $cls->{raw_id} if $cls->{raw_id} > $max_raw;
+	}
+	my $raw_classes = $max_raw + 1;
+	my $dense_classes = scalar @timing_classes;
+
+	# Emit timing defines into wait_event_types.h
+	printf $h "\n/* Wait event timing flat array sizing (generated) */\n";
+	printf $h "#define WAIT_EVENT_TIMING_RAW_CLASSES\t%d\n", $raw_classes;
+	printf $h "#define WAIT_EVENT_TIMING_DENSE_CLASSES\t%d\n", $dense_classes;
+	printf $h "#define WAIT_EVENT_TIMING_NUM_EVENTS\t%d\n\n", $total_events;
+
 	printf $h "#endif                          /* WAIT_EVENT_TYPES_H */\n";
 	close $h;
 	close $c;
 	close $wc;
 
+	# Generate wait_event_timing_data.h with the mapping arrays.
+	# A header (rather than a .c file) keeps the file-extension category
+	# straight: it is included into a single TU (wait_event_timing.c) and
+	# defines static const tables there.  The include guard makes the
+	# single-owner intent explicit and prevents accidental double inclusion.
+	my $ttmp = "$output_path/wait_event_timing_data.h.tmp$$";
+	open my $t, '>', $ttmp or die "Could not open $ttmp: $!";
+	printf $t $header_comment, 'wait_event_timing_data.h';
+
+	printf $t "#ifndef WAIT_EVENT_TIMING_DATA_H\n";
+	printf $t "#define WAIT_EVENT_TIMING_DATA_H\n\n";
+
+	# Emit wait_event_class_dense[]
+	printf $t "static const int8 wait_event_class_dense[WAIT_EVENT_TIMING_RAW_CLASSES] = {\n";
+	for (my $i = 0; $i < $raw_classes; $i++)
+	{
+		my $dense = -1;
+		my $comment = "unused";
+		for (my $d = 0; $d < $dense_classes; $d++)
+		{
+			if ($timing_classes[$d]->{raw_id} == $i)
+			{
+				$dense = $d;
+				$comment = $timing_classes[$d]->{name};
+				last;
+			}
+		}
+		# classId 0x01 is LWLock
+		if ($i == 0x01)
+		{
+			$comment = "LWLock (uses hash)";
+		}
+		my $comma = ($i < $raw_classes - 1) ? "," : "";
+		printf $t "\t%2d$comma\t\t/* 0x%02x: %s */\n", $dense, $i, $comment;
+	}
+	printf $t "};\n\n";
+
+	# Emit wait_event_class_nevents[]
+	printf $t "static const int wait_event_class_nevents[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n";
+	for (my $d = 0; $d < $dense_classes; $d++)
+	{
+		my $cls = $timing_classes[$d];
+		my $comma = ($d < $dense_classes - 1) ? "," : "";
+		printf $t "\t%d$comma\t\t/* %s (actual: %d) */\n",
+			$cls->{slots}, $cls->{name}, $cls->{actual};
+	}
+	printf $t "};\n\n";
+
+	# Emit wait_event_class_offset[]
+	printf $t "static const int wait_event_class_offset[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n";
+	for (my $d = 0; $d < $dense_classes; $d++)
+	{
+		my $cls = $timing_classes[$d];
+		my $comma = ($d < $dense_classes - 1) ? "," : "";
+		printf $t "\t%d$comma\t\t/* %s */\n", $cls->{offset}, $cls->{name};
+	}
+	printf $t "};\n\n";
+
+	# Emit wait_event_dense_to_classid[]
+	printf $t "static const uint8 wait_event_dense_to_classid[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n\t";
+	for (my $d = 0; $d < $dense_classes; $d++)
+	{
+		my $cls = $timing_classes[$d];
+		my $comma = ($d < $dense_classes - 1) ? ", " : "";
+		printf $t "0x%02x$comma", $cls->{raw_id};
+	}
+	printf $t "\n};\n\n";
+
+	printf $t "#endif                          /* WAIT_EVENT_TIMING_DATA_H */\n";
+
+	close $t;
+
 	rename($htmp, "$output_path/wait_event_types.h")
 	  || die "rename: $htmp to $output_path/wait_event_types.h: $!";
 	rename($ctmp, "$output_path/pgstat_wait_event.c")
 	  || die "rename: $ctmp to $output_path/pgstat_wait_event.c: $!";
 	rename($wctmp, "$output_path/wait_event_funcs_data.c")
 	  || die "rename: $wctmp to $output_path/wait_event_funcs_data.c: $!";
+	rename($ttmp, "$output_path/wait_event_timing_data.h")
+	  || die "rename: $ttmp to $output_path/wait_event_timing_data.h: $!";
 }
 # Generate the .sgml file.
 elsif ($gen_docs)
diff --git a/src/backend/utils/activity/meson.build b/src/backend/utils/activity/meson.build
index 1aa7ece52908c..1da4e216c4263 100644
--- a/src/backend/utils/activity/meson.build
+++ b/src/backend/utils/activity/meson.build
@@ -19,6 +19,7 @@ backend_sources += files(
   'pgstat_subscription.c',
   'pgstat_wal.c',
   'pgstat_xact.c',
+  'wait_event_timing.c',
 )
 
 # this includes a .c file with contents generated in ../../../include/activity,
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index 95635c7f56ce7..c8fab55b36321 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -41,8 +41,7 @@ static const char *pgstat_get_wait_io(WaitEventIO w);
 static uint32 local_my_wait_event_info;
 uint32	   *my_wait_event_info = &local_my_wait_event_info;
 
-#define WAIT_EVENT_CLASS_MASK	0xFF000000
-#define WAIT_EVENT_ID_MASK		0x0000FFFF
+/* WAIT_EVENT_CLASS_MASK / WAIT_EVENT_ID_MASK are defined in utils/wait_classes.h */
 
 /*
  * Hash tables for storing custom wait event ids and their names in
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 560659f956856..35f8b3f359dc5 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -417,6 +417,8 @@ XactSLRU	"Waiting to access the transaction status SLRU cache."
 ParallelVacuumDSA	"Waiting for parallel vacuum dynamic shared memory allocation."
 AioUringCompletion	"Waiting for another process to complete IO via io_uring."
 ShmemIndex	"Waiting to find or allocate space in shared memory."
+WaitEventTraceDSA	"Waiting for wait event trace dynamic shared memory allocation."
+WaitEventTimingDSA	"Waiting for wait event timing dynamic shared memory allocation."
 
 # No "ABI_compatibility" region here as WaitEventLWLock has its own C code.
 
diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c
new file mode 100644
index 0000000000000..b745d84eb1051
--- /dev/null
+++ b/src/backend/utils/activity/wait_event_timing.c
@@ -0,0 +1,3582 @@
+/*-------------------------------------------------------------------------
+ *
+ * wait_event_timing.c
+ *	  Per-backend wait event timing and histogram accumulation.
+ *
+ * This module provides Oracle-style wait event instrumentation: every
+ * call to pgstat_report_wait_start()/pgstat_report_wait_end() records
+ * the wait duration using clock_gettime() and accumulates per-event
+ * statistics (count, total nanoseconds, max, histogram) in shared memory.
+ *
+ * Overhead: two VDSO clock_gettime() calls per wait event transition
+ * (~40-100 ns total), plus a few memory writes to per-backend arrays.
+ * No locking is needed since each backend writes only to its own slot.
+ *
+ * Controlled by the wait_event_capture GUC (off | stats | trace,
+ * default off).  The 'stats' level activates the aggregated per-event
+ * counters; 'trace' additionally enables a per-session DSA-backed ring
+ * buffer of individual events for 10046-style analysis.
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/activity/wait_event_timing.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "utils/guc.h"
+#include "utils/guc_hooks.h"
+#include "utils/wait_event_timing.h"
+
+/*
+ * GUC variable -- always defined so the GUC system works even when
+ * compiled without --enable-wait-event-timing.  In stub builds the
+ * check_hook below rejects any value other than OFF.
+ */
+int			wait_event_capture = WAIT_EVENT_CAPTURE_OFF;
+
+/*
+ * GUC: cap on distinct LWLock tranches the per-backend hash table
+ * tracks individually.  Sized at server start (PGC_POSTMASTER).  See
+ * the description in guc_parameters.dat.  Always defined so the GUC
+ * machinery has a backing variable even on builds compiled without
+ * --enable-wait-event-timing; the value is unused outside that gate.
+ */
+int			wait_event_timing_max_tranches = 192;
+
+/*
+ * GUC: per-backend wait-event-trace ring buffer size, in kilobytes.
+ * Power of two; sized at server start.  Always defined so the GUC
+ * machinery has a backing variable even in stub builds.
+ */
+int			wait_event_trace_ring_size_kb = 4096;
+
+/*
+ * Records-per-ring derived from wait_event_trace_ring_size_kb at
+ * server start.  Set once during the postmaster's GUC initialisation;
+ * read by the writer hot path (via the per-ring cached mask) and by
+ * the allocator.  Stays at zero until the GUC framework has committed
+ * the boot value, after which any code reading it sees the final
+ * cluster-wide ring size.
+ */
+uint32		WaitEventTraceRingSize = 0;
+
+/*
+ * Enum value table consumed by guc.c.  Order matches the
+ * WaitEventCaptureLevel enum and the documented "off < stats < trace"
+ * ordering.
+ */
+const struct config_enum_entry wait_event_capture_options[] = {
+	{"off", WAIT_EVENT_CAPTURE_OFF, false},
+	{"stats", WAIT_EVENT_CAPTURE_STATS, false},
+	{"trace", WAIT_EVENT_CAPTURE_TRACE, false},
+	{NULL, 0, false}
+};
+
+StaticAssertDecl(lengthof(wait_event_capture_options) == (WAIT_EVENT_CAPTURE_TRACE + 2),
+				 "wait_event_capture_options length mismatch");
+
+/*
+ * GUC check hook for wait_event_trace_ring_size_kb.
+ *
+ * The ring size in records must be a power of two so the writer's
+ * mask-indexing (pos & ring_mask) works.  Since each record is exactly
+ * 32 bytes, the kilobyte value is a power of two iff records-count is
+ * (kb * 32 is a power of two iff kb is, as 32 itself is).
+ *
+ * Defined for both build configurations so the GUC framework can
+ * validate the value uniformly; the value itself is unused in stub
+ * builds.
+ */
+bool
+check_wait_event_trace_ring_size_kb(int *newval, void **extra, GucSource source)
+{
+	int		kb = *newval;
+
+	/* a positive power of two has exactly one bit set */
+	if (kb > 0 && (kb & (kb - 1)) == 0)
+		return true;
+
+	GUC_check_errdetail("wait_event_trace_ring_size_kb must be a positive power of two.");
+	return false;
+}
+
+#ifndef USE_WAIT_EVENT_TIMING
+
+/*
+ * Stub SQL functions when compiled without --enable-wait-event-timing.
+ * These are referenced by pg_proc.dat and must exist as symbols.
+ */
+#include "fmgr.h"
+#include "funcapi.h"
+#include "utils/guc_hooks.h"
+
+Datum		pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS);
+Datum		pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS);
+Datum		pg_get_wait_event_trace(PG_FUNCTION_ARGS);
+Datum		pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS);
+Datum		pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS);
+Datum		pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS);
+Datum		pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS);
+
+Datum
+pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS)
+{
+	InitMaterializedSRF(fcinfo, 0);
+	PG_RETURN_VOID();
+}
+
+Datum
+pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS)
+{
+	InitMaterializedSRF(fcinfo, 0);
+	PG_RETURN_VOID();
+}
+
+Datum
+pg_get_wait_event_trace(PG_FUNCTION_ARGS)
+{
+	InitMaterializedSRF(fcinfo, 0);
+	PG_RETURN_VOID();
+}
+
+Datum
+pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS)
+{
+	InitMaterializedSRF(fcinfo, 0);
+	PG_RETURN_VOID();
+}
+
+Datum
+pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("wait event capture is not supported by this build"),
+			 errhint("Compile PostgreSQL with --enable-wait-event-timing.")));
+	PG_RETURN_VOID();
+}
+
+Datum
+pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS)
+{
+	ereport(ERROR,
+			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			 errmsg("wait event capture is not supported by this build"),
+			 errhint("Compile PostgreSQL with --enable-wait-event-timing.")));
+	PG_RETURN_VOID();
+}
+
+Datum
+pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS)
+{
+	/*
+	 * In stub builds the trace ring infrastructure does not exist, so
+	 * there can never be any orphaned rings to clear.  Return 0 rather
+	 * than erroring; this lets monitoring scripts call the function
+	 * unconditionally without branching on the build flag.
+	 */
+	PG_RETURN_INT64(0);
+}
+
+/*
+ * Symbols below are referenced unconditionally by other files, so the
+ * stub build must define them; timing builds define them after the #else.
+ */
+/*
+ * GUC check hook for the stub build.  Any value other than 'off' is
+ * meaningless without --enable-wait-event-timing, so interactive
+ * attempts are rejected; values from non-interactive sources (source <
+ * PGC_S_INTERACTIVE) are forced to 'off' with a WARNING instead.
+ */
+bool
+check_wait_event_capture(int *newval, void **extra, GucSource source)
+{
+	if (*newval == WAIT_EVENT_CAPTURE_OFF)
+		return true;
+
+	if (source >= PGC_S_INTERACTIVE)
+	{
+		GUC_check_errdetail("This build does not support wait event capture.");
+		GUC_check_errhint("Compile PostgreSQL with --enable-wait-event-timing.");
+		return false;
+	}
+
+	/* non-interactive source: warn and force the value to 'off' */
+	ereport(WARNING,
+			(errmsg("wait_event_capture is not supported by this build, "
+					"forcing to \"off\""),
+			 errhint("Compile PostgreSQL with --enable-wait-event-timing.")));
+	*newval = WAIT_EVENT_CAPTURE_OFF;
+	return true;
+}
+
+/* Stub GUC assign hook -- nothing to do without compile-time support. */
+void
+assign_wait_event_capture(int newval, void *extra)
+{
+}
+
+/*
+ * Stub shmem callbacks registered from storage/subsystemlist.h.  In the
+ * non-timing build no shared memory is reserved: both request_fn and
+ * init_fn are NULL, which RegisterShmemCallbacks() treats as no-ops.
+ */
+const ShmemCallbacks WaitEventTimingShmemCallbacks = {0};
+const ShmemCallbacks WaitEventTraceControlShmemCallbacks = {0};
+
+void
+pgstat_set_wait_event_timing_storage(int procNumber)
+{
+}
+
+void
+pgstat_reset_wait_event_timing_storage(void)
+{
+}
+
+/*
+ * Stub trace-marker entry points.  Declared unconditionally in
+ * wait_event_timing.h so that call sites in execMain.c,
+ * backend_status.c, and postgres.c do not need #ifdef
+ * USE_WAIT_EVENT_TIMING guards around the call.  No-ops here in the
+ * stub build: there is no ring to write to and no infrastructure to
+ * initialise.
+ */
+void
+wait_event_trace_query_start(int64 query_id)
+{
+}
+
+void
+wait_event_trace_query_end(int64 query_id)
+{
+}
+
+void
+wait_event_trace_exec_start(int64 query_id)
+{
+}
+
+void
+wait_event_trace_exec_end(int64 query_id)
+{
+}
+
+#else							/* USE_WAIT_EVENT_TIMING */
+
+#include "catalog/pg_authid.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "nodes/queryjumble.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procnumber.h"
+#include "storage/shmem.h"
+#include "catalog/pg_type_d.h"
+#include "utils/acl.h"
+#include "utils/array.h"
+#include "utils/backend_status.h"
+#include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/guc_hooks.h"
+#include "utils/injection_point.h"
+#include "utils/tuplestore.h"
+#include "utils/wait_event.h"
+
+#define NUM_WAIT_EVENT_TIMING_SLOTS  (MaxBackends + NUM_AUXILIARY_PROCS)
+
+#define HAS_PGSTAT_PERMISSIONS(role) \
+	(has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS) || \
+	 has_privs_of_role(GetUserId(), role))
+
+/* Pointer to this backend's timing state */
+WaitEventTimingState *my_wait_event_timing = NULL;
+
+/* Pointer to this backend's trace ring buffer */
+static WaitEventTraceState *my_wait_event_trace = NULL;
+
+/*
+ * Backend-local copy of the last reset generation we acted on.  Compared
+ * against the shared pg_atomic_uint32 reset_generation in this backend's
+ * WaitEventTimingState slot at every wait_end.  When the shared value
+ * differs, the owning backend performs the reset of its own counters on
+ * behalf of whoever called pg_stat_reset_wait_event_timing(target).
+ *
+ * This makes cross-backend reset a lock-free request-response: the caller
+ * bumps the atomic (and wakes the target's latch so idle backends notice);
+ * the owning backend clears its counters at a safe point.  Because only the
+ * owning backend ever writes its slot, there is no race between writers and
+ * resetters -- the reset happens inline inside the single-writer hot path.
+ */
+static uint32 my_last_reset_generation = 0;
+
+/*
+ * DSA-based shared timing array control.
+ *
+ * The per-backend WaitEventTimingState array is allocated lazily in DSA
+ * on the first SET wait_event_capture = stats|trace in the cluster.
+ * This avoids ~11-113 MB of eager shmem allocation at postmaster start
+ * when the feature is compiled in but turned off at runtime (the common
+ * case).  See wait_event_timing_attach_array().
+ *
+ * The control struct itself lives in the small fixed shmem region; it
+ * holds a DSA handle and a dsa_pointer to the allocated array.
+ */
+typedef struct WaitEventTimingControl
+{
+	LWLock		lock;			/* protects first-time DSA create + array alloc */
+	dsa_handle	timing_dsa_handle;	/* DSA_HANDLE_INVALID until first enable */
+	dsa_pointer timing_array;	/* InvalidDsaPointer until first enable */
+} WaitEventTimingControl;
+
+static WaitEventTimingControl *WaitEventTimingCtl = NULL;
+static dsa_area *timing_dsa = NULL;
+
+/*
+ * Backend-local cached pointer to the start of the shared array, set
+ * on first lazy-attach.  Readers of other backends' slots (pg_stat_*)
+ * attach on demand and use this cache for the rest of the SRF call.
+ * Writers access their own slot exclusively via my_wait_event_timing.
+ *
+ * Slots in this region are NOT laid out as a simple C array -- per
+ * the layout description on WaitEventTimingState (in
+ * src/include/utils/wait_event_timing.h), each slot has a
+ * runtime-determined stride (header + variable-size hash arrays).
+ * Use wet_slot(idx) below to index into it.
+ */
+static char *WaitEventTimingArray = NULL;
+
+/*
+ * Per-backend slot stride within WaitEventTimingArray.  Set at first
+ * attach from the GUC value at the time of allocation; constant for
+ * the cluster's lifetime once the DSA is allocated.
+ */
+static Size wait_event_timing_per_backend_stride = 0;
+
+/*
+ * Effective hash sizing.  Both values are derived from the GUC
+ * wait_event_timing_max_tranches at allocation time and stored in
+ * each slot's LWLockTimingHash header; cached here as backend-local
+ * for use by code that needs the values before resolving a slot
+ * (e.g., the allocation code itself).
+ */
+static int	wait_event_timing_hash_size = 0;
+static int	wait_event_timing_max_entries = 0;
+
+/*
+ * Round up to the next power of two (minimum 32, capped at 2^30 so the
+ * loop cannot overflow int).  The hash slot count must be a power of two
+ * for the mask-based modulo in the lookup hot path; we target >= 2x the
+ * entry cap so the load factor stays at or below 50%.
+ */
+static int
+wait_event_timing_hash_size_for(int max_entries)
+{
+	int		size = 32;
+
+	while (size < (1 << 30) && size / 2 < max_entries)
+		size <<= 1;
+	return size;
+}
+
+/*
+ * Compute the per-backend slot size for the given max_entries.  Each
+ * slot is laid out as
+ *
+ *     [ WaitEventTimingState header ]
+ *     [ LWLockTimingHashEntry[hash_size] ]
+ *     [ WaitEventTimingEntry[max_entries]    <- lwlock_events[] ]
+ *
+ * with no padding between sections (the structs already pack
+ * 8-byte-aligned).
+ */
+static Size
+wait_event_timing_slot_size(int max_entries)
+{
+	Size		total = sizeof(WaitEventTimingState);
+	int			nslots = wait_event_timing_hash_size_for(max_entries);
+
+	total = add_size(total, mul_size(nslots, sizeof(LWLockTimingHashEntry)));
+	return add_size(total, mul_size(max_entries, sizeof(WaitEventTimingEntry)));
+}
+
+/* Resolve the address of slot `idx` within WaitEventTimingArray. */
+static inline WaitEventTimingState *
+wet_slot(int idx)
+{
+	return (WaitEventTimingState *)
+		(WaitEventTimingArray + (Size) idx * wait_event_timing_per_backend_stride);
+}
+
+/*
+ * Address of the LWLock hash slot table for the given slot's lwlock_hash
+ * header.  The slot table immediately follows the WaitEventTimingState
+ * header in memory; hash_size in the LWLockTimingHash header tells us
+ * how many entries follow.
+ */
+static inline LWLockTimingHashEntry *
+wet_lwlock_hash_entries(WaitEventTimingState *state)
+{
+	return (LWLockTimingHashEntry *)((char *) state + sizeof(WaitEventTimingState));
+}
+
+/*
+ * Address of the dense LWLock events array for the given slot.  It
+ * immediately follows the slot table.
+ */
+static inline WaitEventTimingEntry *
+wet_lwlock_hash_events(WaitEventTimingState *state)
+{
+	return (WaitEventTimingEntry *)
+		((char *) state + sizeof(WaitEventTimingState)
+		 + (Size) state->lwlock_hash.hash_size * sizeof(LWLockTimingHashEntry));
+}
+
+/* DSA-based trace ring buffer control */
+static WaitEventTraceControl *WaitEventTraceCtl = NULL;
+static dsa_area *trace_dsa = NULL;
+int			my_trace_proc_number = -1;
+
+/*
+ * Same-backend coordination between pg_get_backend_wait_event_trace (the
+ * own-session SRF reader) and wait_event_trace_release_slot (the GUC
+ * step-down path that frees this backend's ring).  Both paths run in this
+ * same backend, single-threaded, so a plain bool is sufficient -- no
+ * atomics needed.
+ *
+ *   srf_in_progress   set true while the SRF is iterating the ring; the
+ *                     release path observes this and defers the dsa_free
+ *                     instead of yanking the chunk out from under us.
+ *
+ *   release_pending   set by the release path when it had to defer; the
+ *                     SRF's PG_FINALLY checks it and performs the deferred
+ *                     dsa_free after the iteration completes.
+ *
+ * Cross-backend readers (extensions, bgworkers reading another backend's
+ * ring) cannot use this mechanism -- they coordinate with the release
+ * path via WaitEventTraceCtl->lock instead.  See the header for the
+ * recommended snapshot-under-lock pattern for those consumers.
+ */
+static bool wait_event_trace_srf_in_progress = false;
+static bool wait_event_trace_release_pending = false;
+
+/*
+ * Per-backend gate that disables the trace-ring writer in the wait-
+ * event hot path while a slot-state transition is in progress.
+ *
+ * Set true around code paths that either free the local trace ring
+ * (wait_event_trace_release_slot's dsa_free) or transition the slot
+ * out of OWNED (wait_event_trace_before_shmem_exit's OWNED ->
+ * ORPHANED publish).  In both cases an internal LWLock inside
+ * dsa_free / dsa_attach / dsa_pin_mapping / dsa_pin can in
+ * principle contend long enough to dispatch a wait event; that
+ * wait event's pgstat_report_wait_end_timing inline path runs in
+ * the SAME backend, sees capture_level == TRACE (the GUC hasn't
+ * been committed yet by the time the assign hook runs), and would:
+ *
+ *   * during release_slot's dsa_free: write into a ring that has
+ *     already been returned to the DSA freelist -- if another
+ *     allocator has since reused the chunk, this is a stray write
+ *     into someone else's allocation.
+ *
+ *   * during release_slot's dsa_free, alternative timing: see
+ *     my_wait_event_trace == NULL on a naive "clear before free"
+ *     fix and recurse into wait_event_trace_attach, which would
+ *     either deadlock on the WaitEventTraceCtl->lock the outer
+ *     release_slot already holds, or (on a lock-free moment)
+ *     allocate a fresh ring that the outer release_slot would
+ *     then free again as part of its post-acquire DsaPointerIsValid
+ *     check -- a different use-after-free of a freshly-allocated
+ *     chunk.
+ *
+ *   * during before_shmem_exit: write into the ring after the slot
+ *     has been published as ORPHANED, violating the post-mortem
+ *     read-only contract that cross-backend readers rely on.
+ *
+ * The flag is per-backend (static at file scope means per-process
+ * in PG's process-per-backend model), so the hot path's check is a
+ * single cache-warm load and a branch; no atomic, no fence.  The
+ * trace branch is already gated by capture_level == TRACE so the
+ * additional check costs nothing in the common case where capture
+ * is off or stats-only.  The flag is set on the very same backend
+ * that may later read it from the hot path, so there is no
+ * cross-process visibility concern.
+ *
+ * See the release_slot and before_shmem_exit doc comments for the
+ * specific transition each uses this flag around; the three hazards
+ * listed above are the use-after-free analysis behind both of them.
+ */
+static bool wait_event_trace_writes_disabled = false;
+
+/* Forward declarations for lazy-attach helpers */
+static void wait_event_timing_ensure_dsa(void);
+static void pgstat_wait_event_timing_before_shmem_exit(int code, Datum arg);
+
+/*
+ * Per-backend shutdown gate.  Set true in the before_shmem_exit
+ * callback so the wait-event hot path can detect that DSA mappings
+ * may already be torn down by dsm_backend_shutdown (which shmem_exit()
+ * calls after the before_shmem_exit callbacks) and skip every path that
+ * would dereference my_wait_event_timing or attempt a fresh
+ * lazy_attach.  Once true, the backend's wait events are silently
+ * dropped for the remainder of proc_exit -- the backend is going
+ * away anyway, and the alternative is a SIGSEGV.
+ */
+bool		wait_event_timing_writes_disabled = false;
+static bool wait_event_timing_attach_array(bool allocate_if_missing);
+static void wait_event_trace_release_slot(int procNumber);
+
+/*
+ * Mapping arrays for the flat events[] array, generated from
+ * wait_event_names.txt by generate-wait_event_types.pl.
+ * Defines: WAIT_EVENT_TIMING_RAW_CLASSES, WAIT_EVENT_TIMING_DENSE_CLASSES,
+ *          WAIT_EVENT_TIMING_NUM_EVENTS, and the four mapping arrays.
+ */
+#include "utils/wait_event_timing_data.h"
+
+/*
+ * Convert wait_event_info to a flat index for the events[] array.
+ * For bounded classes, eventId equals the array index within the class
+ * (the enum values start at PG_WAIT_<CLASS> and increment by one).
+ *
+ * Class extraction follows the same idiom as pgstat_get_wait_event_type:
+ * mask off the class bits and compare against the full PG_WAIT_*
+ * constants, rather than shifting both sides down to a byte.  The
+ * dense-table lookup still needs the byte-form class id, but that
+ * conversion is now an isolated array-index step rather than a
+ * load-bearing piece of encoding-layout knowledge in the comparison.
+ */
+static int
+wait_event_timing_index(uint32 wait_event_info)
+{
+	const uint32 wait_class = wait_event_info & WAIT_EVENT_CLASS_MASK;
+	const int	event_no = wait_event_info & WAIT_EVENT_ID_MASK;
+	int			raw_class;
+	int			dense_class;
+
+	if (wait_class == PG_WAIT_LWLOCK)
+		return WAIT_EVENT_TIMING_IDX_LWLOCK;
+
+	/* Byte-form class id; reject classes past the generated tables. */
+	raw_class = wait_class >> 24;
+	if (unlikely(raw_class >= WAIT_EVENT_TIMING_RAW_CLASSES))
+		return -1;
+
+	/* A negative dense id or an out-of-range event has no flat slot. */
+	dense_class = wait_event_class_dense[raw_class];
+	if (unlikely(dense_class < 0 ||
+				 event_no >= wait_event_class_nevents[dense_class]))
+		return -1;
+
+	return wait_event_class_offset[dense_class] + event_no;
+}
+
+/*
+ * Reset a slot's LWLockTimingHash to its empty initial state.
+ *
+ * Takes a WaitEventTimingState rather than a bare LWLockTimingHash
+ * because the slot table (entries[]) and dense events array
+ * (lwlock_events[]) live as variable-size regions following the
+ * WaitEventTimingState header in memory; their sizes are runtime-
+ * determined by wait_event_timing_max_tranches.  The hash header's
+ * hash_size and max_entries fields are immutable after allocation
+ * and are NOT reset here.
+ */
+static void
+lwlock_timing_hash_clear(WaitEventTimingState *state)
+{
+	LWLockTimingHash *hdr = &state->lwlock_hash;
+	LWLockTimingHashEntry *probe = wet_lwlock_hash_entries(state);
+	WaitEventTimingEntry *stats = wet_lwlock_hash_events(state);
+	int			left;
+
+	for (left = hdr->hash_size; left > 0; left--, probe++)
+	{
+		probe->tranche_id = LWLOCK_TIMING_EMPTY_SLOT;	/* slot is free again */
+		probe->dense_idx = 0;
+	}
+	memset(stats, 0, (Size) hdr->max_entries * sizeof(WaitEventTimingEntry));
+	hdr->num_used = 0;			/* hash_size / max_entries stay as allocated */
+}
+
+/*
+ * Maximum number of probes attempted on the lookup hot path once the
+ * table is at capacity.  At cap there is no further insertion
+ * possible, so an unknown tranche cannot be recorded; the only useful
+ * work the loop can do is find an existing entry within its
+ * probe-distance window.  Bounding the scan caps the per-event cost at
+ * the cap-overflow regime to a constant, instead of paying ~2-3 probes
+ * (worst-case clusters: many more) on every unknown-tranche wait_end
+ * for the remainder of the backend lifetime.
+ *
+ * The bound (8) is well above the expected probe distance at this
+ * table's load factor (linear-probing miss expected length ~1.78 at
+ * 37.5% load; P99 fits comfortably in 8).  The loop examines the home
+ * slot plus the next 7, so an entry displaced 8 or more slots from its
+ * hash slot will not be found at cap.  That is possible but rare at
+ * the load factors we target (probability < 1e-3) and is the right
+ * trade against the common at-cap unknown-tranche cost.
+ */
+#define LWLOCK_TIMING_LOOKUP_AT_CAP_PROBE_LIMIT 8
+
+/*
+ * Look up (or insert) the timing entry for an LWLock tranche ID; takes
+ * the whole WaitEventTimingState so the variable-size entries[] and
+ * lwlock_events[] arrays behind the header can be addressed.  Returns
+ * NULL when the tranche cannot be tracked: table full, bounded at-cap
+ * probe missed, or tranche_id equal to the empty-slot sentinel.
+ */
+static WaitEventTimingEntry *
+lwlock_timing_lookup(WaitEventTimingState *state, uint16 tranche_id)
+{
+	LWLockTimingHash *ht = &state->lwlock_hash;
+	LWLockTimingHashEntry *entries = wet_lwlock_hash_entries(state);
+	WaitEventTimingEntry *events = wet_lwlock_hash_events(state);
+	uint32		hash = (uint32) tranche_id * 2654435761U;
+	int			slot = hash & (ht->hash_size - 1);	/* mask: assumes power-of-2 size */
+	int			limit;
+	int			i;
+
+	/* The sentinel is not a key: it would "match" the first free slot. */
+	if (unlikely(tranche_id == LWLOCK_TIMING_EMPTY_SLOT))
+		return NULL;
+
+	/* At cap, bound the probe; see LWLOCK_TIMING_LOOKUP_AT_CAP_PROBE_LIMIT. */
+	limit = (ht->num_used >= ht->max_entries)
+		? LWLOCK_TIMING_LOOKUP_AT_CAP_PROBE_LIMIT
+		: ht->hash_size;
+
+	for (i = 0; i < limit; i++)
+	{
+		LWLockTimingHashEntry *e = &entries[slot];
+
+		if (e->tranche_id == tranche_id)
+			return &events[e->dense_idx];	/* already tracked */
+
+		if (e->tranche_id == LWLOCK_TIMING_EMPTY_SLOT)
+		{
+			if (ht->num_used >= ht->max_entries)
+				return NULL;	/* unknown tranche and no room left */
+
+			e->tranche_id = tranche_id;
+			e->dense_idx = ht->num_used++;	/* next unused dense stats entry */
+			return &events[e->dense_idx];
+		}
+
+		slot = (slot + 1) & (ht->hash_size - 1);	/* linear probe, wrapping */
+	}
+
+	return NULL;
+}
+
+/*
+ * Compute histogram bucket index for a duration in nanoseconds.
+ *
+ * Bin edges are powers of two directly on nanoseconds: bucket i covers
+ * [2^(i+9), 2^(i+10)) ns for 0 < i < NBUCKETS-1, bucket 0 covers
+ * [0, 1024) ns, and the last bucket covers [2^(NBUCKETS+8), inf) ns.
+ * The boundaries approximate the decimal-microsecond grid (1024 ≈ 1 us,
+ * 2048 ≈ 2 us, ... 2^33 ≈ 8.6 s) close enough for a diagnostic
+ * histogram while letting us skip the strength-reduced /1000 on the
+ * hot path.
+ *
+ * See the rationale comment on WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS in
+ * wait_event_timing.h for why the bucket count is 32 (last bucket opens
+ * at 2^40 ns, ~18 min) rather than 16 (last bucket at 2^24 ns, ~16.8 ms).
+ */
+static int
+wait_event_timing_bucket(int64 duration_ns)
+{
+	const int	last_bucket = WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS - 1;
+	int			msb_pos;
+
+	/*
+	 * Bucket 0 takes every duration below 1024 ns, including zero and
+	 * negative values.  Bailing out here also means
+	 * pg_leftmost_one_pos64 never sees a zero word.
+	 */
+	if (duration_ns < 1024)
+		return 0;
+
+	/* 2^10 ns (bit 10) opens bucket 1, so bucket = top bit position - 9. */
+	msb_pos = pg_leftmost_one_pos64((uint64) duration_ns);
+
+	/* Durations past the histogram's range pile up in the last bucket. */
+	return (msb_pos - 9 > last_bucket) ? last_bucket : msb_pos - 9;
+}
+
+/*
+ * Write a trace ring marker record.  Shared helper for all marker types.
+ */
+static void
+wait_event_trace_write_marker(uint8 record_type, int64 query_id)
+{
+	uint64	pos;
+	WaitEventTraceRecord *rec;
+	uint32	seq;
+	instr_time now;
+
+	/*
+	 * Single capture-level gate: markers only land in the ring when
+	 * wait_event_capture is at TRACE.  This guarantees consistency with
+	 * the wait-event hot path (also gated on the same level) -- there is
+	 * no configuration in which one half of the trace fires and the
+	 * other doesn't.  query_id == 0 means "no query ID available"
+	 * (utility command or compute_query_id = off), which we skip.
+	 *
+	 * wait_event_trace_writes_disabled is the same per-backend gate
+	 * the wait-event hot path uses; it is raised by release_slot and
+	 * before_shmem_exit around slot-state transitions to keep both
+	 * writers consistent.  Markers cannot fire during those
+	 * transitions today (single-threaded execution, no nested
+	 * executor), but checking here keeps the contract uniform
+	 * across all trace-ring writers and is robust to future code
+	 * paths that might invoke a marker from a nested context.
+	 *
+	 * No likely()/unlikely() annotation: this function is called at
+	 * query/exec boundaries (a handful per query, not per wait event),
+	 * so neither side of the branch dominates often enough for static
+	 * layout to matter, and the meaningful production configuration
+	 * (wait_event_capture = trace) is exactly when the body is hot --
+	 * an annotation on the early-return would point the wrong way.
+	 */
+	if (wait_event_capture != WAIT_EVENT_CAPTURE_TRACE ||
+		wait_event_trace_writes_disabled ||
+		query_id == 0)
+		return;
+
+	/*
+	 * Lazy attach on first use.  Allocation lives here (not in the
+	 * assign hook) because dsa_allocate_extended() can ereport(ERROR)
+	 * on OOM, which is forbidden in assign-hook context but legitimate
+	 * here.  Idempotent: wait_event_trace_attach() short-circuits on
+	 * subsequent calls.
+	 */
+	if (my_wait_event_trace == NULL)
+	{
+		if (my_trace_proc_number < 0)
+			return;
+		wait_event_trace_attach(my_trace_proc_number);
+		if (my_wait_event_trace == NULL)
+			return;			/* attach path unable to allocate */
+	}
+
+	/*
+	 * Claim the next slot.  Single-writer counter (only the owning backend
+	 * writes its own ring), so a plain read+write is sufficient and avoids
+	 * the LOCK XADD that pg_atomic_fetch_add_u64 would emit -- a wasted
+	 * cache-coherence trip for a counter that has exactly one writer.
+	 * Cross-backend readers use pg_atomic_read_u64, which
+	 * compiles to a plain MOV on x86 and tolerates concurrent writes here
+	 * (their actual safety against the records[] window is the per-record
+	 * seqlock below).  Same idiom as injection_point.c's per-entry
+	 * generation counter (single writer + multiple lock-free readers).
+	 */
+	pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos);
+	pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1);
+	rec = &my_wait_event_trace->records[pos & my_wait_event_trace->ring_mask];
+	seq = (uint32)(pos * 2 + 1);	/* odd value: record is being written */
+
+	rec->seq = seq;
+	pg_write_barrier();		/* release: payload stores must not rise above seq=odd */
+
+	INSTR_TIME_SET_CURRENT(now);
+	rec->record_type = record_type;
+	rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now);
+	rec->data.query.query_id = query_id;
+	rec->data.query.pad2 = 0;
+
+	pg_write_barrier();		/* release: payload stores must land before seq=even */
+	rec->seq = seq + 1;		/* even value: record is complete */
+}
+
+void
+wait_event_trace_query_start(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_QUERY_START, query_id);	/* helper drops query_id == 0 */
+}
+
+void
+wait_event_trace_query_end(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_QUERY_END, query_id);	/* helper drops query_id == 0 */
+}
+
+void
+wait_event_trace_exec_start(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_EXEC_START, query_id);	/* helper drops query_id == 0 */
+}
+
+void
+wait_event_trace_exec_end(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_EXEC_END, query_id);	/* helper drops query_id == 0 */
+}
+
+/*
+ * Report and initialize shared memory for wait event timing.
+ *
+ * Registered via the shmem subsystem registry in
+ * src/include/storage/subsystemlist.h.  Only the small control struct
+ * is in fixed shmem; the per-backend WaitEventTimingState array
+ * (~30 KB/backend) is allocated lazily in DSA on first enable by any
+ * backend (see wait_event_timing_attach_array).
+ */
+static void
+WaitEventTimingShmemRequest(void *arg)
+{
+	ShmemRequestStruct(.name = "WaitEventTimingControl",
+					   .size = sizeof(WaitEventTimingControl),	/* control struct only */
+					   .ptr = (void **) &WaitEventTimingCtl);
+}
+
+static void
+WaitEventTimingShmemInit(void *arg)
+{
+	LWLockInitialize(&WaitEventTimingCtl->lock,
+					 LWTRANCHE_WAIT_EVENT_TIMING_DSA);
+	WaitEventTimingCtl->timing_dsa_handle = DSA_HANDLE_INVALID;	/* DSA not created yet */
+	WaitEventTimingCtl->timing_array = InvalidDsaPointer;	/* array not allocated yet */
+
+	WaitEventTimingArray = NULL;	/* filled in by wait_event_timing_attach_array */
+}
+
+const ShmemCallbacks WaitEventTimingShmemCallbacks = {
+	.request_fn = WaitEventTimingShmemRequest,
+	.init_fn = WaitEventTimingShmemInit,
+};
+
+/*
+ * Ensure the backend is attached to the timing DSA.
+ *
+ * The DSA is created by whichever backend first hits this function with
+ * an empty control struct; subsequent callers just attach to the
+ * existing handle.  The backend-local dsa_area pointer is cached in
+ * timing_dsa for the backend's lifetime.
+ */
+static void
+wait_event_timing_ensure_dsa(void)
+{
+	MemoryContext oldcontext;
+
+	if (timing_dsa != NULL)
+		return;					/* already created or attached in this backend */
+
+	if (WaitEventTimingCtl == NULL)
+		return;					/* pre-ShmemInit; nothing to attach to */
+
+	oldcontext = MemoryContextSwitchTo(TopMemoryContext);	/* presumably so the dsa_area is long-lived -- confirm */
+
+	LWLockAcquire(&WaitEventTimingCtl->lock, LW_EXCLUSIVE);
+
+	if (WaitEventTimingCtl->timing_dsa_handle == DSA_HANDLE_INVALID)
+	{
+		timing_dsa = dsa_create(LWTRANCHE_WAIT_EVENT_TIMING_DSA);
+		dsa_pin(timing_dsa);	/* creator only; attachers just pin their mapping */
+		dsa_pin_mapping(timing_dsa);
+		WaitEventTimingCtl->timing_dsa_handle = dsa_get_handle(timing_dsa);	/* publish for attachers */
+	}
+	else
+	{
+		timing_dsa = dsa_attach(WaitEventTimingCtl->timing_dsa_handle);
+		dsa_pin_mapping(timing_dsa);
+	}
+
+	LWLockRelease(&WaitEventTimingCtl->lock);
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Attach this backend to the shared WaitEventTimingArray, allocating
+ * it in DSA on first use if allocate_if_missing is true.
+ *
+ * Returns true if the array is now available (WaitEventTimingArray is
+ * non-NULL on return); false otherwise.  Readers pass allocate_if_missing
+ * = false to avoid allocating a big array just because somebody ran
+ * SELECT against an empty pg_stat view.  Writers (hot path) pass true
+ * so that the first wait event under wait_event_capture != off creates
+ * the storage.
+ *
+ * Re-entrancy guard.  Internal operations below (dsa_create,
+ * dsa_allocate_extended, the LWLockAcquire inside ensure_dsa) can
+ * emit LWLock wait events of their own, which feed into the wait-end
+ * timing hot path; under wait_event_capture >= STATS that hot path
+ * lazy-attaches by calling back into this function.  Without the
+ * guard we would either deadlock on WaitEventTimingCtl->lock or
+ * recurse with a half-initialised slot pointer.
+ *
+ * The same hazard applies in wait_event_trace_attach (which also runs
+ * dsa_allocate / LWLock under its body) and in
+ * wait_event_trace_release_slot (whose dsa_free takes a DSA-internal
+ * LWLock that can in principle emit a wait event during shutdown
+ * sequences).  Each function carries its own static bool guard close
+ * to the code it protects, matching the established PG idiom for
+ * function-local re-entry guards (see, e.g., in_vacuum in
+ * src/backend/commands/vacuum.c, in_streamed_transaction in
+ * src/backend/replication/logical/worker.c).  We deliberately do NOT
+ * collapse these into a shared bitmask because:
+ *   1. PG style places re-entry flags adjacent to the function they
+ *      protect, not in a centralised module-level state structure.
+ *   2. The three guarded functions are independent: a re-entry into
+ *      one of them while another is in flight is a legitimate pattern
+ *      (e.g., release_slot can be triggered by an assign hook that
+ *      itself ran while attach was in progress earlier).  A shared
+ *      flag would conservatively block those legal cases.
+ *
+ * If you add a fourth re-entrant function in this file, follow the
+ * same shape: a `static bool in_<verb> = false;` at the top of the
+ * function, an early-return `if (in_<verb>) return ...;`, set true
+ * before the body, clear in PG_FINALLY so an ereport(ERROR) cannot
+ * leave the flag stuck set.
+ */
+static bool
+wait_event_timing_attach_array(bool allocate_if_missing)
+{
+	static bool in_attach = false;
+	bool		attached = false;
+
+	if (WaitEventTimingArray != NULL)
+		return true;
+
+	if (WaitEventTimingCtl == NULL)
+		return false;
+
+	if (in_attach)
+		return false;
+
+	in_attach = true;
+	PG_TRY();
+	{
+		wait_event_timing_ensure_dsa();
+
+		if (WaitEventTimingCtl->timing_array == InvalidDsaPointer)
+		{
+			if (!allocate_if_missing)
+			{
+				attached = false;
+			}
+			else
+			{
+				int		max_entries;
+				int		hash_size;
+				Size	stride;
+				Size	total;
+
+				/*
+				 * Snapshot the GUC at allocation time and use the same
+				 * value for every slot in the cluster.  This is the
+				 * cluster-wide first-enable allocation; subsequent
+				 * backends that attach reuse these dimensions, even if
+				 * the GUC has somehow been changed in between (it
+				 * shouldn't, since it is PGC_POSTMASTER, but reading
+				 * once and storing the result keeps the contract
+				 * explicit).
+				 */
+				max_entries = wait_event_timing_max_tranches;
+				hash_size = wait_event_timing_hash_size_for(max_entries);
+				stride = wait_event_timing_slot_size(max_entries);
+				total = mul_size(NUM_WAIT_EVENT_TIMING_SLOTS, stride);
+
+				LWLockAcquire(&WaitEventTimingCtl->lock, LW_EXCLUSIVE);
+
+				if (WaitEventTimingCtl->timing_array == InvalidDsaPointer)
+				{
+					dsa_pointer p;
+					char	   *region;
+					int			i;
+
+					p = dsa_allocate_extended(timing_dsa, total,
+											  DSA_ALLOC_ZERO);
+					region = (char *) dsa_get_address(timing_dsa, p);
+
+					for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++)
+					{
+						WaitEventTimingState *slot;
+						LWLockTimingHashEntry *slot_entries;
+						int			j;
+
+						slot = (WaitEventTimingState *) (region + (Size) i * stride);
+
+						pg_atomic_init_u32(&slot->reset_generation, 0);
+						slot->lwlock_hash.num_used = 0;
+						slot->lwlock_hash.hash_size = hash_size;
+						slot->lwlock_hash.max_entries = max_entries;
+
+						/*
+						 * DSA_ALLOC_ZERO zeroed the region, but the empty
+						 * sentinel is 0xFFFF, not 0: mark each entry empty.
+						 */
+						slot_entries = (LWLockTimingHashEntry *)
+							((char *) slot + sizeof(WaitEventTimingState));
+						for (j = 0; j < hash_size; j++)
+							slot_entries[j].tranche_id = LWLOCK_TIMING_EMPTY_SLOT;
+					}
+
+					/* Readers may skip the lock: slots before pointer. */
+					pg_write_barrier();
+					WaitEventTimingCtl->timing_array = p;
+				}
+
+				LWLockRelease(&WaitEventTimingCtl->lock);
+
+				attached = true;
+			}
+		}
+		else
+		{
+			attached = true;
+		}
+
+		if (attached)
+		{
+			WaitEventTimingState *first;
+
+			WaitEventTimingArray = (char *)
+				dsa_get_address(timing_dsa,
+								WaitEventTimingCtl->timing_array);
+
+			/*
+			 * Pairs with the publisher's write barrier (timing_array may
+			 * have been read unlocked).  All slots share the first slot's
+			 * dimensions; cache the stride for wet_slot().
+			 */
+			pg_read_barrier();
+			first = (WaitEventTimingState *) WaitEventTimingArray;
+			wait_event_timing_max_entries = first->lwlock_hash.max_entries;
+			wait_event_timing_hash_size = first->lwlock_hash.hash_size;
+			wait_event_timing_per_backend_stride =
+				wait_event_timing_slot_size(wait_event_timing_max_entries);
+		}
+	}
+	PG_FINALLY();
+	{
+		in_attach = false;
+	}
+	PG_END_TRY();
+
+	return WaitEventTimingArray != NULL;
+}
+
+/*
+ * Point my_wait_event_timing at this backend's slot within the shared
+ * timing array, allocating the array in DSA on first call.
+ *
+ * Called from the hot path entry points pgstat_report_wait_start_timing()
+ * and pgstat_report_wait_end_timing() when wait_event_capture is non-OFF
+ * and my_wait_event_timing is still NULL.  After the first successful
+ * attach, my_wait_event_timing stays non-NULL for the backend's lifetime,
+ * so this function is reached only on the cold first-attach path.
+ */
+static void
+pgstat_wait_event_timing_lazy_attach(void)
+{
+	int			procNumber;
+	WaitEventTimingState *slot;
+
+	if (my_wait_event_timing != NULL)
+		return;
+
+	if (MyProc == NULL)
+		return;
+
+	/*
+	 * Lazy attach allocates memory (via wait_event_timing_attach_array ->
+	 * dsa_attach -> dsm_attach -> MemoryContextAlloc).  In a critical
+	 * section, MemoryContextAlloc Assert-fails on
+	 * "CritSectionCount == 0 || allowInCritSection".  A backend's very
+	 * first wait event after wait_event_capture is enabled can land
+	 * inside a critical section -- e.g. a parallel worker that hasn't
+	 * yet emitted any wait events does so for the first time in
+	 * BufferSetHintBits16 -> XLogSaveBufferForHint -> XLogInsert ->
+	 * LWLockAcquire, with XLogInsert holding a critical section.
+	 *
+	 * Skipping the attach in that case silently drops the in-flight
+	 * wait event but keeps the backend alive.  The very next wait
+	 * event outside any critical section will hit this function again
+	 * and attach successfully, after which the hot path no longer
+	 * routes through here.  Wait events emitted inside critical
+	 * sections are by their nature brief, infrequent (critical
+	 * sections are short by design), and would be dropped anyway if
+	 * the backend exited from a crash here -- so losing them at the
+	 * very-first-attach moment is an acceptable tradeoff against the
+	 * Assert-induced abort.
+	 */
+	if (CritSectionCount > 0)
+		return;
+
+	/*
+	 * Skip the attach if MyProc is already on an LWLock wait queue.
+	 * The wait-event hot path that called us runs INSIDE
+	 * LWLockAcquire after LWLockQueueSelf has set
+	 * MyProc->lwWaiting = LW_WS_WAITING, but BEFORE the actual
+	 * sleep.  Our wait_event_timing_attach_array calls
+	 * dsa_attach -> LWLockAcquire on its own LWLock; if that lock
+	 * is contended the nested LWLockQueueSelf would hit the
+	 * "queueing for lock while waiting on another one" PANIC that
+	 * LWLockQueueSelf raises when MyProc->lwWaiting is not
+	 * LW_WS_NOT_WAITING at queue-insert time.
+	 *
+	 * Skipping here drops the in-flight wait event from timing
+	 * stats but keeps the backend alive.  The next wait event
+	 * outside any LWLock-wait context will retry the attach
+	 * successfully; in practice every backend hits an uncontended
+	 * latch or PgSleep wait well within its first few seconds,
+	 * so the lost stats are at most a handful of contention
+	 * waits at backend startup.
+	 */
+	if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
+		return;
+
+	procNumber = GetNumberFromPGProc(MyProc);
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	if (!wait_event_timing_attach_array(true))
+		return;
+
+	slot = wet_slot(procNumber);
+
+	/*
+	 * Clear this backend's slot the first time it is used after backend
+	 * start.  The DSA-allocated region is zeroed on creation, but a later
+	 * backend may inherit a slot previously occupied by an exited
+	 * backend; explicit zero here keeps stats accurate across slot reuse.
+	 * Matches the old per-backend init performed by
+	 * pgstat_set_wait_event_timing_storage() in the eager-shmem design.
+	 *
+	 * Initialisation order: zero the slot through the local `slot` first,
+	 * THEN publish the result to my_wait_event_timing.  This keeps the
+	 * single-backend invariant clean: at no point in this backend can
+	 * `my_wait_event_timing != NULL` coincide with `*my_wait_event_timing`
+	 * being partially initialised.  The hot-path inline gate
+	 *
+	 *   if (unlikely(my_wait_event_timing == NULL))
+	 *       pgstat_wait_event_timing_lazy_attach();
+	 *   ... my_wait_event_timing->wait_start = ... ;
+	 *
+	 * relies on that ordering: a non-NULL pointer means the slot is
+	 * ready for the very next store.
+	 *
+	 * Note that cross-backend readers do NOT go through
+	 * my_wait_event_timing -- they index WaitEventTimingArray[procNumber]
+	 * directly via pgstat_get_wait_event_timing(), guarded by
+	 * pgstat_get_beentry_by_proc_number() which filters dead/recycled
+	 * slots.  So this reordering is a same-backend tidiness fix; it does
+	 * not address (and does not need to address) any cross-backend
+	 * publication ordering, of which there is none.
+	 */
+	memset(slot->events, 0, sizeof(slot->events));
+	lwlock_timing_hash_clear(slot);
+	slot->reset_count = 0;
+	slot->lwlock_overflow_count = 0;
+	slot->flat_overflow_count = 0;
+	slot->current_event = 0;
+	INSTR_TIME_SET_ZERO(slot->wait_start);
+
+	my_last_reset_generation = pg_atomic_read_u32(&slot->reset_generation);
+
+	/* Publish only after the slot is fully initialised. */
+	my_wait_event_timing = slot;
+
+	/*
+	 * Register a before_shmem_exit callback that raises
+	 * wait_event_timing_writes_disabled BEFORE dsm_backend_shutdown unmaps
+	 * the DSA segment that backs the slot.  Without this, late-shutdown
+	 * wait events (e.g. ProcArrayLock contention inside ProcArrayRemove
+	 * during shmem_exit) fire the inline hot path, dereference the
+	 * now-dangling slot pointer and segfault.  shmem_exit() runs all
+	 * before_shmem_exit callbacks and only then calls
+	 * dsm_backend_shutdown(), so the gate is up before the mapping goes
+	 * away.  The pointer is deliberately left set; see the callback.
+	 */
+	{
+		static bool registered = false;
+
+		if (!registered)
+		{
+			before_shmem_exit(pgstat_wait_event_timing_before_shmem_exit,
+							  (Datum) 0);
+			registered = true;
+		}
+	}
+}
+
+/*
+ * before_shmem_exit callback.  Disables the inline hot path so it
+ * does NOT dereference my_wait_event_timing during the rest of the
+ * proc_exit cascade (after dsm_backend_shutdown unmaps the DSA
+ * segment behind that pointer).  We deliberately do NOT null out
+ * my_wait_event_timing here: a NULL pointer would route the hot
+ * path through the lazy-attach branch, which then re-attaches a
+ * fresh slot using DSA primitives that themselves operate on
+ * already-detached memory.  Setting the gate flag stops both the
+ * dereference and the re-attach.
+ */
+static void
+pgstat_wait_event_timing_before_shmem_exit(int code, Datum arg)
+{
+	wait_event_timing_writes_disabled = true;	/* code and arg are unused */
+}
+
+/*
+ * Report the shared memory space needed for trace ring buffer control.
+ * Only a small control struct is in fixed shmem; the actual ring buffers
+ * are allocated lazily via DSA.  At ~24 bytes/slot, the slot array adds
+ * ~26 KB at a default MaxBackends, negligible compared to the ring
+ * memory itself.
+ */
+static Size
+WaitEventTraceControlShmemSize(void)
+{
+	return add_size(mul_size(sizeof(WaitEventTraceSlot),	/* slot array */
+							 NUM_WAIT_EVENT_TIMING_SLOTS),
+					offsetof(WaitEventTraceControl, trace_slots));	/* header */
+}
+
+static void
+WaitEventTraceControlShmemRequest(void *arg)
+{
+	ShmemRequestStruct(.name = "WaitEventTraceControl",
+					   .size = WaitEventTraceControlShmemSize(),	/* header + per-slot array */
+					   .ptr = (void **) &WaitEventTraceCtl);
+}
+
+/*
+ * Initialize shared memory for trace ring buffer control.
+ */
+static void
+WaitEventTraceControlShmemInit(void *arg)
+{
+	WaitEventTraceControl *ctl = WaitEventTraceCtl;
+	int			slotno;
+
+	LWLockInitialize(&ctl->lock, LWTRANCHE_WAIT_EVENT_TRACE_DSA);
+	ctl->trace_dsa_handle = DSA_HANDLE_INVALID;
+
+	/* Every slot starts FREE at generation 0 with no ring attached. */
+	for (slotno = 0; slotno < NUM_WAIT_EVENT_TIMING_SLOTS; slotno++)
+	{
+		pg_atomic_init_u64(&ctl->trace_slots[slotno].generation, 0);
+		pg_atomic_init_u32(&ctl->trace_slots[slotno].state, WAIT_EVENT_TRACE_SLOT_FREE);
+		ctl->trace_slots[slotno].ring_ptr = InvalidDsaPointer;
+		ctl->trace_slots[slotno].pad = 0;
+	}
+}
+
+const ShmemCallbacks WaitEventTraceControlShmemCallbacks = {
+	.request_fn = WaitEventTraceControlShmemRequest,
+	.init_fn = WaitEventTraceControlShmemInit,
+};
+
+/*
+ * Ensure the shared DSA for trace ring buffers exists and is attached.
+ * Creates it on first call (any backend), attaches on subsequent calls.
+ * Must be called from a backend context (not postmaster).
+ */
+static void
+wait_event_trace_ensure_dsa(void)
+{
+	MemoryContext oldcontext;
+
+	if (trace_dsa != NULL)
+		return;					/* already created or attached in this backend */
+
+	oldcontext = MemoryContextSwitchTo(TopMemoryContext);	/* presumably so the dsa_area is long-lived -- confirm */
+
+	LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);	/* NOTE(review): no NULL-Ctl guard here, unlike the timing variant */
+
+	if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID)
+	{
+		trace_dsa = dsa_create(LWTRANCHE_WAIT_EVENT_TRACE_DSA);
+		dsa_pin(trace_dsa);		/* creator only; attachers just pin their mapping */
+		dsa_pin_mapping(trace_dsa);
+		WaitEventTraceCtl->trace_dsa_handle = dsa_get_handle(trace_dsa);	/* publish for attachers */
+	}
+	else
+	{
+		trace_dsa = dsa_attach(WaitEventTraceCtl->trace_dsa_handle);
+		dsa_pin_mapping(trace_dsa);
+	}
+
+	LWLockRelease(&WaitEventTraceCtl->lock);
+
+	MemoryContextSwitchTo(oldcontext);
+}
+
+/*
+ * Transition our trace ring slot to ORPHANED on backend exit.
+ *
+ * Registered as a before_shmem_exit callback.  Runs BEFORE
+ * dsm_backend_shutdown() detaches the DSA.
+ *
+ * Crucially, we do NOT free the ring here.  The ring stays allocated in
+ * DSA so that cross-backend consumers -- the in-tree
+ * pg_get_wait_event_trace SRF and any extension following the
+ * snapshot pattern documented on WaitEventTraceControl -- can read
+ * the dying backend's final waits.  The original "free at exit"
+ * design lost data the instant a worker terminated, which was
+ * particularly bad for parallel workers exiting in milliseconds at
+ * end-of-parallel-query.  See the lifecycle comment on
+ * WaitEventTraceControl for the full design
+ * rationale and the bounded-memory cost we accept in exchange.
+ *
+ * The ORPHANED slot is reclaimed in one of two ways:
+ *   (a) a new backend at this procNumber calls
+ *       wait_event_trace_clear_orphan_at_init() at backend init, or
+ *   (b) the DBA calls pg_stat_clear_orphaned_wait_event_rings().
+ *
+ * State transition order matters: bump generation BEFORE storing the
+ * new state, so cross-backend readers that snapshot
+ * (generation_before, state, ring_ptr, generation_after) under the
+ * lock see a consistent (state, ring_ptr) pair iff generation didn't
+ * change.  We hold the lock for the whole transition, but readers do
+ * not have to (they just take it briefly to snapshot the ring
+ * contents); the generation check is what makes the unlocked-read
+ * path safe.
+ */
+static void
+wait_event_trace_before_shmem_exit(int code, Datum arg)
+{
+	int		procNumber = DatumGetInt32(arg);	/* slot index arrives as the callback Datum */
+	WaitEventTraceSlot *slot;
+
+	if (WaitEventTraceCtl == NULL)
+		return;
+
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+	/*
+	 * If this backend never ended up with an OWNED slot (e.g. capture
+	 * was off the whole session, or the trace was released back to FREE
+	 * via assign_wait_event_capture going trace -> off), there is
+	 * nothing to transition.  Read state without the lock first as a
+	 * fast-path check; the authoritative re-check happens under the
+	 * lock below.
+	 */
+	if (pg_atomic_read_u32(&slot->state) != WAIT_EVENT_TRACE_SLOT_OWNED)
+	{
+		wait_event_trace_writes_disabled = true;
+		my_wait_event_trace = NULL;
+		return;
+	}
+
+	/*
+	 * Disable trace-ring writes on this backend before we touch the
+	 * lock.  Writes after this point would race with the
+	 * OWNED -> ORPHANED state publish below: a wait event whose
+	 * end-timing path runs after the state has been published as
+	 * ORPHANED would write into a ring that the slot lifecycle
+	 * declares read-only post-mortem.  Cross-backend readers
+	 * snapshot ORPHANED rings without expecting concurrent writes
+	 * from the dying owner.  See wait_event_trace_writes_disabled
+	 * for the full UAF / contract-violation analysis.
+	 *
+	 * The flag stays true for the remainder of this backend's life
+	 * (we are in proc_exit; there is no subsequent capture re-enable
+	 * to handle), so we do not reset it.
+	 */
+	wait_event_trace_writes_disabled = true;
+
+	LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+	/*
+	 * Drop the local pointer inside the lock-held region as a
+	 * second line of defense; the writes-disabled flag above is
+	 * the primary gate.
+	 */
+	my_wait_event_trace = NULL;
+
+	if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_OWNED &&
+		DsaPointerIsValid(slot->ring_ptr))
+	{
+		/*
+		 * Bump generation first so any reader that snapped the old
+		 * generation will detect the change on its post-read recheck
+		 * and discard its read.  Then publish the ORPHANED state.
+		 * Keep ring_ptr valid -- the data is what we want to preserve.
+		 */
+		pg_atomic_fetch_add_u64(&slot->generation, 1);
+		pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_ORPHANED);
+	}
+
+	LWLockRelease(&WaitEventTraceCtl->lock);
+}
+
+/*
+ * Allocate (or re-acquire) a trace ring buffer for this backend via DSA.
+ * Called when wait_event_capture is set to 'trace'.
+ *
+ * Slot state at entry will be one of:
+ *
+ *   FREE     fresh slot (or one cleared on this backend's init by
+ *            wait_event_trace_clear_orphan_at_init): allocate a new
+ *            ring, transition slot to OWNED, bump generation.
+ *
+ *   OWNED    not expected in normal operation.  A trace -> stats/off
+ *            step-down goes through wait_event_trace_release_slot,
+ *            which frees the ring and returns the slot to FREE, and
+ *            a procNumber belongs to one backend at a time, so no
+ *            other backend can have attached at this index.  Should
+ *            we nevertheless arrive here with the cached
+ *            my_wait_event_trace cleared while the slot is still
+ *            OWNED with a valid ring_ptr, we tolerate it defensively
+ *            by re-mapping the existing ring rather than leaking a
+ *            second allocation.  OWNED without a valid ring_ptr
+ *            takes the allocate path, like FREE.
+ *
+ *   ORPHANED not expected for an inherited slot: a new backend's
+ *            pgstat_set_wait_event_timing_storage() calls
+ *            wait_event_trace_clear_orphan_at_init() before any
+ *            wait-event capture path can run.  It can still be seen
+ *            after our own exit callback has run, or if that clear
+ *            failed; either way we return without attaching.
+ */
+void
+wait_event_trace_attach(int procNumber)
+{
+	/*
+	 * Re-entrancy guard.  dsa_create / dsa_allocate_extended below can
+	 * emit wait events internally; those reach the lazy-attach hot path
+	 * which calls back into this function while we still hold
+	 * WaitEventTraceCtl->lock or are mid-allocation.  See the
+	 * function-local-static-bool pattern explainer on
+	 * wait_event_timing_attach_array.
+	 */
+	static bool in_attach = false;
+	static bool shmem_exit_registered = false;
+	WaitEventTraceSlot *slot;
+	dsa_pointer p;
+	WaitEventTraceState *ts;
+	uint32		state_now;
+
+	if (in_attach)
+		return;
+
+	if (WaitEventTraceCtl == NULL)
+		return;
+
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	/*
+	 * Skip the attach if we are inside a critical section.  Below this
+	 * point we call dsa_create / dsa_attach / dsa_allocate_extended,
+	 * all of which can allocate memory via MemoryContextAlloc and
+	 * Assert-fail on "CritSectionCount == 0 || allowInCritSection".
+	 * The very-first wait event after wait_event_capture = trace can
+	 * land inside a critical section (e.g. a parallel worker scanning
+	 * a heap page hits BufferSetHintBits16 -> XLogSaveBufferForHint ->
+	 * XLogInsert -> LWLockAcquire, with the XLogInsert critical
+	 * section open).
+	 *
+	 * Skipping here silently drops the in-flight wait event (it is
+	 * not traced) but keeps the backend alive.  The next wait event
+	 * outside any critical section will hit this function again and
+	 * attach successfully.  See the matching guard in
+	 * pgstat_wait_event_timing_lazy_attach.
+	 */
+	if (CritSectionCount > 0)
+		return;
+
+	/*
+	 * Skip the attach if MyProc is already on an LWLock wait
+	 * queue.  We are called from the wait-event hot path which
+	 * fires AFTER LWLockQueueSelf has set MyProc->lwWaiting; a
+	 * nested LWLockAcquire on our internal lock (via
+	 * wait_event_trace_ensure_dsa) would PANIC inside lwlock.c.
+	 * See the matching guard in
+	 * pgstat_wait_event_timing_lazy_attach for the full rationale.
+	 */
+	if (MyProc != NULL && MyProc->lwWaiting != LW_WS_NOT_WAITING)
+		return;
+
+	slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+	in_attach = true;
+	PG_TRY();
+	{
+		state_now = pg_atomic_read_u32(&slot->state);
+
+		/*
+		 * ORPHANED is normally impossible at attach time --
+		 * pgstat_set_wait_event_timing_storage() at backend init calls
+		 * wait_event_trace_clear_orphan_at_init() which demotes any
+		 * inherited orphan to FREE.  But there is one case where this
+		 * backend can legitimately observe its own slot in the
+		 * ORPHANED state: after we have already run
+		 * wait_event_trace_before_shmem_exit() (transitioning the slot
+		 * to ORPHANED on exit), a later before_shmem_exit callback
+		 * (e.g. pgstat_io_flush_cb during proc_exit shutdown) can
+		 * contend on an LWLock that emits a wait event, which calls
+		 * pgstat_report_wait_end_timing() -> wait_event_trace_attach()
+		 * after my_wait_event_trace has been cleared.  We must not
+		 * re-attach in that case: we are dying, the ring is now
+		 * post-mortem data for cross-backend readers, and the writer
+		 * invariant must hold.  Skip the trace for any wait events
+		 * emitted after our own exit transition.
+		 */
+		if (state_now == WAIT_EVENT_TRACE_SLOT_ORPHANED)
+		{
+			/* Deliberately no attach; PG_FINALLY clears in_attach. */
+		}
+		else if (state_now == WAIT_EVENT_TRACE_SLOT_OWNED &&
+				 DsaPointerIsValid(slot->ring_ptr))
+		{
+			/* Already have a ring buffer; re-map to it. */
+			wait_event_trace_ensure_dsa();
+			my_wait_event_trace = dsa_get_address(trace_dsa, slot->ring_ptr);
+			my_trace_proc_number = procNumber;
+		}
+		else
+		{
+			Size	alloc_size;
+
+			wait_event_trace_ensure_dsa();
+
+			/*
+			 * Cache the cluster-wide ring size on first allocation in
+			 * this backend.  wait_event_trace_ring_size_kb is
+			 * PGC_POSTMASTER, so its value is fixed for the postmaster
+			 * run and all rings share the same dimensions.
+			 * NOTE(review): ring_mask = size - 1 below presumably needs a
+			 * power-of-two record count -- confirm the GUC guarantees it.
+			 */
+			if (WaitEventTraceRingSize == 0)
+				WaitEventTraceRingSize =
+					(uint32) wait_event_trace_ring_size_kb * 1024U /
+					(uint32) sizeof(WaitEventTraceRecord);
+
+			alloc_size = offsetof(WaitEventTraceState, records) +
+				(Size) WaitEventTraceRingSize * sizeof(WaitEventTraceRecord);
+
+			p = dsa_allocate_extended(trace_dsa, alloc_size, DSA_ALLOC_ZERO);
+			ts = dsa_get_address(trace_dsa, p);
+			pg_atomic_init_u64(&ts->write_pos, 0);
+			ts->ring_mask = WaitEventTraceRingSize - 1;
+
+			LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+			/*
+			 * Publish ring_ptr BEFORE transitioning state to OWNED.
+			 * Cross-backend readers that observe state==OWNED outside
+			 * the lock then see a valid ring_ptr.  Bump generation
+			 * last so any reader that snapped the prior generation
+			 * will detect the change.
+			 */
+			slot->ring_ptr = p;
+			pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_OWNED);
+			pg_atomic_fetch_add_u64(&slot->generation, 1);
+			LWLockRelease(&WaitEventTraceCtl->lock);
+
+			my_wait_event_trace = ts;
+			my_trace_proc_number = procNumber;
+
+			/*
+			 * Register cleanup to run BEFORE dsm_backend_shutdown()
+			 * detaches the DSA.  The before_shmem_exit callbacks run in
+			 * LIFO order before DSM detach, so the ORPHANED transition
+			 * (which does not actually free the ring) is safe at that
+			 * point.
+			 *
+			 * Guarded by shmem_exit_registered because under the
+			 * release-on-disable policy (see wait_event_trace_release_slot
+			 * and assign_wait_event_capture) the allocate branch can run
+			 * multiple times per backend lifetime -- once per
+			 * off/stats -> trace re-enable cycle.  The cleanup itself is
+			 * idempotent (it short-circuits when state is not OWNED), so
+			 * it is safe to invoke after a release-then-reattach cycle,
+			 * but we still avoid growing the before_shmem_exit list.
+			 */
+			if (!shmem_exit_registered)
+			{
+				before_shmem_exit(wait_event_trace_before_shmem_exit,
+								  Int32GetDatum(procNumber));
+				shmem_exit_registered = true;
+			}
+		}
+	}
+	PG_FINALLY();
+	{
+		in_attach = false;
+	}
+	PG_END_TRY();
+}
+
+/*
+ * Drop this backend's local references to its trace ring (frees nothing).
+ */
+static void
+wait_event_trace_detach(int procNumber)
+{
+	/*
+	 * Only clear local pointers here; procNumber is unused.  The shared
+	 * slot is dealt with by wait_event_trace_before_shmem_exit(), which
+	 * moves an OWNED slot to ORPHANED and keeps the ring allocated.
+	 */
+	my_wait_event_trace = NULL;
+	my_trace_proc_number = -1;
+}
+
+/*
+ * Release this backend's trace ring buffer back to DSA immediately.
+ *
+ * Called from assign_wait_event_capture when the user steps down from
+ * TRACE to STATS or OFF.  Without this, a ~4 MB ring allocated by a
+ * brief investigation would remain pinned for the rest of the session's
+ * lifetime, which can leak gigabytes across large connection pools.
+ *
+ * Important contrast with wait_event_trace_before_shmem_exit: backend
+ * exit transitions the slot to ORPHANED (preserving data for
+ * cross-backend readers); release_slot fully frees and returns to FREE
+ * because the operator has explicitly disabled trace -- they have
+ * affirmatively decided not to keep the data, so we honour that and
+ * reclaim the memory immediately.  Subsequent re-enable allocates a
+ * fresh ring via wait_event_trace_attach's allocate branch.
+ *
+ * The operation is LWLock-safe and does not raise -- dsa_free is pure
+ * bookkeeping on the DSA freelist, no allocation and no ereport paths.
+ * Safe to call from a GUC assign hook.
+ *
+ * If pg_get_backend_wait_event_trace is currently iterating our own ring
+ * (wait_event_trace_srf_in_progress), we must NOT free the chunk out
+ * from under it: that would be a use-after-free on the records[] the SRF
+ * is still reading.  Set wait_event_trace_release_pending instead and
+ * return; the SRF's PG_FINALLY block will perform the deferred free
+ * after iteration completes.  In practice this branch is unreachable in
+ * current PG (assign hooks fire only at command boundaries and the SRF
+ * is a single command), but it makes the invariant explicit and the
+ * future-proofing free.
+ */
+static void
+wait_event_trace_release_slot(int procNumber)
+{
+	/*
+	 * Re-entrancy guard.  dsa_free takes a DSA-internal LWLock which can
+	 * in principle emit a wait event; if a nested assign hook re-enters
+	 * we must not recurse.  See the function-local-static-bool pattern
+	 * explainer on wait_event_timing_attach_array.
+	 */
+	static bool in_release = false;
+	WaitEventTraceSlot *slot;
+	bool		prev_writes_disabled;
+
+	if (in_release)
+		return;
+
+	if (WaitEventTraceCtl == NULL || trace_dsa == NULL)
+		return;
+
+	/* Same-backend SRF is iterating our ring: defer to its PG_FINALLY. */
+	if (wait_event_trace_srf_in_progress)
+	{
+		wait_event_trace_release_pending = true;
+		return;
+	}
+
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+	in_release = true;
+
+	/*
+	 * Disable trace-ring writes on this backend before we touch the
+	 * lock or call dsa_free.  An internal LWLock inside dsa_free can
+	 * dispatch a wait event whose end-timing path would otherwise see
+	 * capture_level == TRACE (the GUC assign hook is in flight; the
+	 * variable has not been committed by the framework yet) and
+	 * write into the very chunk we are returning to the DSA
+	 * freelist.  The previous flag value is remembered so that it
+	 * can be put back afterwards instead of being forced to false.
+	 */
+	prev_writes_disabled = wait_event_trace_writes_disabled;
+	wait_event_trace_writes_disabled = true;
+
+	PG_TRY();
+	{
+		LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+		/*
+		 * Drop the local pointer BEFORE the dsa_free as a second line
+		 * of defense (the writes-disabled flag above is the primary
+		 * gate).  Any wait event whose hot path slips past the gate
+		 * check via a compiler or memory-ordering surprise would at
+		 * least see my_wait_event_trace == NULL and skip the write.
+		 */
+		my_wait_event_trace = NULL;
+
+		if (DsaPointerIsValid(slot->ring_ptr))
+		{
+			/*
+			 * Bump generation first to invalidate any concurrent
+			 * cross-backend snapshot, then free, then publish the FREE
+			 * state with a NULL ring_ptr.  Order matters for unlocked
+			 * readers that have already passed the state check.
+			 */
+			pg_atomic_fetch_add_u64(&slot->generation, 1);
+			dsa_free(trace_dsa, slot->ring_ptr);
+			slot->ring_ptr = InvalidDsaPointer;
+			pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+		}
+		LWLockRelease(&WaitEventTraceCtl->lock);
+	}
+	PG_FINALLY();
+	{
+		/* Restore, not clear: an exit-time disable must stay in force. */
+		wait_event_trace_writes_disabled = prev_writes_disabled;
+		in_release = false;
+	}
+	PG_END_TRY();
+}
+
+/*
+ * Clear an orphaned trace ring at backend init time.
+ *
+ * Called from pgstat_set_wait_event_timing_storage() once the new
+ * backend has its procNumber.  If the slot we're inheriting was left
+ * ORPHANED by a previous backend (because we deliberately do not free
+ * trace rings on backend exit -- see the lifecycle discussion on
+ * WaitEventTraceControl), free the ring now so the new backend starts
+ * with a clean FREE slot.  Subsequent wait_event_trace_attach() calls
+ * (when this backend itself enables trace) will then take the
+ * allocate branch.
+ *
+ * No-op when the slot is already FREE or OWNED: FREE means there's
+ * nothing to clear; OWNED is impossible at backend init (only a
+ * not-yet-exited backend can leave a slot OWNED, and procNumbers are
+ * assigned exclusively).  We assert OWNED is not observed in debug
+ * builds and conservatively skip the free in production.
+ *
+ * Robustness: this runs during InitProcess() (before the backend can
+ * accept any work), and the work it performs -- dsa_attach() and
+ * dsa_free() -- can raise ERROR on rare runtime failures (corrupted
+ * DSA segment headers, descriptor exhaustion, mmap ENOMEM, etc.).
+ * An uncaught ERROR here would propagate out of InitProcess() and
+ * abort backend startup entirely, even for sessions that never
+ * intended to use wait_event_capture.  To prevent the trace
+ * feature's housekeeping from gating connection establishment, the
+ * body is wrapped in PG_TRY()/PG_CATCH(): any error from dsa_attach
+ * or dsa_free is captured, downgraded to a WARNING with a hint
+ * pointing at the admin sweep function, and execution continues.
+ * The orphan stays in place; it can be reclaimed by the next
+ * backend that inherits the same procNumber (if the underlying
+ * problem was transient), by pg_stat_clear_orphaned_wait_event_rings(),
+ * or at next cluster restart.
+ */
+static void
+wait_event_trace_clear_orphan_at_init(int procNumber)
+{
+	WaitEventTraceSlot *slot;
+	uint32		state_now;
+	MemoryContext caller_cxt;
+
+	if (WaitEventTraceCtl == NULL)
+		return;
+
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+	state_now = pg_atomic_read_u32(&slot->state);
+	if (state_now != WAIT_EVENT_TRACE_SLOT_ORPHANED)
+	{
+		Assert(state_now != WAIT_EVENT_TRACE_SLOT_OWNED);
+		return;
+	}
+
+	/*
+	 * Save CurrentMemoryContext so the PG_CATCH path can copy the
+	 * error data into a context that survives FlushErrorState().
+	 * FlushErrorState() calls MemoryContextReset(ErrorContext), so
+	 * CopyErrorData() must run in a different context or the
+	 * returned ErrorData becomes a dangling pointer.
+	 */
+	caller_cxt = CurrentMemoryContext;
+
+	PG_TRY();
+	{
+		/*
+		 * The trace DSA is shared across the cluster, and dsa_free
+		 * needs the dsa_area pointer, so attach first.  An ORPHANED
+		 * slot implies some earlier backend created the DSA, so the
+		 * handle in WaitEventTraceCtl is valid and ensure_dsa() will
+		 * attach.  Both ensure_dsa() and dsa_free() can raise ERROR;
+		 * the PG_CATCH below downgrades that to a WARNING so backend
+		 * startup is not blocked.  The state is re-checked under the
+		 * lock; a slot that is ORPHANED but has no valid ring_ptr is
+		 * left untouched by the test below.
+		 */
+		wait_event_trace_ensure_dsa();
+
+		LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+		if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_ORPHANED &&
+			DsaPointerIsValid(slot->ring_ptr))
+		{
+			pg_atomic_fetch_add_u64(&slot->generation, 1);
+			dsa_free(trace_dsa, slot->ring_ptr);
+			slot->ring_ptr = InvalidDsaPointer;
+			pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+		}
+		LWLockRelease(&WaitEventTraceCtl->lock);
+	}
+	PG_CATCH();
+	{
+		ErrorData  *edata;
+
+		/*
+		 * Release any LWLocks we (or anything we called) might
+		 * still hold.  Two paths can leave WaitEventTraceCtl->lock
+		 * held when control reaches here:
+		 *
+		 *   1. The outer LWLockAcquire above succeeded and dsa_free
+		 *      raised before we reached LWLockRelease.
+		 *   2. wait_event_trace_ensure_dsa() raised inside its own
+		 *      LWLockAcquire/dsa_attach/LWLockRelease region.
+		 *
+		 * We are running during InitProcess(), BEFORE any
+		 * transaction or PostgresMain sigsetjmp has been set up,
+		 * so PG's standard "AbortTransaction -> LWLockReleaseAll"
+		 * cleanup does NOT fire on the longjmp into PG_CATCH.
+		 * Without an explicit release here the lock would stay
+		 * held for the lifetime of this backend, blocking every
+		 * future LW_EXCLUSIVE acquirer (the orphan-clear sweep,
+		 * release_slot, before_shmem_exit transitions, and
+		 * subsequent backends' clear_orphan_at_init).  That would
+		 * be strictly worse than letting the ERROR abort this one
+		 * backend's startup.
+		 *
+		 * LWLockReleaseAll() is the idiomatic catch-path lock
+		 * cleanup used by the standalone aux-process error
+		 * handlers (walwriter.c, checkpointer.c, pgarch.c).  It
+		 * is safe to call broadly here because our caller,
+		 * pgstat_set_wait_event_timing_storage, runs at a fixed
+		 * point in InitProcess where the caller frame holds no
+		 * other LWLocks across our return: the earlier InitProcess
+		 * steps that touch LWLocks (ProcArrayAdd, etc.) release
+		 * them before returning, and the subsequent steps that
+		 * acquire LWLocks have not yet run.
+		 */
+		LWLockReleaseAll();
+
+		/*
+		 * Switch BACK to the caller's context before CopyErrorData
+		 * so that edata is allocated in a context that survives
+		 * FlushErrorState().  FlushErrorState() calls
+		 * MemoryContextReset(ErrorContext); allocating edata in
+		 * ErrorContext (the default at PG_CATCH entry on the error
+		 * path) would make it a dangling pointer the moment we
+		 * flush.  See the matching pattern in spi.c PG_CATCH
+		 * branches.
+		 */
+		MemoryContextSwitchTo(caller_cxt);
+		edata = CopyErrorData();
+		FlushErrorState();
+
+		ereport(WARNING,
+				(errcode(edata->sqlerrcode),
+				 errmsg("could not clear orphaned wait-event trace ring "
+						"at backend init: %s", edata->message),
+				 errdetail("Backend startup proceeds with the orphan "
+						   "still allocated for procnumber %d.",
+						   procNumber),
+				 errhint("Run pg_stat_clear_orphaned_wait_event_rings() "
+						 "to release the orphan when the underlying "
+						 "condition is resolved.")));
+
+		FreeErrorData(edata);
+	}
+	PG_END_TRY();
+}
+
+/*
+ * GUC check hook for wait_event_capture (timing build).
+ *
+ * Always returns true: all three enum values are accepted at this level
+ * and none of the arguments is inspected.  Side effects of a new value
+ * are the business of the assign hook, assign_wait_event_capture().
+ */
+bool
+check_wait_event_capture(int *newval, void **extra, GucSource source)
+{
+	return true;
+}
+
+/*
+ * GUC assign hook for wait_event_capture.
+ *
+ * Three responsibilities, all correctness- or resource-critical:
+ *
+ * 1) Drop any in-flight wait state.  After the capture level changes,
+ *    the existing wait_start / current_event in our per-backend slot can
+ *    no longer be trusted.  Consider this sequence:
+ *
+ *       capture = STATS, wait on E1 starts -> wait_start=T0, current_event=E1
+ *       capture flips to OFF mid-wait
+ *       wait_end inline skips (guard fails) -> state still T0/E1
+ *       new wait on E2 starts under OFF     -> inline skips, state still T0/E1
+ *       capture flips back to STATS
+ *       wait_end for E2 -> guard passes, credits (now - T0) to E1
+ *
+ *    Zeroing both fields on every assignment forfeits at most one
+ *    in-flight sample per GUC change (negligible) but eliminates all
+ *    such miscredits.
+ *
+ * 2) Release the trace ring buffer when stepping down from TRACE.
+ *    The per-backend trace ring is ~4 MB of DSA memory, and leaving it
+ *    pinned for the rest of the session's lifetime leaks shmem across
+ *    large connection pools that briefly enable trace.  Freeing here
+ *    makes "wait_event_capture = off" semantically release resources.
+ *    The next re-enable re-allocates a fresh ring on first wait event
+ *    via wait_event_trace_attach.
+ *
+ * 3) Warn (but never error) about secondary preconditions for TRACE
+ *    level.  GUC assign hooks MUST NOT ereport(ERROR) -- see
+ *    src/backend/utils/misc/README -- because they can run during
+ *    transaction rollback when lookups are unsafe.  In particular, the
+ *    trace ring's DSA allocation is NOT performed here (it can raise on
+ *    OOM).  Instead, the ring is attached lazily on the first write
+ *    from wait_event_trace_write_marker() and
+ *    pgstat_report_wait_end_timing(), where ereport(ERROR) has
+ *    well-defined semantics.  The release path above is safe to call
+ *    from the hook because dsa_free is non-raising LWLock bookkeeping.
+ */
+void
+assign_wait_event_capture(int newval, void *extra)
+{
+	/* An in-flight wait straddles the level change; discard it. */
+	if (my_wait_event_timing != NULL)
+	{
+		INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start);
+		my_wait_event_timing->current_event = 0;
+	}
+
+	if (newval != WAIT_EVENT_CAPTURE_TRACE)
+	{
+		/* Stepping down with a ring attached: release it right away. */
+		if (my_wait_event_trace != NULL)
+			wait_event_trace_release_slot(my_trace_proc_number);
+		return;
+	}
+
+	/* TRACE requested: warn (never error) about missing prerequisites. */
+	if (!pgstat_track_activities)
+		ereport(WARNING,
+				(errmsg("wait_event_capture = trace query attribution "
+						"requires track_activities to be enabled")));
+
+	if (compute_query_id == COMPUTE_QUERY_ID_OFF)
+		ereport(WARNING,
+				(errmsg("wait_event_capture = trace query attribution "
+						"requires compute_query_id to be enabled"),
+				 errhint("Set compute_query_id to \"on\" or \"auto\", or "
+						 "load an extension that enables it (e.g. "
+						 "pg_stat_statements).")));
+}
+
+/*
+ * Initialize this backend's wait-event timing/trace bookkeeping.
+ * Called from InitProcess() after the backend has a valid procNumber.
+ *
+ * procNumber is the PGPROC array index (from GetNumberFromPGProc).
+ * Covers both regular backends (procNumber < MaxBackends) and auxiliary
+ * processes (bgwriter, checkpointer, walwriter, etc.).
+ *
+ * On EXEC_BACKEND builds (Windows), SubPostmasterMain() calls
+ * CreateSharedMemoryAndSemaphores() before InitProcess(), so
+ * WaitEventTimingArray is always initialized at this point.
+ */
+void
+pgstat_set_wait_event_timing_storage(int procNumber)
+{
+	/*
+	 * Nothing is attached at this point.  The timing array lives in DSA
+	 * and is attached on first enable of wait_event_capture (see
+	 * pgstat_wait_event_timing_lazy_attach), so a backend that never
+	 * enables capture pays zero shmem cost.  The trace ring is likewise
+	 * allocated lazily, via DSA, once wait_event_capture is set to
+	 * 'trace'.
+	 *
+	 * All that happens here is resetting the backend-local pointers.
+	 */
+	my_wait_event_timing = NULL;
+	my_wait_event_trace = NULL;
+
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+	{
+		/* No usable slot index: nothing to remember, nothing to clear. */
+		my_trace_proc_number = -1;
+		return;
+	}
+
+	/* Remembered for later use by the trace attach/detach paths. */
+	my_trace_proc_number = procNumber;
+
+	/*
+	 * A tracing backend that previously held this procNumber and has
+	 * exited leaves its ring allocated in DSA with the slot ORPHANED
+	 * (see wait_event_trace_before_shmem_exit and the lifecycle
+	 * discussion on WaitEventTraceControl).  Reclaim it now so that
+	 * this backend starts from a clean FREE slot:
+	 * wait_event_trace_attach declines to attach while the slot is
+	 * ORPHANED, and a new writer must never end up appending to a
+	 * previous backend's records.
+	 */
+	wait_event_trace_clear_orphan_at_init(procNumber);
+}
+
+/*
+ * Drop this backend's references to timing/trace state at exit.
+ *
+ * ProcKill() invokes this as an on_shmem_exit callback, i.e. AFTER
+ * dsm_backend_shutdown() has detached the DSA mappings.  The memory
+ * behind my_wait_event_timing is DSA-backed and no longer mapped by
+ * then, so writing through the pointer would segfault.
+ *
+ * Hence only backend-local pointers are cleared; the shared slot is
+ * not zeroed here.  Its stale contents stay invisible to users because
+ *   - the next backend to attach to the slot memsets it (lazy_attach),
+ *   - the SRF readers filter dead backends via
+ *     pgstat_get_beentry_by_proc_number.
+ */
+void
+pgstat_reset_wait_event_timing_storage(void)
+{
+	/* The ring itself was handled by the before_shmem_exit callback. */
+	if (my_trace_proc_number >= 0)
+		wait_event_trace_detach(my_trace_proc_number);
+
+	my_trace_proc_number = -1;
+	my_wait_event_trace = NULL;
+	my_wait_event_timing = NULL;
+}
+
+/*
+ * Slow path of pgstat_report_wait_start(), kept out of line.
+ *
+ * Reached only when wait_event_capture != OFF.  Call sites carry just
+ * the inline gate (one global load + branch); the writes-disabled
+ * check, lazy attach, INSTR_TIME read and current_event write all
+ * live here, so their cost is paid only while capture is enabled and
+ * their codegen stays out of the host functions (LWLockAcquire,
+ * XLogInsert, etc.).
+ */
+void
+pgstat_report_wait_start_timing(uint32 wait_event_info)
+{
+	/*
+	 * Once the before_shmem_exit callback has raised this flag we stay
+	 * out of the timing path for the rest of the proc_exit cascade, so
+	 * that no DSA operation is attempted on mappings which
+	 * dsm_backend_shutdown has already torn down.
+	 */
+	if (wait_event_timing_writes_disabled)
+		return;
+
+	if (unlikely(my_wait_event_timing == NULL))
+	{
+		/*
+		 * Lazy attach.  The per-backend timing slot lives in a DSA that
+		 * is created the first time any backend in the cluster enables
+		 * wait_event_capture.  Once an attach has succeeded the cached
+		 * pointer remains valid for the backend's lifetime, which makes
+		 * this branch cold and perfectly predicted.
+		 */
+		pgstat_wait_event_timing_lazy_attach();
+
+		/*
+		 * Setting up DSA inside lazy_attach() can itself dispatch wait
+		 * events (dsa_attach takes an internal LWLock which can
+		 * contend), and those nested wait_end() calls reset
+		 * my_wait_event_info to 0.  Publish the outer wait's
+		 * wait_event_info again so it stays visible in
+		 * pg_stat_activity.  First-attach path only.
+		 */
+		*(volatile uint32 *) my_wait_event_info = wait_event_info;
+
+		/* Attach did not take: there is no slot to record into. */
+		if (my_wait_event_timing == NULL)
+			return;
+	}
+
+	INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start);
+	my_wait_event_timing->current_event = wait_event_info;
+}
+
+/*
+ * Out-of-line body for pgstat_report_wait_end() timing path.
+ * Called when wait_event_capture is at STATS or higher.  Performs the
+ * writes-disabled check, lazy-attach, computes wait duration,
+ * accumulates per-event stats, and (at TRACE level) writes the event
+ * into the per-session trace ring buffer.
+ *
+ * The capture_level argument is the value of wait_event_capture as
+ * observed at the inline gate.  Passing it through (rather than
+ * re-loading the global here) avoids a redundant memory load on the
+ * trace hot path: the function-call boundary defeats CSE, so without
+ * the parameter the compiler must emit a second load to test for
+ * TRACE level below.  Using the gate's view also means a concurrent
+ * GUC change cannot half-update this call -- we either ran in the
+ * old level or we don't run at all.
+ */
+void
+pgstat_report_wait_end_timing(int capture_level)
+{
+	uint32		event;
+	uint32		cur_reset_gen;
+
+	if (wait_event_timing_writes_disabled)
+		return;
+
+	if (unlikely(my_wait_event_timing == NULL))
+	{
+		pgstat_wait_event_timing_lazy_attach();
+		if (my_wait_event_timing == NULL)
+			return;
+	}
+
+	event = my_wait_event_timing->current_event;
+
+	/*
+	 * Fast check for a pending cross-backend reset request.  Single
+	 * atomic load; almost always hits the fast path (branch well
+	 * predicted).  When we detect that our shared reset_generation has
+	 * advanced, clear our own counters on behalf of the requester, then
+	 * continue with normal accumulation.  wait_start is deliberately
+	 * left untouched so we don't lose the measurement that's already
+	 * running; the completing event will land in the freshly-zeroed
+	 * counters, which is the desired behaviour.  current_event is safe
+	 * to zero here because the local "event" above already captured its
+	 * value before the reset block; zeroing it kills a source of stale
+	 * state that external readers would otherwise observe on the slot
+	 * between waits.
+	 */
+	cur_reset_gen = pg_atomic_read_u32(&my_wait_event_timing->reset_generation);
+	if (unlikely(cur_reset_gen != my_last_reset_generation))
+	{
+		memset(my_wait_event_timing->events, 0,
+			   sizeof(my_wait_event_timing->events));
+		lwlock_timing_hash_clear(my_wait_event_timing);
+		my_wait_event_timing->reset_count++;
+		my_wait_event_timing->lwlock_overflow_count = 0;
+		my_wait_event_timing->flat_overflow_count = 0;
+		my_wait_event_timing->current_event = 0;
+		my_last_reset_generation = cur_reset_gen;
+	}
+
+	if (event != 0 && !INSTR_TIME_IS_ZERO(my_wait_event_timing->wait_start))
+	{
+		instr_time	now;
+		int64		duration_ns;
+		int			idx;
+
+		INSTR_TIME_SET_CURRENT(now);
+		duration_ns = INSTR_TIME_GET_NANOSEC(now) -
+			INSTR_TIME_GET_NANOSEC(my_wait_event_timing->wait_start);
+
+		if (unlikely(duration_ns < 0))
+			duration_ns = 0;
+
+		idx = wait_event_timing_index(event);
+
+		/*
+		 * No lock needed on the hot path: each WaitEventTimingState slot
+		 * has a single writer (the owning backend), and the SRF reader
+		 * pg_stat_get_wait_event_timing() is lock-free by design.  Cross-
+		 * backend reset is handled by the reset_generation check at the
+		 * top of this function: the requester bumps the atomic and the
+		 * owning backend (us) clears the counters at the next wait_end.
+		 *
+		 * We defer emitting the overflow WARNING to after the critical
+		 * bookkeeping is complete, so ereport() cannot recurse through
+		 * a wait event while counters are in an intermediate state.
+		 */
+		{
+			WaitEventTimingEntry *entry = NULL;
+			bool		warn_lwlock_overflow = false;
+			bool		warn_flat_overflow = false;
+
+			if (idx == WAIT_EVENT_TIMING_IDX_LWLOCK)
+				entry = lwlock_timing_lookup(my_wait_event_timing,
+											 event & 0xFFFF);
+			else if (likely(idx >= 0))
+				entry = &my_wait_event_timing->events[idx];
+
+			if (likely(entry != NULL))
+			{
+				entry->count++;
+				entry->total_ns += duration_ns;
+				if (duration_ns > entry->max_ns)
+					entry->max_ns = duration_ns;
+				entry->histogram[wait_event_timing_bucket(duration_ns)]++;
+			}
+			else if (idx == WAIT_EVENT_TIMING_IDX_LWLOCK)
+			{
+				if (my_wait_event_timing->lwlock_overflow_count++ == 0)
+					warn_lwlock_overflow = true;
+			}
+			else if (idx == -1)
+			{
+				if (my_wait_event_timing->flat_overflow_count++ == 0)
+					warn_flat_overflow = true;
+			}
+
+			/* Emit overflow warnings outside any critical section. */
+			if (unlikely(warn_lwlock_overflow))
+				ereport(WARNING,
+						(errmsg("wait_event_timing: LWLock hash table full, "
+								"timing data for some LWLock tranches will be lost"),
+						 errhint("This backend uses more than %d distinct LWLock tranches; raise wait_event_timing_max_tranches.",
+								 wait_event_timing_max_entries)));
+			else if (unlikely(warn_flat_overflow))
+				ereport(WARNING,
+						(errmsg("wait_event_timing: event class overflow, "
+								"some events will not be timed")));
+		}
+
+		/* 10046-style per-session trace ring buffer (DSA-backed) */
+		if (unlikely(capture_level == WAIT_EVENT_CAPTURE_TRACE) &&
+			likely(!wait_event_trace_writes_disabled))
+		{
+			/*
+			 * Lazy attach on first use -- allocation happens here rather
+			 * than in assign_wait_event_capture() to respect the GUC
+			 * assign-hook "must not ereport" contract.  See the comment
+			 * on assign_wait_event_capture() for rationale.
+			 *
+			 * wait_event_trace_writes_disabled (checked above) also
+			 * blocks this re-attach during slot-state transitions
+			 * driven by release_slot / before_shmem_exit; without that
+			 * gate, a nested wait event mid-transition could see
+			 * my_wait_event_trace == NULL and recurse into a fresh
+			 * attach that deadlocks on the lock the outer transition
+			 * already holds.  See review_6.md issue #10.
+			 */
+			if (my_wait_event_trace == NULL && my_trace_proc_number >= 0)
+				wait_event_trace_attach(my_trace_proc_number);
+
+			if (my_wait_event_trace != NULL)
+			{
+				/*
+				 * Single-writer claim: read+write avoids the LOCK XADD that
+				 * pg_atomic_fetch_add_u64 would emit on every wait event.
+				 * See wait_event_trace_write_marker for the full rationale.
+				 */
+				uint64	pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos);
+				WaitEventTraceRecord *rec;
+				uint32	seq;
+
+				pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1);
+
+				/*
+				 * Injection point used by the regression test for the
+				 * position-encoded identity seqlock in
+				 * emit_wait_event_trace_for_procnumber().  Stalling here
+				 * widens the window between the write_pos store and the
+				 * rec->seq store, simulating the weak-memory visibility
+				 * order that would otherwise be unreachable on x86.  A
+				 * cross-backend reader observing the new write_pos
+				 * while the rec->seq update has not yet happened MUST
+				 * skip this slot via the identity check; without the
+				 * identity check the reader would emit a stale record
+				 * from the previous ring cycle with the wrong ring
+				 * index.  Compiled out unless --enable-injection-points
+				 * is set.
+				 */
+				INJECTION_POINT("wait-event-trace-after-write-pos", NULL);
+
+				rec = &my_wait_event_trace->records[pos & my_wait_event_trace->ring_mask];
+				seq = (uint32)(pos * 2 + 1);
+
+				rec->seq = seq;
+				pg_write_barrier();		/* release: payload stores must not rise above seq=odd */
+
+				rec->record_type = TRACE_WAIT_EVENT;
+				rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now);
+				rec->data.wait.event = event;
+				rec->data.wait.pad2 = 0;
+				rec->data.wait.duration_ns = duration_ns;
+
+				pg_write_barrier();		/* release: payload stores must land before seq=even */
+				rec->seq = seq + 1;
+			}
+		}
+
+		INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start);
+	}
+}
+
+/*
+ * Translate the SRF's optional pid argument (argument 0) into a half-open
+ * range of timing-slot indexes [*out_start, *out_end).
+ *
+ *   pid is NULL           -> every slot: [0, NUM_WAIT_EVENT_TIMING_SLOTS).
+ *   pid resolves to a     -> exactly that process's slot.
+ *   PGPROC
+ *   pid does not resolve  -> no range; returns false.
+ *
+ * The pid is looked up with BackendPidGetProc() first and, failing that,
+ * with AuxiliaryPidGetProc().
+ *
+ * Returns true when the range was filled in.  A false result tells the
+ * caller to emit zero rows: an unknown pid is a silent no-op rather than
+ * an error, matching the pg_stat_reset_wait_event_timing convention.  The
+ * output parameters are not written when false is returned.
+ */
+static bool
+wait_event_timing_pid_range(FunctionCallInfo fcinfo,
+							int *out_start, int *out_end)
+{
+	int			pid;
+	int			slot;
+	PGPROC	   *owner;
+
+	/* No filter requested: sweep the whole slot array. */
+	if (PG_ARGISNULL(0))
+	{
+		*out_start = 0;
+		*out_end = NUM_WAIT_EVENT_TIMING_SLOTS;
+		return true;
+	}
+
+	/* Try the regular-backend lookup first, then the auxiliary one. */
+	pid = PG_GETARG_INT32(0);
+	owner = BackendPidGetProc(pid);
+	if (owner == NULL)
+		owner = AuxiliaryPidGetProc(pid);
+	if (owner == NULL)
+		return false;
+
+	/* Reject a proc number that falls outside the timing slot array. */
+	slot = GetNumberFromPGProc(owner);
+	if (!(slot >= 0 && slot < NUM_WAIT_EVENT_TIMING_SLOTS))
+		return false;
+
+	*out_start = slot;
+	*out_end = slot + 1;
+	return true;
+}
+
+/*
+ * Append one pg_stat_get_wait_event_timing() row for a single timing entry.
+ *
+ * Shared by the flat-array sweep and the LWLock-hash sweep so the column
+ * layout is defined in exactly one place.
+ *
+ *   rsinfo          - supplies the tuplestore and tuple descriptor to fill.
+ *   beentry         - backend status entry; provides the pid and backend
+ *                     type columns.
+ *   backend_idx     - slot index, reported as the procnumber column.
+ *   wait_event_info - encoded wait event used to resolve type and name.
+ *   entry           - the counters to report.
+ *   hist_array      - reusable int8[] datum emitted as the histogram column.
+ *   hist_payload    - data area of hist_array; overwritten on every call.
+ *
+ * Nothing is emitted when the entry's count is zero or when the wait event
+ * type or name cannot be resolved.
+ */
+static void
+wet_emit_timing_row(ReturnSetInfo *rsinfo,
+					PgBackendStatus *beentry,
+					int backend_idx,
+					uint32 wait_event_info,
+					WaitEventTimingEntry *entry,
+					ArrayType *hist_array,
+					int64 *hist_payload)
+{
+	Datum		values[10];
+	bool		nulls[10];
+	const char *event_type;
+	const char *event_name;
+	int			bucket;
+
+	if (entry->count == 0)
+		return;
+
+	event_type = pgstat_get_wait_event_type(wait_event_info);
+	event_name = pgstat_get_wait_event(wait_event_info);
+
+	if (event_type == NULL || event_name == NULL)
+		return;
+
+	memset(nulls, 0, sizeof(nulls));
+
+	values[0] = Int32GetDatum(beentry->st_procpid);
+	values[1] = CStringGetTextDatum(GetBackendTypeDesc(beentry->st_backendType));
+	values[2] = Int32GetDatum(backend_idx);
+	values[3] = CStringGetTextDatum(event_type);
+	values[4] = CStringGetTextDatum(event_name);
+	values[5] = Int64GetDatum(entry->count);
+	/* total is reported as ns / 1e6; average and maximum as ns / 1e3 */
+	values[6] = Float8GetDatum((double) entry->total_ns / 1000000.0);
+
+	/*
+	 * The slot is read without a lock, so count is re-tested here rather
+	 * than relying on the check at the top of the function.
+	 */
+	values[7] = Float8GetDatum(entry->count > 0
+							   ? (double) entry->total_ns / entry->count / 1000.0
+							   : 0.0);
+	values[8] = Float8GetDatum((double) entry->max_ns / 1000.0);
+
+	/* Refill the shared histogram array in place; see the caller. */
+	for (bucket = 0; bucket < WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS; bucket++)
+		hist_payload[bucket] = entry->histogram[bucket];
+	values[9] = PointerGetDatum(hist_array);
+
+	tuplestore_putvalues(rsinfo->setResult,
+						 rsinfo->setDesc,
+						 values, nulls);
+}
+
+/*
+ * SQL function: pg_stat_get_wait_event_timing(pid int4, OUT ...)
+ *
+ * Returns one row per (backend, wait_event) with non-zero counts.
+ * pid is optional: NULL means all backends; a non-NULL value restricts
+ * the sweep to that single backend (silently empty if the PID is
+ * unknown, matching pg_stat_reset_wait_event_timing(pid) semantics).
+ *
+ * The PID-filtered fast path turns the cost of cluster-wide monitoring
+ * loops that poll a specific PID from O(MaxBackends * events) into
+ * O(events) per call -- the same precedent as pg_stat_get_activity(pid).
+ *
+ * Uses InitMaterializedSRF (materialize-all) for simplicity.  The result
+ * set is bounded by (NUM_WAIT_EVENT_TIMING_SLOTS * WAIT_EVENT_TIMING_NUM_EVENTS)
+ * rows, so deferred (value-per-call) mode is not needed.
+ */
+Datum
+pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	int			start_idx;
+	int			end_idx;
+	int			backend_idx;
+	ArrayType  *hist_array;
+	int64	   *hist_payload;
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	/*
+	 * If no backend has ever enabled wait_event_capture since the last
+	 * postmaster start, the shared timing array has not been allocated
+	 * yet -- return zero rows rather than forcing an allocation just for
+	 * a read.
+	 */
+	if (!wait_event_timing_attach_array(false))
+		PG_RETURN_VOID();
+
+	if (!wait_event_timing_pid_range(fcinfo, &start_idx, &end_idx))
+		PG_RETURN_VOID();
+
+	/*
+	 * Allocate the histogram ArrayType once and reuse it across every row
+	 * emitted below.  Per-row we overwrite the 16 int8 payload slots via
+	 * ARR_DATA_PTR; tuplestore_putvalues flattens the varlena into its
+	 * stored tuple, so subsequent rewrites cannot corrupt previously
+	 * emitted rows.  Saves one palloc per row on SRFs that can easily
+	 * produce tens of thousands of rows on large clusters.
+	 */
+	{
+		Datum		zero_elems[WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS];
+
+		memset(zero_elems, 0, sizeof(zero_elems));
+		hist_array = construct_array_builtin(zero_elems,
+											 WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS,
+											 INT8OID);
+		hist_payload = (int64 *) ARR_DATA_PTR(hist_array);
+	}
+
+	for (backend_idx = start_idx; backend_idx < end_idx; backend_idx++)
+	{
+		WaitEventTimingState *state = wet_slot(backend_idx);
+		PgBackendStatus *beentry;
+		LWLockTimingHashEntry *hash_entries;
+		WaitEventTimingEntry *hash_events;
+		int			hash_size;
+		int			i;
+
+		/* Skip dead backend slots and check permissions */
+		beentry = pgstat_get_beentry_by_proc_number(backend_idx);
+		if (beentry == NULL)
+			continue;
+		if (!HAS_PGSTAT_PERMISSIONS(beentry->st_userid))
+			continue;
+
+		/* Emit rows from the flat array (all classes except LWLock) */
+		for (i = 0; i < WAIT_EVENT_TIMING_DENSE_CLASSES; i++)
+		{
+			int			base = wait_event_class_offset[i];
+			int			nevents = wait_event_class_nevents[i];
+			uint32		classId = wait_event_dense_to_classid[i];
+			int			j;
+
+			for (j = 0; j < nevents; j++)
+			{
+				/* Reconstruct wait_event_info from class and event ID */
+				uint32		wait_event_info = ((uint32) classId << 24) | j;
+
+				wet_emit_timing_row(rsinfo, beentry, backend_idx,
+									wait_event_info,
+									&state->events[base + j],
+									hist_array, hist_payload);
+			}
+		}
+
+		/* Emit rows from the LWLock hash table */
+		hash_entries = wet_lwlock_hash_entries(state);
+		hash_events = wet_lwlock_hash_events(state);
+		hash_size = state->lwlock_hash.hash_size;
+
+		for (i = 0; i < hash_size; i++)
+		{
+			LWLockTimingHashEntry *he = &hash_entries[i];
+
+			if (he->tranche_id == LWLOCK_TIMING_EMPTY_SLOT)
+				continue;
+
+			/* The hash entry's dense_idx selects its counters. */
+			wet_emit_timing_row(rsinfo, beentry, backend_idx,
+								PG_WAIT_LWLOCK | he->tranche_id,
+								&hash_events[he->dense_idx],
+								hist_array, hist_payload);
+		}
+	}
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * SQL function: pg_get_backend_wait_event_trace()
+ *
+ * Returns trace records from the current backend's own ring buffer.
+ * Cross-backend ring reading is intentionally not supported: the ring
+ * lives in per-backend DSA and reading another session's segment would
+ * require attaching/detaching under the trace control lock, which is
+ * the responsibility of external consumers (extensions, background
+ * workers).  The recommended cross-backend reader pattern is documented
+ * on WaitEventTraceControl in wait_event_timing.h.  The name mirrors
+ * pg_get_backend_memory_contexts() to make the session-local scope
+ * explicit at the API level.
+ *
+ * Same-backend coordination with wait_event_trace_release_slot uses the
+ * wait_event_trace_srf_in_progress / _release_pending flags rather than
+ * an LWLock: same-backend serialization is implicit, so a per-backend
+ * bool plus a deferred-free path is sufficient and avoids any of the
+ * cross-backend lock-hold latency that the cross-backend reader pattern
+ * has to manage.  PG_TRY/PG_FINALLY guarantees the flag is cleared and
+ * any deferred dsa_free is performed even on ereport(ERROR).
+ *
+ * Uses InitMaterializedSRF (materialize-all).  The ring holds up to
+ * WaitEventTraceRingSize records (set at server start from the
+ * wait_event_trace_ring_size_kb GUC; defaults to 131072 = 4 MB);
+ * full materialization caps the per-call cost at the ring size of
+ * tuplestore memory, which is acceptable for the use case this SRF
+ * is designed for: interactive own-session diagnostics from psql.
+ *
+ * This SRF is NOT the path for cross-backend monitoring tools --
+ * cross-backend readers should use pg_get_wait_event_trace for SQL
+ * access, or follow the shared-memory snapshot pattern documented
+ * on WaitEventTraceControl in wait_event_timing.h to consume the
+ * per-backend trace rings directly.  They should NOT call this
+ * function via SPI.
+ * It is hard-coded to return only the calling backend's own ring,
+ * so a bgworker calling SELECT * FROM pg_backend_wait_event_trace
+ * would get only the bgworker's own (typically empty) ring, not the
+ * target backend's data.
+ *
+ * Cross-backend consumers must instead use the lock + DSA-snapshot
+ * pattern documented on WaitEventTraceControl in wait_event_timing.h:
+ * acquire WaitEventTraceCtl->lock in LW_SHARED, resolve trace_ptrs[
+ * procNumber] via dsa_get_address, snapshot the records of interest
+ * into local memory, release the lock, then process the snapshot.
+ * That path bypasses this SRF entirely and is the supported
+ * cross-backend interface for monitoring extensions and bgworkers.
+ *
+ * value-per-call (deferred) SRF mode would let an interactive
+ * "SELECT ... FROM pg_backend_wait_event_trace LIMIT N" short-circuit
+ * the materialisation, but converting this function would require
+ * spanning the wait_event_trace_srf_in_progress flag (and its
+ * deferred-free coordination with assign_wait_event_capture; see
+ * issue #8) across multiple SRF callbacks plus a transaction-cleanup
+ * registration to handle LIMIT abandonment.  The complexity is not
+ * justified for the diagnostic use case, especially since cross-
+ * backend monitoring (the consumer that would actually benefit from
+ * streaming) goes through the snapshot pattern above instead.
+ * Interactive callers who want only recent records should use
+ * "ORDER BY seq DESC LIMIT N" -- the LIMIT is applied after
+ * materialisation but the cost stays bounded by the ring size.
+ */
+Datum
+pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	WaitEventTraceState *ts;
+	uint64		write_pos;
+	uint64		read_start;
+	uint64		i;
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	/* No trace ring is attached in this backend: nothing to report. */
+	if (my_wait_event_trace == NULL)
+		PG_RETURN_VOID();
+
+	ts = my_wait_event_trace;
+
+	write_pos = pg_atomic_read_u64(&ts->write_pos);
+
+	if (write_pos == 0)
+		PG_RETURN_VOID();
+
+	/* Read from oldest available to newest */
+	{
+		uint64		ring_size = (uint64) ts->ring_mask + 1;
+
+		read_start = (write_pos > ring_size)
+			? write_pos - ring_size : 0;
+	}
+
+	/*
+	 * Mark the iteration in progress so wait_event_trace_release_slot
+	 * defers any concurrent dsa_free of our own ring (see the comment on
+	 * that function for the deferral protocol).  PG_FINALLY clears the
+	 * flag and performs any deferred free, even on ereport(ERROR).
+	 */
+	wait_event_trace_srf_in_progress = true;
+	PG_TRY();
+	{
+		for (i = read_start; i < write_pos; i++)
+		{
+			WaitEventTraceRecord *rec =
+				&ts->records[i & ts->ring_mask];
+			Datum		values[6];
+			bool		nulls[6];
+			const char *event_type;
+			const char *event_name;
+			uint32		expected_seq;
+			uint32		seq_before;
+			uint32		seq_after;
+			uint8		rtype;
+			int64		timestamp_ns;
+			uint32		event_info;
+			int64		duration_ns;
+			int64		query_id;
+
+			/*
+			 * Position-encoded seqlock read.  The writer stores
+			 * (uint32)(pos * 2 + 1) in rec->seq while it fills the record
+			 * at ring position pos and (uint32)(pos * 2 + 2) once it is
+			 * complete, so a record belongs to iterator position i only
+			 * if its seq equals (uint32)(i * 2 + 2).
+			 *
+			 * A parity-only test is not enough here.  This backend is
+			 * also the ring's writer, so any wait event it reports while
+			 * this loop runs (presumably including work done inside
+			 * tuplestore_putvalues) advances the ring.  If the ring wraps
+			 * onto a slot not yet visited, the slot holds a complete,
+			 * even-seq record of a LATER position; parity would accept it
+			 * and emit it under the old index i.  Exact equality rejects
+			 * that case as well as a mid-write (odd) seq.
+			 */
+			expected_seq = (uint32) (i * 2 + 2);
+
+			seq_before = rec->seq;
+			pg_read_barrier();	/* acquire: payload loads below must not
+								 * rise above this */
+
+			if (seq_before != expected_seq)
+				continue;
+
+			rtype = rec->record_type;
+			timestamp_ns = rec->timestamp_ns;
+
+			if (rtype == TRACE_WAIT_EVENT)
+			{
+				event_info = rec->data.wait.event;
+				duration_ns = rec->data.wait.duration_ns;
+				query_id = 0;
+			}
+			else if (rtype == TRACE_QUERY_START || rtype == TRACE_QUERY_END ||
+					 rtype == TRACE_EXEC_START || rtype == TRACE_EXEC_END)
+			{
+				event_info = 0;
+				duration_ns = 0;
+				query_id = rec->data.query.query_id;
+			}
+			else
+			{
+				/* Unrecognised record_type -- skip defensively. */
+				continue;
+			}
+
+			pg_read_barrier();	/* acquire: payload loads must have landed
+								 * before seq_after */
+			seq_after = rec->seq;
+
+			/* The record was overwritten while we were copying it out. */
+			if (seq_after != expected_seq)
+				continue;
+
+			/* Skip empty wait events */
+			if (rtype == TRACE_WAIT_EVENT && event_info == 0)
+				continue;
+
+			if (rtype == TRACE_WAIT_EVENT)
+			{
+				event_type = pgstat_get_wait_event_type(event_info);
+				event_name = pgstat_get_wait_event(event_info);
+			}
+			else if (rtype == TRACE_QUERY_START)
+			{
+				event_type = "Query";
+				event_name = "QueryStart";
+			}
+			else if (rtype == TRACE_EXEC_START)
+			{
+				event_type = "Query";
+				event_name = "ExecStart";
+			}
+			else if (rtype == TRACE_EXEC_END)
+			{
+				event_type = "Query";
+				event_name = "ExecEnd";
+			}
+			else
+			{
+				/* Only TRACE_QUERY_END can reach this branch. */
+				event_type = "Query";
+				event_name = "QueryEnd";
+			}
+
+			if (event_type == NULL || event_name == NULL)
+				continue;
+
+			memset(nulls, 0, sizeof(nulls));
+
+			values[0] = Int64GetDatum((int64) i);
+			values[1] = Int64GetDatum(timestamp_ns);
+			values[2] = CStringGetTextDatum(event_type);
+			values[3] = CStringGetTextDatum(event_name);
+			values[4] = Float8GetDatum((double) duration_ns / 1000.0);
+			values[5] = Int64GetDatum(query_id);
+
+			tuplestore_putvalues(rsinfo->setResult,
+								 rsinfo->setDesc,
+								 values, nulls);
+		}
+	}
+	PG_FINALLY();
+	{
+		wait_event_trace_srf_in_progress = false;
+
+		/*
+		 * If a GUC step-down fired during iteration, it deferred the
+		 * dsa_free.  Process it now that we're safely past the loop.
+		 * Re-check release_pending under the same flag to handle the
+		 * (impossible-today, possible-tomorrow) case of a nested SRF.
+		 */
+		if (wait_event_trace_release_pending)
+		{
+			wait_event_trace_release_pending = false;
+			if (my_trace_proc_number >= 0)
+				wait_event_trace_release_slot(my_trace_proc_number);
+		}
+	}
+	PG_END_TRY();
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * One slot of emit_wait_event_trace_for_procnumber()'s local result buffer:
+ * a by-value record copy plus its ring index (the seq output column).
+ */
+typedef struct WetValidRecord
+{
+	uint64		ring_index;		/* original index in the writer's ring */
+	WaitEventTraceRecord rec;	/* private copy taken from the shared ring */
+} WetValidRecord;
+
+/*
+ * Snapshot the trace ring for a given procNumber and emit records into
+ * the SRF's tuplestore.  Returns silently for FREE slots, out-of-range
+ * procnumbers, slots whose ring was never allocated, and slots whose
+ * write_pos is zero.
+ *
+ * Cross-backend reader protocol implemented here:
+ *
+ *   1. Read slot->state without the lock as a cheap "worth visiting"
+ *      check; FREE -> nothing to emit.
+ *   2. Allocate the worst-case result buffer BEFORE taking the lock,
+ *      so the palloc -- which can bottom out in a glibc mmap syscall
+ *      for the ~5 MB worst-case size -- runs without holding the
+ *      WaitEventTraceCtl lock.
+ *   3. Acquire WaitEventTraceCtl->lock in LW_SHARED.  All slot
+ *      transitions take LW_EXCLUSIVE, so the slot's identity, state,
+ *      and ring_ptr are stable for the duration of the iteration.
+ *   4. Re-check state under the lock and resolve ring_ptr via
+ *      dsa_get_address.  Read write_pos.
+ *   5. Iterate every live ring index [read_start, write_pos).  For
+ *      each record do the per-record POSITION-ENCODED IDENTITY
+ *      seqlock check ON SHARED MEMORY (see the comment on the loop
+ *      below).
+ *   6. Release the lock.
+ *   7. Walk the local result array and emit rows into the tuplestore.
+ *      This is the expensive part (potential disk spill); doing it
+ *      after release minimises lock-hold time.
+ *
+ * Why per-record seqlock against shared memory, not against a local
+ * memcpy of the full ring: the protocol requires the two seq reads
+ * to go to the SAME shared-memory location at DIFFERENT TIMES, with
+ * the payload read between them.  A bulk memcpy then seqlock-on-
+ * local-copy reads the same frozen byte twice, the check degenerates
+ * to a no-op, and torn / stale-cycle reads slip through.
+ *
+ * Why position-encoded identity, not just parity: the writer encodes
+ * the ring position into the seq value (mid-write = pos*2+1, complete
+ * = pos*2+2).  After RING_SIZE writes the slot wraps and is rewritten
+ * with a new numerically-distinct seq.  A parity-only check accepts
+ * any stable even seq -- including the PREVIOUS cycle's seq if cross-
+ * process visibility puts the new write_pos ahead of the new seq
+ * update.  See the loop body for the failure modes the identity
+ * check rejects.
+ *
+ * Holding LW_SHARED throughout the iteration also makes the
+ * generation-counter retry unnecessary for this caller: slot
+ * transitions take LW_EXCLUSIVE and therefore cannot happen while we
+ * hold LW_SHARED.  The generation counter is still part of the
+ * cross-backend reader contract on WaitEventTraceControl for external
+ * readers that follow a different lock-release pattern (e.g. an
+ * extension that wants to release the lock between batches of records
+ * and re-acquire), but this in-tree implementation does not release
+ * the lock mid-iteration.
+ *
+ * Both OWNED and ORPHANED slots are read uniformly.  For OWNED the
+ * live owner is concurrently writing; the seqlock catches torn reads.
+ * For ORPHANED the records are immutable post-mortem so the check is
+ * essentially a pass-through (it still correctly skips at most one
+ * trailing odd-seq record if the owner died mid-write).
+ *
+ * Lock-hold is O(write_pos - read_start) shared-memory loads, at
+ * roughly the same wall-clock cost as a single 4 MB memcpy of the
+ * full ring (~1 ms on modern hardware), with no I/O and no syscalls.
+ */
+static void
+emit_wait_event_trace_for_procnumber(int procNumber, ReturnSetInfo *rsinfo)
+{
+	WaitEventTraceSlot *slot;
+	WaitEventTraceState *ts;
+	WetValidRecord *valid_records = NULL;
+	uint64		valid_capacity;
+	uint64		valid_count = 0;
+	uint64		write_pos;
+	uint64		read_start;
+	uint64		i;
+	uint32		state_now;
+
+	if (WaitEventTraceCtl == NULL)
+		return;
+
+	/*
+	 * Range check.  Negative or out-of-range procnumbers return an
+	 * empty result rather than ERRORing because the most natural use
+	 * pattern for cross-backend readers is to iterate every possible
+	 * slot index (a monitoring background worker doesn't know the
+	 * exact NUM_WAIT_EVENT_TIMING_SLOTS at SQL level), and silent-
+	 * empty for out-of-range matches the behaviour of sister functions
+	 * like pg_stat_get_wait_event_timing(NULL) which iterate the
+	 * shared array internally.  FREE-but-in-range slots also return
+	 * empty (see the state check below); the caller cannot
+	 * distinguish out-of-range from FREE, which is fine.
+	 */
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+	/*
+	 * If the trace DSA was never created (no backend in the cluster
+	 * has ever set wait_event_capture = trace), every slot is still
+	 * in its initial FREE state.  Skip without taking the lock.
+	 */
+	if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID)
+		return;
+
+	/*
+	 * Unlocked fast-path check; the authoritative check is under the
+	 * lock below.
+	 */
+	if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_FREE)
+		return;
+
+	wait_event_trace_ensure_dsa();
+	if (trace_dsa == NULL)
+		return;
+
+	/*
+	 * Allocate the worst-case result buffer BEFORE taking the lock.
+	 * The buffer is sized for the full ring (~5 MB at default
+	 * RING_SIZE=128K); on a near-empty ring most of it goes unused,
+	 * but that is preferable to holding the WaitEventTraceCtl lock
+	 * during a palloc that may bottom out in a glibc mmap() syscall
+	 * (allocations above the malloc-mmap threshold).  Glibc's
+	 * arena-internal mutex around the syscall would serialise every
+	 * concurrent reader of this lock through one VMA-modifying
+	 * kernel operation; sizing the alloc outside the lock keeps the
+	 * lock-hold time bounded by the per-record loop alone.
+	 *
+	 * After we acquire the lock we will either consume this buffer
+	 * (writing up to (write_pos - read_start) entries) or release
+	 * it unused on an early return.
+	 *
+	 * Worst-case size = ring size.  Derive it from the GUC on first
+	 * use in this backend; subsequent calls see the cached value.
+	 * The GUC is PGC_POSTMASTER so the value is the same across
+	 * every backend in this postmaster run and never changes.
+	 */
+	if (WaitEventTraceRingSize == 0)
+		WaitEventTraceRingSize =
+			(uint32) wait_event_trace_ring_size_kb * 1024U /
+			(uint32) sizeof(WaitEventTraceRecord);
+	valid_capacity = (uint64) WaitEventTraceRingSize;
+	valid_records = palloc(sizeof(WetValidRecord) * WaitEventTraceRingSize);
+
+	LWLockAcquire(&WaitEventTraceCtl->lock, LW_SHARED);
+
+	/* Authoritative re-check now that slot transitions are excluded. */
+	state_now = pg_atomic_read_u32(&slot->state);
+	if (state_now == WAIT_EVENT_TRACE_SLOT_FREE ||
+		!DsaPointerIsValid(slot->ring_ptr))
+	{
+		LWLockRelease(&WaitEventTraceCtl->lock);
+		pfree(valid_records);
+		return;
+	}
+
+	ts = (WaitEventTraceState *) dsa_get_address(trace_dsa, slot->ring_ptr);
+	write_pos = pg_atomic_read_u64(&ts->write_pos);
+
+	if (write_pos == 0)
+	{
+		LWLockRelease(&WaitEventTraceCtl->lock);
+		pfree(valid_records);
+		return;
+	}
+
+	/* Live range: oldest available to newest. */
+	{
+		uint64		ring_size = (uint64) ts->ring_mask + 1;
+
+		read_start = (write_pos > ring_size)
+			? write_pos - ring_size : 0;
+	}
+
+	/*
+	 * Never visit more positions than valid_records can hold.  The
+	 * buffer was sized from the wait_event_trace_ring_size_kb GUC,
+	 * whereas the range above is sized from the ring's own ring_mask.
+	 * NOTE(review): the two presumably agree whenever the ring allocator
+	 * derives ring_mask from the same GUC value -- confirm.  Should they
+	 * ever differ (for example through rounding to a power of two), an
+	 * unclamped loop would write past the end of the palloc'd buffer, so
+	 * keep only the newest valid_capacity positions.
+	 */
+	if (write_pos - read_start > valid_capacity)
+		read_start = write_pos - valid_capacity;
+
+	for (i = read_start; i < write_pos; i++)
+	{
+		WaitEventTraceRecord *rec_shared =
+			&ts->records[i & ts->ring_mask];
+		WetValidRecord *out = &valid_records[valid_count];
+		uint32		expected_seq;
+		uint32		seq_before;
+		uint32		seq_after;
+
+		/*
+		 * Position-encoded seqlock identity check (NOT just parity).
+		 *
+		 * The writer encodes the ring position into the seq value:
+		 * mid-write -> (uint32)(pos * 2 + 1), complete ->
+		 * (uint32)(pos * 2 + 2).  After RING_SIZE writes the slot wraps
+		 * and the same memory location gets a new seq value that is
+		 * numerically distinct from the previous cycle's seq.
+		 *
+		 * A parity-only check (skip on odd seq, accept on stable even)
+		 * is INSUFFICIENT for this layout in the cross-backend case:
+		 * if the writer just incremented write_pos to pos+1 but
+		 * cross-process cache coherence has not yet propagated the
+		 * subsequent rec->seq = (pos*2+1) store, this reader at
+		 * i = pos would see the previous cycle's complete-even seq
+		 * (from logical position pos - RING_SIZE).  Both seq_before
+		 * and seq_after would read that stale even value, parity
+		 * passes, identity-against-itself passes, and a record
+		 * belonging to the PREVIOUS cycle gets emitted with the new
+		 * ring_index = pos.  Silent data corruption (wrong attribution,
+		 * not torn bytes).
+		 *
+		 * The fix is identity against EXPECTED: a record is valid for
+		 * iterator position i if and only if its seq equals
+		 * (uint32)(i * 2 + 2) -- the writer's encoded "complete" value
+		 * for that exact ring position.  This rejects:
+		 *
+		 *   * Stale prior cycle (seq <  expected): writer hasn't yet
+		 *     advanced rec->seq for the current cycle.
+		 *   * Mid-write current cycle (seq == expected - 1, odd):
+		 *     writer is in the payload write window.
+		 *   * Ring wrapped past us (seq >  expected): the writer
+		 *     completed a later cycle on this slot during our read.
+		 *
+		 * The uint32 wraparound at 2^31 cycles is safe: we use exact
+		 * equality, and the writer's existing wrap-safety argument
+		 * (sizeof(seq) > worst-case in-flight window by 11 orders of
+		 * magnitude) covers the seq value.
+		 */
+		expected_seq = (uint32) (i * 2 + 2);
+
+		seq_before = rec_shared->seq;
+		pg_read_barrier();
+
+		if (seq_before != expected_seq)
+			continue;
+
+		/*
+		 * Copy into the next free output slot.  The slot is only
+		 * claimed (valid_count++) once the second seq read confirms
+		 * the copy; a rejected copy is simply overwritten later.
+		 */
+		out->rec = *rec_shared; /* one 32-byte structure copy */
+
+		pg_read_barrier();
+		seq_after = rec_shared->seq;
+
+		if (seq_after != expected_seq)
+			continue;
+
+		out->ring_index = i;
+		valid_count++;
+	}
+
+	LWLockRelease(&WaitEventTraceCtl->lock);
+
+	/*
+	 * Walk the local result array and emit rows.  No shared-memory
+	 * access from here on, so spills to disk by the tuplestore (if
+	 * the result is large) do not hold any wait-event-timing lock.
+	 */
+	for (i = 0; i < valid_count; i++)
+	{
+		WetValidRecord *vr = &valid_records[i];
+		WaitEventTraceRecord *rec = &vr->rec;
+		Datum		values[6];
+		bool		nulls[6];
+		const char *event_type;
+		const char *event_name;
+		uint8		rtype = rec->record_type;
+		uint32		event_info;
+		int64		duration_ns;
+		int64		query_id;
+
+		if (rtype == TRACE_WAIT_EVENT)
+		{
+			event_info = rec->data.wait.event;
+			duration_ns = rec->data.wait.duration_ns;
+			query_id = 0;
+
+			/* Skip empty wait events. */
+			if (event_info == 0)
+				continue;
+
+			event_type = pgstat_get_wait_event_type(event_info);
+			event_name = pgstat_get_wait_event(event_info);
+		}
+		else if (rtype == TRACE_QUERY_START)
+		{
+			event_info = 0;
+			duration_ns = 0;
+			query_id = rec->data.query.query_id;
+			event_type = "Query";
+			event_name = "QueryStart";
+		}
+		else if (rtype == TRACE_QUERY_END)
+		{
+			event_info = 0;
+			duration_ns = 0;
+			query_id = rec->data.query.query_id;
+			event_type = "Query";
+			event_name = "QueryEnd";
+		}
+		else if (rtype == TRACE_EXEC_START)
+		{
+			event_info = 0;
+			duration_ns = 0;
+			query_id = rec->data.query.query_id;
+			event_type = "Query";
+			event_name = "ExecStart";
+		}
+		else if (rtype == TRACE_EXEC_END)
+		{
+			event_info = 0;
+			duration_ns = 0;
+			query_id = rec->data.query.query_id;
+			event_type = "Query";
+			event_name = "ExecEnd";
+		}
+		else
+		{
+			/* Unrecognised record_type -- skip defensively. */
+			continue;
+		}
+
+		if (event_type == NULL || event_name == NULL)
+			continue;
+
+		memset(nulls, 0, sizeof(nulls));
+
+		values[0] = Int64GetDatum((int64) vr->ring_index);
+		values[1] = Int64GetDatum(rec->timestamp_ns);
+		values[2] = CStringGetTextDatum(event_type);
+		values[3] = CStringGetTextDatum(event_name);
+		values[4] = Float8GetDatum((double) duration_ns / 1000.0);
+		values[5] = Int64GetDatum(query_id);
+
+		tuplestore_putvalues(rsinfo->setResult,
+							 rsinfo->setDesc,
+							 values, nulls);
+	}
+
+	pfree(valid_records);
+}
+
+/*
+ * SQL function: pg_get_wait_event_trace(procnumber int4)
+ *
+ * Cross-backend trace ring reader.  Returns the records from the trace
+ * ring belonging to the backend that currently or previously occupied
+ * the given procNumber slot.  Reads OWNED and ORPHANED slots uniformly;
+ * FREE slots return an empty result.
+ *
+ * This SRF is the in-tree consumer of the orphan-preserved trace data:
+ * a backend that exited while wait_event_capture = trace leaves its
+ * ring allocated in DSA in ORPHANED state, and this function reads it
+ * until either a new backend takes over the same procNumber or the
+ * DBA calls pg_stat_clear_orphaned_wait_event_rings().  External
+ * extensions that need cross-backend access follow the same
+ * snapshot pattern documented on WaitEventTraceControl in
+ * wait_event_timing.h; this function serves as both the reference
+ * implementation and a DBA-facing diagnostic tool.
+ *
+ * Privileges: REVOKE'd from PUBLIC and GRANT'ed to pg_read_all_stats
+ * in system_views.sql, matching the privilege model of the session-
+ * local view pg_backend_wait_event_trace.
+ *
+ * The procnumber argument can be obtained from the procnumber column
+ * of pg_stat_get_wait_event_timing or
+ * pg_stat_get_wait_event_timing_overflow.  For pid-keyed access against
+ * live backends, callers can do:
+ *
+ *   SELECT * FROM pg_get_wait_event_trace(
+ *       (SELECT procnumber FROM pg_stat_get_wait_event_timing(<pid>)
+ *        WHERE pid = <pid> LIMIT 1));
+ *
+ * Note that pid-keyed access cannot read ORPHANED slots because a
+ * dying backend's pid is removed from procArray on exit; for
+ * post-mortem reading of short-lived backends (parallel workers,
+ * autovacuum, walsender) the procNumber must be captured before the
+ * backend exits, or discovered by iterating procnumbers in a
+ * monitoring background worker.
+ */
+Datum
+pg_get_wait_event_trace(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	int32		procNumber = PG_GETARG_INT32(0);
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	emit_wait_event_trace_for_procnumber((int) procNumber, rsinfo);
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * Request a self-reset on the given backend slot.
+ *
+ * Lock-free: atomically bumps the slot's reset_generation, then sets the
+ * target's process latch so an idle backend wakes up and completes its
+ * current wait event (which triggers pgstat_report_wait_end_timing, which
+ * observes the generation change and performs the reset).  If the target
+ * slot is currently unoccupied the SetLatch is a harmless no-op.
+ */
+static void
+wait_event_timing_request_reset(int slot_idx)
+{
+	Assert(slot_idx >= 0 && slot_idx < NUM_WAIT_EVENT_TIMING_SLOTS);
+
+	/*
+	 * If no backend has ever enabled capture, the shared array does not
+	 * exist yet -- there is nothing to reset.  Attach read-only and
+	 * return early if that fails: without the array there is no
+	 * generation counter to bump and no live backend is tracking, so
+	 * the latch wake-up below is skipped as well.
+	 */
+	if (!wait_event_timing_attach_array(false))
+		return;
+
+	pg_atomic_fetch_add_u32(&wet_slot(slot_idx)->reset_generation, 1);
+
+	/*
+	 * Wake the target if it is sleeping in WaitLatch/WaitEventSetWait so
+	 * that it completes its current wait promptly and observes the reset
+	 * request.  The slot index is also the PGPROC array index
+	 * (pgstat_set_wait_event_timing_storage is called with procNumber).
+	 *
+	 * Even if no live backend currently owns the slot, setting the latch
+	 * on the stale PGPROC is harmless -- latches in shared memory are
+	 * durable and no process is waiting on it.
+	 */
+	if (ProcGlobal != NULL && ProcGlobal->allProcs != NULL)
+		SetLatch(&ProcGlobal->allProcs[slot_idx].procLatch);
+}
+
+/*
+ * SQL function: pg_stat_get_wait_event_timing_overflow()
+ *
+ * Exposes the per-backend truncation counters that are otherwise
+ * write-only: without these, a user has no way to tell from SQL whether
+ * their stats are complete or whether the hash table / flat array was
+ * saturated mid-session and silently dropped events.
+ *
+ *   lwlock_overflow_count: number of LWLock wait events that could not
+ *       be recorded because the per-backend LWLock timing hash
+ *       (capped by wait_event_timing_max_tranches) was full.
+ *   flat_overflow_count:   number of non-LWLock wait events that
+ *       resolved to an unknown / out-of-range class index and therefore
+ *       could not be mapped to a histogram slot.
+ *   reset_count:           number of resets this backend has *observed
+ *       and acted on*, NOT a request counter.  Own-backend resets are
+ *       synchronous and bump this once per call.  Cross-backend resets
+ *       coalesce: if multiple pg_stat_reset_wait_event_timing(target)
+ *       calls land between two of the target's wait_ends, the target
+ *       observes them as a single reset and reset_count increments
+ *       only once.  Callers polling for asynchronous-reset
+ *       acknowledgment should watch for any increment (N -> N+1).
+ *
+ * One row (pid, backend_type, procnumber, plus the three counters) per
+ * live backend, filtered by HAS_PGSTAT_PERMISSIONS like
+ * pg_stat_get_wait_event_timing().  The optional pid argument has the
+ * same semantics as there: NULL means all backends, a non-NULL value
+ * restricts the sweep to that single backend (silently empty if unknown).
+ */
+Datum
+pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	int			start_idx;
+	int			end_idx;
+	int			backend_idx;
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	if (!wait_event_timing_attach_array(false))
+		PG_RETURN_VOID();
+
+	if (!wait_event_timing_pid_range(fcinfo, &start_idx, &end_idx))
+		PG_RETURN_VOID();
+
+	for (backend_idx = start_idx; backend_idx < end_idx; backend_idx++)
+	{
+		WaitEventTimingState *state = wet_slot(backend_idx);
+		PgBackendStatus *beentry;
+		Datum		values[6];
+		bool		nulls[6];
+
+		beentry = pgstat_get_beentry_by_proc_number(backend_idx);
+		if (beentry == NULL)
+			continue;
+		if (!HAS_PGSTAT_PERMISSIONS(beentry->st_userid))
+			continue;
+
+		memset(nulls, 0, sizeof(nulls));
+
+		values[0] = Int32GetDatum(beentry->st_procpid);
+		values[1] = CStringGetTextDatum(GetBackendTypeDesc(beentry->st_backendType));
+		values[2] = Int32GetDatum(backend_idx);
+		values[3] = Int64GetDatum(state->lwlock_overflow_count);
+		values[4] = Int64GetDatum(state->flat_overflow_count);
+		values[5] = Int64GetDatum(state->reset_count);
+
+		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+							 values, nulls);
+	}
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * SQL function: pg_stat_reset_wait_event_timing(pid int4)
+ *
+ * Resets wait-event-timing counters for a single backend, identified by PID.
+ *
+ *   NULL (or MyProcPid): reset caller's own session synchronously --
+ *                        single writer, no lock needed.
+ *   another PID:         cross-backend reset request; needs pg_signal_backend.
+ *   unknown / dead PID:  silent no-op, matching pg_stat_reset_backend_stats.
+ *
+ * To reset every backend, use pg_stat_reset_wait_event_timing_all().
+ *
+ * Cross-backend resets are asynchronous by design: the function atomically
+ * bumps the target slot's reset_generation counter and wakes the target's
+ * latch; the owning backend observes the change on its next wait_end and
+ * clears its own counters.  This keeps the hot path lock-free and avoids
+ * the cross-writer races that plagued an earlier LWLock-based design.
+ *
+ * Visibility is near-immediate for active backends (their next event ends
+ * within microseconds) and is bounded by the target's wait duration for
+ * idle backends -- SetLatch shortens that by interrupting any current
+ * WaitLatch.  The function returns before the reset has been observed;
+ * callers that need strict read-after-reset semantics should either
+ * target their own backend (where reset is synchronous) or poll the
+ * target's reset_count column in pg_stat_wait_event_timing_overflow
+ * until it increments.
+ */
+Datum
+pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS)
+{
+	int			target_pid;
+	PGPROC	   *proc;
+	int			procNumber;
+
+	if (PG_ARGISNULL(0) || PG_GETARG_INT32(0) == MyProcPid)
+	{
+		/*
+		 * Reset own backend.  Synchronous: no lock or atomic indirection
+		 * needed.  If capture has never been enabled in this backend yet,
+		 * my_wait_event_timing is still NULL; nothing to reset.
+		 *
+		 * wait_start is already zero here -- pgstat_report_wait_end_timing
+		 * zeros it at the end of every wait, and the backend cannot be mid-
+		 * wait while it is executing this SQL function -- so there is no
+		 * in-flight measurement to preserve.  We zero current_event for the
+		 * same hygiene reason as the cross-backend reset path above: keep
+		 * external readers of the slot from seeing stale state between
+		 * waits.
+		 */
+		if (my_wait_event_timing != NULL)
+		{
+			memset(my_wait_event_timing->events, 0,
+				   sizeof(my_wait_event_timing->events));
+			lwlock_timing_hash_clear(my_wait_event_timing);
+			my_wait_event_timing->reset_count++;
+			my_wait_event_timing->lwlock_overflow_count = 0;
+			my_wait_event_timing->flat_overflow_count = 0;
+			my_wait_event_timing->current_event = 0;
+		}
+		PG_RETURN_VOID();
+	}
+
+	/*
+	 * Cross-backend reset requires pg_signal_backend membership, matching
+	 * the privilege model of pg_stat_reset_backend_stats(int4 pid) (the
+	 * closest existing per-backend reset in the wider stats family).
+	 *
+	 * Why pg_signal_backend rather than naked superuser():
+	 *
+	 * 1) Operational alignment.  The role pg_signal_backend exists
+	 *    specifically for "the operator who acts on other backends'
+	 *    state" -- it gates pg_terminate_backend, pg_cancel_backend,
+	 *    and pg_stat_reset_backend_stats already.  Resetting another
+	 *    backend's wait-event timing is structurally the same kind of
+	 *    operation (per-PID, addressable, bounded blast radius), so it
+	 *    belongs to the same role.  Demanding superuser would create a
+	 *    surplus-privilege gap: a DBA who can already TERMINATE the
+	 *    target backend (strictly more invasive than resetting its
+	 *    counters) would need to escalate to superuser just to wipe
+	 *    its stats, which is operationally backwards.
+	 *
+	 * 2) Cluster-wide reset is a different decision.  See
+	 *    pg_stat_reset_wait_event_timing_all() below, which keeps the
+	 *    stricter superuser() gate -- different blast radius, different
+	 *    role.  This split (per-backend = pg_signal_backend, cluster-wide
+	 *    = superuser) reflects the principle that the role required for
+	 *    an operation should match what the operation can affect.  The
+	 *    fact that pg_stat_reset() (cluster-wide) actually only requires
+	 *    pg_read_all_stats today is an inconsistency in PG's existing
+	 *    surface; we deliberately do not extend that inconsistency here.
+	 *
+	 * 3) Information-disclosure concern is bounded.  The only
+	 *    "destructive" property of a stats reset is that it erases
+	 *    forensic evidence of past wait events.  Anyone with
+	 *    pg_signal_backend can already terminate the target backend --
+	 *    which terminates that forensic record by destroying the
+	 *    backend itself.  A counter wipe is strictly less invasive.
+	 */
+	if (!has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND))
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("permission denied to reset another backend's wait event timing"),
+				 errdetail("Only roles with privileges of the \"pg_signal_backend\" role may reset another backend's wait event timing.")));
+
+	target_pid = PG_GETARG_INT32(0);
+
+	/* Look up the target.  Try regular backends first, then aux. */
+	proc = BackendPidGetProc(target_pid);
+	if (proc == NULL)
+		proc = AuxiliaryPidGetProc(target_pid);
+
+	/* Unknown / dead PID: silent no-op, matching pg_stat_reset_backend_stats. */
+	if (proc == NULL)
+		PG_RETURN_VOID();
+
+	procNumber = GetNumberFromPGProc(proc);
+
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		PG_RETURN_VOID();
+
+	wait_event_timing_request_reset(procNumber);
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * Reset wait-event-timing counters for every backend.  Superuser only.
+ *
+ * Every slot, occupied or not, is asked to self-reset on its next wait end
+ * (owner-cleared) and has its latch set; see wait_event_timing_request_reset.
+ * Returns before the resets have been observed -- callers that need strict
+ * read-after-reset semantics should poll the targets' reset_count columns.
+ *
+ * Privilege model rationale (intentional asymmetry with the per-backend
+ * variant pg_stat_reset_wait_event_timing(pid)):
+ *
+ *   * Per-backend reset uses pg_signal_backend, matching
+ *     pg_stat_reset_backend_stats(pid).  The blast radius is one PID;
+ *     anyone who can pg_terminate_backend the target can already
+ *     destroy more forensic state than a counter wipe would.
+ *
+ *   * Cluster-wide reset is gated tighter because the blast radius is
+ *     every backend in the cluster.  An operator with pg_signal_backend
+ *     can disrupt one PID at a time (and must specify which); the
+ *     cluster-wide reset wipes ALL backends' historical counters in a
+ *     single call, which is meaningfully different in two ways:
+ *
+ *       (a) it can hide cross-tenant patterns that a forensic audit
+ *           would have wanted to compare across backends, and
+ *
+ *       (b) it removes the per-call addressability that makes the
+ *           per-backend variant auditable -- a log entry showing "user
+ *           X reset PID Y" is more actionable than "user X wiped
+ *           everything."
+ *
+ *     Requiring superuser for the cluster-wide variant matches the
+ *     general PG principle that scope of authority should match scope
+ *     of effect.  We deliberately do NOT mirror pg_stat_reset(), which
+ *     today is gated only on pg_read_all_stats despite being similarly
+ *     cluster-wide -- that's a pre-existing inconsistency in the wider
+ *     stats family and not one we want to extend.
+ */
+Datum
+pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS)
+{
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be a superuser to reset wait event timing for all backends")));
+
+	for (int i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++)
+		wait_event_timing_request_reset(i);
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * SQL function: pg_stat_clear_orphaned_wait_event_rings()
+ *
+ * Free every trace ring whose owner has exited (slot state ORPHANED).
+ * Returns the number of rings released.
+ *
+ * Why this exists.  When a backend that had wait_event_capture = trace
+ * exits, we deliberately do NOT free its ~4 MB trace ring (see the
+ * lifecycle discussion on WaitEventTraceControl): the data must remain
+ * readable by cross-backend consumers -- the in-tree
+ * pg_get_wait_event_trace SRF and any extension following the
+ * snapshot pattern on WaitEventTraceControl -- and an exit-time
+ * dsa_free would defeat that.
+ * The reclaim instead happens lazily in two places:
+ *
+ *   (a) wait_event_trace_clear_orphan_at_init(): when a new backend
+ *       inherits the same procNumber slot at init, it frees the prior
+ *       orphan as part of starting clean.  This handles the common
+ *       case (busy clusters with connection churn) automatically.
+ *
+ *   (b) THIS FUNCTION: an explicit DBA-driven sweep that releases
+ *       every currently orphaned ring at once.
+ *
+ * The pathological case that (a) does not handle is "capture briefly
+ * enabled, then disabled, on a cluster with long-lived pooled
+ * connections that never exit".  In that scenario procNumbers do not
+ * recycle, so prior orphans persist until cluster restart unless the
+ * DBA calls this function.  Worst-case bound is
+ * NUM_WAIT_EVENT_TIMING_SLOTS * sizeof(WaitEventTraceState), which at
+ * ~4 MB per ring is ~400 MB at MaxBackends=100 and ~4 GB at
+ * MaxBackends=1000 -- bounded but worth a kill switch.
+ *
+ * Permissions: superuser-only, matching the cluster-wide reset
+ * (pg_stat_reset_wait_event_timing_all).  This is a
+ * cluster-scope memory-reclamation operation: it can disrupt any
+ * concurrent cross-backend reader on any orphaned slot.  The
+ * disruption is bounded (readers retry via the generation counter
+ * and at worst skip one read) but the operation is still
+ * cluster-wide, so the privilege model matches the reset variant
+ * with the same blast radius.
+ *
+ * The function is safe to call even when no orphans exist (returns
+ * 0) and even when capture is currently OFF (the slot array exists
+ * unconditionally; only the rings are lazy).
+ */
+Datum
+pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS)
+{
+	int64		freed = 0;
+	int			i;
+
+	if (!superuser())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("must be a superuser to clear orphaned wait event "
+						"trace rings")));
+
+	if (WaitEventTraceCtl == NULL)
+		PG_RETURN_INT64(0);
+
+	/*
+	 * If no backend has ever enabled trace, the trace DSA was never
+	 * created and there cannot be any ORPHANED slots: every slot is
+	 * still in its initial FREE state.  Nothing to do.
+	 */
+	if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID)
+		PG_RETURN_INT64(0);
+
+	/* Attach to the trace DSA so dsa_free() can be called. */
+	wait_event_trace_ensure_dsa();
+	if (trace_dsa == NULL)
+		PG_RETURN_INT64(0);
+
+	/*
+	 * Walk every slot, taking and releasing WaitEventTraceCtl->lock per
+	 * slot rather than holding it across the entire sweep.
+	 *
+	 * Rationale: at MaxBackends = 1000 with a fully-orphaned cluster
+	 * the per-slot work (atomic state read + dsa_free + ring_ptr
+	 * clear + atomic state write) totals a few microseconds; holding
+	 * the lock across all slots would yield a millisecond-scale
+	 * lock-hold window during which every concurrent backend startup
+	 * (the lazy wait_event_trace_clear_orphan_at_init path), every
+	 * cross-backend reader (pg_get_wait_event_trace and the external
+	 * snapshot pattern), and every capture step-down or restore
+	 * would stall.  PG's general convention is to keep LWLock-held
+	 * windows in paths that compete with regular activity well under
+	 * 100 microseconds; per-slot release/reacquire gives us a worst-
+	 * case lock-hold of one slot's worth of work regardless of how
+	 * many orphans exist cluster-wide.
+	 *
+	 * An unlocked fast-path read of slot->state skips non-ORPHANED
+	 * slots without an LWLockAcquire/Release pair.  This is safe: if
+	 * a slot races from non-ORPHANED to ORPHANED after we read it,
+	 * we miss that orphan -- but the function is documented as a
+	 * snapshot sweep, the missed orphan can be cleared by a
+	 * subsequent call, and the same race exists for orphans that
+	 * appear after the loop ends.  The authoritative re-check under
+	 * the lock prevents racing on the dsa_free direction (we never
+	 * free a slot whose owner became OWNED again).
+	 *
+	 * CHECK_FOR_INTERRUPTS at the top of the loop body lets the
+	 * caller cancel a long sweep; with the previous single-lock
+	 * structure the InterruptHoldoffCount elevation from
+	 * LWLockAcquire deferred all cancellation until release.
+	 */
+	for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++)
+	{
+		WaitEventTraceSlot *slot = &WaitEventTraceCtl->trace_slots[i];
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* Unlocked fast-path: skip non-ORPHANED slots cheaply. */
+		if (pg_atomic_read_u32(&slot->state) != WAIT_EVENT_TRACE_SLOT_ORPHANED)
+			continue;
+
+		LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+		/*
+		 * Re-check under the lock: clear_orphan_at_init may have freed
+		 * this slot.  Bump generation before freeing so readers retry.
+		 */
+		if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_ORPHANED &&
+			DsaPointerIsValid(slot->ring_ptr))
+		{
+			pg_atomic_fetch_add_u64(&slot->generation, 1);
+			dsa_free(trace_dsa, slot->ring_ptr);
+			slot->ring_ptr = InvalidDsaPointer;
+			pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+			freed++;
+		}
+
+		LWLockRelease(&WaitEventTraceCtl->lock);
+	}
+
+	PG_RETURN_INT64(freed);
+}
+
+#endif							/* USE_WAIT_EVENT_TIMING */
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 2460e550f96e2..d1dceda12df7b 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -70,6 +70,7 @@
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
 #include "utils/timeout.h"
+#include "utils/wait_event_timing.h"
 
 /* has this backend called EmitConnectionWarnings()? */
 static bool ConnectionWarningsEmitted;
@@ -1244,6 +1245,12 @@ InitPostgres(const char *in_dbname, Oid dboid,
 	/* Process pg_db_role_setting options */
 	process_settings(MyDatabaseId, GetSessionUserId());
 
+#ifdef USE_WAIT_EVENT_TIMING
+	/* Attach trace ring if wait_event_capture = trace was set via config/db/role settings */
+	if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && my_trace_proc_number >= 0)
+		wait_event_trace_attach(my_trace_proc_number);
+#endif
+
 	/* Apply PostAuthDelay as soon as we've read all options */
 	if (PostAuthDelay > 0)
 		pg_usleep(PostAuthDelay * 1000000L);
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index afaa058b046c9..a1cf02c6ce91c 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -3425,6 +3425,35 @@
   boot_val => 'true',
 },
 
+{ name => 'wait_event_capture', type => 'enum', context => 'PGC_SUSET', group => 'STATS_CUMULATIVE',
+  short_desc => 'Controls collection of per-wait-event timing statistics and (optionally) per-session event tracing.',
+  variable => 'wait_event_capture',
+  boot_val => 'WAIT_EVENT_CAPTURE_OFF',
+  options => 'wait_event_capture_options',
+  check_hook => 'check_wait_event_capture',
+  assign_hook => 'assign_wait_event_capture',
+},
+
+{ name => 'wait_event_timing_max_tranches', type => 'int', context => 'PGC_POSTMASTER', group => 'STATS_CUMULATIVE',
+  short_desc => 'Sets the maximum number of distinct LWLock tranches whose timing is recorded per backend.',
+  long_desc => 'Each backend\'s wait-event-timing hash table can hold this many distinct LWLock tranches; subsequent tranches are counted against lwlock_overflow_count and not individually timed.  Sized at server start; raise this if your installation loads many extensions that register their own LWLock tranches and you observe non-zero lwlock_overflow_count in pg_stat_wait_event_timing_overflow.',
+  variable => 'wait_event_timing_max_tranches',
+  boot_val => '192',
+  min => '16',
+  max => '65534',
+},
+
+{ name => 'wait_event_trace_ring_size_kb', type => 'int', context => 'PGC_POSTMASTER', group => 'STATS_CUMULATIVE',
+  short_desc => 'Per-backend wait-event-trace ring buffer size, in kilobytes.',
+  long_desc => 'Each backend that enables wait_event_capture = trace allocates a ring buffer of this size from a cluster-wide DSA.  The value must be a power of two and is sized at server start.  Larger rings retain longer histories before wrapping; smaller rings reduce per-backend memory at high max_connections.  Worst-case total memory is approximately max_connections times this value.',
+  flags => 'GUC_UNIT_KB',
+  variable => 'wait_event_trace_ring_size_kb',
+  boot_val => '4096',
+  min => '8',
+  max => '32768',
+  check_hook => 'check_wait_event_trace_ring_size_kb',
+},
+
 { name => 'wal_block_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS',
   short_desc => 'Shows the block size in the write ahead log.',
   flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE',
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 290ccbc543e25..25a3b139523e8 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -103,6 +103,7 @@
 #include "utils/plancache.h"
 #include "utils/ps_status.h"
 #include "utils/rls.h"
+#include "utils/wait_event_timing.h"
 #include "utils/xml.h"
 
 #ifdef TRACE_SYNCSCAN
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ac38cddaaf9a6..e854ad329a375 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -700,6 +700,10 @@
 #track_cost_delay_timing = off
 #track_io_timing = off
 #track_wal_io_timing = off
+#wait_event_capture = off               # off, stats, trace
+#wait_event_timing_max_tranches = 192   # (change requires restart)
+#wait_event_trace_ring_size_kb = 4096   # (change requires restart)
+                                        # must be power of two, 8 .. 32768
 #track_functions = none                 # none, pl, all
 #stats_fetch_consistency = cache        # cache, none, snapshot
 
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index a1416260abcbf..32ba1d391e146 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -57,6 +57,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	202605131
+#define CATALOG_VERSION_NO	202605151
 
 #endif
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index be157a5fbe90c..1bb610167cb6b 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12693,4 +12693,64 @@
   proname => 'hashoid8extended', prorettype => 'int8',
   proargtypes => 'oid8 int8', prosrc => 'hashoid8extended' },
 
+{ oid => '9956',
+  descr => 'statistics: per-backend wait event timing (count, duration, histogram)',
+  proname => 'pg_stat_get_wait_event_timing', prorows => '1000',
+  proisstrict => 'f', proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,int4,text,int4,text,text,int8,float8,float8,float8,_int8}',
+  proargmodes => '{i,o,o,o,o,o,o,o,o,o,o}',
+  proargnames => '{pid,pid,backend_type,procnumber,wait_event_type,wait_event,calls,total_time_ms,avg_time_us,max_time_us,histogram}',
+  prosrc => 'pg_stat_get_wait_event_timing' },
+
+
+{ oid => '9957',
+  descr => 'current backend wait event trace ring buffer',
+  proname => 'pg_get_backend_wait_event_trace', prorows => '1000',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => '',
+  proallargtypes => '{int8,int8,text,text,float8,int8}',
+  proargmodes => '{o,o,o,o,o,o}',
+  proargnames => '{seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}',
+  prosrc => 'pg_get_backend_wait_event_trace' },
+
+{ oid => '9958',
+  descr => 'statistics: reset wait event timing counters for the given backend (NULL = own)',
+  proname => 'pg_stat_reset_wait_event_timing', proisstrict => 'f',
+  provolatile => 'v', prorettype => 'void', proargtypes => 'int4',
+  proargnames => '{pid}',
+  prosrc => 'pg_stat_reset_wait_event_timing' },
+
+{ oid => '9959',
+  descr => 'statistics: per-backend wait event timing overflow counters (rows lost to LWLock hash / flat array overflow)',
+  proname => 'pg_stat_get_wait_event_timing_overflow', prorows => '1000',
+  proisstrict => 'f', proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,int4,text,int4,int8,int8,int8}',
+  proargmodes => '{i,o,o,o,o,o,o}',
+  proargnames => '{pid,pid,backend_type,procnumber,lwlock_overflow_count,flat_overflow_count,reset_count}',
+  prosrc => 'pg_stat_get_wait_event_timing_overflow' },
+
+{ oid => '9960',
+  descr => 'statistics: reset wait event timing counters for all backends (superuser only)',
+  proname => 'pg_stat_reset_wait_event_timing_all',
+  provolatile => 'v', prorettype => 'void', proargtypes => '',
+  prosrc => 'pg_stat_reset_wait_event_timing_all' },
+
+{ oid => '9961',
+  descr => 'statistics: free wait-event-trace rings whose owner backend has exited (superuser only); returns count freed',
+  proname => 'pg_stat_clear_orphaned_wait_event_rings',
+  provolatile => 'v', prorettype => 'int8', proargtypes => '',
+  prosrc => 'pg_stat_clear_orphaned_wait_event_rings' },
+
+{ oid => '9962',
+  descr => 'wait event trace ring for the given procnumber slot (OWNED or ORPHANED)',
+  proname => 'pg_get_wait_event_trace', prorows => '1000',
+  proretset => 't', provolatile => 'v', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,int8,int8,text,text,float8,int8}',
+  proargmodes => '{i,o,o,o,o,o,o}',
+  proargnames => '{procnumber,seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}',
+  prosrc => 'pg_get_wait_event_trace' },
+
 ]
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 4f8113c144b0c..ed0b7f26f9f5a 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -762,6 +762,9 @@
 /* Define to select unnamed POSIX semaphores. */
 #undef USE_UNNAMED_POSIX_SEMAPHORES
 
+/* Define to 1 to build with wait event timing. (--enable-wait-event-timing) */
+#undef USE_WAIT_EVENT_TIMING
+
 /* Define to select Win32-style semaphores. */
 #undef USE_WIN32_SEMAPHORES
 
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index d7eb648bd2758..26ccc0cf486f9 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -140,3 +140,5 @@ PG_LWLOCKTRANCHE(XACT_SLRU, XactSLRU)
 PG_LWLOCKTRANCHE(PARALLEL_VACUUM_DSA, ParallelVacuumDSA)
 PG_LWLOCKTRANCHE(AIO_URING_COMPLETION, AioUringCompletion)
 PG_LWLOCKTRANCHE(SHMEM_INDEX, ShmemIndex)
+PG_LWLOCKTRANCHE(WAIT_EVENT_TRACE_DSA, WaitEventTraceDSA)
+PG_LWLOCKTRANCHE(WAIT_EVENT_TIMING_DSA, WaitEventTimingDSA)
diff --git a/src/include/storage/subsystemlist.h b/src/include/storage/subsystemlist.h
index 9ad619080be22..90b142354644c 100644
--- a/src/include/storage/subsystemlist.h
+++ b/src/include/storage/subsystemlist.h
@@ -79,6 +79,8 @@ PG_SHMEM_SUBSYSTEM(SyncScanShmemCallbacks)
 PG_SHMEM_SUBSYSTEM(AsyncShmemCallbacks)
 PG_SHMEM_SUBSYSTEM(StatsShmemCallbacks)
 PG_SHMEM_SUBSYSTEM(WaitEventCustomShmemCallbacks)
+PG_SHMEM_SUBSYSTEM(WaitEventTimingShmemCallbacks)
+PG_SHMEM_SUBSYSTEM(WaitEventTraceControlShmemCallbacks)
 #ifdef USE_INJECTION_POINTS
 PG_SHMEM_SUBSYSTEM(InjectionPointShmemCallbacks)
 #endif
diff --git a/src/include/utils/.gitignore b/src/include/utils/.gitignore
index ff6f61cd7ee7b..8a489b7769b16 100644
--- a/src/include/utils/.gitignore
+++ b/src/include/utils/.gitignore
@@ -6,4 +6,5 @@
 /header-stamp
 /pgstat_wait_event.c
 /wait_event_funcs_data.c
+/wait_event_timing_data.h
 /wait_event_types.h
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index dc406d6651aa2..309d5e87967a0 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -346,6 +346,7 @@ extern PGDLLIMPORT const struct config_enum_entry dynamic_shared_memory_options[
 extern PGDLLIMPORT const struct config_enum_entry io_method_options[];
 extern PGDLLIMPORT const struct config_enum_entry recovery_target_action_options[];
 extern PGDLLIMPORT const struct config_enum_entry server_message_level_options[];
+extern PGDLLIMPORT const struct config_enum_entry wait_event_capture_options[];
 extern PGDLLIMPORT const struct config_enum_entry wal_level_options[];
 extern PGDLLIMPORT const struct config_enum_entry wal_sync_method_options[];
 
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index 307f4fbaefe08..0cd528ecfb3f7 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -172,6 +172,9 @@ extern bool check_transaction_isolation(int *newval, void **extra, GucSource sou
 extern bool check_transaction_read_only(bool *newval, void **extra, GucSource source);
 extern void assign_transaction_timeout(int newval, void *extra);
 extern const char *show_unix_socket_permissions(void);
+extern bool check_wait_event_capture(int *newval, void **extra, GucSource source);
+extern void assign_wait_event_capture(int newval, void *extra);
+extern bool check_wait_event_trace_ring_size_kb(int *newval, void **extra, GucSource source);
 extern bool check_wal_buffers(int *newval, void **extra, GucSource source);
 extern bool check_wal_consistency_checking(char **newval, void **extra,
 										   GucSource source);
diff --git a/src/include/utils/meson.build b/src/include/utils/meson.build
index fd3a2352df5d4..ef8b2dc261811 100644
--- a/src/include/utils/meson.build
+++ b/src/include/utils/meson.build
@@ -1,6 +1,6 @@
 # Copyright (c) 2022-2026, PostgreSQL Global Development Group
 
-wait_event_output = ['wait_event_types.h', 'pgstat_wait_event.c', 'wait_event_funcs_data.c']
+wait_event_output = ['wait_event_types.h', 'pgstat_wait_event.c', 'wait_event_funcs_data.c', 'wait_event_timing_data.h']
 wait_event_target = custom_target('wait_event_names',
   input: files('../../backend/utils/activity/wait_event_names.txt'),
   output: wait_event_output,
@@ -11,7 +11,7 @@ wait_event_target = custom_target('wait_event_names',
   ],
   build_by_default: true,
   install: true,
-  install_dir: [dir_include_server / 'utils', false, false],
+  install_dir: [dir_include_server / 'utils', false, false, false],
 )
 
 wait_event_types_h = wait_event_target[0]
diff --git a/src/include/utils/wait_classes.h b/src/include/utils/wait_classes.h
index b91690a22c63b..c6c692a1e9391 100644
--- a/src/include/utils/wait_classes.h
+++ b/src/include/utils/wait_classes.h
@@ -26,4 +26,13 @@
 #define PG_WAIT_IO					0x0A000000U
 #define PG_WAIT_INJECTIONPOINT		0x0B000000U
 
+/*
+ * Bit-layout masks for wait_event_info.  The high byte encodes the
+ * class (one of the PG_WAIT_* constants above); the low 16 bits
+ * encode the per-class event id; the middle byte is currently
+ * reserved (see pgstat_report_wait_start in wait_event.h).
+ */
+#define WAIT_EVENT_CLASS_MASK		0xFF000000U
+#define WAIT_EVENT_ID_MASK			0x0000FFFFU
+
 #endif							/* WAIT_CLASSES_H */
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index 86ee348220d7f..0ea5066027d19 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -13,6 +13,10 @@
 /* enums for wait events */
 #include "utils/wait_event_types.h"
 
+#ifdef USE_WAIT_EVENT_TIMING
+#include "utils/wait_event_timing.h"
+#endif
+
 extern const char *pgstat_get_wait_event(uint32 wait_event_info);
 extern const char *pgstat_get_wait_event_type(uint32 wait_event_info);
 static inline void pgstat_report_wait_start(uint32 wait_event_info);
@@ -22,6 +26,11 @@ extern void pgstat_reset_wait_event_storage(void);
 
 extern PGDLLIMPORT uint32 *my_wait_event_info;
 
+#ifdef USE_WAIT_EVENT_TIMING
+extern void pgstat_report_wait_start_timing(uint32 wait_event_info);
+extern void pgstat_report_wait_end_timing(int capture_level);
+#endif
+
 
 /*
  * Wait Events - Extension, InjectionPoint
@@ -61,6 +70,9 @@ extern char **GetWaitEventCustomNames(uint32 classId, int *nwaitevents);
  *
  *	my_wait_event_info initially points to local memory, making it safe to
  *	call this before MyProc has been initialized.
+ *
+ *	When compiled with --enable-wait-event-timing and wait_event_capture is
+ *	not off, also records the start timestamp for pgstat_report_wait_end().
  * ----------
  */
 static inline void
@@ -71,17 +83,53 @@ pgstat_report_wait_start(uint32 wait_event_info)
 	 * four-bytes, updates are atomic.
 	 */
 	*(volatile uint32 *) my_wait_event_info = wait_event_info;
+
+#ifdef USE_WAIT_EVENT_TIMING
+	/*
+	 * Minimal inline gate: one global load + predicted-not-taken branch.
+	 * Keeping the gate body out-of-line in pgstat_report_wait_start_timing()
+	 * shrinks the inlined call sites and limits the codegen impact on host
+	 * functions (LWLockAcquire, XLogInsert, etc.) to a few bytes each.
+	 *
+	 * unlikely(): wait_event_capture defaults to OFF and is OFF on the
+	 * vast majority of installations.  The annotation steers the compiler
+	 * to lay out the no-op fall-through as the straight-line hot path.
+	 */
+	if (unlikely(wait_event_capture != WAIT_EVENT_CAPTURE_OFF))
+		pgstat_report_wait_start_timing(wait_event_info);
+#endif
 }
 
 /* ----------
  * pgstat_report_wait_end() -
  *
  *	Called to report end of a wait.
+ *
+ *	When compiled with --enable-wait-event-timing and wait_event_capture is
+ *	not off, calls the out-of-line pgstat_report_wait_end_timing() to compute
+ *	the wait duration and accumulate statistics.  The body is kept
+ *	out-of-line to reduce I-cache pressure at the many call sites.
  * ----------
  */
 static inline void
 pgstat_report_wait_end(void)
 {
+#ifdef USE_WAIT_EVENT_TIMING
+	/*
+	 * Minimal inline gate.  See pgstat_report_wait_start() for the
+	 * unlikely() rationale.  wait_event_capture is loaded once here and
+	 * the same value is handed to pgstat_report_wait_end_timing(), so the
+	 * out-of-line body sees exactly the level the gate tested and does
+	 * not need to re-load the global after the call.
+	 */
+	{
+		int		capture_level = wait_event_capture;
+
+		if (unlikely(capture_level != WAIT_EVENT_CAPTURE_OFF))
+			pgstat_report_wait_end_timing(capture_level);
+	}
+#endif
+
 	/* see pgstat_report_wait_start() */
 	*(volatile uint32 *) my_wait_event_info = 0;
 }
diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h
new file mode 100644
index 0000000000000..77563be29d2b2
--- /dev/null
+++ b/src/include/utils/wait_event_timing.h
@@ -0,0 +1,742 @@
+/*-------------------------------------------------------------------------
+ *
+ * wait_event_timing.h
+ *	  Per-backend wait event timing and histogram infrastructure.
+ *
+ * When enabled via the wait_event_capture GUC, every call to
+ * pgstat_report_wait_start()/pgstat_report_wait_end() records the wait
+ * duration and accumulates per-event statistics (count, total time,
+ * histogram) in shared memory.
+ *
+ * The overhead is two clock_gettime(CLOCK_MONOTONIC) calls per wait event
+ * transition (~40-100 ns via VDSO), plus a few memory writes to per-backend
+ * arrays.  No locking is needed because each backend writes only to its own
+ * stats slot.
+ *
+ * Statistics are exposed via the pg_stat_wait_event_timing view
+ * and pg_stat_get_wait_event_timing() SQL function.
+ *
+ * Copyright (c) 2026, PostgreSQL Global Development Group
+ *
+ * src/include/utils/wait_event_timing.h
+ *-------------------------------------------------------------------------
+ */
+#ifndef WAIT_EVENT_TIMING_H
+#define WAIT_EVENT_TIMING_H
+
+#include "port/atomics.h"
+#include "port/pg_bitutils.h"
+#include "portability/instr_time.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "utils/dsa.h"
+#include "utils/wait_event_types.h"
+
+/*
+ * Number of log2 histogram buckets.  Bin edges are powers of two on the
+ * nanosecond axis: bucket i covers [2^(i+9), 2^(i+10)) ns, except bucket
+ * 0 which covers [0, 1024) ns and the last bucket which covers
+ * [2^(NBUCKETS+8), infinity) ns.  These boundaries approximate the
+ * decimal-microsecond grid (1024 ≈ 1 us, 2048 ≈ 2 us, ...), which lets
+ * wait_event_timing_bucket() avoid a /1000 on the hot path.
+ *
+ * 32 buckets cover from <1us through ~512s-1024s, with the last
+ * bucket open-ended at 2^40 ns ≈ 1099 s ≈ ~18 minutes.  Sample edges:
+ *
+ *   bucket  0:  [0, 1024) ns                 <1us
+ *   bucket  1:  [1024, 2048) ns              1-2us
+ *   bucket 14:  [2^23, 2^24) ns              8-16ms
+ *   bucket 23:  [2^32, 2^33) ns              4-8s
+ *   bucket 30:  [2^39, 2^40) ns              512s-1024s
+ *   bucket 31:  [2^40, inf) ns               >=1024s (overflow)
+ *
+ * Why 32 (and not 16, the original):
+ *
+ *   The original 16 buckets capped at 16ms in the last open-ended
+ *   bucket.  In real production workloads the long tail routinely
+ *   extends well past 16ms -- HDD seek-and-queue, cloud-EBS noisy-
+ *   neighbour spikes, lock-contention waits during table-level
+ *   conflict, vacuum waits, replication apply waits, all commonly
+ *   land in the 50ms-to-multi-second range.  Collapsing all of those
+ *   into a single overflow bucket made the histogram much less useful
+ *   for the diagnostic case it primarily exists to serve: P99 / tail
+ *   analysis is precisely where wait-event timing pays for itself,
+ *   and that signal lives in the long tail.
+ *
+ *   Doubling to 32 buckets pushes the open-ended overflow out to
+ *   ~18 minutes (2^40 ns).  Anything beyond that genuinely belongs in
+ *   EXPLAIN / auto_explain or pg_stat_activity rather than a wait-
+ *   event distribution: a single wait of more than ~18 minutes is a
+ *   query-shape or stuck-process problem, not a histogram-bucket
+ *   problem.  The 32-bucket layout therefore covers the entire
+ *   useful diagnostic range without leaving the long tail in an
+ *   overflow bucket the operator cannot reason about.
+ *
+ *   Cost: 16 extra int8 slots per WaitEventTimingEntry, increasing
+ *   the per-entry size from 152 to 280 bytes (each int8 = 8 bytes).
+ *   At default 192-tranche cap that adds ~24 KB to the per-backend
+ *   lwlock_events array, plus ~32 KB to the per-backend events array
+ *   (~250 distinct events), so ~56 KB more per backend -- about
+ *   double the previous baseline, still bounded.  The hot-path cost
+ *   is unchanged: histogram[bucket]++ is the same single store
+ *   regardless of array length, and the bucket index computation
+ *   (pg_leftmost_one_pos64 - 9) doesn't depend on the array size.
+ *
+ *   ABI note: pg_proc.dat declares pg_stat_get_wait_event_timing's
+ *   histogram return type as _int8 (variable-length int8 array).  The
+ *   array is constructed at SRF emit time via construct_array_builtin
+ *   sized by this constant, so changing the constant changes the
+ *   row-payload length but not the catalog row type.  External
+ *   consumers that addressed buckets by absolute index (e.g.
+ *   "histogram[15] is the overflow bucket") need to be updated;
+ *   consumers that join against pg_wait_event_timing_histogram_buckets
+ *   (the canonical name-and-edge table) continue to work transparently
+ *   because that view is also extended to 32 rows in lockstep.
+ */
+#define WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS	32
+
+/*
+ * Compact per-class mapping for the flat events[] array.
+ *
+ * WAIT_EVENT_TIMING_RAW_CLASSES, WAIT_EVENT_TIMING_DENSE_CLASSES, and
+ * WAIT_EVENT_TIMING_NUM_EVENTS are generated into wait_event_types.h by
+ * generate-wait_event_types.pl from wait_event_names.txt.
+ *
+ * The mapping arrays (wait_event_class_dense, wait_event_class_nevents,
+ * wait_event_class_offset, wait_event_dense_to_classid) come from the
+ * generated wait_event_timing_data.h, which wait_event_timing.c includes;
+ * the internal helper functions live in wait_event_timing.c itself.
+ */
+
+/* Sentinel returned by wait_event_timing_index() for LWLock events */
+#define WAIT_EVENT_TIMING_IDX_LWLOCK	(-2)
+
+/*
+ * Per-event accumulated statistics.  One entry per distinct wait event
+ * per backend.  These are written only by the owning backend, so no
+ * locking is needed.  External readers may see torn reads for 64-bit
+ * fields on 32-bit platforms, but that is acceptable for statistics.
+ */
+typedef struct WaitEventTimingEntry
+{
+	int64		count;			/* number of occurrences */
+	int64		total_ns;		/* total wait duration in nanoseconds */
+	int64		max_ns;			/* longest single wait in nanoseconds */
+	int64		histogram[WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS];
+} WaitEventTimingEntry;
+
+/*
+ * LWLock-specific open-addressing hash table for unbounded tranche IDs.
+ * Per-backend, written only by the owning backend -- no locking needed.
+ * Tranche IDs are dynamically allocated by LWLockNewTrancheId() starting
+ * at LWTRANCHE_FIRST_USER_DEFINED (~88) with no upper bound.  The hash
+ * maps tranche_id -> dense index into lwlock_events[].
+ */
+/*
+ * Hash slot count vs. entry cap.
+ *
+ * The cap on distinct LWLock tranches per backend (and the slot count
+ * of the open-addressing hash that resolves them) is configured at
+ * server start by the GUC wait_event_timing_max_tranches.  Default 192
+ * matches real-world ceilings on deployments without many custom
+ * extensions; raise it for installations that load many extensions
+ * which register their own LWLock tranches.  See guc_parameters.dat.
+ *
+ * The slot count is derived as the next power of two of (2 ×
+ * max_tranches), giving a load factor of at most 50% (typically ~37%
+ * because the next-pow2 jump usually overshoots).  Linear probing gets
+ * expensive fast above 50% load (avg ~8.5 probes on miss at 75%, ~1.8
+ * at 37.5%), and this table sits inside the single-writer hot path in
+ * pgstat_report_wait_end_timing, so probe length matters.  The slot-
+ * table memory cost is small relative to the entry array (4 bytes per
+ * slot vs. 280 bytes per entry).
+ *
+ * Both the slot table (entries[]) and the dense events array
+ * (lwlock_events[]) are sized at allocation time and stored in the
+ * per-backend DSA region following the WaitEventTimingState header
+ * for that backend; see the layout description there.  The
+ * LWLockTimingHash struct below holds only the immutable size metadata
+ * and the runtime num_used counter -- the arrays themselves are not
+ * struct members because their length is runtime-determined.
+ */
+
+/*
+ * Sentinel marking an empty hash slot.  We deliberately reserve the
+ * upper end of the uint16 range (0xFFFF) instead of 0 so that any
+ * legal LWLock tranche ID -- including the currently-unused tranche 0
+ * (lwlocklist.h: "0 is available; was formerly BufFreelistLock") --
+ * can be stored and matched correctly.  Keeping the sentinel decoupled
+ * from the LWLock numbering makes this hash table robust to future
+ * changes in lwlocklist.h.
+ */
+#define LWLOCK_TIMING_EMPTY_SLOT	((uint16) 0xFFFF)
+
+typedef struct LWLockTimingHashEntry
+{
+	uint16		tranche_id;		/* LWLOCK_TIMING_EMPTY_SLOT (0xFFFF)
+								 * marks an unoccupied slot.  Real
+								 * tranche IDs are uint16 and use the
+								 * remaining range. */
+	uint16		dense_idx;		/* index into lwlock_events[] */
+} LWLockTimingHashEntry;
+
+/*
+ * Header-only struct.  The actual hash slot array and dense events
+ * array live in the per-backend DSA region immediately after the
+ * WaitEventTimingState (in that order); their addresses are recovered
+ * via wait_event_timing_lwlock_entries() / _lwlock_events() helpers
+ * defined in wait_event_timing.c.
+ */
+typedef struct LWLockTimingHash
+{
+	int			num_used;		/* count of occupied entries */
+	int			hash_size;		/* size of slot table (power of 2);
+								 * immutable after allocation */
+	int			max_entries;	/* cap on distinct tranches; immutable
+								 * after allocation, == GUC value at
+								 * postmaster start */
+} LWLockTimingHash;
+
+/* Declaration of the GUC (see guc_parameters.dat). */
+extern PGDLLIMPORT int wait_event_timing_max_tranches;
+
+/*
+ * Per-backend wait event timing state.  Allocated in shared memory,
+ * one per MaxBackends + NUM_AUXILIARY_PROCS slot.
+ *
+ * Synchronization: each slot is written exclusively by its owning backend.
+ * Cross-backend readers (pg_stat_get_wait_event_timing) are lock-free and
+ * tolerate torn reads of 64-bit fields on 32-bit platforms (acceptable for
+ * statistics).  Cross-backend reset is request-based: the caller atomically
+ * bumps reset_generation, and the owning backend observes the change on
+ * its next wait_end and performs the reset itself.  This keeps the hot
+ * path lock-free while guaranteeing atomic, race-free resets.
+ *
+ * DSA layout: each backend's slot is laid out as
+ *
+ *     [ WaitEventTimingState header ]
+ *     [ LWLockTimingHashEntry[hash_size] ]
+ *     [ WaitEventTimingEntry[max_entries]      <- lwlock_events[] ]
+ *
+ * where hash_size and max_entries are runtime-derived from the GUC
+ * wait_event_timing_max_tranches and recorded in the
+ * WaitEventTimingState->lwlock_hash header.  Slots are laid out
+ * contiguously in the shared array using a runtime stride
+ * (wait_event_timing_per_backend_stride in wait_event_timing.c) rather
+ * than the C array-indexing operator [], because per-backend size is
+ * determined at server start.
+ */
+typedef struct WaitEventTimingState
+{
+	/*
+	 * Generation counter for cross-backend reset requests.  Incremented
+	 * atomically by pg_stat_reset_wait_event_timing(target).  The owning
+	 * backend tracks a local last-observed value; when it differs from the
+	 * shared value, the owner performs the reset before the next event
+	 * accumulation.  Pure request-response: no locks needed on any path.
+	 */
+	pg_atomic_uint32 reset_generation;
+
+	/* Current wait start timestamp (set by pgstat_report_wait_start) */
+	instr_time	wait_start;
+
+	/* Current wait_event_info (cached for use in wait_end) */
+	uint32		current_event;
+
+	/*
+	 * Counter of resets that have been *observed and acted on* by this
+	 * backend.  Own-backend resets (pg_stat_reset_wait_event_timing(NULL)
+	 * or own-pid) are synchronous and bump this once per call.
+	 * Cross-backend resets COALESCE: if multiple resets are requested
+	 * for this backend between two of its wait_ends, the owner observes
+	 * them as one and bumps reset_count once.  Callers polling for "did
+	 * my async reset land?" should rely on the N -> N+1 transition;
+	 * do not use this column as a request counter.
+	 */
+	int64		reset_count;
+
+	/* Per-event statistics: flat array for bounded classes */
+	WaitEventTimingEntry events[WAIT_EVENT_TIMING_NUM_EVENTS];
+
+	/* LWLock-class hash header; slot and entry arrays follow this struct */
+	LWLockTimingHash lwlock_hash;
+
+	/* Count of LWLock events dropped because the LWLock-timing hash
+	 * table reached its cap (the GUC wait_event_timing_max_tranches). */
+	int64		lwlock_overflow_count;
+
+	/* Count of flat array events dropped due to eventId exceeding slot count */
+	int64		flat_overflow_count;
+} WaitEventTimingState;
+
+
+/*
+ * Per-session wait event trace ring buffer (10046-style).
+ * When wait_event_trace GUC is on for a session, every wait_end writes
+ * a record to a per-backend ring buffer.  External tools read the buffer
+ * via pg_get_wait_event_trace().
+ *
+ * Query attribution is done by scanning the ring at read time: QUERY_START
+ * and QUERY_END markers delimit which wait events belong to which query_id.
+ * This eliminates the previous per-backend shared-memory hash table.
+ *
+ * The ring buffer is allocated lazily via DSA (Dynamic Shared Memory Areas)
+ * on first use.  Only backends that enable wait_event_trace pay the
+ * per-ring memory cost.  A small control struct in fixed shmem holds
+ * per-backend DSA pointers.
+ *
+ * The ring size is configurable via the wait_event_trace_ring_size_kb
+ * GUC (PGC_POSTMASTER, default 4096 KB = 4 MB = 131072 records of 32
+ * bytes each).  The size is fixed cluster-wide at server start, so all
+ * rings in a given postmaster run have the same dimensions; each ring
+ * still caches its mask in the WaitEventTraceState header (next to
+ * write_pos) so the hot-path index computation is a single
+ * cache-warm load.
+ *
+ * The size MUST be a power of two: the writer indexes the ring as
+ * (pos & ring_mask), and ring_mask = ring_size - 1 only equals "low
+ * log2(ring_size) bits" when ring_size is a power of two.  The GUC
+ * check hook enforces this.
+ */
+
+/* Trace record types */
+#define TRACE_WAIT_EVENT	0
+#define TRACE_QUERY_START	1
+#define TRACE_QUERY_END		2
+#define TRACE_EXEC_START	3
+#define TRACE_EXEC_END		4
+
+typedef struct WaitEventTraceRecord
+{
+	/*
+	 * Seqlock for torn-read detection.  Writers set seq to (pos * 2 + 1)
+	 * before filling fields, then to (pos * 2 + 2) after.  Readers check
+	 * seq before and after against the value expected for their position.
+	 *
+	 * uint32 wraps after pos > 2^31 (~2.7 hours at 220K events/sec), but
+	 * the protection only needs to hold for the reader's access window
+	 * (~10-20 ns between seq_before and seq_after reads).  A collision
+	 * requires advancing 2^31 positions in that window -- physically
+	 * impossible by 11 orders of magnitude.
+	 */
+	uint32		seq;
+	uint8		record_type;	/* one of the TRACE_* record types above */
+	uint8		pad[3];
+	int64		timestamp_ns;	/* monotonic clock */
+	union
+	{
+		struct						/* record_type = TRACE_WAIT_EVENT */
+		{
+			uint32	event;			/* wait_event_info */
+			uint32	pad2;
+			int64	duration_ns;
+		}			wait;
+		struct						/* record_type = TRACE_QUERY_START/END */
+		{
+			int64	query_id;
+			int64	pad2;
+		}			query;
+	}			data;
+} WaitEventTraceRecord;			/* 32 bytes */
+
+/*
+ * Compile-time invariants for the trace ring.  These used to live as
+ * prose in the header comment above; the asserts make accidental
+ * violations (e.g. someone adding a field to WaitEventTraceRecord) a
+ * build failure instead of a silently-broken ring.
+ *
+ * The ring size itself is now runtime-configurable via the
+ * wait_event_trace_ring_size_kb GUC; the power-of-two invariant
+ * (required for the mask-indexing pos & ring_mask) is enforced by the
+ * GUC check hook, and the minimum-size invariant by the GUC bounds.
+ */
+StaticAssertDecl(sizeof(WaitEventTraceRecord) == 32,
+				 "WaitEventTraceRecord must be exactly 32 bytes: the "
+				 "seqlock wrap-safety argument relies on single-record, "
+				 "single-cache-line writes, and ARR_DATA_PTR / mask-index "
+				 "math assumes a fixed record stride.");
+
+/*
+ * Per-backend trace ring header followed by the records array.  The
+ * records[] slab is variably sized at allocation time (the postmaster's
+ * value of wait_event_trace_ring_size_kb determines the row count).
+ * write_pos and ring_mask live on the same cache line so the hot path
+ * touches a single line for the index calculation.
+ */
+typedef struct WaitEventTraceState
+{
+	pg_atomic_uint64 write_pos;	/* monotonically increasing; index = pos & ring_mask */
+	uint32		ring_mask;		/* (ring_size - 1); ring_size is a power of two */
+	uint32		ring_size_pad;	/* keep 16-byte alignment for the records[] slab */
+	WaitEventTraceRecord records[FLEXIBLE_ARRAY_MEMBER];
+} WaitEventTraceState;
+/* ~4 MB per backend at the default ring size (allocated lazily via DSA).
+ * When the ring wraps, old records are silently overwritten.  Readers detect
+ * overwritten or in-flight records via the position-encoded seqlock. */
+
+/*
+ * Per-procNumber trace-ring slot state.
+ *
+ * Slot lifecycle is decoupled from backend lifecycle on purpose: when a
+ * backend exits we deliberately do NOT free its ring.  Instead we
+ * transition the slot to ORPHANED and leave the ring allocated in DSA.
+ * That preserves trace data past backend exit so it remains readable by
+ * cross-backend consumers: the in-tree pg_get_wait_event_trace SRF and
+ * any extension that follows the snapshot pattern documented on
+ * WaitEventTraceControl below.  The original per-backend-ring design
+ * lost data the instant a parallel worker (or any short-lived
+ * backend) terminated, because the worker's before_shmem_exit
+ * callback ran dsa_free before any consumer could observe the
+ * final waits.  See "Slot lifecycle and
+ * orphan-memory accounting" on WaitEventTraceControl below for the
+ * rationale and the bounded-memory cost of this choice.
+ *
+ *   FREE      no ring is allocated; ring_ptr is InvalidDsaPointer.
+ *             This is the initial state of every slot at postmaster
+ *             startup, and the state a slot returns to after
+ *             pg_stat_clear_orphaned_wait_event_rings() or after a new
+ *             backend at this procNumber clears the prior orphan.
+ *
+ *   OWNED     ring is allocated and a live backend at this procNumber
+ *             is writing to it.  Single-writer invariant holds: only
+ *             the owner backend writes to records[].  Cross-backend
+ *             consumers may read concurrently using the per-record
+ *             seqlock protocol.
+ *
+ *   ORPHANED  ring is allocated but the previous owner has exited.
+ *             Data is post-mortem and immutable -- no writer will
+ *             touch it again.  The ring stays in DSA until either
+ *             (a) a new backend takes this procNumber and clears it,
+ *             or (b) the DBA calls
+ *                  pg_stat_clear_orphaned_wait_event_rings()
+ *             to release the memory.  Worst-case orphan footprint is
+ *             bounded at NUM_WAIT_EVENT_TIMING_SLOTS times the
+ *             per-backend ring size set by
+ *             wait_event_trace_ring_size_kb (default 4 MB; one
+ *             orphaned ring per procNumber); see WaitEventTraceControl.
+ */
+typedef enum WaitEventTraceSlotState
+{
+	WAIT_EVENT_TRACE_SLOT_FREE = 0, /* no ring allocated */
+	WAIT_EVENT_TRACE_SLOT_OWNED,	/* ring allocated, live owner writing */
+	WAIT_EVENT_TRACE_SLOT_ORPHANED, /* ring allocated, owner has exited */
+}			WaitEventTraceSlotState;
+
+/*
+ * Per-procNumber slot in the trace control struct.
+ *
+ * Synchronization model
+ * ---------------------
+ *
+ * generation is bumped on every owner transition (FREE->OWNED at attach,
+ * OWNED->ORPHANED at backend exit, anything->FREE at orphan cleanup or
+ * release-on-disable).  Cross-backend readers snapshot generation
+ * before and after their critical section; if it changed they discard
+ * the read and retry, matching the BackendStatusArray st_changecount
+ * idiom.  Writers never read generation on the hot path -- it is
+ * touched only on slot transitions, which are rare (once per backend
+ * lifecycle plus admin cleanups).
+ *
+ * state is pg_atomic_uint32 only for cheap unlocked "is this slot
+ * worth visiting" probes (e.g. an iterating reader that walks all
+ * MaxBackends slots and skips FREE ones without taking the lock).
+ * Authoritative
+ * reads of state-and-ring_ptr together MUST be done under
+ * WaitEventTraceCtl->lock in LW_SHARED, paired with the
+ * generation-snapshot retry loop above.  Writers always hold the lock
+ * in LW_EXCLUSIVE for the full transition, so a reader holding
+ * LW_SHARED observes an internally consistent slot.
+ *
+ * ring_ptr is touched only under WaitEventTraceCtl->lock; both writers
+ * (transitions) and readers (resolving the DSA pointer to read records)
+ * take the lock around it.  The lock-hold for readers is bounded to
+ * the dsa_get_address + memcpy of the records of interest -- per-record
+ * processing must happen after the lock is released, both for
+ * latency and to avoid lock-ordering issues with other PG subsystems.
+ *
+ * Size: 8 + 4 + 4(pad) + 8 = 24 bytes per slot.  At MaxBackends + AUX
+ * = ~1100 (e.g. max_connections = 1000), ~26 KB of fixed shared memory
+ * total for the slot array -- negligible compared to the ring memory itself.
+ */
+typedef struct WaitEventTraceSlot
+{
+	pg_atomic_uint64 generation;	/* bumped on every owner transition;
+									 * cross-backend readers snapshot
+									 * before+after their read and retry
+									 * if it changed (BackendStatusArray
+									 * st_changecount idiom) */
+	pg_atomic_uint32 state;			/* WaitEventTraceSlotState */
+	uint32		pad;				/* explicit pad to keep ring_ptr 8-aligned */
+	dsa_pointer ring_ptr;			/* InvalidDsaPointer when state == FREE;
+									 * valid DSA pointer to the
+									 * WaitEventTraceState chunk otherwise */
+} WaitEventTraceSlot;
+
+/*
+ * Control struct for lazy DSA-based trace ring allocation.
+ * Lives in fixed shared memory, one per cluster.
+ *
+ * The per-backend trace ring is a lock-free transport for external consumers.
+ * Writers (owning backend) update write_pos and use a per-record seqlock
+ * for torn-read detection.
+ *
+ * Slot lifecycle and orphan-memory accounting
+ * -------------------------------------------
+ *
+ * The trace_slots[] array is indexed by procNumber.  Each slot's
+ * lifecycle is independent of the backend lifecycle that briefly
+ * occupies it: when a backend exits we transition its slot to
+ * ORPHANED and leave the DSA-allocated ring in place, instead of the
+ * older design that called dsa_free in the backend's
+ * before_shmem_exit callback.  That older design lost trace data the
+ * instant a backend exited, because the data was gone before any
+ * cross-backend reader could observe it.  This was particularly
+ * acute for parallel workers, which exit in milliseconds at
+ * end-of-parallel-query; a reader polling at 1 Hz would never
+ * observe their waits before the data was freed.
+ *
+ * Persisting the ring past backend exit pays a bounded memory cost:
+ * up to NUM_WAIT_EVENT_TIMING_SLOTS orphaned rings can simultaneously
+ * exist, each sized by wait_event_trace_ring_size_kb (default 4 MB).
+ * At the default 4 MB and MaxBackends=100 + auxiliaries that ceiling
+ * is ~400 MB; at MaxBackends=1000 it is ~4 GB.  Operators who need
+ * a tighter memory cap can lower wait_event_trace_ring_size_kb at
+ * server start (minimum 8 KB); operators who need longer retention
+ * before the FIFO wrap can raise it (maximum 32 MB).  The ceiling is only
+ * reached if every procNumber has been used by a tracing backend and
+ * none of those procNumbers has been reused since.  In typical
+ * deployments this does not happen:
+ *
+ *   * Always-on tracing: connection churn keeps slots cycling, so
+ *     orphans drain naturally as new backends claim procNumbers.
+ *   * Brief diagnostic tracing: capture is enabled, a few backends
+ *     trace, then capture is disabled.  Slots gradually clear as
+ *     the procNumbers are reused; or the DBA calls
+ *     pg_stat_clear_orphaned_wait_event_rings() to release them
+ *     immediately.
+ *   * Long-lived pooled connections that never recycle: the worst
+ *     pathological case.  Operators who hit this should call the
+ *     orphan-clear function after diagnostic sessions.
+ *
+ * Compared to the alternatives, accepting the bounded orphan-memory
+ * cost wins on every other axis we care about: hot-path overhead is
+ * unchanged (single writer, lock-free), correctness is universal
+ * (parallel workers, autovacuum, walsender, all transient backends
+ * preserve their data), DSA's lazy-allocation property is preserved
+ * (capture=off pays zero memory), and the cross-backend reader
+ * pattern below is what pg_get_wait_event_trace uses; extensions
+ * implementing similar tools follow the same pattern with no further
+ * plumbing.  See review_5.md issue #26 for the design discussion.
+ *
+ * External reader pattern (cross-backend consumers)
+ * -------------------------------------------------
+ *
+ * External readers (extensions, background workers reading another
+ * backend's ring) MUST follow this protocol; the in-tree SRF
+ * pg_get_wait_event_trace() is the reference implementation.
+ *
+ * 1. Read trace_slots[procNumber].state without the lock as a cheap
+ *    "worth visiting" check.  If FREE, there is no ring -- nothing
+ *    to do.  Otherwise proceed to step 2.
+ *
+ * 2. Acquire WaitEventTraceCtl->lock in LW_SHARED.  All slot
+ *    transitions (FREE <-> OWNED <-> ORPHANED, including
+ *    dsa_allocate / dsa_free of the ring) take LW_EXCLUSIVE, so the
+ *    SHARED hold makes the slot's state, ring_ptr, and ring memory
+ *    stable for the entire iteration that follows.  This is what
+ *    makes the per-slot generation counter optional for callers
+ *    that, like this in-tree reader, keep the lock held across the
+ *    iteration; callers that release and re-acquire the lock
+ *    between batches must use the generation idiom from step 7
+ *    instead.
+ *
+ * 3. Re-check state under the lock.  If FREE, the slot was
+ *    reassigned between step 1 and the lock acquire; release the
+ *    lock and return.
+ *
+ * 4. Resolve trace_slots[procNumber].ring_ptr via dsa_get_address
+ *    and read write_pos = pg_atomic_read_u64(&ts->write_pos).  No
+ *    barrier is required here: the position-encoded identity
+ *    seqlock check in step 5 rejects any stale-cycle visibility
+ *    (writer's write_pos store seen by reader before the rec->seq
+ *    store) by comparing rec->seq against the expected value for
+ *    iterator position i, which the previous cycle's seq cannot
+ *    equal.  An ordering mismatch on weak-memory architectures
+ *    simply causes the reader to skip the in-flight slot until the
+ *    next call.
+ *
+ * 5. Iterate ring indices [read_start, write_pos), masking each
+ *    through the ring (i & ts->ring_mask, where ring_mask is the
+ *    per-ring mask cached next to write_pos in the ring header).
+ *    For EACH record do the per-record seqlock protocol AGAINST
+ *    SHARED MEMORY, using a POSITION-ENCODED IDENTITY check
+ *    (not just parity):
+ *
+ *        expected_seq = (uint32)(i * 2 + 2);  / writer's complete-even
+ *                                               value for ring position i /
+ *        seq_before = rec_shared->seq;
+ *        pg_read_barrier();
+ *        if (seq_before != expected_seq) continue;
+ *        local_copy = *rec_shared;            / 32-byte struct copy /
+ *        pg_read_barrier();
+ *        seq_after = rec_shared->seq;
+ *        if (seq_after != expected_seq) continue;
+ *
+ *    Append valid records to a local result buffer for emission
+ *    after the lock is released.
+ *
+ *    The writer encodes the ring position into seq: mid-write is
+ *    (pos * 2 + 1), complete is (pos * 2 + 2).  Identity against
+ *    (i * 2 + 2) rejects four distinct failure modes:
+ *
+ *      - Stale previous cycle (seq < expected): writer just
+ *        advanced write_pos to i+1 but the seq store for cycle i
+ *        has not propagated to this CPU's view yet, so we see the
+ *        even seq value from (i - RING_SIZE) -- the slot's
+ *        previous occupant.  Parity-only seqlock would accept
+ *        this and emit a record belonging to the previous cycle
+ *        with the new ring_index, a silent data-attribution bug.
+ *      - Mid-write (seq == expected - 1, odd): writer is in the
+ *        payload-write window between seq=odd and seq=even.
+ *      - Ring wrapped past us (seq > expected): a later cycle on
+ *        this slot completed during our read.
+ *      - Torn write completed mid-read (seq_after differs from
+ *        seq_before): the writer crossed a full cycle while we
+ *        copied the record.
+ *
+ *    Do NOT memcpy the full records[] array up front and then do
+ *    the seqlock check against the local copy: both seq reads
+ *    would hit the same frozen byte in local memory, the check
+ *    degenerates to a no-op, and torn / stale-cycle reads slip
+ *    through.  The seqlock protocol requires the two seq reads to
+ *    go to shared memory at distinct times around the payload
+ *    read, and they must be compared against the expected
+ *    position-encoded value.
+ *
+ * 6. Release the lock.  Per-record post-processing (event-name
+ *    lookups, tuplestore population, network I/O) happens off the
+ *    lock so spills to disk or slow consumers do not extend
+ *    lock-hold.  Lock-hold time is O(records_in_range) loads from
+ *    shared memory; for the full ring this is ~1 ms on modern
+ *    hardware -- on par with a single 4 MB memcpy and acceptable
+ *    given the lock is contended only by other transitions
+ *    (themselves rare) and other readers (which share with us).
+ *
+ * 7. Optional: snapshot trace_slots[procNumber].generation BEFORE
+ *    step 2 and AFTER step 6; if it changed, the slot was
+ *    reassigned across some lock-release boundary.  This in-tree
+ *    reader does not need the snapshot because it holds the lock
+ *    throughout, but readers that batch their work across multiple
+ *    lock-acquire windows (e.g. an extension that polls many slots
+ *    in sequence without holding any single lock too long) should
+ *    use the generation idiom to detect slot reassignment between
+ *    batches.  The generation counter is bumped under LW_EXCLUSIVE
+ *    on every transition (FREE -> OWNED at attach, OWNED ->
+ *    ORPHANED at backend exit, anything -> FREE at release/clear).
+ *
+ * Same-backend readers (the in-tree pg_get_backend_wait_event_trace
+ * SRF) do NOT use the LWLock above -- same-backend serialization is
+ * implicit because a backend can only run one command at a time,
+ * and the SRF coordinates with wait_event_trace_release_slot via
+ * per-backend flags.  That mechanism is private to
+ * wait_event_timing.c; external code should use the cross-backend
+ * protocol described above.
+ */
+typedef struct WaitEventTraceControl
+{
+	dsa_handle	trace_dsa_handle;	/* DSA_HANDLE_INVALID until first use */
+	LWLock		lock;				/* protects DSA creation and slot
+									 * transitions (FREE<->OWNED<->
+									 * ORPHANED including ring_ptr
+									 * dsa_allocate / dsa_free) */
+	WaitEventTraceSlot trace_slots[FLEXIBLE_ARRAY_MEMBER]; /* per procNumber */
+} WaitEventTraceControl;
+
+
+/*
+ * Capture levels for the wait_event_capture GUC.  Order is significant:
+ * higher values are strict supersets of lower ones, so code paths may
+ * test "level >= WAIT_EVENT_CAPTURE_STATS" to mean "at least STATS".
+ *
+ *   OFF   - No instrumentation, no hot-path cost.
+ *   STATS - Aggregated per-event statistics in pg_stat_wait_event_timing
+ *           (counts, durations, histograms).  Hot path samples wall time
+ *           around every wait.
+ *   TRACE - Everything in STATS plus a per-session ring buffer of
+ *           individual events and query markers, exposed via
+ *           pg_backend_wait_event_trace.  Adds ~4 MB DSA per session.
+ */
+typedef enum WaitEventCaptureLevel
+{
+	WAIT_EVENT_CAPTURE_OFF = 0,
+	WAIT_EVENT_CAPTURE_STATS,
+	WAIT_EVENT_CAPTURE_TRACE,
+}			WaitEventCaptureLevel;
+
+/*
+ * The hot path uses (capture_level != OFF) as the "any capture
+ * mode" gate and (capture_level == TRACE) for the trace-specific
+ * gate.  Either form is order-independent, but the values are
+ * also constrained to a strict OFF < STATS < TRACE order so that
+ * future code paths needing "at least STATS" can compare with
+ * >= safely.  Pin the invariant explicitly to catch enum
+ * reordering at compile time rather than via mysterious runtime
+ * mode switches.
+ */
+StaticAssertDecl(WAIT_EVENT_CAPTURE_OFF == 0 &&
+				 WAIT_EVENT_CAPTURE_STATS == 1 &&
+				 WAIT_EVENT_CAPTURE_TRACE == 2,
+				 "WaitEventCaptureLevel values must be 0=OFF < 1=STATS < 2=TRACE");
+
+/* GUC variables */
+extern PGDLLIMPORT int wait_event_capture;
+extern PGDLLIMPORT int wait_event_trace_ring_size_kb;
+
+/*
+ * Records-per-ring value derived from wait_event_trace_ring_size_kb at
+ * server start.  Cached file-scope so the allocator and any caller
+ * that wants the total record count (rather than the mask) does not
+ * have to redo the divide.  Set once by ProcessConfigFile()'s startup
+ * sync of POSTMASTER-context GUCs; never updated thereafter.
+ */
+extern PGDLLIMPORT uint32 WaitEventTraceRingSize;
+
+/* Pointer to this backend's timing state in shared memory */
+extern PGDLLIMPORT WaitEventTimingState *my_wait_event_timing;
+
+/*
+ * Per-backend gate raised by the before_shmem_exit callback when
+ * proc_exit begins tearing down DSA mappings.  The inline wait-event
+ * hot path checks this and skips ALL wait-event-timing work
+ * (including the lazy re-attach branch) once the gate is up, to
+ * avoid SIGSEGV on dangling pointers after dsm_backend_shutdown.
+ */
+extern PGDLLIMPORT bool wait_event_timing_writes_disabled;
+
+/* This backend's procNumber for the trace ring, or -1 if not set */
+extern PGDLLIMPORT int my_trace_proc_number;
+
+/*
+ * Shared memory setup -- registered via the shmem subsystem registry
+ * (src/include/storage/subsystemlist.h).  Stub builds expose a no-op
+ * callbacks struct so subsystemlist.h references resolve either way.
+ */
+extern PGDLLIMPORT const ShmemCallbacks WaitEventTimingShmemCallbacks;
+extern PGDLLIMPORT const ShmemCallbacks WaitEventTraceControlShmemCallbacks;
+
+/* Called from InitProcess() to point my_wait_event_timing at our slot */
+extern void pgstat_set_wait_event_timing_storage(int procNumber);
+extern void pgstat_reset_wait_event_timing_storage(void);
+
+/* Lazy DSA-based trace ring buffer allocation */
+extern void wait_event_trace_attach(int procNumber);
+
+/* GUC hooks declared in guc_hooks.h */
+
+/* Trace marker functions (defined in wait_event_timing.c) */
+extern void wait_event_trace_query_start(int64 query_id);
+extern void wait_event_trace_query_end(int64 query_id);
+extern void wait_event_trace_exec_start(int64 query_id);
+extern void wait_event_trace_exec_end(int64 query_id);
+
+#endif							/* WAIT_EVENT_TIMING_H */
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 4bca42bb3706a..9ae79c3ce6746 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -39,6 +39,7 @@ subdir('test_json_parser')
 subdir('test_lfind')
 subdir('test_lwlock_tranches')
 subdir('test_misc')
+subdir('test_wait_event_stress')
 subdir('test_oat_hooks')
 subdir('test_parser')
 subdir('test_pg_dump')
diff --git a/src/test/modules/test_wait_event_stress/Makefile b/src/test/modules/test_wait_event_stress/Makefile
new file mode 100644
index 0000000000000..69d10db51831b
--- /dev/null
+++ b/src/test/modules/test_wait_event_stress/Makefile
@@ -0,0 +1,19 @@
+MODULES = test_wait_event_stress
+PGFILEDESC = "test_wait_event_stress - wait event timing overhead measurement"
+
+EXTENSION = test_wait_event_stress
+DATA = test_wait_event_stress--1.0.sql
+
+REGRESS = test_wait_event_stress
+TAP_TESTS = 1
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_wait_event_stress
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_wait_event_stress/expected/test_wait_event_stress.out b/src/test/modules/test_wait_event_stress/expected/test_wait_event_stress.out
new file mode 100644
index 0000000000000..cba09b3c594f4
--- /dev/null
+++ b/src/test/modules/test_wait_event_stress/expected/test_wait_event_stress.out
@@ -0,0 +1,121 @@
+CREATE EXTENSION test_wait_event_stress;
+-- Start from a clean slate so this test is idempotent against any state
+-- left behind by earlier queries in the same session.
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ pg_stat_reset_wait_event_timing 
+---------------------------------
+ 
+(1 row)
+
+-- Basic stress test: verify function works (requires capture to be on so
+-- the instrumentation path actually executes work we can time).
+SET wait_event_capture = stats;
+SELECT stress_wait_events(10000) > 0 AS stress_ok;
+ stress_ok 
+-----------
+ t
+(1 row)
+
+RESET wait_event_capture;
+-- Deterministic exact-count coverage.  Core regression's wait_event_timing
+-- test uses pg_sleep(), which can emit a non-deterministic number of
+-- PgSleep wait events under CPU contention, so it cannot assert exact
+-- counts.  stress_wait_events(N) calls pgstat_report_wait_start/end in a
+-- tight loop exactly N times, giving us strictly deterministic input for
+-- the ring + aggregated-stats pipeline.  This catches symmetric
+-- duplication bugs that the count-agnostic core assertions would miss.
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ pg_stat_reset_wait_event_timing 
+---------------------------------
+ 
+(1 row)
+
+SET wait_event_capture = trace;
+-- stress_wait_events returns elapsed microseconds; on fast TSC-based
+-- timers 5 iterations can round to 0 us, so check IS NOT NULL (the
+-- function succeeded) rather than > 0.
+SELECT stress_wait_events(5) IS NOT NULL AS deterministic_input_ok;
+ deterministic_input_ok 
+------------------------
+ t
+(1 row)
+
+SELECT count(*) = 5 AS ring_has_exactly_five
+FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep';
+ ring_has_exactly_five 
+-----------------------
+ t
+(1 row)
+
+SELECT calls = 5 AS aggregated_has_exactly_five
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+ aggregated_has_exactly_five 
+-----------------------------
+ t
+(1 row)
+
+RESET wait_event_capture;
+-- LWLock hash overflow test: register 200 tranches (> 192 limit)
+-- This should trigger a WARNING about hash table being full
+SET wait_event_capture = stats;
+-- Start from a clean slate so we can make deterministic assertions
+-- about the overflow counter.
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ pg_stat_reset_wait_event_timing 
+---------------------------------
+ 
+(1 row)
+
+SELECT lwlock_overflow_count AS before_overflow
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+ before_overflow 
+-----------------
+               0
+(1 row)
+
+SET client_min_messages = warning;
+SELECT test_lwlock_hash_overflow(200);
+WARNING:  wait_event_timing: LWLock hash table full, timing data for some LWLock tranches will be lost
+HINT:  This backend uses more than 192 distinct LWLock tranches; raise wait_event_timing_max_tranches.
+ test_lwlock_hash_overflow 
+---------------------------
+                       200
+(1 row)
+
+RESET client_min_messages;
+-- After overflow the counter must be visible from SQL.
+SELECT lwlock_overflow_count > 0 AS overflow_visible
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+ overflow_visible 
+------------------
+ t
+(1 row)
+
+-- Reset clears the overflow counter (pins the fix for issue #9).
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ pg_stat_reset_wait_event_timing 
+---------------------------------
+ 
+(1 row)
+
+SELECT lwlock_overflow_count = 0 AS lw_cleared,
+       flat_overflow_count = 0 AS flat_cleared
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+ lw_cleared | flat_cleared 
+------------+--------------
+ t          | t
+(1 row)
+
+-- Verify the function returns the count
+SELECT test_lwlock_hash_overflow(10);
+ test_lwlock_hash_overflow 
+---------------------------
+                        10
+(1 row)
+
+RESET wait_event_capture;
+DROP EXTENSION test_wait_event_stress;
diff --git a/src/test/modules/test_wait_event_stress/meson.build b/src/test/modules/test_wait_event_stress/meson.build
new file mode 100644
index 0000000000000..ef00737017497
--- /dev/null
+++ b/src/test/modules/test_wait_event_stress/meson.build
@@ -0,0 +1,38 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+test_wait_event_stress_sources = files(
+  'test_wait_event_stress.c',
+)
+
+if host_system == 'windows'
+  test_wait_event_stress_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'test_wait_event_stress',
+    '--FILEDESC', 'test_wait_event_stress - wait event timing overhead measurement',])
+endif
+
+test_wait_event_stress = shared_module('test_wait_event_stress',
+  test_wait_event_stress_sources,
+  kwargs: pg_test_mod_args,
+)
+test_install_libs += test_wait_event_stress
+
+test_install_data += files(
+  'test_wait_event_stress.control',
+  'test_wait_event_stress--1.0.sql',
+)
+
+tests += {
+  'name': 'test_wait_event_stress',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'regress': {
+    'sql': [
+      'test_wait_event_stress',
+    ],
+  },
+  'tap': {
+    'tests': [
+      't/001_orphan_roundtrip.pl',
+    ],
+  },
+}
diff --git a/src/test/modules/test_wait_event_stress/sql/test_wait_event_stress.sql b/src/test/modules/test_wait_event_stress/sql/test_wait_event_stress.sql
new file mode 100644
index 0000000000000..4579d00eb2897
--- /dev/null
+++ b/src/test/modules/test_wait_event_stress/sql/test_wait_event_stress.sql
@@ -0,0 +1,66 @@
+CREATE EXTENSION test_wait_event_stress;
+
+-- Start from a clean slate so this test is idempotent against any state
+-- left behind by earlier queries in the same session.
+SELECT pg_stat_reset_wait_event_timing(NULL);
+
+-- Basic stress test: verify function works (requires capture to be on so
+-- the instrumentation path actually executes work we can time).
+SET wait_event_capture = stats;
+SELECT stress_wait_events(10000) > 0 AS stress_ok;
+RESET wait_event_capture;
+
+-- Deterministic exact-count coverage.  Core regression's wait_event_timing
+-- test uses pg_sleep(), which can emit a non-deterministic number of
+-- PgSleep wait events under CPU contention, so it cannot assert exact
+-- counts.  stress_wait_events(N) calls pgstat_report_wait_start/end in a
+-- tight loop exactly N times, giving us strictly deterministic input for
+-- the ring + aggregated-stats pipeline.  This catches symmetric
+-- duplication bugs that the count-agnostic core assertions would miss.
+SELECT pg_stat_reset_wait_event_timing(NULL);
+SET wait_event_capture = trace;
+-- stress_wait_events returns elapsed microseconds; on fast TSC-based
+-- timers 5 iterations can round to 0 us, so check IS NOT NULL (the
+-- function succeeded) rather than > 0.
+SELECT stress_wait_events(5) IS NOT NULL AS deterministic_input_ok;
+
+SELECT count(*) = 5 AS ring_has_exactly_five
+FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep';
+
+SELECT calls = 5 AS aggregated_has_exactly_five
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+RESET wait_event_capture;
+
+-- LWLock hash overflow test: register 200 tranches (> 192 limit)
+-- This should trigger a WARNING about hash table being full
+SET wait_event_capture = stats;
+
+-- Start from a clean slate so we can make deterministic assertions
+-- about the overflow counter.
+SELECT pg_stat_reset_wait_event_timing(NULL);
+SELECT lwlock_overflow_count AS before_overflow
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+
+SET client_min_messages = warning;
+SELECT test_lwlock_hash_overflow(200);
+RESET client_min_messages;
+
+-- After overflow the counter must be visible from SQL.
+SELECT lwlock_overflow_count > 0 AS overflow_visible
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+
+-- Reset clears the overflow counter (pins the fix for issue #9).
+SELECT pg_stat_reset_wait_event_timing(NULL);
+SELECT lwlock_overflow_count = 0 AS lw_cleared,
+       flat_overflow_count = 0 AS flat_cleared
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+
+-- Verify the function returns the count
+SELECT test_lwlock_hash_overflow(10);
+
+RESET wait_event_capture;
+DROP EXTENSION test_wait_event_stress;
diff --git a/src/test/modules/test_wait_event_stress/t/001_orphan_roundtrip.pl b/src/test/modules/test_wait_event_stress/t/001_orphan_roundtrip.pl
new file mode 100644
index 0000000000000..d5bcde4302325
--- /dev/null
+++ b/src/test/modules/test_wait_event_stress/t/001_orphan_roundtrip.pl
@@ -0,0 +1,519 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+#
+# End-to-end test for wait-event trace orphan persistence and the
+# in-tree cross-backend reader pg_get_wait_event_trace().
+#
+# Scenarios (the first three are summarized here; later ones inline):
+#
+#   1. Plain backend orphan roundtrip
+#      A writer session enables wait_event_capture = trace, emits a
+#      handful of waits, captures its own procnumber, and disconnects.
+#      A separate long-lived reader session then asserts that
+#      pg_get_wait_event_trace(<writer_procnumber>) returns the
+#      writer's recorded events post-mortem, that
+#      pg_stat_clear_orphaned_wait_event_rings() releases the orphan,
+#      and that a subsequent read returns empty.
+#
+#   2. Parallel-worker orphan roundtrip (the patch's stated motivation)
+#      A query is forced through parallel workers via
+#      debug_parallel_query=on plus zero parallel costs; the workers
+#      exit at end-of-parallel-query in milliseconds.  The test
+#      then asserts pg_stat_clear_orphaned_wait_event_rings()
+#      returns at least 2 -- the leader and at least one worker --
+#      confirming that short-lived parallel workers do leave
+#      readable orphans, the case the orphan-persistence lifecycle
+#      was designed for.
+#
+#   3. OWNED-slot read with a concurrent live writer
+#      A long-lived writer session emits a steady stream of PgSleep
+#      wait events while a separate reader calls
+#      pg_get_wait_event_trace(writer_procnumber) repeatedly.  All
+#      rows must be well-formed (no NULL/empty event_type or
+#      event_name, no negative durations) -- this exercises the
+#      per-record seqlock protocol that protects against torn
+#      reads of records mid-write.  Without the seqlock the reader
+#      would emit malformed records during contention windows.
+#
+# Race-hardening: the reader session is held open for the entire
+# run so its procnumber slot cannot be a recycle target for any
+# writer or parallel worker when they exit, and the test asserts
+# no unrelated client backend is present at the orphan-read
+# moment.  Skipped on builds without --enable-wait-event-timing.
+
+use strict;
+use warnings FATAL => 'all';
+
+use Time::HiRes qw(usleep);
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node = PostgreSQL::Test::Cluster->new('wet_orphan_roundtrip');
+$node->init;
+# A high max_connections gives plenty of unused procnumber slots so a
+# new backend started during the test window is unlikely to recycle
+# the just-exited writer's slot.  Combined with the long-lived reader
+# session below (which pins its own slot for the full run), this
+# closes the race window to negligible width on a quiet test node.
+$node->append_conf('postgresql.conf', q{
+max_connections = 100
+});
+$node->start;
+
+# Skip when wait-event-timing isn't compiled in.  GUC check hook
+# rejects 'trace' on stub builds; detect via a probe SET.
+my ($rc, $stdout, $stderr) = $node->psql(
+	'postgres',
+	"SET wait_event_capture = trace;");
+if ($stderr =~ /not supported by this build/)
+{
+	plan skip_all =>
+	  'wait_event_timing not compiled in (--enable-wait-event-timing)';
+}
+
+# Long-lived reader session.  Stays connected for the entire test so
+# its procnumber slot is in OWNED state and therefore not eligible as
+# a recycle target for writer/parallel-worker slots when they exit.
+my $reader = $node->background_psql('postgres');
+
+# ------------------------------------------------------------------
+# Scenario 1: plain backend orphan roundtrip
+# ------------------------------------------------------------------
+
+# Spawn the writer as a one-shot psql.  It enables trace, emits a
+# handful of waits inside a DO block (PERFORM avoids empty-row
+# pollution of the captured output), then SELECTs its own procnumber.
+# Because psql one-shot commands spawn a fresh backend that exits
+# when the SQL completes, the writer's slot transitions to ORPHANED
+# on exit.
+my $writer_proc = $node->safe_psql(
+	'postgres', q{
+		SET wait_event_capture = trace;
+		DO $$
+		BEGIN
+		  PERFORM pg_sleep(0.02);
+		  PERFORM pg_sleep(0.02);
+		  PERFORM pg_sleep(0.02);
+		END
+		$$;
+		SELECT procnumber
+		FROM pg_stat_get_wait_event_timing(pg_backend_pid())
+		WHERE pid = pg_backend_pid()
+		LIMIT 1;
+	});
+chomp $writer_proc;
+like($writer_proc, qr/^\d+$/, 'writer reported its procnumber');
+
+# Wait for the writer backend to fully exit.  pg_stat_activity loses
+# the row before the before_shmem_exit callback finishes; we then
+# additionally assert that no other client backend has inherited the
+# writer's procnumber, which would clear the orphan via
+# wait_event_trace_clear_orphan_at_init.
+my $other_clients_query =
+  "SELECT count(*) FROM pg_stat_activity "
+  . "WHERE backend_type = 'client backend' AND pid <> pg_backend_pid();";
+
+my $writer_gone = 0;
+for (my $i = 0; $i < 100; $i++)
+{
+	my $count = $reader->query_safe($other_clients_query);
+	chomp $count;
+	if ($count eq '0') { $writer_gone = 1; last; }
+	usleep(20_000);
+}
+ok($writer_gone, 'writer backend has exited (slot should be ORPHANED)');
+
+# Race-harden: confirm no other client backend exists that could have
+# taken over the writer's procnumber between its exit and our read.
+# A takeover would clear the orphan; if some test artifact caused one
+# we want to fail loudly rather than spuriously report "no orphan".
+my $recycler_count = $reader->query_safe(
+	"SELECT count(*) FROM pg_stat_activity "
+	. "WHERE backend_type = 'client backend' "
+	. "  AND pid <> pg_backend_pid();");
+chomp $recycler_count;
+is($recycler_count, '0',
+	'no other client backend present at orphan-read time (slot not recycled)');
+
+# Read the orphaned ring via the cross-backend reader.  At least one
+# record is expected (we emitted three pg_sleep waits).
+my $orphan_rows = $reader->query_safe(
+	"SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+chomp $orphan_rows;
+cmp_ok($orphan_rows, '>=', 1,
+	"pg_get_wait_event_trace($writer_proc) reads ORPHANED ring (rows: $orphan_rows)");
+
+# Admin sweep: clear the orphan.  Should report >= 1 since we know one
+# ORPHANED slot exists.
+my $cleared = $reader->query_safe(
+	"SELECT pg_stat_clear_orphaned_wait_event_rings();");
+chomp $cleared;
+cmp_ok($cleared, '>=', 1,
+	"pg_stat_clear_orphaned_wait_event_rings released $cleared ring(s)");
+
+# After the sweep, the orphan is gone.  Reading the same procnumber
+# returns empty.
+my $after_clear = $reader->query_safe(
+	"SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+chomp $after_clear;
+is($after_clear, '0',
+	"pg_get_wait_event_trace($writer_proc) returns empty after sweep");
+
+# ------------------------------------------------------------------
+# Scenario 2: parallel-worker orphan roundtrip
+# ------------------------------------------------------------------
+#
+# Force parallel workers to participate in a sorted scan of a
+# 1M-row table, wait for the leader and workers to exit, then assert
+# via the admin sweep that the parallel query left at least two
+# orphaned rings (workers exit in milliseconds at end-of-query).
+
+# Create the table used to force parallelism.  Done from the reader
+# session so it survives across the writer's lifetime.  Suppress
+# NOTICE so query_safe doesn't treat the "does not exist, skipping"
+# message as a failure (BackgroundPsql::query_safe treats any stderr
+# output as a query failure).  The table is sized large enough
+# (1M rows) and the query is structured (ORDER BY + count(*) under
+# a Gather) so workers reliably emit wait events (tuple queue
+# operations, latch waits) before they exit at end-of-parallel-
+# query.  A smaller table with a plain count(*) can be processed
+# entirely from cache without any wait points, leaving worker
+# trace rings never lazily allocated, never transitioning to
+# OWNED, and never producing ORPHANED slots -- the test would
+# pass without exercising the parallel-worker case.
+$reader->query_safe("SET client_min_messages = warning;");
+$reader->query_safe(q{
+	DROP TABLE IF EXISTS wet_parallel_target;
+	CREATE TABLE wet_parallel_target AS
+	  SELECT i FROM generate_series(1, 1000000) i;
+});
+
+# Spawn a writer session that enables trace and runs a forced-
+# parallel query.  Workers run, then exit at end-of-parallel-query;
+# the leader (this safe_psql backend) then exits when safe_psql
+# returns.  After return, both leader and workers are gone -- each
+# leaves an ORPHANED slot whose ring should be readable.
+$node->safe_psql(
+	'postgres', q{
+		SET wait_event_capture = trace;
+		SET min_parallel_table_scan_size = 0;
+		SET parallel_setup_cost = 0;
+		SET parallel_tuple_cost = 0;
+		SET max_parallel_workers_per_gather = 2;
+		SET debug_parallel_query = on;
+		-- ORDER BY forces a parallel sort and a Gather Merge,
+		-- which routes tuples through shm_mq queues -- workers
+		-- reliably emit MessageQueueSend / MessageQueueReceive
+		-- wait events here, guaranteeing lazy trace-ring
+		-- allocation and OWNED->ORPHANED transition on exit.
+		SELECT count(*) FROM (
+		  SELECT i FROM wet_parallel_target ORDER BY i
+		) s;
+	});
+
+# Wait for all client/worker backends to fully exit.  At
+# safe_psql return the leader has exited, but worker
+# before_shmem_exit callbacks may still be running --
+# pg_stat_clear_orphaned_wait_event_rings counts only
+# slots that have completed their OWNED -> ORPHANED
+# transition, so racing the callbacks under-counts.
+my $exit_drained = 0;
+for (my $i = 0; $i < 200; $i++)
+{
+	my $count = $reader->query_safe(
+		"SELECT count(*) FROM pg_stat_activity "
+		. "WHERE backend_type IN ('client backend', 'parallel worker') "
+		. "  AND pid <> pg_backend_pid();");
+	chomp $count;
+	if ($count eq '0') { $exit_drained = 1; last; }
+	usleep(20_000);
+}
+ok($exit_drained,
+	'all parallel-query backends have exited before counting orphans');
+
+# Count parallel-produced orphans via the admin sweep, which
+# returns the number of rings released.  After a forced-parallel
+# query with the leader and workers all exited, we expect at
+# least 2 orphans (leader + at least one worker).
+#
+# Using the sweep is cheaper than iterating every procnumber and
+# calling pg_get_wait_event_trace on each: it's a single lock
+# acquisition and tells us the count directly.  The read path
+# itself is already covered by scenario 1 above; here we only
+# need to confirm that parallel-worker exits do produce orphans.
+my $parallel_orphans = $reader->query_safe(
+	"SELECT pg_stat_clear_orphaned_wait_event_rings();");
+chomp $parallel_orphans;
+cmp_ok($parallel_orphans, '>=', 2,
+	"parallel-query exit produced >= 2 orphans (leader + worker(s)): $parallel_orphans");
+
+# ------------------------------------------------------------------
+# Scenario 3: OWNED-slot read with a concurrent live writer
+# ------------------------------------------------------------------
+#
+# Exercises the per-record seqlock protocol against an actively
+# writing backend.  OWNED is the case where the seqlock check is
+# load-bearing: the writer is concurrently appending records to
+# the ring while the reader iterates.  A torn read (writer
+# mid-record at the moment of the reader's payload copy) must be
+# detected and the record skipped; well-formed records must be
+# emitted intact, never with a malformed event_type or event_name
+# that would otherwise crash pgstat_get_wait_event_type() or
+# materialise NULL strings into the result.
+#
+# Setup: a long-lived BackgroundPsql writer that has
+# wait_event_capture = trace and runs a tight pg_sleep loop
+# producing a steady stream of PgSleep wait events.  While the
+# writer is emitting, the reader calls
+# pg_get_wait_event_trace(writer_procnumber) repeatedly and
+# asserts every observed row has well-formed event_type,
+# event_name, and a non-negative duration.  Any torn record that
+# slipped through the seqlock surfaces here as a NULL or empty
+# string (or worse, a crash inside the SRF).
+
+my $writer_bg = $node->background_psql('postgres');
+$writer_bg->query_safe("SET client_min_messages = warning;");
+$writer_bg->query_safe("SET wait_event_capture = trace;");
+# Generate at least one wait so the ring is allocated and the
+# procnumber appears in pg_stat_get_wait_event_timing.
+$writer_bg->query_safe("SELECT pg_sleep(0.01);");
+my $writer_bg_proc = $writer_bg->query_safe(
+	"SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) "
+	. "WHERE pid = pg_backend_pid() LIMIT 1;");
+chomp $writer_bg_proc;
+like($writer_bg_proc, qr/^\d+$/,
+	'live writer reported its procnumber');
+
+# Start a burst of wait events asynchronously.  query_until
+# returns as soon as it sees the \echo banner, leaving the DO
+# block executing pg_sleep(0.001) in a tight 1000-iteration loop
+# (~1 s wall, ~1000 PgSleep wait events) in the background.
+$writer_bg->query_until(
+	qr/burst_started/,
+	"\\echo burst_started\n"
+	  . "DO \$\$ BEGIN FOR i IN 1..1000 LOOP PERFORM pg_sleep(0.001); END LOOP; END \$\$;\n");
+
+# Read concurrently from the reader session.  Each read iterates
+# the writer's ring under LW_SHARED; the writer is freely
+# appending records.  Any torn row surfaces as NULL/empty
+# event_type, event_name, or negative duration.
+my $live_read_attempts = 10;
+my $live_reads_ok = 1;
+my $live_total_observed = 0;
+for (my $r = 0; $r < $live_read_attempts; $r++)
+{
+	my $bad = $reader->query_safe(
+		"SELECT count(*) FROM pg_get_wait_event_trace($writer_bg_proc) t "
+		. "WHERE t.wait_event_type IS NULL "
+		. "   OR t.wait_event_type = '' "
+		. "   OR t.wait_event IS NULL "
+		. "   OR t.wait_event = '' "
+		. "   OR t.duration_us < 0;");
+	chomp $bad;
+	if ($bad ne '0')
+	{
+		$live_reads_ok = 0;
+		diag("read $r against live writer returned $bad malformed row(s)");
+		last;
+	}
+
+	my $total = $reader->query_safe(
+		"SELECT count(*) FROM pg_get_wait_event_trace($writer_bg_proc);");
+	chomp $total;
+	$live_total_observed += $total;
+
+	usleep(50_000);
+}
+ok($live_reads_ok,
+	'OWNED-slot reads against live writer produced only well-formed rows');
+cmp_ok($live_total_observed, '>', 0,
+	"OWNED-slot reads observed records across $live_read_attempts reads (total: $live_total_observed)");
+
+# Wait for the writer's DO block to finish; this query_safe
+# blocks until psql is ready to receive new input.
+$writer_bg->query_safe("SELECT 1;");
+$writer_bg->quit;
+
+# ------------------------------------------------------------------
+# Scenario 4: wait_event_trace_clear_orphan_at_init reclaims an
+# orphan when a new backend inherits the same procNumber slot
+# ------------------------------------------------------------------
+#
+# This scenario covers the clear_orphan_at_init path (the lazy
+# lifecycle reclaim that runs at every backend's InitProcess and
+# frees a prior orphan whose procNumber the new backend has
+# inherited).  Scenarios 1 and 2 above exercise the admin-driven
+# sweep (pg_stat_clear_orphaned_wait_event_rings) but not the
+# init-time per-slot reclaim; without this scenario the
+# lifecycle's "common case" path would have no regression
+# coverage.
+#
+# Strategy:
+#   1. Spawn writer W4, enable wait_event_capture=trace, emit a
+#      wait so W4's slot transitions FREE -> OWNED with a real
+#      trace ring allocated, capture W4's procnumber, disconnect.
+#      W4's slot is now ORPHANED with non-empty ring contents.
+#   2. Verify from the reader session that the orphan is visible
+#      via pg_get_wait_event_trace(w4_proc).
+#   3. Spawn a new backend B with wait_event_capture=stats (so B
+#      does NOT allocate a trace ring of its own).  Query B's
+#      procnumber.  If B inherited W4's procnumber slot, then B's
+#      clear_orphan_at_init must have transitioned the slot from
+#      ORPHANED -> FREE at InitProcess time; we verify that by
+#      asserting pg_get_wait_event_trace(w4_proc) is empty.
+#   4. Retry up to a bounded number of times: procNumber
+#      assignment is determined by ProcGlobal's free list, which
+#      hands W4's just-freed slot out again only after the slots
+#      queued ahead of it, and the reuse is not strictly
+#      guaranteed (aux processes, autovacuum workers, etc. can
+#      take the slot in between).  If we exhaust retries without
+#      a same-procnumber hit, mark the scenario as skipped rather
+#      than fail -- the failure mode is environment-dependent,
+#      not a defect under test.
+
+my $w4 = $node->background_psql('postgres');
+$w4->query_safe("SET client_min_messages = warning;");
+$w4->query_safe("SET wait_event_capture = trace;");
+$w4->query_safe("SELECT pg_sleep(0.01);");
+
+my $w4_proc = $w4->query_safe(
+	"SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) "
+	. "WHERE pid = pg_backend_pid() LIMIT 1;");
+chomp $w4_proc;
+like($w4_proc, qr/^\d+$/,
+	'scenario-4 writer reported its procnumber');
+
+$w4->quit;
+
+# Wait for W4's full server-side exit before reading the orphan
+# or starting the retry loop.  pg_stat_activity loses the row
+# before the before_shmem_exit callbacks finish (the
+# OWNED -> ORPHANED transition lives in such a callback), so
+# polling pg_stat_activity isn't a perfect signal for the
+# transition itself -- but it IS a perfect signal for "the slot
+# has been returned to ProcGlobal->freeProcs and may now be
+# inherited by a new backend", which is what we need before
+# entering the retry loop.  Without this poll, on slower test
+# environments the next query can race ahead of the cleanup and
+# either observe the slot in OWNED state (which the reader
+# still reads correctly, but is a different invariant from what
+# this scenario tests) or observe $w4 still in pg_stat_activity
+# and fail the "no other client backend" assertion below.
+my $w4_gone = 0;
+for (my $i = 0; $i < 100; $i++)
+{
+	my $count = $reader->query_safe(
+		"SELECT count(*) FROM pg_stat_activity "
+		. "WHERE backend_type = 'client backend' "
+		. "  AND pid <> pg_backend_pid();");
+	chomp $count;
+	if ($count eq '0') { $w4_gone = 1; last; }
+	usleep(20_000);
+}
+ok($w4_gone, 'scenario-4 writer backend has exited');
+
+my $orphan_rows4 = $reader->query_safe(
+	"SELECT count(*) FROM pg_get_wait_event_trace($w4_proc);");
+chomp $orphan_rows4;
+cmp_ok($orphan_rows4, '>=', 1,
+	"scenario-4 orphan visible at procnumber $w4_proc before any reclaim "
+	. "(rows: $orphan_rows4)");
+
+# Retry loop: spawn new backends with capture = stats (does NOT
+# allocate a trace ring), capture procnumber, check whether the
+# inheritance landed on w4_proc.
+my $reclaimed = 0;
+my $attempts = 0;
+# ProcGlobal->freeProcs is a FIFO (dlist_push_tail on backend exit,
+# dlist_pop_head_node on new backend init), so after W4 disconnects
+# the just-freed slot goes to the tail of the queue.  To cycle the
+# queue back around to W4's procnumber, a new backend has to be
+# spawned for every free slot ahead of W4's in the queue.  The
+# cluster's max_connections is the upper bound; query it and add
+# a 20% safety margin to absorb any walsender/bgworker free-list
+# overlap or queue-occupancy fluctuations from autovacuum/etc.
+# This adapts automatically if the test config in $node->append_conf
+# above is changed.
+my $max_connections = $node->safe_psql('postgres',
+	'SHOW max_connections;');
+chomp $max_connections;
+my $max_attempts = int($max_connections * 1.2);
+my @observed_procs;
+
+for (my $i = 0; $i < $max_attempts; $i++)
+{
+	$attempts++;
+	# Backend "B" of the strategy; not "my $b", which shadows sort's $b.
+	my $probe = $node->background_psql('postgres');
+	$probe->query_safe("SET client_min_messages = warning;");
+	$probe->query_safe("SET wait_event_capture = stats;");
+	# Sleep duration kept slightly above the millisecond range so the
+	# pg_stat_get_wait_event_timing query reliably observes a non-zero
+	# entry count on slow CI hosts that may aggressively optimise
+	# sub-millisecond WaitLatch dispatches.
+	$probe->query_safe("SELECT pg_sleep(0.005);");
+
+	my $probe_proc = $probe->query_safe(
+		"SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) "
+		. "WHERE pid = pg_backend_pid() LIMIT 1;");
+	chomp $probe_proc;
+	push @observed_procs, $probe_proc;
+
+	if ($probe_proc eq $w4_proc)
+	{
+		# Inheritance landed.  B's clear_orphan_at_init at
+		# InitProcess time must have FREE'd W4's ORPHANED
+		# slot.  Since B used capture = stats (not trace), B
+		# allocated no new trace ring at this slot, so a
+		# subsequent pg_get_wait_event_trace($w4_proc) should
+		# return zero rows.  If it returns >= 1 row, those
+		# rows are W4's stale records -- the reclaim did not
+		# happen and the test fails.
+		my $rows_after = $reader->query_safe(
+			"SELECT count(*) FROM pg_get_wait_event_trace($w4_proc);");
+		chomp $rows_after;
+		is($rows_after, '0',
+			"clear_orphan_at_init reclaimed W4's orphan when B "
+			. "inherited procnumber $w4_proc (attempt $attempts)")
+		  or diag("expected 0 rows but pg_get_wait_event_trace("
+			. "$w4_proc) returned $rows_after; observed procnumber "
+			. "sequence: " . join(",", @observed_procs));
+		$reclaimed = 1;
+	}
+	$probe->quit;
+	last if $reclaimed;
+}
+
+SKIP:
+{
+	if (!$reclaimed)
+	{
+		diag("observed procnumbers (W4 was $w4_proc): "
+			. join(",", @observed_procs));
+		skip("procnumber $w4_proc was not reused within $max_attempts attempts; "
+			. "the ProcGlobal free-list order is environment-dependent. "
+			. "The clear_orphan_at_init code path is exercised at every "
+			. "backend init that DOES inherit an orphan; scenarios 1 and 2 "
+			. "above also cover orphan reclamation via the admin sweep.",
+			1);
+	}
+}
+
+# Note on test coverage: the position-encoded identity seqlock
+# in emit_wait_event_trace_for_procnumber() has no direct
+# regression test.  The bug it prevents (reader observing the
+# writer's new write_pos before the writer's rec->seq update has
+# propagated, then emitting a stale record with the wrong ring
+# index) is unreachable on x86 TSO without instrumentation.  The
+# writer-side INJECTION_POINT("wait-event-trace-after-write-pos")
+# is in place to support such a test -- a future TAP scenario
+# can attach with action 'wait' and verify the reader skips the
+# in-flight slot.  Wiring an async BackgroundPsql to wedge inside
+# the wait-event recording path proved fiddly enough to defer to
+# a follow-up; the identity check is correct by construction and
+# the protocol is documented on WaitEventTraceControl.
+
+$reader->quit;
+$node->stop;
+
+done_testing;
diff --git a/src/test/modules/test_wait_event_stress/t/002_ring_wrap.pl b/src/test/modules/test_wait_event_stress/t/002_ring_wrap.pl
new file mode 100644
index 0000000000000..ae3c101bb4934
--- /dev/null
+++ b/src/test/modules/test_wait_event_stress/t/002_ring_wrap.pl
@@ -0,0 +1,134 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+#
+# Wraparound regression test for the wait-event-trace ring buffer.
+#
+# Provisions a cluster with the smallest legal
+# wait_event_trace_ring_size_kb (8 KB = 256 records) and a small
+# max_connections, then drives a session through enough wait
+# events to force the ring to wrap many times.  Verifies that:
+#
+#   1. The session-local SRF (pg_get_backend_wait_event_trace)
+#      remains queryable when the ring has wrapped: the result is
+#      bounded by the ring size, well-formed, and the seq column
+#      reflects the most-recent records (not the oldest).
+#
+#   2. The cross-backend reader (pg_get_wait_event_trace) on a
+#      wrapped, currently-OWNED slot also returns well-formed
+#      records bounded by the ring size, with the per-record
+#      position-encoded identity seqlock correctly distinguishing
+#      current-cycle records from overwritten earlier-cycle ones.
+#
+# If the writer's `pos & ring_mask` indexing or the seqlock's
+# identity check (expected_seq = pos*2 + 2) is wrong, this test
+# either crashes the reader, produces NULL columns, or returns
+# more records than the ring can hold.
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use Time::HiRes qw(usleep);
+
+my $node = PostgreSQL::Test::Cluster->new('wet_ring_wrap');
+$node->init;
+
+# Smallest legal ring size (8 KB = 256 records of 32 bytes each).
+# Combined with the loops below this guarantees many ring wraps in
+# the writer session.  max_connections kept small so the
+# administrative cost of starting/stopping backends stays low.
+$node->append_conf(
+	'postgresql.conf', q{
+max_connections = 20
+wait_event_trace_ring_size_kb = 8
+});
+$node->start;
+
+# Skip if wait-event-timing wasn't compiled in.  Detect via a probe
+# SET; the GUC's check hook rejects non-OFF values on stub builds.
+my ($rc, $stdout, $stderr) = $node->psql(
+	'postgres',
+	"SET wait_event_capture = trace;",
+	on_error_stop => 0);
+if ($stderr =~ /wait event capture is not supported by this build/)
+{
+	plan skip_all => 'wait-event-timing not compiled in';
+}
+
+# Verify the GUC is what we asked for.
+my $ring_kb = $node->safe_psql('postgres', "SHOW wait_event_trace_ring_size_kb;");
+chomp $ring_kb;
+is($ring_kb, '8kB',
+	"wait_event_trace_ring_size_kb is the configured value: $ring_kb");
+
+# Drive the writer through several complete ring wraps.
+# Ring = 256 records.  Each pg_sleep() call emits at least one
+# PgSleep wait event, so 1024 sleeps alone write >= 4x the ring
+# size; the min(seq) >= 256 checks below then hold without
+# depending on incidental extra records (e.g. query markers).
+my $writer = $node->background_psql('postgres');
+$writer->query_safe("SET client_min_messages = warning;");
+$writer->query_safe("SET wait_event_capture = trace;");
+$writer->query_safe(
+	"DO \$\$ BEGIN FOR i IN 1..1024 LOOP PERFORM pg_sleep(0.0001); END LOOP; END \$\$;"
+);
+
+my $writer_proc = $writer->query_safe(
+	"SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid()) "
+	. "WHERE pid = pg_backend_pid() LIMIT 1;");
+chomp $writer_proc;
+like($writer_proc, qr/^\d+$/, 'writer reported its procnumber');
+
+# Session-local read: at most ring-size records, all well-formed,
+# seq values reflect the wrapped state (not 0..N-1).
+my $local_count = $writer->query_safe(
+	"SELECT count(*) FROM pg_get_backend_wait_event_trace();");
+chomp $local_count;
+cmp_ok($local_count, '<=', 256,
+	"session-local read returns at most ring_size records ($local_count <= 256)");
+cmp_ok($local_count, '>=', 1,
+	"session-local read returns at least one record");
+
+my $local_min_seq = $writer->query_safe(
+	"SELECT min(seq) FROM pg_get_backend_wait_event_trace();");
+chomp $local_min_seq;
+cmp_ok($local_min_seq, '>=', 256,
+	"session-local read sees post-wrap seq (min=$local_min_seq >= 256)");
+
+# Cross-backend read of the OWNED slot via pg_get_wait_event_trace.
+# This exercises the identity-seqlock check under the wrap regime.
+my $reader = $node->background_psql('postgres');
+
+my $cross_count = $reader->query_safe(
+	"SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+chomp $cross_count;
+cmp_ok($cross_count, '<=', 256,
+	"cross-backend read returns at most ring_size records ($cross_count <= 256)");
+cmp_ok($cross_count, '>=', 1,
+	"cross-backend read sees the wrapped ring's records");
+
+my $bad = $reader->query_safe(
+	"SELECT count(*) FROM pg_get_wait_event_trace($writer_proc) "
+	. "WHERE wait_event_type IS NULL "
+	. "   OR wait_event IS NULL "
+	. "   OR wait_event_type = '' "
+	. "   OR wait_event = '' "
+	. "   OR duration_us < 0;");
+chomp $bad;
+is($bad, '0',
+	'cross-backend read after wrap returns only well-formed rows');
+
+# The seq column on the cross-backend SRF reports the writer's
+# ring index, which after many wraps should be far above 256.
+my $cross_min_seq = $reader->query_safe(
+	"SELECT min(seq) FROM pg_get_wait_event_trace($writer_proc);");
+chomp $cross_min_seq;
+cmp_ok($cross_min_seq, '>=', 256,
+	"cross-backend reader sees post-wrap seq (min=$cross_min_seq >= 256)");
+
+$writer->quit;
+$reader->quit;
+$node->stop;
+
+done_testing;
diff --git a/src/test/modules/test_wait_event_stress/test_wait_event_stress--1.0.sql b/src/test/modules/test_wait_event_stress/test_wait_event_stress--1.0.sql
new file mode 100644
index 0000000000000..916fe9456a197
--- /dev/null
+++ b/src/test/modules/test_wait_event_stress/test_wait_event_stress--1.0.sql
@@ -0,0 +1,9 @@
+CREATE FUNCTION stress_wait_events(integer)
+RETURNS bigint
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
+
+CREATE FUNCTION test_lwlock_hash_overflow(integer)
+RETURNS integer
+AS 'MODULE_PATHNAME'
+LANGUAGE C STRICT;
diff --git a/src/test/modules/test_wait_event_stress/test_wait_event_stress.c b/src/test/modules/test_wait_event_stress/test_wait_event_stress.c
new file mode 100644
index 0000000000000..5ebb740dafb71
--- /dev/null
+++ b/src/test/modules/test_wait_event_stress/test_wait_event_stress.c
@@ -0,0 +1,103 @@
+#include "postgres.h"
+#include "fmgr.h"
+#include "funcapi.h"
+#include "pgstat.h"
+#include "storage/lwlock.h"
+#include "utils/wait_event.h"
+#include "utils/wait_event_types.h"
+#include "utils/timestamp.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * stress_wait_events(n int) -> bigint
+ *
+ * Calls pgstat_report_wait_start()/pgstat_report_wait_end() in a tight loop
+ * n times.  Returns the elapsed time in microseconds.
+ *
+ * This measures the pure overhead of the wait event timing instrumentation:
+ *   - 2x clock_gettime(CLOCK_MONOTONIC) via VDSO per iteration
+ *   - 1x histogram bucket calculation (CLZ instruction)
+ *   - 1x accumulator update (counter + total_ns)
+ *   - optionally 1x trace ring buffer write
+ *
+ * Usage:
+ *   SELECT stress_wait_events(1000000);  -- 1M iterations
+ *   -- returns elapsed microseconds
+ *   -- overhead per iteration = result / 1000000 microseconds
+ */
+PG_FUNCTION_INFO_V1(stress_wait_events);
+
+Datum
+stress_wait_events(PG_FUNCTION_ARGS)
+{
+	int32		iterations = PG_GETARG_INT32(0);
+	instr_time	start,
+				elapsed;
+	int			i;
+
+	if (iterations < 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("iterations must be non-negative")));
+
+	/* Time only the start/end pairs; argument checking is excluded. */
+	INSTR_TIME_SET_CURRENT(start);
+	for (i = 0; i < iterations; i++)
+	{
+		pgstat_report_wait_start(WAIT_EVENT_PG_SLEEP);
+		pgstat_report_wait_end();
+	}
+	INSTR_TIME_SET_CURRENT(elapsed);
+	/* Subtract at full resolution before truncating to microseconds. */
+	INSTR_TIME_SUBTRACT(elapsed, start);
+	PG_RETURN_INT64(INSTR_TIME_GET_MICROSEC(elapsed));
+}
+
+/*
+ * test_lwlock_hash_overflow(n_tranches int) -> int
+ *
+ * Registers n_tranches custom LWLock tranches and triggers a
+ * pgstat_report_wait_start()/pgstat_report_wait_end() cycle on each.
+ * Returns the number of tranches that were triggered.
+ *
+ * With n_tranches > LWLOCK_TIMING_MAX_ENTRIES (192), this exercises the
+ * hash overflow path; the caller must check that the one-time WARNING fired.
+ *
+ * Usage:
+ *   SET wait_event_capture = stats;
+ *   SET client_min_messages = warning;
+ *   SELECT test_lwlock_hash_overflow(200);
+ *   -- expect WARNING about LWLock hash table full
+ */
+PG_FUNCTION_INFO_V1(test_lwlock_hash_overflow);
+
+Datum
+test_lwlock_hash_overflow(PG_FUNCTION_ARGS)
+{
+	int32		n_tranches = PG_GETARG_INT32(0);
+	int			idx;
+
+	if (n_tranches < 0 || n_tranches > 1000)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("n_tranches must be between 0 and 1000")));
+
+	for (idx = 0; idx < n_tranches; idx++)
+	{
+		char		tranche_name[64];
+		uint32		wait_event_info;
+
+		/* Name each new tranche after its loop index. */
+		snprintf(tranche_name, sizeof(tranche_name),
+				 "test_lwlock_overflow_%d", idx);
+		/* Wait event info is PG_WAIT_LWLOCK OR'ed with the tranche id. */
+		wait_event_info = PG_WAIT_LWLOCK |
+			(uint32) LWLockNewTrancheId(tranche_name);
+
+		pgstat_report_wait_start(wait_event_info);
+		pgstat_report_wait_end();
+	}
+
+	PG_RETURN_INT32(n_tranches);
+}
diff --git a/src/test/modules/test_wait_event_stress/test_wait_event_stress.control b/src/test/modules/test_wait_event_stress/test_wait_event_stress.control
new file mode 100644
index 0000000000000..8c2b50e2af620
--- /dev/null
+++ b/src/test/modules/test_wait_event_stress/test_wait_event_stress.control
@@ -0,0 +1,4 @@
+comment = 'Stress test for wait event timing overhead'
+default_version = '1.0'
+module_pathname = '$libdir/test_wait_event_stress'
+relocatable = true
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index a65a5bf0c4fbc..0a00c00cd5c91 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1332,6 +1332,13 @@ pg_backend_memory_contexts| SELECT name,
     free_chunks,
     used_bytes
    FROM pg_get_backend_memory_contexts() pg_get_backend_memory_contexts(name, ident, type, level, path, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes);
+pg_backend_wait_event_trace| SELECT seq,
+    timestamp_ns,
+    wait_event_type,
+    wait_event,
+    duration_us,
+    query_id
+   FROM pg_get_backend_wait_event_trace() t(seq, timestamp_ns, wait_event_type, wait_event, duration_us, query_id);
 pg_config| SELECT name,
     setting
    FROM pg_config() pg_config(name, setting);
@@ -2415,6 +2422,24 @@ pg_stat_user_tables| SELECT relid,
     stats_reset
    FROM pg_stat_all_tables
   WHERE ((schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (schemaname !~ '^pg_toast'::text));
+pg_stat_wait_event_timing| SELECT pid,
+    backend_type,
+    procnumber,
+    wait_event_type,
+    wait_event,
+    calls,
+    total_time_ms,
+    avg_time_us,
+    max_time_us,
+    histogram
+   FROM pg_stat_get_wait_event_timing(NULL::integer) t(pid, backend_type, procnumber, wait_event_type, wait_event, calls, total_time_ms, avg_time_us, max_time_us, histogram);
+pg_stat_wait_event_timing_overflow| SELECT pid,
+    backend_type,
+    procnumber,
+    lwlock_overflow_count,
+    flat_overflow_count,
+    reset_count
+   FROM pg_stat_get_wait_event_timing_overflow(NULL::integer) t(pid, backend_type, procnumber, lwlock_overflow_count, flat_overflow_count, reset_count);
 pg_stat_wal| SELECT wal_records,
     wal_fpi,
     wal_bytes,
@@ -2891,6 +2916,11 @@ pg_views| SELECT n.nspname AS schemaname,
    FROM (pg_class c
      LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)))
   WHERE (c.relkind = 'v'::"char");
+pg_wait_event_timing_histogram_buckets| SELECT bucket_idx,
+    lower_ns,
+    upper_ns,
+    label
+   FROM ( VALUES (0,(0)::bigint,(1024)::bigint,'<1us'::text), (1,(1024)::bigint,(2048)::bigint,'1-2us'::text), (2,(2048)::bigint,(4096)::bigint,'2-4us'::text), (3,(4096)::bigint,(8192)::bigint,'4-8us'::text), (4,(8192)::bigint,(16384)::bigint,'8-16us'::text), (5,(16384)::bigint,(32768)::bigint,'16-32us'::text), (6,(32768)::bigint,(65536)::bigint,'32-64us'::text), (7,(65536)::bigint,(131072)::bigint,'64-128us'::text), (8,(131072)::bigint,(262144)::bigint,'128-256us'::text), (9,(262144)::bigint,(524288)::bigint,'256-512us'::text), (10,(524288)::bigint,(1048576)::bigint,'512us-1ms'::text), (11,(1048576)::bigint,(2097152)::bigint,'1-2ms'::text), (12,(2097152)::bigint,(4194304)::bigint,'2-4ms'::text), (13,(4194304)::bigint,(8388608)::bigint,'4-8ms'::text), (14,(8388608)::bigint,(16777216)::bigint,'8-16ms'::text), (15,(16777216)::bigint,(33554432)::bigint,'16-32ms'::text), (16,(33554432)::bigint,(67108864)::bigint,'32-64ms'::text), (17,(67108864)::bigint,(134217728)::bigint,'64-128ms'::text), (18,(134217728)::bigint,(268435456)::bigint,'128-256ms'::text), (19,(268435456)::bigint,(536870912)::bigint,'256-512ms'::text), (20,(536870912)::bigint,(1073741824)::bigint,'512ms-1s'::text), (21,(1073741824)::bigint,'2147483648'::bigint,'1-2s'::text), (22,'2147483648'::bigint,'4294967296'::bigint,'2-4s'::text), (23,'4294967296'::bigint,'8589934592'::bigint,'4-8s'::text), (24,'8589934592'::bigint,'17179869184'::bigint,'8-16s'::text), (25,'17179869184'::bigint,'34359738368'::bigint,'16-32s'::text), (26,'34359738368'::bigint,'68719476736'::bigint,'32-64s'::text), (27,'68719476736'::bigint,'137438953472'::bigint,'64-128s'::text), (28,'137438953472'::bigint,'274877906944'::bigint,'128-256s'::text), (29,'274877906944'::bigint,'549755813888'::bigint,'256-512s'::text), (30,'549755813888'::bigint,'1099511627776'::bigint,'512s-1024s'::text), (31,'1099511627776'::bigint,NULL::bigint,'>=1024s'::text)) t(bucket_idx, lower_ns, upper_ns, label);
 pg_wait_events| SELECT type,
     name,
     description
diff --git a/src/test/regress/expected/wait_event_timing.out b/src/test/regress/expected/wait_event_timing.out
new file mode 100644
index 0000000000000..339e98507779e
--- /dev/null
+++ b/src/test/regress/expected/wait_event_timing.out
@@ -0,0 +1,397 @@
+--
+-- Test wait event timing infrastructure
+--
+-- These tests verify the wait event timing SQL interface.
+-- They require --enable-wait-event-timing (or -Dwait_event_timing=true for
+-- meson) at compile time.  Without it, the alternate expected output
+-- wait_event_timing_1.out is used.  The default CI (Cirrus) runs without
+-- timing enabled, so the non-timing path is tested automatically.
+--
+-- Check GUC default
+SHOW wait_event_capture;
+ wait_event_capture 
+--------------------
+ off
+(1 row)
+
+-- Enable stats-level capture for this test (PGC_SUSET, requires superuser)
+SET wait_event_capture = stats;
+-- Verify views exist (zero rows is fine, just checking structure)
+SELECT * FROM pg_stat_wait_event_timing LIMIT 0;
+ pid | backend_type | procnumber | wait_event_type | wait_event | calls | total_time_ms | avg_time_us | max_time_us | histogram 
+-----+--------------+------------+-----------------+------------+-------+---------------+-------------+-------------+-----------
+(0 rows)
+
+SELECT * FROM pg_backend_wait_event_trace LIMIT 0;
+ seq | timestamp_ns | wait_event_type | wait_event | duration_us | query_id 
+-----+--------------+-----------------+------------+-------------+----------
+(0 rows)
+
+-- The histogram-buckets taxonomy view is constant: 16 ordered rows,
+-- ascending bin edges, last bucket open-ended.  Available in both
+-- timing and non-timing builds (defined in system_views.sql, not gated
+-- on the compile flag).
+SELECT count(*) = 32 AS thirty_two_rows,
+       min(bucket_idx) = 0 AS idx_starts_at_zero,
+       max(bucket_idx) = 31 AS idx_ends_at_thirty_one,
+       bool_and(lower_ns IS NOT NULL) AS all_lowers_present,
+       count(*) FILTER (WHERE upper_ns IS NULL) = 1 AS one_open_bucket
+FROM pg_wait_event_timing_histogram_buckets;
+ thirty_two_rows | idx_starts_at_zero | idx_ends_at_thirty_one | all_lowers_present | one_open_bucket 
+-----------------+--------------------+------------------------+--------------------+-----------------
+ t               | t                  | t                      | t                  | t
+(1 row)
+
+-- Verify column types of timing view
+SELECT
+    a.attname,
+    pg_catalog.format_type(a.atttypid, a.atttypmod) as type
+FROM pg_catalog.pg_attribute a
+JOIN pg_catalog.pg_class c ON a.attrelid = c.oid
+JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid
+WHERE n.nspname = 'pg_catalog'
+  AND c.relname = 'pg_stat_wait_event_timing'
+  AND a.attnum > 0
+  AND NOT a.attisdropped
+ORDER BY a.attnum;
+     attname     |       type       
+-----------------+------------------
+ pid             | integer
+ backend_type    | text
+ procnumber      | integer
+ wait_event_type | text
+ wait_event      | text
+ calls           | bigint
+ total_time_ms   | double precision
+ avg_time_us     | double precision
+ max_time_us     | double precision
+ histogram       | bigint[]
+(10 rows)
+
+-- Generate a wait event
+SELECT pg_sleep(0.1);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+-- Verify PgSleep event appears with correct structure
+SELECT
+    pid = pg_backend_pid() AS pid_ok,
+    backend_type,
+    wait_event_type,
+    wait_event,
+    calls >= 1 AS has_calls,
+    total_time_ms > 0 AS has_time,
+    avg_time_us > 0 AS has_avg,
+    max_time_us > 0 AS has_max,
+    pg_typeof(histogram) AS hist_type,
+    array_length(histogram, 1) AS hist_len,
+    calls = (SELECT sum(x) FROM unnest(histogram) x) AS hist_invariant
+FROM pg_stat_wait_event_timing
+WHERE wait_event = 'PgSleep';
+ pid_ok |  backend_type  | wait_event_type | wait_event | has_calls | has_time | has_avg | has_max | hist_type | hist_len | hist_invariant 
+--------+----------------+-----------------+------------+-----------+----------+---------+---------+-----------+----------+----------------
+ t      | client backend | Timeout         | PgSleep    | t         | t        | t       | t       | bigint[]  |       32 | t
+(1 row)
+
+-- Test reset function (own backend)
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ pg_stat_reset_wait_event_timing 
+---------------------------------
+ 
+(1 row)
+
+SELECT count(*) AS after_reset
+FROM pg_stat_wait_event_timing
+WHERE wait_event = 'PgSleep';
+ after_reset 
+-------------
+           0
+(1 row)
+
+-- Test trace ring buffer (need compute_query_id for query markers)
+SET compute_query_id = on;
+SET wait_event_capture = trace;
+SELECT pg_sleep(0.01);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+SELECT
+    wait_event_type,
+    wait_event,
+    duration_us >= 0 AS dur_ok,
+    seq >= 0 AS seq_ok
+FROM pg_backend_wait_event_trace
+WHERE wait_event = 'PgSleep';
+ wait_event_type | wait_event | dur_ok | seq_ok 
+-----------------+------------+--------+--------
+ Timeout         | PgSleep    | t      | t
+(1 row)
+
+-- Test query markers exist in trace
+SELECT count(*) > 0 AS has_query_markers
+FROM pg_backend_wait_event_trace
+WHERE wait_event_type = 'Query';
+ has_query_markers 
+-------------------
+ t
+(1 row)
+
+-- Reset does not crash: NULL and own PID are equivalent
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ pg_stat_reset_wait_event_timing 
+---------------------------------
+ 
+(1 row)
+
+SELECT pg_stat_reset_wait_event_timing(pg_backend_pid());
+ pg_stat_reset_wait_event_timing 
+---------------------------------
+ 
+(1 row)
+
+-- Unknown PID is a silent no-op (matches pg_stat_reset_backend_stats)
+SELECT pg_stat_reset_wait_event_timing(2147483647);
+ pg_stat_reset_wait_event_timing 
+---------------------------------
+ 
+(1 row)
+
+-- Cluster-wide reset (superuser-only)
+SELECT pg_stat_reset_wait_event_timing_all();
+ pg_stat_reset_wait_event_timing_all 
+-------------------------------------
+ 
+(1 row)
+
+-- Trace read (no arguments; always returns own session)
+SELECT count(*) >= 0 AS trace_readable
+FROM pg_get_backend_wait_event_trace();
+ trace_readable 
+----------------
+ t
+(1 row)
+
+-- Test trace lifecycle: drop to stats, then back up to trace
+SET compute_query_id = on;
+SET wait_event_capture = stats;
+SET wait_event_capture = trace;
+SELECT 1 AS reattach_test;
+ reattach_test 
+---------------
+             1
+(1 row)
+
+SELECT count(*) >= 0 AS trace_reattach_ok
+FROM pg_backend_wait_event_trace;
+ trace_reattach_ok 
+-------------------
+ t
+(1 row)
+
+SET wait_event_capture = stats;
+-- Pin issue #15 fix: TRACE -> OFF (or STATS) must release the DSA ring,
+-- and a subsequent re-enable must allocate a fresh, empty ring.  Old
+-- trace records do NOT survive the disable, but aggregated stats in
+-- pg_stat_wait_event_timing DO (they live in a separate DSA allocation).
+--
+-- The assertions below are strict-equal on count-agnostic invariants.
+-- We deliberately avoid "count(*) = N" style assertions here: pg_sleep()
+-- loops around WaitLatch and can emit more than one PgSleep wait event
+-- per call under CPU contention (spurious latch wakes), so a fixed count
+-- would be flaky on busy CI runners.  Instead:
+--
+--   * ring_reallocated is decided by comparing phase 2's max(seq) against
+--     phase 1's (seq is derived from write_pos, which resets to 0 on a
+--     freshly allocated ring -- phase 2's records must have strictly
+--     smaller seq than phase 1's last record iff the ring was freed).
+--
+--   * stats_preserved_exactly checks that aggregated "calls" equals the
+--     exact sum of events seen in the two phase rings.  Whatever each
+--     phase's ring count happens to be, the aggregated counter must land
+--     on that sum; any drop, asymmetric duplication, or reset-on-toggle
+--     bug breaks the equality.
+--
+-- The symmetric-duplication case (both ring and aggregated doubled
+-- identically) is covered separately in test_wait_event_stress using
+-- deterministic exact-count input via stress_wait_events().
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ pg_stat_reset_wait_event_timing 
+---------------------------------
+ 
+(1 row)
+
+SET wait_event_capture = trace;
+SELECT pg_sleep(0.001);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+SELECT pg_sleep(0.001);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+-- Stash phase 1's ring count + highest seq (all phase-1 records).
+CREATE TEMP TABLE wet_phase1 AS
+SELECT count(*) AS n, max(seq) AS max_seq
+FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep';
+-- At least two PgSleep events captured (one per pg_sleep call, ignoring
+-- spurious wakes).  Catches drop bugs.
+SELECT n >= 2 AS phase1_captured_both_sleeps
+FROM wet_phase1;
+ phase1_captured_both_sleeps 
+-----------------------------
+ t
+(1 row)
+
+SET wait_event_capture = off;
+SET wait_event_capture = trace;
+SELECT pg_sleep(0.001);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+-- Phase 2: stash fresh-ring count + max(seq).
+CREATE TEMP TABLE wet_phase2 AS
+SELECT count(*) AS n, max(seq) AS max_seq
+FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep';
+-- The ring was freed iff phase 2's records all have seq strictly smaller
+-- than phase 1's last seq (write_pos started over at 0).  If the ring
+-- had persisted, phase 2 would contain phase 1's records plus new ones,
+-- so max(seq) would be >= phase1.max_seq.  Strict-equal on semantic.
+SELECT n >= 1 AND max_seq < (SELECT max_seq FROM wet_phase1)
+       AS ring_freed_and_reallocated
+FROM wet_phase2;
+ ring_freed_and_reallocated 
+----------------------------
+ t
+(1 row)
+
+-- Aggregated stats must equal the exact sum of the two phase ring counts.
+-- Catches drops (aggregated < sum), asymmetric duplication, and any
+-- reset-on-toggle bug that would wipe aggregated counters.
+SELECT calls = (SELECT n FROM wet_phase1) + (SELECT n FROM wet_phase2)
+       AS stats_preserved_exactly
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+ stats_preserved_exactly 
+-------------------------
+ t
+(1 row)
+
+DROP TABLE wet_phase1, wet_phase2;
+SET wait_event_capture = stats;
+-- Overflow counters view: should be readable and overflow counts should
+-- be zero for a freshly-reset session that hasn't exceeded limits.
+-- reset_count must have incremented at least once (we called reset above).
+SELECT
+    pid = pg_backend_pid() AS pid_ok,
+    lwlock_overflow_count >= 0 AS lw_nonneg,
+    flat_overflow_count >= 0 AS flat_nonneg,
+    reset_count >= 1 AS reset_count_bumped
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+ pid_ok | lw_nonneg | flat_nonneg | reset_count_bumped 
+--------+-----------+-------------+--------------------
+ t      | t         | t           | t
+(1 row)
+
+-- Orphan-clear admin function: smoke-test that it returns a non-negative
+-- count and is callable without error.  Actual orphan-creation requires
+-- a backend exit, which the regression harness can't easily orchestrate
+-- in a portable way; we verify here only that the API works.  Returns
+-- bigint (count of rings freed); typically 0 in a fresh test run.
+SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok;
+ clear_orphans_ok 
+------------------
+ t
+(1 row)
+
+-- PID-filter fast path on the cluster-wide SRFs.  Smoke-test that the
+-- single-slot branch returns rows for the calling backend and zero rows
+-- for a known-bad PID (matching pg_stat_reset_wait_event_timing
+-- semantics).
+SELECT
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing(pg_backend_pid())
+     WHERE pid = pg_backend_pid()) >= 0 AS own_pid_returns_rows,
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing(2147483647)) = 0
+        AS unknown_pid_empty,
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(pg_backend_pid())
+     WHERE pid = pg_backend_pid()) = 1 AS overflow_own_pid_one_row,
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(2147483647)) = 0
+        AS overflow_unknown_pid_empty;
+ own_pid_returns_rows | unknown_pid_empty | overflow_own_pid_one_row | overflow_unknown_pid_empty 
+----------------------+-------------------+--------------------------+----------------------------
+ t                    | t                 | t                        | t
+(1 row)
+
+-- Cross-backend trace SRF: smoke-test that pg_get_wait_event_trace
+-- (procnumber-keyed) is callable and returns sensible results.
+-- Full orphan-readability and the parallel-worker case are exercised
+-- by the TAP test (which can orchestrate backend exits).
+SET wait_event_capture = trace;
+-- generate at least one wait event so the ring is allocated
+SELECT pg_sleep(0.01);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+SELECT
+    -- Own session: pull our procnumber from the timing SRF, then read
+    -- our own trace ring through the cross-backend SRF.
+    (SELECT count(*) FROM pg_get_wait_event_trace(
+        (SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid())
+         WHERE pid = pg_backend_pid() LIMIT 1))) >= 0
+        AS by_procnumber_self_ok,
+    -- Out-of-range procnumber: empty result, no error.
+    (SELECT count(*) FROM pg_get_wait_event_trace(-1)) = 0
+        AS negative_procnumber_empty,
+    (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0
+        AS huge_procnumber_empty;
+ by_procnumber_self_ok | negative_procnumber_empty | huge_procnumber_empty 
+-----------------------+---------------------------+-----------------------
+ t                     | t                         | t
+(1 row)
+
+-- With capture disabled, a never-allocated slot still reads as empty
+-- (the function short-circuits when the trace DSA was never created
+-- or when the slot is FREE).
+SET wait_event_capture = off;
+SELECT (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0
+    AS capture_off_empty;
+ capture_off_empty 
+-------------------
+ t
+(1 row)
+
+-- Permission gating: a role without pg_read_all_stats cannot call the
+-- function.  Cover both the public role and a freshly-created one.
+SET wait_event_capture = stats;
+CREATE ROLE regress_wet_reader_nopriv NOLOGIN;
+DO $$
+DECLARE
+    err text;
+BEGIN
+    SET LOCAL ROLE regress_wet_reader_nopriv;
+    BEGIN
+        PERFORM count(*) FROM pg_get_wait_event_trace(0);
+        err := 'NO ERROR (unexpected: function should be denied)';
+    EXCEPTION WHEN insufficient_privilege THEN
+        err := 'permission denied (expected)';
+    END;
+    RAISE NOTICE 'permission gate: %', err;
+END
+$$;
+NOTICE:  permission gate: permission denied (expected)
+DROP ROLE regress_wet_reader_nopriv;
+-- Clean up
+RESET wait_event_capture;
+RESET compute_query_id;
diff --git a/src/test/regress/expected/wait_event_timing_1.out b/src/test/regress/expected/wait_event_timing_1.out
new file mode 100644
index 0000000000000..2df1b0c3e870a
--- /dev/null
+++ b/src/test/regress/expected/wait_event_timing_1.out
@@ -0,0 +1,405 @@
+--
+-- Test wait event timing infrastructure
+--
+-- These tests verify the wait event timing SQL interface.
+-- They require --enable-wait-event-timing (or -Dwait_event_timing=true for
+-- meson) at compile time.  Without it, the alternate expected output
+-- wait_event_timing_1.out is used.  The default CI (Cirrus) runs without
+-- timing enabled, so the non-timing path is tested automatically.
+--
+-- Check GUC default
+SHOW wait_event_capture;
+ wait_event_capture 
+--------------------
+ off
+(1 row)
+
+-- Enable stats-level capture for this test (PGC_SUSET, requires superuser)
+SET wait_event_capture = stats;
+ERROR:  invalid value for parameter "wait_event_capture": "stats"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+-- Verify views exist (zero rows is fine, just checking structure)
+SELECT * FROM pg_stat_wait_event_timing LIMIT 0;
+ pid | backend_type | procnumber | wait_event_type | wait_event | calls | total_time_ms | avg_time_us | max_time_us | histogram 
+-----+--------------+------------+-----------------+------------+-------+---------------+-------------+-------------+-----------
+(0 rows)
+
+SELECT * FROM pg_backend_wait_event_trace LIMIT 0;
+ seq | timestamp_ns | wait_event_type | wait_event | duration_us | query_id 
+-----+--------------+-----------------+------------+-------------+----------
+(0 rows)
+
+-- The histogram-buckets taxonomy view is constant: 16 ordered rows,
+-- ascending bin edges, last bucket open-ended.  Available in both
+-- timing and non-timing builds (defined in system_views.sql, not gated
+-- on the compile flag).
+SELECT count(*) = 32 AS thirty_two_rows,
+       min(bucket_idx) = 0 AS idx_starts_at_zero,
+       max(bucket_idx) = 31 AS idx_ends_at_thirty_one,
+       bool_and(lower_ns IS NOT NULL) AS all_lowers_present,
+       count(*) FILTER (WHERE upper_ns IS NULL) = 1 AS one_open_bucket
+FROM pg_wait_event_timing_histogram_buckets;
+ thirty_two_rows | idx_starts_at_zero | idx_ends_at_thirty_one | all_lowers_present | one_open_bucket 
+-----------------+--------------------+------------------------+--------------------+-----------------
+ t               | t                  | t                      | t                  | t
+(1 row)
+
+-- Verify column types of timing view
+SELECT
+    a.attname,
+    pg_catalog.format_type(a.atttypid, a.atttypmod) as type
+FROM pg_catalog.pg_attribute a
+JOIN pg_catalog.pg_class c ON a.attrelid = c.oid
+JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid
+WHERE n.nspname = 'pg_catalog'
+  AND c.relname = 'pg_stat_wait_event_timing'
+  AND a.attnum > 0
+  AND NOT a.attisdropped
+ORDER BY a.attnum;
+     attname     |       type       
+-----------------+------------------
+ pid             | integer
+ backend_type    | text
+ procnumber      | integer
+ wait_event_type | text
+ wait_event      | text
+ calls           | bigint
+ total_time_ms   | double precision
+ avg_time_us     | double precision
+ max_time_us     | double precision
+ histogram       | bigint[]
+(10 rows)
+
+-- Generate a wait event
+SELECT pg_sleep(0.1);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+-- Verify PgSleep event appears with correct structure
+SELECT
+    pid = pg_backend_pid() AS pid_ok,
+    backend_type,
+    wait_event_type,
+    wait_event,
+    calls >= 1 AS has_calls,
+    total_time_ms > 0 AS has_time,
+    avg_time_us > 0 AS has_avg,
+    max_time_us > 0 AS has_max,
+    pg_typeof(histogram) AS hist_type,
+    array_length(histogram, 1) AS hist_len,
+    calls = (SELECT sum(x) FROM unnest(histogram) x) AS hist_invariant
+FROM pg_stat_wait_event_timing
+WHERE wait_event = 'PgSleep';
+ pid_ok | backend_type | wait_event_type | wait_event | has_calls | has_time | has_avg | has_max | hist_type | hist_len | hist_invariant 
+--------+--------------+-----------------+------------+-----------+----------+---------+---------+-----------+----------+----------------
+(0 rows)
+
+-- Test reset function (own backend)
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ERROR:  wait event capture is not supported by this build
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+SELECT count(*) AS after_reset
+FROM pg_stat_wait_event_timing
+WHERE wait_event = 'PgSleep';
+ after_reset 
+-------------
+           0
+(1 row)
+
+-- Test trace ring buffer (need compute_query_id for query markers)
+SET compute_query_id = on;
+SET wait_event_capture = trace;
+ERROR:  invalid value for parameter "wait_event_capture": "trace"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+SELECT pg_sleep(0.01);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+SELECT
+    wait_event_type,
+    wait_event,
+    duration_us >= 0 AS dur_ok,
+    seq >= 0 AS seq_ok
+FROM pg_backend_wait_event_trace
+WHERE wait_event = 'PgSleep';
+ wait_event_type | wait_event | dur_ok | seq_ok 
+-----------------+------------+--------+--------
+(0 rows)
+
+-- Test query markers exist in trace
+SELECT count(*) > 0 AS has_query_markers
+FROM pg_backend_wait_event_trace
+WHERE wait_event_type = 'Query';
+ has_query_markers 
+-------------------
+ f
+(1 row)
+
+-- Reset does not crash: NULL and own PID are equivalent
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ERROR:  wait event capture is not supported by this build
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+SELECT pg_stat_reset_wait_event_timing(pg_backend_pid());
+ERROR:  wait event capture is not supported by this build
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+-- Unknown PID is a silent no-op (matches pg_stat_reset_backend_stats)
+SELECT pg_stat_reset_wait_event_timing(2147483647);
+ERROR:  wait event capture is not supported by this build
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+-- Cluster-wide reset (superuser-only)
+SELECT pg_stat_reset_wait_event_timing_all();
+ERROR:  wait event capture is not supported by this build
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+-- Trace read (no arguments; always returns own session)
+SELECT count(*) >= 0 AS trace_readable
+FROM pg_get_backend_wait_event_trace();
+ trace_readable 
+----------------
+ t
+(1 row)
+
+-- Test trace lifecycle: drop to stats, then back up to trace
+SET compute_query_id = on;
+SET wait_event_capture = stats;
+ERROR:  invalid value for parameter "wait_event_capture": "stats"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+SET wait_event_capture = trace;
+ERROR:  invalid value for parameter "wait_event_capture": "trace"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+SELECT 1 AS reattach_test;
+ reattach_test 
+---------------
+             1
+(1 row)
+
+SELECT count(*) >= 0 AS trace_reattach_ok
+FROM pg_backend_wait_event_trace;
+ trace_reattach_ok 
+-------------------
+ t
+(1 row)
+
+SET wait_event_capture = stats;
+ERROR:  invalid value for parameter "wait_event_capture": "stats"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+-- Pin issue #15 fix: TRACE -> OFF (or STATS) must release the DSA ring,
+-- and a subsequent re-enable must allocate a fresh, empty ring.  Old
+-- trace records do NOT survive the disable, but aggregated stats in
+-- pg_stat_wait_event_timing DO (they live in a separate DSA allocation).
+--
+-- The assertions below are strict-equal on count-agnostic invariants.
+-- We deliberately avoid "count(*) = N" style assertions here: pg_sleep()
+-- loops around WaitLatch and can emit more than one PgSleep wait event
+-- per call under CPU contention (spurious latch wakes), so a fixed count
+-- would be flaky on busy CI runners.  Instead:
+--
+--   * ring_reallocated is decided by comparing phase 2's max(seq) against
+--     phase 1's (seq is derived from write_pos, which resets to 0 on a
+--     freshly allocated ring -- phase 2's records must have strictly
+--     smaller seq than phase 1's last record iff the ring was freed).
+--
+--   * stats_preserved_exactly checks that aggregated "calls" equals the
+--     exact sum of events seen in the two phase rings.  Whatever each
+--     phase's ring count happens to be, the aggregated counter must land
+--     on that sum; any drop, asymmetric duplication, or reset-on-toggle
+--     bug breaks the equality.
+--
+-- The symmetric-duplication case (both ring and aggregated doubled
+-- identically) is covered separately in test_wait_event_stress using
+-- deterministic exact-count input via stress_wait_events().
+SELECT pg_stat_reset_wait_event_timing(NULL);
+ERROR:  wait event capture is not supported by this build
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+SET wait_event_capture = trace;
+ERROR:  invalid value for parameter "wait_event_capture": "trace"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+SELECT pg_sleep(0.001);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+SELECT pg_sleep(0.001);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+-- Stash phase 1's ring count + highest seq (all phase-1 records).
+CREATE TEMP TABLE wet_phase1 AS
+SELECT count(*) AS n, max(seq) AS max_seq
+FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep';
+-- At least two PgSleep events captured (one per pg_sleep call, ignoring
+-- spurious wakes).  Catches drop bugs.
+SELECT n >= 2 AS phase1_captured_both_sleeps
+FROM wet_phase1;
+ phase1_captured_both_sleeps 
+-----------------------------
+ f
+(1 row)
+
+SET wait_event_capture = off;
+SET wait_event_capture = trace;
+ERROR:  invalid value for parameter "wait_event_capture": "trace"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+SELECT pg_sleep(0.001);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+-- Phase 2: stash fresh-ring count + max(seq).
+CREATE TEMP TABLE wet_phase2 AS
+SELECT count(*) AS n, max(seq) AS max_seq
+FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep';
+-- The ring was freed iff phase 2's records all have seq strictly smaller
+-- than phase 1's last seq (write_pos started over at 0).  If the ring
+-- had persisted, phase 2 would contain phase 1's records plus new ones,
+-- so max(seq) would be >= phase1.max_seq.  Strict-equal on semantic.
+SELECT n >= 1 AND max_seq < (SELECT max_seq FROM wet_phase1)
+       AS ring_freed_and_reallocated
+FROM wet_phase2;
+ ring_freed_and_reallocated 
+----------------------------
+ f
+(1 row)
+
+-- Aggregated stats must equal the exact sum of the two phase ring counts.
+-- Catches drops (aggregated < sum), asymmetric duplication, and any
+-- reset-on-toggle bug that would wipe aggregated counters.
+SELECT calls = (SELECT n FROM wet_phase1) + (SELECT n FROM wet_phase2)
+       AS stats_preserved_exactly
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+ stats_preserved_exactly 
+-------------------------
+(0 rows)
+
+DROP TABLE wet_phase1, wet_phase2;
+SET wait_event_capture = stats;
+ERROR:  invalid value for parameter "wait_event_capture": "stats"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+-- Overflow counters view: should be readable and overflow counts should
+-- be zero for a freshly-reset session that hasn't exceeded limits.
+-- reset_count must have incremented at least once (we called reset above).
+SELECT
+    pid = pg_backend_pid() AS pid_ok,
+    lwlock_overflow_count >= 0 AS lw_nonneg,
+    flat_overflow_count >= 0 AS flat_nonneg,
+    reset_count >= 1 AS reset_count_bumped
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+ pid_ok | lw_nonneg | flat_nonneg | reset_count_bumped 
+--------+-----------+-------------+--------------------
+(0 rows)
+
+-- Orphan-clear admin function: smoke-test that it returns a non-negative
+-- count and is callable without error.  Actual orphan-creation requires
+-- a backend exit, which the regression harness can't easily orchestrate
+-- in a portable way; we verify here only that the API works.  Returns
+-- bigint (count of rings freed); typically 0 in a fresh test run.
+SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok;
+ clear_orphans_ok 
+------------------
+ t
+(1 row)
+
+-- PID-filter fast path on the cluster-wide SRFs.  Smoke-test that the
+-- single-slot branch returns rows for the calling backend and zero rows
+-- for a known-bad PID (matching pg_stat_reset_wait_event_timing
+-- semantics).
+SELECT
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing(pg_backend_pid())
+     WHERE pid = pg_backend_pid()) >= 0 AS own_pid_returns_rows,
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing(2147483647)) = 0
+        AS unknown_pid_empty,
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(pg_backend_pid())
+     WHERE pid = pg_backend_pid()) = 1 AS overflow_own_pid_one_row,
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(2147483647)) = 0
+        AS overflow_unknown_pid_empty;
+ own_pid_returns_rows | unknown_pid_empty | overflow_own_pid_one_row | overflow_unknown_pid_empty 
+----------------------+-------------------+--------------------------+----------------------------
+ t                    | t                 | f                        | t
+(1 row)
+
+-- Cross-backend trace SRF: smoke-test that pg_get_wait_event_trace
+-- (procnumber-keyed) is callable and returns sensible results.
+-- Full orphan-readability and the parallel-worker case are exercised
+-- by the TAP test (which can orchestrate backend exits).
+SET wait_event_capture = trace;
+ERROR:  invalid value for parameter "wait_event_capture": "trace"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+-- generate at least one wait event so the ring is allocated
+SELECT pg_sleep(0.01);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+SELECT
+    -- Own session: pull our procnumber from the timing SRF, then read
+    -- our own trace ring through the cross-backend SRF.
+    (SELECT count(*) FROM pg_get_wait_event_trace(
+        (SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid())
+         WHERE pid = pg_backend_pid() LIMIT 1))) >= 0
+        AS by_procnumber_self_ok,
+    -- Out-of-range procnumber: empty result, no error.
+    (SELECT count(*) FROM pg_get_wait_event_trace(-1)) = 0
+        AS negative_procnumber_empty,
+    (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0
+        AS huge_procnumber_empty;
+ by_procnumber_self_ok | negative_procnumber_empty | huge_procnumber_empty 
+-----------------------+---------------------------+-----------------------
+ t                     | t                         | t
+(1 row)
+
+-- With capture disabled, a never-allocated slot still reads as empty
+-- (the function short-circuits when the trace DSA was never created
+-- or when the slot is FREE).
+SET wait_event_capture = off;
+SELECT (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0
+    AS capture_off_empty;
+ capture_off_empty 
+-------------------
+ t
+(1 row)
+
+-- Permission gating: a role without pg_read_all_stats cannot call the
+-- function.  Cover both the public role and a freshly-created one.
+SET wait_event_capture = stats;
+ERROR:  invalid value for parameter "wait_event_capture": "stats"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+CREATE ROLE regress_wet_reader_nopriv NOLOGIN;
+DO $$
+DECLARE
+    err text;
+BEGIN
+    SET LOCAL ROLE regress_wet_reader_nopriv;
+    BEGIN
+        PERFORM count(*) FROM pg_get_wait_event_trace(0);
+        err := 'NO ERROR (unexpected: function should be denied)';
+    EXCEPTION WHEN insufficient_privilege THEN
+        err := 'permission denied (expected)';
+    END;
+    RAISE NOTICE 'permission gate: %', err;
+END
+$$;
+NOTICE:  permission gate: permission denied (expected)
+DROP ROLE regress_wet_reader_nopriv;
+-- Clean up
+RESET wait_event_capture;
+RESET compute_query_id;
diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule
index 8fa0a6c47fb30..b47560f85fd04 100644
--- a/src/test/regress/parallel_schedule
+++ b/src/test/regress/parallel_schedule
@@ -135,6 +135,12 @@ test: compression compression_lz4 compression_pglz cluster
 # oidjoins is read-only, though, and should run late for best coverage
 test: oidjoins event_trigger
 
+# wait_event_timing creates and drops temp tables (wet_phase1/wet_phase2
+# in sql/wait_event_timing.sql) to carry trace-ring state across phases.
+# Event triggers are database-wide, so event_trigger's ddl_command_end
+# trigger would fire on that DDL if the two ran concurrently; hence this
+# test gets its own scheduling slot instead of joining event_trigger's group.
+test: wait_event_timing
 
 # event_trigger_login cannot run concurrently with any other tests because
 # on-login event handling could catch connection of a concurrent test.
diff --git a/src/test/regress/sql/wait_event_timing.sql b/src/test/regress/sql/wait_event_timing.sql
new file mode 100644
index 0000000000000..6531fd7490ca0
--- /dev/null
+++ b/src/test/regress/sql/wait_event_timing.sql
@@ -0,0 +1,260 @@
+--
+-- Test wait event timing infrastructure
+--
+-- These tests verify the wait event timing SQL interface.
+-- They require --enable-wait-event-timing (or -Dwait_event_timing=true for
+-- meson) at compile time.  Without it, the alternate expected output
+-- wait_event_timing_1.out is used.  The default CI (Cirrus) runs without
+-- timing enabled, so the non-timing path is tested automatically.
+--
+
+-- Check GUC default
+SHOW wait_event_capture;
+
+-- Enable stats-level capture for this test (PGC_SUSET, requires superuser)
+SET wait_event_capture = stats;
+
+-- Verify views exist (zero rows is fine, just checking structure)
+SELECT * FROM pg_stat_wait_event_timing LIMIT 0;
+SELECT * FROM pg_backend_wait_event_trace LIMIT 0;
+
+-- The histogram-buckets taxonomy view is constant: 16 ordered rows,
+-- ascending bin edges, last bucket open-ended.  Available in both
+-- timing and non-timing builds (defined in system_views.sql, not gated
+-- on the compile flag).
+SELECT count(*) = 32 AS thirty_two_rows,
+       min(bucket_idx) = 0 AS idx_starts_at_zero,
+       max(bucket_idx) = 31 AS idx_ends_at_thirty_one,
+       bool_and(lower_ns IS NOT NULL) AS all_lowers_present,
+       count(*) FILTER (WHERE upper_ns IS NULL) = 1 AS one_open_bucket
+FROM pg_wait_event_timing_histogram_buckets;
+
+-- Verify column types of timing view
+SELECT
+    a.attname,
+    pg_catalog.format_type(a.atttypid, a.atttypmod) as type
+FROM pg_catalog.pg_attribute a
+JOIN pg_catalog.pg_class c ON a.attrelid = c.oid
+JOIN pg_catalog.pg_namespace n ON c.relnamespace = n.oid
+WHERE n.nspname = 'pg_catalog'
+  AND c.relname = 'pg_stat_wait_event_timing'
+  AND a.attnum > 0
+  AND NOT a.attisdropped
+ORDER BY a.attnum;
+
+-- Generate a wait event
+SELECT pg_sleep(0.1);
+
+-- Verify PgSleep event appears with correct structure
+SELECT
+    pid = pg_backend_pid() AS pid_ok,
+    backend_type,
+    wait_event_type,
+    wait_event,
+    calls >= 1 AS has_calls,
+    total_time_ms > 0 AS has_time,
+    avg_time_us > 0 AS has_avg,
+    max_time_us > 0 AS has_max,
+    pg_typeof(histogram) AS hist_type,
+    array_length(histogram, 1) AS hist_len,
+    calls = (SELECT sum(x) FROM unnest(histogram) x) AS hist_invariant
+FROM pg_stat_wait_event_timing
+WHERE wait_event = 'PgSleep';
+
+-- Test reset function (own backend)
+SELECT pg_stat_reset_wait_event_timing(NULL);
+SELECT count(*) AS after_reset
+FROM pg_stat_wait_event_timing
+WHERE wait_event = 'PgSleep';
+
+-- Test trace ring buffer (need compute_query_id for query markers)
+SET compute_query_id = on;
+SET wait_event_capture = trace;
+SELECT pg_sleep(0.01);
+
+SELECT
+    wait_event_type,
+    wait_event,
+    duration_us >= 0 AS dur_ok,
+    seq >= 0 AS seq_ok
+FROM pg_backend_wait_event_trace
+WHERE wait_event = 'PgSleep';
+
+-- Test query markers exist in trace
+SELECT count(*) > 0 AS has_query_markers
+FROM pg_backend_wait_event_trace
+WHERE wait_event_type = 'Query';
+
+-- Reset does not crash: NULL and own PID are equivalent
+SELECT pg_stat_reset_wait_event_timing(NULL);
+SELECT pg_stat_reset_wait_event_timing(pg_backend_pid());
+
+-- Unknown PID is a silent no-op (matches pg_stat_reset_backend_stats)
+SELECT pg_stat_reset_wait_event_timing(2147483647);
+
+-- Cluster-wide reset (superuser-only)
+SELECT pg_stat_reset_wait_event_timing_all();
+
+-- Trace read (no arguments; always returns own session)
+SELECT count(*) >= 0 AS trace_readable
+FROM pg_get_backend_wait_event_trace();
+
+-- Test trace lifecycle: drop to stats, then back up to trace
+SET compute_query_id = on;
+SET wait_event_capture = stats;
+SET wait_event_capture = trace;
+SELECT 1 AS reattach_test;
+SELECT count(*) >= 0 AS trace_reattach_ok
+FROM pg_backend_wait_event_trace;
+SET wait_event_capture = stats;
+
+-- Pin issue #15 fix: TRACE -> OFF (or STATS) must release the DSA ring,
+-- and a subsequent re-enable must allocate a fresh, empty ring.  Old
+-- trace records do NOT survive the disable, but aggregated stats in
+-- pg_stat_wait_event_timing DO (they live in a separate DSA allocation).
+--
+-- The assertions below are strict-equal on count-agnostic invariants.
+-- We deliberately avoid "count(*) = N" style assertions here: pg_sleep()
+-- loops around WaitLatch and can emit more than one PgSleep wait event
+-- per call under CPU contention (spurious latch wakes), so a fixed count
+-- would be flaky on busy CI runners.  Instead:
+--
+--   * ring_reallocated is decided by comparing phase 2's max(seq) against
+--     phase 1's (seq is derived from write_pos, which resets to 0 on a
+--     freshly allocated ring -- phase 2's records must have strictly
+--     smaller seq than phase 1's last record iff the ring was freed).
+--
+--   * stats_preserved_exactly checks that aggregated "calls" equals the
+--     exact sum of events seen in the two phase rings.  Whatever each
+--     phase's ring count happens to be, the aggregated counter must land
+--     on that sum; any drop, asymmetric duplication, or reset-on-toggle
+--     bug breaks the equality.
+--
+-- The symmetric-duplication case (both ring and aggregated doubled
+-- identically) is covered separately in test_wait_event_stress using
+-- deterministic exact-count input via stress_wait_events().
+SELECT pg_stat_reset_wait_event_timing(NULL);
+SET wait_event_capture = trace;
+SELECT pg_sleep(0.001);
+SELECT pg_sleep(0.001);
+
+-- Stash phase 1's ring count + highest seq (all phase-1 records).
+CREATE TEMP TABLE wet_phase1 AS
+SELECT count(*) AS n, max(seq) AS max_seq
+FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep';
+
+-- At least two PgSleep events captured (one per pg_sleep call, ignoring
+-- spurious wakes).  Catches drop bugs.
+SELECT n >= 2 AS phase1_captured_both_sleeps
+FROM wet_phase1;
+
+SET wait_event_capture = off;
+SET wait_event_capture = trace;
+SELECT pg_sleep(0.001);
+
+-- Phase 2: stash fresh-ring count + max(seq).
+CREATE TEMP TABLE wet_phase2 AS
+SELECT count(*) AS n, max(seq) AS max_seq
+FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep';
+
+-- The ring was freed iff phase 2's records all have seq strictly smaller
+-- than phase 1's last seq (write_pos started over at 0).  If the ring
+-- had persisted, phase 2 would contain phase 1's records plus new ones,
+-- so max(seq) would be >= phase1.max_seq.  Strict-equal on semantic.
+SELECT n >= 1 AND max_seq < (SELECT max_seq FROM wet_phase1)
+       AS ring_freed_and_reallocated
+FROM wet_phase2;
+
+-- Aggregated stats must equal the exact sum of the two phase ring counts.
+-- Catches drops (aggregated < sum), asymmetric duplication, and any
+-- reset-on-toggle bug that would wipe aggregated counters.
+SELECT calls = (SELECT n FROM wet_phase1) + (SELECT n FROM wet_phase2)
+       AS stats_preserved_exactly
+FROM pg_stat_wait_event_timing
+WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep';
+
+DROP TABLE wet_phase1, wet_phase2;
+SET wait_event_capture = stats;
+
+-- Overflow counters view: should be readable and overflow counts should
+-- be zero for a freshly-reset session that hasn't exceeded limits.
+-- reset_count must have incremented at least once (we called reset above).
+SELECT
+    pid = pg_backend_pid() AS pid_ok,
+    lwlock_overflow_count >= 0 AS lw_nonneg,
+    flat_overflow_count >= 0 AS flat_nonneg,
+    reset_count >= 1 AS reset_count_bumped
+FROM pg_stat_wait_event_timing_overflow
+WHERE pid = pg_backend_pid();
+
+-- Orphan-clear admin function: smoke-test that it returns a non-negative
+-- count and is callable without error.  Actual orphan-creation requires
+-- a backend exit, which the regression harness can't easily orchestrate
+-- in a portable way; we verify here only that the API works.  Returns
+-- bigint (count of rings freed); typically 0 in a fresh test run.
+SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok;
+
+-- PID-filter fast path on the cluster-wide SRFs.  Smoke-test that the
+-- single-slot branch returns rows for the calling backend and zero rows
+-- for a known-bad PID (matching pg_stat_reset_wait_event_timing
+-- semantics).
+SELECT
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing(pg_backend_pid())
+     WHERE pid = pg_backend_pid()) >= 0 AS own_pid_returns_rows,
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing(2147483647)) = 0
+        AS unknown_pid_empty,
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(pg_backend_pid())
+     WHERE pid = pg_backend_pid()) = 1 AS overflow_own_pid_one_row,
+    (SELECT count(*) FROM pg_stat_get_wait_event_timing_overflow(2147483647)) = 0
+        AS overflow_unknown_pid_empty;
+
+-- Cross-backend trace SRF: smoke-test that pg_get_wait_event_trace
+-- (procnumber-keyed) is callable and returns sensible results.
+-- Full orphan-readability and the parallel-worker case are exercised
+-- by the TAP test (which can orchestrate backend exits).
+SET wait_event_capture = trace;
+-- generate at least one wait event so the ring is allocated
+SELECT pg_sleep(0.01);
+SELECT
+    -- Own session: pull our procnumber from the timing SRF, then read
+    -- our own trace ring through the cross-backend SRF.
+    (SELECT count(*) FROM pg_get_wait_event_trace(
+        (SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid())
+         WHERE pid = pg_backend_pid() LIMIT 1))) >= 0
+        AS by_procnumber_self_ok,
+    -- Out-of-range procnumber: empty result, no error.
+    (SELECT count(*) FROM pg_get_wait_event_trace(-1)) = 0
+        AS negative_procnumber_empty,
+    (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0
+        AS huge_procnumber_empty;
+
+-- With capture disabled, a never-allocated slot still reads as empty
+-- (the function short-circuits when the trace DSA was never created
+-- or when the slot is FREE).
+SET wait_event_capture = off;
+SELECT (SELECT count(*) FROM pg_get_wait_event_trace(2147483647)) = 0
+    AS capture_off_empty;
+
+-- Permission gating: a role without pg_read_all_stats cannot call the
+-- function.  Cover both the public role and a freshly-created one.
+SET wait_event_capture = stats;
+CREATE ROLE regress_wet_reader_nopriv NOLOGIN;
+DO $$
+DECLARE
+    err text;
+BEGIN
+    SET LOCAL ROLE regress_wet_reader_nopriv;
+    BEGIN
+        PERFORM count(*) FROM pg_get_wait_event_trace(0);
+        err := 'NO ERROR (unexpected: function should be denied)';
+    EXCEPTION WHEN insufficient_privilege THEN
+        err := 'permission denied (expected)';
+    END;
+    RAISE NOTICE 'permission gate: %', err;
+END
+$$;
+DROP ROLE regress_wet_reader_nopriv;
+
+-- Clean up
+RESET wait_event_capture;
+RESET compute_query_id;
diff --git a/src/tools/pgindent/exclude_file_patterns b/src/tools/pgindent/exclude_file_patterns
index 4976a373f9e53..68269fe0c6175 100644
--- a/src/tools/pgindent/exclude_file_patterns
+++ b/src/tools/pgindent/exclude_file_patterns
@@ -17,6 +17,7 @@ src/backend/nodes/\w+\.switch\.c$
 # looks worse with pgindent.
 src/backend/utils/activity/pgstat_wait_event\.c$
 src/backend/utils/activity/wait_event_funcs_data\.c$
+src/backend/utils/activity/wait_event_timing_data\.h$
 src/backend/utils/activity/wait_event_types\.h$
 #
 # This confuses pgindent, and it's a derived file anyway.
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index cbd9e10fc1d47..631f5ede8312c 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1609,6 +1609,8 @@ LWLock
 LWLockHandle
 LWLockMode
 LWLockPadded
+LWLockTimingHash
+LWLockTimingHashEntry
 LWLockTrancheShmemData
 LZ4F_compressionContext_t
 LZ4F_decompressOptions_t
@@ -3414,6 +3416,7 @@ WSAPROTOCOL_INFO
 WaitEvent
 WaitEventActivity
 WaitEventBuffer
+WaitEventCaptureLevel
 WaitEventClient
 WaitEventCustomCounterData
 WaitEventCustomEntryByInfo
@@ -3422,6 +3425,14 @@ WaitEventIO
 WaitEventIPC
 WaitEventSet
 WaitEventTimeout
+WaitEventTimingControl
+WaitEventTimingEntry
+WaitEventTimingState
+WaitEventTraceControl
+WaitEventTraceRecord
+WaitEventTraceSlot
+WaitEventTraceSlotState
+WaitEventTraceState
 WaitLSNProcInfo
 WaitLSNResult
 WaitLSNState
@@ -3452,6 +3463,7 @@ WalUsage
 WalWriteMethod
 WalWriteMethodOps
 Walfile
+WetValidRecord
 WindowAgg
 WindowAggPath
 WindowAggState